diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/README.md b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..286aafcf7ccd318e080dba3bacb341fe7046f2e3 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_model.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..8fadb42db6fe54344c408d61139718ef54d766af --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cf5202f06cfe045e2437983d0655b3d319505b7e55e5b07b3dfa2e81f74213e +size 692127130 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/config.json new file mode 100644 index 0000000000000000000000000000000000000000..524c06f1795a643f88d925d43550e0a0d49eb19f --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/config.json @@ -0,0 +1,67 @@ +{ + "_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "add_faster_video": false, + "add_time_instruction": false, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": false, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_crop_resolution": null, + "image_grid_pinpoints": null, + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "no_token", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": "streaming_agg", + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": null, + "mm_streaming_frames_per_chunk": 4, + "mm_streaming_input_dim": 1152, + "mm_streaming_num_heads": 8, + "mm_streaming_num_layers": 4, + "mm_streaming_num_state_tokens": 2048, + "mm_streaming_patches_per_frame": 729, + "mm_streaming_state_dim": 1152, + "mm_streaming_vision_chunk_size": 8, + "mm_tunable_parts": "mm_mlp_adapter,mm_vision_resampler", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava_qwen", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 8192, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_tower_pretrained": null, + "vocab_size": 152064 +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/generation_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/non_lora_trainables.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f940bfd16b7afc3bf208e644d68dd928fa0f5438 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e6b77bee65b1ee2acaf54000f4efe77755b71b066a16e63f5bc97a132a26ee5 +size 211441788 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/trainer_state.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f51b8d458debc028f096bb6fa6f53b950ffa61e3 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_fc4_s2048/trainer_state.json @@ -0,0 +1,7492 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999413730433253, + "eval_steps": 500, + "global_step": 1066, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 53.56119918823242, + "learning_rate": 3.125e-06, + "loss": 1.2329, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 44.24412155151367, + "learning_rate": 6.25e-06, + "loss": 1.253, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 31.141427993774414, + "learning_rate": 9.375000000000001e-06, + "loss": 1.1976, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 8.549352645874023, + "learning_rate": 1.25e-05, + "loss": 0.9174, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.200796127319336, + "learning_rate": 1.5625e-05, + "loss": 0.9193, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 3.501561164855957, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.7185, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.535277366638184, + "learning_rate": 2.1875e-05, + "loss": 0.599, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.7859258651733398, + "learning_rate": 2.5e-05, + "loss": 0.4878, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 1.49070405960083, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.4525, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.8042423725128174, + "learning_rate": 3.125e-05, + "loss": 0.3429, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.8497580885887146, + "learning_rate": 3.4375e-05, + "loss": 0.3007, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 0.7737765312194824, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.4551, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 0.9109475016593933, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.5592, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 0.6791670918464661, + "learning_rate": 4.375e-05, + "loss": 0.3611, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.7185907363891602, + "learning_rate": 4.6875e-05, + "loss": 0.4888, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.7224956750869751, + "learning_rate": 5e-05, + "loss": 0.3629, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 0.5531966090202332, + "learning_rate": 5.3125000000000004e-05, + "loss": 0.2789, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 0.7150623202323914, + "learning_rate": 5.6250000000000005e-05, + "loss": 0.3422, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 0.4981580078601837, + "learning_rate": 5.9375e-05, + "loss": 0.3128, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 0.712741494178772, + "learning_rate": 6.25e-05, + "loss": 0.4357, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.5813599824905396, + "learning_rate": 6.562500000000001e-05, + "loss": 0.3287, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 0.6859356760978699, + "learning_rate": 6.875e-05, + "loss": 0.4216, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 0.6142020225524902, + "learning_rate": 7.1875e-05, + "loss": 0.3745, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 0.5768157243728638, + "learning_rate": 7.500000000000001e-05, + "loss": 0.3246, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5831692218780518, + "learning_rate": 7.8125e-05, + "loss": 0.3969, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.5747249126434326, + "learning_rate": 8.125000000000001e-05, + "loss": 0.4105, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 0.5389623641967773, + "learning_rate": 8.4375e-05, + "loss": 0.3935, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 0.6806145310401917, + "learning_rate": 8.75e-05, + "loss": 0.3193, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 0.5326066613197327, + "learning_rate": 9.062500000000001e-05, + "loss": 0.3592, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 0.6331548690795898, + "learning_rate": 9.375e-05, + "loss": 0.4431, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.5740853548049927, + "learning_rate": 9.687500000000001e-05, + "loss": 0.3317, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 0.4643768072128296, + "learning_rate": 0.0001, + "loss": 0.2877, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 0.5114359259605408, + "learning_rate": 9.999976921990784e-05, + "loss": 0.314, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 0.5951298475265503, + "learning_rate": 9.999907688176173e-05, + "loss": 0.4149, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 0.47754591703414917, + "learning_rate": 9.999792299195278e-05, + "loss": 0.2769, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 0.5500280857086182, + "learning_rate": 9.999630756113278e-05, + "loss": 0.3778, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 0.48305949568748474, + "learning_rate": 9.999423060421411e-05, + "loss": 0.3141, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 0.4953254461288452, + "learning_rate": 9.999169214036958e-05, + "loss": 0.3525, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 0.47390633821487427, + "learning_rate": 9.998869219303227e-05, + "loss": 0.3096, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 0.5356336236000061, + "learning_rate": 9.998523078989529e-05, + "loss": 0.3706, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.46947455406188965, + "learning_rate": 9.998130796291156e-05, + "loss": 0.3649, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 0.5029218196868896, + "learning_rate": 9.997692374829352e-05, + "loss": 0.3149, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 2.274848699569702, + "learning_rate": 9.997207818651274e-05, + "loss": 0.2937, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 0.4324891269207001, + "learning_rate": 9.996677132229957e-05, + "loss": 0.3225, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 0.4642087519168854, + "learning_rate": 9.996100320464274e-05, + "loss": 0.3577, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 0.4427544176578522, + "learning_rate": 9.995477388678897e-05, + "loss": 0.2711, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 0.503923773765564, + "learning_rate": 9.994808342624234e-05, + "loss": 0.3, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 0.4020770788192749, + "learning_rate": 9.994093188476382e-05, + "loss": 0.2837, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 0.4590403437614441, + "learning_rate": 9.993331932837079e-05, + "loss": 0.3554, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.4729478359222412, + "learning_rate": 9.992524582733629e-05, + "loss": 0.276, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 0.4242483675479889, + "learning_rate": 9.991671145618846e-05, + "loss": 0.2385, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 0.4706641733646393, + "learning_rate": 9.99077162937098e-05, + "loss": 0.329, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 0.486446738243103, + "learning_rate": 9.989826042293652e-05, + "loss": 0.3326, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 0.4750838875770569, + "learning_rate": 9.988834393115767e-05, + "loss": 0.3182, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 0.5082477331161499, + "learning_rate": 9.987796690991439e-05, + "loss": 0.3621, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 0.46366146206855774, + "learning_rate": 9.98671294549991e-05, + "loss": 0.317, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 0.43866562843322754, + "learning_rate": 9.985583166645455e-05, + "loss": 0.2456, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 0.41880330443382263, + "learning_rate": 9.98440736485729e-05, + "loss": 0.2903, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 0.40110716223716736, + "learning_rate": 9.983185550989487e-05, + "loss": 0.2215, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.46664467453956604, + "learning_rate": 9.981917736320851e-05, + "loss": 0.3576, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 0.36552098393440247, + "learning_rate": 9.980603932554845e-05, + "loss": 0.2001, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 0.40886834263801575, + "learning_rate": 9.979244151819453e-05, + "loss": 0.247, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 0.4742773473262787, + "learning_rate": 9.97783840666709e-05, + "loss": 0.3561, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 0.44993096590042114, + "learning_rate": 9.976386710074478e-05, + "loss": 0.3031, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 0.42588794231414795, + "learning_rate": 9.974889075442521e-05, + "loss": 0.2303, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 0.4792451858520508, + "learning_rate": 9.97334551659619e-05, + "loss": 0.3047, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 0.45113521814346313, + "learning_rate": 9.971756047784393e-05, + "loss": 0.3148, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 0.43510547280311584, + "learning_rate": 9.970120683679838e-05, + "loss": 0.3894, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 0.4710015058517456, + "learning_rate": 9.968439439378905e-05, + "loss": 0.3901, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 0.4542221426963806, + "learning_rate": 9.966712330401504e-05, + "loss": 0.3633, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 0.5098312497138977, + "learning_rate": 9.964939372690926e-05, + "loss": 0.4011, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 0.7176426649093628, + "learning_rate": 9.96312058261371e-05, + "loss": 0.3472, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 0.44791948795318604, + "learning_rate": 9.961255976959473e-05, + "loss": 0.3242, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 0.49342313408851624, + "learning_rate": 9.959345572940771e-05, + "loss": 0.4155, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 0.38717854022979736, + "learning_rate": 9.957389388192935e-05, + "loss": 0.2929, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 0.4250815808773041, + "learning_rate": 9.9553874407739e-05, + "loss": 0.2688, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 0.3985184133052826, + "learning_rate": 9.953339749164057e-05, + "loss": 0.3616, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 0.4453914165496826, + "learning_rate": 9.951246332266057e-05, + "loss": 0.3602, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 0.46241435408592224, + "learning_rate": 9.949107209404665e-05, + "loss": 0.4017, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 0.40397438406944275, + "learning_rate": 9.946922400326554e-05, + "loss": 0.3773, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 0.459163635969162, + "learning_rate": 9.944691925200145e-05, + "loss": 0.3643, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 0.44683462381362915, + "learning_rate": 9.942415804615406e-05, + "loss": 0.3826, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 0.481871098279953, + "learning_rate": 9.940094059583671e-05, + "loss": 0.3892, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 0.42870786786079407, + "learning_rate": 9.937726711537442e-05, + "loss": 0.2709, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 0.41368356347084045, + "learning_rate": 9.93531378233019e-05, + "loss": 0.3394, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 0.48373904824256897, + "learning_rate": 9.932855294236154e-05, + "loss": 0.3624, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 0.4892710745334625, + "learning_rate": 9.930351269950143e-05, + "loss": 0.406, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 0.4693341851234436, + "learning_rate": 9.927801732587312e-05, + "loss": 0.3009, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 0.3869418501853943, + "learning_rate": 9.925206705682962e-05, + "loss": 0.2619, + "step": 89 + }, + { + "epoch": 0.08, + "grad_norm": 0.5536014437675476, + "learning_rate": 9.92256621319231e-05, + "loss": 0.4697, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 0.4700610935688019, + "learning_rate": 9.919880279490286e-05, + "loss": 0.3152, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 0.35885682702064514, + "learning_rate": 9.917148929371288e-05, + "loss": 0.1736, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 0.48592039942741394, + "learning_rate": 9.914372188048964e-05, + "loss": 0.3746, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 0.37780046463012695, + "learning_rate": 9.911550081155983e-05, + "loss": 0.2652, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 0.48445338010787964, + "learning_rate": 9.908682634743784e-05, + "loss": 0.3928, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 0.41280871629714966, + "learning_rate": 9.905769875282352e-05, + "loss": 0.2897, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 0.4266187250614166, + "learning_rate": 9.902811829659961e-05, + "loss": 0.3058, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 0.49948224425315857, + "learning_rate": 9.899808525182935e-05, + "loss": 0.3773, + "step": 98 + }, + { + "epoch": 0.09, + "grad_norm": 0.3814532160758972, + "learning_rate": 9.896759989575386e-05, + "loss": 0.2951, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 0.4628780782222748, + "learning_rate": 9.893666250978971e-05, + "loss": 0.3758, + "step": 100 + }, + { + "epoch": 0.09, + "grad_norm": 0.3314332664012909, + "learning_rate": 9.890527337952617e-05, + "loss": 0.202, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 0.43076321482658386, + "learning_rate": 9.887343279472272e-05, + "loss": 0.3034, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 0.4612520933151245, + "learning_rate": 9.884114104930628e-05, + "loss": 0.3215, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 0.3903290033340454, + "learning_rate": 9.880839844136854e-05, + "loss": 0.2625, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 0.4545579254627228, + "learning_rate": 9.877520527316317e-05, + "loss": 0.398, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 0.3861580193042755, + "learning_rate": 9.874156185110306e-05, + "loss": 0.3056, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 0.48816314339637756, + "learning_rate": 9.870746848575751e-05, + "loss": 0.3215, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 0.4321497976779938, + "learning_rate": 9.86729254918493e-05, + "loss": 0.2767, + "step": 108 + }, + { + "epoch": 0.1, + "grad_norm": 0.39948931336402893, + "learning_rate": 9.863793318825186e-05, + "loss": 0.2968, + "step": 109 + }, + { + "epoch": 0.1, + "grad_norm": 0.4157947301864624, + "learning_rate": 9.860249189798627e-05, + "loss": 0.4069, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 0.35165318846702576, + "learning_rate": 9.856660194821829e-05, + "loss": 0.2218, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 0.4359529912471771, + "learning_rate": 9.853026367025535e-05, + "loss": 0.3191, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 0.3869144320487976, + "learning_rate": 9.849347739954352e-05, + "loss": 0.2881, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 0.5311065912246704, + "learning_rate": 9.845624347566433e-05, + "loss": 0.4661, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 0.3647288680076599, + "learning_rate": 9.841856224233174e-05, + "loss": 0.2192, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 0.48680463433265686, + "learning_rate": 9.83804340473889e-05, + "loss": 0.4224, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 0.4152064025402069, + "learning_rate": 9.83418592428049e-05, + "loss": 0.2983, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 0.4352041780948639, + "learning_rate": 9.830283818467163e-05, + "loss": 0.3396, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 0.45859429240226746, + "learning_rate": 9.826337123320046e-05, + "loss": 0.3314, + "step": 119 + }, + { + "epoch": 0.11, + "grad_norm": 0.44424307346343994, + "learning_rate": 9.822345875271883e-05, + "loss": 0.3391, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 0.4700634181499481, + "learning_rate": 9.818310111166699e-05, + "loss": 0.341, + "step": 121 + }, + { + "epoch": 0.11, + "grad_norm": 0.4977334141731262, + "learning_rate": 9.814229868259452e-05, + "loss": 0.3589, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 0.4814792275428772, + "learning_rate": 9.810105184215699e-05, + "loss": 0.4187, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 0.4522244334220886, + "learning_rate": 9.805936097111234e-05, + "loss": 0.2712, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 0.5123592019081116, + "learning_rate": 9.801722645431754e-05, + "loss": 0.3991, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 0.42592427134513855, + "learning_rate": 9.797464868072488e-05, + "loss": 0.3108, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 0.4312685430049896, + "learning_rate": 9.793162804337845e-05, + "loss": 0.3203, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 0.4493767023086548, + "learning_rate": 9.788816493941051e-05, + "loss": 0.3489, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 0.45616286993026733, + "learning_rate": 9.784425977003784e-05, + "loss": 0.3597, + "step": 129 + }, + { + "epoch": 0.12, + "grad_norm": 0.4272081255912781, + "learning_rate": 9.779991294055802e-05, + "loss": 0.3742, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 0.3702671229839325, + "learning_rate": 9.775512486034563e-05, + "loss": 0.2558, + "step": 131 + }, + { + "epoch": 0.12, + "grad_norm": 0.582704484462738, + "learning_rate": 9.770989594284857e-05, + "loss": 0.5771, + "step": 132 + }, + { + "epoch": 0.12, + "grad_norm": 0.4729974567890167, + "learning_rate": 9.766422660558421e-05, + "loss": 0.3888, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 0.3848215341567993, + "learning_rate": 9.761811727013548e-05, + "loss": 0.2774, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 0.3735813796520233, + "learning_rate": 9.757156836214706e-05, + "loss": 0.3127, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 0.3998161554336548, + "learning_rate": 9.752458031132141e-05, + "loss": 0.3537, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 0.3877810537815094, + "learning_rate": 9.747715355141478e-05, + "loss": 0.2334, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 0.36280906200408936, + "learning_rate": 9.742928852023325e-05, + "loss": 0.325, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 0.43097636103630066, + "learning_rate": 9.73809856596287e-05, + "loss": 0.3603, + "step": 139 + }, + { + "epoch": 0.13, + "grad_norm": 0.4008861482143402, + "learning_rate": 9.733224541549464e-05, + "loss": 0.3164, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 0.43347349762916565, + "learning_rate": 9.728306823776221e-05, + "loss": 0.3332, + "step": 141 + }, + { + "epoch": 0.13, + "grad_norm": 0.4022083878517151, + "learning_rate": 9.723345458039594e-05, + "loss": 0.3515, + "step": 142 + }, + { + "epoch": 0.13, + "grad_norm": 0.3428991436958313, + "learning_rate": 9.718340490138965e-05, + "loss": 0.1898, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 0.45103755593299866, + "learning_rate": 9.713291966276206e-05, + "loss": 0.3417, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 0.4582728445529938, + "learning_rate": 9.708199933055272e-05, + "loss": 0.4211, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 0.48150041699409485, + "learning_rate": 9.70306443748176e-05, + "loss": 0.4027, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 0.3726791441440582, + "learning_rate": 9.697885526962474e-05, + "loss": 0.2824, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 0.41239234805107117, + "learning_rate": 9.692663249304992e-05, + "loss": 0.3291, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 0.48422345519065857, + "learning_rate": 9.687397652717223e-05, + "loss": 0.464, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 0.4588571786880493, + "learning_rate": 9.682088785806963e-05, + "loss": 0.3762, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 0.3948071002960205, + "learning_rate": 9.67673669758144e-05, + "loss": 0.2757, + "step": 151 + }, + { + "epoch": 0.14, + "grad_norm": 0.42584747076034546, + "learning_rate": 9.671341437446877e-05, + "loss": 0.3365, + "step": 152 + }, + { + "epoch": 0.14, + "grad_norm": 0.5137955546379089, + "learning_rate": 9.665903055208014e-05, + "loss": 0.4396, + "step": 153 + }, + { + "epoch": 0.14, + "grad_norm": 0.42933642864227295, + "learning_rate": 9.660421601067666e-05, + "loss": 0.3334, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 0.36864256858825684, + "learning_rate": 9.654897125626252e-05, + "loss": 0.2852, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 0.37826991081237793, + "learning_rate": 9.649329679881334e-05, + "loss": 0.2815, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 0.3164992332458496, + "learning_rate": 9.643719315227133e-05, + "loss": 0.1955, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 0.43146809935569763, + "learning_rate": 9.63806608345407e-05, + "loss": 0.3581, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 0.4671701490879059, + "learning_rate": 9.632370036748279e-05, + "loss": 0.3826, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 0.4189399480819702, + "learning_rate": 9.626631227691127e-05, + "loss": 0.2845, + "step": 160 + }, + { + "epoch": 0.15, + "grad_norm": 0.38566893339157104, + "learning_rate": 9.62084970925873e-05, + "loss": 0.245, + "step": 161 + }, + { + "epoch": 0.15, + "grad_norm": 0.4231869876384735, + "learning_rate": 9.615025534821462e-05, + "loss": 0.2744, + "step": 162 + }, + { + "epoch": 0.15, + "grad_norm": 0.4824386239051819, + "learning_rate": 9.609158758143464e-05, + "loss": 0.3822, + "step": 163 + }, + { + "epoch": 0.15, + "grad_norm": 0.4428365230560303, + "learning_rate": 9.603249433382144e-05, + "loss": 0.2882, + "step": 164 + }, + { + "epoch": 0.15, + "grad_norm": 0.4528397023677826, + "learning_rate": 9.597297615087685e-05, + "loss": 0.268, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 0.3863323926925659, + "learning_rate": 9.591303358202535e-05, + "loss": 0.2234, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 0.4577626883983612, + "learning_rate": 9.585266718060897e-05, + "loss": 0.354, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 0.4039238691329956, + "learning_rate": 9.579187750388227e-05, + "loss": 0.2139, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 0.43405696749687195, + "learning_rate": 9.573066511300714e-05, + "loss": 0.3414, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 0.4332447052001953, + "learning_rate": 9.566903057304764e-05, + "loss": 0.3628, + "step": 170 + }, + { + "epoch": 0.16, + "grad_norm": 0.4371618628501892, + "learning_rate": 9.560697445296474e-05, + "loss": 0.3303, + "step": 171 + }, + { + "epoch": 0.16, + "grad_norm": 0.5302167534828186, + "learning_rate": 9.554449732561113e-05, + "loss": 0.3516, + "step": 172 + }, + { + "epoch": 0.16, + "grad_norm": 0.4591212272644043, + "learning_rate": 9.548159976772592e-05, + "loss": 0.3495, + "step": 173 + }, + { + "epoch": 0.16, + "grad_norm": 0.47322651743888855, + "learning_rate": 9.541828235992926e-05, + "loss": 0.3561, + "step": 174 + }, + { + "epoch": 0.16, + "grad_norm": 0.38110026717185974, + "learning_rate": 9.535454568671704e-05, + "loss": 0.2689, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 0.443582147359848, + "learning_rate": 9.529039033645548e-05, + "loss": 0.2448, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 0.37832385301589966, + "learning_rate": 9.522581690137567e-05, + "loss": 0.305, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 0.3523258566856384, + "learning_rate": 9.516082597756815e-05, + "loss": 0.2324, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 0.4567958414554596, + "learning_rate": 9.509541816497737e-05, + "loss": 0.4185, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 0.3902451694011688, + "learning_rate": 9.50295940673962e-05, + "loss": 0.2496, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 0.43655723333358765, + "learning_rate": 9.496335429246026e-05, + "loss": 0.391, + "step": 181 + }, + { + "epoch": 0.17, + "grad_norm": 0.419964462518692, + "learning_rate": 9.489669945164242e-05, + "loss": 0.3263, + "step": 182 + }, + { + "epoch": 0.17, + "grad_norm": 0.3563212454319, + "learning_rate": 9.482963016024709e-05, + "loss": 0.224, + "step": 183 + }, + { + "epoch": 0.17, + "grad_norm": 0.3605343997478485, + "learning_rate": 9.476214703740454e-05, + "loss": 0.2404, + "step": 184 + }, + { + "epoch": 0.17, + "grad_norm": 0.5000762343406677, + "learning_rate": 9.469425070606524e-05, + "loss": 0.5458, + "step": 185 + }, + { + "epoch": 0.17, + "grad_norm": 0.40717747807502747, + "learning_rate": 9.462594179299406e-05, + "loss": 0.2314, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 0.41368338465690613, + "learning_rate": 9.45572209287645e-05, + "loss": 0.3461, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 0.5594714879989624, + "learning_rate": 9.44880887477528e-05, + "loss": 0.3762, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 0.4597151279449463, + "learning_rate": 9.441854588813228e-05, + "loss": 0.2904, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 0.39208316802978516, + "learning_rate": 9.43485929918672e-05, + "loss": 0.323, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 0.5003756284713745, + "learning_rate": 9.427823070470699e-05, + "loss": 0.4213, + "step": 191 + }, + { + "epoch": 0.18, + "grad_norm": 0.3722604811191559, + "learning_rate": 9.420745967618026e-05, + "loss": 0.2739, + "step": 192 + }, + { + "epoch": 0.18, + "grad_norm": 0.30610206723213196, + "learning_rate": 9.413628055958878e-05, + "loss": 0.1887, + "step": 193 + }, + { + "epoch": 0.18, + "grad_norm": 0.4051414430141449, + "learning_rate": 9.406469401200151e-05, + "loss": 0.3377, + "step": 194 + }, + { + "epoch": 0.18, + "grad_norm": 0.45952826738357544, + "learning_rate": 9.399270069424842e-05, + "loss": 0.3374, + "step": 195 + }, + { + "epoch": 0.18, + "grad_norm": 0.45123541355133057, + "learning_rate": 9.392030127091452e-05, + "loss": 0.2862, + "step": 196 + }, + { + "epoch": 0.18, + "grad_norm": 0.3941093981266022, + "learning_rate": 9.384749641033359e-05, + "loss": 0.2781, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 0.45097070932388306, + "learning_rate": 9.377428678458214e-05, + "loss": 0.3586, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 0.4081740379333496, + "learning_rate": 9.370067306947316e-05, + "loss": 0.274, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 0.47927120327949524, + "learning_rate": 9.362665594454984e-05, + "loss": 0.4323, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 0.4218602478504181, + "learning_rate": 9.355223609307933e-05, + "loss": 0.2805, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 0.47772476077079773, + "learning_rate": 9.347741420204643e-05, + "loss": 0.361, + "step": 202 + }, + { + "epoch": 0.19, + "grad_norm": 0.4543375074863434, + "learning_rate": 9.340219096214727e-05, + "loss": 0.2929, + "step": 203 + }, + { + "epoch": 0.19, + "grad_norm": 0.4603528082370758, + "learning_rate": 9.33265670677829e-05, + "loss": 0.364, + "step": 204 + }, + { + "epoch": 0.19, + "grad_norm": 0.4432274103164673, + "learning_rate": 9.325054321705289e-05, + "loss": 0.3073, + "step": 205 + }, + { + "epoch": 0.19, + "grad_norm": 0.43326976895332336, + "learning_rate": 9.317412011174886e-05, + "loss": 0.3113, + "step": 206 + }, + { + "epoch": 0.19, + "grad_norm": 0.4768969714641571, + "learning_rate": 9.309729845734813e-05, + "loss": 0.4033, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 0.36311060190200806, + "learning_rate": 9.302007896300698e-05, + "loss": 0.2685, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 0.41629478335380554, + "learning_rate": 9.29424623415543e-05, + "loss": 0.3202, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 0.47300565242767334, + "learning_rate": 9.286444930948496e-05, + "loss": 0.361, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 0.363154798746109, + "learning_rate": 9.278604058695313e-05, + "loss": 0.2366, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 0.4519854784011841, + "learning_rate": 9.270723689776568e-05, + "loss": 0.3207, + "step": 212 + }, + { + "epoch": 0.2, + "grad_norm": 0.4606704115867615, + "learning_rate": 9.262803896937555e-05, + "loss": 0.4074, + "step": 213 + }, + { + "epoch": 0.2, + "grad_norm": 0.42994555830955505, + "learning_rate": 9.254844753287493e-05, + "loss": 0.277, + "step": 214 + }, + { + "epoch": 0.2, + "grad_norm": 0.4647696912288666, + "learning_rate": 9.24684633229886e-05, + "loss": 0.3115, + "step": 215 + }, + { + "epoch": 0.2, + "grad_norm": 0.5536186099052429, + "learning_rate": 9.238808707806706e-05, + "loss": 0.3567, + "step": 216 + }, + { + "epoch": 0.2, + "grad_norm": 0.48201286792755127, + "learning_rate": 9.230731954007983e-05, + "loss": 0.369, + "step": 217 + }, + { + "epoch": 0.2, + "grad_norm": 0.5291519165039062, + "learning_rate": 9.222616145460849e-05, + "loss": 0.421, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 0.4186636507511139, + "learning_rate": 9.214461357083985e-05, + "loss": 0.2158, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 0.3635309636592865, + "learning_rate": 9.206267664155907e-05, + "loss": 0.2536, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 0.4199702739715576, + "learning_rate": 9.198035142314259e-05, + "loss": 0.3763, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 0.4509720504283905, + "learning_rate": 9.189763867555129e-05, + "loss": 0.3784, + "step": 222 + }, + { + "epoch": 0.21, + "grad_norm": 0.42019206285476685, + "learning_rate": 9.181453916232339e-05, + "loss": 0.2547, + "step": 223 + }, + { + "epoch": 0.21, + "grad_norm": 0.4254032373428345, + "learning_rate": 9.173105365056742e-05, + "loss": 0.3493, + "step": 224 + }, + { + "epoch": 0.21, + "grad_norm": 0.47555145621299744, + "learning_rate": 9.164718291095515e-05, + "loss": 0.375, + "step": 225 + }, + { + "epoch": 0.21, + "grad_norm": 0.40298861265182495, + "learning_rate": 9.156292771771446e-05, + "loss": 0.3356, + "step": 226 + }, + { + "epoch": 0.21, + "grad_norm": 0.379826158285141, + "learning_rate": 9.14782888486222e-05, + "loss": 0.3092, + "step": 227 + }, + { + "epoch": 0.21, + "grad_norm": 0.38476112484931946, + "learning_rate": 9.1393267084997e-05, + "loss": 0.2987, + "step": 228 + }, + { + "epoch": 0.21, + "grad_norm": 0.46264320611953735, + "learning_rate": 9.130786321169209e-05, + "loss": 0.3771, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 0.41812068223953247, + "learning_rate": 9.122207801708802e-05, + "loss": 0.3283, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 0.3706599771976471, + "learning_rate": 9.113591229308538e-05, + "loss": 0.2443, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 0.4221385419368744, + "learning_rate": 9.104936683509755e-05, + "loss": 0.3107, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 0.3780944347381592, + "learning_rate": 9.096244244204324e-05, + "loss": 0.3095, + "step": 233 + }, + { + "epoch": 0.22, + "grad_norm": 0.4322282075881958, + "learning_rate": 9.087513991633924e-05, + "loss": 0.3187, + "step": 234 + }, + { + "epoch": 0.22, + "grad_norm": 0.42131513357162476, + "learning_rate": 9.078746006389298e-05, + "loss": 0.3539, + "step": 235 + }, + { + "epoch": 0.22, + "grad_norm": 0.32596921920776367, + "learning_rate": 9.069940369409499e-05, + "loss": 0.1888, + "step": 236 + }, + { + "epoch": 0.22, + "grad_norm": 0.4986932873725891, + "learning_rate": 9.061097161981159e-05, + "loss": 0.406, + "step": 237 + }, + { + "epoch": 0.22, + "grad_norm": 0.4771842360496521, + "learning_rate": 9.052216465737726e-05, + "loss": 0.3748, + "step": 238 + }, + { + "epoch": 0.22, + "grad_norm": 0.46478331089019775, + "learning_rate": 9.043298362658714e-05, + "loss": 0.36, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 0.36818572878837585, + "learning_rate": 9.034342935068952e-05, + "loss": 0.1987, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 0.4378790259361267, + "learning_rate": 9.025350265637815e-05, + "loss": 0.3699, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 0.46986663341522217, + "learning_rate": 9.016320437378466e-05, + "loss": 0.3178, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 0.44161608815193176, + "learning_rate": 9.007253533647089e-05, + "loss": 0.3466, + "step": 243 + }, + { + "epoch": 0.23, + "grad_norm": 0.4717167913913727, + "learning_rate": 8.998149638142119e-05, + "loss": 0.3324, + "step": 244 + }, + { + "epoch": 0.23, + "grad_norm": 0.5091121792793274, + "learning_rate": 8.98900883490347e-05, + "loss": 0.4375, + "step": 245 + }, + { + "epoch": 0.23, + "grad_norm": 0.3929200768470764, + "learning_rate": 8.979831208311758e-05, + "loss": 0.2908, + "step": 246 + }, + { + "epoch": 0.23, + "grad_norm": 0.46935197710990906, + "learning_rate": 8.970616843087524e-05, + "loss": 0.3222, + "step": 247 + }, + { + "epoch": 0.23, + "grad_norm": 0.3767223656177521, + "learning_rate": 8.96136582429045e-05, + "loss": 0.2201, + "step": 248 + }, + { + "epoch": 0.23, + "grad_norm": 0.46659228205680847, + "learning_rate": 8.952078237318575e-05, + "loss": 0.3446, + "step": 249 + }, + { + "epoch": 0.23, + "grad_norm": 0.4290735423564911, + "learning_rate": 8.942754167907507e-05, + "loss": 0.2723, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 0.45299649238586426, + "learning_rate": 8.933393702129628e-05, + "loss": 0.2574, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 0.4417344629764557, + "learning_rate": 8.923996926393305e-05, + "loss": 0.4022, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 0.4162667393684387, + "learning_rate": 8.91456392744209e-05, + "loss": 0.3696, + "step": 253 + }, + { + "epoch": 0.24, + "grad_norm": 0.47140949964523315, + "learning_rate": 8.905094792353917e-05, + "loss": 0.3376, + "step": 254 + }, + { + "epoch": 0.24, + "grad_norm": 0.4572603702545166, + "learning_rate": 8.895589608540297e-05, + "loss": 0.3583, + "step": 255 + }, + { + "epoch": 0.24, + "grad_norm": 0.35900965332984924, + "learning_rate": 8.886048463745525e-05, + "loss": 0.1709, + "step": 256 + }, + { + "epoch": 0.24, + "grad_norm": 0.4216223359107971, + "learning_rate": 8.876471446045847e-05, + "loss": 0.299, + "step": 257 + }, + { + "epoch": 0.24, + "grad_norm": 0.41913264989852905, + "learning_rate": 8.866858643848665e-05, + "loss": 0.2399, + "step": 258 + }, + { + "epoch": 0.24, + "grad_norm": 0.3832896053791046, + "learning_rate": 8.857210145891715e-05, + "loss": 0.2869, + "step": 259 + }, + { + "epoch": 0.24, + "grad_norm": 0.3874155879020691, + "learning_rate": 8.847526041242246e-05, + "loss": 0.1823, + "step": 260 + }, + { + "epoch": 0.24, + "grad_norm": 0.37168049812316895, + "learning_rate": 8.8378064192962e-05, + "loss": 0.239, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 0.43271905183792114, + "learning_rate": 8.82805136977739e-05, + "loss": 0.3633, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 0.48894739151000977, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3543, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 0.3934341371059418, + "learning_rate": 8.808435348551071e-05, + "loss": 0.2181, + "step": 264 + }, + { + "epoch": 0.25, + "grad_norm": 0.36207613348960876, + "learning_rate": 8.798574557923053e-05, + "loss": 0.2183, + "step": 265 + }, + { + "epoch": 0.25, + "grad_norm": 0.3730492889881134, + "learning_rate": 8.788678701879573e-05, + "loss": 0.31, + "step": 266 + }, + { + "epoch": 0.25, + "grad_norm": 0.36091452836990356, + "learning_rate": 8.778747871771292e-05, + "loss": 0.2373, + "step": 267 + }, + { + "epoch": 0.25, + "grad_norm": 0.4387395679950714, + "learning_rate": 8.768782159271727e-05, + "loss": 0.3354, + "step": 268 + }, + { + "epoch": 0.25, + "grad_norm": 0.3900839686393738, + "learning_rate": 8.758781656376398e-05, + "loss": 0.2464, + "step": 269 + }, + { + "epoch": 0.25, + "grad_norm": 0.4690368175506592, + "learning_rate": 8.748746455401986e-05, + "loss": 0.3327, + "step": 270 + }, + { + "epoch": 0.25, + "grad_norm": 0.4869069755077362, + "learning_rate": 8.738676648985476e-05, + "loss": 0.3151, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 0.42331576347351074, + "learning_rate": 8.7285723300833e-05, + "loss": 0.3736, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 0.39506635069847107, + "learning_rate": 8.718433591970485e-05, + "loss": 0.2875, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 0.4790605902671814, + "learning_rate": 8.708260528239788e-05, + "loss": 0.3883, + "step": 274 + }, + { + "epoch": 0.26, + "grad_norm": 0.408903568983078, + "learning_rate": 8.698053232800832e-05, + "loss": 0.2666, + "step": 275 + }, + { + "epoch": 0.26, + "grad_norm": 0.42906439304351807, + "learning_rate": 8.68781179987924e-05, + "loss": 0.361, + "step": 276 + }, + { + "epoch": 0.26, + "grad_norm": 0.3851872384548187, + "learning_rate": 8.677536324015765e-05, + "loss": 0.2827, + "step": 277 + }, + { + "epoch": 0.26, + "grad_norm": 0.40262821316719055, + "learning_rate": 8.667226900065419e-05, + "loss": 0.3229, + "step": 278 + }, + { + "epoch": 0.26, + "grad_norm": 0.4792792797088623, + "learning_rate": 8.656883623196592e-05, + "loss": 0.3196, + "step": 279 + }, + { + "epoch": 0.26, + "grad_norm": 0.4359915256500244, + "learning_rate": 8.646506588890183e-05, + "loss": 0.303, + "step": 280 + }, + { + "epoch": 0.26, + "grad_norm": 0.49512985348701477, + "learning_rate": 8.636095892938707e-05, + "loss": 0.3536, + "step": 281 + }, + { + "epoch": 0.26, + "grad_norm": 0.4611496925354004, + "learning_rate": 8.62565163144542e-05, + "loss": 0.4029, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 0.40325888991355896, + "learning_rate": 8.615173900823426e-05, + "loss": 0.2887, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 0.40099236369132996, + "learning_rate": 8.60466279779479e-05, + "loss": 0.2887, + "step": 284 + }, + { + "epoch": 0.27, + "grad_norm": 0.3835624158382416, + "learning_rate": 8.594118419389647e-05, + "loss": 0.33, + "step": 285 + }, + { + "epoch": 0.27, + "grad_norm": 0.43427804112434387, + "learning_rate": 8.583540862945301e-05, + "loss": 0.3084, + "step": 286 + }, + { + "epoch": 0.27, + "grad_norm": 0.3395395278930664, + "learning_rate": 8.57293022610533e-05, + "loss": 0.2129, + "step": 287 + }, + { + "epoch": 0.27, + "grad_norm": 0.40782567858695984, + "learning_rate": 8.562286606818684e-05, + "loss": 0.2399, + "step": 288 + }, + { + "epoch": 0.27, + "grad_norm": 0.5204586982727051, + "learning_rate": 8.55161010333878e-05, + "loss": 0.5042, + "step": 289 + }, + { + "epoch": 0.27, + "grad_norm": 0.5167766809463501, + "learning_rate": 8.540900814222598e-05, + "loss": 0.3576, + "step": 290 + }, + { + "epoch": 0.27, + "grad_norm": 0.49364760518074036, + "learning_rate": 8.530158838329765e-05, + "loss": 0.3282, + "step": 291 + }, + { + "epoch": 0.27, + "grad_norm": 0.5482507348060608, + "learning_rate": 8.519384274821649e-05, + "loss": 0.4414, + "step": 292 + }, + { + "epoch": 0.27, + "grad_norm": 0.39613139629364014, + "learning_rate": 8.508577223160442e-05, + "loss": 0.2545, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 0.4539114534854889, + "learning_rate": 8.497737783108238e-05, + "loss": 0.3221, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 0.40794727206230164, + "learning_rate": 8.486866054726114e-05, + "loss": 0.2796, + "step": 295 + }, + { + "epoch": 0.28, + "grad_norm": 0.40882447361946106, + "learning_rate": 8.475962138373213e-05, + "loss": 0.2639, + "step": 296 + }, + { + "epoch": 0.28, + "grad_norm": 0.4423540234565735, + "learning_rate": 8.465026134705805e-05, + "loss": 0.3054, + "step": 297 + }, + { + "epoch": 0.28, + "grad_norm": 0.4468096196651459, + "learning_rate": 8.454058144676366e-05, + "loss": 0.2876, + "step": 298 + }, + { + "epoch": 0.28, + "grad_norm": 0.39803358912467957, + "learning_rate": 8.443058269532651e-05, + "loss": 0.3147, + "step": 299 + }, + { + "epoch": 0.28, + "grad_norm": 0.4247153103351593, + "learning_rate": 8.432026610816745e-05, + "loss": 0.2773, + "step": 300 + }, + { + "epoch": 0.28, + "grad_norm": 2.700711965560913, + "learning_rate": 8.420963270364137e-05, + "loss": 0.3315, + "step": 301 + }, + { + "epoch": 0.28, + "grad_norm": 0.4842532277107239, + "learning_rate": 8.409868350302774e-05, + "loss": 0.3274, + "step": 302 + }, + { + "epoch": 0.28, + "grad_norm": 0.5232789516448975, + "learning_rate": 8.398741953052127e-05, + "loss": 0.2746, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 0.5666669011116028, + "learning_rate": 8.387584181322233e-05, + "loss": 0.304, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 0.5082411170005798, + "learning_rate": 8.376395138112754e-05, + "loss": 0.35, + "step": 305 + }, + { + "epoch": 0.29, + "grad_norm": 0.41404369473457336, + "learning_rate": 8.365174926712032e-05, + "loss": 0.3421, + "step": 306 + }, + { + "epoch": 0.29, + "grad_norm": 0.44390758872032166, + "learning_rate": 8.353923650696118e-05, + "loss": 0.3472, + "step": 307 + }, + { + "epoch": 0.29, + "grad_norm": 0.4442753195762634, + "learning_rate": 8.342641413927837e-05, + "loss": 0.3628, + "step": 308 + }, + { + "epoch": 0.29, + "grad_norm": 0.4544326961040497, + "learning_rate": 8.331328320555812e-05, + "loss": 0.3457, + "step": 309 + }, + { + "epoch": 0.29, + "grad_norm": 0.5052947402000427, + "learning_rate": 8.319984475013512e-05, + "loss": 0.3999, + "step": 310 + }, + { + "epoch": 0.29, + "grad_norm": 0.4428708553314209, + "learning_rate": 8.308609982018286e-05, + "loss": 0.3385, + "step": 311 + }, + { + "epoch": 0.29, + "grad_norm": 0.3840278089046478, + "learning_rate": 8.297204946570398e-05, + "loss": 0.2749, + "step": 312 + }, + { + "epoch": 0.29, + "grad_norm": 0.42293739318847656, + "learning_rate": 8.285769473952052e-05, + "loss": 0.2628, + "step": 313 + }, + { + "epoch": 0.29, + "grad_norm": 0.6979223489761353, + "learning_rate": 8.274303669726426e-05, + "loss": 0.318, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 0.45772454142570496, + "learning_rate": 8.262807639736692e-05, + "loss": 0.4072, + "step": 315 + }, + { + "epoch": 0.3, + "grad_norm": 0.47757357358932495, + "learning_rate": 8.251281490105045e-05, + "loss": 0.3767, + "step": 316 + }, + { + "epoch": 0.3, + "grad_norm": 0.38831740617752075, + "learning_rate": 8.239725327231721e-05, + "loss": 0.3062, + "step": 317 + }, + { + "epoch": 0.3, + "grad_norm": 0.40905988216400146, + "learning_rate": 8.228139257794012e-05, + "loss": 0.3033, + "step": 318 + }, + { + "epoch": 0.3, + "grad_norm": 0.42288145422935486, + "learning_rate": 8.216523388745287e-05, + "loss": 0.3485, + "step": 319 + }, + { + "epoch": 0.3, + "grad_norm": 0.4163537919521332, + "learning_rate": 8.204877827313997e-05, + "loss": 0.3293, + "step": 320 + }, + { + "epoch": 0.3, + "grad_norm": 0.3951532244682312, + "learning_rate": 8.193202681002692e-05, + "loss": 0.3387, + "step": 321 + }, + { + "epoch": 0.3, + "grad_norm": 0.2811068594455719, + "learning_rate": 8.181498057587027e-05, + "loss": 0.1691, + "step": 322 + }, + { + "epoch": 0.3, + "grad_norm": 0.3517349362373352, + "learning_rate": 8.169764065114764e-05, + "loss": 0.273, + "step": 323 + }, + { + "epoch": 0.3, + "grad_norm": 0.4180085062980652, + "learning_rate": 8.158000811904778e-05, + "loss": 0.2496, + "step": 324 + }, + { + "epoch": 0.3, + "grad_norm": 0.41197821497917175, + "learning_rate": 8.146208406546053e-05, + "loss": 0.3327, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 0.4668791592121124, + "learning_rate": 8.134386957896688e-05, + "loss": 0.4066, + "step": 326 + }, + { + "epoch": 0.31, + "grad_norm": 0.4602564871311188, + "learning_rate": 8.122536575082882e-05, + "loss": 0.4185, + "step": 327 + }, + { + "epoch": 0.31, + "grad_norm": 0.4547500014305115, + "learning_rate": 8.110657367497933e-05, + "loss": 0.2798, + "step": 328 + }, + { + "epoch": 0.31, + "grad_norm": 0.34806668758392334, + "learning_rate": 8.098749444801224e-05, + "loss": 0.235, + "step": 329 + }, + { + "epoch": 0.31, + "grad_norm": 0.4281531870365143, + "learning_rate": 8.08681291691722e-05, + "loss": 0.3517, + "step": 330 + }, + { + "epoch": 0.31, + "grad_norm": 0.43151262402534485, + "learning_rate": 8.074847894034434e-05, + "loss": 0.2808, + "step": 331 + }, + { + "epoch": 0.31, + "grad_norm": 0.4350490868091583, + "learning_rate": 8.062854486604435e-05, + "loss": 0.2601, + "step": 332 + }, + { + "epoch": 0.31, + "grad_norm": 0.39234495162963867, + "learning_rate": 8.050832805340806e-05, + "loss": 0.2806, + "step": 333 + }, + { + "epoch": 0.31, + "grad_norm": 0.4032362103462219, + "learning_rate": 8.038782961218136e-05, + "loss": 0.3327, + "step": 334 + }, + { + "epoch": 0.31, + "grad_norm": 0.5096771717071533, + "learning_rate": 8.026705065470996e-05, + "loss": 0.3412, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 0.4058472514152527, + "learning_rate": 8.014599229592894e-05, + "loss": 0.3071, + "step": 336 + }, + { + "epoch": 0.32, + "grad_norm": 0.454199880361557, + "learning_rate": 8.002465565335271e-05, + "loss": 0.2554, + "step": 337 + }, + { + "epoch": 0.32, + "grad_norm": 0.4330787658691406, + "learning_rate": 7.990304184706455e-05, + "loss": 0.2798, + "step": 338 + }, + { + "epoch": 0.32, + "grad_norm": 0.42377325892448425, + "learning_rate": 7.978115199970621e-05, + "loss": 0.3761, + "step": 339 + }, + { + "epoch": 0.32, + "grad_norm": 0.3846011459827423, + "learning_rate": 7.965898723646776e-05, + "loss": 0.301, + "step": 340 + }, + { + "epoch": 0.32, + "grad_norm": 0.5546414256095886, + "learning_rate": 7.953654868507699e-05, + "loss": 0.4261, + "step": 341 + }, + { + "epoch": 0.32, + "grad_norm": 0.4243573248386383, + "learning_rate": 7.941383747578912e-05, + "loss": 0.338, + "step": 342 + }, + { + "epoch": 0.32, + "grad_norm": 0.41652941703796387, + "learning_rate": 7.929085474137629e-05, + "loss": 0.3427, + "step": 343 + }, + { + "epoch": 0.32, + "grad_norm": 0.36712539196014404, + "learning_rate": 7.91676016171172e-05, + "loss": 0.3195, + "step": 344 + }, + { + "epoch": 0.32, + "grad_norm": 0.4737776815891266, + "learning_rate": 7.904407924078654e-05, + "loss": 0.4586, + "step": 345 + }, + { + "epoch": 0.32, + "grad_norm": 0.4451066553592682, + "learning_rate": 7.892028875264451e-05, + "loss": 0.3344, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 0.3951125144958496, + "learning_rate": 7.879623129542633e-05, + "loss": 0.2924, + "step": 347 + }, + { + "epoch": 0.33, + "grad_norm": 0.42933401465415955, + "learning_rate": 7.867190801433166e-05, + "loss": 0.3728, + "step": 348 + }, + { + "epoch": 0.33, + "grad_norm": 0.3992808759212494, + "learning_rate": 7.854732005701402e-05, + "loss": 0.3446, + "step": 349 + }, + { + "epoch": 0.33, + "grad_norm": 0.39772018790245056, + "learning_rate": 7.842246857357023e-05, + "loss": 0.2887, + "step": 350 + }, + { + "epoch": 0.33, + "grad_norm": 0.33922725915908813, + "learning_rate": 7.829735471652978e-05, + "loss": 0.2267, + "step": 351 + }, + { + "epoch": 0.33, + "grad_norm": 0.4072892367839813, + "learning_rate": 7.817197964084411e-05, + "loss": 0.2689, + "step": 352 + }, + { + "epoch": 0.33, + "grad_norm": 0.32967281341552734, + "learning_rate": 7.804634450387616e-05, + "loss": 0.1847, + "step": 353 + }, + { + "epoch": 0.33, + "grad_norm": 0.42083194851875305, + "learning_rate": 7.792045046538941e-05, + "loss": 0.3112, + "step": 354 + }, + { + "epoch": 0.33, + "grad_norm": 0.8271592855453491, + "learning_rate": 7.77942986875374e-05, + "loss": 0.434, + "step": 355 + }, + { + "epoch": 0.33, + "grad_norm": 0.40372174978256226, + "learning_rate": 7.766789033485287e-05, + "loss": 0.3318, + "step": 356 + }, + { + "epoch": 0.33, + "grad_norm": 0.43399181962013245, + "learning_rate": 7.75412265742371e-05, + "loss": 0.3854, + "step": 357 + }, + { + "epoch": 0.34, + "grad_norm": 0.4727342128753662, + "learning_rate": 7.741430857494904e-05, + "loss": 0.318, + "step": 358 + }, + { + "epoch": 0.34, + "grad_norm": 0.43026426434516907, + "learning_rate": 7.728713750859458e-05, + "loss": 0.3445, + "step": 359 + }, + { + "epoch": 0.34, + "grad_norm": 0.4170074164867401, + "learning_rate": 7.715971454911577e-05, + "loss": 0.3448, + "step": 360 + }, + { + "epoch": 0.34, + "grad_norm": 0.44696909189224243, + "learning_rate": 7.703204087277988e-05, + "loss": 0.3512, + "step": 361 + }, + { + "epoch": 0.34, + "grad_norm": 0.36404502391815186, + "learning_rate": 7.690411765816864e-05, + "loss": 0.2587, + "step": 362 + }, + { + "epoch": 0.34, + "grad_norm": 0.4109265208244324, + "learning_rate": 7.677594608616729e-05, + "loss": 0.307, + "step": 363 + }, + { + "epoch": 0.34, + "grad_norm": 0.3944926857948303, + "learning_rate": 7.66475273399537e-05, + "loss": 0.2815, + "step": 364 + }, + { + "epoch": 0.34, + "grad_norm": 0.39314037561416626, + "learning_rate": 7.651886260498751e-05, + "loss": 0.269, + "step": 365 + }, + { + "epoch": 0.34, + "grad_norm": 0.4431683421134949, + "learning_rate": 7.638995306899908e-05, + "loss": 0.3221, + "step": 366 + }, + { + "epoch": 0.34, + "grad_norm": 0.3648792505264282, + "learning_rate": 7.626079992197857e-05, + "loss": 0.278, + "step": 367 + }, + { + "epoch": 0.35, + "grad_norm": 0.42419299483299255, + "learning_rate": 7.613140435616503e-05, + "loss": 0.3199, + "step": 368 + }, + { + "epoch": 0.35, + "grad_norm": 0.4009580612182617, + "learning_rate": 7.600176756603525e-05, + "loss": 0.2531, + "step": 369 + }, + { + "epoch": 0.35, + "grad_norm": 0.3897629380226135, + "learning_rate": 7.587189074829284e-05, + "loss": 0.3039, + "step": 370 + }, + { + "epoch": 0.35, + "grad_norm": 0.33926069736480713, + "learning_rate": 7.57417751018572e-05, + "loss": 0.2319, + "step": 371 + }, + { + "epoch": 0.35, + "grad_norm": 0.37864574790000916, + "learning_rate": 7.561142182785233e-05, + "loss": 0.2429, + "step": 372 + }, + { + "epoch": 0.35, + "grad_norm": 0.4201953113079071, + "learning_rate": 7.548083212959588e-05, + "loss": 0.3103, + "step": 373 + }, + { + "epoch": 0.35, + "grad_norm": 0.44398027658462524, + "learning_rate": 7.535000721258791e-05, + "loss": 0.2644, + "step": 374 + }, + { + "epoch": 0.35, + "grad_norm": 0.43067774176597595, + "learning_rate": 7.521894828449994e-05, + "loss": 0.3685, + "step": 375 + }, + { + "epoch": 0.35, + "grad_norm": 0.4563278555870056, + "learning_rate": 7.508765655516358e-05, + "loss": 0.3404, + "step": 376 + }, + { + "epoch": 0.35, + "grad_norm": 0.3916566073894501, + "learning_rate": 7.495613323655953e-05, + "loss": 0.3062, + "step": 377 + }, + { + "epoch": 0.35, + "grad_norm": 0.4126124680042267, + "learning_rate": 7.482437954280635e-05, + "loss": 0.2608, + "step": 378 + }, + { + "epoch": 0.36, + "grad_norm": 0.44294944405555725, + "learning_rate": 7.469239669014923e-05, + "loss": 0.3678, + "step": 379 + }, + { + "epoch": 0.36, + "grad_norm": 0.4036175608634949, + "learning_rate": 7.456018589694873e-05, + "loss": 0.2877, + "step": 380 + }, + { + "epoch": 0.36, + "grad_norm": 0.5387861728668213, + "learning_rate": 7.442774838366965e-05, + "loss": 0.3948, + "step": 381 + }, + { + "epoch": 0.36, + "grad_norm": 0.28586551547050476, + "learning_rate": 7.429508537286963e-05, + "loss": 0.1913, + "step": 382 + }, + { + "epoch": 0.36, + "grad_norm": 0.43656590580940247, + "learning_rate": 7.416219808918794e-05, + "loss": 0.3534, + "step": 383 + }, + { + "epoch": 0.36, + "grad_norm": 0.4448322653770447, + "learning_rate": 7.402908775933419e-05, + "loss": 0.3686, + "step": 384 + }, + { + "epoch": 0.36, + "grad_norm": 0.45444345474243164, + "learning_rate": 7.389575561207692e-05, + "loss": 0.4593, + "step": 385 + }, + { + "epoch": 0.36, + "grad_norm": 0.4999052882194519, + "learning_rate": 7.376220287823236e-05, + "loss": 0.3738, + "step": 386 + }, + { + "epoch": 0.36, + "grad_norm": 0.4560929536819458, + "learning_rate": 7.3628430790653e-05, + "loss": 0.3286, + "step": 387 + }, + { + "epoch": 0.36, + "grad_norm": 0.4010728597640991, + "learning_rate": 7.349444058421619e-05, + "loss": 0.2886, + "step": 388 + }, + { + "epoch": 0.36, + "grad_norm": 0.39646634459495544, + "learning_rate": 7.336023349581287e-05, + "loss": 0.2403, + "step": 389 + }, + { + "epoch": 0.37, + "grad_norm": 0.36710843443870544, + "learning_rate": 7.322581076433596e-05, + "loss": 0.2863, + "step": 390 + }, + { + "epoch": 0.37, + "grad_norm": 0.49684014916419983, + "learning_rate": 7.309117363066912e-05, + "loss": 0.4096, + "step": 391 + }, + { + "epoch": 0.37, + "grad_norm": 0.42137226462364197, + "learning_rate": 7.295632333767513e-05, + "loss": 0.3551, + "step": 392 + }, + { + "epoch": 0.37, + "grad_norm": 0.3678036630153656, + "learning_rate": 7.28212611301845e-05, + "loss": 0.2341, + "step": 393 + }, + { + "epoch": 0.37, + "grad_norm": 0.43836355209350586, + "learning_rate": 7.2685988254984e-05, + "loss": 0.364, + "step": 394 + }, + { + "epoch": 0.37, + "grad_norm": 0.4423292279243469, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3527, + "step": 395 + }, + { + "epoch": 0.37, + "grad_norm": 0.2825776934623718, + "learning_rate": 7.241481549831243e-05, + "loss": 0.1635, + "step": 396 + }, + { + "epoch": 0.37, + "grad_norm": 0.4246549606323242, + "learning_rate": 7.22789181200923e-05, + "loss": 0.3436, + "step": 397 + }, + { + "epoch": 0.37, + "grad_norm": 0.4510544240474701, + "learning_rate": 7.214281508064107e-05, + "loss": 0.3554, + "step": 398 + }, + { + "epoch": 0.37, + "grad_norm": 0.36521950364112854, + "learning_rate": 7.200650763635366e-05, + "loss": 0.2404, + "step": 399 + }, + { + "epoch": 0.38, + "grad_norm": 0.45220625400543213, + "learning_rate": 7.186999704551181e-05, + "loss": 0.3145, + "step": 400 + }, + { + "epoch": 0.38, + "grad_norm": 0.44834861159324646, + "learning_rate": 7.173328456827263e-05, + "loss": 0.3142, + "step": 401 + }, + { + "epoch": 0.38, + "grad_norm": 0.31816065311431885, + "learning_rate": 7.15963714666568e-05, + "loss": 0.1982, + "step": 402 + }, + { + "epoch": 0.38, + "grad_norm": 0.424333781003952, + "learning_rate": 7.145925900453709e-05, + "loss": 0.3539, + "step": 403 + }, + { + "epoch": 0.38, + "grad_norm": 0.3757897913455963, + "learning_rate": 7.132194844762654e-05, + "loss": 0.2778, + "step": 404 + }, + { + "epoch": 0.38, + "grad_norm": 0.4314412474632263, + "learning_rate": 7.118444106346687e-05, + "loss": 0.3578, + "step": 405 + }, + { + "epoch": 0.38, + "grad_norm": 0.42340290546417236, + "learning_rate": 7.104673812141675e-05, + "loss": 0.3659, + "step": 406 + }, + { + "epoch": 0.38, + "grad_norm": 0.4033401906490326, + "learning_rate": 7.090884089264011e-05, + "loss": 0.2682, + "step": 407 + }, + { + "epoch": 0.38, + "grad_norm": 0.35764697194099426, + "learning_rate": 7.077075065009433e-05, + "loss": 0.231, + "step": 408 + }, + { + "epoch": 0.38, + "grad_norm": 0.43582141399383545, + "learning_rate": 7.063246866851858e-05, + "loss": 0.2786, + "step": 409 + }, + { + "epoch": 0.38, + "grad_norm": 0.529912531375885, + "learning_rate": 7.049399622442198e-05, + "loss": 0.3742, + "step": 410 + }, + { + "epoch": 0.39, + "grad_norm": 0.4656778872013092, + "learning_rate": 7.035533459607189e-05, + "loss": 0.3333, + "step": 411 + }, + { + "epoch": 0.39, + "grad_norm": 0.40634432435035706, + "learning_rate": 7.021648506348204e-05, + "loss": 0.3145, + "step": 412 + }, + { + "epoch": 0.39, + "grad_norm": 0.4188442826271057, + "learning_rate": 7.007744890840073e-05, + "loss": 0.2721, + "step": 413 + }, + { + "epoch": 0.39, + "grad_norm": 0.44638150930404663, + "learning_rate": 6.993822741429907e-05, + "loss": 0.3794, + "step": 414 + }, + { + "epoch": 0.39, + "grad_norm": 0.36345523595809937, + "learning_rate": 6.979882186635897e-05, + "loss": 0.2157, + "step": 415 + }, + { + "epoch": 0.39, + "grad_norm": 0.45502936840057373, + "learning_rate": 6.965923355146147e-05, + "loss": 0.3469, + "step": 416 + }, + { + "epoch": 0.39, + "grad_norm": 0.37767454981803894, + "learning_rate": 6.951946375817474e-05, + "loss": 0.2824, + "step": 417 + }, + { + "epoch": 0.39, + "grad_norm": 0.4633556008338928, + "learning_rate": 6.937951377674221e-05, + "loss": 0.2924, + "step": 418 + }, + { + "epoch": 0.39, + "grad_norm": 0.43885359168052673, + "learning_rate": 6.923938489907066e-05, + "loss": 0.3259, + "step": 419 + }, + { + "epoch": 0.39, + "grad_norm": 0.4263167083263397, + "learning_rate": 6.909907841871829e-05, + "loss": 0.2902, + "step": 420 + }, + { + "epoch": 0.39, + "grad_norm": 0.42423805594444275, + "learning_rate": 6.895859563088283e-05, + "loss": 0.3641, + "step": 421 + }, + { + "epoch": 0.4, + "grad_norm": 0.3689592778682709, + "learning_rate": 6.881793783238948e-05, + "loss": 0.2492, + "step": 422 + }, + { + "epoch": 0.4, + "grad_norm": 0.47483524680137634, + "learning_rate": 6.867710632167903e-05, + "loss": 0.3558, + "step": 423 + }, + { + "epoch": 0.4, + "grad_norm": 0.36466506123542786, + "learning_rate": 6.853610239879586e-05, + "loss": 0.2739, + "step": 424 + }, + { + "epoch": 0.4, + "grad_norm": 0.35030409693717957, + "learning_rate": 6.839492736537588e-05, + "loss": 0.1946, + "step": 425 + }, + { + "epoch": 0.4, + "grad_norm": 0.46116241812705994, + "learning_rate": 6.82535825246346e-05, + "loss": 0.2689, + "step": 426 + }, + { + "epoch": 0.4, + "grad_norm": 0.47451573610305786, + "learning_rate": 6.811206918135502e-05, + "loss": 0.3971, + "step": 427 + }, + { + "epoch": 0.4, + "grad_norm": 0.5025789141654968, + "learning_rate": 6.797038864187564e-05, + "loss": 0.351, + "step": 428 + }, + { + "epoch": 0.4, + "grad_norm": 0.37761175632476807, + "learning_rate": 6.782854221407838e-05, + "loss": 0.2974, + "step": 429 + }, + { + "epoch": 0.4, + "grad_norm": 0.44670093059539795, + "learning_rate": 6.768653120737652e-05, + "loss": 0.3839, + "step": 430 + }, + { + "epoch": 0.4, + "grad_norm": 0.4108104109764099, + "learning_rate": 6.754435693270258e-05, + "loss": 0.2815, + "step": 431 + }, + { + "epoch": 0.41, + "grad_norm": 0.45673632621765137, + "learning_rate": 6.740202070249623e-05, + "loss": 0.3454, + "step": 432 + }, + { + "epoch": 0.41, + "grad_norm": 0.4945111572742462, + "learning_rate": 6.725952383069222e-05, + "loss": 0.4538, + "step": 433 + }, + { + "epoch": 0.41, + "grad_norm": 0.43138283491134644, + "learning_rate": 6.711686763270818e-05, + "loss": 0.3702, + "step": 434 + }, + { + "epoch": 0.41, + "grad_norm": 0.4254596531391144, + "learning_rate": 6.697405342543258e-05, + "loss": 0.3817, + "step": 435 + }, + { + "epoch": 0.41, + "grad_norm": 0.36549097299575806, + "learning_rate": 6.683108252721238e-05, + "loss": 0.3039, + "step": 436 + }, + { + "epoch": 0.41, + "grad_norm": 0.4733044505119324, + "learning_rate": 6.668795625784113e-05, + "loss": 0.4458, + "step": 437 + }, + { + "epoch": 0.41, + "grad_norm": 0.35971495509147644, + "learning_rate": 6.654467593854657e-05, + "loss": 0.2318, + "step": 438 + }, + { + "epoch": 0.41, + "grad_norm": 0.4079780578613281, + "learning_rate": 6.640124289197845e-05, + "loss": 0.3086, + "step": 439 + }, + { + "epoch": 0.41, + "grad_norm": 0.42413631081581116, + "learning_rate": 6.625765844219652e-05, + "loss": 0.3687, + "step": 440 + }, + { + "epoch": 0.41, + "grad_norm": 0.39297375082969666, + "learning_rate": 6.611392391465802e-05, + "loss": 0.297, + "step": 441 + }, + { + "epoch": 0.41, + "grad_norm": 0.4315466284751892, + "learning_rate": 6.597004063620567e-05, + "loss": 0.3317, + "step": 442 + }, + { + "epoch": 0.42, + "grad_norm": 0.29865145683288574, + "learning_rate": 6.582600993505534e-05, + "loss": 0.1866, + "step": 443 + }, + { + "epoch": 0.42, + "grad_norm": 0.47077304124832153, + "learning_rate": 6.568183314078376e-05, + "loss": 0.4135, + "step": 444 + }, + { + "epoch": 0.42, + "grad_norm": 0.3485214412212372, + "learning_rate": 6.553751158431627e-05, + "loss": 0.2308, + "step": 445 + }, + { + "epoch": 0.42, + "grad_norm": 0.3782065808773041, + "learning_rate": 6.539304659791456e-05, + "loss": 0.2667, + "step": 446 + }, + { + "epoch": 0.42, + "grad_norm": 0.3845309019088745, + "learning_rate": 6.524843951516434e-05, + "loss": 0.2734, + "step": 447 + }, + { + "epoch": 0.42, + "grad_norm": 0.4229350686073303, + "learning_rate": 6.510369167096308e-05, + "loss": 0.2934, + "step": 448 + }, + { + "epoch": 0.42, + "grad_norm": 0.3936381936073303, + "learning_rate": 6.495880440150756e-05, + "loss": 0.2677, + "step": 449 + }, + { + "epoch": 0.42, + "grad_norm": 0.39204105734825134, + "learning_rate": 6.481377904428171e-05, + "loss": 0.26, + "step": 450 + }, + { + "epoch": 0.42, + "grad_norm": 0.41730165481567383, + "learning_rate": 6.466861693804413e-05, + "loss": 0.3263, + "step": 451 + }, + { + "epoch": 0.42, + "grad_norm": 0.4841783344745636, + "learning_rate": 6.45233194228158e-05, + "loss": 0.4078, + "step": 452 + }, + { + "epoch": 0.42, + "grad_norm": 0.43934983015060425, + "learning_rate": 6.437788783986766e-05, + "loss": 0.2471, + "step": 453 + }, + { + "epoch": 0.43, + "grad_norm": 0.43099477887153625, + "learning_rate": 6.42323235317083e-05, + "loss": 0.3084, + "step": 454 + }, + { + "epoch": 0.43, + "grad_norm": 0.4412413537502289, + "learning_rate": 6.408662784207149e-05, + "loss": 0.291, + "step": 455 + }, + { + "epoch": 0.43, + "grad_norm": 0.4185587763786316, + "learning_rate": 6.394080211590381e-05, + "loss": 0.2864, + "step": 456 + }, + { + "epoch": 0.43, + "grad_norm": 0.35048070549964905, + "learning_rate": 6.379484769935223e-05, + "loss": 0.2278, + "step": 457 + }, + { + "epoch": 0.43, + "grad_norm": 0.4162713587284088, + "learning_rate": 6.364876593975173e-05, + "loss": 0.2714, + "step": 458 + }, + { + "epoch": 0.43, + "grad_norm": 0.34501340985298157, + "learning_rate": 6.350255818561277e-05, + "loss": 0.2115, + "step": 459 + }, + { + "epoch": 0.43, + "grad_norm": 0.4311944246292114, + "learning_rate": 6.335622578660889e-05, + "loss": 0.3105, + "step": 460 + }, + { + "epoch": 0.43, + "grad_norm": 0.3668000102043152, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2445, + "step": 461 + }, + { + "epoch": 0.43, + "grad_norm": 0.49384593963623047, + "learning_rate": 6.306319245844133e-05, + "loss": 0.3664, + "step": 462 + }, + { + "epoch": 0.43, + "grad_norm": 0.4154500663280487, + "learning_rate": 6.291649423432799e-05, + "loss": 0.2472, + "step": 463 + }, + { + "epoch": 0.44, + "grad_norm": 0.3742479681968689, + "learning_rate": 6.276967677542542e-05, + "loss": 0.2682, + "step": 464 + }, + { + "epoch": 0.44, + "grad_norm": 0.41379591822624207, + "learning_rate": 6.262274143703554e-05, + "loss": 0.3027, + "step": 465 + }, + { + "epoch": 0.44, + "grad_norm": 0.42273882031440735, + "learning_rate": 6.24756895755484e-05, + "loss": 0.2976, + "step": 466 + }, + { + "epoch": 0.44, + "grad_norm": 0.36084771156311035, + "learning_rate": 6.232852254842962e-05, + "loss": 0.2578, + "step": 467 + }, + { + "epoch": 0.44, + "grad_norm": 0.4634334444999695, + "learning_rate": 6.218124171420806e-05, + "loss": 0.3502, + "step": 468 + }, + { + "epoch": 0.44, + "grad_norm": 0.3936538100242615, + "learning_rate": 6.203384843246307e-05, + "loss": 0.3405, + "step": 469 + }, + { + "epoch": 0.44, + "grad_norm": 0.3866216838359833, + "learning_rate": 6.188634406381207e-05, + "loss": 0.33, + "step": 470 + }, + { + "epoch": 0.44, + "grad_norm": 0.48050788044929504, + "learning_rate": 6.173872996989793e-05, + "loss": 0.3355, + "step": 471 + }, + { + "epoch": 0.44, + "grad_norm": 0.3211848735809326, + "learning_rate": 6.159100751337642e-05, + "loss": 0.204, + "step": 472 + }, + { + "epoch": 0.44, + "grad_norm": 0.3827769160270691, + "learning_rate": 6.144317805790361e-05, + "loss": 0.2832, + "step": 473 + }, + { + "epoch": 0.44, + "grad_norm": 0.3828567564487457, + "learning_rate": 6.129524296812335e-05, + "loss": 0.252, + "step": 474 + }, + { + "epoch": 0.45, + "grad_norm": 0.3291488289833069, + "learning_rate": 6.114720360965453e-05, + "loss": 0.1864, + "step": 475 + }, + { + "epoch": 0.45, + "grad_norm": 0.4140992760658264, + "learning_rate": 6.099906134907868e-05, + "loss": 0.2806, + "step": 476 + }, + { + "epoch": 0.45, + "grad_norm": 0.436565101146698, + "learning_rate": 6.085081755392714e-05, + "loss": 0.3529, + "step": 477 + }, + { + "epoch": 0.45, + "grad_norm": 0.4582252502441406, + "learning_rate": 6.07024735926686e-05, + "loss": 0.3275, + "step": 478 + }, + { + "epoch": 0.45, + "grad_norm": 0.4374159276485443, + "learning_rate": 6.055403083469637e-05, + "loss": 0.3755, + "step": 479 + }, + { + "epoch": 0.45, + "grad_norm": 0.489577978849411, + "learning_rate": 6.04054906503158e-05, + "loss": 0.408, + "step": 480 + }, + { + "epoch": 0.45, + "grad_norm": 0.46644705533981323, + "learning_rate": 6.0256854410731565e-05, + "loss": 0.3451, + "step": 481 + }, + { + "epoch": 0.45, + "grad_norm": 0.46568021178245544, + "learning_rate": 6.010812348803509e-05, + "loss": 0.3739, + "step": 482 + }, + { + "epoch": 0.45, + "grad_norm": 0.45645302534103394, + "learning_rate": 5.99592992551918e-05, + "loss": 0.4199, + "step": 483 + }, + { + "epoch": 0.45, + "grad_norm": 0.416567325592041, + "learning_rate": 5.9810383086028535e-05, + "loss": 0.3734, + "step": 484 + }, + { + "epoch": 0.45, + "grad_norm": 0.3670683801174164, + "learning_rate": 5.9661376355220734e-05, + "loss": 0.2927, + "step": 485 + }, + { + "epoch": 0.46, + "grad_norm": 0.47614961862564087, + "learning_rate": 5.9512280438279914e-05, + "loss": 0.414, + "step": 486 + }, + { + "epoch": 0.46, + "grad_norm": 0.3908349871635437, + "learning_rate": 5.936309671154084e-05, + "loss": 0.337, + "step": 487 + }, + { + "epoch": 0.46, + "grad_norm": 0.4337731599807739, + "learning_rate": 5.9213826552148886e-05, + "loss": 0.3428, + "step": 488 + }, + { + "epoch": 0.46, + "grad_norm": 0.40524789690971375, + "learning_rate": 5.906447133804731e-05, + "loss": 0.3534, + "step": 489 + }, + { + "epoch": 0.46, + "grad_norm": 0.3826107382774353, + "learning_rate": 5.891503244796448e-05, + "loss": 0.2569, + "step": 490 + }, + { + "epoch": 0.46, + "grad_norm": 0.3548087775707245, + "learning_rate": 5.8765511261401254e-05, + "loss": 0.2976, + "step": 491 + }, + { + "epoch": 0.46, + "grad_norm": 0.3621930778026581, + "learning_rate": 5.861590915861817e-05, + "loss": 0.2355, + "step": 492 + }, + { + "epoch": 0.46, + "grad_norm": 0.3902803361415863, + "learning_rate": 5.846622752062268e-05, + "loss": 0.2642, + "step": 493 + }, + { + "epoch": 0.46, + "grad_norm": 0.4269382059574127, + "learning_rate": 5.831646772915651e-05, + "loss": 0.3481, + "step": 494 + }, + { + "epoch": 0.46, + "grad_norm": 0.4209762513637543, + "learning_rate": 5.816663116668276e-05, + "loss": 0.303, + "step": 495 + }, + { + "epoch": 0.47, + "grad_norm": 0.39953798055648804, + "learning_rate": 5.801671921637328e-05, + "loss": 0.3168, + "step": 496 + }, + { + "epoch": 0.47, + "grad_norm": 0.4956504702568054, + "learning_rate": 5.786673326209584e-05, + "loss": 0.404, + "step": 497 + }, + { + "epoch": 0.47, + "grad_norm": 0.38132691383361816, + "learning_rate": 5.7716674688401286e-05, + "loss": 0.2658, + "step": 498 + }, + { + "epoch": 0.47, + "grad_norm": 0.4979095458984375, + "learning_rate": 5.756654488051091e-05, + "loss": 0.391, + "step": 499 + }, + { + "epoch": 0.47, + "grad_norm": 0.4013850688934326, + "learning_rate": 5.7416345224303524e-05, + "loss": 0.2979, + "step": 500 + }, + { + "epoch": 0.47, + "grad_norm": 0.3869968056678772, + "learning_rate": 5.7266077106302785e-05, + "loss": 0.2835, + "step": 501 + }, + { + "epoch": 0.47, + "grad_norm": 0.3484272062778473, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.2679, + "step": 502 + }, + { + "epoch": 0.47, + "grad_norm": 0.3580166697502136, + "learning_rate": 5.696534103416276e-05, + "loss": 0.2667, + "step": 503 + }, + { + "epoch": 0.47, + "grad_norm": 0.4448302090167999, + "learning_rate": 5.6814875856179414e-05, + "loss": 0.3617, + "step": 504 + }, + { + "epoch": 0.47, + "grad_norm": 0.5003171563148499, + "learning_rate": 5.666434776868895e-05, + "loss": 0.3631, + "step": 505 + }, + { + "epoch": 0.47, + "grad_norm": 0.46512144804000854, + "learning_rate": 5.651375816124679e-05, + "loss": 0.3196, + "step": 506 + }, + { + "epoch": 0.48, + "grad_norm": 0.4460465610027313, + "learning_rate": 5.636310842397629e-05, + "loss": 0.3175, + "step": 507 + }, + { + "epoch": 0.48, + "grad_norm": 0.43357494473457336, + "learning_rate": 5.621239994755583e-05, + "loss": 0.3122, + "step": 508 + }, + { + "epoch": 0.48, + "grad_norm": 0.4276319146156311, + "learning_rate": 5.606163412320608e-05, + "loss": 0.2931, + "step": 509 + }, + { + "epoch": 0.48, + "grad_norm": 0.4117043614387512, + "learning_rate": 5.5910812342677065e-05, + "loss": 0.2695, + "step": 510 + }, + { + "epoch": 0.48, + "grad_norm": 0.3576218783855438, + "learning_rate": 5.575993599823536e-05, + "loss": 0.2507, + "step": 511 + }, + { + "epoch": 0.48, + "grad_norm": 0.35004672408103943, + "learning_rate": 5.560900648265124e-05, + "loss": 0.1801, + "step": 512 + }, + { + "epoch": 0.48, + "grad_norm": 0.49767759442329407, + "learning_rate": 5.545802518918579e-05, + "loss": 0.4032, + "step": 513 + }, + { + "epoch": 0.48, + "grad_norm": 0.5268928408622742, + "learning_rate": 5.5306993511578096e-05, + "loss": 0.4158, + "step": 514 + }, + { + "epoch": 0.48, + "grad_norm": 0.46446239948272705, + "learning_rate": 5.515591284403234e-05, + "loss": 0.3088, + "step": 515 + }, + { + "epoch": 0.48, + "grad_norm": 0.39802390336990356, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.301, + "step": 516 + }, + { + "epoch": 0.48, + "grad_norm": 0.3617722690105438, + "learning_rate": 5.485361011819164e-05, + "loss": 0.2405, + "step": 517 + }, + { + "epoch": 0.49, + "grad_norm": 0.4040432274341583, + "learning_rate": 5.4702390850514726e-05, + "loss": 0.2692, + "step": 518 + }, + { + "epoch": 0.49, + "grad_norm": 0.41277679800987244, + "learning_rate": 5.455112817411006e-05, + "loss": 0.3128, + "step": 519 + }, + { + "epoch": 0.49, + "grad_norm": 0.41128095984458923, + "learning_rate": 5.4399823485314226e-05, + "loss": 0.2589, + "step": 520 + }, + { + "epoch": 0.49, + "grad_norm": 0.44558587670326233, + "learning_rate": 5.4248478180851604e-05, + "loss": 0.3861, + "step": 521 + }, + { + "epoch": 0.49, + "grad_norm": 0.36561065912246704, + "learning_rate": 5.409709365782154e-05, + "loss": 0.2454, + "step": 522 + }, + { + "epoch": 0.49, + "grad_norm": 0.46217024326324463, + "learning_rate": 5.3945671313685386e-05, + "loss": 0.3887, + "step": 523 + }, + { + "epoch": 0.49, + "grad_norm": 0.37300512194633484, + "learning_rate": 5.379421254625366e-05, + "loss": 0.2722, + "step": 524 + }, + { + "epoch": 0.49, + "grad_norm": 0.39892271161079407, + "learning_rate": 5.364271875367311e-05, + "loss": 0.2965, + "step": 525 + }, + { + "epoch": 0.49, + "grad_norm": 0.43289822340011597, + "learning_rate": 5.3491191334413746e-05, + "loss": 0.3766, + "step": 526 + }, + { + "epoch": 0.49, + "grad_norm": 0.37322935461997986, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.2401, + "step": 527 + }, + { + "epoch": 0.5, + "grad_norm": 0.401250422000885, + "learning_rate": 5.318804121127807e-05, + "loss": 0.2671, + "step": 528 + }, + { + "epoch": 0.5, + "grad_norm": 0.45596757531166077, + "learning_rate": 5.3036421305842276e-05, + "loss": 0.3871, + "step": 529 + }, + { + "epoch": 0.5, + "grad_norm": 0.35917335748672485, + "learning_rate": 5.288477337058293e-05, + "loss": 0.2398, + "step": 530 + }, + { + "epoch": 0.5, + "grad_norm": 0.4593126177787781, + "learning_rate": 5.273309880539301e-05, + "loss": 0.3669, + "step": 531 + }, + { + "epoch": 0.5, + "grad_norm": 0.5227428078651428, + "learning_rate": 5.258139901041132e-05, + "loss": 0.3048, + "step": 532 + }, + { + "epoch": 0.5, + "grad_norm": 0.31588104367256165, + "learning_rate": 5.242967538600957e-05, + "loss": 0.1823, + "step": 533 + }, + { + "epoch": 0.5, + "grad_norm": 0.4211719036102295, + "learning_rate": 5.227792933277943e-05, + "loss": 0.3316, + "step": 534 + }, + { + "epoch": 0.5, + "grad_norm": 0.3306739926338196, + "learning_rate": 5.212616225151965e-05, + "loss": 0.2222, + "step": 535 + }, + { + "epoch": 0.5, + "grad_norm": 0.43753111362457275, + "learning_rate": 5.197437554322304e-05, + "loss": 0.4076, + "step": 536 + }, + { + "epoch": 0.5, + "grad_norm": 0.4082089364528656, + "learning_rate": 5.182257060906365e-05, + "loss": 0.2929, + "step": 537 + }, + { + "epoch": 0.5, + "grad_norm": 0.4506302773952484, + "learning_rate": 5.167074885038373e-05, + "loss": 0.3252, + "step": 538 + }, + { + "epoch": 0.51, + "grad_norm": 0.35362350940704346, + "learning_rate": 5.151891166868086e-05, + "loss": 0.2435, + "step": 539 + }, + { + "epoch": 0.51, + "grad_norm": 0.5236488580703735, + "learning_rate": 5.1367060465595006e-05, + "loss": 0.4728, + "step": 540 + }, + { + "epoch": 0.51, + "grad_norm": 0.3949301242828369, + "learning_rate": 5.121519664289553e-05, + "loss": 0.2987, + "step": 541 + }, + { + "epoch": 0.51, + "grad_norm": 0.4690653383731842, + "learning_rate": 5.106332160246834e-05, + "loss": 0.409, + "step": 542 + }, + { + "epoch": 0.51, + "grad_norm": 0.3642612099647522, + "learning_rate": 5.0911436746302834e-05, + "loss": 0.2921, + "step": 543 + }, + { + "epoch": 0.51, + "grad_norm": 0.372464120388031, + "learning_rate": 5.075954347647909e-05, + "loss": 0.2481, + "step": 544 + }, + { + "epoch": 0.51, + "grad_norm": 0.37555721402168274, + "learning_rate": 5.0607643195154796e-05, + "loss": 0.2836, + "step": 545 + }, + { + "epoch": 0.51, + "grad_norm": 0.3466416001319885, + "learning_rate": 5.045573730455241e-05, + "loss": 0.2854, + "step": 546 + }, + { + "epoch": 0.51, + "grad_norm": 0.3371579349040985, + "learning_rate": 5.030382720694612e-05, + "loss": 0.2514, + "step": 547 + }, + { + "epoch": 0.51, + "grad_norm": 0.42777353525161743, + "learning_rate": 5.0151914304649015e-05, + "loss": 0.3554, + "step": 548 + }, + { + "epoch": 0.51, + "grad_norm": 0.34071090817451477, + "learning_rate": 5e-05, + "loss": 0.2285, + "step": 549 + }, + { + "epoch": 0.52, + "grad_norm": 0.43033647537231445, + "learning_rate": 4.984808569535101e-05, + "loss": 0.3356, + "step": 550 + }, + { + "epoch": 0.52, + "grad_norm": 0.40992996096611023, + "learning_rate": 4.969617279305388e-05, + "loss": 0.3154, + "step": 551 + }, + { + "epoch": 0.52, + "grad_norm": 0.3767620623111725, + "learning_rate": 4.954426269544761e-05, + "loss": 0.2635, + "step": 552 + }, + { + "epoch": 0.52, + "grad_norm": 0.40804165601730347, + "learning_rate": 4.939235680484522e-05, + "loss": 0.3069, + "step": 553 + }, + { + "epoch": 0.52, + "grad_norm": 0.3589411675930023, + "learning_rate": 4.924045652352092e-05, + "loss": 0.24, + "step": 554 + }, + { + "epoch": 0.52, + "grad_norm": 0.38028955459594727, + "learning_rate": 4.908856325369718e-05, + "loss": 0.2939, + "step": 555 + }, + { + "epoch": 0.52, + "grad_norm": 0.3343428671360016, + "learning_rate": 4.893667839753168e-05, + "loss": 0.1953, + "step": 556 + }, + { + "epoch": 0.52, + "grad_norm": 0.4250152111053467, + "learning_rate": 4.878480335710448e-05, + "loss": 0.357, + "step": 557 + }, + { + "epoch": 0.52, + "grad_norm": 0.2950625419616699, + "learning_rate": 4.8632939534405006e-05, + "loss": 0.1712, + "step": 558 + }, + { + "epoch": 0.52, + "grad_norm": 0.3887781798839569, + "learning_rate": 4.8481088331319146e-05, + "loss": 0.3143, + "step": 559 + }, + { + "epoch": 0.53, + "grad_norm": 0.35813620686531067, + "learning_rate": 4.832925114961629e-05, + "loss": 0.2471, + "step": 560 + }, + { + "epoch": 0.53, + "grad_norm": 0.4876914322376251, + "learning_rate": 4.817742939093635e-05, + "loss": 0.4083, + "step": 561 + }, + { + "epoch": 0.53, + "grad_norm": 0.43107870221138, + "learning_rate": 4.8025624456776966e-05, + "loss": 0.273, + "step": 562 + }, + { + "epoch": 0.53, + "grad_norm": 0.3670273423194885, + "learning_rate": 4.787383774848037e-05, + "loss": 0.2949, + "step": 563 + }, + { + "epoch": 0.53, + "grad_norm": 0.47010329365730286, + "learning_rate": 4.772207066722057e-05, + "loss": 0.3449, + "step": 564 + }, + { + "epoch": 0.53, + "grad_norm": 0.4678891897201538, + "learning_rate": 4.757032461399044e-05, + "loss": 0.3232, + "step": 565 + }, + { + "epoch": 0.53, + "grad_norm": 0.38481488823890686, + "learning_rate": 4.7418600989588694e-05, + "loss": 0.2349, + "step": 566 + }, + { + "epoch": 0.53, + "grad_norm": 0.3220939636230469, + "learning_rate": 4.726690119460701e-05, + "loss": 0.191, + "step": 567 + }, + { + "epoch": 0.53, + "grad_norm": 0.39618757367134094, + "learning_rate": 4.7115226629417075e-05, + "loss": 0.3071, + "step": 568 + }, + { + "epoch": 0.53, + "grad_norm": 0.4422629773616791, + "learning_rate": 4.6963578694157736e-05, + "loss": 0.2635, + "step": 569 + }, + { + "epoch": 0.53, + "grad_norm": 0.4440760314464569, + "learning_rate": 4.681195878872194e-05, + "loss": 0.3334, + "step": 570 + }, + { + "epoch": 0.54, + "grad_norm": 0.3999294340610504, + "learning_rate": 4.666036831274392e-05, + "loss": 0.2625, + "step": 571 + }, + { + "epoch": 0.54, + "grad_norm": 0.43602943420410156, + "learning_rate": 4.6508808665586265e-05, + "loss": 0.3042, + "step": 572 + }, + { + "epoch": 0.54, + "grad_norm": 0.33296841382980347, + "learning_rate": 4.635728124632692e-05, + "loss": 0.2561, + "step": 573 + }, + { + "epoch": 0.54, + "grad_norm": 0.43026474118232727, + "learning_rate": 4.6205787453746336e-05, + "loss": 0.284, + "step": 574 + }, + { + "epoch": 0.54, + "grad_norm": 0.34284353256225586, + "learning_rate": 4.605432868631462e-05, + "loss": 0.2312, + "step": 575 + }, + { + "epoch": 0.54, + "grad_norm": 0.5019268989562988, + "learning_rate": 4.590290634217848e-05, + "loss": 0.3643, + "step": 576 + }, + { + "epoch": 0.54, + "grad_norm": 0.4448760151863098, + "learning_rate": 4.57515218191484e-05, + "loss": 0.3368, + "step": 577 + }, + { + "epoch": 0.54, + "grad_norm": 0.3717033863067627, + "learning_rate": 4.5600176514685786e-05, + "loss": 0.2658, + "step": 578 + }, + { + "epoch": 0.54, + "grad_norm": 0.4158264994621277, + "learning_rate": 4.5448871825889946e-05, + "loss": 0.3105, + "step": 579 + }, + { + "epoch": 0.54, + "grad_norm": 0.4152980446815491, + "learning_rate": 4.52976091494853e-05, + "loss": 0.2573, + "step": 580 + }, + { + "epoch": 0.54, + "grad_norm": 0.3900246322154999, + "learning_rate": 4.514638988180837e-05, + "loss": 0.2733, + "step": 581 + }, + { + "epoch": 0.55, + "grad_norm": 0.4747683107852936, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.3899, + "step": 582 + }, + { + "epoch": 0.55, + "grad_norm": 0.4682185649871826, + "learning_rate": 4.484408715596768e-05, + "loss": 0.301, + "step": 583 + }, + { + "epoch": 0.55, + "grad_norm": 0.41452059149742126, + "learning_rate": 4.4693006488421915e-05, + "loss": 0.2765, + "step": 584 + }, + { + "epoch": 0.55, + "grad_norm": 0.4349689185619354, + "learning_rate": 4.454197481081422e-05, + "loss": 0.2998, + "step": 585 + }, + { + "epoch": 0.55, + "grad_norm": 0.39544400572776794, + "learning_rate": 4.439099351734878e-05, + "loss": 0.2777, + "step": 586 + }, + { + "epoch": 0.55, + "grad_norm": 0.4015561044216156, + "learning_rate": 4.4240064001764646e-05, + "loss": 0.2579, + "step": 587 + }, + { + "epoch": 0.55, + "grad_norm": 0.353808730840683, + "learning_rate": 4.4089187657322953e-05, + "loss": 0.2631, + "step": 588 + }, + { + "epoch": 0.55, + "grad_norm": 0.35133761167526245, + "learning_rate": 4.393836587679394e-05, + "loss": 0.2423, + "step": 589 + }, + { + "epoch": 0.55, + "grad_norm": 0.47139087319374084, + "learning_rate": 4.3787600052444174e-05, + "loss": 0.3469, + "step": 590 + }, + { + "epoch": 0.55, + "grad_norm": 0.47959962487220764, + "learning_rate": 4.363689157602373e-05, + "loss": 0.4231, + "step": 591 + }, + { + "epoch": 0.56, + "grad_norm": 0.3742324709892273, + "learning_rate": 4.348624183875322e-05, + "loss": 0.2906, + "step": 592 + }, + { + "epoch": 0.56, + "grad_norm": 0.3570776879787445, + "learning_rate": 4.333565223131107e-05, + "loss": 0.2154, + "step": 593 + }, + { + "epoch": 0.56, + "grad_norm": 0.42536646127700806, + "learning_rate": 4.318512414382059e-05, + "loss": 0.3376, + "step": 594 + }, + { + "epoch": 0.56, + "grad_norm": 0.40395909547805786, + "learning_rate": 4.3034658965837255e-05, + "loss": 0.1824, + "step": 595 + }, + { + "epoch": 0.56, + "grad_norm": 0.42154961824417114, + "learning_rate": 4.288425808633575e-05, + "loss": 0.3297, + "step": 596 + }, + { + "epoch": 0.56, + "grad_norm": 0.3924674391746521, + "learning_rate": 4.273392289369723e-05, + "loss": 0.277, + "step": 597 + }, + { + "epoch": 0.56, + "grad_norm": 0.36088064312934875, + "learning_rate": 4.258365477569648e-05, + "loss": 0.2243, + "step": 598 + }, + { + "epoch": 0.56, + "grad_norm": 0.4456118941307068, + "learning_rate": 4.2433455119489105e-05, + "loss": 0.317, + "step": 599 + }, + { + "epoch": 0.56, + "grad_norm": 0.44901955127716064, + "learning_rate": 4.228332531159871e-05, + "loss": 0.3589, + "step": 600 + }, + { + "epoch": 0.56, + "grad_norm": 0.47233232855796814, + "learning_rate": 4.2133266737904176e-05, + "loss": 0.2958, + "step": 601 + }, + { + "epoch": 0.56, + "grad_norm": 0.43617701530456543, + "learning_rate": 4.1983280783626724e-05, + "loss": 0.3634, + "step": 602 + }, + { + "epoch": 0.57, + "grad_norm": 0.4319142699241638, + "learning_rate": 4.183336883331723e-05, + "loss": 0.3653, + "step": 603 + }, + { + "epoch": 0.57, + "grad_norm": 0.43837970495224, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.3446, + "step": 604 + }, + { + "epoch": 0.57, + "grad_norm": 0.41829127073287964, + "learning_rate": 4.153377247937732e-05, + "loss": 0.3062, + "step": 605 + }, + { + "epoch": 0.57, + "grad_norm": 0.41083550453186035, + "learning_rate": 4.138409084138185e-05, + "loss": 0.3098, + "step": 606 + }, + { + "epoch": 0.57, + "grad_norm": 0.4804052412509918, + "learning_rate": 4.1234488738598744e-05, + "loss": 0.2789, + "step": 607 + }, + { + "epoch": 0.57, + "grad_norm": 0.3317751884460449, + "learning_rate": 4.108496755203553e-05, + "loss": 0.2116, + "step": 608 + }, + { + "epoch": 0.57, + "grad_norm": 0.40524640679359436, + "learning_rate": 4.0935528661952716e-05, + "loss": 0.3023, + "step": 609 + }, + { + "epoch": 0.57, + "grad_norm": 0.41121968626976013, + "learning_rate": 4.0786173447851126e-05, + "loss": 0.3146, + "step": 610 + }, + { + "epoch": 0.57, + "grad_norm": 0.385101854801178, + "learning_rate": 4.063690328845916e-05, + "loss": 0.2655, + "step": 611 + }, + { + "epoch": 0.57, + "grad_norm": 0.3425019681453705, + "learning_rate": 4.04877195617201e-05, + "loss": 0.2553, + "step": 612 + }, + { + "epoch": 0.58, + "grad_norm": 0.4267544448375702, + "learning_rate": 4.033862364477927e-05, + "loss": 0.3549, + "step": 613 + }, + { + "epoch": 0.58, + "grad_norm": 0.3356296420097351, + "learning_rate": 4.0189616913971484e-05, + "loss": 0.2249, + "step": 614 + }, + { + "epoch": 0.58, + "grad_norm": 0.4114169478416443, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.3824, + "step": 615 + }, + { + "epoch": 0.58, + "grad_norm": 0.3610970675945282, + "learning_rate": 3.989187651196493e-05, + "loss": 0.2309, + "step": 616 + }, + { + "epoch": 0.58, + "grad_norm": 0.4254869520664215, + "learning_rate": 3.974314558926844e-05, + "loss": 0.3008, + "step": 617 + }, + { + "epoch": 0.58, + "grad_norm": 0.36089685559272766, + "learning_rate": 3.9594509349684216e-05, + "loss": 0.2992, + "step": 618 + }, + { + "epoch": 0.58, + "grad_norm": 0.40409085154533386, + "learning_rate": 3.9445969165303647e-05, + "loss": 0.2812, + "step": 619 + }, + { + "epoch": 0.58, + "grad_norm": 0.458126425743103, + "learning_rate": 3.929752640733141e-05, + "loss": 0.3108, + "step": 620 + }, + { + "epoch": 0.58, + "grad_norm": 0.41229763627052307, + "learning_rate": 3.914918244607287e-05, + "loss": 0.2853, + "step": 621 + }, + { + "epoch": 0.58, + "grad_norm": 0.3905296325683594, + "learning_rate": 3.900093865092134e-05, + "loss": 0.2386, + "step": 622 + }, + { + "epoch": 0.58, + "grad_norm": 0.3707413077354431, + "learning_rate": 3.885279639034546e-05, + "loss": 0.2428, + "step": 623 + }, + { + "epoch": 0.59, + "grad_norm": 0.4487210810184479, + "learning_rate": 3.870475703187667e-05, + "loss": 0.2722, + "step": 624 + }, + { + "epoch": 0.59, + "grad_norm": 0.45813697576522827, + "learning_rate": 3.855682194209639e-05, + "loss": 0.2897, + "step": 625 + }, + { + "epoch": 0.59, + "grad_norm": 0.31196317076683044, + "learning_rate": 3.840899248662358e-05, + "loss": 0.2186, + "step": 626 + }, + { + "epoch": 0.59, + "grad_norm": 0.40307724475860596, + "learning_rate": 3.8261270030102084e-05, + "loss": 0.2858, + "step": 627 + }, + { + "epoch": 0.59, + "grad_norm": 0.4220031797885895, + "learning_rate": 3.8113655936187947e-05, + "loss": 0.3035, + "step": 628 + }, + { + "epoch": 0.59, + "grad_norm": 0.39654064178466797, + "learning_rate": 3.796615156753696e-05, + "loss": 0.3392, + "step": 629 + }, + { + "epoch": 0.59, + "grad_norm": 0.4065658748149872, + "learning_rate": 3.7818758285791955e-05, + "loss": 0.3046, + "step": 630 + }, + { + "epoch": 0.59, + "grad_norm": 0.3877567648887634, + "learning_rate": 3.767147745157039e-05, + "loss": 0.3327, + "step": 631 + }, + { + "epoch": 0.59, + "grad_norm": 0.5222271680831909, + "learning_rate": 3.7524310424451635e-05, + "loss": 0.3651, + "step": 632 + }, + { + "epoch": 0.59, + "grad_norm": 0.3869018852710724, + "learning_rate": 3.7377258562964454e-05, + "loss": 0.3138, + "step": 633 + }, + { + "epoch": 0.59, + "grad_norm": 0.42713502049446106, + "learning_rate": 3.723032322457458e-05, + "loss": 0.3476, + "step": 634 + }, + { + "epoch": 0.6, + "grad_norm": 0.47757217288017273, + "learning_rate": 3.708350576567204e-05, + "loss": 0.4056, + "step": 635 + }, + { + "epoch": 0.6, + "grad_norm": 0.37596315145492554, + "learning_rate": 3.6936807541558674e-05, + "loss": 0.2626, + "step": 636 + }, + { + "epoch": 0.6, + "grad_norm": 0.48347583413124084, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.4516, + "step": 637 + }, + { + "epoch": 0.6, + "grad_norm": 0.3164611756801605, + "learning_rate": 3.664377421339111e-05, + "loss": 0.2061, + "step": 638 + }, + { + "epoch": 0.6, + "grad_norm": 0.4118727445602417, + "learning_rate": 3.6497441814387247e-05, + "loss": 0.2964, + "step": 639 + }, + { + "epoch": 0.6, + "grad_norm": 0.3826994001865387, + "learning_rate": 3.6351234060248286e-05, + "loss": 0.2836, + "step": 640 + }, + { + "epoch": 0.6, + "grad_norm": 0.44861793518066406, + "learning_rate": 3.6205152300647784e-05, + "loss": 0.3768, + "step": 641 + }, + { + "epoch": 0.6, + "grad_norm": 0.3583225905895233, + "learning_rate": 3.605919788409621e-05, + "loss": 0.2999, + "step": 642 + }, + { + "epoch": 0.6, + "grad_norm": 0.3917998671531677, + "learning_rate": 3.591337215792852e-05, + "loss": 0.3231, + "step": 643 + }, + { + "epoch": 0.6, + "grad_norm": 0.3839118182659149, + "learning_rate": 3.5767676468291713e-05, + "loss": 0.2315, + "step": 644 + }, + { + "epoch": 0.61, + "grad_norm": 0.4764842987060547, + "learning_rate": 3.562211216013235e-05, + "loss": 0.4668, + "step": 645 + }, + { + "epoch": 0.61, + "grad_norm": 0.38360172510147095, + "learning_rate": 3.5476680577184206e-05, + "loss": 0.325, + "step": 646 + }, + { + "epoch": 0.61, + "grad_norm": 0.33429163694381714, + "learning_rate": 3.533138306195588e-05, + "loss": 0.2305, + "step": 647 + }, + { + "epoch": 0.61, + "grad_norm": 0.44550594687461853, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.2942, + "step": 648 + }, + { + "epoch": 0.61, + "grad_norm": 0.3722698390483856, + "learning_rate": 3.5041195598492446e-05, + "loss": 0.3052, + "step": 649 + }, + { + "epoch": 0.61, + "grad_norm": 0.37912797927856445, + "learning_rate": 3.489630832903694e-05, + "loss": 0.2645, + "step": 650 + }, + { + "epoch": 0.61, + "grad_norm": 0.46016430854797363, + "learning_rate": 3.475156048483567e-05, + "loss": 0.3226, + "step": 651 + }, + { + "epoch": 0.61, + "grad_norm": 0.4096508324146271, + "learning_rate": 3.460695340208546e-05, + "loss": 0.3527, + "step": 652 + }, + { + "epoch": 0.61, + "grad_norm": 0.39146289229393005, + "learning_rate": 3.446248841568375e-05, + "loss": 0.275, + "step": 653 + }, + { + "epoch": 0.61, + "grad_norm": 0.41832536458969116, + "learning_rate": 3.431816685921625e-05, + "loss": 0.3173, + "step": 654 + }, + { + "epoch": 0.61, + "grad_norm": 0.35429659485816956, + "learning_rate": 3.417399006494466e-05, + "loss": 0.2347, + "step": 655 + }, + { + "epoch": 0.62, + "grad_norm": 0.36290469765663147, + "learning_rate": 3.402995936379433e-05, + "loss": 0.2451, + "step": 656 + }, + { + "epoch": 0.62, + "grad_norm": 0.38287097215652466, + "learning_rate": 3.3886076085341986e-05, + "loss": 0.2538, + "step": 657 + }, + { + "epoch": 0.62, + "grad_norm": 0.3295687735080719, + "learning_rate": 3.37423415578035e-05, + "loss": 0.2345, + "step": 658 + }, + { + "epoch": 0.62, + "grad_norm": 0.37062516808509827, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.2345, + "step": 659 + }, + { + "epoch": 0.62, + "grad_norm": 0.3951757848262787, + "learning_rate": 3.345532406145345e-05, + "loss": 0.2836, + "step": 660 + }, + { + "epoch": 0.62, + "grad_norm": 0.35152289271354675, + "learning_rate": 3.331204374215888e-05, + "loss": 0.2325, + "step": 661 + }, + { + "epoch": 0.62, + "grad_norm": 0.36952489614486694, + "learning_rate": 3.316891747278761e-05, + "loss": 0.2382, + "step": 662 + }, + { + "epoch": 0.62, + "grad_norm": 0.45323944091796875, + "learning_rate": 3.302594657456744e-05, + "loss": 0.375, + "step": 663 + }, + { + "epoch": 0.62, + "grad_norm": 0.4516223073005676, + "learning_rate": 3.288313236729183e-05, + "loss": 0.3336, + "step": 664 + }, + { + "epoch": 0.62, + "grad_norm": 0.4407421350479126, + "learning_rate": 3.274047616930781e-05, + "loss": 0.3417, + "step": 665 + }, + { + "epoch": 0.62, + "grad_norm": 0.41437798738479614, + "learning_rate": 3.259797929750378e-05, + "loss": 0.2577, + "step": 666 + }, + { + "epoch": 0.63, + "grad_norm": 0.38213199377059937, + "learning_rate": 3.245564306729744e-05, + "loss": 0.2891, + "step": 667 + }, + { + "epoch": 0.63, + "grad_norm": 0.3836421072483063, + "learning_rate": 3.231346879262349e-05, + "loss": 0.2795, + "step": 668 + }, + { + "epoch": 0.63, + "grad_norm": 0.3943834900856018, + "learning_rate": 3.217145778592162e-05, + "loss": 0.3158, + "step": 669 + }, + { + "epoch": 0.63, + "grad_norm": 0.4441621005535126, + "learning_rate": 3.202961135812437e-05, + "loss": 0.3423, + "step": 670 + }, + { + "epoch": 0.63, + "grad_norm": 0.4147234559059143, + "learning_rate": 3.1887930818644996e-05, + "loss": 0.3495, + "step": 671 + }, + { + "epoch": 0.63, + "grad_norm": 0.41143837571144104, + "learning_rate": 3.1746417475365405e-05, + "loss": 0.2875, + "step": 672 + }, + { + "epoch": 0.63, + "grad_norm": 0.46894603967666626, + "learning_rate": 3.1605072634624125e-05, + "loss": 0.4085, + "step": 673 + }, + { + "epoch": 0.63, + "grad_norm": 0.42437368631362915, + "learning_rate": 3.146389760120416e-05, + "loss": 0.2643, + "step": 674 + }, + { + "epoch": 0.63, + "grad_norm": 0.4013984203338623, + "learning_rate": 3.132289367832097e-05, + "loss": 0.2912, + "step": 675 + }, + { + "epoch": 0.63, + "grad_norm": 0.45246797800064087, + "learning_rate": 3.118206216761053e-05, + "loss": 0.3124, + "step": 676 + }, + { + "epoch": 0.64, + "grad_norm": 0.377215713262558, + "learning_rate": 3.104140436911719e-05, + "loss": 0.1922, + "step": 677 + }, + { + "epoch": 0.64, + "grad_norm": 0.39179039001464844, + "learning_rate": 3.0900921581281725e-05, + "loss": 0.3054, + "step": 678 + }, + { + "epoch": 0.64, + "grad_norm": 0.42899250984191895, + "learning_rate": 3.076061510092935e-05, + "loss": 0.3311, + "step": 679 + }, + { + "epoch": 0.64, + "grad_norm": 0.38909435272216797, + "learning_rate": 3.062048622325779e-05, + "loss": 0.3041, + "step": 680 + }, + { + "epoch": 0.64, + "grad_norm": 0.41301432251930237, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.2429, + "step": 681 + }, + { + "epoch": 0.64, + "grad_norm": 0.43931370973587036, + "learning_rate": 3.034076644853853e-05, + "loss": 0.2854, + "step": 682 + }, + { + "epoch": 0.64, + "grad_norm": 0.45271268486976624, + "learning_rate": 3.0201178133641038e-05, + "loss": 0.3819, + "step": 683 + }, + { + "epoch": 0.64, + "grad_norm": 0.3465835154056549, + "learning_rate": 3.0061772585700953e-05, + "loss": 0.2388, + "step": 684 + }, + { + "epoch": 0.64, + "grad_norm": 0.39036041498184204, + "learning_rate": 2.992255109159926e-05, + "loss": 0.2508, + "step": 685 + }, + { + "epoch": 0.64, + "grad_norm": 0.39026275277137756, + "learning_rate": 2.9783514936517965e-05, + "loss": 0.3071, + "step": 686 + }, + { + "epoch": 0.64, + "grad_norm": 0.40448805689811707, + "learning_rate": 2.9644665403928117e-05, + "loss": 0.2968, + "step": 687 + }, + { + "epoch": 0.65, + "grad_norm": 0.47924384474754333, + "learning_rate": 2.950600377557804e-05, + "loss": 0.4256, + "step": 688 + }, + { + "epoch": 0.65, + "grad_norm": 0.4134678542613983, + "learning_rate": 2.9367531331481436e-05, + "loss": 0.2633, + "step": 689 + }, + { + "epoch": 0.65, + "grad_norm": 0.4081580638885498, + "learning_rate": 2.9229249349905684e-05, + "loss": 0.3271, + "step": 690 + }, + { + "epoch": 0.65, + "grad_norm": 0.386106938123703, + "learning_rate": 2.909115910735991e-05, + "loss": 0.2604, + "step": 691 + }, + { + "epoch": 0.65, + "grad_norm": 0.34313780069351196, + "learning_rate": 2.895326187858326e-05, + "loss": 0.238, + "step": 692 + }, + { + "epoch": 0.65, + "grad_norm": 0.47729331254959106, + "learning_rate": 2.881555893653314e-05, + "loss": 0.4743, + "step": 693 + }, + { + "epoch": 0.65, + "grad_norm": 0.40500473976135254, + "learning_rate": 2.8678051552373487e-05, + "loss": 0.2936, + "step": 694 + }, + { + "epoch": 0.65, + "grad_norm": 0.3432915508747101, + "learning_rate": 2.854074099546291e-05, + "loss": 0.2432, + "step": 695 + }, + { + "epoch": 0.65, + "grad_norm": 0.4056801199913025, + "learning_rate": 2.8403628533343206e-05, + "loss": 0.2039, + "step": 696 + }, + { + "epoch": 0.65, + "grad_norm": 0.3884841203689575, + "learning_rate": 2.826671543172738e-05, + "loss": 0.3125, + "step": 697 + }, + { + "epoch": 0.65, + "grad_norm": 0.34453776478767395, + "learning_rate": 2.8130002954488183e-05, + "loss": 0.2515, + "step": 698 + }, + { + "epoch": 0.66, + "grad_norm": 0.3966999650001526, + "learning_rate": 2.799349236364634e-05, + "loss": 0.3374, + "step": 699 + }, + { + "epoch": 0.66, + "grad_norm": 0.40421062707901, + "learning_rate": 2.7857184919358937e-05, + "loss": 0.3175, + "step": 700 + }, + { + "epoch": 0.66, + "grad_norm": 0.3502410650253296, + "learning_rate": 2.7721081879907718e-05, + "loss": 0.2444, + "step": 701 + }, + { + "epoch": 0.66, + "grad_norm": 0.499380886554718, + "learning_rate": 2.7585184501687577e-05, + "loss": 0.3606, + "step": 702 + }, + { + "epoch": 0.66, + "grad_norm": 0.4188764989376068, + "learning_rate": 2.74494940391949e-05, + "loss": 0.359, + "step": 703 + }, + { + "epoch": 0.66, + "grad_norm": 0.42254406213760376, + "learning_rate": 2.731401174501601e-05, + "loss": 0.3138, + "step": 704 + }, + { + "epoch": 0.66, + "grad_norm": 0.35356250405311584, + "learning_rate": 2.7178738869815506e-05, + "loss": 0.2556, + "step": 705 + }, + { + "epoch": 0.66, + "grad_norm": 0.34609168767929077, + "learning_rate": 2.7043676662324878e-05, + "loss": 0.2317, + "step": 706 + }, + { + "epoch": 0.66, + "grad_norm": 0.4097319543361664, + "learning_rate": 2.69088263693309e-05, + "loss": 0.3008, + "step": 707 + }, + { + "epoch": 0.66, + "grad_norm": 0.37519168853759766, + "learning_rate": 2.6774189235664026e-05, + "loss": 0.2497, + "step": 708 + }, + { + "epoch": 0.67, + "grad_norm": 0.47046300768852234, + "learning_rate": 2.663976650418715e-05, + "loss": 0.4079, + "step": 709 + }, + { + "epoch": 0.67, + "grad_norm": 0.4490043818950653, + "learning_rate": 2.650555941578381e-05, + "loss": 0.409, + "step": 710 + }, + { + "epoch": 0.67, + "grad_norm": 0.39349454641342163, + "learning_rate": 2.6371569209347014e-05, + "loss": 0.2863, + "step": 711 + }, + { + "epoch": 0.67, + "grad_norm": 0.4129186272621155, + "learning_rate": 2.6237797121767634e-05, + "loss": 0.3697, + "step": 712 + }, + { + "epoch": 0.67, + "grad_norm": 0.3730863332748413, + "learning_rate": 2.6104244387923082e-05, + "loss": 0.2673, + "step": 713 + }, + { + "epoch": 0.67, + "grad_norm": 0.329772412776947, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.1977, + "step": 714 + }, + { + "epoch": 0.67, + "grad_norm": 0.3878955543041229, + "learning_rate": 2.5837801910812053e-05, + "loss": 0.2983, + "step": 715 + }, + { + "epoch": 0.67, + "grad_norm": 0.3468000888824463, + "learning_rate": 2.5704914627130374e-05, + "loss": 0.2667, + "step": 716 + }, + { + "epoch": 0.67, + "grad_norm": 0.4333817958831787, + "learning_rate": 2.5572251616330373e-05, + "loss": 0.3788, + "step": 717 + }, + { + "epoch": 0.67, + "grad_norm": 0.3754556477069855, + "learning_rate": 2.5439814103051284e-05, + "loss": 0.2182, + "step": 718 + }, + { + "epoch": 0.67, + "grad_norm": 0.2976343035697937, + "learning_rate": 2.530760330985079e-05, + "loss": 0.178, + "step": 719 + }, + { + "epoch": 0.68, + "grad_norm": 0.4496738314628601, + "learning_rate": 2.517562045719367e-05, + "loss": 0.3968, + "step": 720 + }, + { + "epoch": 0.68, + "grad_norm": 0.3929567039012909, + "learning_rate": 2.504386676344047e-05, + "loss": 0.3243, + "step": 721 + }, + { + "epoch": 0.68, + "grad_norm": 0.40064507722854614, + "learning_rate": 2.4912343444836445e-05, + "loss": 0.3359, + "step": 722 + }, + { + "epoch": 0.68, + "grad_norm": 0.4283524751663208, + "learning_rate": 2.4781051715500076e-05, + "loss": 0.3824, + "step": 723 + }, + { + "epoch": 0.68, + "grad_norm": 0.3501032888889313, + "learning_rate": 2.46499927874121e-05, + "loss": 0.214, + "step": 724 + }, + { + "epoch": 0.68, + "grad_norm": 0.3761228024959564, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.2473, + "step": 725 + }, + { + "epoch": 0.68, + "grad_norm": 0.4208412170410156, + "learning_rate": 2.4388578172147675e-05, + "loss": 0.3179, + "step": 726 + }, + { + "epoch": 0.68, + "grad_norm": 0.38908398151397705, + "learning_rate": 2.4258224898142807e-05, + "loss": 0.2357, + "step": 727 + }, + { + "epoch": 0.68, + "grad_norm": 0.4253256022930145, + "learning_rate": 2.4128109251707155e-05, + "loss": 0.3923, + "step": 728 + }, + { + "epoch": 0.68, + "grad_norm": 0.38555988669395447, + "learning_rate": 2.399823243396476e-05, + "loss": 0.2836, + "step": 729 + }, + { + "epoch": 0.68, + "grad_norm": 0.39008665084838867, + "learning_rate": 2.3868595643834995e-05, + "loss": 0.2493, + "step": 730 + }, + { + "epoch": 0.69, + "grad_norm": 0.409845232963562, + "learning_rate": 2.373920007802144e-05, + "loss": 0.3188, + "step": 731 + }, + { + "epoch": 0.69, + "grad_norm": 0.3560733199119568, + "learning_rate": 2.361004693100094e-05, + "loss": 0.2564, + "step": 732 + }, + { + "epoch": 0.69, + "grad_norm": 0.40539562702178955, + "learning_rate": 2.3481137395012513e-05, + "loss": 0.2965, + "step": 733 + }, + { + "epoch": 0.69, + "grad_norm": 0.38528934121131897, + "learning_rate": 2.3352472660046295e-05, + "loss": 0.2615, + "step": 734 + }, + { + "epoch": 0.69, + "grad_norm": 0.37828436493873596, + "learning_rate": 2.322405391383273e-05, + "loss": 0.303, + "step": 735 + }, + { + "epoch": 0.69, + "grad_norm": 0.4200487732887268, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.3986, + "step": 736 + }, + { + "epoch": 0.69, + "grad_norm": 0.37558552622795105, + "learning_rate": 2.296795912722014e-05, + "loss": 0.3008, + "step": 737 + }, + { + "epoch": 0.69, + "grad_norm": 0.35239487886428833, + "learning_rate": 2.284028545088423e-05, + "loss": 0.2579, + "step": 738 + }, + { + "epoch": 0.69, + "grad_norm": 0.3456938862800598, + "learning_rate": 2.2712862491405436e-05, + "loss": 0.2618, + "step": 739 + }, + { + "epoch": 0.69, + "grad_norm": 0.3525884747505188, + "learning_rate": 2.258569142505098e-05, + "loss": 0.2489, + "step": 740 + }, + { + "epoch": 0.7, + "grad_norm": 0.41400423645973206, + "learning_rate": 2.2458773425762912e-05, + "loss": 0.2781, + "step": 741 + }, + { + "epoch": 0.7, + "grad_norm": 0.45001018047332764, + "learning_rate": 2.2332109665147127e-05, + "loss": 0.3845, + "step": 742 + }, + { + "epoch": 0.7, + "grad_norm": 0.40422332286834717, + "learning_rate": 2.2205701312462617e-05, + "loss": 0.351, + "step": 743 + }, + { + "epoch": 0.7, + "grad_norm": 0.39631444215774536, + "learning_rate": 2.2079549534610606e-05, + "loss": 0.2952, + "step": 744 + }, + { + "epoch": 0.7, + "grad_norm": 0.3830812871456146, + "learning_rate": 2.1953655496123853e-05, + "loss": 0.2824, + "step": 745 + }, + { + "epoch": 0.7, + "grad_norm": 0.4185529351234436, + "learning_rate": 2.1828020359155905e-05, + "loss": 0.2806, + "step": 746 + }, + { + "epoch": 0.7, + "grad_norm": 0.3633481562137604, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.2395, + "step": 747 + }, + { + "epoch": 0.7, + "grad_norm": 0.4514968693256378, + "learning_rate": 2.1577531426429782e-05, + "loss": 0.3592, + "step": 748 + }, + { + "epoch": 0.7, + "grad_norm": 0.33255240321159363, + "learning_rate": 2.1452679942985993e-05, + "loss": 0.2244, + "step": 749 + }, + { + "epoch": 0.7, + "grad_norm": 0.4340452253818512, + "learning_rate": 2.132809198566837e-05, + "loss": 0.3434, + "step": 750 + }, + { + "epoch": 0.7, + "grad_norm": 0.4698704183101654, + "learning_rate": 2.1203768704573672e-05, + "loss": 0.3705, + "step": 751 + }, + { + "epoch": 0.71, + "grad_norm": 0.3831236660480499, + "learning_rate": 2.1079711247355505e-05, + "loss": 0.2703, + "step": 752 + }, + { + "epoch": 0.71, + "grad_norm": 0.44130879640579224, + "learning_rate": 2.095592075921347e-05, + "loss": 0.2404, + "step": 753 + }, + { + "epoch": 0.71, + "grad_norm": 0.3395165503025055, + "learning_rate": 2.08323983828828e-05, + "loss": 0.1883, + "step": 754 + }, + { + "epoch": 0.71, + "grad_norm": 0.43158164620399475, + "learning_rate": 2.0709145258623704e-05, + "loss": 0.3831, + "step": 755 + }, + { + "epoch": 0.71, + "grad_norm": 0.40739014744758606, + "learning_rate": 2.0586162524210895e-05, + "loss": 0.2756, + "step": 756 + }, + { + "epoch": 0.71, + "grad_norm": 0.4256882071495056, + "learning_rate": 2.0463451314923015e-05, + "loss": 0.322, + "step": 757 + }, + { + "epoch": 0.71, + "grad_norm": 0.43485283851623535, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.3444, + "step": 758 + }, + { + "epoch": 0.71, + "grad_norm": 0.36340105533599854, + "learning_rate": 2.021884800029379e-05, + "loss": 0.2585, + "step": 759 + }, + { + "epoch": 0.71, + "grad_norm": 0.39190852642059326, + "learning_rate": 2.009695815293548e-05, + "loss": 0.294, + "step": 760 + }, + { + "epoch": 0.71, + "grad_norm": 0.4237244427204132, + "learning_rate": 1.9975344346647297e-05, + "loss": 0.31, + "step": 761 + }, + { + "epoch": 0.71, + "grad_norm": 0.3808172345161438, + "learning_rate": 1.9854007704071064e-05, + "loss": 0.2931, + "step": 762 + }, + { + "epoch": 0.72, + "grad_norm": 0.3244672119617462, + "learning_rate": 1.973294934529007e-05, + "loss": 0.2037, + "step": 763 + }, + { + "epoch": 0.72, + "grad_norm": 0.3610917925834656, + "learning_rate": 1.961217038781863e-05, + "loss": 0.2589, + "step": 764 + }, + { + "epoch": 0.72, + "grad_norm": 0.4579406976699829, + "learning_rate": 1.9491671946591962e-05, + "loss": 0.3183, + "step": 765 + }, + { + "epoch": 0.72, + "grad_norm": 0.4583299160003662, + "learning_rate": 1.9371455133955675e-05, + "loss": 0.349, + "step": 766 + }, + { + "epoch": 0.72, + "grad_norm": 0.39002010226249695, + "learning_rate": 1.925152105965567e-05, + "loss": 0.2644, + "step": 767 + }, + { + "epoch": 0.72, + "grad_norm": 0.35974547266960144, + "learning_rate": 1.9131870830827818e-05, + "loss": 0.2283, + "step": 768 + }, + { + "epoch": 0.72, + "grad_norm": 0.4203035831451416, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.2639, + "step": 769 + }, + { + "epoch": 0.72, + "grad_norm": 0.3690374791622162, + "learning_rate": 1.8893426325020686e-05, + "loss": 0.2513, + "step": 770 + }, + { + "epoch": 0.72, + "grad_norm": 0.42623621225357056, + "learning_rate": 1.8774634249171185e-05, + "loss": 0.3978, + "step": 771 + }, + { + "epoch": 0.72, + "grad_norm": 0.3870992958545685, + "learning_rate": 1.8656130421033123e-05, + "loss": 0.2935, + "step": 772 + }, + { + "epoch": 0.73, + "grad_norm": 0.35730135440826416, + "learning_rate": 1.8537915934539486e-05, + "loss": 0.1643, + "step": 773 + }, + { + "epoch": 0.73, + "grad_norm": 0.3966630697250366, + "learning_rate": 1.841999188095224e-05, + "loss": 0.3057, + "step": 774 + }, + { + "epoch": 0.73, + "grad_norm": 0.4256872832775116, + "learning_rate": 1.830235934885237e-05, + "loss": 0.3644, + "step": 775 + }, + { + "epoch": 0.73, + "grad_norm": 0.35789552330970764, + "learning_rate": 1.818501942412975e-05, + "loss": 0.2894, + "step": 776 + }, + { + "epoch": 0.73, + "grad_norm": 0.40891268849372864, + "learning_rate": 1.8067973189973075e-05, + "loss": 0.3312, + "step": 777 + }, + { + "epoch": 0.73, + "grad_norm": 0.4185810387134552, + "learning_rate": 1.7951221726860045e-05, + "loss": 0.3332, + "step": 778 + }, + { + "epoch": 0.73, + "grad_norm": 0.4217431843280792, + "learning_rate": 1.7834766112547142e-05, + "loss": 0.3096, + "step": 779 + }, + { + "epoch": 0.73, + "grad_norm": 0.38097184896469116, + "learning_rate": 1.771860742205988e-05, + "loss": 0.3022, + "step": 780 + }, + { + "epoch": 0.73, + "grad_norm": 0.4239228665828705, + "learning_rate": 1.7602746727682796e-05, + "loss": 0.3631, + "step": 781 + }, + { + "epoch": 0.73, + "grad_norm": 0.36988747119903564, + "learning_rate": 1.7487185098949565e-05, + "loss": 0.276, + "step": 782 + }, + { + "epoch": 0.73, + "grad_norm": 0.4268799424171448, + "learning_rate": 1.7371923602633078e-05, + "loss": 0.3585, + "step": 783 + }, + { + "epoch": 0.74, + "grad_norm": 0.42823129892349243, + "learning_rate": 1.725696330273575e-05, + "loss": 0.368, + "step": 784 + }, + { + "epoch": 0.74, + "grad_norm": 0.39439141750335693, + "learning_rate": 1.7142305260479474e-05, + "loss": 0.3613, + "step": 785 + }, + { + "epoch": 0.74, + "grad_norm": 0.39605358242988586, + "learning_rate": 1.7027950534296027e-05, + "loss": 0.3753, + "step": 786 + }, + { + "epoch": 0.74, + "grad_norm": 0.3405487537384033, + "learning_rate": 1.6913900179817144e-05, + "loss": 0.2243, + "step": 787 + }, + { + "epoch": 0.74, + "grad_norm": 0.3856514096260071, + "learning_rate": 1.6800155249864896e-05, + "loss": 0.3305, + "step": 788 + }, + { + "epoch": 0.74, + "grad_norm": 0.3783203661441803, + "learning_rate": 1.668671679444192e-05, + "loss": 0.2892, + "step": 789 + }, + { + "epoch": 0.74, + "grad_norm": 0.41972652077674866, + "learning_rate": 1.6573585860721646e-05, + "loss": 0.3498, + "step": 790 + }, + { + "epoch": 0.74, + "grad_norm": 0.37746182084083557, + "learning_rate": 1.646076349303884e-05, + "loss": 0.277, + "step": 791 + }, + { + "epoch": 0.74, + "grad_norm": 0.3971934914588928, + "learning_rate": 1.63482507328797e-05, + "loss": 0.2944, + "step": 792 + }, + { + "epoch": 0.74, + "grad_norm": 0.2976275384426117, + "learning_rate": 1.6236048618872456e-05, + "loss": 0.2105, + "step": 793 + }, + { + "epoch": 0.74, + "grad_norm": 0.5275766849517822, + "learning_rate": 1.6124158186777676e-05, + "loss": 0.3172, + "step": 794 + }, + { + "epoch": 0.75, + "grad_norm": 0.42608052492141724, + "learning_rate": 1.6012580469478743e-05, + "loss": 0.3299, + "step": 795 + }, + { + "epoch": 0.75, + "grad_norm": 0.32864508032798767, + "learning_rate": 1.5901316496972262e-05, + "loss": 0.1943, + "step": 796 + }, + { + "epoch": 0.75, + "grad_norm": 0.3908816874027252, + "learning_rate": 1.5790367296358644e-05, + "loss": 0.2985, + "step": 797 + }, + { + "epoch": 0.75, + "grad_norm": 0.3235936760902405, + "learning_rate": 1.5679733891832556e-05, + "loss": 0.2088, + "step": 798 + }, + { + "epoch": 0.75, + "grad_norm": 0.38437268137931824, + "learning_rate": 1.55694173046735e-05, + "loss": 0.2778, + "step": 799 + }, + { + "epoch": 0.75, + "grad_norm": 0.36689984798431396, + "learning_rate": 1.5459418553236343e-05, + "loss": 0.2769, + "step": 800 + }, + { + "epoch": 0.75, + "grad_norm": 0.3990126848220825, + "learning_rate": 1.5349738652941968e-05, + "loss": 0.3203, + "step": 801 + }, + { + "epoch": 0.75, + "grad_norm": 0.3759137690067291, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.2586, + "step": 802 + }, + { + "epoch": 0.75, + "grad_norm": 0.38042765855789185, + "learning_rate": 1.5131339452738863e-05, + "loss": 0.3687, + "step": 803 + }, + { + "epoch": 0.75, + "grad_norm": 0.37907275557518005, + "learning_rate": 1.5022622168917649e-05, + "loss": 0.272, + "step": 804 + }, + { + "epoch": 0.76, + "grad_norm": 0.3538564443588257, + "learning_rate": 1.4914227768395595e-05, + "loss": 0.2581, + "step": 805 + }, + { + "epoch": 0.76, + "grad_norm": 0.39212673902511597, + "learning_rate": 1.4806157251783515e-05, + "loss": 0.291, + "step": 806 + }, + { + "epoch": 0.76, + "grad_norm": 0.36042389273643494, + "learning_rate": 1.4698411616702356e-05, + "loss": 0.2512, + "step": 807 + }, + { + "epoch": 0.76, + "grad_norm": 0.387197881937027, + "learning_rate": 1.4590991857774038e-05, + "loss": 0.3171, + "step": 808 + }, + { + "epoch": 0.76, + "grad_norm": 0.38724789023399353, + "learning_rate": 1.4483898966612209e-05, + "loss": 0.3734, + "step": 809 + }, + { + "epoch": 0.76, + "grad_norm": 0.3722423017024994, + "learning_rate": 1.437713393181317e-05, + "loss": 0.2774, + "step": 810 + }, + { + "epoch": 0.76, + "grad_norm": 0.40014025568962097, + "learning_rate": 1.4270697738946704e-05, + "loss": 0.2615, + "step": 811 + }, + { + "epoch": 0.76, + "grad_norm": 0.3334675431251526, + "learning_rate": 1.4164591370547004e-05, + "loss": 0.2324, + "step": 812 + }, + { + "epoch": 0.76, + "grad_norm": 0.4055711030960083, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.2911, + "step": 813 + }, + { + "epoch": 0.76, + "grad_norm": 0.40564215183258057, + "learning_rate": 1.3953372022052107e-05, + "loss": 0.3312, + "step": 814 + }, + { + "epoch": 0.76, + "grad_norm": 0.34772953391075134, + "learning_rate": 1.3848260991765755e-05, + "loss": 0.2875, + "step": 815 + }, + { + "epoch": 0.77, + "grad_norm": 0.4844672381877899, + "learning_rate": 1.3743483685545811e-05, + "loss": 0.4066, + "step": 816 + }, + { + "epoch": 0.77, + "grad_norm": 0.42509323358535767, + "learning_rate": 1.363904107061294e-05, + "loss": 0.3193, + "step": 817 + }, + { + "epoch": 0.77, + "grad_norm": 0.43162110447883606, + "learning_rate": 1.3534934111098179e-05, + "loss": 0.3408, + "step": 818 + }, + { + "epoch": 0.77, + "grad_norm": 0.3848356306552887, + "learning_rate": 1.3431163768034077e-05, + "loss": 0.314, + "step": 819 + }, + { + "epoch": 0.77, + "grad_norm": 0.35395869612693787, + "learning_rate": 1.3327730999345817e-05, + "loss": 0.2277, + "step": 820 + }, + { + "epoch": 0.77, + "grad_norm": 0.35360512137413025, + "learning_rate": 1.3224636759842363e-05, + "loss": 0.2947, + "step": 821 + }, + { + "epoch": 0.77, + "grad_norm": 0.3542770445346832, + "learning_rate": 1.3121882001207614e-05, + "loss": 0.223, + "step": 822 + }, + { + "epoch": 0.77, + "grad_norm": 0.38950300216674805, + "learning_rate": 1.3019467671991692e-05, + "loss": 0.2419, + "step": 823 + }, + { + "epoch": 0.77, + "grad_norm": 0.4121183454990387, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.3552, + "step": 824 + }, + { + "epoch": 0.77, + "grad_norm": 0.34702908992767334, + "learning_rate": 1.2815664080295159e-05, + "loss": 0.2483, + "step": 825 + }, + { + "epoch": 0.77, + "grad_norm": 0.34193891286849976, + "learning_rate": 1.2714276699166994e-05, + "loss": 0.2278, + "step": 826 + }, + { + "epoch": 0.78, + "grad_norm": 0.37185028195381165, + "learning_rate": 1.261323351014525e-05, + "loss": 0.2543, + "step": 827 + }, + { + "epoch": 0.78, + "grad_norm": 0.31859350204467773, + "learning_rate": 1.251253544598014e-05, + "loss": 0.2054, + "step": 828 + }, + { + "epoch": 0.78, + "grad_norm": 0.35622653365135193, + "learning_rate": 1.241218343623602e-05, + "loss": 0.3048, + "step": 829 + }, + { + "epoch": 0.78, + "grad_norm": 0.3775176405906677, + "learning_rate": 1.2312178407282749e-05, + "loss": 0.2991, + "step": 830 + }, + { + "epoch": 0.78, + "grad_norm": 0.3728039264678955, + "learning_rate": 1.2212521282287092e-05, + "loss": 0.3001, + "step": 831 + }, + { + "epoch": 0.78, + "grad_norm": 0.42662662267684937, + "learning_rate": 1.2113212981204292e-05, + "loss": 0.3589, + "step": 832 + }, + { + "epoch": 0.78, + "grad_norm": 0.41523322463035583, + "learning_rate": 1.2014254420769466e-05, + "loss": 0.3247, + "step": 833 + }, + { + "epoch": 0.78, + "grad_norm": 0.3754156231880188, + "learning_rate": 1.1915646514489292e-05, + "loss": 0.2881, + "step": 834 + }, + { + "epoch": 0.78, + "grad_norm": 0.39000576734542847, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.3189, + "step": 835 + }, + { + "epoch": 0.78, + "grad_norm": 0.3429880738258362, + "learning_rate": 1.1719486302226118e-05, + "loss": 0.2378, + "step": 836 + }, + { + "epoch": 0.79, + "grad_norm": 0.5210950374603271, + "learning_rate": 1.1621935807038003e-05, + "loss": 0.4818, + "step": 837 + }, + { + "epoch": 0.79, + "grad_norm": 0.41101348400115967, + "learning_rate": 1.152473958757756e-05, + "loss": 0.3383, + "step": 838 + }, + { + "epoch": 0.79, + "grad_norm": 0.3532755970954895, + "learning_rate": 1.1427898541082855e-05, + "loss": 0.2378, + "step": 839 + }, + { + "epoch": 0.79, + "grad_norm": 0.4082874655723572, + "learning_rate": 1.133141356151336e-05, + "loss": 0.3243, + "step": 840 + }, + { + "epoch": 0.79, + "grad_norm": 0.40283286571502686, + "learning_rate": 1.123528553954154e-05, + "loss": 0.3047, + "step": 841 + }, + { + "epoch": 0.79, + "grad_norm": 0.30116912722587585, + "learning_rate": 1.1139515362544755e-05, + "loss": 0.1996, + "step": 842 + }, + { + "epoch": 0.79, + "grad_norm": 0.4485073685646057, + "learning_rate": 1.1044103914597031e-05, + "loss": 0.3775, + "step": 843 + }, + { + "epoch": 0.79, + "grad_norm": 0.46858519315719604, + "learning_rate": 1.0949052076460853e-05, + "loss": 0.4176, + "step": 844 + }, + { + "epoch": 0.79, + "grad_norm": 0.4420608878135681, + "learning_rate": 1.085436072557911e-05, + "loss": 0.3277, + "step": 845 + }, + { + "epoch": 0.79, + "grad_norm": 0.3578694462776184, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.2433, + "step": 846 + }, + { + "epoch": 0.79, + "grad_norm": 0.3760528564453125, + "learning_rate": 1.0666062978703733e-05, + "loss": 0.2819, + "step": 847 + }, + { + "epoch": 0.8, + "grad_norm": 0.37020227313041687, + "learning_rate": 1.0572458320924943e-05, + "loss": 0.2603, + "step": 848 + }, + { + "epoch": 0.8, + "grad_norm": 0.4318196773529053, + "learning_rate": 1.0479217626814253e-05, + "loss": 0.3103, + "step": 849 + }, + { + "epoch": 0.8, + "grad_norm": 0.31711700558662415, + "learning_rate": 1.0386341757095502e-05, + "loss": 0.1956, + "step": 850 + }, + { + "epoch": 0.8, + "grad_norm": 0.4092193841934204, + "learning_rate": 1.0293831569124774e-05, + "loss": 0.3291, + "step": 851 + }, + { + "epoch": 0.8, + "grad_norm": 0.4069688618183136, + "learning_rate": 1.0201687916882418e-05, + "loss": 0.3463, + "step": 852 + }, + { + "epoch": 0.8, + "grad_norm": 0.35108041763305664, + "learning_rate": 1.0109911650965314e-05, + "loss": 0.2141, + "step": 853 + }, + { + "epoch": 0.8, + "grad_norm": 0.5507402420043945, + "learning_rate": 1.0018503618578818e-05, + "loss": 0.521, + "step": 854 + }, + { + "epoch": 0.8, + "grad_norm": 0.40518254041671753, + "learning_rate": 9.927464663529118e-06, + "loss": 0.2749, + "step": 855 + }, + { + "epoch": 0.8, + "grad_norm": 0.4341948926448822, + "learning_rate": 9.836795626215356e-06, + "loss": 0.3684, + "step": 856 + }, + { + "epoch": 0.8, + "grad_norm": 0.36623236536979675, + "learning_rate": 9.746497343621857e-06, + "loss": 0.296, + "step": 857 + }, + { + "epoch": 0.8, + "grad_norm": 0.33644017577171326, + "learning_rate": 9.656570649310481e-06, + "loss": 0.2323, + "step": 858 + }, + { + "epoch": 0.81, + "grad_norm": 0.3651730418205261, + "learning_rate": 9.567016373412857e-06, + "loss": 0.3039, + "step": 859 + }, + { + "epoch": 0.81, + "grad_norm": 0.46405401825904846, + "learning_rate": 9.477835342622759e-06, + "loss": 0.3854, + "step": 860 + }, + { + "epoch": 0.81, + "grad_norm": 0.44806769490242004, + "learning_rate": 9.389028380188419e-06, + "loss": 0.3988, + "step": 861 + }, + { + "epoch": 0.81, + "grad_norm": 0.3805658519268036, + "learning_rate": 9.300596305905013e-06, + "loss": 0.2861, + "step": 862 + }, + { + "epoch": 0.81, + "grad_norm": 0.39312711358070374, + "learning_rate": 9.212539936107029e-06, + "loss": 0.2729, + "step": 863 + }, + { + "epoch": 0.81, + "grad_norm": 0.34711524844169617, + "learning_rate": 9.124860083660769e-06, + "loss": 0.2602, + "step": 864 + }, + { + "epoch": 0.81, + "grad_norm": 0.33725085854530334, + "learning_rate": 9.037557557956766e-06, + "loss": 0.1977, + "step": 865 + }, + { + "epoch": 0.81, + "grad_norm": 0.4098747670650482, + "learning_rate": 8.950633164902467e-06, + "loss": 0.3187, + "step": 866 + }, + { + "epoch": 0.81, + "grad_norm": 0.418828547000885, + "learning_rate": 8.86408770691462e-06, + "loss": 0.2814, + "step": 867 + }, + { + "epoch": 0.81, + "grad_norm": 0.33042341470718384, + "learning_rate": 8.777921982911996e-06, + "loss": 0.2307, + "step": 868 + }, + { + "epoch": 0.82, + "grad_norm": 0.3797352612018585, + "learning_rate": 8.692136788307903e-06, + "loss": 0.2654, + "step": 869 + }, + { + "epoch": 0.82, + "grad_norm": 0.3360169529914856, + "learning_rate": 8.606732915003002e-06, + "loss": 0.2483, + "step": 870 + }, + { + "epoch": 0.82, + "grad_norm": 0.36900776624679565, + "learning_rate": 8.521711151377803e-06, + "loss": 0.2554, + "step": 871 + }, + { + "epoch": 0.82, + "grad_norm": 0.4058022201061249, + "learning_rate": 8.437072282285535e-06, + "loss": 0.318, + "step": 872 + }, + { + "epoch": 0.82, + "grad_norm": 0.3715745210647583, + "learning_rate": 8.35281708904485e-06, + "loss": 0.2381, + "step": 873 + }, + { + "epoch": 0.82, + "grad_norm": 0.39454740285873413, + "learning_rate": 8.268946349432582e-06, + "loss": 0.3084, + "step": 874 + }, + { + "epoch": 0.82, + "grad_norm": 0.42099666595458984, + "learning_rate": 8.185460837676612e-06, + "loss": 0.292, + "step": 875 + }, + { + "epoch": 0.82, + "grad_norm": 0.45269036293029785, + "learning_rate": 8.102361324448715e-06, + "loss": 0.2864, + "step": 876 + }, + { + "epoch": 0.82, + "grad_norm": 0.41998958587646484, + "learning_rate": 8.019648576857425e-06, + "loss": 0.2996, + "step": 877 + }, + { + "epoch": 0.82, + "grad_norm": 0.4255835711956024, + "learning_rate": 7.937323358440935e-06, + "loss": 0.3387, + "step": 878 + }, + { + "epoch": 0.82, + "grad_norm": 0.352857381105423, + "learning_rate": 7.85538642916015e-06, + "loss": 0.2711, + "step": 879 + }, + { + "epoch": 0.83, + "grad_norm": 0.3418557047843933, + "learning_rate": 7.773838545391515e-06, + "loss": 0.2004, + "step": 880 + }, + { + "epoch": 0.83, + "grad_norm": 0.3853307068347931, + "learning_rate": 7.692680459920188e-06, + "loss": 0.2871, + "step": 881 + }, + { + "epoch": 0.83, + "grad_norm": 0.34913262724876404, + "learning_rate": 7.6119129219329395e-06, + "loss": 0.228, + "step": 882 + }, + { + "epoch": 0.83, + "grad_norm": 0.42278945446014404, + "learning_rate": 7.5315366770114195e-06, + "loss": 0.3265, + "step": 883 + }, + { + "epoch": 0.83, + "grad_norm": 0.4618382453918457, + "learning_rate": 7.4515524671250725e-06, + "loss": 0.2927, + "step": 884 + }, + { + "epoch": 0.83, + "grad_norm": 0.3939862549304962, + "learning_rate": 7.371961030624452e-06, + "loss": 0.2553, + "step": 885 + }, + { + "epoch": 0.83, + "grad_norm": 0.37247055768966675, + "learning_rate": 7.292763102234329e-06, + "loss": 0.278, + "step": 886 + }, + { + "epoch": 0.83, + "grad_norm": 0.3525467813014984, + "learning_rate": 7.213959413046894e-06, + "loss": 0.2045, + "step": 887 + }, + { + "epoch": 0.83, + "grad_norm": 0.34763529896736145, + "learning_rate": 7.135550690515052e-06, + "loss": 0.272, + "step": 888 + }, + { + "epoch": 0.83, + "grad_norm": 0.37334975600242615, + "learning_rate": 7.057537658445701e-06, + "loss": 0.2354, + "step": 889 + }, + { + "epoch": 0.83, + "grad_norm": 0.3018919825553894, + "learning_rate": 6.979921036993042e-06, + "loss": 0.1974, + "step": 890 + }, + { + "epoch": 0.84, + "grad_norm": 0.42736706137657166, + "learning_rate": 6.902701542651874e-06, + "loss": 0.2821, + "step": 891 + }, + { + "epoch": 0.84, + "grad_norm": 0.33115968108177185, + "learning_rate": 6.825879888251135e-06, + "loss": 0.179, + "step": 892 + }, + { + "epoch": 0.84, + "grad_norm": 0.38003167510032654, + "learning_rate": 6.749456782947122e-06, + "loss": 0.2484, + "step": 893 + }, + { + "epoch": 0.84, + "grad_norm": 0.40489086508750916, + "learning_rate": 6.6734329322171165e-06, + "loss": 0.3016, + "step": 894 + }, + { + "epoch": 0.84, + "grad_norm": 0.3853268027305603, + "learning_rate": 6.597809037852726e-06, + "loss": 0.3321, + "step": 895 + }, + { + "epoch": 0.84, + "grad_norm": 0.3887777626514435, + "learning_rate": 6.522585797953579e-06, + "loss": 0.2402, + "step": 896 + }, + { + "epoch": 0.84, + "grad_norm": 0.46126478910446167, + "learning_rate": 6.447763906920679e-06, + "loss": 0.3896, + "step": 897 + }, + { + "epoch": 0.84, + "grad_norm": 0.4578423798084259, + "learning_rate": 6.373344055450165e-06, + "loss": 0.3362, + "step": 898 + }, + { + "epoch": 0.84, + "grad_norm": 0.35297033190727234, + "learning_rate": 6.2993269305268495e-06, + "loss": 0.2309, + "step": 899 + }, + { + "epoch": 0.84, + "grad_norm": 0.41612932085990906, + "learning_rate": 6.2257132154178665e-06, + "loss": 0.3451, + "step": 900 + }, + { + "epoch": 0.85, + "grad_norm": 0.41109976172447205, + "learning_rate": 6.152503589666425e-06, + "loss": 0.3227, + "step": 901 + }, + { + "epoch": 0.85, + "grad_norm": 0.41766732931137085, + "learning_rate": 6.079698729085498e-06, + "loss": 0.3057, + "step": 902 + }, + { + "epoch": 0.85, + "grad_norm": 0.29895317554473877, + "learning_rate": 6.007299305751585e-06, + "loss": 0.1809, + "step": 903 + }, + { + "epoch": 0.85, + "grad_norm": 0.35016998648643494, + "learning_rate": 5.935305987998496e-06, + "loss": 0.2466, + "step": 904 + }, + { + "epoch": 0.85, + "grad_norm": 0.3651289939880371, + "learning_rate": 5.863719440411214e-06, + "loss": 0.2383, + "step": 905 + }, + { + "epoch": 0.85, + "grad_norm": 0.37916186451911926, + "learning_rate": 5.792540323819751e-06, + "loss": 0.3321, + "step": 906 + }, + { + "epoch": 0.85, + "grad_norm": 0.3627545237541199, + "learning_rate": 5.721769295293034e-06, + "loss": 0.2442, + "step": 907 + }, + { + "epoch": 0.85, + "grad_norm": 0.4374503791332245, + "learning_rate": 5.651407008132809e-06, + "loss": 0.3614, + "step": 908 + }, + { + "epoch": 0.85, + "grad_norm": 0.36250773072242737, + "learning_rate": 5.5814541118677284e-06, + "loss": 0.2129, + "step": 909 + }, + { + "epoch": 0.85, + "grad_norm": 0.41063565015792847, + "learning_rate": 5.5119112522471924e-06, + "loss": 0.3507, + "step": 910 + }, + { + "epoch": 0.85, + "grad_norm": 0.37300723791122437, + "learning_rate": 5.442779071235516e-06, + "loss": 0.252, + "step": 911 + }, + { + "epoch": 0.86, + "grad_norm": 0.33876121044158936, + "learning_rate": 5.374058207005944e-06, + "loss": 0.2375, + "step": 912 + }, + { + "epoch": 0.86, + "grad_norm": 0.38935375213623047, + "learning_rate": 5.305749293934764e-06, + "loss": 0.2552, + "step": 913 + }, + { + "epoch": 0.86, + "grad_norm": 0.42013880610466003, + "learning_rate": 5.237852962595469e-06, + "loss": 0.2975, + "step": 914 + }, + { + "epoch": 0.86, + "grad_norm": 0.3413439393043518, + "learning_rate": 5.170369839752925e-06, + "loss": 0.2017, + "step": 915 + }, + { + "epoch": 0.86, + "grad_norm": 0.3880120813846588, + "learning_rate": 5.1033005483575925e-06, + "loss": 0.2561, + "step": 916 + }, + { + "epoch": 0.86, + "grad_norm": 0.40935495495796204, + "learning_rate": 5.036645707539745e-06, + "loss": 0.2604, + "step": 917 + }, + { + "epoch": 0.86, + "grad_norm": 0.4027320444583893, + "learning_rate": 4.9704059326038055e-06, + "loss": 0.2602, + "step": 918 + }, + { + "epoch": 0.86, + "grad_norm": 0.3935575783252716, + "learning_rate": 4.90458183502262e-06, + "loss": 0.2899, + "step": 919 + }, + { + "epoch": 0.86, + "grad_norm": 0.4156310260295868, + "learning_rate": 4.839174022431858e-06, + "loss": 0.3689, + "step": 920 + }, + { + "epoch": 0.86, + "grad_norm": 0.4116358160972595, + "learning_rate": 4.7741830986243356e-06, + "loss": 0.3135, + "step": 921 + }, + { + "epoch": 0.86, + "grad_norm": 0.3684353828430176, + "learning_rate": 4.709609663544534e-06, + "loss": 0.2824, + "step": 922 + }, + { + "epoch": 0.87, + "grad_norm": 0.3770512044429779, + "learning_rate": 4.645454313282965e-06, + "loss": 0.2948, + "step": 923 + }, + { + "epoch": 0.87, + "grad_norm": 0.47205623984336853, + "learning_rate": 4.581717640070743e-06, + "loss": 0.3442, + "step": 924 + }, + { + "epoch": 0.87, + "grad_norm": 0.42810696363449097, + "learning_rate": 4.5184002322740785e-06, + "loss": 0.347, + "step": 925 + }, + { + "epoch": 0.87, + "grad_norm": 0.31437572836875916, + "learning_rate": 4.455502674388873e-06, + "loss": 0.2313, + "step": 926 + }, + { + "epoch": 0.87, + "grad_norm": 0.46491920948028564, + "learning_rate": 4.3930255470352736e-06, + "loss": 0.3452, + "step": 927 + }, + { + "epoch": 0.87, + "grad_norm": 0.4385717511177063, + "learning_rate": 4.330969426952375e-06, + "loss": 0.3261, + "step": 928 + }, + { + "epoch": 0.87, + "grad_norm": 0.396494060754776, + "learning_rate": 4.269334886992876e-06, + "loss": 0.281, + "step": 929 + }, + { + "epoch": 0.87, + "grad_norm": 0.38151490688323975, + "learning_rate": 4.208122496117744e-06, + "loss": 0.2138, + "step": 930 + }, + { + "epoch": 0.87, + "grad_norm": 0.3869413435459137, + "learning_rate": 4.147332819391048e-06, + "loss": 0.2732, + "step": 931 + }, + { + "epoch": 0.87, + "grad_norm": 0.4076899588108063, + "learning_rate": 4.0869664179746694e-06, + "loss": 0.2687, + "step": 932 + }, + { + "epoch": 0.88, + "grad_norm": 0.35533949732780457, + "learning_rate": 4.027023849123157e-06, + "loss": 0.2326, + "step": 933 + }, + { + "epoch": 0.88, + "grad_norm": 0.3902985751628876, + "learning_rate": 3.967505666178556e-06, + "loss": 0.2674, + "step": 934 + }, + { + "epoch": 0.88, + "grad_norm": 0.3649982213973999, + "learning_rate": 3.908412418565371e-06, + "loss": 0.2548, + "step": 935 + }, + { + "epoch": 0.88, + "grad_norm": 0.35166463255882263, + "learning_rate": 3.849744651785381e-06, + "loss": 0.246, + "step": 936 + }, + { + "epoch": 0.88, + "grad_norm": 0.3820720314979553, + "learning_rate": 3.7915029074126974e-06, + "loss": 0.3574, + "step": 937 + }, + { + "epoch": 0.88, + "grad_norm": 0.4962540566921234, + "learning_rate": 3.7336877230887246e-06, + "loss": 0.4256, + "step": 938 + }, + { + "epoch": 0.88, + "grad_norm": 0.4633179008960724, + "learning_rate": 3.676299632517216e-06, + "loss": 0.421, + "step": 939 + }, + { + "epoch": 0.88, + "grad_norm": 0.48635226488113403, + "learning_rate": 3.619339165459307e-06, + "loss": 0.3362, + "step": 940 + }, + { + "epoch": 0.88, + "grad_norm": 0.35124728083610535, + "learning_rate": 3.562806847728678e-06, + "loss": 0.2662, + "step": 941 + }, + { + "epoch": 0.88, + "grad_norm": 0.4047708213329315, + "learning_rate": 3.5067032011866783e-06, + "loss": 0.3547, + "step": 942 + }, + { + "epoch": 0.88, + "grad_norm": 0.30400896072387695, + "learning_rate": 3.4510287437374835e-06, + "loss": 0.1609, + "step": 943 + }, + { + "epoch": 0.89, + "grad_norm": 0.3707870841026306, + "learning_rate": 3.3957839893233536e-06, + "loss": 0.2381, + "step": 944 + }, + { + "epoch": 0.89, + "grad_norm": 0.3312855064868927, + "learning_rate": 3.340969447919873e-06, + "loss": 0.246, + "step": 945 + }, + { + "epoch": 0.89, + "grad_norm": 0.3357097804546356, + "learning_rate": 3.286585625531241e-06, + "loss": 0.2066, + "step": 946 + }, + { + "epoch": 0.89, + "grad_norm": 0.33315232396125793, + "learning_rate": 3.232633024185583e-06, + "loss": 0.2208, + "step": 947 + }, + { + "epoch": 0.89, + "grad_norm": 0.3273695707321167, + "learning_rate": 3.1791121419303794e-06, + "loss": 0.2188, + "step": 948 + }, + { + "epoch": 0.89, + "grad_norm": 0.40958088636398315, + "learning_rate": 3.1260234728277717e-06, + "loss": 0.2762, + "step": 949 + }, + { + "epoch": 0.89, + "grad_norm": 0.3858436644077301, + "learning_rate": 3.0733675069500865e-06, + "loss": 0.2863, + "step": 950 + }, + { + "epoch": 0.89, + "grad_norm": 0.4404066503047943, + "learning_rate": 3.0211447303752695e-06, + "loss": 0.2966, + "step": 951 + }, + { + "epoch": 0.89, + "grad_norm": 0.3974824845790863, + "learning_rate": 2.9693556251824185e-06, + "loss": 0.2889, + "step": 952 + }, + { + "epoch": 0.89, + "grad_norm": 0.4611467719078064, + "learning_rate": 2.9180006694472906e-06, + "loss": 0.4199, + "step": 953 + }, + { + "epoch": 0.89, + "grad_norm": 0.4060475826263428, + "learning_rate": 2.867080337237954e-06, + "loss": 0.3649, + "step": 954 + }, + { + "epoch": 0.9, + "grad_norm": 0.43869730830192566, + "learning_rate": 2.8165950986103805e-06, + "loss": 0.3723, + "step": 955 + }, + { + "epoch": 0.9, + "grad_norm": 0.4404228925704956, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.3284, + "step": 956 + }, + { + "epoch": 0.9, + "grad_norm": 0.40427085757255554, + "learning_rate": 2.716931762237801e-06, + "loss": 0.2748, + "step": 957 + }, + { + "epoch": 0.9, + "grad_norm": 0.41850516200065613, + "learning_rate": 2.667754584505372e-06, + "loss": 0.3376, + "step": 958 + }, + { + "epoch": 0.9, + "grad_norm": 0.49761441349983215, + "learning_rate": 2.6190143403713174e-06, + "loss": 0.3464, + "step": 959 + }, + { + "epoch": 0.9, + "grad_norm": 0.42231816053390503, + "learning_rate": 2.5707114797667465e-06, + "loss": 0.2808, + "step": 960 + }, + { + "epoch": 0.9, + "grad_norm": 0.4168257713317871, + "learning_rate": 2.522846448585231e-06, + "loss": 0.3563, + "step": 961 + }, + { + "epoch": 0.9, + "grad_norm": 0.44141292572021484, + "learning_rate": 2.4754196886785986e-06, + "loss": 0.2718, + "step": 962 + }, + { + "epoch": 0.9, + "grad_norm": 0.3264952600002289, + "learning_rate": 2.4284316378529404e-06, + "loss": 0.2626, + "step": 963 + }, + { + "epoch": 0.9, + "grad_norm": 0.3807451128959656, + "learning_rate": 2.3818827298645207e-06, + "loss": 0.2457, + "step": 964 + }, + { + "epoch": 0.91, + "grad_norm": 0.3312126696109772, + "learning_rate": 2.335773394415802e-06, + "loss": 0.2161, + "step": 965 + }, + { + "epoch": 0.91, + "grad_norm": 0.3949142396450043, + "learning_rate": 2.2901040571514322e-06, + "loss": 0.2985, + "step": 966 + }, + { + "epoch": 0.91, + "grad_norm": 0.4175260066986084, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.2932, + "step": 967 + }, + { + "epoch": 0.91, + "grad_norm": 0.33168932795524597, + "learning_rate": 2.2000870594419908e-06, + "loss": 0.1803, + "step": 968 + }, + { + "epoch": 0.91, + "grad_norm": 0.3617693781852722, + "learning_rate": 2.155740229962161e-06, + "loss": 0.2597, + "step": 969 + }, + { + "epoch": 0.91, + "grad_norm": 0.3556663990020752, + "learning_rate": 2.1118350605894955e-06, + "loss": 0.3051, + "step": 970 + }, + { + "epoch": 0.91, + "grad_norm": 0.36153483390808105, + "learning_rate": 2.068371956621562e-06, + "loss": 0.2594, + "step": 971 + }, + { + "epoch": 0.91, + "grad_norm": 0.42352989315986633, + "learning_rate": 2.0253513192751373e-06, + "loss": 0.3176, + "step": 972 + }, + { + "epoch": 0.91, + "grad_norm": 0.38619571924209595, + "learning_rate": 1.982773545682459e-06, + "loss": 0.291, + "step": 973 + }, + { + "epoch": 0.91, + "grad_norm": 0.3379153311252594, + "learning_rate": 1.9406390288876586e-06, + "loss": 0.2215, + "step": 974 + }, + { + "epoch": 0.91, + "grad_norm": 0.3745526671409607, + "learning_rate": 1.8989481578430223e-06, + "loss": 0.243, + "step": 975 + }, + { + "epoch": 0.92, + "grad_norm": 0.4201480448246002, + "learning_rate": 1.8577013174054857e-06, + "loss": 0.3248, + "step": 976 + }, + { + "epoch": 0.92, + "grad_norm": 0.40764811635017395, + "learning_rate": 1.8168988883330185e-06, + "loss": 0.3478, + "step": 977 + }, + { + "epoch": 0.92, + "grad_norm": 0.46223610639572144, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.4414, + "step": 978 + }, + { + "epoch": 0.92, + "grad_norm": 0.3968757092952728, + "learning_rate": 1.7366287667995417e-06, + "loss": 0.2254, + "step": 979 + }, + { + "epoch": 0.92, + "grad_norm": 0.37118399143218994, + "learning_rate": 1.697161815328363e-06, + "loss": 0.2569, + "step": 980 + }, + { + "epoch": 0.92, + "grad_norm": 0.3366474509239197, + "learning_rate": 1.6581407571951092e-06, + "loss": 0.2767, + "step": 981 + }, + { + "epoch": 0.92, + "grad_norm": 0.35800257325172424, + "learning_rate": 1.6195659526111185e-06, + "loss": 0.2592, + "step": 982 + }, + { + "epoch": 0.92, + "grad_norm": 0.41821491718292236, + "learning_rate": 1.5814377576682527e-06, + "loss": 0.3047, + "step": 983 + }, + { + "epoch": 0.92, + "grad_norm": 0.35340431332588196, + "learning_rate": 1.5437565243356656e-06, + "loss": 0.1947, + "step": 984 + }, + { + "epoch": 0.92, + "grad_norm": 0.4180166721343994, + "learning_rate": 1.5065226004564893e-06, + "loss": 0.3728, + "step": 985 + }, + { + "epoch": 0.92, + "grad_norm": 0.3838116526603699, + "learning_rate": 1.4697363297446477e-06, + "loss": 0.2781, + "step": 986 + }, + { + "epoch": 0.93, + "grad_norm": 0.4529761075973511, + "learning_rate": 1.4333980517817203e-06, + "loss": 0.3834, + "step": 987 + }, + { + "epoch": 0.93, + "grad_norm": 0.34723663330078125, + "learning_rate": 1.3975081020137392e-06, + "loss": 0.1994, + "step": 988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5195287466049194, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.3468, + "step": 989 + }, + { + "epoch": 0.93, + "grad_norm": 0.37333688139915466, + "learning_rate": 1.3270745081506997e-06, + "loss": 0.3263, + "step": 990 + }, + { + "epoch": 0.93, + "grad_norm": 0.5042914152145386, + "learning_rate": 1.292531514242501e-06, + "loss": 0.4647, + "step": 991 + }, + { + "epoch": 0.93, + "grad_norm": 0.4056055247783661, + "learning_rate": 1.2584381488969454e-06, + "loss": 0.297, + "step": 992 + }, + { + "epoch": 0.93, + "grad_norm": 0.3029918968677521, + "learning_rate": 1.2247947268368364e-06, + "loss": 0.2179, + "step": 993 + }, + { + "epoch": 0.93, + "grad_norm": 0.390737384557724, + "learning_rate": 1.191601558631461e-06, + "loss": 0.257, + "step": 994 + }, + { + "epoch": 0.93, + "grad_norm": 0.35613298416137695, + "learning_rate": 1.1588589506937198e-06, + "loss": 0.2713, + "step": 995 + }, + { + "epoch": 0.93, + "grad_norm": 0.5220173597335815, + "learning_rate": 1.126567205277279e-06, + "loss": 0.3513, + "step": 996 + }, + { + "epoch": 0.94, + "grad_norm": 0.3554520308971405, + "learning_rate": 1.094726620473835e-06, + "loss": 0.2744, + "step": 997 + }, + { + "epoch": 0.94, + "grad_norm": 0.4104813039302826, + "learning_rate": 1.0633374902103088e-06, + "loss": 0.3098, + "step": 998 + }, + { + "epoch": 0.94, + "grad_norm": 0.3933941423892975, + "learning_rate": 1.0324001042461395e-06, + "loss": 0.2521, + "step": 999 + }, + { + "epoch": 0.94, + "grad_norm": 0.4088434875011444, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.3249, + "step": 1000 + }, + { + "epoch": 0.94, + "grad_norm": 0.4932601749897003, + "learning_rate": 9.718817034003901e-07, + "loss": 0.4119, + "step": 1001 + }, + { + "epoch": 0.94, + "grad_norm": 0.33127981424331665, + "learning_rate": 9.423012471764914e-07, + "loss": 0.1965, + "step": 1002 + }, + { + "epoch": 0.94, + "grad_norm": 0.35370326042175293, + "learning_rate": 9.131736525621603e-07, + "loss": 0.2197, + "step": 1003 + }, + { + "epoch": 0.94, + "grad_norm": 0.40955832600593567, + "learning_rate": 8.844991884401854e-07, + "loss": 0.2728, + "step": 1004 + }, + { + "epoch": 0.94, + "grad_norm": 0.3946859538555145, + "learning_rate": 8.56278119510362e-07, + "loss": 0.2659, + "step": 1005 + }, + { + "epoch": 0.94, + "grad_norm": 0.4458279609680176, + "learning_rate": 8.285107062871333e-07, + "loss": 0.348, + "step": 1006 + }, + { + "epoch": 0.94, + "grad_norm": 0.35338565707206726, + "learning_rate": 8.011972050971483e-07, + "loss": 0.2703, + "step": 1007 + }, + { + "epoch": 0.95, + "grad_norm": 0.39034345746040344, + "learning_rate": 7.74337868076902e-07, + "loss": 0.3125, + "step": 1008 + }, + { + "epoch": 0.95, + "grad_norm": 0.44654595851898193, + "learning_rate": 7.479329431703985e-07, + "loss": 0.399, + "step": 1009 + }, + { + "epoch": 0.95, + "grad_norm": 0.4880700409412384, + "learning_rate": 7.21982674126881e-07, + "loss": 0.3532, + "step": 1010 + }, + { + "epoch": 0.95, + "grad_norm": 0.35642099380493164, + "learning_rate": 6.964873004985717e-07, + "loss": 0.2302, + "step": 1011 + }, + { + "epoch": 0.95, + "grad_norm": 0.4704366624355316, + "learning_rate": 6.714470576384579e-07, + "loss": 0.3791, + "step": 1012 + }, + { + "epoch": 0.95, + "grad_norm": 0.3759339153766632, + "learning_rate": 6.468621766981154e-07, + "loss": 0.2714, + "step": 1013 + }, + { + "epoch": 0.95, + "grad_norm": 0.3735077381134033, + "learning_rate": 6.227328846255931e-07, + "loss": 0.2643, + "step": 1014 + }, + { + "epoch": 0.95, + "grad_norm": 0.36363470554351807, + "learning_rate": 5.990594041632991e-07, + "loss": 0.2521, + "step": 1015 + }, + { + "epoch": 0.95, + "grad_norm": 0.3581157624721527, + "learning_rate": 5.758419538459459e-07, + "loss": 0.268, + "step": 1016 + }, + { + "epoch": 0.95, + "grad_norm": 0.4589844048023224, + "learning_rate": 5.530807479985633e-07, + "loss": 0.3436, + "step": 1017 + }, + { + "epoch": 0.95, + "grad_norm": 0.32665497064590454, + "learning_rate": 5.307759967344672e-07, + "loss": 0.2617, + "step": 1018 + }, + { + "epoch": 0.96, + "grad_norm": 0.4270218312740326, + "learning_rate": 5.089279059533658e-07, + "loss": 0.3261, + "step": 1019 + }, + { + "epoch": 0.96, + "grad_norm": 0.4280019998550415, + "learning_rate": 4.87536677339423e-07, + "loss": 0.348, + "step": 1020 + }, + { + "epoch": 0.96, + "grad_norm": 0.3628242313861847, + "learning_rate": 4.666025083594483e-07, + "loss": 0.2567, + "step": 1021 + }, + { + "epoch": 0.96, + "grad_norm": 0.40238940715789795, + "learning_rate": 4.461255922609986e-07, + "loss": 0.301, + "step": 1022 + }, + { + "epoch": 0.96, + "grad_norm": 0.4348450005054474, + "learning_rate": 4.261061180706627e-07, + "loss": 0.3572, + "step": 1023 + }, + { + "epoch": 0.96, + "grad_norm": 0.3945859968662262, + "learning_rate": 4.065442705922906e-07, + "loss": 0.3631, + "step": 1024 + }, + { + "epoch": 0.96, + "grad_norm": 0.4079526662826538, + "learning_rate": 3.8744023040528374e-07, + "loss": 0.2975, + "step": 1025 + }, + { + "epoch": 0.96, + "grad_norm": 0.4182088375091553, + "learning_rate": 3.687941738629186e-07, + "loss": 0.3489, + "step": 1026 + }, + { + "epoch": 0.96, + "grad_norm": 0.3973821997642517, + "learning_rate": 3.5060627309074224e-07, + "loss": 0.2775, + "step": 1027 + }, + { + "epoch": 0.96, + "grad_norm": 0.406399667263031, + "learning_rate": 3.3287669598497383e-07, + "loss": 0.2776, + "step": 1028 + }, + { + "epoch": 0.97, + "grad_norm": 0.44541242718696594, + "learning_rate": 3.156056062109503e-07, + "loss": 0.3415, + "step": 1029 + }, + { + "epoch": 0.97, + "grad_norm": 0.3318565785884857, + "learning_rate": 2.987931632016272e-07, + "loss": 0.2179, + "step": 1030 + }, + { + "epoch": 0.97, + "grad_norm": 0.4106653332710266, + "learning_rate": 2.824395221560805e-07, + "loss": 0.2856, + "step": 1031 + }, + { + "epoch": 0.97, + "grad_norm": 0.3946784436702728, + "learning_rate": 2.665448340381016e-07, + "loss": 0.2658, + "step": 1032 + }, + { + "epoch": 0.97, + "grad_norm": 0.3629414141178131, + "learning_rate": 2.511092455747932e-07, + "loss": 0.2869, + "step": 1033 + }, + { + "epoch": 0.97, + "grad_norm": 0.37011194229125977, + "learning_rate": 2.361328992552314e-07, + "loss": 0.2481, + "step": 1034 + }, + { + "epoch": 0.97, + "grad_norm": 0.47178444266319275, + "learning_rate": 2.2161593332910013e-07, + "loss": 0.3622, + "step": 1035 + }, + { + "epoch": 0.97, + "grad_norm": 0.44122573733329773, + "learning_rate": 2.0755848180547543e-07, + "loss": 0.3903, + "step": 1036 + }, + { + "epoch": 0.97, + "grad_norm": 0.32538658380508423, + "learning_rate": 1.9396067445155986e-07, + "loss": 0.2214, + "step": 1037 + }, + { + "epoch": 0.97, + "grad_norm": 0.43575894832611084, + "learning_rate": 1.8082263679148337e-07, + "loss": 0.3991, + "step": 1038 + }, + { + "epoch": 0.97, + "grad_norm": 0.42153045535087585, + "learning_rate": 1.681444901051432e-07, + "loss": 0.3468, + "step": 1039 + }, + { + "epoch": 0.98, + "grad_norm": 0.39070653915405273, + "learning_rate": 1.5592635142709367e-07, + "loss": 0.2821, + "step": 1040 + }, + { + "epoch": 0.98, + "grad_norm": 0.373835951089859, + "learning_rate": 1.4416833354546356e-07, + "loss": 0.2638, + "step": 1041 + }, + { + "epoch": 0.98, + "grad_norm": 0.38927844166755676, + "learning_rate": 1.328705450009071e-07, + "loss": 0.2571, + "step": 1042 + }, + { + "epoch": 0.98, + "grad_norm": 0.4438738226890564, + "learning_rate": 1.2203309008561592e-07, + "loss": 0.3562, + "step": 1043 + }, + { + "epoch": 0.98, + "grad_norm": 0.4671138525009155, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.3204, + "step": 1044 + }, + { + "epoch": 0.98, + "grad_norm": 0.43793073296546936, + "learning_rate": 1.0173957706348659e-07, + "loss": 0.3208, + "step": 1045 + }, + { + "epoch": 0.98, + "grad_norm": 0.40344715118408203, + "learning_rate": 9.228370629019711e-08, + "loss": 0.2873, + "step": 1046 + }, + { + "epoch": 0.98, + "grad_norm": 0.425571471452713, + "learning_rate": 8.328854381154938e-08, + "loss": 0.3374, + "step": 1047 + }, + { + "epoch": 0.98, + "grad_norm": 0.3397400677204132, + "learning_rate": 7.475417266371576e-08, + "loss": 0.2717, + "step": 1048 + }, + { + "epoch": 0.98, + "grad_norm": 0.3563021719455719, + "learning_rate": 6.668067162921566e-08, + "loss": 0.2625, + "step": 1049 + }, + { + "epoch": 0.98, + "grad_norm": 0.39349859952926636, + "learning_rate": 5.906811523618272e-08, + "loss": 0.2895, + "step": 1050 + }, + { + "epoch": 0.99, + "grad_norm": 0.44752970337867737, + "learning_rate": 5.191657375767656e-08, + "loss": 0.3835, + "step": 1051 + }, + { + "epoch": 0.99, + "grad_norm": 0.40532049536705017, + "learning_rate": 4.522611321103321e-08, + "loss": 0.2817, + "step": 1052 + }, + { + "epoch": 0.99, + "grad_norm": 0.35706475377082825, + "learning_rate": 3.8996795357254535e-08, + "loss": 0.2191, + "step": 1053 + }, + { + "epoch": 0.99, + "grad_norm": 0.5339099168777466, + "learning_rate": 3.322867770044202e-08, + "loss": 0.3686, + "step": 1054 + }, + { + "epoch": 0.99, + "grad_norm": 0.47053080797195435, + "learning_rate": 2.792181348726941e-08, + "loss": 0.3318, + "step": 1055 + }, + { + "epoch": 0.99, + "grad_norm": 0.4522939920425415, + "learning_rate": 2.3076251706477536e-08, + "loss": 0.3358, + "step": 1056 + }, + { + "epoch": 0.99, + "grad_norm": 0.46299952268600464, + "learning_rate": 1.869203708843581e-08, + "loss": 0.3769, + "step": 1057 + }, + { + "epoch": 0.99, + "grad_norm": 0.3941827714443207, + "learning_rate": 1.476921010471477e-08, + "loss": 0.3189, + "step": 1058 + }, + { + "epoch": 0.99, + "grad_norm": 0.3861558139324188, + "learning_rate": 1.1307806967741919e-08, + "loss": 0.2869, + "step": 1059 + }, + { + "epoch": 0.99, + "grad_norm": 0.32701894640922546, + "learning_rate": 8.307859630429793e-09, + "loss": 0.2303, + "step": 1060 + }, + { + "epoch": 1.0, + "grad_norm": 0.44485971331596375, + "learning_rate": 5.7693957858984125e-09, + "loss": 0.3537, + "step": 1061 + }, + { + "epoch": 1.0, + "grad_norm": 0.4371122419834137, + "learning_rate": 3.6924388672254785e-09, + "loss": 0.3147, + "step": 1062 + }, + { + "epoch": 1.0, + "grad_norm": 0.37674111127853394, + "learning_rate": 2.0770080472298783e-09, + "loss": 0.286, + "step": 1063 + }, + { + "epoch": 1.0, + "grad_norm": 0.48630061745643616, + "learning_rate": 9.231182382773984e-10, + "loss": 0.3817, + "step": 1064 + }, + { + "epoch": 1.0, + "grad_norm": 0.37980979681015015, + "learning_rate": 2.307800921641512e-10, + "loss": 0.283, + "step": 1065 + }, + { + "epoch": 1.0, + "grad_norm": 0.4119957387447357, + "learning_rate": 0.0, + "loss": 0.2733, + "step": 1066 + }, + { + "epoch": 1.0, + "step": 1066, + "total_flos": 1.171159722670162e+17, + "train_loss": 0.3121360269205655, + "train_runtime": 51953.7782, + "train_samples_per_second": 0.657, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 1.0, + "max_steps": 1066, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.171159722670162e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}