diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/README.md b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..bdf910678210d8ccdf49a4409f54a5815db54f77 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "q_proj", + "k_proj", + "o_proj", + "gate_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_model.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..04a05f0c6693085b6b1fe856c3e75d8307fd9be3 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7d6c3ea53b723a805f00cdad89353275883987aaba37e6b1eb5c9792ec4fde7 +size 692127130 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/config.json new file mode 100644 index 0000000000000000000000000000000000000000..296c88047f8e2b39d47783382a227fda663ab28a --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/config.json @@ -0,0 +1,68 @@ +{ + "_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "add_faster_video": false, + "add_time_instruction": false, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": false, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_crop_resolution": null, + "image_grid_pinpoints": null, + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "no_token", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": "streaming_agg", + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": null, + "mm_streaming_frames_per_chunk": 4, + "mm_streaming_input_dim": 1152, + "mm_streaming_mlp_ratio": 4.0, + "mm_streaming_num_heads": 8, + "mm_streaming_num_layers": 4, + "mm_streaming_num_state_tokens": 2048, + "mm_streaming_patches_per_frame": 729, + "mm_streaming_state_dim": 1152, + "mm_streaming_vision_chunk_size": 8, + "mm_tunable_parts": "mm_vision_resampler", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava_qwen", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 8192, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_tower_pretrained": null, + "vocab_size": 152064 +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/generation_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/non_lora_trainables.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..79e0264811faf21b6cd2191fec21518e015bd0fc --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33845fa64adaa245f31c2ee34d19c4324d2bc3d1624b344301c3863c804ae76d +size 177478484 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/trainer_state.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..ecbae5742f44c72311dd26e5aefb8e9d6f722d62 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_only_lora_agg_fc4_s2048/trainer_state.json @@ -0,0 +1,7492 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999413730433253, + "eval_steps": 500, + "global_step": 1066, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 18.367340087890625, + "learning_rate": 3.125e-06, + "loss": 1.1997, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 20.36041259765625, + "learning_rate": 6.25e-06, + "loss": 1.2617, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 15.435030937194824, + "learning_rate": 9.375000000000001e-06, + "loss": 1.1302, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 11.671319007873535, + "learning_rate": 1.25e-05, + "loss": 0.9212, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 8.698945045471191, + "learning_rate": 1.5625e-05, + "loss": 0.8626, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 4.7610063552856445, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.6768, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 2.763170003890991, + "learning_rate": 2.1875e-05, + "loss": 0.554, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.646397352218628, + "learning_rate": 2.5e-05, + "loss": 0.4462, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 1.2087757587432861, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.4335, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 1.0779973268508911, + "learning_rate": 3.125e-05, + "loss": 0.3426, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.8232861757278442, + "learning_rate": 3.4375e-05, + "loss": 0.2855, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 0.7847572565078735, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.429, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 0.9708127975463867, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.5351, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 1.680749535560608, + "learning_rate": 4.375e-05, + "loss": 0.3552, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.754399299621582, + "learning_rate": 4.6875e-05, + "loss": 0.4887, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.8019405007362366, + "learning_rate": 5e-05, + "loss": 0.3746, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 3.1756885051727295, + "learning_rate": 5.3125000000000004e-05, + "loss": 0.2939, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 1.1910041570663452, + "learning_rate": 5.6250000000000005e-05, + "loss": 0.3498, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 0.7563303709030151, + "learning_rate": 5.9375e-05, + "loss": 0.3151, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 0.6908413767814636, + "learning_rate": 6.25e-05, + "loss": 0.427, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.5701287388801575, + "learning_rate": 6.562500000000001e-05, + "loss": 0.3341, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 0.6663073301315308, + "learning_rate": 6.875e-05, + "loss": 0.4151, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 0.647335410118103, + "learning_rate": 7.1875e-05, + "loss": 0.3664, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 1.250497817993164, + "learning_rate": 7.500000000000001e-05, + "loss": 0.3233, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.6220181584358215, + "learning_rate": 7.8125e-05, + "loss": 0.3933, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.6255826354026794, + "learning_rate": 8.125000000000001e-05, + "loss": 0.4192, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 0.7323064804077148, + "learning_rate": 8.4375e-05, + "loss": 0.4033, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 0.8183202147483826, + "learning_rate": 8.75e-05, + "loss": 0.3325, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 0.56742924451828, + "learning_rate": 9.062500000000001e-05, + "loss": 0.3638, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 0.6514180898666382, + "learning_rate": 9.375e-05, + "loss": 0.4459, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.5290864109992981, + "learning_rate": 9.687500000000001e-05, + "loss": 0.3277, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 0.47609055042266846, + "learning_rate": 0.0001, + "loss": 0.2922, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 0.5185883641242981, + "learning_rate": 9.999976921990784e-05, + "loss": 0.3217, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 0.5954408049583435, + "learning_rate": 9.999907688176173e-05, + "loss": 0.4124, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 0.4328826069831848, + "learning_rate": 9.999792299195278e-05, + "loss": 0.2708, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 0.629949152469635, + "learning_rate": 9.999630756113278e-05, + "loss": 0.3774, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 0.4580020010471344, + "learning_rate": 9.999423060421411e-05, + "loss": 0.3097, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 0.48158058524131775, + "learning_rate": 9.999169214036958e-05, + "loss": 0.3514, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 0.4538012742996216, + "learning_rate": 9.998869219303227e-05, + "loss": 0.3093, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 0.5379505753517151, + "learning_rate": 9.998523078989529e-05, + "loss": 0.3777, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.445813924074173, + "learning_rate": 9.998130796291156e-05, + "loss": 0.3601, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 0.5426644682884216, + "learning_rate": 9.997692374829352e-05, + "loss": 0.3143, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 0.4094751179218292, + "learning_rate": 9.997207818651274e-05, + "loss": 0.2912, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 0.4211678206920624, + "learning_rate": 9.996677132229957e-05, + "loss": 0.3235, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 0.4476563334465027, + "learning_rate": 9.996100320464274e-05, + "loss": 0.3559, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 0.4389052093029022, + "learning_rate": 9.995477388678897e-05, + "loss": 0.2772, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 0.5050854086875916, + "learning_rate": 9.994808342624234e-05, + "loss": 0.2992, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 0.39739540219306946, + "learning_rate": 9.994093188476382e-05, + "loss": 0.2832, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 0.4640009105205536, + "learning_rate": 9.993331932837079e-05, + "loss": 0.355, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.5145576596260071, + "learning_rate": 9.992524582733629e-05, + "loss": 0.2745, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 0.4169548451900482, + "learning_rate": 9.991671145618846e-05, + "loss": 0.2376, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 0.46700263023376465, + "learning_rate": 9.99077162937098e-05, + "loss": 0.3257, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 0.4765835702419281, + "learning_rate": 9.989826042293652e-05, + "loss": 0.3293, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 0.48216092586517334, + "learning_rate": 9.988834393115767e-05, + "loss": 0.3152, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 0.518074631690979, + "learning_rate": 9.987796690991439e-05, + "loss": 0.3669, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 0.4637100100517273, + "learning_rate": 9.98671294549991e-05, + "loss": 0.3148, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 0.43243342638015747, + "learning_rate": 9.985583166645455e-05, + "loss": 0.2475, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 0.4122597277164459, + "learning_rate": 9.98440736485729e-05, + "loss": 0.2878, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 0.4131738841533661, + "learning_rate": 9.983185550989487e-05, + "loss": 0.2217, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.4752655327320099, + "learning_rate": 9.981917736320851e-05, + "loss": 0.3538, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 0.37928274273872375, + "learning_rate": 9.980603932554845e-05, + "loss": 0.1967, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 0.4203208386898041, + "learning_rate": 9.979244151819453e-05, + "loss": 0.2483, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 0.49629998207092285, + "learning_rate": 9.97783840666709e-05, + "loss": 0.3649, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 0.4602612853050232, + "learning_rate": 9.976386710074478e-05, + "loss": 0.3001, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 0.43678635358810425, + "learning_rate": 9.974889075442521e-05, + "loss": 0.2301, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 0.49575650691986084, + "learning_rate": 9.97334551659619e-05, + "loss": 0.3064, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 0.4498412609100342, + "learning_rate": 9.971756047784393e-05, + "loss": 0.3118, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 0.4476684629917145, + "learning_rate": 9.970120683679838e-05, + "loss": 0.3873, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 0.4587801396846771, + "learning_rate": 9.968439439378905e-05, + "loss": 0.3894, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 0.4500073790550232, + "learning_rate": 9.966712330401504e-05, + "loss": 0.3612, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 0.4947085976600647, + "learning_rate": 9.964939372690926e-05, + "loss": 0.4023, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 0.4781345725059509, + "learning_rate": 9.96312058261371e-05, + "loss": 0.3429, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 0.43458712100982666, + "learning_rate": 9.961255976959473e-05, + "loss": 0.3179, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 0.49464428424835205, + "learning_rate": 9.959345572940771e-05, + "loss": 0.4133, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 0.38269466161727905, + "learning_rate": 9.957389388192935e-05, + "loss": 0.2905, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 0.43023020029067993, + "learning_rate": 9.9553874407739e-05, + "loss": 0.2674, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 0.4165527820587158, + "learning_rate": 9.953339749164057e-05, + "loss": 0.3692, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 0.44769737124443054, + "learning_rate": 9.951246332266057e-05, + "loss": 0.3606, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 0.45688146352767944, + "learning_rate": 9.949107209404665e-05, + "loss": 0.4039, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 0.4197036623954773, + "learning_rate": 9.946922400326554e-05, + "loss": 0.377, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 0.4688505232334137, + "learning_rate": 9.944691925200145e-05, + "loss": 0.3651, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 0.4570940136909485, + "learning_rate": 9.942415804615406e-05, + "loss": 0.3847, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 0.4865867495536804, + "learning_rate": 9.940094059583671e-05, + "loss": 0.3874, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 0.4403970539569855, + "learning_rate": 9.937726711537442e-05, + "loss": 0.2722, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 0.41962695121765137, + "learning_rate": 9.93531378233019e-05, + "loss": 0.341, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 0.5067736506462097, + "learning_rate": 9.932855294236154e-05, + "loss": 0.3613, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 0.5133152604103088, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4087, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 0.4906843602657318, + "learning_rate": 9.927801732587312e-05, + "loss": 0.3011, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 0.3938533067703247, + "learning_rate": 9.925206705682962e-05, + "loss": 0.2591, + "step": 89 + }, + { + "epoch": 0.08, + "grad_norm": 0.5528268814086914, + "learning_rate": 9.92256621319231e-05, + "loss": 0.4683, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 0.47138574719429016, + "learning_rate": 9.919880279490286e-05, + "loss": 0.3129, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 0.3691083788871765, + "learning_rate": 9.917148929371288e-05, + "loss": 0.1754, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 0.49399805068969727, + "learning_rate": 9.914372188048964e-05, + "loss": 0.3776, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 0.39202970266342163, + "learning_rate": 9.911550081155983e-05, + "loss": 0.2701, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 0.4894983768463135, + "learning_rate": 9.908682634743784e-05, + "loss": 0.3881, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 0.41826292872428894, + "learning_rate": 9.905769875282352e-05, + "loss": 0.2901, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 0.44304174184799194, + "learning_rate": 9.902811829659961e-05, + "loss": 0.3045, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 0.5131700038909912, + "learning_rate": 9.899808525182935e-05, + "loss": 0.3788, + "step": 98 + }, + { + "epoch": 0.09, + "grad_norm": 0.3887397348880768, + "learning_rate": 9.896759989575386e-05, + "loss": 0.2958, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 0.463975727558136, + "learning_rate": 9.893666250978971e-05, + "loss": 0.379, + "step": 100 + }, + { + "epoch": 0.09, + "grad_norm": 0.3379839062690735, + "learning_rate": 9.890527337952617e-05, + "loss": 0.203, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 0.4153408110141754, + "learning_rate": 9.887343279472272e-05, + "loss": 0.2995, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 0.46450096368789673, + "learning_rate": 9.884114104930628e-05, + "loss": 0.3221, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 0.39753568172454834, + "learning_rate": 9.880839844136854e-05, + "loss": 0.2657, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 0.44558021426200867, + "learning_rate": 9.877520527316317e-05, + "loss": 0.3989, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 0.3921840786933899, + "learning_rate": 9.874156185110306e-05, + "loss": 0.3058, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 0.4834759533405304, + "learning_rate": 9.870746848575751e-05, + "loss": 0.3249, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 0.45938247442245483, + "learning_rate": 9.86729254918493e-05, + "loss": 0.279, + "step": 108 + }, + { + "epoch": 0.1, + "grad_norm": 0.39709120988845825, + "learning_rate": 9.863793318825186e-05, + "loss": 0.2953, + "step": 109 + }, + { + "epoch": 0.1, + "grad_norm": 0.4169032573699951, + "learning_rate": 9.860249189798627e-05, + "loss": 0.409, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 0.3651694357395172, + "learning_rate": 9.856660194821829e-05, + "loss": 0.2225, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 0.4336308240890503, + "learning_rate": 9.853026367025535e-05, + "loss": 0.3209, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 0.3963097631931305, + "learning_rate": 9.849347739954352e-05, + "loss": 0.2915, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 0.5299807190895081, + "learning_rate": 9.845624347566433e-05, + "loss": 0.4689, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 0.37367191910743713, + "learning_rate": 9.841856224233174e-05, + "loss": 0.2219, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 0.473271906375885, + "learning_rate": 9.83804340473889e-05, + "loss": 0.4204, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 0.41578221321105957, + "learning_rate": 9.83418592428049e-05, + "loss": 0.3003, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 0.4305388033390045, + "learning_rate": 9.830283818467163e-05, + "loss": 0.3411, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 0.4396352767944336, + "learning_rate": 9.826337123320046e-05, + "loss": 0.328, + "step": 119 + }, + { + "epoch": 0.11, + "grad_norm": 0.44212085008621216, + "learning_rate": 9.822345875271883e-05, + "loss": 0.343, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 0.4692837595939636, + "learning_rate": 9.818310111166699e-05, + "loss": 0.3424, + "step": 121 + }, + { + "epoch": 0.11, + "grad_norm": 0.5092015266418457, + "learning_rate": 9.814229868259452e-05, + "loss": 0.3566, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 0.4727444052696228, + "learning_rate": 9.810105184215699e-05, + "loss": 0.4161, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 0.42226845026016235, + "learning_rate": 9.805936097111234e-05, + "loss": 0.2661, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 0.5039162635803223, + "learning_rate": 9.801722645431754e-05, + "loss": 0.4034, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 0.4233477711677551, + "learning_rate": 9.797464868072488e-05, + "loss": 0.3068, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 0.4316577613353729, + "learning_rate": 9.793162804337845e-05, + "loss": 0.3214, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 0.44522222876548767, + "learning_rate": 9.788816493941051e-05, + "loss": 0.346, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 0.44972801208496094, + "learning_rate": 9.784425977003784e-05, + "loss": 0.357, + "step": 129 + }, + { + "epoch": 0.12, + "grad_norm": 0.4264127016067505, + "learning_rate": 9.779991294055802e-05, + "loss": 0.3754, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 0.3643350303173065, + "learning_rate": 9.775512486034563e-05, + "loss": 0.2567, + "step": 131 + }, + { + "epoch": 0.12, + "grad_norm": 0.5998265147209167, + "learning_rate": 9.770989594284857e-05, + "loss": 0.5771, + "step": 132 + }, + { + "epoch": 0.12, + "grad_norm": 0.4687020480632782, + "learning_rate": 9.766422660558421e-05, + "loss": 0.3903, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 0.3895106017589569, + "learning_rate": 9.761811727013548e-05, + "loss": 0.2792, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 0.37773844599723816, + "learning_rate": 9.757156836214706e-05, + "loss": 0.3133, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 0.39893820881843567, + "learning_rate": 9.752458031132141e-05, + "loss": 0.3563, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 0.38628295063972473, + "learning_rate": 9.747715355141478e-05, + "loss": 0.2313, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 0.3644122779369354, + "learning_rate": 9.742928852023325e-05, + "loss": 0.3237, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 0.4312066435813904, + "learning_rate": 9.73809856596287e-05, + "loss": 0.3646, + "step": 139 + }, + { + "epoch": 0.13, + "grad_norm": 0.4073641002178192, + "learning_rate": 9.733224541549464e-05, + "loss": 0.3162, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 0.4195702373981476, + "learning_rate": 9.728306823776221e-05, + "loss": 0.3279, + "step": 141 + }, + { + "epoch": 0.13, + "grad_norm": 0.4012298882007599, + "learning_rate": 9.723345458039594e-05, + "loss": 0.3518, + "step": 142 + }, + { + "epoch": 0.13, + "grad_norm": 0.3413154184818268, + "learning_rate": 9.718340490138965e-05, + "loss": 0.1902, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 0.45729732513427734, + "learning_rate": 9.713291966276206e-05, + "loss": 0.3451, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 0.4502311050891876, + "learning_rate": 9.708199933055272e-05, + "loss": 0.4213, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 0.47957178950309753, + "learning_rate": 9.70306443748176e-05, + "loss": 0.4034, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 0.37078186869621277, + "learning_rate": 9.697885526962474e-05, + "loss": 0.2809, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 0.41138362884521484, + "learning_rate": 9.692663249304992e-05, + "loss": 0.3335, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 0.4864597022533417, + "learning_rate": 9.687397652717223e-05, + "loss": 0.4674, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 0.4568451941013336, + "learning_rate": 9.682088785806963e-05, + "loss": 0.3774, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 0.4001672565937042, + "learning_rate": 9.67673669758144e-05, + "loss": 0.2757, + "step": 151 + }, + { + "epoch": 0.14, + "grad_norm": 0.42816731333732605, + "learning_rate": 9.671341437446877e-05, + "loss": 0.3363, + "step": 152 + }, + { + "epoch": 0.14, + "grad_norm": 0.5091586112976074, + "learning_rate": 9.665903055208014e-05, + "loss": 0.4422, + "step": 153 + }, + { + "epoch": 0.14, + "grad_norm": 0.4335111975669861, + "learning_rate": 9.660421601067666e-05, + "loss": 0.3311, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 0.36849528551101685, + "learning_rate": 9.654897125626252e-05, + "loss": 0.2864, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 0.38034534454345703, + "learning_rate": 9.649329679881334e-05, + "loss": 0.2848, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 0.30810433626174927, + "learning_rate": 9.643719315227133e-05, + "loss": 0.1939, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 0.4283190965652466, + "learning_rate": 9.63806608345407e-05, + "loss": 0.3569, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 0.4550926387310028, + "learning_rate": 9.632370036748279e-05, + "loss": 0.3769, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 0.4202008545398712, + "learning_rate": 9.626631227691127e-05, + "loss": 0.288, + "step": 160 + }, + { + "epoch": 0.15, + "grad_norm": 0.3845793306827545, + "learning_rate": 9.62084970925873e-05, + "loss": 0.2443, + "step": 161 + }, + { + "epoch": 0.15, + "grad_norm": 0.41995301842689514, + "learning_rate": 9.615025534821462e-05, + "loss": 0.2772, + "step": 162 + }, + { + "epoch": 0.15, + "grad_norm": 0.4846205711364746, + "learning_rate": 9.609158758143464e-05, + "loss": 0.3856, + "step": 163 + }, + { + "epoch": 0.15, + "grad_norm": 0.4353344440460205, + "learning_rate": 9.603249433382144e-05, + "loss": 0.2875, + "step": 164 + }, + { + "epoch": 0.15, + "grad_norm": 0.4365904629230499, + "learning_rate": 9.597297615087685e-05, + "loss": 0.2667, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 0.38388529419898987, + "learning_rate": 9.591303358202535e-05, + "loss": 0.2221, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 0.45872360467910767, + "learning_rate": 9.585266718060897e-05, + "loss": 0.3565, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 0.40336596965789795, + "learning_rate": 9.579187750388227e-05, + "loss": 0.2167, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 0.43868064880371094, + "learning_rate": 9.573066511300714e-05, + "loss": 0.3417, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 0.43917733430862427, + "learning_rate": 9.566903057304764e-05, + "loss": 0.3646, + "step": 170 + }, + { + "epoch": 0.16, + "grad_norm": 0.44352856278419495, + "learning_rate": 9.560697445296474e-05, + "loss": 0.3252, + "step": 171 + }, + { + "epoch": 0.16, + "grad_norm": 0.5583091378211975, + "learning_rate": 9.554449732561113e-05, + "loss": 0.351, + "step": 172 + }, + { + "epoch": 0.16, + "grad_norm": 0.43708336353302, + "learning_rate": 9.548159976772592e-05, + "loss": 0.3471, + "step": 173 + }, + { + "epoch": 0.16, + "grad_norm": 0.4961797297000885, + "learning_rate": 9.541828235992926e-05, + "loss": 0.3535, + "step": 174 + }, + { + "epoch": 0.16, + "grad_norm": 0.380731999874115, + "learning_rate": 9.535454568671704e-05, + "loss": 0.2705, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 0.44424885511398315, + "learning_rate": 9.529039033645548e-05, + "loss": 0.2366, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 0.3777495324611664, + "learning_rate": 9.522581690137567e-05, + "loss": 0.305, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 0.35404613614082336, + "learning_rate": 9.516082597756815e-05, + "loss": 0.2314, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 0.4622362554073334, + "learning_rate": 9.509541816497737e-05, + "loss": 0.4166, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 0.41214224696159363, + "learning_rate": 9.50295940673962e-05, + "loss": 0.253, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 0.442114919424057, + "learning_rate": 9.496335429246026e-05, + "loss": 0.3943, + "step": 181 + }, + { + "epoch": 0.17, + "grad_norm": 0.4232344329357147, + "learning_rate": 9.489669945164242e-05, + "loss": 0.3273, + "step": 182 + }, + { + "epoch": 0.17, + "grad_norm": 0.35261598229408264, + "learning_rate": 9.482963016024709e-05, + "loss": 0.222, + "step": 183 + }, + { + "epoch": 0.17, + "grad_norm": 0.3651808500289917, + "learning_rate": 9.476214703740454e-05, + "loss": 0.244, + "step": 184 + }, + { + "epoch": 0.17, + "grad_norm": 0.509012758731842, + "learning_rate": 9.469425070606524e-05, + "loss": 0.5481, + "step": 185 + }, + { + "epoch": 0.17, + "grad_norm": 0.40018436312675476, + "learning_rate": 9.462594179299406e-05, + "loss": 0.2319, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 0.4184328615665436, + "learning_rate": 9.45572209287645e-05, + "loss": 0.3469, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 0.5992292761802673, + "learning_rate": 9.44880887477528e-05, + "loss": 0.3817, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 0.4589436650276184, + "learning_rate": 9.441854588813228e-05, + "loss": 0.2914, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 0.38457149267196655, + "learning_rate": 9.43485929918672e-05, + "loss": 0.3181, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 0.4888402819633484, + "learning_rate": 9.427823070470699e-05, + "loss": 0.4259, + "step": 191 + }, + { + "epoch": 0.18, + "grad_norm": 0.37316766381263733, + "learning_rate": 9.420745967618026e-05, + "loss": 0.2753, + "step": 192 + }, + { + "epoch": 0.18, + "grad_norm": 0.308633416891098, + "learning_rate": 9.413628055958878e-05, + "loss": 0.1875, + "step": 193 + }, + { + "epoch": 0.18, + "grad_norm": 0.41204187273979187, + "learning_rate": 9.406469401200151e-05, + "loss": 0.3384, + "step": 194 + }, + { + "epoch": 0.18, + "grad_norm": 0.4597542881965637, + "learning_rate": 9.399270069424842e-05, + "loss": 0.3385, + "step": 195 + }, + { + "epoch": 0.18, + "grad_norm": 0.46371549367904663, + "learning_rate": 9.392030127091452e-05, + "loss": 0.2874, + "step": 196 + }, + { + "epoch": 0.18, + "grad_norm": 0.3977811932563782, + "learning_rate": 9.384749641033359e-05, + "loss": 0.2797, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 0.4490582346916199, + "learning_rate": 9.377428678458214e-05, + "loss": 0.356, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 0.41083958745002747, + "learning_rate": 9.370067306947316e-05, + "loss": 0.2747, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 0.48376941680908203, + "learning_rate": 9.362665594454984e-05, + "loss": 0.4317, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 0.44248223304748535, + "learning_rate": 9.355223609307933e-05, + "loss": 0.2799, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 0.4844808578491211, + "learning_rate": 9.347741420204643e-05, + "loss": 0.3624, + "step": 202 + }, + { + "epoch": 0.19, + "grad_norm": 0.4655194580554962, + "learning_rate": 9.340219096214727e-05, + "loss": 0.2899, + "step": 203 + }, + { + "epoch": 0.19, + "grad_norm": 0.4689055383205414, + "learning_rate": 9.33265670677829e-05, + "loss": 0.3721, + "step": 204 + }, + { + "epoch": 0.19, + "grad_norm": 0.4450901746749878, + "learning_rate": 9.325054321705289e-05, + "loss": 0.3079, + "step": 205 + }, + { + "epoch": 0.19, + "grad_norm": 0.4498708248138428, + "learning_rate": 9.317412011174886e-05, + "loss": 0.3111, + "step": 206 + }, + { + "epoch": 0.19, + "grad_norm": 0.48850810527801514, + "learning_rate": 9.309729845734813e-05, + "loss": 0.4046, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 0.37347522377967834, + "learning_rate": 9.302007896300698e-05, + "loss": 0.271, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 0.42164427042007446, + "learning_rate": 9.29424623415543e-05, + "loss": 0.3197, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 0.4926708936691284, + "learning_rate": 9.286444930948496e-05, + "loss": 0.3602, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 0.3644413352012634, + "learning_rate": 9.278604058695313e-05, + "loss": 0.2342, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 0.45351725816726685, + "learning_rate": 9.270723689776568e-05, + "loss": 0.3237, + "step": 212 + }, + { + "epoch": 0.2, + "grad_norm": 0.4711061418056488, + "learning_rate": 9.262803896937555e-05, + "loss": 0.4067, + "step": 213 + }, + { + "epoch": 0.2, + "grad_norm": 0.43553972244262695, + "learning_rate": 9.254844753287493e-05, + "loss": 0.2759, + "step": 214 + }, + { + "epoch": 0.2, + "grad_norm": 0.4814302623271942, + "learning_rate": 9.24684633229886e-05, + "loss": 0.3125, + "step": 215 + }, + { + "epoch": 0.2, + "grad_norm": 0.5601657032966614, + "learning_rate": 9.238808707806706e-05, + "loss": 0.3608, + "step": 216 + }, + { + "epoch": 0.2, + "grad_norm": 0.4870937764644623, + "learning_rate": 9.230731954007983e-05, + "loss": 0.3721, + "step": 217 + }, + { + "epoch": 0.2, + "grad_norm": 0.5292274951934814, + "learning_rate": 9.222616145460849e-05, + "loss": 0.4209, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 0.43616461753845215, + "learning_rate": 9.214461357083985e-05, + "loss": 0.2172, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 0.361106812953949, + "learning_rate": 9.206267664155907e-05, + "loss": 0.2494, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 0.4230857491493225, + "learning_rate": 9.198035142314259e-05, + "loss": 0.3769, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 0.44878044724464417, + "learning_rate": 9.189763867555129e-05, + "loss": 0.3755, + "step": 222 + }, + { + "epoch": 0.21, + "grad_norm": 0.5428657531738281, + "learning_rate": 9.181453916232339e-05, + "loss": 0.2581, + "step": 223 + }, + { + "epoch": 0.21, + "grad_norm": 0.41663435101509094, + "learning_rate": 9.173105365056742e-05, + "loss": 0.3474, + "step": 224 + }, + { + "epoch": 0.21, + "grad_norm": 0.4844930171966553, + "learning_rate": 9.164718291095515e-05, + "loss": 0.3686, + "step": 225 + }, + { + "epoch": 0.21, + "grad_norm": 0.40770450234413147, + "learning_rate": 9.156292771771446e-05, + "loss": 0.3386, + "step": 226 + }, + { + "epoch": 0.21, + "grad_norm": 0.39481139183044434, + "learning_rate": 9.14782888486222e-05, + "loss": 0.3119, + "step": 227 + }, + { + "epoch": 0.21, + "grad_norm": 0.38520610332489014, + "learning_rate": 9.1393267084997e-05, + "loss": 0.3014, + "step": 228 + }, + { + "epoch": 0.21, + "grad_norm": 0.46185630559921265, + "learning_rate": 9.130786321169209e-05, + "loss": 0.3699, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 0.4153963029384613, + "learning_rate": 9.122207801708802e-05, + "loss": 0.3289, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 0.3800516724586487, + "learning_rate": 9.113591229308538e-05, + "loss": 0.244, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 0.41449904441833496, + "learning_rate": 9.104936683509755e-05, + "loss": 0.3108, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 0.39178889989852905, + "learning_rate": 9.096244244204324e-05, + "loss": 0.3124, + "step": 233 + }, + { + "epoch": 0.22, + "grad_norm": 0.4312244951725006, + "learning_rate": 9.087513991633924e-05, + "loss": 0.3155, + "step": 234 + }, + { + "epoch": 0.22, + "grad_norm": 0.427442729473114, + "learning_rate": 9.078746006389298e-05, + "loss": 0.3512, + "step": 235 + }, + { + "epoch": 0.22, + "grad_norm": 0.33089083433151245, + "learning_rate": 9.069940369409499e-05, + "loss": 0.1927, + "step": 236 + }, + { + "epoch": 0.22, + "grad_norm": 0.4994446635246277, + "learning_rate": 9.061097161981159e-05, + "loss": 0.4073, + "step": 237 + }, + { + "epoch": 0.22, + "grad_norm": 0.47397279739379883, + "learning_rate": 9.052216465737726e-05, + "loss": 0.3754, + "step": 238 + }, + { + "epoch": 0.22, + "grad_norm": 0.4524490535259247, + "learning_rate": 9.043298362658714e-05, + "loss": 0.3629, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 0.37869277596473694, + "learning_rate": 9.034342935068952e-05, + "loss": 0.1984, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 0.4369172155857086, + "learning_rate": 9.025350265637815e-05, + "loss": 0.3684, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 0.4749734699726105, + "learning_rate": 9.016320437378466e-05, + "loss": 0.3212, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 0.4339480996131897, + "learning_rate": 9.007253533647089e-05, + "loss": 0.3463, + "step": 243 + }, + { + "epoch": 0.23, + "grad_norm": 0.4679538309574127, + "learning_rate": 8.998149638142119e-05, + "loss": 0.326, + "step": 244 + }, + { + "epoch": 0.23, + "grad_norm": 0.503963053226471, + "learning_rate": 8.98900883490347e-05, + "loss": 0.4358, + "step": 245 + }, + { + "epoch": 0.23, + "grad_norm": 0.38353750109672546, + "learning_rate": 8.979831208311758e-05, + "loss": 0.2923, + "step": 246 + }, + { + "epoch": 0.23, + "grad_norm": 0.4563164710998535, + "learning_rate": 8.970616843087524e-05, + "loss": 0.3187, + "step": 247 + }, + { + "epoch": 0.23, + "grad_norm": 0.36256474256515503, + "learning_rate": 8.96136582429045e-05, + "loss": 0.2175, + "step": 248 + }, + { + "epoch": 0.23, + "grad_norm": 0.4684314429759979, + "learning_rate": 8.952078237318575e-05, + "loss": 0.3427, + "step": 249 + }, + { + "epoch": 0.23, + "grad_norm": 0.45074453949928284, + "learning_rate": 8.942754167907507e-05, + "loss": 0.2726, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 0.5070464015007019, + "learning_rate": 8.933393702129628e-05, + "loss": 0.2587, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 0.44061899185180664, + "learning_rate": 8.923996926393305e-05, + "loss": 0.3961, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 0.4211723208427429, + "learning_rate": 8.91456392744209e-05, + "loss": 0.3661, + "step": 253 + }, + { + "epoch": 0.24, + "grad_norm": 0.4651309847831726, + "learning_rate": 8.905094792353917e-05, + "loss": 0.3365, + "step": 254 + }, + { + "epoch": 0.24, + "grad_norm": 0.4560156464576721, + "learning_rate": 8.895589608540297e-05, + "loss": 0.3558, + "step": 255 + }, + { + "epoch": 0.24, + "grad_norm": 0.3777078688144684, + "learning_rate": 8.886048463745525e-05, + "loss": 0.175, + "step": 256 + }, + { + "epoch": 0.24, + "grad_norm": 0.4200543463230133, + "learning_rate": 8.876471446045847e-05, + "loss": 0.2923, + "step": 257 + }, + { + "epoch": 0.24, + "grad_norm": 0.44682615995407104, + "learning_rate": 8.866858643848665e-05, + "loss": 0.2414, + "step": 258 + }, + { + "epoch": 0.24, + "grad_norm": 0.3942912817001343, + "learning_rate": 8.857210145891715e-05, + "loss": 0.282, + "step": 259 + }, + { + "epoch": 0.24, + "grad_norm": 0.41610509157180786, + "learning_rate": 8.847526041242246e-05, + "loss": 0.1799, + "step": 260 + }, + { + "epoch": 0.24, + "grad_norm": 0.39465031027793884, + "learning_rate": 8.8378064192962e-05, + "loss": 0.2455, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 0.45565101504325867, + "learning_rate": 8.82805136977739e-05, + "loss": 0.3674, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 0.4826546013355255, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3592, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 0.41021135449409485, + "learning_rate": 8.808435348551071e-05, + "loss": 0.2217, + "step": 264 + }, + { + "epoch": 0.25, + "grad_norm": 0.3801558315753937, + "learning_rate": 8.798574557923053e-05, + "loss": 0.2213, + "step": 265 + }, + { + "epoch": 0.25, + "grad_norm": 0.37459704279899597, + "learning_rate": 8.788678701879573e-05, + "loss": 0.3109, + "step": 266 + }, + { + "epoch": 0.25, + "grad_norm": 0.36728647351264954, + "learning_rate": 8.778747871771292e-05, + "loss": 0.2384, + "step": 267 + }, + { + "epoch": 0.25, + "grad_norm": 0.44370126724243164, + "learning_rate": 8.768782159271727e-05, + "loss": 0.3362, + "step": 268 + }, + { + "epoch": 0.25, + "grad_norm": 0.3853534460067749, + "learning_rate": 8.758781656376398e-05, + "loss": 0.2443, + "step": 269 + }, + { + "epoch": 0.25, + "grad_norm": 0.45374879240989685, + "learning_rate": 8.748746455401986e-05, + "loss": 0.3262, + "step": 270 + }, + { + "epoch": 0.25, + "grad_norm": 0.47816920280456543, + "learning_rate": 8.738676648985476e-05, + "loss": 0.3123, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 0.4292296767234802, + "learning_rate": 8.7285723300833e-05, + "loss": 0.3724, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 0.3887421786785126, + "learning_rate": 8.718433591970485e-05, + "loss": 0.2855, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 0.49635881185531616, + "learning_rate": 8.708260528239788e-05, + "loss": 0.3917, + "step": 274 + }, + { + "epoch": 0.26, + "grad_norm": 0.39939600229263306, + "learning_rate": 8.698053232800832e-05, + "loss": 0.2616, + "step": 275 + }, + { + "epoch": 0.26, + "grad_norm": 0.42968565225601196, + "learning_rate": 8.68781179987924e-05, + "loss": 0.3605, + "step": 276 + }, + { + "epoch": 0.26, + "grad_norm": 0.385392427444458, + "learning_rate": 8.677536324015765e-05, + "loss": 0.2805, + "step": 277 + }, + { + "epoch": 0.26, + "grad_norm": 0.41780418157577515, + "learning_rate": 8.667226900065419e-05, + "loss": 0.3242, + "step": 278 + }, + { + "epoch": 0.26, + "grad_norm": 0.48109957575798035, + "learning_rate": 8.656883623196592e-05, + "loss": 0.3191, + "step": 279 + }, + { + "epoch": 0.26, + "grad_norm": 0.45149287581443787, + "learning_rate": 8.646506588890183e-05, + "loss": 0.3069, + "step": 280 + }, + { + "epoch": 0.26, + "grad_norm": 0.510420024394989, + "learning_rate": 8.636095892938707e-05, + "loss": 0.3468, + "step": 281 + }, + { + "epoch": 0.26, + "grad_norm": 0.46956875920295715, + "learning_rate": 8.62565163144542e-05, + "loss": 0.4075, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 0.40705329179763794, + "learning_rate": 8.615173900823426e-05, + "loss": 0.2867, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 0.3982028663158417, + "learning_rate": 8.60466279779479e-05, + "loss": 0.2893, + "step": 284 + }, + { + "epoch": 0.27, + "grad_norm": 0.39112627506256104, + "learning_rate": 8.594118419389647e-05, + "loss": 0.3286, + "step": 285 + }, + { + "epoch": 0.27, + "grad_norm": 0.439872682094574, + "learning_rate": 8.583540862945301e-05, + "loss": 0.307, + "step": 286 + }, + { + "epoch": 0.27, + "grad_norm": 0.34527483582496643, + "learning_rate": 8.57293022610533e-05, + "loss": 0.2122, + "step": 287 + }, + { + "epoch": 0.27, + "grad_norm": 0.41789668798446655, + "learning_rate": 8.562286606818684e-05, + "loss": 0.236, + "step": 288 + }, + { + "epoch": 0.27, + "grad_norm": 0.5295957922935486, + "learning_rate": 8.55161010333878e-05, + "loss": 0.504, + "step": 289 + }, + { + "epoch": 0.27, + "grad_norm": 0.5455564856529236, + "learning_rate": 8.540900814222598e-05, + "loss": 0.3537, + "step": 290 + }, + { + "epoch": 0.27, + "grad_norm": 0.5789703726768494, + "learning_rate": 8.530158838329765e-05, + "loss": 0.3238, + "step": 291 + }, + { + "epoch": 0.27, + "grad_norm": 0.5657013654708862, + "learning_rate": 8.519384274821649e-05, + "loss": 0.4341, + "step": 292 + }, + { + "epoch": 0.27, + "grad_norm": 0.4126622676849365, + "learning_rate": 8.508577223160442e-05, + "loss": 0.2597, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 0.47889232635498047, + "learning_rate": 8.497737783108238e-05, + "loss": 0.3202, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 0.6022685766220093, + "learning_rate": 8.486866054726114e-05, + "loss": 0.3014, + "step": 295 + }, + { + "epoch": 0.28, + "grad_norm": 0.4188893437385559, + "learning_rate": 8.475962138373213e-05, + "loss": 0.2655, + "step": 296 + }, + { + "epoch": 0.28, + "grad_norm": 0.4596685767173767, + "learning_rate": 8.465026134705805e-05, + "loss": 0.3072, + "step": 297 + }, + { + "epoch": 0.28, + "grad_norm": 0.4453608989715576, + "learning_rate": 8.454058144676366e-05, + "loss": 0.2954, + "step": 298 + }, + { + "epoch": 0.28, + "grad_norm": 0.40905341506004333, + "learning_rate": 8.443058269532651e-05, + "loss": 0.3124, + "step": 299 + }, + { + "epoch": 0.28, + "grad_norm": 0.43399733304977417, + "learning_rate": 8.432026610816745e-05, + "loss": 0.2793, + "step": 300 + }, + { + "epoch": 0.28, + "grad_norm": 0.4337409734725952, + "learning_rate": 8.420963270364137e-05, + "loss": 0.2741, + "step": 301 + }, + { + "epoch": 0.28, + "grad_norm": 0.4746047854423523, + "learning_rate": 8.409868350302774e-05, + "loss": 0.3285, + "step": 302 + }, + { + "epoch": 0.28, + "grad_norm": 0.45777222514152527, + "learning_rate": 8.398741953052127e-05, + "loss": 0.281, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 0.4513487219810486, + "learning_rate": 8.387584181322233e-05, + "loss": 0.3038, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 0.5118147730827332, + "learning_rate": 8.376395138112754e-05, + "loss": 0.3482, + "step": 305 + }, + { + "epoch": 0.29, + "grad_norm": 0.42370742559432983, + "learning_rate": 8.365174926712032e-05, + "loss": 0.3431, + "step": 306 + }, + { + "epoch": 0.29, + "grad_norm": 0.4493395984172821, + "learning_rate": 8.353923650696118e-05, + "loss": 0.3475, + "step": 307 + }, + { + "epoch": 0.29, + "grad_norm": 0.4581238329410553, + "learning_rate": 8.342641413927837e-05, + "loss": 0.3637, + "step": 308 + }, + { + "epoch": 0.29, + "grad_norm": 0.4623344838619232, + "learning_rate": 8.331328320555812e-05, + "loss": 0.3504, + "step": 309 + }, + { + "epoch": 0.29, + "grad_norm": 0.5001980066299438, + "learning_rate": 8.319984475013512e-05, + "loss": 0.4016, + "step": 310 + }, + { + "epoch": 0.29, + "grad_norm": 0.4373179078102112, + "learning_rate": 8.308609982018286e-05, + "loss": 0.3375, + "step": 311 + }, + { + "epoch": 0.29, + "grad_norm": 0.38677600026130676, + "learning_rate": 8.297204946570398e-05, + "loss": 0.2704, + "step": 312 + }, + { + "epoch": 0.29, + "grad_norm": 0.44620731472969055, + "learning_rate": 8.285769473952052e-05, + "loss": 0.2628, + "step": 313 + }, + { + "epoch": 0.29, + "grad_norm": 0.49608850479125977, + "learning_rate": 8.274303669726426e-05, + "loss": 0.3184, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 0.4633454382419586, + "learning_rate": 8.262807639736692e-05, + "loss": 0.404, + "step": 315 + }, + { + "epoch": 0.3, + "grad_norm": 0.47432637214660645, + "learning_rate": 8.251281490105045e-05, + "loss": 0.3733, + "step": 316 + }, + { + "epoch": 0.3, + "grad_norm": 0.38651883602142334, + "learning_rate": 8.239725327231721e-05, + "loss": 0.303, + "step": 317 + }, + { + "epoch": 0.3, + "grad_norm": 0.4193047285079956, + "learning_rate": 8.228139257794012e-05, + "loss": 0.3084, + "step": 318 + }, + { + "epoch": 0.3, + "grad_norm": 0.4297058582305908, + "learning_rate": 8.216523388745287e-05, + "loss": 0.3503, + "step": 319 + }, + { + "epoch": 0.3, + "grad_norm": 0.4172101616859436, + "learning_rate": 8.204877827313997e-05, + "loss": 0.3307, + "step": 320 + }, + { + "epoch": 0.3, + "grad_norm": 0.39230528473854065, + "learning_rate": 8.193202681002692e-05, + "loss": 0.339, + "step": 321 + }, + { + "epoch": 0.3, + "grad_norm": 0.2820037603378296, + "learning_rate": 8.181498057587027e-05, + "loss": 0.1672, + "step": 322 + }, + { + "epoch": 0.3, + "grad_norm": 0.3549487292766571, + "learning_rate": 8.169764065114764e-05, + "loss": 0.2718, + "step": 323 + }, + { + "epoch": 0.3, + "grad_norm": 0.41914433240890503, + "learning_rate": 8.158000811904778e-05, + "loss": 0.2528, + "step": 324 + }, + { + "epoch": 0.3, + "grad_norm": 0.40389931201934814, + "learning_rate": 8.146208406546053e-05, + "loss": 0.3286, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 0.4596593677997589, + "learning_rate": 8.134386957896688e-05, + "loss": 0.4036, + "step": 326 + }, + { + "epoch": 0.31, + "grad_norm": 0.45193785429000854, + "learning_rate": 8.122536575082882e-05, + "loss": 0.418, + "step": 327 + }, + { + "epoch": 0.31, + "grad_norm": 0.4680340886116028, + "learning_rate": 8.110657367497933e-05, + "loss": 0.2809, + "step": 328 + }, + { + "epoch": 0.31, + "grad_norm": 0.3416012227535248, + "learning_rate": 8.098749444801224e-05, + "loss": 0.235, + "step": 329 + }, + { + "epoch": 0.31, + "grad_norm": 0.43442729115486145, + "learning_rate": 8.08681291691722e-05, + "loss": 0.3546, + "step": 330 + }, + { + "epoch": 0.31, + "grad_norm": 0.4265907108783722, + "learning_rate": 8.074847894034434e-05, + "loss": 0.2733, + "step": 331 + }, + { + "epoch": 0.31, + "grad_norm": 0.4317761957645416, + "learning_rate": 8.062854486604435e-05, + "loss": 0.2601, + "step": 332 + }, + { + "epoch": 0.31, + "grad_norm": 0.38275545835494995, + "learning_rate": 8.050832805340806e-05, + "loss": 0.2794, + "step": 333 + }, + { + "epoch": 0.31, + "grad_norm": 0.3954709470272064, + "learning_rate": 8.038782961218136e-05, + "loss": 0.3348, + "step": 334 + }, + { + "epoch": 0.31, + "grad_norm": 0.5072319507598877, + "learning_rate": 8.026705065470996e-05, + "loss": 0.343, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 0.4034208357334137, + "learning_rate": 8.014599229592894e-05, + "loss": 0.3071, + "step": 336 + }, + { + "epoch": 0.32, + "grad_norm": 0.4476311206817627, + "learning_rate": 8.002465565335271e-05, + "loss": 0.256, + "step": 337 + }, + { + "epoch": 0.32, + "grad_norm": 0.41503357887268066, + "learning_rate": 7.990304184706455e-05, + "loss": 0.277, + "step": 338 + }, + { + "epoch": 0.32, + "grad_norm": 0.4218122363090515, + "learning_rate": 7.978115199970621e-05, + "loss": 0.3754, + "step": 339 + }, + { + "epoch": 0.32, + "grad_norm": 0.3906747102737427, + "learning_rate": 7.965898723646776e-05, + "loss": 0.304, + "step": 340 + }, + { + "epoch": 0.32, + "grad_norm": 0.5392120480537415, + "learning_rate": 7.953654868507699e-05, + "loss": 0.416, + "step": 341 + }, + { + "epoch": 0.32, + "grad_norm": 0.4278102219104767, + "learning_rate": 7.941383747578912e-05, + "loss": 0.343, + "step": 342 + }, + { + "epoch": 0.32, + "grad_norm": 0.4145180583000183, + "learning_rate": 7.929085474137629e-05, + "loss": 0.3392, + "step": 343 + }, + { + "epoch": 0.32, + "grad_norm": 0.37485140562057495, + "learning_rate": 7.91676016171172e-05, + "loss": 0.321, + "step": 344 + }, + { + "epoch": 0.32, + "grad_norm": 0.4715069532394409, + "learning_rate": 7.904407924078654e-05, + "loss": 0.4593, + "step": 345 + }, + { + "epoch": 0.32, + "grad_norm": 0.44009581208229065, + "learning_rate": 7.892028875264451e-05, + "loss": 0.3349, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 0.3941679894924164, + "learning_rate": 7.879623129542633e-05, + "loss": 0.2938, + "step": 347 + }, + { + "epoch": 0.33, + "grad_norm": 0.4242309331893921, + "learning_rate": 7.867190801433166e-05, + "loss": 0.3678, + "step": 348 + }, + { + "epoch": 0.33, + "grad_norm": 0.40279605984687805, + "learning_rate": 7.854732005701402e-05, + "loss": 0.3434, + "step": 349 + }, + { + "epoch": 0.33, + "grad_norm": 0.40267717838287354, + "learning_rate": 7.842246857357023e-05, + "loss": 0.29, + "step": 350 + }, + { + "epoch": 0.33, + "grad_norm": 0.3405179977416992, + "learning_rate": 7.829735471652978e-05, + "loss": 0.2241, + "step": 351 + }, + { + "epoch": 0.33, + "grad_norm": 0.39818453788757324, + "learning_rate": 7.817197964084411e-05, + "loss": 0.2662, + "step": 352 + }, + { + "epoch": 0.33, + "grad_norm": 0.3334313929080963, + "learning_rate": 7.804634450387616e-05, + "loss": 0.1877, + "step": 353 + }, + { + "epoch": 0.33, + "grad_norm": 0.41519755125045776, + "learning_rate": 7.792045046538941e-05, + "loss": 0.3066, + "step": 354 + }, + { + "epoch": 0.33, + "grad_norm": 0.4714450538158417, + "learning_rate": 7.77942986875374e-05, + "loss": 0.4239, + "step": 355 + }, + { + "epoch": 0.33, + "grad_norm": 0.38951438665390015, + "learning_rate": 7.766789033485287e-05, + "loss": 0.3293, + "step": 356 + }, + { + "epoch": 0.33, + "grad_norm": 0.4261366128921509, + "learning_rate": 7.75412265742371e-05, + "loss": 0.3883, + "step": 357 + }, + { + "epoch": 0.34, + "grad_norm": 0.43968912959098816, + "learning_rate": 7.741430857494904e-05, + "loss": 0.3126, + "step": 358 + }, + { + "epoch": 0.34, + "grad_norm": 0.4306524395942688, + "learning_rate": 7.728713750859458e-05, + "loss": 0.3449, + "step": 359 + }, + { + "epoch": 0.34, + "grad_norm": 0.42536354064941406, + "learning_rate": 7.715971454911577e-05, + "loss": 0.3506, + "step": 360 + }, + { + "epoch": 0.34, + "grad_norm": 0.4529925286769867, + "learning_rate": 7.703204087277988e-05, + "loss": 0.3568, + "step": 361 + }, + { + "epoch": 0.34, + "grad_norm": 0.36422109603881836, + "learning_rate": 7.690411765816864e-05, + "loss": 0.2616, + "step": 362 + }, + { + "epoch": 0.34, + "grad_norm": 0.4082551896572113, + "learning_rate": 7.677594608616729e-05, + "loss": 0.3083, + "step": 363 + }, + { + "epoch": 0.34, + "grad_norm": 0.38873544335365295, + "learning_rate": 7.66475273399537e-05, + "loss": 0.2812, + "step": 364 + }, + { + "epoch": 0.34, + "grad_norm": 0.3795793950557709, + "learning_rate": 7.651886260498751e-05, + "loss": 0.2658, + "step": 365 + }, + { + "epoch": 0.34, + "grad_norm": 0.4415530264377594, + "learning_rate": 7.638995306899908e-05, + "loss": 0.327, + "step": 366 + }, + { + "epoch": 0.34, + "grad_norm": 0.3712081015110016, + "learning_rate": 7.626079992197857e-05, + "loss": 0.2781, + "step": 367 + }, + { + "epoch": 0.35, + "grad_norm": 0.40912961959838867, + "learning_rate": 7.613140435616503e-05, + "loss": 0.3211, + "step": 368 + }, + { + "epoch": 0.35, + "grad_norm": 0.4007984399795532, + "learning_rate": 7.600176756603525e-05, + "loss": 0.2528, + "step": 369 + }, + { + "epoch": 0.35, + "grad_norm": 0.3977168798446655, + "learning_rate": 7.587189074829284e-05, + "loss": 0.306, + "step": 370 + }, + { + "epoch": 0.35, + "grad_norm": 0.333519846200943, + "learning_rate": 7.57417751018572e-05, + "loss": 0.2291, + "step": 371 + }, + { + "epoch": 0.35, + "grad_norm": 0.37693852186203003, + "learning_rate": 7.561142182785233e-05, + "loss": 0.2407, + "step": 372 + }, + { + "epoch": 0.35, + "grad_norm": 0.40604284405708313, + "learning_rate": 7.548083212959588e-05, + "loss": 0.3062, + "step": 373 + }, + { + "epoch": 0.35, + "grad_norm": 0.4472860097885132, + "learning_rate": 7.535000721258791e-05, + "loss": 0.2598, + "step": 374 + }, + { + "epoch": 0.35, + "grad_norm": 0.4256829023361206, + "learning_rate": 7.521894828449994e-05, + "loss": 0.3627, + "step": 375 + }, + { + "epoch": 0.35, + "grad_norm": 0.4579877555370331, + "learning_rate": 7.508765655516358e-05, + "loss": 0.3436, + "step": 376 + }, + { + "epoch": 0.35, + "grad_norm": 0.38452377915382385, + "learning_rate": 7.495613323655953e-05, + "loss": 0.3056, + "step": 377 + }, + { + "epoch": 0.35, + "grad_norm": 0.411670446395874, + "learning_rate": 7.482437954280635e-05, + "loss": 0.2649, + "step": 378 + }, + { + "epoch": 0.36, + "grad_norm": 0.4431185722351074, + "learning_rate": 7.469239669014923e-05, + "loss": 0.3693, + "step": 379 + }, + { + "epoch": 0.36, + "grad_norm": 0.3988078534603119, + "learning_rate": 7.456018589694873e-05, + "loss": 0.2902, + "step": 380 + }, + { + "epoch": 0.36, + "grad_norm": 0.5329432487487793, + "learning_rate": 7.442774838366965e-05, + "loss": 0.3991, + "step": 381 + }, + { + "epoch": 0.36, + "grad_norm": 0.2802259624004364, + "learning_rate": 7.429508537286963e-05, + "loss": 0.1907, + "step": 382 + }, + { + "epoch": 0.36, + "grad_norm": 0.43637794256210327, + "learning_rate": 7.416219808918794e-05, + "loss": 0.3517, + "step": 383 + }, + { + "epoch": 0.36, + "grad_norm": 0.4463917911052704, + "learning_rate": 7.402908775933419e-05, + "loss": 0.3699, + "step": 384 + }, + { + "epoch": 0.36, + "grad_norm": 0.46155545115470886, + "learning_rate": 7.389575561207692e-05, + "loss": 0.4614, + "step": 385 + }, + { + "epoch": 0.36, + "grad_norm": 0.4998500943183899, + "learning_rate": 7.376220287823236e-05, + "loss": 0.3709, + "step": 386 + }, + { + "epoch": 0.36, + "grad_norm": 0.45294034481048584, + "learning_rate": 7.3628430790653e-05, + "loss": 0.3206, + "step": 387 + }, + { + "epoch": 0.36, + "grad_norm": 0.39041468501091003, + "learning_rate": 7.349444058421619e-05, + "loss": 0.2901, + "step": 388 + }, + { + "epoch": 0.36, + "grad_norm": 0.40528184175491333, + "learning_rate": 7.336023349581287e-05, + "loss": 0.245, + "step": 389 + }, + { + "epoch": 0.37, + "grad_norm": 0.37183213233947754, + "learning_rate": 7.322581076433596e-05, + "loss": 0.2901, + "step": 390 + }, + { + "epoch": 0.37, + "grad_norm": 0.4894709289073944, + "learning_rate": 7.309117363066912e-05, + "loss": 0.4137, + "step": 391 + }, + { + "epoch": 0.37, + "grad_norm": 0.41468536853790283, + "learning_rate": 7.295632333767513e-05, + "loss": 0.3505, + "step": 392 + }, + { + "epoch": 0.37, + "grad_norm": 0.356780469417572, + "learning_rate": 7.28212611301845e-05, + "loss": 0.2336, + "step": 393 + }, + { + "epoch": 0.37, + "grad_norm": 0.4367281198501587, + "learning_rate": 7.2685988254984e-05, + "loss": 0.361, + "step": 394 + }, + { + "epoch": 0.37, + "grad_norm": 0.43233007192611694, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3527, + "step": 395 + }, + { + "epoch": 0.37, + "grad_norm": 0.2684633731842041, + "learning_rate": 7.241481549831243e-05, + "loss": 0.1598, + "step": 396 + }, + { + "epoch": 0.37, + "grad_norm": 0.41831842064857483, + "learning_rate": 7.22789181200923e-05, + "loss": 0.341, + "step": 397 + }, + { + "epoch": 0.37, + "grad_norm": 0.4490970969200134, + "learning_rate": 7.214281508064107e-05, + "loss": 0.3595, + "step": 398 + }, + { + "epoch": 0.37, + "grad_norm": 0.36620378494262695, + "learning_rate": 7.200650763635366e-05, + "loss": 0.2399, + "step": 399 + }, + { + "epoch": 0.38, + "grad_norm": 0.44731780886650085, + "learning_rate": 7.186999704551181e-05, + "loss": 0.3095, + "step": 400 + }, + { + "epoch": 0.38, + "grad_norm": 0.44940871000289917, + "learning_rate": 7.173328456827263e-05, + "loss": 0.3172, + "step": 401 + }, + { + "epoch": 0.38, + "grad_norm": 0.32171034812927246, + "learning_rate": 7.15963714666568e-05, + "loss": 0.2014, + "step": 402 + }, + { + "epoch": 0.38, + "grad_norm": 0.43398237228393555, + "learning_rate": 7.145925900453709e-05, + "loss": 0.3541, + "step": 403 + }, + { + "epoch": 0.38, + "grad_norm": 0.3726755976676941, + "learning_rate": 7.132194844762654e-05, + "loss": 0.273, + "step": 404 + }, + { + "epoch": 0.38, + "grad_norm": 0.43420591950416565, + "learning_rate": 7.118444106346687e-05, + "loss": 0.3587, + "step": 405 + }, + { + "epoch": 0.38, + "grad_norm": 0.42400193214416504, + "learning_rate": 7.104673812141675e-05, + "loss": 0.3649, + "step": 406 + }, + { + "epoch": 0.38, + "grad_norm": 0.4047689139842987, + "learning_rate": 7.090884089264011e-05, + "loss": 0.2675, + "step": 407 + }, + { + "epoch": 0.38, + "grad_norm": 0.3567971885204315, + "learning_rate": 7.077075065009433e-05, + "loss": 0.2321, + "step": 408 + }, + { + "epoch": 0.38, + "grad_norm": 0.4200761616230011, + "learning_rate": 7.063246866851858e-05, + "loss": 0.2741, + "step": 409 + }, + { + "epoch": 0.38, + "grad_norm": 0.525682806968689, + "learning_rate": 7.049399622442198e-05, + "loss": 0.3688, + "step": 410 + }, + { + "epoch": 0.39, + "grad_norm": 0.46288007497787476, + "learning_rate": 7.035533459607189e-05, + "loss": 0.3295, + "step": 411 + }, + { + "epoch": 0.39, + "grad_norm": 0.4013391435146332, + "learning_rate": 7.021648506348204e-05, + "loss": 0.3131, + "step": 412 + }, + { + "epoch": 0.39, + "grad_norm": 0.41562074422836304, + "learning_rate": 7.007744890840073e-05, + "loss": 0.2775, + "step": 413 + }, + { + "epoch": 0.39, + "grad_norm": 0.43653470277786255, + "learning_rate": 6.993822741429907e-05, + "loss": 0.3803, + "step": 414 + }, + { + "epoch": 0.39, + "grad_norm": 0.3548017740249634, + "learning_rate": 6.979882186635897e-05, + "loss": 0.2141, + "step": 415 + }, + { + "epoch": 0.39, + "grad_norm": 0.4461126923561096, + "learning_rate": 6.965923355146147e-05, + "loss": 0.3511, + "step": 416 + }, + { + "epoch": 0.39, + "grad_norm": 0.3795164227485657, + "learning_rate": 6.951946375817474e-05, + "loss": 0.2828, + "step": 417 + }, + { + "epoch": 0.39, + "grad_norm": 0.46577778458595276, + "learning_rate": 6.937951377674221e-05, + "loss": 0.2939, + "step": 418 + }, + { + "epoch": 0.39, + "grad_norm": 0.4420101046562195, + "learning_rate": 6.923938489907066e-05, + "loss": 0.3268, + "step": 419 + }, + { + "epoch": 0.39, + "grad_norm": 0.4184642732143402, + "learning_rate": 6.909907841871829e-05, + "loss": 0.2857, + "step": 420 + }, + { + "epoch": 0.39, + "grad_norm": 0.4399305582046509, + "learning_rate": 6.895859563088283e-05, + "loss": 0.3702, + "step": 421 + }, + { + "epoch": 0.4, + "grad_norm": 0.36446893215179443, + "learning_rate": 6.881793783238948e-05, + "loss": 0.2467, + "step": 422 + }, + { + "epoch": 0.4, + "grad_norm": 0.4695288836956024, + "learning_rate": 6.867710632167903e-05, + "loss": 0.3516, + "step": 423 + }, + { + "epoch": 0.4, + "grad_norm": 0.3637704849243164, + "learning_rate": 6.853610239879586e-05, + "loss": 0.2723, + "step": 424 + }, + { + "epoch": 0.4, + "grad_norm": 0.3477213978767395, + "learning_rate": 6.839492736537588e-05, + "loss": 0.1958, + "step": 425 + }, + { + "epoch": 0.4, + "grad_norm": 0.4522038698196411, + "learning_rate": 6.82535825246346e-05, + "loss": 0.2695, + "step": 426 + }, + { + "epoch": 0.4, + "grad_norm": 0.4621041715145111, + "learning_rate": 6.811206918135502e-05, + "loss": 0.3995, + "step": 427 + }, + { + "epoch": 0.4, + "grad_norm": 0.4887210726737976, + "learning_rate": 6.797038864187564e-05, + "loss": 0.3456, + "step": 428 + }, + { + "epoch": 0.4, + "grad_norm": 0.37454670667648315, + "learning_rate": 6.782854221407838e-05, + "loss": 0.2969, + "step": 429 + }, + { + "epoch": 0.4, + "grad_norm": 0.4346952438354492, + "learning_rate": 6.768653120737652e-05, + "loss": 0.3828, + "step": 430 + }, + { + "epoch": 0.4, + "grad_norm": 0.404827356338501, + "learning_rate": 6.754435693270258e-05, + "loss": 0.2817, + "step": 431 + }, + { + "epoch": 0.41, + "grad_norm": 0.4594029486179352, + "learning_rate": 6.740202070249623e-05, + "loss": 0.3467, + "step": 432 + }, + { + "epoch": 0.41, + "grad_norm": 0.4898674190044403, + "learning_rate": 6.725952383069222e-05, + "loss": 0.4575, + "step": 433 + }, + { + "epoch": 0.41, + "grad_norm": 0.4370651841163635, + "learning_rate": 6.711686763270818e-05, + "loss": 0.3752, + "step": 434 + }, + { + "epoch": 0.41, + "grad_norm": 0.4255489110946655, + "learning_rate": 6.697405342543258e-05, + "loss": 0.378, + "step": 435 + }, + { + "epoch": 0.41, + "grad_norm": 0.3686978220939636, + "learning_rate": 6.683108252721238e-05, + "loss": 0.3042, + "step": 436 + }, + { + "epoch": 0.41, + "grad_norm": 0.4671451449394226, + "learning_rate": 6.668795625784113e-05, + "loss": 0.4444, + "step": 437 + }, + { + "epoch": 0.41, + "grad_norm": 0.351331502199173, + "learning_rate": 6.654467593854657e-05, + "loss": 0.2294, + "step": 438 + }, + { + "epoch": 0.41, + "grad_norm": 0.40956103801727295, + "learning_rate": 6.640124289197845e-05, + "loss": 0.3081, + "step": 439 + }, + { + "epoch": 0.41, + "grad_norm": 0.4217970669269562, + "learning_rate": 6.625765844219652e-05, + "loss": 0.3677, + "step": 440 + }, + { + "epoch": 0.41, + "grad_norm": 0.3987672030925751, + "learning_rate": 6.611392391465802e-05, + "loss": 0.2973, + "step": 441 + }, + { + "epoch": 0.41, + "grad_norm": 0.4282805621623993, + "learning_rate": 6.597004063620567e-05, + "loss": 0.3339, + "step": 442 + }, + { + "epoch": 0.42, + "grad_norm": 0.29611936211586, + "learning_rate": 6.582600993505534e-05, + "loss": 0.1904, + "step": 443 + }, + { + "epoch": 0.42, + "grad_norm": 0.4696163833141327, + "learning_rate": 6.568183314078376e-05, + "loss": 0.4168, + "step": 444 + }, + { + "epoch": 0.42, + "grad_norm": 0.33832165598869324, + "learning_rate": 6.553751158431627e-05, + "loss": 0.2301, + "step": 445 + }, + { + "epoch": 0.42, + "grad_norm": 0.379827082157135, + "learning_rate": 6.539304659791456e-05, + "loss": 0.2657, + "step": 446 + }, + { + "epoch": 0.42, + "grad_norm": 0.387509286403656, + "learning_rate": 6.524843951516434e-05, + "loss": 0.2709, + "step": 447 + }, + { + "epoch": 0.42, + "grad_norm": 0.4099612236022949, + "learning_rate": 6.510369167096308e-05, + "loss": 0.295, + "step": 448 + }, + { + "epoch": 0.42, + "grad_norm": 0.38275423645973206, + "learning_rate": 6.495880440150756e-05, + "loss": 0.2644, + "step": 449 + }, + { + "epoch": 0.42, + "grad_norm": 0.3869870603084564, + "learning_rate": 6.481377904428171e-05, + "loss": 0.2606, + "step": 450 + }, + { + "epoch": 0.42, + "grad_norm": 0.41019243001937866, + "learning_rate": 6.466861693804413e-05, + "loss": 0.3219, + "step": 451 + }, + { + "epoch": 0.42, + "grad_norm": 0.4798837900161743, + "learning_rate": 6.45233194228158e-05, + "loss": 0.4045, + "step": 452 + }, + { + "epoch": 0.42, + "grad_norm": 0.4423982501029968, + "learning_rate": 6.437788783986766e-05, + "loss": 0.2412, + "step": 453 + }, + { + "epoch": 0.43, + "grad_norm": 0.45012742280960083, + "learning_rate": 6.42323235317083e-05, + "loss": 0.3143, + "step": 454 + }, + { + "epoch": 0.43, + "grad_norm": 0.4368098974227905, + "learning_rate": 6.408662784207149e-05, + "loss": 0.296, + "step": 455 + }, + { + "epoch": 0.43, + "grad_norm": 0.4497329890727997, + "learning_rate": 6.394080211590381e-05, + "loss": 0.2861, + "step": 456 + }, + { + "epoch": 0.43, + "grad_norm": 0.3450700342655182, + "learning_rate": 6.379484769935223e-05, + "loss": 0.2295, + "step": 457 + }, + { + "epoch": 0.43, + "grad_norm": 0.4247216284275055, + "learning_rate": 6.364876593975173e-05, + "loss": 0.2657, + "step": 458 + }, + { + "epoch": 0.43, + "grad_norm": 0.38082388043403625, + "learning_rate": 6.350255818561277e-05, + "loss": 0.2137, + "step": 459 + }, + { + "epoch": 0.43, + "grad_norm": 0.43894073367118835, + "learning_rate": 6.335622578660889e-05, + "loss": 0.313, + "step": 460 + }, + { + "epoch": 0.43, + "grad_norm": 0.36916911602020264, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2436, + "step": 461 + }, + { + "epoch": 0.43, + "grad_norm": 0.5032907724380493, + "learning_rate": 6.306319245844133e-05, + "loss": 0.3732, + "step": 462 + }, + { + "epoch": 0.43, + "grad_norm": 0.40374746918678284, + "learning_rate": 6.291649423432799e-05, + "loss": 0.2411, + "step": 463 + }, + { + "epoch": 0.44, + "grad_norm": 0.36921069025993347, + "learning_rate": 6.276967677542542e-05, + "loss": 0.266, + "step": 464 + }, + { + "epoch": 0.44, + "grad_norm": 0.407466858625412, + "learning_rate": 6.262274143703554e-05, + "loss": 0.306, + "step": 465 + }, + { + "epoch": 0.44, + "grad_norm": 0.42362383008003235, + "learning_rate": 6.24756895755484e-05, + "loss": 0.2955, + "step": 466 + }, + { + "epoch": 0.44, + "grad_norm": 0.3616067171096802, + "learning_rate": 6.232852254842962e-05, + "loss": 0.2596, + "step": 467 + }, + { + "epoch": 0.44, + "grad_norm": 0.4745740592479706, + "learning_rate": 6.218124171420806e-05, + "loss": 0.3476, + "step": 468 + }, + { + "epoch": 0.44, + "grad_norm": 0.4039102792739868, + "learning_rate": 6.203384843246307e-05, + "loss": 0.3447, + "step": 469 + }, + { + "epoch": 0.44, + "grad_norm": 0.3852016031742096, + "learning_rate": 6.188634406381207e-05, + "loss": 0.3348, + "step": 470 + }, + { + "epoch": 0.44, + "grad_norm": 0.4698221683502197, + "learning_rate": 6.173872996989793e-05, + "loss": 0.3371, + "step": 471 + }, + { + "epoch": 0.44, + "grad_norm": 0.3284843862056732, + "learning_rate": 6.159100751337642e-05, + "loss": 0.2062, + "step": 472 + }, + { + "epoch": 0.44, + "grad_norm": 0.3801678419113159, + "learning_rate": 6.144317805790361e-05, + "loss": 0.2848, + "step": 473 + }, + { + "epoch": 0.44, + "grad_norm": 0.38208192586898804, + "learning_rate": 6.129524296812335e-05, + "loss": 0.2542, + "step": 474 + }, + { + "epoch": 0.45, + "grad_norm": 0.3336544334888458, + "learning_rate": 6.114720360965453e-05, + "loss": 0.187, + "step": 475 + }, + { + "epoch": 0.45, + "grad_norm": 0.41276127099990845, + "learning_rate": 6.099906134907868e-05, + "loss": 0.2808, + "step": 476 + }, + { + "epoch": 0.45, + "grad_norm": 0.4322654604911804, + "learning_rate": 6.085081755392714e-05, + "loss": 0.35, + "step": 477 + }, + { + "epoch": 0.45, + "grad_norm": 0.45964300632476807, + "learning_rate": 6.07024735926686e-05, + "loss": 0.3284, + "step": 478 + }, + { + "epoch": 0.45, + "grad_norm": 0.4558175802230835, + "learning_rate": 6.055403083469637e-05, + "loss": 0.3817, + "step": 479 + }, + { + "epoch": 0.45, + "grad_norm": 0.5052785873413086, + "learning_rate": 6.04054906503158e-05, + "loss": 0.4212, + "step": 480 + }, + { + "epoch": 0.45, + "grad_norm": 0.4662330746650696, + "learning_rate": 6.0256854410731565e-05, + "loss": 0.3404, + "step": 481 + }, + { + "epoch": 0.45, + "grad_norm": 0.4600191116333008, + "learning_rate": 6.010812348803509e-05, + "loss": 0.3785, + "step": 482 + }, + { + "epoch": 0.45, + "grad_norm": 0.45039844512939453, + "learning_rate": 5.99592992551918e-05, + "loss": 0.4206, + "step": 483 + }, + { + "epoch": 0.45, + "grad_norm": 0.42035818099975586, + "learning_rate": 5.9810383086028535e-05, + "loss": 0.3766, + "step": 484 + }, + { + "epoch": 0.45, + "grad_norm": 0.35778695344924927, + "learning_rate": 5.9661376355220734e-05, + "loss": 0.2886, + "step": 485 + }, + { + "epoch": 0.46, + "grad_norm": 0.48496320843696594, + "learning_rate": 5.9512280438279914e-05, + "loss": 0.4084, + "step": 486 + }, + { + "epoch": 0.46, + "grad_norm": 0.39029964804649353, + "learning_rate": 5.936309671154084e-05, + "loss": 0.3367, + "step": 487 + }, + { + "epoch": 0.46, + "grad_norm": 0.4327787458896637, + "learning_rate": 5.9213826552148886e-05, + "loss": 0.3404, + "step": 488 + }, + { + "epoch": 0.46, + "grad_norm": 0.41896331310272217, + "learning_rate": 5.906447133804731e-05, + "loss": 0.3544, + "step": 489 + }, + { + "epoch": 0.46, + "grad_norm": 0.37838464975357056, + "learning_rate": 5.891503244796448e-05, + "loss": 0.2552, + "step": 490 + }, + { + "epoch": 0.46, + "grad_norm": 0.3590022027492523, + "learning_rate": 5.8765511261401254e-05, + "loss": 0.2994, + "step": 491 + }, + { + "epoch": 0.46, + "grad_norm": 0.3592306673526764, + "learning_rate": 5.861590915861817e-05, + "loss": 0.2334, + "step": 492 + }, + { + "epoch": 0.46, + "grad_norm": 0.389812707901001, + "learning_rate": 5.846622752062268e-05, + "loss": 0.2642, + "step": 493 + }, + { + "epoch": 0.46, + "grad_norm": 0.431536465883255, + "learning_rate": 5.831646772915651e-05, + "loss": 0.3448, + "step": 494 + }, + { + "epoch": 0.46, + "grad_norm": 0.4194413423538208, + "learning_rate": 5.816663116668276e-05, + "loss": 0.3033, + "step": 495 + }, + { + "epoch": 0.47, + "grad_norm": 0.4021466076374054, + "learning_rate": 5.801671921637328e-05, + "loss": 0.3144, + "step": 496 + }, + { + "epoch": 0.47, + "grad_norm": 0.5036161541938782, + "learning_rate": 5.786673326209584e-05, + "loss": 0.4045, + "step": 497 + }, + { + "epoch": 0.47, + "grad_norm": 0.3767940104007721, + "learning_rate": 5.7716674688401286e-05, + "loss": 0.2666, + "step": 498 + }, + { + "epoch": 0.47, + "grad_norm": 0.5003858208656311, + "learning_rate": 5.756654488051091e-05, + "loss": 0.3855, + "step": 499 + }, + { + "epoch": 0.47, + "grad_norm": 0.4110037684440613, + "learning_rate": 5.7416345224303524e-05, + "loss": 0.3001, + "step": 500 + }, + { + "epoch": 0.47, + "grad_norm": 0.38694775104522705, + "learning_rate": 5.7266077106302785e-05, + "loss": 0.2862, + "step": 501 + }, + { + "epoch": 0.47, + "grad_norm": 0.34114885330200195, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.267, + "step": 502 + }, + { + "epoch": 0.47, + "grad_norm": 0.37278828024864197, + "learning_rate": 5.696534103416276e-05, + "loss": 0.271, + "step": 503 + }, + { + "epoch": 0.47, + "grad_norm": 0.4475939869880676, + "learning_rate": 5.6814875856179414e-05, + "loss": 0.3599, + "step": 504 + }, + { + "epoch": 0.47, + "grad_norm": 0.5131115913391113, + "learning_rate": 5.666434776868895e-05, + "loss": 0.3666, + "step": 505 + }, + { + "epoch": 0.47, + "grad_norm": 0.4798625409603119, + "learning_rate": 5.651375816124679e-05, + "loss": 0.3203, + "step": 506 + }, + { + "epoch": 0.48, + "grad_norm": 0.42612072825431824, + "learning_rate": 5.636310842397629e-05, + "loss": 0.3146, + "step": 507 + }, + { + "epoch": 0.48, + "grad_norm": 0.45160189270973206, + "learning_rate": 5.621239994755583e-05, + "loss": 0.3138, + "step": 508 + }, + { + "epoch": 0.48, + "grad_norm": 0.4129311740398407, + "learning_rate": 5.606163412320608e-05, + "loss": 0.2952, + "step": 509 + }, + { + "epoch": 0.48, + "grad_norm": 0.41268205642700195, + "learning_rate": 5.5910812342677065e-05, + "loss": 0.2656, + "step": 510 + }, + { + "epoch": 0.48, + "grad_norm": 0.36380165815353394, + "learning_rate": 5.575993599823536e-05, + "loss": 0.2511, + "step": 511 + }, + { + "epoch": 0.48, + "grad_norm": 0.3615700900554657, + "learning_rate": 5.560900648265124e-05, + "loss": 0.1815, + "step": 512 + }, + { + "epoch": 0.48, + "grad_norm": 0.49483224749565125, + "learning_rate": 5.545802518918579e-05, + "loss": 0.4012, + "step": 513 + }, + { + "epoch": 0.48, + "grad_norm": 0.5183067321777344, + "learning_rate": 5.5306993511578096e-05, + "loss": 0.4256, + "step": 514 + }, + { + "epoch": 0.48, + "grad_norm": 0.48457270860671997, + "learning_rate": 5.515591284403234e-05, + "loss": 0.3114, + "step": 515 + }, + { + "epoch": 0.48, + "grad_norm": 0.40141454339027405, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.2999, + "step": 516 + }, + { + "epoch": 0.48, + "grad_norm": 0.36214885115623474, + "learning_rate": 5.485361011819164e-05, + "loss": 0.2415, + "step": 517 + }, + { + "epoch": 0.49, + "grad_norm": 0.3985465466976166, + "learning_rate": 5.4702390850514726e-05, + "loss": 0.2639, + "step": 518 + }, + { + "epoch": 0.49, + "grad_norm": 0.4212491512298584, + "learning_rate": 5.455112817411006e-05, + "loss": 0.3155, + "step": 519 + }, + { + "epoch": 0.49, + "grad_norm": 0.406974196434021, + "learning_rate": 5.4399823485314226e-05, + "loss": 0.2597, + "step": 520 + }, + { + "epoch": 0.49, + "grad_norm": 0.44170641899108887, + "learning_rate": 5.4248478180851604e-05, + "loss": 0.3835, + "step": 521 + }, + { + "epoch": 0.49, + "grad_norm": 0.3631415069103241, + "learning_rate": 5.409709365782154e-05, + "loss": 0.2471, + "step": 522 + }, + { + "epoch": 0.49, + "grad_norm": 0.464281290769577, + "learning_rate": 5.3945671313685386e-05, + "loss": 0.3869, + "step": 523 + }, + { + "epoch": 0.49, + "grad_norm": 0.381562739610672, + "learning_rate": 5.379421254625366e-05, + "loss": 0.2764, + "step": 524 + }, + { + "epoch": 0.49, + "grad_norm": 0.4001575708389282, + "learning_rate": 5.364271875367311e-05, + "loss": 0.2963, + "step": 525 + }, + { + "epoch": 0.49, + "grad_norm": 0.42583319544792175, + "learning_rate": 5.3491191334413746e-05, + "loss": 0.3755, + "step": 526 + }, + { + "epoch": 0.49, + "grad_norm": 0.3744286596775055, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.2448, + "step": 527 + }, + { + "epoch": 0.5, + "grad_norm": 0.39580249786376953, + "learning_rate": 5.318804121127807e-05, + "loss": 0.2644, + "step": 528 + }, + { + "epoch": 0.5, + "grad_norm": 0.46420353651046753, + "learning_rate": 5.3036421305842276e-05, + "loss": 0.3886, + "step": 529 + }, + { + "epoch": 0.5, + "grad_norm": 0.3602772355079651, + "learning_rate": 5.288477337058293e-05, + "loss": 0.2409, + "step": 530 + }, + { + "epoch": 0.5, + "grad_norm": 0.4621635973453522, + "learning_rate": 5.273309880539301e-05, + "loss": 0.368, + "step": 531 + }, + { + "epoch": 0.5, + "grad_norm": 0.534472644329071, + "learning_rate": 5.258139901041132e-05, + "loss": 0.3049, + "step": 532 + }, + { + "epoch": 0.5, + "grad_norm": 0.3050483465194702, + "learning_rate": 5.242967538600957e-05, + "loss": 0.1831, + "step": 533 + }, + { + "epoch": 0.5, + "grad_norm": 0.427960067987442, + "learning_rate": 5.227792933277943e-05, + "loss": 0.3298, + "step": 534 + }, + { + "epoch": 0.5, + "grad_norm": 0.31485849618911743, + "learning_rate": 5.212616225151965e-05, + "loss": 0.2202, + "step": 535 + }, + { + "epoch": 0.5, + "grad_norm": 0.44149473309516907, + "learning_rate": 5.197437554322304e-05, + "loss": 0.4113, + "step": 536 + }, + { + "epoch": 0.5, + "grad_norm": 0.4465602934360504, + "learning_rate": 5.182257060906365e-05, + "loss": 0.3018, + "step": 537 + }, + { + "epoch": 0.5, + "grad_norm": 0.4594295620918274, + "learning_rate": 5.167074885038373e-05, + "loss": 0.3268, + "step": 538 + }, + { + "epoch": 0.51, + "grad_norm": 0.3535444140434265, + "learning_rate": 5.151891166868086e-05, + "loss": 0.2448, + "step": 539 + }, + { + "epoch": 0.51, + "grad_norm": 0.5269205570220947, + "learning_rate": 5.1367060465595006e-05, + "loss": 0.4655, + "step": 540 + }, + { + "epoch": 0.51, + "grad_norm": 0.3851945698261261, + "learning_rate": 5.121519664289553e-05, + "loss": 0.2964, + "step": 541 + }, + { + "epoch": 0.51, + "grad_norm": 0.4726141393184662, + "learning_rate": 5.106332160246834e-05, + "loss": 0.4122, + "step": 542 + }, + { + "epoch": 0.51, + "grad_norm": 0.37341347336769104, + "learning_rate": 5.0911436746302834e-05, + "loss": 0.2887, + "step": 543 + }, + { + "epoch": 0.51, + "grad_norm": 0.3809630274772644, + "learning_rate": 5.075954347647909e-05, + "loss": 0.2476, + "step": 544 + }, + { + "epoch": 0.51, + "grad_norm": 0.3767317235469818, + "learning_rate": 5.0607643195154796e-05, + "loss": 0.2833, + "step": 545 + }, + { + "epoch": 0.51, + "grad_norm": 0.3544347584247589, + "learning_rate": 5.045573730455241e-05, + "loss": 0.286, + "step": 546 + }, + { + "epoch": 0.51, + "grad_norm": 0.34240320324897766, + "learning_rate": 5.030382720694612e-05, + "loss": 0.2543, + "step": 547 + }, + { + "epoch": 0.51, + "grad_norm": 0.4301561117172241, + "learning_rate": 5.0151914304649015e-05, + "loss": 0.35, + "step": 548 + }, + { + "epoch": 0.51, + "grad_norm": 0.34088242053985596, + "learning_rate": 5e-05, + "loss": 0.228, + "step": 549 + }, + { + "epoch": 0.52, + "grad_norm": 0.4396790564060211, + "learning_rate": 4.984808569535101e-05, + "loss": 0.3361, + "step": 550 + }, + { + "epoch": 0.52, + "grad_norm": 0.4188593029975891, + "learning_rate": 4.969617279305388e-05, + "loss": 0.3144, + "step": 551 + }, + { + "epoch": 0.52, + "grad_norm": 0.3811877965927124, + "learning_rate": 4.954426269544761e-05, + "loss": 0.2667, + "step": 552 + }, + { + "epoch": 0.52, + "grad_norm": 0.40995457768440247, + "learning_rate": 4.939235680484522e-05, + "loss": 0.3144, + "step": 553 + }, + { + "epoch": 0.52, + "grad_norm": 0.3613314926624298, + "learning_rate": 4.924045652352092e-05, + "loss": 0.2425, + "step": 554 + }, + { + "epoch": 0.52, + "grad_norm": 0.38490086793899536, + "learning_rate": 4.908856325369718e-05, + "loss": 0.2996, + "step": 555 + }, + { + "epoch": 0.52, + "grad_norm": 0.3282059133052826, + "learning_rate": 4.893667839753168e-05, + "loss": 0.1906, + "step": 556 + }, + { + "epoch": 0.52, + "grad_norm": 0.4107285737991333, + "learning_rate": 4.878480335710448e-05, + "loss": 0.3535, + "step": 557 + }, + { + "epoch": 0.52, + "grad_norm": 0.2970867156982422, + "learning_rate": 4.8632939534405006e-05, + "loss": 0.1745, + "step": 558 + }, + { + "epoch": 0.52, + "grad_norm": 0.39580798149108887, + "learning_rate": 4.8481088331319146e-05, + "loss": 0.3176, + "step": 559 + }, + { + "epoch": 0.53, + "grad_norm": 0.3547394573688507, + "learning_rate": 4.832925114961629e-05, + "loss": 0.2462, + "step": 560 + }, + { + "epoch": 0.53, + "grad_norm": 0.4645850956439972, + "learning_rate": 4.817742939093635e-05, + "loss": 0.4052, + "step": 561 + }, + { + "epoch": 0.53, + "grad_norm": 0.4204746186733246, + "learning_rate": 4.8025624456776966e-05, + "loss": 0.268, + "step": 562 + }, + { + "epoch": 0.53, + "grad_norm": 0.3667820394039154, + "learning_rate": 4.787383774848037e-05, + "loss": 0.2983, + "step": 563 + }, + { + "epoch": 0.53, + "grad_norm": 0.479818731546402, + "learning_rate": 4.772207066722057e-05, + "loss": 0.3442, + "step": 564 + }, + { + "epoch": 0.53, + "grad_norm": 0.4877625107765198, + "learning_rate": 4.757032461399044e-05, + "loss": 0.3294, + "step": 565 + }, + { + "epoch": 0.53, + "grad_norm": 0.3830382227897644, + "learning_rate": 4.7418600989588694e-05, + "loss": 0.2375, + "step": 566 + }, + { + "epoch": 0.53, + "grad_norm": 0.3333114981651306, + "learning_rate": 4.726690119460701e-05, + "loss": 0.1962, + "step": 567 + }, + { + "epoch": 0.53, + "grad_norm": 0.4095388352870941, + "learning_rate": 4.7115226629417075e-05, + "loss": 0.3104, + "step": 568 + }, + { + "epoch": 0.53, + "grad_norm": 0.4421548843383789, + "learning_rate": 4.6963578694157736e-05, + "loss": 0.2638, + "step": 569 + }, + { + "epoch": 0.53, + "grad_norm": 0.4537515640258789, + "learning_rate": 4.681195878872194e-05, + "loss": 0.3362, + "step": 570 + }, + { + "epoch": 0.54, + "grad_norm": 0.3993587791919708, + "learning_rate": 4.666036831274392e-05, + "loss": 0.2565, + "step": 571 + }, + { + "epoch": 0.54, + "grad_norm": 0.4283117651939392, + "learning_rate": 4.6508808665586265e-05, + "loss": 0.3057, + "step": 572 + }, + { + "epoch": 0.54, + "grad_norm": 0.33654728531837463, + "learning_rate": 4.635728124632692e-05, + "loss": 0.2612, + "step": 573 + }, + { + "epoch": 0.54, + "grad_norm": 0.4272298514842987, + "learning_rate": 4.6205787453746336e-05, + "loss": 0.282, + "step": 574 + }, + { + "epoch": 0.54, + "grad_norm": 0.3670824468135834, + "learning_rate": 4.605432868631462e-05, + "loss": 0.2354, + "step": 575 + }, + { + "epoch": 0.54, + "grad_norm": 0.5032891631126404, + "learning_rate": 4.590290634217848e-05, + "loss": 0.3752, + "step": 576 + }, + { + "epoch": 0.54, + "grad_norm": 0.4507879316806793, + "learning_rate": 4.57515218191484e-05, + "loss": 0.3363, + "step": 577 + }, + { + "epoch": 0.54, + "grad_norm": 0.37974223494529724, + "learning_rate": 4.5600176514685786e-05, + "loss": 0.2745, + "step": 578 + }, + { + "epoch": 0.54, + "grad_norm": 0.42613649368286133, + "learning_rate": 4.5448871825889946e-05, + "loss": 0.3073, + "step": 579 + }, + { + "epoch": 0.54, + "grad_norm": 0.4176366627216339, + "learning_rate": 4.52976091494853e-05, + "loss": 0.2573, + "step": 580 + }, + { + "epoch": 0.54, + "grad_norm": 0.3846084475517273, + "learning_rate": 4.514638988180837e-05, + "loss": 0.2727, + "step": 581 + }, + { + "epoch": 0.55, + "grad_norm": 0.4902336001396179, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.3941, + "step": 582 + }, + { + "epoch": 0.55, + "grad_norm": 0.41963398456573486, + "learning_rate": 4.484408715596768e-05, + "loss": 0.2968, + "step": 583 + }, + { + "epoch": 0.55, + "grad_norm": 0.4140608012676239, + "learning_rate": 4.4693006488421915e-05, + "loss": 0.2776, + "step": 584 + }, + { + "epoch": 0.55, + "grad_norm": 0.4394283592700958, + "learning_rate": 4.454197481081422e-05, + "loss": 0.2955, + "step": 585 + }, + { + "epoch": 0.55, + "grad_norm": 0.4049432575702667, + "learning_rate": 4.439099351734878e-05, + "loss": 0.2811, + "step": 586 + }, + { + "epoch": 0.55, + "grad_norm": 0.40243345499038696, + "learning_rate": 4.4240064001764646e-05, + "loss": 0.265, + "step": 587 + }, + { + "epoch": 0.55, + "grad_norm": 0.4278285503387451, + "learning_rate": 4.4089187657322953e-05, + "loss": 0.2675, + "step": 588 + }, + { + "epoch": 0.55, + "grad_norm": 0.3500414192676544, + "learning_rate": 4.393836587679394e-05, + "loss": 0.2449, + "step": 589 + }, + { + "epoch": 0.55, + "grad_norm": 0.4892849326133728, + "learning_rate": 4.3787600052444174e-05, + "loss": 0.3522, + "step": 590 + }, + { + "epoch": 0.55, + "grad_norm": 0.4855495095252991, + "learning_rate": 4.363689157602373e-05, + "loss": 0.4272, + "step": 591 + }, + { + "epoch": 0.56, + "grad_norm": 0.37122058868408203, + "learning_rate": 4.348624183875322e-05, + "loss": 0.2937, + "step": 592 + }, + { + "epoch": 0.56, + "grad_norm": 0.356032133102417, + "learning_rate": 4.333565223131107e-05, + "loss": 0.2191, + "step": 593 + }, + { + "epoch": 0.56, + "grad_norm": 0.4197503924369812, + "learning_rate": 4.318512414382059e-05, + "loss": 0.3349, + "step": 594 + }, + { + "epoch": 0.56, + "grad_norm": 0.39776870608329773, + "learning_rate": 4.3034658965837255e-05, + "loss": 0.1818, + "step": 595 + }, + { + "epoch": 0.56, + "grad_norm": 0.4147531986236572, + "learning_rate": 4.288425808633575e-05, + "loss": 0.336, + "step": 596 + }, + { + "epoch": 0.56, + "grad_norm": 0.3913092017173767, + "learning_rate": 4.273392289369723e-05, + "loss": 0.2746, + "step": 597 + }, + { + "epoch": 0.56, + "grad_norm": 0.36486339569091797, + "learning_rate": 4.258365477569648e-05, + "loss": 0.227, + "step": 598 + }, + { + "epoch": 0.56, + "grad_norm": 0.4407197833061218, + "learning_rate": 4.2433455119489105e-05, + "loss": 0.3245, + "step": 599 + }, + { + "epoch": 0.56, + "grad_norm": 0.4557214677333832, + "learning_rate": 4.228332531159871e-05, + "loss": 0.3608, + "step": 600 + }, + { + "epoch": 0.56, + "grad_norm": 0.4273921251296997, + "learning_rate": 4.2133266737904176e-05, + "loss": 0.2968, + "step": 601 + }, + { + "epoch": 0.56, + "grad_norm": 0.4376170039176941, + "learning_rate": 4.1983280783626724e-05, + "loss": 0.3633, + "step": 602 + }, + { + "epoch": 0.57, + "grad_norm": 0.444912850856781, + "learning_rate": 4.183336883331723e-05, + "loss": 0.3685, + "step": 603 + }, + { + "epoch": 0.57, + "grad_norm": 0.4346312880516052, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.3486, + "step": 604 + }, + { + "epoch": 0.57, + "grad_norm": 0.4119686484336853, + "learning_rate": 4.153377247937732e-05, + "loss": 0.3092, + "step": 605 + }, + { + "epoch": 0.57, + "grad_norm": 0.4005814492702484, + "learning_rate": 4.138409084138185e-05, + "loss": 0.3097, + "step": 606 + }, + { + "epoch": 0.57, + "grad_norm": 0.4748894274234772, + "learning_rate": 4.1234488738598744e-05, + "loss": 0.2733, + "step": 607 + }, + { + "epoch": 0.57, + "grad_norm": 0.33364054560661316, + "learning_rate": 4.108496755203553e-05, + "loss": 0.2098, + "step": 608 + }, + { + "epoch": 0.57, + "grad_norm": 0.41625064611434937, + "learning_rate": 4.0935528661952716e-05, + "loss": 0.3067, + "step": 609 + }, + { + "epoch": 0.57, + "grad_norm": 0.40368083119392395, + "learning_rate": 4.0786173447851126e-05, + "loss": 0.3118, + "step": 610 + }, + { + "epoch": 0.57, + "grad_norm": 0.392261266708374, + "learning_rate": 4.063690328845916e-05, + "loss": 0.269, + "step": 611 + }, + { + "epoch": 0.57, + "grad_norm": 0.34611016511917114, + "learning_rate": 4.04877195617201e-05, + "loss": 0.2569, + "step": 612 + }, + { + "epoch": 0.58, + "grad_norm": 0.4573889672756195, + "learning_rate": 4.033862364477927e-05, + "loss": 0.355, + "step": 613 + }, + { + "epoch": 0.58, + "grad_norm": 0.3446807563304901, + "learning_rate": 4.0189616913971484e-05, + "loss": 0.2251, + "step": 614 + }, + { + "epoch": 0.58, + "grad_norm": 0.41673514246940613, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.385, + "step": 615 + }, + { + "epoch": 0.58, + "grad_norm": 0.3703855872154236, + "learning_rate": 3.989187651196493e-05, + "loss": 0.2316, + "step": 616 + }, + { + "epoch": 0.58, + "grad_norm": 0.41116270422935486, + "learning_rate": 3.974314558926844e-05, + "loss": 0.3043, + "step": 617 + }, + { + "epoch": 0.58, + "grad_norm": 0.36566266417503357, + "learning_rate": 3.9594509349684216e-05, + "loss": 0.3039, + "step": 618 + }, + { + "epoch": 0.58, + "grad_norm": 0.40920472145080566, + "learning_rate": 3.9445969165303647e-05, + "loss": 0.2838, + "step": 619 + }, + { + "epoch": 0.58, + "grad_norm": 0.4240154027938843, + "learning_rate": 3.929752640733141e-05, + "loss": 0.3118, + "step": 620 + }, + { + "epoch": 0.58, + "grad_norm": 0.42745116353034973, + "learning_rate": 3.914918244607287e-05, + "loss": 0.2864, + "step": 621 + }, + { + "epoch": 0.58, + "grad_norm": 0.3916018009185791, + "learning_rate": 3.900093865092134e-05, + "loss": 0.2405, + "step": 622 + }, + { + "epoch": 0.58, + "grad_norm": 0.38008588552474976, + "learning_rate": 3.885279639034546e-05, + "loss": 0.2444, + "step": 623 + }, + { + "epoch": 0.59, + "grad_norm": 0.4288583993911743, + "learning_rate": 3.870475703187667e-05, + "loss": 0.2701, + "step": 624 + }, + { + "epoch": 0.59, + "grad_norm": 0.4599088728427887, + "learning_rate": 3.855682194209639e-05, + "loss": 0.292, + "step": 625 + }, + { + "epoch": 0.59, + "grad_norm": 0.3144804537296295, + "learning_rate": 3.840899248662358e-05, + "loss": 0.2212, + "step": 626 + }, + { + "epoch": 0.59, + "grad_norm": 0.39628955721855164, + "learning_rate": 3.8261270030102084e-05, + "loss": 0.2823, + "step": 627 + }, + { + "epoch": 0.59, + "grad_norm": 0.42753270268440247, + "learning_rate": 3.8113655936187947e-05, + "loss": 0.3056, + "step": 628 + }, + { + "epoch": 0.59, + "grad_norm": 0.3899834454059601, + "learning_rate": 3.796615156753696e-05, + "loss": 0.3375, + "step": 629 + }, + { + "epoch": 0.59, + "grad_norm": 0.4072708785533905, + "learning_rate": 3.7818758285791955e-05, + "loss": 0.3034, + "step": 630 + }, + { + "epoch": 0.59, + "grad_norm": 0.37316933274269104, + "learning_rate": 3.767147745157039e-05, + "loss": 0.3322, + "step": 631 + }, + { + "epoch": 0.59, + "grad_norm": 0.5441315770149231, + "learning_rate": 3.7524310424451635e-05, + "loss": 0.3682, + "step": 632 + }, + { + "epoch": 0.59, + "grad_norm": 0.4088776111602783, + "learning_rate": 3.7377258562964454e-05, + "loss": 0.3208, + "step": 633 + }, + { + "epoch": 0.59, + "grad_norm": 0.42660993337631226, + "learning_rate": 3.723032322457458e-05, + "loss": 0.3428, + "step": 634 + }, + { + "epoch": 0.6, + "grad_norm": 0.49614468216896057, + "learning_rate": 3.708350576567204e-05, + "loss": 0.4129, + "step": 635 + }, + { + "epoch": 0.6, + "grad_norm": 0.3900456428527832, + "learning_rate": 3.6936807541558674e-05, + "loss": 0.2697, + "step": 636 + }, + { + "epoch": 0.6, + "grad_norm": 0.47792574763298035, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.4538, + "step": 637 + }, + { + "epoch": 0.6, + "grad_norm": 0.39278462529182434, + "learning_rate": 3.664377421339111e-05, + "loss": 0.2113, + "step": 638 + }, + { + "epoch": 0.6, + "grad_norm": 0.3987380862236023, + "learning_rate": 3.6497441814387247e-05, + "loss": 0.2929, + "step": 639 + }, + { + "epoch": 0.6, + "grad_norm": 0.3924303948879242, + "learning_rate": 3.6351234060248286e-05, + "loss": 0.2891, + "step": 640 + }, + { + "epoch": 0.6, + "grad_norm": 0.4462815225124359, + "learning_rate": 3.6205152300647784e-05, + "loss": 0.3793, + "step": 641 + }, + { + "epoch": 0.6, + "grad_norm": 0.361194372177124, + "learning_rate": 3.605919788409621e-05, + "loss": 0.304, + "step": 642 + }, + { + "epoch": 0.6, + "grad_norm": 0.3908407390117645, + "learning_rate": 3.591337215792852e-05, + "loss": 0.3189, + "step": 643 + }, + { + "epoch": 0.6, + "grad_norm": 0.37534841895103455, + "learning_rate": 3.5767676468291713e-05, + "loss": 0.237, + "step": 644 + }, + { + "epoch": 0.61, + "grad_norm": 0.4761614203453064, + "learning_rate": 3.562211216013235e-05, + "loss": 0.4667, + "step": 645 + }, + { + "epoch": 0.61, + "grad_norm": 0.40542420744895935, + "learning_rate": 3.5476680577184206e-05, + "loss": 0.33, + "step": 646 + }, + { + "epoch": 0.61, + "grad_norm": 0.33448925614356995, + "learning_rate": 3.533138306195588e-05, + "loss": 0.2378, + "step": 647 + }, + { + "epoch": 0.61, + "grad_norm": 0.4175407290458679, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.2922, + "step": 648 + }, + { + "epoch": 0.61, + "grad_norm": 0.3705791234970093, + "learning_rate": 3.5041195598492446e-05, + "loss": 0.3063, + "step": 649 + }, + { + "epoch": 0.61, + "grad_norm": 0.3703921139240265, + "learning_rate": 3.489630832903694e-05, + "loss": 0.2685, + "step": 650 + }, + { + "epoch": 0.61, + "grad_norm": 0.44679349660873413, + "learning_rate": 3.475156048483567e-05, + "loss": 0.325, + "step": 651 + }, + { + "epoch": 0.61, + "grad_norm": 0.4002423882484436, + "learning_rate": 3.460695340208546e-05, + "loss": 0.3584, + "step": 652 + }, + { + "epoch": 0.61, + "grad_norm": 0.38548558950424194, + "learning_rate": 3.446248841568375e-05, + "loss": 0.2725, + "step": 653 + }, + { + "epoch": 0.61, + "grad_norm": 0.41728684306144714, + "learning_rate": 3.431816685921625e-05, + "loss": 0.3218, + "step": 654 + }, + { + "epoch": 0.61, + "grad_norm": 0.3534894585609436, + "learning_rate": 3.417399006494466e-05, + "loss": 0.2367, + "step": 655 + }, + { + "epoch": 0.62, + "grad_norm": 0.3368327021598816, + "learning_rate": 3.402995936379433e-05, + "loss": 0.2411, + "step": 656 + }, + { + "epoch": 0.62, + "grad_norm": 0.36815300583839417, + "learning_rate": 3.3886076085341986e-05, + "loss": 0.2482, + "step": 657 + }, + { + "epoch": 0.62, + "grad_norm": 0.3257908821105957, + "learning_rate": 3.37423415578035e-05, + "loss": 0.2375, + "step": 658 + }, + { + "epoch": 0.62, + "grad_norm": 0.36529654264450073, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.2368, + "step": 659 + }, + { + "epoch": 0.62, + "grad_norm": 0.3795417845249176, + "learning_rate": 3.345532406145345e-05, + "loss": 0.2878, + "step": 660 + }, + { + "epoch": 0.62, + "grad_norm": 0.34727954864501953, + "learning_rate": 3.331204374215888e-05, + "loss": 0.2283, + "step": 661 + }, + { + "epoch": 0.62, + "grad_norm": 0.3729381561279297, + "learning_rate": 3.316891747278761e-05, + "loss": 0.2449, + "step": 662 + }, + { + "epoch": 0.62, + "grad_norm": 0.4766552746295929, + "learning_rate": 3.302594657456744e-05, + "loss": 0.3729, + "step": 663 + }, + { + "epoch": 0.62, + "grad_norm": 0.45169883966445923, + "learning_rate": 3.288313236729183e-05, + "loss": 0.3373, + "step": 664 + }, + { + "epoch": 0.62, + "grad_norm": 0.4404531717300415, + "learning_rate": 3.274047616930781e-05, + "loss": 0.3399, + "step": 665 + }, + { + "epoch": 0.62, + "grad_norm": 0.4247913062572479, + "learning_rate": 3.259797929750378e-05, + "loss": 0.2615, + "step": 666 + }, + { + "epoch": 0.63, + "grad_norm": 0.3882124722003937, + "learning_rate": 3.245564306729744e-05, + "loss": 0.2928, + "step": 667 + }, + { + "epoch": 0.63, + "grad_norm": 0.3825494945049286, + "learning_rate": 3.231346879262349e-05, + "loss": 0.2818, + "step": 668 + }, + { + "epoch": 0.63, + "grad_norm": 0.38851484656333923, + "learning_rate": 3.217145778592162e-05, + "loss": 0.3119, + "step": 669 + }, + { + "epoch": 0.63, + "grad_norm": 0.4527076184749603, + "learning_rate": 3.202961135812437e-05, + "loss": 0.3445, + "step": 670 + }, + { + "epoch": 0.63, + "grad_norm": 0.41588783264160156, + "learning_rate": 3.1887930818644996e-05, + "loss": 0.3486, + "step": 671 + }, + { + "epoch": 0.63, + "grad_norm": 0.4113435447216034, + "learning_rate": 3.1746417475365405e-05, + "loss": 0.2806, + "step": 672 + }, + { + "epoch": 0.63, + "grad_norm": 0.45771273970603943, + "learning_rate": 3.1605072634624125e-05, + "loss": 0.4103, + "step": 673 + }, + { + "epoch": 0.63, + "grad_norm": 0.4430786073207855, + "learning_rate": 3.146389760120416e-05, + "loss": 0.2651, + "step": 674 + }, + { + "epoch": 0.63, + "grad_norm": 0.4159556031227112, + "learning_rate": 3.132289367832097e-05, + "loss": 0.292, + "step": 675 + }, + { + "epoch": 0.63, + "grad_norm": 0.4217040240764618, + "learning_rate": 3.118206216761053e-05, + "loss": 0.3106, + "step": 676 + }, + { + "epoch": 0.64, + "grad_norm": 0.3670906126499176, + "learning_rate": 3.104140436911719e-05, + "loss": 0.189, + "step": 677 + }, + { + "epoch": 0.64, + "grad_norm": 0.3929611146450043, + "learning_rate": 3.0900921581281725e-05, + "loss": 0.3106, + "step": 678 + }, + { + "epoch": 0.64, + "grad_norm": 0.4370023012161255, + "learning_rate": 3.076061510092935e-05, + "loss": 0.3362, + "step": 679 + }, + { + "epoch": 0.64, + "grad_norm": 0.39130982756614685, + "learning_rate": 3.062048622325779e-05, + "loss": 0.3072, + "step": 680 + }, + { + "epoch": 0.64, + "grad_norm": 0.4033917188644409, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.2458, + "step": 681 + }, + { + "epoch": 0.64, + "grad_norm": 0.43826785683631897, + "learning_rate": 3.034076644853853e-05, + "loss": 0.2895, + "step": 682 + }, + { + "epoch": 0.64, + "grad_norm": 0.46332448720932007, + "learning_rate": 3.0201178133641038e-05, + "loss": 0.3829, + "step": 683 + }, + { + "epoch": 0.64, + "grad_norm": 0.3498688340187073, + "learning_rate": 3.0061772585700953e-05, + "loss": 0.2358, + "step": 684 + }, + { + "epoch": 0.64, + "grad_norm": 0.3833886981010437, + "learning_rate": 2.992255109159926e-05, + "loss": 0.2501, + "step": 685 + }, + { + "epoch": 0.64, + "grad_norm": 0.3906776010990143, + "learning_rate": 2.9783514936517965e-05, + "loss": 0.309, + "step": 686 + }, + { + "epoch": 0.64, + "grad_norm": 0.40844976902008057, + "learning_rate": 2.9644665403928117e-05, + "loss": 0.2933, + "step": 687 + }, + { + "epoch": 0.65, + "grad_norm": 0.4485379159450531, + "learning_rate": 2.950600377557804e-05, + "loss": 0.4246, + "step": 688 + }, + { + "epoch": 0.65, + "grad_norm": 0.398406982421875, + "learning_rate": 2.9367531331481436e-05, + "loss": 0.2596, + "step": 689 + }, + { + "epoch": 0.65, + "grad_norm": 0.40626242756843567, + "learning_rate": 2.9229249349905684e-05, + "loss": 0.3252, + "step": 690 + }, + { + "epoch": 0.65, + "grad_norm": 0.39439576864242554, + "learning_rate": 2.909115910735991e-05, + "loss": 0.2669, + "step": 691 + }, + { + "epoch": 0.65, + "grad_norm": 0.35952499508857727, + "learning_rate": 2.895326187858326e-05, + "loss": 0.2393, + "step": 692 + }, + { + "epoch": 0.65, + "grad_norm": 0.476965993642807, + "learning_rate": 2.881555893653314e-05, + "loss": 0.4749, + "step": 693 + }, + { + "epoch": 0.65, + "grad_norm": 0.40909814834594727, + "learning_rate": 2.8678051552373487e-05, + "loss": 0.3012, + "step": 694 + }, + { + "epoch": 0.65, + "grad_norm": 0.34073591232299805, + "learning_rate": 2.854074099546291e-05, + "loss": 0.2452, + "step": 695 + }, + { + "epoch": 0.65, + "grad_norm": 0.40000537037849426, + "learning_rate": 2.8403628533343206e-05, + "loss": 0.2009, + "step": 696 + }, + { + "epoch": 0.65, + "grad_norm": 0.38767191767692566, + "learning_rate": 2.826671543172738e-05, + "loss": 0.3107, + "step": 697 + }, + { + "epoch": 0.65, + "grad_norm": 0.35624590516090393, + "learning_rate": 2.8130002954488183e-05, + "loss": 0.2552, + "step": 698 + }, + { + "epoch": 0.66, + "grad_norm": 0.3999212682247162, + "learning_rate": 2.799349236364634e-05, + "loss": 0.3384, + "step": 699 + }, + { + "epoch": 0.66, + "grad_norm": 0.41786062717437744, + "learning_rate": 2.7857184919358937e-05, + "loss": 0.3188, + "step": 700 + }, + { + "epoch": 0.66, + "grad_norm": 0.3465578258037567, + "learning_rate": 2.7721081879907718e-05, + "loss": 0.2404, + "step": 701 + }, + { + "epoch": 0.66, + "grad_norm": 0.4516163766384125, + "learning_rate": 2.7585184501687577e-05, + "loss": 0.3522, + "step": 702 + }, + { + "epoch": 0.66, + "grad_norm": 0.4111158549785614, + "learning_rate": 2.74494940391949e-05, + "loss": 0.357, + "step": 703 + }, + { + "epoch": 0.66, + "grad_norm": 0.4369189143180847, + "learning_rate": 2.731401174501601e-05, + "loss": 0.3152, + "step": 704 + }, + { + "epoch": 0.66, + "grad_norm": 0.37029463052749634, + "learning_rate": 2.7178738869815506e-05, + "loss": 0.2604, + "step": 705 + }, + { + "epoch": 0.66, + "grad_norm": 0.3447953462600708, + "learning_rate": 2.7043676662324878e-05, + "loss": 0.226, + "step": 706 + }, + { + "epoch": 0.66, + "grad_norm": 0.41446515917778015, + "learning_rate": 2.69088263693309e-05, + "loss": 0.3036, + "step": 707 + }, + { + "epoch": 0.66, + "grad_norm": 0.37902921438217163, + "learning_rate": 2.6774189235664026e-05, + "loss": 0.2527, + "step": 708 + }, + { + "epoch": 0.67, + "grad_norm": 0.4798141419887543, + "learning_rate": 2.663976650418715e-05, + "loss": 0.4065, + "step": 709 + }, + { + "epoch": 0.67, + "grad_norm": 0.45657438039779663, + "learning_rate": 2.650555941578381e-05, + "loss": 0.4167, + "step": 710 + }, + { + "epoch": 0.67, + "grad_norm": 0.4082154929637909, + "learning_rate": 2.6371569209347014e-05, + "loss": 0.2944, + "step": 711 + }, + { + "epoch": 0.67, + "grad_norm": 0.41583922505378723, + "learning_rate": 2.6237797121767634e-05, + "loss": 0.3708, + "step": 712 + }, + { + "epoch": 0.67, + "grad_norm": 0.3839868903160095, + "learning_rate": 2.6104244387923082e-05, + "loss": 0.2698, + "step": 713 + }, + { + "epoch": 0.67, + "grad_norm": 0.3401528596878052, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.2001, + "step": 714 + }, + { + "epoch": 0.67, + "grad_norm": 0.38874009251594543, + "learning_rate": 2.5837801910812053e-05, + "loss": 0.2999, + "step": 715 + }, + { + "epoch": 0.67, + "grad_norm": 0.33934372663497925, + "learning_rate": 2.5704914627130374e-05, + "loss": 0.2624, + "step": 716 + }, + { + "epoch": 0.67, + "grad_norm": 0.4392395317554474, + "learning_rate": 2.5572251616330373e-05, + "loss": 0.3792, + "step": 717 + }, + { + "epoch": 0.67, + "grad_norm": 0.386403352022171, + "learning_rate": 2.5439814103051284e-05, + "loss": 0.2239, + "step": 718 + }, + { + "epoch": 0.67, + "grad_norm": 0.31485530734062195, + "learning_rate": 2.530760330985079e-05, + "loss": 0.183, + "step": 719 + }, + { + "epoch": 0.68, + "grad_norm": 0.45125994086265564, + "learning_rate": 2.517562045719367e-05, + "loss": 0.3995, + "step": 720 + }, + { + "epoch": 0.68, + "grad_norm": 0.3921152353286743, + "learning_rate": 2.504386676344047e-05, + "loss": 0.3222, + "step": 721 + }, + { + "epoch": 0.68, + "grad_norm": 0.4134279489517212, + "learning_rate": 2.4912343444836445e-05, + "loss": 0.3354, + "step": 722 + }, + { + "epoch": 0.68, + "grad_norm": 0.4451929032802582, + "learning_rate": 2.4781051715500076e-05, + "loss": 0.3869, + "step": 723 + }, + { + "epoch": 0.68, + "grad_norm": 0.3529045879840851, + "learning_rate": 2.46499927874121e-05, + "loss": 0.2156, + "step": 724 + }, + { + "epoch": 0.68, + "grad_norm": 0.3834676444530487, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.2475, + "step": 725 + }, + { + "epoch": 0.68, + "grad_norm": 0.43510353565216064, + "learning_rate": 2.4388578172147675e-05, + "loss": 0.3217, + "step": 726 + }, + { + "epoch": 0.68, + "grad_norm": 0.387541800737381, + "learning_rate": 2.4258224898142807e-05, + "loss": 0.2327, + "step": 727 + }, + { + "epoch": 0.68, + "grad_norm": 0.4362310767173767, + "learning_rate": 2.4128109251707155e-05, + "loss": 0.3878, + "step": 728 + }, + { + "epoch": 0.68, + "grad_norm": 0.3979613482952118, + "learning_rate": 2.399823243396476e-05, + "loss": 0.2884, + "step": 729 + }, + { + "epoch": 0.68, + "grad_norm": 0.3934382498264313, + "learning_rate": 2.3868595643834995e-05, + "loss": 0.255, + "step": 730 + }, + { + "epoch": 0.69, + "grad_norm": 0.4257848262786865, + "learning_rate": 2.373920007802144e-05, + "loss": 0.3247, + "step": 731 + }, + { + "epoch": 0.69, + "grad_norm": 0.3740795850753784, + "learning_rate": 2.361004693100094e-05, + "loss": 0.2647, + "step": 732 + }, + { + "epoch": 0.69, + "grad_norm": 0.4017227590084076, + "learning_rate": 2.3481137395012513e-05, + "loss": 0.3015, + "step": 733 + }, + { + "epoch": 0.69, + "grad_norm": 0.3802047073841095, + "learning_rate": 2.3352472660046295e-05, + "loss": 0.2571, + "step": 734 + }, + { + "epoch": 0.69, + "grad_norm": 0.3709452450275421, + "learning_rate": 2.322405391383273e-05, + "loss": 0.3034, + "step": 735 + }, + { + "epoch": 0.69, + "grad_norm": 0.4259112775325775, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.401, + "step": 736 + }, + { + "epoch": 0.69, + "grad_norm": 0.3781328797340393, + "learning_rate": 2.296795912722014e-05, + "loss": 0.3018, + "step": 737 + }, + { + "epoch": 0.69, + "grad_norm": 0.3574370741844177, + "learning_rate": 2.284028545088423e-05, + "loss": 0.2579, + "step": 738 + }, + { + "epoch": 0.69, + "grad_norm": 0.3482043743133545, + "learning_rate": 2.2712862491405436e-05, + "loss": 0.2644, + "step": 739 + }, + { + "epoch": 0.69, + "grad_norm": 0.3467924892902374, + "learning_rate": 2.258569142505098e-05, + "loss": 0.2472, + "step": 740 + }, + { + "epoch": 0.7, + "grad_norm": 0.44436389207839966, + "learning_rate": 2.2458773425762912e-05, + "loss": 0.2769, + "step": 741 + }, + { + "epoch": 0.7, + "grad_norm": 0.4540431797504425, + "learning_rate": 2.2332109665147127e-05, + "loss": 0.3808, + "step": 742 + }, + { + "epoch": 0.7, + "grad_norm": 0.4173644185066223, + "learning_rate": 2.2205701312462617e-05, + "loss": 0.3577, + "step": 743 + }, + { + "epoch": 0.7, + "grad_norm": 0.39266419410705566, + "learning_rate": 2.2079549534610606e-05, + "loss": 0.2936, + "step": 744 + }, + { + "epoch": 0.7, + "grad_norm": 0.4002622365951538, + "learning_rate": 2.1953655496123853e-05, + "loss": 0.2844, + "step": 745 + }, + { + "epoch": 0.7, + "grad_norm": 0.4227045774459839, + "learning_rate": 2.1828020359155905e-05, + "loss": 0.2866, + "step": 746 + }, + { + "epoch": 0.7, + "grad_norm": 0.37462419271469116, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.2438, + "step": 747 + }, + { + "epoch": 0.7, + "grad_norm": 0.46883416175842285, + "learning_rate": 2.1577531426429782e-05, + "loss": 0.3699, + "step": 748 + }, + { + "epoch": 0.7, + "grad_norm": 0.3425189256668091, + "learning_rate": 2.1452679942985993e-05, + "loss": 0.2275, + "step": 749 + }, + { + "epoch": 0.7, + "grad_norm": 0.4499514698982239, + "learning_rate": 2.132809198566837e-05, + "loss": 0.3443, + "step": 750 + }, + { + "epoch": 0.7, + "grad_norm": 0.46920251846313477, + "learning_rate": 2.1203768704573672e-05, + "loss": 0.3687, + "step": 751 + }, + { + "epoch": 0.71, + "grad_norm": 0.3865799903869629, + "learning_rate": 2.1079711247355505e-05, + "loss": 0.2733, + "step": 752 + }, + { + "epoch": 0.71, + "grad_norm": 0.41232362389564514, + "learning_rate": 2.095592075921347e-05, + "loss": 0.246, + "step": 753 + }, + { + "epoch": 0.71, + "grad_norm": 0.32365286350250244, + "learning_rate": 2.08323983828828e-05, + "loss": 0.1873, + "step": 754 + }, + { + "epoch": 0.71, + "grad_norm": 0.4555100202560425, + "learning_rate": 2.0709145258623704e-05, + "loss": 0.3822, + "step": 755 + }, + { + "epoch": 0.71, + "grad_norm": 0.41039618849754333, + "learning_rate": 2.0586162524210895e-05, + "loss": 0.283, + "step": 756 + }, + { + "epoch": 0.71, + "grad_norm": 0.43508920073509216, + "learning_rate": 2.0463451314923015e-05, + "loss": 0.3266, + "step": 757 + }, + { + "epoch": 0.71, + "grad_norm": 0.43862390518188477, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.3413, + "step": 758 + }, + { + "epoch": 0.71, + "grad_norm": 0.3817911446094513, + "learning_rate": 2.021884800029379e-05, + "loss": 0.2646, + "step": 759 + }, + { + "epoch": 0.71, + "grad_norm": 0.3952636122703552, + "learning_rate": 2.009695815293548e-05, + "loss": 0.2985, + "step": 760 + }, + { + "epoch": 0.71, + "grad_norm": 0.4302140474319458, + "learning_rate": 1.9975344346647297e-05, + "loss": 0.3136, + "step": 761 + }, + { + "epoch": 0.71, + "grad_norm": 0.38452428579330444, + "learning_rate": 1.9854007704071064e-05, + "loss": 0.2897, + "step": 762 + }, + { + "epoch": 0.72, + "grad_norm": 0.3167802095413208, + "learning_rate": 1.973294934529007e-05, + "loss": 0.2055, + "step": 763 + }, + { + "epoch": 0.72, + "grad_norm": 0.3533049523830414, + "learning_rate": 1.961217038781863e-05, + "loss": 0.2597, + "step": 764 + }, + { + "epoch": 0.72, + "grad_norm": 0.4575469195842743, + "learning_rate": 1.9491671946591962e-05, + "loss": 0.3197, + "step": 765 + }, + { + "epoch": 0.72, + "grad_norm": 0.4529496729373932, + "learning_rate": 1.9371455133955675e-05, + "loss": 0.3478, + "step": 766 + }, + { + "epoch": 0.72, + "grad_norm": 0.3405405580997467, + "learning_rate": 1.925152105965567e-05, + "loss": 0.268, + "step": 767 + }, + { + "epoch": 0.72, + "grad_norm": 0.3697284758090973, + "learning_rate": 1.9131870830827818e-05, + "loss": 0.2315, + "step": 768 + }, + { + "epoch": 0.72, + "grad_norm": 0.41890275478363037, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.2721, + "step": 769 + }, + { + "epoch": 0.72, + "grad_norm": 0.37630268931388855, + "learning_rate": 1.8893426325020686e-05, + "loss": 0.2562, + "step": 770 + }, + { + "epoch": 0.72, + "grad_norm": 0.4262970983982086, + "learning_rate": 1.8774634249171185e-05, + "loss": 0.4016, + "step": 771 + }, + { + "epoch": 0.72, + "grad_norm": 0.3830007016658783, + "learning_rate": 1.8656130421033123e-05, + "loss": 0.2976, + "step": 772 + }, + { + "epoch": 0.73, + "grad_norm": 0.34678715467453003, + "learning_rate": 1.8537915934539486e-05, + "loss": 0.1675, + "step": 773 + }, + { + "epoch": 0.73, + "grad_norm": 0.38951581716537476, + "learning_rate": 1.841999188095224e-05, + "loss": 0.3078, + "step": 774 + }, + { + "epoch": 0.73, + "grad_norm": 0.41767624020576477, + "learning_rate": 1.830235934885237e-05, + "loss": 0.3639, + "step": 775 + }, + { + "epoch": 0.73, + "grad_norm": 0.35205718874931335, + "learning_rate": 1.818501942412975e-05, + "loss": 0.2867, + "step": 776 + }, + { + "epoch": 0.73, + "grad_norm": 0.3999721109867096, + "learning_rate": 1.8067973189973075e-05, + "loss": 0.3284, + "step": 777 + }, + { + "epoch": 0.73, + "grad_norm": 0.4182070195674896, + "learning_rate": 1.7951221726860045e-05, + "loss": 0.337, + "step": 778 + }, + { + "epoch": 0.73, + "grad_norm": 0.41383877396583557, + "learning_rate": 1.7834766112547142e-05, + "loss": 0.3111, + "step": 779 + }, + { + "epoch": 0.73, + "grad_norm": 0.381978303194046, + "learning_rate": 1.771860742205988e-05, + "loss": 0.3027, + "step": 780 + }, + { + "epoch": 0.73, + "grad_norm": 0.41920870542526245, + "learning_rate": 1.7602746727682796e-05, + "loss": 0.3606, + "step": 781 + }, + { + "epoch": 0.73, + "grad_norm": 0.3676052391529083, + "learning_rate": 1.7487185098949565e-05, + "loss": 0.2786, + "step": 782 + }, + { + "epoch": 0.73, + "grad_norm": 0.43040353059768677, + "learning_rate": 1.7371923602633078e-05, + "loss": 0.3621, + "step": 783 + }, + { + "epoch": 0.74, + "grad_norm": 0.4287657141685486, + "learning_rate": 1.725696330273575e-05, + "loss": 0.3691, + "step": 784 + }, + { + "epoch": 0.74, + "grad_norm": 0.40313223004341125, + "learning_rate": 1.7142305260479474e-05, + "loss": 0.362, + "step": 785 + }, + { + "epoch": 0.74, + "grad_norm": 0.3955127000808716, + "learning_rate": 1.7027950534296027e-05, + "loss": 0.3765, + "step": 786 + }, + { + "epoch": 0.74, + "grad_norm": 0.34894323348999023, + "learning_rate": 1.6913900179817144e-05, + "loss": 0.2277, + "step": 787 + }, + { + "epoch": 0.74, + "grad_norm": 0.39063969254493713, + "learning_rate": 1.6800155249864896e-05, + "loss": 0.3301, + "step": 788 + }, + { + "epoch": 0.74, + "grad_norm": 0.3707781136035919, + "learning_rate": 1.668671679444192e-05, + "loss": 0.286, + "step": 789 + }, + { + "epoch": 0.74, + "grad_norm": 0.4262647330760956, + "learning_rate": 1.6573585860721646e-05, + "loss": 0.3549, + "step": 790 + }, + { + "epoch": 0.74, + "grad_norm": 0.3833974003791809, + "learning_rate": 1.646076349303884e-05, + "loss": 0.2805, + "step": 791 + }, + { + "epoch": 0.74, + "grad_norm": 0.3846880793571472, + "learning_rate": 1.63482507328797e-05, + "loss": 0.2934, + "step": 792 + }, + { + "epoch": 0.74, + "grad_norm": 0.30670422315597534, + "learning_rate": 1.6236048618872456e-05, + "loss": 0.2128, + "step": 793 + }, + { + "epoch": 0.74, + "grad_norm": 0.502191424369812, + "learning_rate": 1.6124158186777676e-05, + "loss": 0.3166, + "step": 794 + }, + { + "epoch": 0.75, + "grad_norm": 0.4251638650894165, + "learning_rate": 1.6012580469478743e-05, + "loss": 0.3323, + "step": 795 + }, + { + "epoch": 0.75, + "grad_norm": 0.3232210576534271, + "learning_rate": 1.5901316496972262e-05, + "loss": 0.1939, + "step": 796 + }, + { + "epoch": 0.75, + "grad_norm": 0.3785446286201477, + "learning_rate": 1.5790367296358644e-05, + "loss": 0.2983, + "step": 797 + }, + { + "epoch": 0.75, + "grad_norm": 0.31697627902030945, + "learning_rate": 1.5679733891832556e-05, + "loss": 0.2113, + "step": 798 + }, + { + "epoch": 0.75, + "grad_norm": 0.38774025440216064, + "learning_rate": 1.55694173046735e-05, + "loss": 0.2785, + "step": 799 + }, + { + "epoch": 0.75, + "grad_norm": 0.3695196807384491, + "learning_rate": 1.5459418553236343e-05, + "loss": 0.2784, + "step": 800 + }, + { + "epoch": 0.75, + "grad_norm": 0.4070797264575958, + "learning_rate": 1.5349738652941968e-05, + "loss": 0.3233, + "step": 801 + }, + { + "epoch": 0.75, + "grad_norm": 0.37904244661331177, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.2564, + "step": 802 + }, + { + "epoch": 0.75, + "grad_norm": 0.3779933750629425, + "learning_rate": 1.5131339452738863e-05, + "loss": 0.3678, + "step": 803 + }, + { + "epoch": 0.75, + "grad_norm": 0.3886728584766388, + "learning_rate": 1.5022622168917649e-05, + "loss": 0.2751, + "step": 804 + }, + { + "epoch": 0.76, + "grad_norm": 0.3595404624938965, + "learning_rate": 1.4914227768395595e-05, + "loss": 0.2592, + "step": 805 + }, + { + "epoch": 0.76, + "grad_norm": 0.4031426012516022, + "learning_rate": 1.4806157251783515e-05, + "loss": 0.2972, + "step": 806 + }, + { + "epoch": 0.76, + "grad_norm": 0.36621609330177307, + "learning_rate": 1.4698411616702356e-05, + "loss": 0.253, + "step": 807 + }, + { + "epoch": 0.76, + "grad_norm": 0.38698112964630127, + "learning_rate": 1.4590991857774038e-05, + "loss": 0.3141, + "step": 808 + }, + { + "epoch": 0.76, + "grad_norm": 0.40530022978782654, + "learning_rate": 1.4483898966612209e-05, + "loss": 0.3769, + "step": 809 + }, + { + "epoch": 0.76, + "grad_norm": 0.3705315887928009, + "learning_rate": 1.437713393181317e-05, + "loss": 0.2756, + "step": 810 + }, + { + "epoch": 0.76, + "grad_norm": 0.40540188550949097, + "learning_rate": 1.4270697738946704e-05, + "loss": 0.2587, + "step": 811 + }, + { + "epoch": 0.76, + "grad_norm": 0.3355952203273773, + "learning_rate": 1.4164591370547004e-05, + "loss": 0.2367, + "step": 812 + }, + { + "epoch": 0.76, + "grad_norm": 0.39372363686561584, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.2863, + "step": 813 + }, + { + "epoch": 0.76, + "grad_norm": 0.41405317187309265, + "learning_rate": 1.3953372022052107e-05, + "loss": 0.3338, + "step": 814 + }, + { + "epoch": 0.76, + "grad_norm": 0.34528371691703796, + "learning_rate": 1.3848260991765755e-05, + "loss": 0.2889, + "step": 815 + }, + { + "epoch": 0.77, + "grad_norm": 0.4991545081138611, + "learning_rate": 1.3743483685545811e-05, + "loss": 0.41, + "step": 816 + }, + { + "epoch": 0.77, + "grad_norm": 0.41992202401161194, + "learning_rate": 1.363904107061294e-05, + "loss": 0.3185, + "step": 817 + }, + { + "epoch": 0.77, + "grad_norm": 0.4431324601173401, + "learning_rate": 1.3534934111098179e-05, + "loss": 0.3361, + "step": 818 + }, + { + "epoch": 0.77, + "grad_norm": 0.3921603262424469, + "learning_rate": 1.3431163768034077e-05, + "loss": 0.3133, + "step": 819 + }, + { + "epoch": 0.77, + "grad_norm": 0.3576185703277588, + "learning_rate": 1.3327730999345817e-05, + "loss": 0.2269, + "step": 820 + }, + { + "epoch": 0.77, + "grad_norm": 0.35856878757476807, + "learning_rate": 1.3224636759842363e-05, + "loss": 0.2933, + "step": 821 + }, + { + "epoch": 0.77, + "grad_norm": 0.33871906995773315, + "learning_rate": 1.3121882001207614e-05, + "loss": 0.2214, + "step": 822 + }, + { + "epoch": 0.77, + "grad_norm": 0.39352303743362427, + "learning_rate": 1.3019467671991692e-05, + "loss": 0.2456, + "step": 823 + }, + { + "epoch": 0.77, + "grad_norm": 0.4099547266960144, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.3549, + "step": 824 + }, + { + "epoch": 0.77, + "grad_norm": 0.3432406485080719, + "learning_rate": 1.2815664080295159e-05, + "loss": 0.2423, + "step": 825 + }, + { + "epoch": 0.77, + "grad_norm": 0.3456730842590332, + "learning_rate": 1.2714276699166994e-05, + "loss": 0.2323, + "step": 826 + }, + { + "epoch": 0.78, + "grad_norm": 0.3806013762950897, + "learning_rate": 1.261323351014525e-05, + "loss": 0.257, + "step": 827 + }, + { + "epoch": 0.78, + "grad_norm": 0.32994189858436584, + "learning_rate": 1.251253544598014e-05, + "loss": 0.2059, + "step": 828 + }, + { + "epoch": 0.78, + "grad_norm": 0.3641059696674347, + "learning_rate": 1.241218343623602e-05, + "loss": 0.3046, + "step": 829 + }, + { + "epoch": 0.78, + "grad_norm": 0.39203181862831116, + "learning_rate": 1.2312178407282749e-05, + "loss": 0.3047, + "step": 830 + }, + { + "epoch": 0.78, + "grad_norm": 0.3873448669910431, + "learning_rate": 1.2212521282287092e-05, + "loss": 0.2999, + "step": 831 + }, + { + "epoch": 0.78, + "grad_norm": 0.43280982971191406, + "learning_rate": 1.2113212981204292e-05, + "loss": 0.3629, + "step": 832 + }, + { + "epoch": 0.78, + "grad_norm": 0.42592155933380127, + "learning_rate": 1.2014254420769466e-05, + "loss": 0.3262, + "step": 833 + }, + { + "epoch": 0.78, + "grad_norm": 0.3780333995819092, + "learning_rate": 1.1915646514489292e-05, + "loss": 0.29, + "step": 834 + }, + { + "epoch": 0.78, + "grad_norm": 0.38742387294769287, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.3148, + "step": 835 + }, + { + "epoch": 0.78, + "grad_norm": 0.34696269035339355, + "learning_rate": 1.1719486302226118e-05, + "loss": 0.2396, + "step": 836 + }, + { + "epoch": 0.79, + "grad_norm": 0.5252797603607178, + "learning_rate": 1.1621935807038003e-05, + "loss": 0.483, + "step": 837 + }, + { + "epoch": 0.79, + "grad_norm": 0.4171554744243622, + "learning_rate": 1.152473958757756e-05, + "loss": 0.3367, + "step": 838 + }, + { + "epoch": 0.79, + "grad_norm": 0.35319188237190247, + "learning_rate": 1.1427898541082855e-05, + "loss": 0.2354, + "step": 839 + }, + { + "epoch": 0.79, + "grad_norm": 0.43658795952796936, + "learning_rate": 1.133141356151336e-05, + "loss": 0.3273, + "step": 840 + }, + { + "epoch": 0.79, + "grad_norm": 0.4028431177139282, + "learning_rate": 1.123528553954154e-05, + "loss": 0.3028, + "step": 841 + }, + { + "epoch": 0.79, + "grad_norm": 0.30417734384536743, + "learning_rate": 1.1139515362544755e-05, + "loss": 0.2006, + "step": 842 + }, + { + "epoch": 0.79, + "grad_norm": 0.4621390700340271, + "learning_rate": 1.1044103914597031e-05, + "loss": 0.387, + "step": 843 + }, + { + "epoch": 0.79, + "grad_norm": 0.5159476399421692, + "learning_rate": 1.0949052076460853e-05, + "loss": 0.4158, + "step": 844 + }, + { + "epoch": 0.79, + "grad_norm": 0.44435352087020874, + "learning_rate": 1.085436072557911e-05, + "loss": 0.3199, + "step": 845 + }, + { + "epoch": 0.79, + "grad_norm": 0.3563813269138336, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.2455, + "step": 846 + }, + { + "epoch": 0.79, + "grad_norm": 0.3790179491043091, + "learning_rate": 1.0666062978703733e-05, + "loss": 0.2853, + "step": 847 + }, + { + "epoch": 0.8, + "grad_norm": 0.3730546236038208, + "learning_rate": 1.0572458320924943e-05, + "loss": 0.264, + "step": 848 + }, + { + "epoch": 0.8, + "grad_norm": 0.43513819575309753, + "learning_rate": 1.0479217626814253e-05, + "loss": 0.3131, + "step": 849 + }, + { + "epoch": 0.8, + "grad_norm": 0.3191884160041809, + "learning_rate": 1.0386341757095502e-05, + "loss": 0.1952, + "step": 850 + }, + { + "epoch": 0.8, + "grad_norm": 0.3954907953739166, + "learning_rate": 1.0293831569124774e-05, + "loss": 0.3285, + "step": 851 + }, + { + "epoch": 0.8, + "grad_norm": 0.4020962417125702, + "learning_rate": 1.0201687916882418e-05, + "loss": 0.3469, + "step": 852 + }, + { + "epoch": 0.8, + "grad_norm": 0.3509984612464905, + "learning_rate": 1.0109911650965314e-05, + "loss": 0.2091, + "step": 853 + }, + { + "epoch": 0.8, + "grad_norm": 0.558068573474884, + "learning_rate": 1.0018503618578818e-05, + "loss": 0.5226, + "step": 854 + }, + { + "epoch": 0.8, + "grad_norm": 0.39664986729621887, + "learning_rate": 9.927464663529118e-06, + "loss": 0.2754, + "step": 855 + }, + { + "epoch": 0.8, + "grad_norm": 0.44442465901374817, + "learning_rate": 9.836795626215356e-06, + "loss": 0.3699, + "step": 856 + }, + { + "epoch": 0.8, + "grad_norm": 0.365354984998703, + "learning_rate": 9.746497343621857e-06, + "loss": 0.2974, + "step": 857 + }, + { + "epoch": 0.8, + "grad_norm": 0.33468684554100037, + "learning_rate": 9.656570649310481e-06, + "loss": 0.2339, + "step": 858 + }, + { + "epoch": 0.81, + "grad_norm": 0.36793625354766846, + "learning_rate": 9.567016373412857e-06, + "loss": 0.3041, + "step": 859 + }, + { + "epoch": 0.81, + "grad_norm": 0.46790769696235657, + "learning_rate": 9.477835342622759e-06, + "loss": 0.3799, + "step": 860 + }, + { + "epoch": 0.81, + "grad_norm": 0.4542980492115021, + "learning_rate": 9.389028380188419e-06, + "loss": 0.4007, + "step": 861 + }, + { + "epoch": 0.81, + "grad_norm": 0.37855392694473267, + "learning_rate": 9.300596305905013e-06, + "loss": 0.293, + "step": 862 + }, + { + "epoch": 0.81, + "grad_norm": 0.3878996670246124, + "learning_rate": 9.212539936107029e-06, + "loss": 0.2728, + "step": 863 + }, + { + "epoch": 0.81, + "grad_norm": 0.34039977192878723, + "learning_rate": 9.124860083660769e-06, + "loss": 0.2652, + "step": 864 + }, + { + "epoch": 0.81, + "grad_norm": 0.3311023414134979, + "learning_rate": 9.037557557956766e-06, + "loss": 0.1982, + "step": 865 + }, + { + "epoch": 0.81, + "grad_norm": 0.4137350916862488, + "learning_rate": 8.950633164902467e-06, + "loss": 0.3247, + "step": 866 + }, + { + "epoch": 0.81, + "grad_norm": 0.4069403111934662, + "learning_rate": 8.86408770691462e-06, + "loss": 0.2845, + "step": 867 + }, + { + "epoch": 0.81, + "grad_norm": 0.33959850668907166, + "learning_rate": 8.777921982911996e-06, + "loss": 0.2373, + "step": 868 + }, + { + "epoch": 0.82, + "grad_norm": 0.387346476316452, + "learning_rate": 8.692136788307903e-06, + "loss": 0.2683, + "step": 869 + }, + { + "epoch": 0.82, + "grad_norm": 0.33370476961135864, + "learning_rate": 8.606732915003002e-06, + "loss": 0.2477, + "step": 870 + }, + { + "epoch": 0.82, + "grad_norm": 0.3765987455844879, + "learning_rate": 8.521711151377803e-06, + "loss": 0.262, + "step": 871 + }, + { + "epoch": 0.82, + "grad_norm": 0.41339150071144104, + "learning_rate": 8.437072282285535e-06, + "loss": 0.3251, + "step": 872 + }, + { + "epoch": 0.82, + "grad_norm": 0.3473236858844757, + "learning_rate": 8.35281708904485e-06, + "loss": 0.2351, + "step": 873 + }, + { + "epoch": 0.82, + "grad_norm": 0.38972267508506775, + "learning_rate": 8.268946349432582e-06, + "loss": 0.3143, + "step": 874 + }, + { + "epoch": 0.82, + "grad_norm": 0.4202972948551178, + "learning_rate": 8.185460837676612e-06, + "loss": 0.2975, + "step": 875 + }, + { + "epoch": 0.82, + "grad_norm": 0.4606451094150543, + "learning_rate": 8.102361324448715e-06, + "loss": 0.2833, + "step": 876 + }, + { + "epoch": 0.82, + "grad_norm": 0.42644232511520386, + "learning_rate": 8.019648576857425e-06, + "loss": 0.3081, + "step": 877 + }, + { + "epoch": 0.82, + "grad_norm": 0.4242898225784302, + "learning_rate": 7.937323358440935e-06, + "loss": 0.3451, + "step": 878 + }, + { + "epoch": 0.82, + "grad_norm": 0.3542291522026062, + "learning_rate": 7.85538642916015e-06, + "loss": 0.2743, + "step": 879 + }, + { + "epoch": 0.83, + "grad_norm": 0.33673974871635437, + "learning_rate": 7.773838545391515e-06, + "loss": 0.2067, + "step": 880 + }, + { + "epoch": 0.83, + "grad_norm": 0.3924747109413147, + "learning_rate": 7.692680459920188e-06, + "loss": 0.2897, + "step": 881 + }, + { + "epoch": 0.83, + "grad_norm": 0.33239954710006714, + "learning_rate": 7.6119129219329395e-06, + "loss": 0.2183, + "step": 882 + }, + { + "epoch": 0.83, + "grad_norm": 0.43247902393341064, + "learning_rate": 7.5315366770114195e-06, + "loss": 0.3292, + "step": 883 + }, + { + "epoch": 0.83, + "grad_norm": 0.4543492794036865, + "learning_rate": 7.4515524671250725e-06, + "loss": 0.2968, + "step": 884 + }, + { + "epoch": 0.83, + "grad_norm": 0.39136525988578796, + "learning_rate": 7.371961030624452e-06, + "loss": 0.2538, + "step": 885 + }, + { + "epoch": 0.83, + "grad_norm": 0.3515141010284424, + "learning_rate": 7.292763102234329e-06, + "loss": 0.269, + "step": 886 + }, + { + "epoch": 0.83, + "grad_norm": 0.36654534935951233, + "learning_rate": 7.213959413046894e-06, + "loss": 0.2089, + "step": 887 + }, + { + "epoch": 0.83, + "grad_norm": 0.3514954447746277, + "learning_rate": 7.135550690515052e-06, + "loss": 0.2764, + "step": 888 + }, + { + "epoch": 0.83, + "grad_norm": 0.36854228377342224, + "learning_rate": 7.057537658445701e-06, + "loss": 0.238, + "step": 889 + }, + { + "epoch": 0.83, + "grad_norm": 0.3011555075645447, + "learning_rate": 6.979921036993042e-06, + "loss": 0.1986, + "step": 890 + }, + { + "epoch": 0.84, + "grad_norm": 0.4285282492637634, + "learning_rate": 6.902701542651874e-06, + "loss": 0.285, + "step": 891 + }, + { + "epoch": 0.84, + "grad_norm": 0.3245009481906891, + "learning_rate": 6.825879888251135e-06, + "loss": 0.1793, + "step": 892 + }, + { + "epoch": 0.84, + "grad_norm": 0.38561704754829407, + "learning_rate": 6.749456782947122e-06, + "loss": 0.2576, + "step": 893 + }, + { + "epoch": 0.84, + "grad_norm": 0.4220375120639801, + "learning_rate": 6.6734329322171165e-06, + "loss": 0.3059, + "step": 894 + }, + { + "epoch": 0.84, + "grad_norm": 0.391051322221756, + "learning_rate": 6.597809037852726e-06, + "loss": 0.3383, + "step": 895 + }, + { + "epoch": 0.84, + "grad_norm": 0.38187360763549805, + "learning_rate": 6.522585797953579e-06, + "loss": 0.238, + "step": 896 + }, + { + "epoch": 0.84, + "grad_norm": 0.4633513391017914, + "learning_rate": 6.447763906920679e-06, + "loss": 0.3882, + "step": 897 + }, + { + "epoch": 0.84, + "grad_norm": 0.48090532422065735, + "learning_rate": 6.373344055450165e-06, + "loss": 0.3395, + "step": 898 + }, + { + "epoch": 0.84, + "grad_norm": 0.3448203206062317, + "learning_rate": 6.2993269305268495e-06, + "loss": 0.2316, + "step": 899 + }, + { + "epoch": 0.84, + "grad_norm": 0.41201066970825195, + "learning_rate": 6.2257132154178665e-06, + "loss": 0.3416, + "step": 900 + }, + { + "epoch": 0.85, + "grad_norm": 0.4267149567604065, + "learning_rate": 6.152503589666425e-06, + "loss": 0.3322, + "step": 901 + }, + { + "epoch": 0.85, + "grad_norm": 0.413591206073761, + "learning_rate": 6.079698729085498e-06, + "loss": 0.3095, + "step": 902 + }, + { + "epoch": 0.85, + "grad_norm": 0.3144916296005249, + "learning_rate": 6.007299305751585e-06, + "loss": 0.1855, + "step": 903 + }, + { + "epoch": 0.85, + "grad_norm": 0.34966927766799927, + "learning_rate": 5.935305987998496e-06, + "loss": 0.2445, + "step": 904 + }, + { + "epoch": 0.85, + "grad_norm": 0.36850506067276, + "learning_rate": 5.863719440411214e-06, + "loss": 0.2391, + "step": 905 + }, + { + "epoch": 0.85, + "grad_norm": 0.37980982661247253, + "learning_rate": 5.792540323819751e-06, + "loss": 0.3392, + "step": 906 + }, + { + "epoch": 0.85, + "grad_norm": 0.36540210247039795, + "learning_rate": 5.721769295293034e-06, + "loss": 0.2487, + "step": 907 + }, + { + "epoch": 0.85, + "grad_norm": 0.4353797733783722, + "learning_rate": 5.651407008132809e-06, + "loss": 0.3582, + "step": 908 + }, + { + "epoch": 0.85, + "grad_norm": 0.3671395480632782, + "learning_rate": 5.5814541118677284e-06, + "loss": 0.2116, + "step": 909 + }, + { + "epoch": 0.85, + "grad_norm": 0.4240861237049103, + "learning_rate": 5.5119112522471924e-06, + "loss": 0.3495, + "step": 910 + }, + { + "epoch": 0.85, + "grad_norm": 0.3780825138092041, + "learning_rate": 5.442779071235516e-06, + "loss": 0.2536, + "step": 911 + }, + { + "epoch": 0.86, + "grad_norm": 0.3448036313056946, + "learning_rate": 5.374058207005944e-06, + "loss": 0.2392, + "step": 912 + }, + { + "epoch": 0.86, + "grad_norm": 0.3841763138771057, + "learning_rate": 5.305749293934764e-06, + "loss": 0.2543, + "step": 913 + }, + { + "epoch": 0.86, + "grad_norm": 0.4242177903652191, + "learning_rate": 5.237852962595469e-06, + "loss": 0.2981, + "step": 914 + }, + { + "epoch": 0.86, + "grad_norm": 0.3546169400215149, + "learning_rate": 5.170369839752925e-06, + "loss": 0.2047, + "step": 915 + }, + { + "epoch": 0.86, + "grad_norm": 0.380683571100235, + "learning_rate": 5.1033005483575925e-06, + "loss": 0.2585, + "step": 916 + }, + { + "epoch": 0.86, + "grad_norm": 0.4152151942253113, + "learning_rate": 5.036645707539745e-06, + "loss": 0.2585, + "step": 917 + }, + { + "epoch": 0.86, + "grad_norm": 0.3874864876270294, + "learning_rate": 4.9704059326038055e-06, + "loss": 0.2672, + "step": 918 + }, + { + "epoch": 0.86, + "grad_norm": 0.4094529151916504, + "learning_rate": 4.90458183502262e-06, + "loss": 0.2893, + "step": 919 + }, + { + "epoch": 0.86, + "grad_norm": 0.4090777337551117, + "learning_rate": 4.839174022431858e-06, + "loss": 0.3671, + "step": 920 + }, + { + "epoch": 0.86, + "grad_norm": 0.4188554286956787, + "learning_rate": 4.7741830986243356e-06, + "loss": 0.3102, + "step": 921 + }, + { + "epoch": 0.86, + "grad_norm": 0.3730091154575348, + "learning_rate": 4.709609663544534e-06, + "loss": 0.2852, + "step": 922 + }, + { + "epoch": 0.87, + "grad_norm": 0.38047468662261963, + "learning_rate": 4.645454313282965e-06, + "loss": 0.2954, + "step": 923 + }, + { + "epoch": 0.87, + "grad_norm": 0.4720805585384369, + "learning_rate": 4.581717640070743e-06, + "loss": 0.3471, + "step": 924 + }, + { + "epoch": 0.87, + "grad_norm": 0.4264400899410248, + "learning_rate": 4.5184002322740785e-06, + "loss": 0.346, + "step": 925 + }, + { + "epoch": 0.87, + "grad_norm": 0.31896501779556274, + "learning_rate": 4.455502674388873e-06, + "loss": 0.2315, + "step": 926 + }, + { + "epoch": 0.87, + "grad_norm": 0.4741848111152649, + "learning_rate": 4.3930255470352736e-06, + "loss": 0.3508, + "step": 927 + }, + { + "epoch": 0.87, + "grad_norm": 0.44249042868614197, + "learning_rate": 4.330969426952375e-06, + "loss": 0.3253, + "step": 928 + }, + { + "epoch": 0.87, + "grad_norm": 0.3980604410171509, + "learning_rate": 4.269334886992876e-06, + "loss": 0.2856, + "step": 929 + }, + { + "epoch": 0.87, + "grad_norm": 0.33377861976623535, + "learning_rate": 4.208122496117744e-06, + "loss": 0.2117, + "step": 930 + }, + { + "epoch": 0.87, + "grad_norm": 0.394549161195755, + "learning_rate": 4.147332819391048e-06, + "loss": 0.2755, + "step": 931 + }, + { + "epoch": 0.87, + "grad_norm": 0.4144418239593506, + "learning_rate": 4.0869664179746694e-06, + "loss": 0.2635, + "step": 932 + }, + { + "epoch": 0.88, + "grad_norm": 0.35250324010849, + "learning_rate": 4.027023849123157e-06, + "loss": 0.2294, + "step": 933 + }, + { + "epoch": 0.88, + "grad_norm": 0.38733455538749695, + "learning_rate": 3.967505666178556e-06, + "loss": 0.2676, + "step": 934 + }, + { + "epoch": 0.88, + "grad_norm": 0.39043229818344116, + "learning_rate": 3.908412418565371e-06, + "loss": 0.2569, + "step": 935 + }, + { + "epoch": 0.88, + "grad_norm": 0.3419300317764282, + "learning_rate": 3.849744651785381e-06, + "loss": 0.2464, + "step": 936 + }, + { + "epoch": 0.88, + "grad_norm": 0.3910216689109802, + "learning_rate": 3.7915029074126974e-06, + "loss": 0.355, + "step": 937 + }, + { + "epoch": 0.88, + "grad_norm": 0.50761479139328, + "learning_rate": 3.7336877230887246e-06, + "loss": 0.422, + "step": 938 + }, + { + "epoch": 0.88, + "grad_norm": 0.4381940960884094, + "learning_rate": 3.676299632517216e-06, + "loss": 0.4159, + "step": 939 + }, + { + "epoch": 0.88, + "grad_norm": 0.4971286952495575, + "learning_rate": 3.619339165459307e-06, + "loss": 0.3299, + "step": 940 + }, + { + "epoch": 0.88, + "grad_norm": 0.35524582862854004, + "learning_rate": 3.562806847728678e-06, + "loss": 0.2657, + "step": 941 + }, + { + "epoch": 0.88, + "grad_norm": 0.4120367765426636, + "learning_rate": 3.5067032011866783e-06, + "loss": 0.3563, + "step": 942 + }, + { + "epoch": 0.88, + "grad_norm": 0.3152506649494171, + "learning_rate": 3.4510287437374835e-06, + "loss": 0.1662, + "step": 943 + }, + { + "epoch": 0.89, + "grad_norm": 0.3767205476760864, + "learning_rate": 3.3957839893233536e-06, + "loss": 0.2408, + "step": 944 + }, + { + "epoch": 0.89, + "grad_norm": 0.3337627053260803, + "learning_rate": 3.340969447919873e-06, + "loss": 0.2414, + "step": 945 + }, + { + "epoch": 0.89, + "grad_norm": 0.32742202281951904, + "learning_rate": 3.286585625531241e-06, + "loss": 0.2095, + "step": 946 + }, + { + "epoch": 0.89, + "grad_norm": 0.34052425622940063, + "learning_rate": 3.232633024185583e-06, + "loss": 0.2212, + "step": 947 + }, + { + "epoch": 0.89, + "grad_norm": 0.33039572834968567, + "learning_rate": 3.1791121419303794e-06, + "loss": 0.2211, + "step": 948 + }, + { + "epoch": 0.89, + "grad_norm": 0.42443013191223145, + "learning_rate": 3.1260234728277717e-06, + "loss": 0.278, + "step": 949 + }, + { + "epoch": 0.89, + "grad_norm": 0.4003521203994751, + "learning_rate": 3.0733675069500865e-06, + "loss": 0.2895, + "step": 950 + }, + { + "epoch": 0.89, + "grad_norm": 0.4332895874977112, + "learning_rate": 3.0211447303752695e-06, + "loss": 0.2965, + "step": 951 + }, + { + "epoch": 0.89, + "grad_norm": 0.3819825351238251, + "learning_rate": 2.9693556251824185e-06, + "loss": 0.2882, + "step": 952 + }, + { + "epoch": 0.89, + "grad_norm": 0.47884687781333923, + "learning_rate": 2.9180006694472906e-06, + "loss": 0.4212, + "step": 953 + }, + { + "epoch": 0.89, + "grad_norm": 0.4191427528858185, + "learning_rate": 2.867080337237954e-06, + "loss": 0.3637, + "step": 954 + }, + { + "epoch": 0.9, + "grad_norm": 0.4317600727081299, + "learning_rate": 2.8165950986103805e-06, + "loss": 0.368, + "step": 955 + }, + { + "epoch": 0.9, + "grad_norm": 0.4285089671611786, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.3267, + "step": 956 + }, + { + "epoch": 0.9, + "grad_norm": 0.40216952562332153, + "learning_rate": 2.716931762237801e-06, + "loss": 0.279, + "step": 957 + }, + { + "epoch": 0.9, + "grad_norm": 0.4264661967754364, + "learning_rate": 2.667754584505372e-06, + "loss": 0.3342, + "step": 958 + }, + { + "epoch": 0.9, + "grad_norm": 0.5141720771789551, + "learning_rate": 2.6190143403713174e-06, + "loss": 0.3507, + "step": 959 + }, + { + "epoch": 0.9, + "grad_norm": 0.4230804145336151, + "learning_rate": 2.5707114797667465e-06, + "loss": 0.2819, + "step": 960 + }, + { + "epoch": 0.9, + "grad_norm": 0.40919217467308044, + "learning_rate": 2.522846448585231e-06, + "loss": 0.349, + "step": 961 + }, + { + "epoch": 0.9, + "grad_norm": 0.4233115017414093, + "learning_rate": 2.4754196886785986e-06, + "loss": 0.2615, + "step": 962 + }, + { + "epoch": 0.9, + "grad_norm": 0.32790645956993103, + "learning_rate": 2.4284316378529404e-06, + "loss": 0.2638, + "step": 963 + }, + { + "epoch": 0.9, + "grad_norm": 0.3933560848236084, + "learning_rate": 2.3818827298645207e-06, + "loss": 0.2482, + "step": 964 + }, + { + "epoch": 0.91, + "grad_norm": 0.3430630564689636, + "learning_rate": 2.335773394415802e-06, + "loss": 0.2201, + "step": 965 + }, + { + "epoch": 0.91, + "grad_norm": 0.39491233229637146, + "learning_rate": 2.2901040571514322e-06, + "loss": 0.2974, + "step": 966 + }, + { + "epoch": 0.91, + "grad_norm": 0.42395198345184326, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.2932, + "step": 967 + }, + { + "epoch": 0.91, + "grad_norm": 0.33768317103385925, + "learning_rate": 2.2000870594419908e-06, + "loss": 0.1831, + "step": 968 + }, + { + "epoch": 0.91, + "grad_norm": 0.3686957061290741, + "learning_rate": 2.155740229962161e-06, + "loss": 0.2596, + "step": 969 + }, + { + "epoch": 0.91, + "grad_norm": 0.3499439060688019, + "learning_rate": 2.1118350605894955e-06, + "loss": 0.3024, + "step": 970 + }, + { + "epoch": 0.91, + "grad_norm": 0.35845717787742615, + "learning_rate": 2.068371956621562e-06, + "loss": 0.258, + "step": 971 + }, + { + "epoch": 0.91, + "grad_norm": 0.4016646146774292, + "learning_rate": 2.0253513192751373e-06, + "loss": 0.3204, + "step": 972 + }, + { + "epoch": 0.91, + "grad_norm": 0.38734307885169983, + "learning_rate": 1.982773545682459e-06, + "loss": 0.2911, + "step": 973 + }, + { + "epoch": 0.91, + "grad_norm": 0.3539869487285614, + "learning_rate": 1.9406390288876586e-06, + "loss": 0.223, + "step": 974 + }, + { + "epoch": 0.91, + "grad_norm": 0.3853650987148285, + "learning_rate": 1.8989481578430223e-06, + "loss": 0.2413, + "step": 975 + }, + { + "epoch": 0.92, + "grad_norm": 0.4223552644252777, + "learning_rate": 1.8577013174054857e-06, + "loss": 0.3261, + "step": 976 + }, + { + "epoch": 0.92, + "grad_norm": 0.4064064025878906, + "learning_rate": 1.8168988883330185e-06, + "loss": 0.3506, + "step": 977 + }, + { + "epoch": 0.92, + "grad_norm": 0.47123757004737854, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.4422, + "step": 978 + }, + { + "epoch": 0.92, + "grad_norm": 0.398388534784317, + "learning_rate": 1.7366287667995417e-06, + "loss": 0.2235, + "step": 979 + }, + { + "epoch": 0.92, + "grad_norm": 0.3645593225955963, + "learning_rate": 1.697161815328363e-06, + "loss": 0.2561, + "step": 980 + }, + { + "epoch": 0.92, + "grad_norm": 0.33609190583229065, + "learning_rate": 1.6581407571951092e-06, + "loss": 0.2756, + "step": 981 + }, + { + "epoch": 0.92, + "grad_norm": 0.3676416873931885, + "learning_rate": 1.6195659526111185e-06, + "loss": 0.2628, + "step": 982 + }, + { + "epoch": 0.92, + "grad_norm": 0.4234568476676941, + "learning_rate": 1.5814377576682527e-06, + "loss": 0.303, + "step": 983 + }, + { + "epoch": 0.92, + "grad_norm": 0.3596719801425934, + "learning_rate": 1.5437565243356656e-06, + "loss": 0.1933, + "step": 984 + }, + { + "epoch": 0.92, + "grad_norm": 0.42232537269592285, + "learning_rate": 1.5065226004564893e-06, + "loss": 0.3734, + "step": 985 + }, + { + "epoch": 0.92, + "grad_norm": 0.38415658473968506, + "learning_rate": 1.4697363297446477e-06, + "loss": 0.2759, + "step": 986 + }, + { + "epoch": 0.93, + "grad_norm": 0.4657137393951416, + "learning_rate": 1.4333980517817203e-06, + "loss": 0.3853, + "step": 987 + }, + { + "epoch": 0.93, + "grad_norm": 0.3416013717651367, + "learning_rate": 1.3975081020137392e-06, + "loss": 0.2035, + "step": 988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5011183023452759, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.3493, + "step": 989 + }, + { + "epoch": 0.93, + "grad_norm": 0.3816926181316376, + "learning_rate": 1.3270745081506997e-06, + "loss": 0.3283, + "step": 990 + }, + { + "epoch": 0.93, + "grad_norm": 0.5070998072624207, + "learning_rate": 1.292531514242501e-06, + "loss": 0.4696, + "step": 991 + }, + { + "epoch": 0.93, + "grad_norm": 0.40773579478263855, + "learning_rate": 1.2584381488969454e-06, + "loss": 0.2974, + "step": 992 + }, + { + "epoch": 0.93, + "grad_norm": 0.3139830529689789, + "learning_rate": 1.2247947268368364e-06, + "loss": 0.2197, + "step": 993 + }, + { + "epoch": 0.93, + "grad_norm": 0.3780119717121124, + "learning_rate": 1.191601558631461e-06, + "loss": 0.2619, + "step": 994 + }, + { + "epoch": 0.93, + "grad_norm": 0.3527129292488098, + "learning_rate": 1.1588589506937198e-06, + "loss": 0.2688, + "step": 995 + }, + { + "epoch": 0.93, + "grad_norm": 0.5044612288475037, + "learning_rate": 1.126567205277279e-06, + "loss": 0.3573, + "step": 996 + }, + { + "epoch": 0.94, + "grad_norm": 0.3708314597606659, + "learning_rate": 1.094726620473835e-06, + "loss": 0.2795, + "step": 997 + }, + { + "epoch": 0.94, + "grad_norm": 0.4164911210536957, + "learning_rate": 1.0633374902103088e-06, + "loss": 0.3088, + "step": 998 + }, + { + "epoch": 0.94, + "grad_norm": 0.38613763451576233, + "learning_rate": 1.0324001042461395e-06, + "loss": 0.2573, + "step": 999 + }, + { + "epoch": 0.94, + "grad_norm": 0.416906476020813, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.3277, + "step": 1000 + }, + { + "epoch": 0.94, + "grad_norm": 0.4968281686306, + "learning_rate": 9.718817034003901e-07, + "loss": 0.4193, + "step": 1001 + }, + { + "epoch": 0.94, + "grad_norm": 0.3328755497932434, + "learning_rate": 9.423012471764914e-07, + "loss": 0.198, + "step": 1002 + }, + { + "epoch": 0.94, + "grad_norm": 0.3636288046836853, + "learning_rate": 9.131736525621603e-07, + "loss": 0.2204, + "step": 1003 + }, + { + "epoch": 0.94, + "grad_norm": 0.42198479175567627, + "learning_rate": 8.844991884401854e-07, + "loss": 0.2807, + "step": 1004 + }, + { + "epoch": 0.94, + "grad_norm": 0.39438503980636597, + "learning_rate": 8.56278119510362e-07, + "loss": 0.2664, + "step": 1005 + }, + { + "epoch": 0.94, + "grad_norm": 0.44490867853164673, + "learning_rate": 8.285107062871333e-07, + "loss": 0.3478, + "step": 1006 + }, + { + "epoch": 0.94, + "grad_norm": 0.3523523807525635, + "learning_rate": 8.011972050971483e-07, + "loss": 0.2738, + "step": 1007 + }, + { + "epoch": 0.95, + "grad_norm": 0.4044662117958069, + "learning_rate": 7.74337868076902e-07, + "loss": 0.3216, + "step": 1008 + }, + { + "epoch": 0.95, + "grad_norm": 0.45560404658317566, + "learning_rate": 7.479329431703985e-07, + "loss": 0.4045, + "step": 1009 + }, + { + "epoch": 0.95, + "grad_norm": 0.46285828948020935, + "learning_rate": 7.21982674126881e-07, + "loss": 0.3443, + "step": 1010 + }, + { + "epoch": 0.95, + "grad_norm": 0.36943697929382324, + "learning_rate": 6.964873004985717e-07, + "loss": 0.2318, + "step": 1011 + }, + { + "epoch": 0.95, + "grad_norm": 0.47358638048171997, + "learning_rate": 6.714470576384579e-07, + "loss": 0.3709, + "step": 1012 + }, + { + "epoch": 0.95, + "grad_norm": 0.3758651614189148, + "learning_rate": 6.468621766981154e-07, + "loss": 0.2717, + "step": 1013 + }, + { + "epoch": 0.95, + "grad_norm": 0.3843158781528473, + "learning_rate": 6.227328846255931e-07, + "loss": 0.2687, + "step": 1014 + }, + { + "epoch": 0.95, + "grad_norm": 0.3527068793773651, + "learning_rate": 5.990594041632991e-07, + "loss": 0.2499, + "step": 1015 + }, + { + "epoch": 0.95, + "grad_norm": 0.3666359782218933, + "learning_rate": 5.758419538459459e-07, + "loss": 0.269, + "step": 1016 + }, + { + "epoch": 0.95, + "grad_norm": 0.45985522866249084, + "learning_rate": 5.530807479985633e-07, + "loss": 0.3482, + "step": 1017 + }, + { + "epoch": 0.95, + "grad_norm": 0.3351965546607971, + "learning_rate": 5.307759967344672e-07, + "loss": 0.2629, + "step": 1018 + }, + { + "epoch": 0.96, + "grad_norm": 0.4313323199748993, + "learning_rate": 5.089279059533658e-07, + "loss": 0.3296, + "step": 1019 + }, + { + "epoch": 0.96, + "grad_norm": 0.44096696376800537, + "learning_rate": 4.87536677339423e-07, + "loss": 0.3566, + "step": 1020 + }, + { + "epoch": 0.96, + "grad_norm": 0.3699873983860016, + "learning_rate": 4.666025083594483e-07, + "loss": 0.2561, + "step": 1021 + }, + { + "epoch": 0.96, + "grad_norm": 0.3967258036136627, + "learning_rate": 4.461255922609986e-07, + "loss": 0.2996, + "step": 1022 + }, + { + "epoch": 0.96, + "grad_norm": 0.44547387957572937, + "learning_rate": 4.261061180706627e-07, + "loss": 0.3546, + "step": 1023 + }, + { + "epoch": 0.96, + "grad_norm": 0.4082671105861664, + "learning_rate": 4.065442705922906e-07, + "loss": 0.3672, + "step": 1024 + }, + { + "epoch": 0.96, + "grad_norm": 0.4149584174156189, + "learning_rate": 3.8744023040528374e-07, + "loss": 0.2984, + "step": 1025 + }, + { + "epoch": 0.96, + "grad_norm": 0.41881927847862244, + "learning_rate": 3.687941738629186e-07, + "loss": 0.3459, + "step": 1026 + }, + { + "epoch": 0.96, + "grad_norm": 0.39721134305000305, + "learning_rate": 3.5060627309074224e-07, + "loss": 0.2751, + "step": 1027 + }, + { + "epoch": 0.96, + "grad_norm": 0.40128853917121887, + "learning_rate": 3.3287669598497383e-07, + "loss": 0.2707, + "step": 1028 + }, + { + "epoch": 0.97, + "grad_norm": 0.4406619369983673, + "learning_rate": 3.156056062109503e-07, + "loss": 0.3462, + "step": 1029 + }, + { + "epoch": 0.97, + "grad_norm": 0.3375759422779083, + "learning_rate": 2.987931632016272e-07, + "loss": 0.2176, + "step": 1030 + }, + { + "epoch": 0.97, + "grad_norm": 0.4124433696269989, + "learning_rate": 2.824395221560805e-07, + "loss": 0.2863, + "step": 1031 + }, + { + "epoch": 0.97, + "grad_norm": 0.38367995619773865, + "learning_rate": 2.665448340381016e-07, + "loss": 0.2667, + "step": 1032 + }, + { + "epoch": 0.97, + "grad_norm": 0.37265920639038086, + "learning_rate": 2.511092455747932e-07, + "loss": 0.2871, + "step": 1033 + }, + { + "epoch": 0.97, + "grad_norm": 0.3789976239204407, + "learning_rate": 2.361328992552314e-07, + "loss": 0.2494, + "step": 1034 + }, + { + "epoch": 0.97, + "grad_norm": 0.48114919662475586, + "learning_rate": 2.2161593332910013e-07, + "loss": 0.3579, + "step": 1035 + }, + { + "epoch": 0.97, + "grad_norm": 0.4425215423107147, + "learning_rate": 2.0755848180547543e-07, + "loss": 0.3963, + "step": 1036 + }, + { + "epoch": 0.97, + "grad_norm": 0.32901331782341003, + "learning_rate": 1.9396067445155986e-07, + "loss": 0.2204, + "step": 1037 + }, + { + "epoch": 0.97, + "grad_norm": 0.4334326684474945, + "learning_rate": 1.8082263679148337e-07, + "loss": 0.3967, + "step": 1038 + }, + { + "epoch": 0.97, + "grad_norm": 0.4191541373729706, + "learning_rate": 1.681444901051432e-07, + "loss": 0.3374, + "step": 1039 + }, + { + "epoch": 0.98, + "grad_norm": 0.39676961302757263, + "learning_rate": 1.5592635142709367e-07, + "loss": 0.2838, + "step": 1040 + }, + { + "epoch": 0.98, + "grad_norm": 0.38776692748069763, + "learning_rate": 1.4416833354546356e-07, + "loss": 0.269, + "step": 1041 + }, + { + "epoch": 0.98, + "grad_norm": 0.40096643567085266, + "learning_rate": 1.328705450009071e-07, + "loss": 0.2566, + "step": 1042 + }, + { + "epoch": 0.98, + "grad_norm": 0.43819060921669006, + "learning_rate": 1.2203309008561592e-07, + "loss": 0.3519, + "step": 1043 + }, + { + "epoch": 0.98, + "grad_norm": 0.4747885763645172, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.3204, + "step": 1044 + }, + { + "epoch": 0.98, + "grad_norm": 0.4276343882083893, + "learning_rate": 1.0173957706348659e-07, + "loss": 0.3291, + "step": 1045 + }, + { + "epoch": 0.98, + "grad_norm": 0.4158606231212616, + "learning_rate": 9.228370629019711e-08, + "loss": 0.2904, + "step": 1046 + }, + { + "epoch": 0.98, + "grad_norm": 0.44644296169281006, + "learning_rate": 8.328854381154938e-08, + "loss": 0.3455, + "step": 1047 + }, + { + "epoch": 0.98, + "grad_norm": 0.34873446822166443, + "learning_rate": 7.475417266371576e-08, + "loss": 0.2699, + "step": 1048 + }, + { + "epoch": 0.98, + "grad_norm": 0.36457881331443787, + "learning_rate": 6.668067162921566e-08, + "loss": 0.2634, + "step": 1049 + }, + { + "epoch": 0.98, + "grad_norm": 0.39075329899787903, + "learning_rate": 5.906811523618272e-08, + "loss": 0.2967, + "step": 1050 + }, + { + "epoch": 0.99, + "grad_norm": 0.4555211067199707, + "learning_rate": 5.191657375767656e-08, + "loss": 0.3807, + "step": 1051 + }, + { + "epoch": 0.99, + "grad_norm": 0.39703041315078735, + "learning_rate": 4.522611321103321e-08, + "loss": 0.2763, + "step": 1052 + }, + { + "epoch": 0.99, + "grad_norm": 0.3615953326225281, + "learning_rate": 3.8996795357254535e-08, + "loss": 0.2181, + "step": 1053 + }, + { + "epoch": 0.99, + "grad_norm": 0.5408671498298645, + "learning_rate": 3.322867770044202e-08, + "loss": 0.374, + "step": 1054 + }, + { + "epoch": 0.99, + "grad_norm": 0.463867723941803, + "learning_rate": 2.792181348726941e-08, + "loss": 0.3265, + "step": 1055 + }, + { + "epoch": 0.99, + "grad_norm": 0.46088552474975586, + "learning_rate": 2.3076251706477536e-08, + "loss": 0.3334, + "step": 1056 + }, + { + "epoch": 0.99, + "grad_norm": 0.4711132347583771, + "learning_rate": 1.869203708843581e-08, + "loss": 0.3801, + "step": 1057 + }, + { + "epoch": 0.99, + "grad_norm": 0.39382699131965637, + "learning_rate": 1.476921010471477e-08, + "loss": 0.3173, + "step": 1058 + }, + { + "epoch": 0.99, + "grad_norm": 0.4106517434120178, + "learning_rate": 1.1307806967741919e-08, + "loss": 0.2978, + "step": 1059 + }, + { + "epoch": 0.99, + "grad_norm": 0.32475370168685913, + "learning_rate": 8.307859630429793e-09, + "loss": 0.2326, + "step": 1060 + }, + { + "epoch": 1.0, + "grad_norm": 0.45648568868637085, + "learning_rate": 5.7693957858984125e-09, + "loss": 0.3561, + "step": 1061 + }, + { + "epoch": 1.0, + "grad_norm": 0.41961532831192017, + "learning_rate": 3.6924388672254785e-09, + "loss": 0.3269, + "step": 1062 + }, + { + "epoch": 1.0, + "grad_norm": 0.3751271069049835, + "learning_rate": 2.0770080472298783e-09, + "loss": 0.2859, + "step": 1063 + }, + { + "epoch": 1.0, + "grad_norm": 0.4922095239162445, + "learning_rate": 9.231182382773984e-10, + "loss": 0.3797, + "step": 1064 + }, + { + "epoch": 1.0, + "grad_norm": 0.38687193393707275, + "learning_rate": 2.307800921641512e-10, + "loss": 0.2846, + "step": 1065 + }, + { + "epoch": 1.0, + "grad_norm": 0.390020489692688, + "learning_rate": 0.0, + "loss": 0.2777, + "step": 1066 + }, + { + "epoch": 1.0, + "step": 1066, + "total_flos": 1.171159722670162e+17, + "train_loss": 0.31241298194338635, + "train_runtime": 51725.3719, + "train_samples_per_second": 0.66, + "train_steps_per_second": 0.021 + } + ], + "logging_steps": 1.0, + "max_steps": 1066, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.171159722670162e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}