diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/README.md b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6f3405393c29c080ebca3dfa161166fb5ee55547 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "o_proj", + "down_proj", + "k_proj", + "gate_proj", + "v_proj", + "q_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_model.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..7dc1d5f101effd92b3296a2b92f2f6e1a281d645 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:482a76f7bff433719bfe204e171b3361f0c36b7331b516ba8d1f718ff429713d +size 692127130 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/config.json new file mode 100644 index 0000000000000000000000000000000000000000..e9f8e0e13da00f5eb71bf8a3460b68f319187baa --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/config.json @@ -0,0 +1,67 @@ +{ + "_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-ov/snapshots/0b07bf7565e244cf4f39982249eafe8cd799d6dd", + "add_faster_video": false, + "add_time_instruction": false, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": false, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_crop_resolution": null, + "image_grid_pinpoints": null, + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "no_token", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": "streaming_agg", + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": null, + "mm_streaming_frames_per_chunk": 1, + "mm_streaming_input_dim": 1152, + "mm_streaming_num_heads": 8, + "mm_streaming_num_layers": 4, + "mm_streaming_num_state_tokens": 512, + "mm_streaming_patches_per_frame": 729, + "mm_streaming_state_dim": 1152, + "mm_streaming_vision_chunk_size": 16, + "mm_tunable_parts": "mm_mlp_adapter,mm_vision_resampler", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava_qwen", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 8192, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_tower_pretrained": null, + "vocab_size": 152064 +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/generation_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/non_lora_trainables.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e229f20225d783aeb0c762c5239ea1c91ff424e8 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8e589d969f9ff8251c350ee9c1d3f9377f0d809b42724ba2b4bc5abbf7fa0f7 +size 207902844 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/trainer_state.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..623902347b28884ecfa76f0cd8a12f089495fcdb --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-nextqa-streaming_baseline_lora_agg_s512/trainer_state.json @@ -0,0 +1,7492 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999413730433253, + "eval_steps": 500, + "global_step": 1066, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 17.227703094482422, + "learning_rate": 3.125e-06, + "loss": 1.0838, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 11.005305290222168, + "learning_rate": 6.25e-06, + "loss": 1.1249, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 12.119623184204102, + "learning_rate": 9.375000000000001e-06, + "loss": 1.0595, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 9.532540321350098, + "learning_rate": 1.25e-05, + "loss": 0.9128, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.108962059020996, + "learning_rate": 1.5625e-05, + "loss": 0.8711, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 3.5435802936553955, + "learning_rate": 1.8750000000000002e-05, + "loss": 0.6673, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 3.0784573554992676, + "learning_rate": 2.1875e-05, + "loss": 0.5435, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.4409202337265015, + "learning_rate": 2.5e-05, + "loss": 0.4465, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 0.9887893199920654, + "learning_rate": 2.8125000000000003e-05, + "loss": 0.4325, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 0.9212615489959717, + "learning_rate": 3.125e-05, + "loss": 0.3307, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 1.0202654600143433, + "learning_rate": 3.4375e-05, + "loss": 0.2758, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 0.7716664671897888, + "learning_rate": 3.7500000000000003e-05, + "loss": 0.4347, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 0.8882746696472168, + "learning_rate": 4.0625000000000005e-05, + "loss": 0.5303, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 0.9287745356559753, + "learning_rate": 4.375e-05, + "loss": 0.3661, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 0.7302972674369812, + "learning_rate": 4.6875e-05, + "loss": 0.4894, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.7104817628860474, + "learning_rate": 5e-05, + "loss": 0.3584, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 0.5547098517417908, + "learning_rate": 5.3125000000000004e-05, + "loss": 0.2798, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 0.7443296909332275, + "learning_rate": 5.6250000000000005e-05, + "loss": 0.35, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 0.5101932287216187, + "learning_rate": 5.9375e-05, + "loss": 0.3115, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 0.6928537487983704, + "learning_rate": 6.25e-05, + "loss": 0.4309, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 0.5658664107322693, + "learning_rate": 6.562500000000001e-05, + "loss": 0.3284, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 0.65761399269104, + "learning_rate": 6.875e-05, + "loss": 0.4236, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 0.5779470801353455, + "learning_rate": 7.1875e-05, + "loss": 0.3631, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 0.570232629776001, + "learning_rate": 7.500000000000001e-05, + "loss": 0.3179, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5996125936508179, + "learning_rate": 7.8125e-05, + "loss": 0.3948, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.6183748841285706, + "learning_rate": 8.125000000000001e-05, + "loss": 0.4139, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 0.578999400138855, + "learning_rate": 8.4375e-05, + "loss": 0.398, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 0.7683339715003967, + "learning_rate": 8.75e-05, + "loss": 0.3236, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 0.5391294360160828, + "learning_rate": 9.062500000000001e-05, + "loss": 0.3597, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 0.6637127995491028, + "learning_rate": 9.375e-05, + "loss": 0.4444, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.49874716997146606, + "learning_rate": 9.687500000000001e-05, + "loss": 0.3222, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 0.4726903736591339, + "learning_rate": 0.0001, + "loss": 0.2823, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 0.5357120633125305, + "learning_rate": 9.999976921990784e-05, + "loss": 0.3139, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 0.6340093016624451, + "learning_rate": 9.999907688176173e-05, + "loss": 0.4162, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 0.4693492650985718, + "learning_rate": 9.999792299195278e-05, + "loss": 0.2762, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 0.5481849312782288, + "learning_rate": 9.999630756113278e-05, + "loss": 0.3765, + "step": 36 + }, + { + "epoch": 0.03, + "grad_norm": 0.4791087806224823, + "learning_rate": 9.999423060421411e-05, + "loss": 0.3108, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 0.4711155295372009, + "learning_rate": 9.999169214036958e-05, + "loss": 0.3442, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 0.4753577709197998, + "learning_rate": 9.998869219303227e-05, + "loss": 0.3113, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 0.5480284094810486, + "learning_rate": 9.998523078989529e-05, + "loss": 0.3737, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.4719812870025635, + "learning_rate": 9.998130796291156e-05, + "loss": 0.3651, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 0.5594419836997986, + "learning_rate": 9.997692374829352e-05, + "loss": 0.3126, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 0.4275471568107605, + "learning_rate": 9.997207818651274e-05, + "loss": 0.2936, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 0.43990057706832886, + "learning_rate": 9.996677132229957e-05, + "loss": 0.3226, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 0.48392900824546814, + "learning_rate": 9.996100320464274e-05, + "loss": 0.3621, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 0.4566228687763214, + "learning_rate": 9.995477388678897e-05, + "loss": 0.2735, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 0.5327995419502258, + "learning_rate": 9.994808342624234e-05, + "loss": 0.3073, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 0.40377333760261536, + "learning_rate": 9.994093188476382e-05, + "loss": 0.2833, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 0.4591901898384094, + "learning_rate": 9.993331932837079e-05, + "loss": 0.3515, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.4833613634109497, + "learning_rate": 9.992524582733629e-05, + "loss": 0.2718, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 0.4145682752132416, + "learning_rate": 9.991671145618846e-05, + "loss": 0.2361, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 0.46532148122787476, + "learning_rate": 9.99077162937098e-05, + "loss": 0.3232, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 0.49215883016586304, + "learning_rate": 9.989826042293652e-05, + "loss": 0.3338, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 0.4783603250980377, + "learning_rate": 9.988834393115767e-05, + "loss": 0.3179, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 0.5149618983268738, + "learning_rate": 9.987796690991439e-05, + "loss": 0.3614, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 0.4666965901851654, + "learning_rate": 9.98671294549991e-05, + "loss": 0.3194, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 0.42810237407684326, + "learning_rate": 9.985583166645455e-05, + "loss": 0.2451, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 0.419660747051239, + "learning_rate": 9.98440736485729e-05, + "loss": 0.2881, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 0.40833503007888794, + "learning_rate": 9.983185550989487e-05, + "loss": 0.2195, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.4693005681037903, + "learning_rate": 9.981917736320851e-05, + "loss": 0.3611, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 0.36646148562431335, + "learning_rate": 9.980603932554845e-05, + "loss": 0.2019, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 0.39927858114242554, + "learning_rate": 9.979244151819453e-05, + "loss": 0.2467, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 0.4634983241558075, + "learning_rate": 9.97783840666709e-05, + "loss": 0.3597, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 0.4355047941207886, + "learning_rate": 9.976386710074478e-05, + "loss": 0.2993, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 0.4115259349346161, + "learning_rate": 9.974889075442521e-05, + "loss": 0.2293, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 0.4722943902015686, + "learning_rate": 9.97334551659619e-05, + "loss": 0.3061, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 0.4350064694881439, + "learning_rate": 9.971756047784393e-05, + "loss": 0.3101, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 0.44171565771102905, + "learning_rate": 9.970120683679838e-05, + "loss": 0.3909, + "step": 68 + }, + { + "epoch": 0.06, + "grad_norm": 0.45926347374916077, + "learning_rate": 9.968439439378905e-05, + "loss": 0.3899, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 0.44425445795059204, + "learning_rate": 9.966712330401504e-05, + "loss": 0.3668, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 0.4987228810787201, + "learning_rate": 9.964939372690926e-05, + "loss": 0.4006, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 0.4817635715007782, + "learning_rate": 9.96312058261371e-05, + "loss": 0.3439, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 0.43129855394363403, + "learning_rate": 9.961255976959473e-05, + "loss": 0.3223, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 0.4915347993373871, + "learning_rate": 9.959345572940771e-05, + "loss": 0.4157, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 0.38367170095443726, + "learning_rate": 9.957389388192935e-05, + "loss": 0.2921, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 0.4222044050693512, + "learning_rate": 9.9553874407739e-05, + "loss": 0.267, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 0.41161587834358215, + "learning_rate": 9.953339749164057e-05, + "loss": 0.3667, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 0.44461458921432495, + "learning_rate": 9.951246332266057e-05, + "loss": 0.3618, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 0.44937530159950256, + "learning_rate": 9.949107209404665e-05, + "loss": 0.4015, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 0.4108678698539734, + "learning_rate": 9.946922400326554e-05, + "loss": 0.374, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 0.4602833092212677, + "learning_rate": 9.944691925200145e-05, + "loss": 0.3695, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 0.4525197446346283, + "learning_rate": 9.942415804615406e-05, + "loss": 0.3845, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 0.4771604537963867, + "learning_rate": 9.940094059583671e-05, + "loss": 0.3861, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 0.4231971800327301, + "learning_rate": 9.937726711537442e-05, + "loss": 0.2658, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 0.4093407392501831, + "learning_rate": 9.93531378233019e-05, + "loss": 0.3377, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 0.49191173911094666, + "learning_rate": 9.932855294236154e-05, + "loss": 0.3626, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 0.5030225515365601, + "learning_rate": 9.930351269950143e-05, + "loss": 0.4074, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 0.47146856784820557, + "learning_rate": 9.927801732587312e-05, + "loss": 0.2998, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 0.3858545124530792, + "learning_rate": 9.925206705682962e-05, + "loss": 0.2586, + "step": 89 + }, + { + "epoch": 0.08, + "grad_norm": 0.5475620627403259, + "learning_rate": 9.92256621319231e-05, + "loss": 0.4691, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 0.47144803404808044, + "learning_rate": 9.919880279490286e-05, + "loss": 0.3133, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 0.3688579201698303, + "learning_rate": 9.917148929371288e-05, + "loss": 0.1735, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 0.4857231080532074, + "learning_rate": 9.914372188048964e-05, + "loss": 0.3737, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 0.38670048117637634, + "learning_rate": 9.911550081155983e-05, + "loss": 0.2712, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 0.4856467843055725, + "learning_rate": 9.908682634743784e-05, + "loss": 0.3925, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 0.41404563188552856, + "learning_rate": 9.905769875282352e-05, + "loss": 0.2892, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 0.42182257771492004, + "learning_rate": 9.902811829659961e-05, + "loss": 0.304, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 0.4935455620288849, + "learning_rate": 9.899808525182935e-05, + "loss": 0.3808, + "step": 98 + }, + { + "epoch": 0.09, + "grad_norm": 0.3772177994251251, + "learning_rate": 9.896759989575386e-05, + "loss": 0.2935, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 0.4526858925819397, + "learning_rate": 9.893666250978971e-05, + "loss": 0.3774, + "step": 100 + }, + { + "epoch": 0.09, + "grad_norm": 0.32689258456230164, + "learning_rate": 9.890527337952617e-05, + "loss": 0.2034, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 0.4143233597278595, + "learning_rate": 9.887343279472272e-05, + "loss": 0.3017, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 0.4620525538921356, + "learning_rate": 9.884114104930628e-05, + "loss": 0.3201, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 0.3897378146648407, + "learning_rate": 9.880839844136854e-05, + "loss": 0.2629, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 0.450862318277359, + "learning_rate": 9.877520527316317e-05, + "loss": 0.3982, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 0.38423511385917664, + "learning_rate": 9.874156185110306e-05, + "loss": 0.3077, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 0.47380977869033813, + "learning_rate": 9.870746848575751e-05, + "loss": 0.3188, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 0.43851879239082336, + "learning_rate": 9.86729254918493e-05, + "loss": 0.2765, + "step": 108 + }, + { + "epoch": 0.1, + "grad_norm": 0.4031079113483429, + "learning_rate": 9.863793318825186e-05, + "loss": 0.2989, + "step": 109 + }, + { + "epoch": 0.1, + "grad_norm": 0.4153116047382355, + "learning_rate": 9.860249189798627e-05, + "loss": 0.4076, + "step": 110 + }, + { + "epoch": 0.1, + "grad_norm": 0.35746559500694275, + "learning_rate": 9.856660194821829e-05, + "loss": 0.2245, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 0.4301232397556305, + "learning_rate": 9.853026367025535e-05, + "loss": 0.3171, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 0.41360700130462646, + "learning_rate": 9.849347739954352e-05, + "loss": 0.2908, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 0.5251216292381287, + "learning_rate": 9.845624347566433e-05, + "loss": 0.466, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 0.3653389513492584, + "learning_rate": 9.841856224233174e-05, + "loss": 0.2221, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 0.48610758781433105, + "learning_rate": 9.83804340473889e-05, + "loss": 0.4226, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 0.41285499930381775, + "learning_rate": 9.83418592428049e-05, + "loss": 0.3015, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 0.4366799294948578, + "learning_rate": 9.830283818467163e-05, + "loss": 0.3452, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 0.43274393677711487, + "learning_rate": 9.826337123320046e-05, + "loss": 0.3277, + "step": 119 + }, + { + "epoch": 0.11, + "grad_norm": 0.43159425258636475, + "learning_rate": 9.822345875271883e-05, + "loss": 0.3384, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 0.4645746052265167, + "learning_rate": 9.818310111166699e-05, + "loss": 0.3445, + "step": 121 + }, + { + "epoch": 0.11, + "grad_norm": 0.531746506690979, + "learning_rate": 9.814229868259452e-05, + "loss": 0.3606, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 0.46642574667930603, + "learning_rate": 9.810105184215699e-05, + "loss": 0.4172, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 0.4294590353965759, + "learning_rate": 9.805936097111234e-05, + "loss": 0.2697, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 0.5032444596290588, + "learning_rate": 9.801722645431754e-05, + "loss": 0.4007, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 0.42096102237701416, + "learning_rate": 9.797464868072488e-05, + "loss": 0.3039, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 0.43327075242996216, + "learning_rate": 9.793162804337845e-05, + "loss": 0.321, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 0.4488172233104706, + "learning_rate": 9.788816493941051e-05, + "loss": 0.3466, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 0.4482726454734802, + "learning_rate": 9.784425977003784e-05, + "loss": 0.3556, + "step": 129 + }, + { + "epoch": 0.12, + "grad_norm": 0.43109023571014404, + "learning_rate": 9.779991294055802e-05, + "loss": 0.3729, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 0.36907973885536194, + "learning_rate": 9.775512486034563e-05, + "loss": 0.2535, + "step": 131 + }, + { + "epoch": 0.12, + "grad_norm": 0.5850874781608582, + "learning_rate": 9.770989594284857e-05, + "loss": 0.5767, + "step": 132 + }, + { + "epoch": 0.12, + "grad_norm": 0.4706707000732422, + "learning_rate": 9.766422660558421e-05, + "loss": 0.3897, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 0.38747847080230713, + "learning_rate": 9.761811727013548e-05, + "loss": 0.2789, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 0.37999868392944336, + "learning_rate": 9.757156836214706e-05, + "loss": 0.3114, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 0.4177742302417755, + "learning_rate": 9.752458031132141e-05, + "loss": 0.3577, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 0.39297688007354736, + "learning_rate": 9.747715355141478e-05, + "loss": 0.2334, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 0.3654375672340393, + "learning_rate": 9.742928852023325e-05, + "loss": 0.3229, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 0.432533323764801, + "learning_rate": 9.73809856596287e-05, + "loss": 0.3597, + "step": 139 + }, + { + "epoch": 0.13, + "grad_norm": 0.4045366644859314, + "learning_rate": 9.733224541549464e-05, + "loss": 0.3186, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 0.4264724552631378, + "learning_rate": 9.728306823776221e-05, + "loss": 0.329, + "step": 141 + }, + { + "epoch": 0.13, + "grad_norm": 0.3992307484149933, + "learning_rate": 9.723345458039594e-05, + "loss": 0.3513, + "step": 142 + }, + { + "epoch": 0.13, + "grad_norm": 0.339410662651062, + "learning_rate": 9.718340490138965e-05, + "loss": 0.188, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 0.4419720470905304, + "learning_rate": 9.713291966276206e-05, + "loss": 0.3414, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 0.4582366645336151, + "learning_rate": 9.708199933055272e-05, + "loss": 0.4222, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 0.4727243483066559, + "learning_rate": 9.70306443748176e-05, + "loss": 0.3994, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 0.3670468330383301, + "learning_rate": 9.697885526962474e-05, + "loss": 0.2829, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 0.40513771772384644, + "learning_rate": 9.692663249304992e-05, + "loss": 0.333, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 0.48286768794059753, + "learning_rate": 9.687397652717223e-05, + "loss": 0.4656, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 0.45031237602233887, + "learning_rate": 9.682088785806963e-05, + "loss": 0.3733, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 0.3925604224205017, + "learning_rate": 9.67673669758144e-05, + "loss": 0.2769, + "step": 151 + }, + { + "epoch": 0.14, + "grad_norm": 0.4268004894256592, + "learning_rate": 9.671341437446877e-05, + "loss": 0.3381, + "step": 152 + }, + { + "epoch": 0.14, + "grad_norm": 0.5197659730911255, + "learning_rate": 9.665903055208014e-05, + "loss": 0.4432, + "step": 153 + }, + { + "epoch": 0.14, + "grad_norm": 0.44246628880500793, + "learning_rate": 9.660421601067666e-05, + "loss": 0.3358, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 0.36518609523773193, + "learning_rate": 9.654897125626252e-05, + "loss": 0.2827, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 0.37158283591270447, + "learning_rate": 9.649329679881334e-05, + "loss": 0.2822, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 0.31241849064826965, + "learning_rate": 9.643719315227133e-05, + "loss": 0.1952, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 0.4282618761062622, + "learning_rate": 9.63806608345407e-05, + "loss": 0.3535, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 0.4605000913143158, + "learning_rate": 9.632370036748279e-05, + "loss": 0.3793, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 0.4260792136192322, + "learning_rate": 9.626631227691127e-05, + "loss": 0.2857, + "step": 160 + }, + { + "epoch": 0.15, + "grad_norm": 0.39072418212890625, + "learning_rate": 9.62084970925873e-05, + "loss": 0.2469, + "step": 161 + }, + { + "epoch": 0.15, + "grad_norm": 0.43903443217277527, + "learning_rate": 9.615025534821462e-05, + "loss": 0.2761, + "step": 162 + }, + { + "epoch": 0.15, + "grad_norm": 0.4896968603134155, + "learning_rate": 9.609158758143464e-05, + "loss": 0.3861, + "step": 163 + }, + { + "epoch": 0.15, + "grad_norm": 0.4410502314567566, + "learning_rate": 9.603249433382144e-05, + "loss": 0.2861, + "step": 164 + }, + { + "epoch": 0.15, + "grad_norm": 0.45812466740608215, + "learning_rate": 9.597297615087685e-05, + "loss": 0.271, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 0.3951738476753235, + "learning_rate": 9.591303358202535e-05, + "loss": 0.2225, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 0.46651744842529297, + "learning_rate": 9.585266718060897e-05, + "loss": 0.3533, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 0.4054509401321411, + "learning_rate": 9.579187750388227e-05, + "loss": 0.2154, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 0.433390736579895, + "learning_rate": 9.573066511300714e-05, + "loss": 0.3388, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 0.43540117144584656, + "learning_rate": 9.566903057304764e-05, + "loss": 0.3604, + "step": 170 + }, + { + "epoch": 0.16, + "grad_norm": 0.4319068193435669, + "learning_rate": 9.560697445296474e-05, + "loss": 0.3286, + "step": 171 + }, + { + "epoch": 0.16, + "grad_norm": 0.5118317604064941, + "learning_rate": 9.554449732561113e-05, + "loss": 0.3459, + "step": 172 + }, + { + "epoch": 0.16, + "grad_norm": 0.45201918482780457, + "learning_rate": 9.548159976772592e-05, + "loss": 0.3469, + "step": 173 + }, + { + "epoch": 0.16, + "grad_norm": 0.47841161489486694, + "learning_rate": 9.541828235992926e-05, + "loss": 0.356, + "step": 174 + }, + { + "epoch": 0.16, + "grad_norm": 0.3838687539100647, + "learning_rate": 9.535454568671704e-05, + "loss": 0.2688, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 0.4381600022315979, + "learning_rate": 9.529039033645548e-05, + "loss": 0.2392, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 0.38000673055648804, + "learning_rate": 9.522581690137567e-05, + "loss": 0.3034, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 0.3562740981578827, + "learning_rate": 9.516082597756815e-05, + "loss": 0.232, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 0.4523466229438782, + "learning_rate": 9.509541816497737e-05, + "loss": 0.4132, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 0.4095892906188965, + "learning_rate": 9.50295940673962e-05, + "loss": 0.2558, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 0.4435361921787262, + "learning_rate": 9.496335429246026e-05, + "loss": 0.3918, + "step": 181 + }, + { + "epoch": 0.17, + "grad_norm": 0.426711767911911, + "learning_rate": 9.489669945164242e-05, + "loss": 0.328, + "step": 182 + }, + { + "epoch": 0.17, + "grad_norm": 0.35389336943626404, + "learning_rate": 9.482963016024709e-05, + "loss": 0.2219, + "step": 183 + }, + { + "epoch": 0.17, + "grad_norm": 0.36992374062538147, + "learning_rate": 9.476214703740454e-05, + "loss": 0.2427, + "step": 184 + }, + { + "epoch": 0.17, + "grad_norm": 0.5090086460113525, + "learning_rate": 9.469425070606524e-05, + "loss": 0.5443, + "step": 185 + }, + { + "epoch": 0.17, + "grad_norm": 0.42029502987861633, + "learning_rate": 9.462594179299406e-05, + "loss": 0.2334, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 0.42248085141181946, + "learning_rate": 9.45572209287645e-05, + "loss": 0.3449, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 0.5659753084182739, + "learning_rate": 9.44880887477528e-05, + "loss": 0.381, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 0.45616188645362854, + "learning_rate": 9.441854588813228e-05, + "loss": 0.29, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 0.394564688205719, + "learning_rate": 9.43485929918672e-05, + "loss": 0.3241, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 0.503383994102478, + "learning_rate": 9.427823070470699e-05, + "loss": 0.4226, + "step": 191 + }, + { + "epoch": 0.18, + "grad_norm": 0.3788263201713562, + "learning_rate": 9.420745967618026e-05, + "loss": 0.2751, + "step": 192 + }, + { + "epoch": 0.18, + "grad_norm": 0.30289629101753235, + "learning_rate": 9.413628055958878e-05, + "loss": 0.188, + "step": 193 + }, + { + "epoch": 0.18, + "grad_norm": 0.3985527753829956, + "learning_rate": 9.406469401200151e-05, + "loss": 0.3384, + "step": 194 + }, + { + "epoch": 0.18, + "grad_norm": 0.4570313096046448, + "learning_rate": 9.399270069424842e-05, + "loss": 0.3379, + "step": 195 + }, + { + "epoch": 0.18, + "grad_norm": 0.45788517594337463, + "learning_rate": 9.392030127091452e-05, + "loss": 0.2914, + "step": 196 + }, + { + "epoch": 0.18, + "grad_norm": 0.39778071641921997, + "learning_rate": 9.384749641033359e-05, + "loss": 0.28, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 0.4381253719329834, + "learning_rate": 9.377428678458214e-05, + "loss": 0.356, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 0.4055528938770294, + "learning_rate": 9.370067306947316e-05, + "loss": 0.2767, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 0.47014090418815613, + "learning_rate": 9.362665594454984e-05, + "loss": 0.4312, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 0.4234636425971985, + "learning_rate": 9.355223609307933e-05, + "loss": 0.2813, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 0.47207698225975037, + "learning_rate": 9.347741420204643e-05, + "loss": 0.3601, + "step": 202 + }, + { + "epoch": 0.19, + "grad_norm": 0.4559366703033447, + "learning_rate": 9.340219096214727e-05, + "loss": 0.29, + "step": 203 + }, + { + "epoch": 0.19, + "grad_norm": 0.46230751276016235, + "learning_rate": 9.33265670677829e-05, + "loss": 0.37, + "step": 204 + }, + { + "epoch": 0.19, + "grad_norm": 0.43081924319267273, + "learning_rate": 9.325054321705289e-05, + "loss": 0.3043, + "step": 205 + }, + { + "epoch": 0.19, + "grad_norm": 0.43205636739730835, + "learning_rate": 9.317412011174886e-05, + "loss": 0.3097, + "step": 206 + }, + { + "epoch": 0.19, + "grad_norm": 0.4734487235546112, + "learning_rate": 9.309729845734813e-05, + "loss": 0.4025, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 0.3640989661216736, + "learning_rate": 9.302007896300698e-05, + "loss": 0.2752, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 0.41770240664482117, + "learning_rate": 9.29424623415543e-05, + "loss": 0.316, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 0.48017314076423645, + "learning_rate": 9.286444930948496e-05, + "loss": 0.3579, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 0.37423351407051086, + "learning_rate": 9.278604058695313e-05, + "loss": 0.2375, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 0.4382573962211609, + "learning_rate": 9.270723689776568e-05, + "loss": 0.3206, + "step": 212 + }, + { + "epoch": 0.2, + "grad_norm": 0.4625137150287628, + "learning_rate": 9.262803896937555e-05, + "loss": 0.404, + "step": 213 + }, + { + "epoch": 0.2, + "grad_norm": 0.4267883896827698, + "learning_rate": 9.254844753287493e-05, + "loss": 0.2745, + "step": 214 + }, + { + "epoch": 0.2, + "grad_norm": 0.48072972893714905, + "learning_rate": 9.24684633229886e-05, + "loss": 0.3158, + "step": 215 + }, + { + "epoch": 0.2, + "grad_norm": 0.5718352794647217, + "learning_rate": 9.238808707806706e-05, + "loss": 0.357, + "step": 216 + }, + { + "epoch": 0.2, + "grad_norm": 0.4857891798019409, + "learning_rate": 9.230731954007983e-05, + "loss": 0.3711, + "step": 217 + }, + { + "epoch": 0.2, + "grad_norm": 0.5256755352020264, + "learning_rate": 9.222616145460849e-05, + "loss": 0.4199, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 0.40874987840652466, + "learning_rate": 9.214461357083985e-05, + "loss": 0.2117, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 0.36349964141845703, + "learning_rate": 9.206267664155907e-05, + "loss": 0.249, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 0.4281284511089325, + "learning_rate": 9.198035142314259e-05, + "loss": 0.3793, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 0.45442864298820496, + "learning_rate": 9.189763867555129e-05, + "loss": 0.3745, + "step": 222 + }, + { + "epoch": 0.21, + "grad_norm": 0.445965975522995, + "learning_rate": 9.181453916232339e-05, + "loss": 0.2544, + "step": 223 + }, + { + "epoch": 0.21, + "grad_norm": 0.4232853651046753, + "learning_rate": 9.173105365056742e-05, + "loss": 0.3489, + "step": 224 + }, + { + "epoch": 0.21, + "grad_norm": 0.4950341284275055, + "learning_rate": 9.164718291095515e-05, + "loss": 0.3743, + "step": 225 + }, + { + "epoch": 0.21, + "grad_norm": 0.4112965762615204, + "learning_rate": 9.156292771771446e-05, + "loss": 0.338, + "step": 226 + }, + { + "epoch": 0.21, + "grad_norm": 0.3986303508281708, + "learning_rate": 9.14782888486222e-05, + "loss": 0.3071, + "step": 227 + }, + { + "epoch": 0.21, + "grad_norm": 0.38412225246429443, + "learning_rate": 9.1393267084997e-05, + "loss": 0.2964, + "step": 228 + }, + { + "epoch": 0.21, + "grad_norm": 0.4604569673538208, + "learning_rate": 9.130786321169209e-05, + "loss": 0.3754, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 0.4137207865715027, + "learning_rate": 9.122207801708802e-05, + "loss": 0.3311, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 0.37709876894950867, + "learning_rate": 9.113591229308538e-05, + "loss": 0.2439, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 0.42634347081184387, + "learning_rate": 9.104936683509755e-05, + "loss": 0.3066, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 0.37276193499565125, + "learning_rate": 9.096244244204324e-05, + "loss": 0.3085, + "step": 233 + }, + { + "epoch": 0.22, + "grad_norm": 0.42944371700286865, + "learning_rate": 9.087513991633924e-05, + "loss": 0.3185, + "step": 234 + }, + { + "epoch": 0.22, + "grad_norm": 0.41260093450546265, + "learning_rate": 9.078746006389298e-05, + "loss": 0.3474, + "step": 235 + }, + { + "epoch": 0.22, + "grad_norm": 0.32577750086784363, + "learning_rate": 9.069940369409499e-05, + "loss": 0.1917, + "step": 236 + }, + { + "epoch": 0.22, + "grad_norm": 0.49313199520111084, + "learning_rate": 9.061097161981159e-05, + "loss": 0.4111, + "step": 237 + }, + { + "epoch": 0.22, + "grad_norm": 0.4716200828552246, + "learning_rate": 9.052216465737726e-05, + "loss": 0.3746, + "step": 238 + }, + { + "epoch": 0.22, + "grad_norm": 0.4609526991844177, + "learning_rate": 9.043298362658714e-05, + "loss": 0.3682, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 0.37921881675720215, + "learning_rate": 9.034342935068952e-05, + "loss": 0.1989, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 0.43589839339256287, + "learning_rate": 9.025350265637815e-05, + "loss": 0.3716, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 0.45863112807273865, + "learning_rate": 9.016320437378466e-05, + "loss": 0.3152, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 0.4138740003108978, + "learning_rate": 9.007253533647089e-05, + "loss": 0.3435, + "step": 243 + }, + { + "epoch": 0.23, + "grad_norm": 0.4855858385562897, + "learning_rate": 8.998149638142119e-05, + "loss": 0.3306, + "step": 244 + }, + { + "epoch": 0.23, + "grad_norm": 0.49547135829925537, + "learning_rate": 8.98900883490347e-05, + "loss": 0.4374, + "step": 245 + }, + { + "epoch": 0.23, + "grad_norm": 0.3813140094280243, + "learning_rate": 8.979831208311758e-05, + "loss": 0.2895, + "step": 246 + }, + { + "epoch": 0.23, + "grad_norm": 0.4578895568847656, + "learning_rate": 8.970616843087524e-05, + "loss": 0.3151, + "step": 247 + }, + { + "epoch": 0.23, + "grad_norm": 0.36515703797340393, + "learning_rate": 8.96136582429045e-05, + "loss": 0.2167, + "step": 248 + }, + { + "epoch": 0.23, + "grad_norm": 0.46578672528266907, + "learning_rate": 8.952078237318575e-05, + "loss": 0.3441, + "step": 249 + }, + { + "epoch": 0.23, + "grad_norm": 0.42086178064346313, + "learning_rate": 8.942754167907507e-05, + "loss": 0.2692, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 0.5085225105285645, + "learning_rate": 8.933393702129628e-05, + "loss": 0.2555, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 0.44771888852119446, + "learning_rate": 8.923996926393305e-05, + "loss": 0.3989, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 0.42549923062324524, + "learning_rate": 8.91456392744209e-05, + "loss": 0.3679, + "step": 253 + }, + { + "epoch": 0.24, + "grad_norm": 0.48210227489471436, + "learning_rate": 8.905094792353917e-05, + "loss": 0.3376, + "step": 254 + }, + { + "epoch": 0.24, + "grad_norm": 0.4970339238643646, + "learning_rate": 8.895589608540297e-05, + "loss": 0.3605, + "step": 255 + }, + { + "epoch": 0.24, + "grad_norm": 0.3592231869697571, + "learning_rate": 8.886048463745525e-05, + "loss": 0.1728, + "step": 256 + }, + { + "epoch": 0.24, + "grad_norm": 0.4326871633529663, + "learning_rate": 8.876471446045847e-05, + "loss": 0.2955, + "step": 257 + }, + { + "epoch": 0.24, + "grad_norm": 0.42976316809654236, + "learning_rate": 8.866858643848665e-05, + "loss": 0.2432, + "step": 258 + }, + { + "epoch": 0.24, + "grad_norm": 0.394156277179718, + "learning_rate": 8.857210145891715e-05, + "loss": 0.2881, + "step": 259 + }, + { + "epoch": 0.24, + "grad_norm": 0.41744494438171387, + "learning_rate": 8.847526041242246e-05, + "loss": 0.1836, + "step": 260 + }, + { + "epoch": 0.24, + "grad_norm": 0.40021783113479614, + "learning_rate": 8.8378064192962e-05, + "loss": 0.2429, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 0.4498775601387024, + "learning_rate": 8.82805136977739e-05, + "loss": 0.3673, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 0.490108847618103, + "learning_rate": 8.818260982736661e-05, + "loss": 0.3611, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 0.39929094910621643, + "learning_rate": 8.808435348551071e-05, + "loss": 0.2208, + "step": 264 + }, + { + "epoch": 0.25, + "grad_norm": 0.3677091896533966, + "learning_rate": 8.798574557923053e-05, + "loss": 0.2226, + "step": 265 + }, + { + "epoch": 0.25, + "grad_norm": 0.3725321888923645, + "learning_rate": 8.788678701879573e-05, + "loss": 0.3111, + "step": 266 + }, + { + "epoch": 0.25, + "grad_norm": 0.3611381947994232, + "learning_rate": 8.778747871771292e-05, + "loss": 0.237, + "step": 267 + }, + { + "epoch": 0.25, + "grad_norm": 0.4395051598548889, + "learning_rate": 8.768782159271727e-05, + "loss": 0.3296, + "step": 268 + }, + { + "epoch": 0.25, + "grad_norm": 0.40998443961143494, + "learning_rate": 8.758781656376398e-05, + "loss": 0.249, + "step": 269 + }, + { + "epoch": 0.25, + "grad_norm": 0.46020665764808655, + "learning_rate": 8.748746455401986e-05, + "loss": 0.3302, + "step": 270 + }, + { + "epoch": 0.25, + "grad_norm": 0.46815627813339233, + "learning_rate": 8.738676648985476e-05, + "loss": 0.3139, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 0.4065473973751068, + "learning_rate": 8.7285723300833e-05, + "loss": 0.3707, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 0.3898315727710724, + "learning_rate": 8.718433591970485e-05, + "loss": 0.288, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 0.478915810585022, + "learning_rate": 8.708260528239788e-05, + "loss": 0.3895, + "step": 274 + }, + { + "epoch": 0.26, + "grad_norm": 0.4072932004928589, + "learning_rate": 8.698053232800832e-05, + "loss": 0.2634, + "step": 275 + }, + { + "epoch": 0.26, + "grad_norm": 0.41820770502090454, + "learning_rate": 8.68781179987924e-05, + "loss": 0.3607, + "step": 276 + }, + { + "epoch": 0.26, + "grad_norm": 0.3776008188724518, + "learning_rate": 8.677536324015765e-05, + "loss": 0.278, + "step": 277 + }, + { + "epoch": 0.26, + "grad_norm": 0.3983832001686096, + "learning_rate": 8.667226900065419e-05, + "loss": 0.3231, + "step": 278 + }, + { + "epoch": 0.26, + "grad_norm": 0.472797691822052, + "learning_rate": 8.656883623196592e-05, + "loss": 0.3177, + "step": 279 + }, + { + "epoch": 0.26, + "grad_norm": 0.44463932514190674, + "learning_rate": 8.646506588890183e-05, + "loss": 0.3068, + "step": 280 + }, + { + "epoch": 0.26, + "grad_norm": 0.4819587469100952, + "learning_rate": 8.636095892938707e-05, + "loss": 0.3476, + "step": 281 + }, + { + "epoch": 0.26, + "grad_norm": 0.45399779081344604, + "learning_rate": 8.62565163144542e-05, + "loss": 0.4011, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 0.3941989243030548, + "learning_rate": 8.615173900823426e-05, + "loss": 0.2842, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 0.39772456884384155, + "learning_rate": 8.60466279779479e-05, + "loss": 0.288, + "step": 284 + }, + { + "epoch": 0.27, + "grad_norm": 0.3934876322746277, + "learning_rate": 8.594118419389647e-05, + "loss": 0.3304, + "step": 285 + }, + { + "epoch": 0.27, + "grad_norm": 0.4301411807537079, + "learning_rate": 8.583540862945301e-05, + "loss": 0.307, + "step": 286 + }, + { + "epoch": 0.27, + "grad_norm": 0.3396373391151428, + "learning_rate": 8.57293022610533e-05, + "loss": 0.2122, + "step": 287 + }, + { + "epoch": 0.27, + "grad_norm": 0.4123905301094055, + "learning_rate": 8.562286606818684e-05, + "loss": 0.238, + "step": 288 + }, + { + "epoch": 0.27, + "grad_norm": 0.5258879661560059, + "learning_rate": 8.55161010333878e-05, + "loss": 0.5028, + "step": 289 + }, + { + "epoch": 0.27, + "grad_norm": 0.4868035316467285, + "learning_rate": 8.540900814222598e-05, + "loss": 0.3512, + "step": 290 + }, + { + "epoch": 0.27, + "grad_norm": 0.49566733837127686, + "learning_rate": 8.530158838329765e-05, + "loss": 0.3262, + "step": 291 + }, + { + "epoch": 0.27, + "grad_norm": 0.5323141813278198, + "learning_rate": 8.519384274821649e-05, + "loss": 0.4359, + "step": 292 + }, + { + "epoch": 0.27, + "grad_norm": 0.41219809651374817, + "learning_rate": 8.508577223160442e-05, + "loss": 0.259, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 0.46348467469215393, + "learning_rate": 8.497737783108238e-05, + "loss": 0.3208, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 0.6380013227462769, + "learning_rate": 8.486866054726114e-05, + "loss": 0.3092, + "step": 295 + }, + { + "epoch": 0.28, + "grad_norm": 0.4216512441635132, + "learning_rate": 8.475962138373213e-05, + "loss": 0.2662, + "step": 296 + }, + { + "epoch": 0.28, + "grad_norm": 0.4462231397628784, + "learning_rate": 8.465026134705805e-05, + "loss": 0.3089, + "step": 297 + }, + { + "epoch": 0.28, + "grad_norm": 0.43319839239120483, + "learning_rate": 8.454058144676366e-05, + "loss": 0.2952, + "step": 298 + }, + { + "epoch": 0.28, + "grad_norm": 0.3860713243484497, + "learning_rate": 8.443058269532651e-05, + "loss": 0.3128, + "step": 299 + }, + { + "epoch": 0.28, + "grad_norm": 0.4144044518470764, + "learning_rate": 8.432026610816745e-05, + "loss": 0.2749, + "step": 300 + }, + { + "epoch": 0.28, + "grad_norm": 0.437470018863678, + "learning_rate": 8.420963270364137e-05, + "loss": 0.274, + "step": 301 + }, + { + "epoch": 0.28, + "grad_norm": 0.464390367269516, + "learning_rate": 8.409868350302774e-05, + "loss": 0.3264, + "step": 302 + }, + { + "epoch": 0.28, + "grad_norm": 0.45976755023002625, + "learning_rate": 8.398741953052127e-05, + "loss": 0.2807, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 0.44180965423583984, + "learning_rate": 8.387584181322233e-05, + "loss": 0.3016, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 0.4997859597206116, + "learning_rate": 8.376395138112754e-05, + "loss": 0.3449, + "step": 305 + }, + { + "epoch": 0.29, + "grad_norm": 0.4143840968608856, + "learning_rate": 8.365174926712032e-05, + "loss": 0.3417, + "step": 306 + }, + { + "epoch": 0.29, + "grad_norm": 0.4447815418243408, + "learning_rate": 8.353923650696118e-05, + "loss": 0.3467, + "step": 307 + }, + { + "epoch": 0.29, + "grad_norm": 0.4479486644268036, + "learning_rate": 8.342641413927837e-05, + "loss": 0.3673, + "step": 308 + }, + { + "epoch": 0.29, + "grad_norm": 0.44752955436706543, + "learning_rate": 8.331328320555812e-05, + "loss": 0.3458, + "step": 309 + }, + { + "epoch": 0.29, + "grad_norm": 0.5090377926826477, + "learning_rate": 8.319984475013512e-05, + "loss": 0.3991, + "step": 310 + }, + { + "epoch": 0.29, + "grad_norm": 0.43693166971206665, + "learning_rate": 8.308609982018286e-05, + "loss": 0.3353, + "step": 311 + }, + { + "epoch": 0.29, + "grad_norm": 0.38203293085098267, + "learning_rate": 8.297204946570398e-05, + "loss": 0.2731, + "step": 312 + }, + { + "epoch": 0.29, + "grad_norm": 0.4455333650112152, + "learning_rate": 8.285769473952052e-05, + "loss": 0.2632, + "step": 313 + }, + { + "epoch": 0.29, + "grad_norm": 0.4769796133041382, + "learning_rate": 8.274303669726426e-05, + "loss": 0.3187, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 0.45285284519195557, + "learning_rate": 8.262807639736692e-05, + "loss": 0.4072, + "step": 315 + }, + { + "epoch": 0.3, + "grad_norm": 0.47556719183921814, + "learning_rate": 8.251281490105045e-05, + "loss": 0.3732, + "step": 316 + }, + { + "epoch": 0.3, + "grad_norm": 0.385691374540329, + "learning_rate": 8.239725327231721e-05, + "loss": 0.3075, + "step": 317 + }, + { + "epoch": 0.3, + "grad_norm": 0.41788721084594727, + "learning_rate": 8.228139257794012e-05, + "loss": 0.3112, + "step": 318 + }, + { + "epoch": 0.3, + "grad_norm": 0.4231437146663666, + "learning_rate": 8.216523388745287e-05, + "loss": 0.3517, + "step": 319 + }, + { + "epoch": 0.3, + "grad_norm": 0.4116215407848358, + "learning_rate": 8.204877827313997e-05, + "loss": 0.3299, + "step": 320 + }, + { + "epoch": 0.3, + "grad_norm": 0.3918690085411072, + "learning_rate": 8.193202681002692e-05, + "loss": 0.337, + "step": 321 + }, + { + "epoch": 0.3, + "grad_norm": 0.28665515780448914, + "learning_rate": 8.181498057587027e-05, + "loss": 0.1694, + "step": 322 + }, + { + "epoch": 0.3, + "grad_norm": 0.3561767637729645, + "learning_rate": 8.169764065114764e-05, + "loss": 0.2718, + "step": 323 + }, + { + "epoch": 0.3, + "grad_norm": 0.4175536632537842, + "learning_rate": 8.158000811904778e-05, + "loss": 0.2497, + "step": 324 + }, + { + "epoch": 0.3, + "grad_norm": 0.41284066438674927, + "learning_rate": 8.146208406546053e-05, + "loss": 0.3359, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 0.4605599343776703, + "learning_rate": 8.134386957896688e-05, + "loss": 0.4093, + "step": 326 + }, + { + "epoch": 0.31, + "grad_norm": 0.45733022689819336, + "learning_rate": 8.122536575082882e-05, + "loss": 0.4192, + "step": 327 + }, + { + "epoch": 0.31, + "grad_norm": 0.46098488569259644, + "learning_rate": 8.110657367497933e-05, + "loss": 0.2819, + "step": 328 + }, + { + "epoch": 0.31, + "grad_norm": 0.3386194407939911, + "learning_rate": 8.098749444801224e-05, + "loss": 0.2341, + "step": 329 + }, + { + "epoch": 0.31, + "grad_norm": 0.42128705978393555, + "learning_rate": 8.08681291691722e-05, + "loss": 0.3533, + "step": 330 + }, + { + "epoch": 0.31, + "grad_norm": 0.43376579880714417, + "learning_rate": 8.074847894034434e-05, + "loss": 0.2797, + "step": 331 + }, + { + "epoch": 0.31, + "grad_norm": 0.41651150584220886, + "learning_rate": 8.062854486604435e-05, + "loss": 0.256, + "step": 332 + }, + { + "epoch": 0.31, + "grad_norm": 0.38273122906684875, + "learning_rate": 8.050832805340806e-05, + "loss": 0.2789, + "step": 333 + }, + { + "epoch": 0.31, + "grad_norm": 0.38854745030403137, + "learning_rate": 8.038782961218136e-05, + "loss": 0.328, + "step": 334 + }, + { + "epoch": 0.31, + "grad_norm": 0.49828365445137024, + "learning_rate": 8.026705065470996e-05, + "loss": 0.3411, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 0.39919033646583557, + "learning_rate": 8.014599229592894e-05, + "loss": 0.3047, + "step": 336 + }, + { + "epoch": 0.32, + "grad_norm": 0.440532922744751, + "learning_rate": 8.002465565335271e-05, + "loss": 0.2559, + "step": 337 + }, + { + "epoch": 0.32, + "grad_norm": 0.4270270764827728, + "learning_rate": 7.990304184706455e-05, + "loss": 0.2757, + "step": 338 + }, + { + "epoch": 0.32, + "grad_norm": 0.4185905158519745, + "learning_rate": 7.978115199970621e-05, + "loss": 0.3783, + "step": 339 + }, + { + "epoch": 0.32, + "grad_norm": 0.38478323817253113, + "learning_rate": 7.965898723646776e-05, + "loss": 0.3041, + "step": 340 + }, + { + "epoch": 0.32, + "grad_norm": 0.5213392376899719, + "learning_rate": 7.953654868507699e-05, + "loss": 0.418, + "step": 341 + }, + { + "epoch": 0.32, + "grad_norm": 0.42577335238456726, + "learning_rate": 7.941383747578912e-05, + "loss": 0.3395, + "step": 342 + }, + { + "epoch": 0.32, + "grad_norm": 0.4232735335826874, + "learning_rate": 7.929085474137629e-05, + "loss": 0.3455, + "step": 343 + }, + { + "epoch": 0.32, + "grad_norm": 0.36636146903038025, + "learning_rate": 7.91676016171172e-05, + "loss": 0.3177, + "step": 344 + }, + { + "epoch": 0.32, + "grad_norm": 0.46816015243530273, + "learning_rate": 7.904407924078654e-05, + "loss": 0.4576, + "step": 345 + }, + { + "epoch": 0.32, + "grad_norm": 0.43102020025253296, + "learning_rate": 7.892028875264451e-05, + "loss": 0.3307, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 0.4004449248313904, + "learning_rate": 7.879623129542633e-05, + "loss": 0.2942, + "step": 347 + }, + { + "epoch": 0.33, + "grad_norm": 0.4232417941093445, + "learning_rate": 7.867190801433166e-05, + "loss": 0.3691, + "step": 348 + }, + { + "epoch": 0.33, + "grad_norm": 0.4015827775001526, + "learning_rate": 7.854732005701402e-05, + "loss": 0.345, + "step": 349 + }, + { + "epoch": 0.33, + "grad_norm": 0.3961988389492035, + "learning_rate": 7.842246857357023e-05, + "loss": 0.2919, + "step": 350 + }, + { + "epoch": 0.33, + "grad_norm": 0.3362734317779541, + "learning_rate": 7.829735471652978e-05, + "loss": 0.2247, + "step": 351 + }, + { + "epoch": 0.33, + "grad_norm": 0.37961891293525696, + "learning_rate": 7.817197964084411e-05, + "loss": 0.2637, + "step": 352 + }, + { + "epoch": 0.33, + "grad_norm": 0.3325817883014679, + "learning_rate": 7.804634450387616e-05, + "loss": 0.1882, + "step": 353 + }, + { + "epoch": 0.33, + "grad_norm": 0.40928661823272705, + "learning_rate": 7.792045046538941e-05, + "loss": 0.3098, + "step": 354 + }, + { + "epoch": 0.33, + "grad_norm": 0.4568662941455841, + "learning_rate": 7.77942986875374e-05, + "loss": 0.419, + "step": 355 + }, + { + "epoch": 0.33, + "grad_norm": 0.395332008600235, + "learning_rate": 7.766789033485287e-05, + "loss": 0.3353, + "step": 356 + }, + { + "epoch": 0.33, + "grad_norm": 0.4348996579647064, + "learning_rate": 7.75412265742371e-05, + "loss": 0.3883, + "step": 357 + }, + { + "epoch": 0.34, + "grad_norm": 0.432668536901474, + "learning_rate": 7.741430857494904e-05, + "loss": 0.3164, + "step": 358 + }, + { + "epoch": 0.34, + "grad_norm": 0.43673163652420044, + "learning_rate": 7.728713750859458e-05, + "loss": 0.3445, + "step": 359 + }, + { + "epoch": 0.34, + "grad_norm": 0.42592430114746094, + "learning_rate": 7.715971454911577e-05, + "loss": 0.3516, + "step": 360 + }, + { + "epoch": 0.34, + "grad_norm": 0.4502371549606323, + "learning_rate": 7.703204087277988e-05, + "loss": 0.3562, + "step": 361 + }, + { + "epoch": 0.34, + "grad_norm": 0.3586786091327667, + "learning_rate": 7.690411765816864e-05, + "loss": 0.2582, + "step": 362 + }, + { + "epoch": 0.34, + "grad_norm": 0.3953322470188141, + "learning_rate": 7.677594608616729e-05, + "loss": 0.3064, + "step": 363 + }, + { + "epoch": 0.34, + "grad_norm": 0.3941778838634491, + "learning_rate": 7.66475273399537e-05, + "loss": 0.2823, + "step": 364 + }, + { + "epoch": 0.34, + "grad_norm": 0.3739788234233856, + "learning_rate": 7.651886260498751e-05, + "loss": 0.267, + "step": 365 + }, + { + "epoch": 0.34, + "grad_norm": 0.43802833557128906, + "learning_rate": 7.638995306899908e-05, + "loss": 0.3207, + "step": 366 + }, + { + "epoch": 0.34, + "grad_norm": 0.3720547556877136, + "learning_rate": 7.626079992197857e-05, + "loss": 0.2774, + "step": 367 + }, + { + "epoch": 0.35, + "grad_norm": 0.42035406827926636, + "learning_rate": 7.613140435616503e-05, + "loss": 0.3196, + "step": 368 + }, + { + "epoch": 0.35, + "grad_norm": 0.401703804731369, + "learning_rate": 7.600176756603525e-05, + "loss": 0.2521, + "step": 369 + }, + { + "epoch": 0.35, + "grad_norm": 0.3969535827636719, + "learning_rate": 7.587189074829284e-05, + "loss": 0.3049, + "step": 370 + }, + { + "epoch": 0.35, + "grad_norm": 0.33402374386787415, + "learning_rate": 7.57417751018572e-05, + "loss": 0.2278, + "step": 371 + }, + { + "epoch": 0.35, + "grad_norm": 0.37405410408973694, + "learning_rate": 7.561142182785233e-05, + "loss": 0.2392, + "step": 372 + }, + { + "epoch": 0.35, + "grad_norm": 0.42111271619796753, + "learning_rate": 7.548083212959588e-05, + "loss": 0.308, + "step": 373 + }, + { + "epoch": 0.35, + "grad_norm": 0.421460896730423, + "learning_rate": 7.535000721258791e-05, + "loss": 0.2638, + "step": 374 + }, + { + "epoch": 0.35, + "grad_norm": 0.42545416951179504, + "learning_rate": 7.521894828449994e-05, + "loss": 0.3674, + "step": 375 + }, + { + "epoch": 0.35, + "grad_norm": 0.4488581418991089, + "learning_rate": 7.508765655516358e-05, + "loss": 0.3428, + "step": 376 + }, + { + "epoch": 0.35, + "grad_norm": 0.390317440032959, + "learning_rate": 7.495613323655953e-05, + "loss": 0.3062, + "step": 377 + }, + { + "epoch": 0.35, + "grad_norm": 0.4021550714969635, + "learning_rate": 7.482437954280635e-05, + "loss": 0.2613, + "step": 378 + }, + { + "epoch": 0.36, + "grad_norm": 0.4348108768463135, + "learning_rate": 7.469239669014923e-05, + "loss": 0.3695, + "step": 379 + }, + { + "epoch": 0.36, + "grad_norm": 0.39812782406806946, + "learning_rate": 7.456018589694873e-05, + "loss": 0.2887, + "step": 380 + }, + { + "epoch": 0.36, + "grad_norm": 0.5493481755256653, + "learning_rate": 7.442774838366965e-05, + "loss": 0.3927, + "step": 381 + }, + { + "epoch": 0.36, + "grad_norm": 0.2790215313434601, + "learning_rate": 7.429508537286963e-05, + "loss": 0.1896, + "step": 382 + }, + { + "epoch": 0.36, + "grad_norm": 0.43105021119117737, + "learning_rate": 7.416219808918794e-05, + "loss": 0.351, + "step": 383 + }, + { + "epoch": 0.36, + "grad_norm": 0.4434210956096649, + "learning_rate": 7.402908775933419e-05, + "loss": 0.3709, + "step": 384 + }, + { + "epoch": 0.36, + "grad_norm": 0.4587249457836151, + "learning_rate": 7.389575561207692e-05, + "loss": 0.4602, + "step": 385 + }, + { + "epoch": 0.36, + "grad_norm": 0.4942215383052826, + "learning_rate": 7.376220287823236e-05, + "loss": 0.3745, + "step": 386 + }, + { + "epoch": 0.36, + "grad_norm": 0.45109692215919495, + "learning_rate": 7.3628430790653e-05, + "loss": 0.3226, + "step": 387 + }, + { + "epoch": 0.36, + "grad_norm": 0.39248013496398926, + "learning_rate": 7.349444058421619e-05, + "loss": 0.2886, + "step": 388 + }, + { + "epoch": 0.36, + "grad_norm": 0.4016898274421692, + "learning_rate": 7.336023349581287e-05, + "loss": 0.2429, + "step": 389 + }, + { + "epoch": 0.37, + "grad_norm": 0.3656561076641083, + "learning_rate": 7.322581076433596e-05, + "loss": 0.2889, + "step": 390 + }, + { + "epoch": 0.37, + "grad_norm": 0.49339476227760315, + "learning_rate": 7.309117363066912e-05, + "loss": 0.4121, + "step": 391 + }, + { + "epoch": 0.37, + "grad_norm": 0.41815677285194397, + "learning_rate": 7.295632333767513e-05, + "loss": 0.3564, + "step": 392 + }, + { + "epoch": 0.37, + "grad_norm": 0.35798338055610657, + "learning_rate": 7.28212611301845e-05, + "loss": 0.234, + "step": 393 + }, + { + "epoch": 0.37, + "grad_norm": 0.4400104284286499, + "learning_rate": 7.2685988254984e-05, + "loss": 0.363, + "step": 394 + }, + { + "epoch": 0.37, + "grad_norm": 0.43570441007614136, + "learning_rate": 7.255050596080509e-05, + "loss": 0.3474, + "step": 395 + }, + { + "epoch": 0.37, + "grad_norm": 0.2705797255039215, + "learning_rate": 7.241481549831243e-05, + "loss": 0.1613, + "step": 396 + }, + { + "epoch": 0.37, + "grad_norm": 0.4214705526828766, + "learning_rate": 7.22789181200923e-05, + "loss": 0.3396, + "step": 397 + }, + { + "epoch": 0.37, + "grad_norm": 0.4547910988330841, + "learning_rate": 7.214281508064107e-05, + "loss": 0.3565, + "step": 398 + }, + { + "epoch": 0.37, + "grad_norm": 0.3604981601238251, + "learning_rate": 7.200650763635366e-05, + "loss": 0.2394, + "step": 399 + }, + { + "epoch": 0.38, + "grad_norm": 0.4512694478034973, + "learning_rate": 7.186999704551181e-05, + "loss": 0.3133, + "step": 400 + }, + { + "epoch": 0.38, + "grad_norm": 0.45708590745925903, + "learning_rate": 7.173328456827263e-05, + "loss": 0.319, + "step": 401 + }, + { + "epoch": 0.38, + "grad_norm": 0.3279622197151184, + "learning_rate": 7.15963714666568e-05, + "loss": 0.2018, + "step": 402 + }, + { + "epoch": 0.38, + "grad_norm": 0.4273524284362793, + "learning_rate": 7.145925900453709e-05, + "loss": 0.3532, + "step": 403 + }, + { + "epoch": 0.38, + "grad_norm": 0.3724408745765686, + "learning_rate": 7.132194844762654e-05, + "loss": 0.2762, + "step": 404 + }, + { + "epoch": 0.38, + "grad_norm": 0.44304215908050537, + "learning_rate": 7.118444106346687e-05, + "loss": 0.3618, + "step": 405 + }, + { + "epoch": 0.38, + "grad_norm": 0.4273759424686432, + "learning_rate": 7.104673812141675e-05, + "loss": 0.3664, + "step": 406 + }, + { + "epoch": 0.38, + "grad_norm": 0.40296387672424316, + "learning_rate": 7.090884089264011e-05, + "loss": 0.2687, + "step": 407 + }, + { + "epoch": 0.38, + "grad_norm": 0.3528042137622833, + "learning_rate": 7.077075065009433e-05, + "loss": 0.2294, + "step": 408 + }, + { + "epoch": 0.38, + "grad_norm": 0.4306895136833191, + "learning_rate": 7.063246866851858e-05, + "loss": 0.2743, + "step": 409 + }, + { + "epoch": 0.38, + "grad_norm": 0.5282109379768372, + "learning_rate": 7.049399622442198e-05, + "loss": 0.3679, + "step": 410 + }, + { + "epoch": 0.39, + "grad_norm": 0.45601069927215576, + "learning_rate": 7.035533459607189e-05, + "loss": 0.3349, + "step": 411 + }, + { + "epoch": 0.39, + "grad_norm": 0.39970582723617554, + "learning_rate": 7.021648506348204e-05, + "loss": 0.3156, + "step": 412 + }, + { + "epoch": 0.39, + "grad_norm": 0.4105047583580017, + "learning_rate": 7.007744890840073e-05, + "loss": 0.2743, + "step": 413 + }, + { + "epoch": 0.39, + "grad_norm": 0.42897555232048035, + "learning_rate": 6.993822741429907e-05, + "loss": 0.3822, + "step": 414 + }, + { + "epoch": 0.39, + "grad_norm": 0.3480586111545563, + "learning_rate": 6.979882186635897e-05, + "loss": 0.2126, + "step": 415 + }, + { + "epoch": 0.39, + "grad_norm": 0.43617862462997437, + "learning_rate": 6.965923355146147e-05, + "loss": 0.3459, + "step": 416 + }, + { + "epoch": 0.39, + "grad_norm": 0.3757456839084625, + "learning_rate": 6.951946375817474e-05, + "loss": 0.2828, + "step": 417 + }, + { + "epoch": 0.39, + "grad_norm": 0.45733776688575745, + "learning_rate": 6.937951377674221e-05, + "loss": 0.2947, + "step": 418 + }, + { + "epoch": 0.39, + "grad_norm": 0.44172319769859314, + "learning_rate": 6.923938489907066e-05, + "loss": 0.3259, + "step": 419 + }, + { + "epoch": 0.39, + "grad_norm": 0.4123852849006653, + "learning_rate": 6.909907841871829e-05, + "loss": 0.2887, + "step": 420 + }, + { + "epoch": 0.39, + "grad_norm": 0.4304922819137573, + "learning_rate": 6.895859563088283e-05, + "loss": 0.3682, + "step": 421 + }, + { + "epoch": 0.4, + "grad_norm": 0.36229416728019714, + "learning_rate": 6.881793783238948e-05, + "loss": 0.2454, + "step": 422 + }, + { + "epoch": 0.4, + "grad_norm": 0.46588000655174255, + "learning_rate": 6.867710632167903e-05, + "loss": 0.3546, + "step": 423 + }, + { + "epoch": 0.4, + "grad_norm": 0.3604878783226013, + "learning_rate": 6.853610239879586e-05, + "loss": 0.2702, + "step": 424 + }, + { + "epoch": 0.4, + "grad_norm": 0.34859415888786316, + "learning_rate": 6.839492736537588e-05, + "loss": 0.1924, + "step": 425 + }, + { + "epoch": 0.4, + "grad_norm": 0.462001234292984, + "learning_rate": 6.82535825246346e-05, + "loss": 0.2681, + "step": 426 + }, + { + "epoch": 0.4, + "grad_norm": 0.46837642788887024, + "learning_rate": 6.811206918135502e-05, + "loss": 0.3998, + "step": 427 + }, + { + "epoch": 0.4, + "grad_norm": 0.4874202013015747, + "learning_rate": 6.797038864187564e-05, + "loss": 0.3459, + "step": 428 + }, + { + "epoch": 0.4, + "grad_norm": 0.3754540681838989, + "learning_rate": 6.782854221407838e-05, + "loss": 0.297, + "step": 429 + }, + { + "epoch": 0.4, + "grad_norm": 0.43778693675994873, + "learning_rate": 6.768653120737652e-05, + "loss": 0.3855, + "step": 430 + }, + { + "epoch": 0.4, + "grad_norm": 0.4074949324131012, + "learning_rate": 6.754435693270258e-05, + "loss": 0.2835, + "step": 431 + }, + { + "epoch": 0.41, + "grad_norm": 0.4566798806190491, + "learning_rate": 6.740202070249623e-05, + "loss": 0.349, + "step": 432 + }, + { + "epoch": 0.41, + "grad_norm": 0.4901081919670105, + "learning_rate": 6.725952383069222e-05, + "loss": 0.4543, + "step": 433 + }, + { + "epoch": 0.41, + "grad_norm": 0.43744221329689026, + "learning_rate": 6.711686763270818e-05, + "loss": 0.374, + "step": 434 + }, + { + "epoch": 0.41, + "grad_norm": 0.41882264614105225, + "learning_rate": 6.697405342543258e-05, + "loss": 0.3751, + "step": 435 + }, + { + "epoch": 0.41, + "grad_norm": 0.36602655053138733, + "learning_rate": 6.683108252721238e-05, + "loss": 0.3011, + "step": 436 + }, + { + "epoch": 0.41, + "grad_norm": 0.47255802154541016, + "learning_rate": 6.668795625784113e-05, + "loss": 0.4474, + "step": 437 + }, + { + "epoch": 0.41, + "grad_norm": 0.3598809540271759, + "learning_rate": 6.654467593854657e-05, + "loss": 0.2321, + "step": 438 + }, + { + "epoch": 0.41, + "grad_norm": 0.4139677584171295, + "learning_rate": 6.640124289197845e-05, + "loss": 0.3092, + "step": 439 + }, + { + "epoch": 0.41, + "grad_norm": 0.4301883578300476, + "learning_rate": 6.625765844219652e-05, + "loss": 0.369, + "step": 440 + }, + { + "epoch": 0.41, + "grad_norm": 0.39680930972099304, + "learning_rate": 6.611392391465802e-05, + "loss": 0.2968, + "step": 441 + }, + { + "epoch": 0.41, + "grad_norm": 0.4317682683467865, + "learning_rate": 6.597004063620567e-05, + "loss": 0.3328, + "step": 442 + }, + { + "epoch": 0.42, + "grad_norm": 0.29227694869041443, + "learning_rate": 6.582600993505534e-05, + "loss": 0.1873, + "step": 443 + }, + { + "epoch": 0.42, + "grad_norm": 0.4805541932582855, + "learning_rate": 6.568183314078376e-05, + "loss": 0.4159, + "step": 444 + }, + { + "epoch": 0.42, + "grad_norm": 0.3404616117477417, + "learning_rate": 6.553751158431627e-05, + "loss": 0.2327, + "step": 445 + }, + { + "epoch": 0.42, + "grad_norm": 0.3709631860256195, + "learning_rate": 6.539304659791456e-05, + "loss": 0.2642, + "step": 446 + }, + { + "epoch": 0.42, + "grad_norm": 0.38298705220222473, + "learning_rate": 6.524843951516434e-05, + "loss": 0.2727, + "step": 447 + }, + { + "epoch": 0.42, + "grad_norm": 0.43321123719215393, + "learning_rate": 6.510369167096308e-05, + "loss": 0.292, + "step": 448 + }, + { + "epoch": 0.42, + "grad_norm": 0.3808426558971405, + "learning_rate": 6.495880440150756e-05, + "loss": 0.2678, + "step": 449 + }, + { + "epoch": 0.42, + "grad_norm": 0.3932049572467804, + "learning_rate": 6.481377904428171e-05, + "loss": 0.2624, + "step": 450 + }, + { + "epoch": 0.42, + "grad_norm": 0.4109935760498047, + "learning_rate": 6.466861693804413e-05, + "loss": 0.3203, + "step": 451 + }, + { + "epoch": 0.42, + "grad_norm": 0.48460057377815247, + "learning_rate": 6.45233194228158e-05, + "loss": 0.4028, + "step": 452 + }, + { + "epoch": 0.42, + "grad_norm": 0.4326816201210022, + "learning_rate": 6.437788783986766e-05, + "loss": 0.2372, + "step": 453 + }, + { + "epoch": 0.43, + "grad_norm": 0.46684136986732483, + "learning_rate": 6.42323235317083e-05, + "loss": 0.3116, + "step": 454 + }, + { + "epoch": 0.43, + "grad_norm": 0.4408712685108185, + "learning_rate": 6.408662784207149e-05, + "loss": 0.293, + "step": 455 + }, + { + "epoch": 0.43, + "grad_norm": 0.43929120898246765, + "learning_rate": 6.394080211590381e-05, + "loss": 0.2862, + "step": 456 + }, + { + "epoch": 0.43, + "grad_norm": 0.33374032378196716, + "learning_rate": 6.379484769935223e-05, + "loss": 0.2255, + "step": 457 + }, + { + "epoch": 0.43, + "grad_norm": 0.4163099527359009, + "learning_rate": 6.364876593975173e-05, + "loss": 0.2681, + "step": 458 + }, + { + "epoch": 0.43, + "grad_norm": 0.34640195965766907, + "learning_rate": 6.350255818561277e-05, + "loss": 0.2094, + "step": 459 + }, + { + "epoch": 0.43, + "grad_norm": 0.4374372363090515, + "learning_rate": 6.335622578660889e-05, + "loss": 0.3127, + "step": 460 + }, + { + "epoch": 0.43, + "grad_norm": 0.3736974895000458, + "learning_rate": 6.320977009356431e-05, + "loss": 0.2477, + "step": 461 + }, + { + "epoch": 0.43, + "grad_norm": 0.49957776069641113, + "learning_rate": 6.306319245844133e-05, + "loss": 0.3696, + "step": 462 + }, + { + "epoch": 0.43, + "grad_norm": 0.41424816846847534, + "learning_rate": 6.291649423432799e-05, + "loss": 0.2518, + "step": 463 + }, + { + "epoch": 0.44, + "grad_norm": 0.37798556685447693, + "learning_rate": 6.276967677542542e-05, + "loss": 0.2697, + "step": 464 + }, + { + "epoch": 0.44, + "grad_norm": 0.403611958026886, + "learning_rate": 6.262274143703554e-05, + "loss": 0.3054, + "step": 465 + }, + { + "epoch": 0.44, + "grad_norm": 0.4166502356529236, + "learning_rate": 6.24756895755484e-05, + "loss": 0.2946, + "step": 466 + }, + { + "epoch": 0.44, + "grad_norm": 0.3577952980995178, + "learning_rate": 6.232852254842962e-05, + "loss": 0.2579, + "step": 467 + }, + { + "epoch": 0.44, + "grad_norm": 0.4579060673713684, + "learning_rate": 6.218124171420806e-05, + "loss": 0.3533, + "step": 468 + }, + { + "epoch": 0.44, + "grad_norm": 0.404363751411438, + "learning_rate": 6.203384843246307e-05, + "loss": 0.3408, + "step": 469 + }, + { + "epoch": 0.44, + "grad_norm": 0.3781503736972809, + "learning_rate": 6.188634406381207e-05, + "loss": 0.3318, + "step": 470 + }, + { + "epoch": 0.44, + "grad_norm": 0.47970715165138245, + "learning_rate": 6.173872996989793e-05, + "loss": 0.3375, + "step": 471 + }, + { + "epoch": 0.44, + "grad_norm": 0.3317432701587677, + "learning_rate": 6.159100751337642e-05, + "loss": 0.2029, + "step": 472 + }, + { + "epoch": 0.44, + "grad_norm": 0.4206933081150055, + "learning_rate": 6.144317805790361e-05, + "loss": 0.2868, + "step": 473 + }, + { + "epoch": 0.44, + "grad_norm": 0.38284099102020264, + "learning_rate": 6.129524296812335e-05, + "loss": 0.2567, + "step": 474 + }, + { + "epoch": 0.45, + "grad_norm": 0.33891499042510986, + "learning_rate": 6.114720360965453e-05, + "loss": 0.1857, + "step": 475 + }, + { + "epoch": 0.45, + "grad_norm": 0.41084223985671997, + "learning_rate": 6.099906134907868e-05, + "loss": 0.2805, + "step": 476 + }, + { + "epoch": 0.45, + "grad_norm": 0.43213552236557007, + "learning_rate": 6.085081755392714e-05, + "loss": 0.3463, + "step": 477 + }, + { + "epoch": 0.45, + "grad_norm": 0.45232778787612915, + "learning_rate": 6.07024735926686e-05, + "loss": 0.3279, + "step": 478 + }, + { + "epoch": 0.45, + "grad_norm": 0.4519384205341339, + "learning_rate": 6.055403083469637e-05, + "loss": 0.3846, + "step": 479 + }, + { + "epoch": 0.45, + "grad_norm": 0.48919233679771423, + "learning_rate": 6.04054906503158e-05, + "loss": 0.4084, + "step": 480 + }, + { + "epoch": 0.45, + "grad_norm": 0.45906567573547363, + "learning_rate": 6.0256854410731565e-05, + "loss": 0.3372, + "step": 481 + }, + { + "epoch": 0.45, + "grad_norm": 0.4604879319667816, + "learning_rate": 6.010812348803509e-05, + "loss": 0.376, + "step": 482 + }, + { + "epoch": 0.45, + "grad_norm": 0.44598522782325745, + "learning_rate": 5.99592992551918e-05, + "loss": 0.4184, + "step": 483 + }, + { + "epoch": 0.45, + "grad_norm": 0.41852816939353943, + "learning_rate": 5.9810383086028535e-05, + "loss": 0.377, + "step": 484 + }, + { + "epoch": 0.45, + "grad_norm": 0.3676040768623352, + "learning_rate": 5.9661376355220734e-05, + "loss": 0.2941, + "step": 485 + }, + { + "epoch": 0.46, + "grad_norm": 0.48775678873062134, + "learning_rate": 5.9512280438279914e-05, + "loss": 0.4142, + "step": 486 + }, + { + "epoch": 0.46, + "grad_norm": 0.38730305433273315, + "learning_rate": 5.936309671154084e-05, + "loss": 0.3353, + "step": 487 + }, + { + "epoch": 0.46, + "grad_norm": 0.43073874711990356, + "learning_rate": 5.9213826552148886e-05, + "loss": 0.343, + "step": 488 + }, + { + "epoch": 0.46, + "grad_norm": 0.3993862569332123, + "learning_rate": 5.906447133804731e-05, + "loss": 0.3534, + "step": 489 + }, + { + "epoch": 0.46, + "grad_norm": 0.37199288606643677, + "learning_rate": 5.891503244796448e-05, + "loss": 0.2556, + "step": 490 + }, + { + "epoch": 0.46, + "grad_norm": 0.3543163239955902, + "learning_rate": 5.8765511261401254e-05, + "loss": 0.2985, + "step": 491 + }, + { + "epoch": 0.46, + "grad_norm": 0.35919004678726196, + "learning_rate": 5.861590915861817e-05, + "loss": 0.2354, + "step": 492 + }, + { + "epoch": 0.46, + "grad_norm": 0.37146124243736267, + "learning_rate": 5.846622752062268e-05, + "loss": 0.2635, + "step": 493 + }, + { + "epoch": 0.46, + "grad_norm": 0.4142633378505707, + "learning_rate": 5.831646772915651e-05, + "loss": 0.3471, + "step": 494 + }, + { + "epoch": 0.46, + "grad_norm": 0.4175507128238678, + "learning_rate": 5.816663116668276e-05, + "loss": 0.3043, + "step": 495 + }, + { + "epoch": 0.47, + "grad_norm": 0.4025022089481354, + "learning_rate": 5.801671921637328e-05, + "loss": 0.3176, + "step": 496 + }, + { + "epoch": 0.47, + "grad_norm": 0.5054760575294495, + "learning_rate": 5.786673326209584e-05, + "loss": 0.4023, + "step": 497 + }, + { + "epoch": 0.47, + "grad_norm": 0.3766479194164276, + "learning_rate": 5.7716674688401286e-05, + "loss": 0.265, + "step": 498 + }, + { + "epoch": 0.47, + "grad_norm": 0.482490599155426, + "learning_rate": 5.756654488051091e-05, + "loss": 0.3873, + "step": 499 + }, + { + "epoch": 0.47, + "grad_norm": 0.3957984149456024, + "learning_rate": 5.7416345224303524e-05, + "loss": 0.2965, + "step": 500 + }, + { + "epoch": 0.47, + "grad_norm": 0.3828335702419281, + "learning_rate": 5.7266077106302785e-05, + "loss": 0.2825, + "step": 501 + }, + { + "epoch": 0.47, + "grad_norm": 0.34701448678970337, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.2666, + "step": 502 + }, + { + "epoch": 0.47, + "grad_norm": 0.3745706081390381, + "learning_rate": 5.696534103416276e-05, + "loss": 0.2752, + "step": 503 + }, + { + "epoch": 0.47, + "grad_norm": 0.4428030848503113, + "learning_rate": 5.6814875856179414e-05, + "loss": 0.3607, + "step": 504 + }, + { + "epoch": 0.47, + "grad_norm": 0.4908398985862732, + "learning_rate": 5.666434776868895e-05, + "loss": 0.3612, + "step": 505 + }, + { + "epoch": 0.47, + "grad_norm": 0.47323817014694214, + "learning_rate": 5.651375816124679e-05, + "loss": 0.3226, + "step": 506 + }, + { + "epoch": 0.48, + "grad_norm": 0.4364282786846161, + "learning_rate": 5.636310842397629e-05, + "loss": 0.3154, + "step": 507 + }, + { + "epoch": 0.48, + "grad_norm": 0.44495850801467896, + "learning_rate": 5.621239994755583e-05, + "loss": 0.3132, + "step": 508 + }, + { + "epoch": 0.48, + "grad_norm": 0.41207343339920044, + "learning_rate": 5.606163412320608e-05, + "loss": 0.2979, + "step": 509 + }, + { + "epoch": 0.48, + "grad_norm": 0.408840537071228, + "learning_rate": 5.5910812342677065e-05, + "loss": 0.2689, + "step": 510 + }, + { + "epoch": 0.48, + "grad_norm": 0.36593517661094666, + "learning_rate": 5.575993599823536e-05, + "loss": 0.2513, + "step": 511 + }, + { + "epoch": 0.48, + "grad_norm": 0.37152597308158875, + "learning_rate": 5.560900648265124e-05, + "loss": 0.1832, + "step": 512 + }, + { + "epoch": 0.48, + "grad_norm": 0.48371148109436035, + "learning_rate": 5.545802518918579e-05, + "loss": 0.3981, + "step": 513 + }, + { + "epoch": 0.48, + "grad_norm": 0.5247592329978943, + "learning_rate": 5.5306993511578096e-05, + "loss": 0.4218, + "step": 514 + }, + { + "epoch": 0.48, + "grad_norm": 0.4305899441242218, + "learning_rate": 5.515591284403234e-05, + "loss": 0.3061, + "step": 515 + }, + { + "epoch": 0.48, + "grad_norm": 0.40413033962249756, + "learning_rate": 5.5004784581204927e-05, + "loss": 0.3007, + "step": 516 + }, + { + "epoch": 0.48, + "grad_norm": 0.36091166734695435, + "learning_rate": 5.485361011819164e-05, + "loss": 0.2439, + "step": 517 + }, + { + "epoch": 0.49, + "grad_norm": 0.39443257451057434, + "learning_rate": 5.4702390850514726e-05, + "loss": 0.2617, + "step": 518 + }, + { + "epoch": 0.49, + "grad_norm": 0.40204355120658875, + "learning_rate": 5.455112817411006e-05, + "loss": 0.3121, + "step": 519 + }, + { + "epoch": 0.49, + "grad_norm": 0.41540104150772095, + "learning_rate": 5.4399823485314226e-05, + "loss": 0.2636, + "step": 520 + }, + { + "epoch": 0.49, + "grad_norm": 0.44149285554885864, + "learning_rate": 5.4248478180851604e-05, + "loss": 0.387, + "step": 521 + }, + { + "epoch": 0.49, + "grad_norm": 0.36926665902137756, + "learning_rate": 5.409709365782154e-05, + "loss": 0.254, + "step": 522 + }, + { + "epoch": 0.49, + "grad_norm": 0.4551641047000885, + "learning_rate": 5.3945671313685386e-05, + "loss": 0.3855, + "step": 523 + }, + { + "epoch": 0.49, + "grad_norm": 0.37731683254241943, + "learning_rate": 5.379421254625366e-05, + "loss": 0.2773, + "step": 524 + }, + { + "epoch": 0.49, + "grad_norm": 0.40048646926879883, + "learning_rate": 5.364271875367311e-05, + "loss": 0.294, + "step": 525 + }, + { + "epoch": 0.49, + "grad_norm": 0.4262080192565918, + "learning_rate": 5.3491191334413746e-05, + "loss": 0.3774, + "step": 526 + }, + { + "epoch": 0.49, + "grad_norm": 0.38077598810195923, + "learning_rate": 5.3339631687256084e-05, + "loss": 0.2447, + "step": 527 + }, + { + "epoch": 0.5, + "grad_norm": 0.39433252811431885, + "learning_rate": 5.318804121127807e-05, + "loss": 0.2666, + "step": 528 + }, + { + "epoch": 0.5, + "grad_norm": 0.45546483993530273, + "learning_rate": 5.3036421305842276e-05, + "loss": 0.3913, + "step": 529 + }, + { + "epoch": 0.5, + "grad_norm": 0.3501354455947876, + "learning_rate": 5.288477337058293e-05, + "loss": 0.2395, + "step": 530 + }, + { + "epoch": 0.5, + "grad_norm": 0.4665021002292633, + "learning_rate": 5.273309880539301e-05, + "loss": 0.3696, + "step": 531 + }, + { + "epoch": 0.5, + "grad_norm": 0.5128461718559265, + "learning_rate": 5.258139901041132e-05, + "loss": 0.2984, + "step": 532 + }, + { + "epoch": 0.5, + "grad_norm": 0.3114360272884369, + "learning_rate": 5.242967538600957e-05, + "loss": 0.1855, + "step": 533 + }, + { + "epoch": 0.5, + "grad_norm": 0.4211101531982422, + "learning_rate": 5.227792933277943e-05, + "loss": 0.3319, + "step": 534 + }, + { + "epoch": 0.5, + "grad_norm": 0.3312818706035614, + "learning_rate": 5.212616225151965e-05, + "loss": 0.2227, + "step": 535 + }, + { + "epoch": 0.5, + "grad_norm": 0.43743807077407837, + "learning_rate": 5.197437554322304e-05, + "loss": 0.4077, + "step": 536 + }, + { + "epoch": 0.5, + "grad_norm": 0.415711373090744, + "learning_rate": 5.182257060906365e-05, + "loss": 0.2987, + "step": 537 + }, + { + "epoch": 0.5, + "grad_norm": 0.46717381477355957, + "learning_rate": 5.167074885038373e-05, + "loss": 0.33, + "step": 538 + }, + { + "epoch": 0.51, + "grad_norm": 0.36937227845191956, + "learning_rate": 5.151891166868086e-05, + "loss": 0.2459, + "step": 539 + }, + { + "epoch": 0.51, + "grad_norm": 0.5149711966514587, + "learning_rate": 5.1367060465595006e-05, + "loss": 0.462, + "step": 540 + }, + { + "epoch": 0.51, + "grad_norm": 0.384420782327652, + "learning_rate": 5.121519664289553e-05, + "loss": 0.2958, + "step": 541 + }, + { + "epoch": 0.51, + "grad_norm": 0.46390560269355774, + "learning_rate": 5.106332160246834e-05, + "loss": 0.4127, + "step": 542 + }, + { + "epoch": 0.51, + "grad_norm": 0.37238529324531555, + "learning_rate": 5.0911436746302834e-05, + "loss": 0.2882, + "step": 543 + }, + { + "epoch": 0.51, + "grad_norm": 0.37248557806015015, + "learning_rate": 5.075954347647909e-05, + "loss": 0.2447, + "step": 544 + }, + { + "epoch": 0.51, + "grad_norm": 0.3653389811515808, + "learning_rate": 5.0607643195154796e-05, + "loss": 0.2843, + "step": 545 + }, + { + "epoch": 0.51, + "grad_norm": 0.3526271879673004, + "learning_rate": 5.045573730455241e-05, + "loss": 0.285, + "step": 546 + }, + { + "epoch": 0.51, + "grad_norm": 0.3367946445941925, + "learning_rate": 5.030382720694612e-05, + "loss": 0.2531, + "step": 547 + }, + { + "epoch": 0.51, + "grad_norm": 0.4197223484516144, + "learning_rate": 5.0151914304649015e-05, + "loss": 0.3536, + "step": 548 + }, + { + "epoch": 0.51, + "grad_norm": 0.33859139680862427, + "learning_rate": 5e-05, + "loss": 0.2301, + "step": 549 + }, + { + "epoch": 0.52, + "grad_norm": 0.43413838744163513, + "learning_rate": 4.984808569535101e-05, + "loss": 0.3349, + "step": 550 + }, + { + "epoch": 0.52, + "grad_norm": 0.4138838052749634, + "learning_rate": 4.969617279305388e-05, + "loss": 0.3194, + "step": 551 + }, + { + "epoch": 0.52, + "grad_norm": 0.37030553817749023, + "learning_rate": 4.954426269544761e-05, + "loss": 0.2617, + "step": 552 + }, + { + "epoch": 0.52, + "grad_norm": 0.39585164189338684, + "learning_rate": 4.939235680484522e-05, + "loss": 0.3128, + "step": 553 + }, + { + "epoch": 0.52, + "grad_norm": 0.35292455554008484, + "learning_rate": 4.924045652352092e-05, + "loss": 0.2415, + "step": 554 + }, + { + "epoch": 0.52, + "grad_norm": 0.383457213640213, + "learning_rate": 4.908856325369718e-05, + "loss": 0.3018, + "step": 555 + }, + { + "epoch": 0.52, + "grad_norm": 0.3327413499355316, + "learning_rate": 4.893667839753168e-05, + "loss": 0.1913, + "step": 556 + }, + { + "epoch": 0.52, + "grad_norm": 0.4065321087837219, + "learning_rate": 4.878480335710448e-05, + "loss": 0.3515, + "step": 557 + }, + { + "epoch": 0.52, + "grad_norm": 0.2919169068336487, + "learning_rate": 4.8632939534405006e-05, + "loss": 0.1718, + "step": 558 + }, + { + "epoch": 0.52, + "grad_norm": 0.3973611295223236, + "learning_rate": 4.8481088331319146e-05, + "loss": 0.32, + "step": 559 + }, + { + "epoch": 0.53, + "grad_norm": 0.3425373435020447, + "learning_rate": 4.832925114961629e-05, + "loss": 0.2478, + "step": 560 + }, + { + "epoch": 0.53, + "grad_norm": 0.472787082195282, + "learning_rate": 4.817742939093635e-05, + "loss": 0.4112, + "step": 561 + }, + { + "epoch": 0.53, + "grad_norm": 0.42442649602890015, + "learning_rate": 4.8025624456776966e-05, + "loss": 0.2729, + "step": 562 + }, + { + "epoch": 0.53, + "grad_norm": 0.36462414264678955, + "learning_rate": 4.787383774848037e-05, + "loss": 0.3008, + "step": 563 + }, + { + "epoch": 0.53, + "grad_norm": 0.4791617691516876, + "learning_rate": 4.772207066722057e-05, + "loss": 0.3457, + "step": 564 + }, + { + "epoch": 0.53, + "grad_norm": 0.46410971879959106, + "learning_rate": 4.757032461399044e-05, + "loss": 0.3273, + "step": 565 + }, + { + "epoch": 0.53, + "grad_norm": 0.4102344214916229, + "learning_rate": 4.7418600989588694e-05, + "loss": 0.235, + "step": 566 + }, + { + "epoch": 0.53, + "grad_norm": 0.3302304148674011, + "learning_rate": 4.726690119460701e-05, + "loss": 0.1944, + "step": 567 + }, + { + "epoch": 0.53, + "grad_norm": 0.39677003026008606, + "learning_rate": 4.7115226629417075e-05, + "loss": 0.3123, + "step": 568 + }, + { + "epoch": 0.53, + "grad_norm": 0.432862251996994, + "learning_rate": 4.6963578694157736e-05, + "loss": 0.2621, + "step": 569 + }, + { + "epoch": 0.53, + "grad_norm": 0.44008541107177734, + "learning_rate": 4.681195878872194e-05, + "loss": 0.3357, + "step": 570 + }, + { + "epoch": 0.54, + "grad_norm": 0.3981214463710785, + "learning_rate": 4.666036831274392e-05, + "loss": 0.2633, + "step": 571 + }, + { + "epoch": 0.54, + "grad_norm": 0.4169214963912964, + "learning_rate": 4.6508808665586265e-05, + "loss": 0.3009, + "step": 572 + }, + { + "epoch": 0.54, + "grad_norm": 0.33570775389671326, + "learning_rate": 4.635728124632692e-05, + "loss": 0.2555, + "step": 573 + }, + { + "epoch": 0.54, + "grad_norm": 0.4198802709579468, + "learning_rate": 4.6205787453746336e-05, + "loss": 0.2845, + "step": 574 + }, + { + "epoch": 0.54, + "grad_norm": 0.34704431891441345, + "learning_rate": 4.605432868631462e-05, + "loss": 0.233, + "step": 575 + }, + { + "epoch": 0.54, + "grad_norm": 0.48909562826156616, + "learning_rate": 4.590290634217848e-05, + "loss": 0.3706, + "step": 576 + }, + { + "epoch": 0.54, + "grad_norm": 0.43792209029197693, + "learning_rate": 4.57515218191484e-05, + "loss": 0.3334, + "step": 577 + }, + { + "epoch": 0.54, + "grad_norm": 0.3764708936214447, + "learning_rate": 4.5600176514685786e-05, + "loss": 0.2736, + "step": 578 + }, + { + "epoch": 0.54, + "grad_norm": 0.40787622332572937, + "learning_rate": 4.5448871825889946e-05, + "loss": 0.3078, + "step": 579 + }, + { + "epoch": 0.54, + "grad_norm": 0.41174522042274475, + "learning_rate": 4.52976091494853e-05, + "loss": 0.2572, + "step": 580 + }, + { + "epoch": 0.54, + "grad_norm": 0.3787226378917694, + "learning_rate": 4.514638988180837e-05, + "loss": 0.2752, + "step": 581 + }, + { + "epoch": 0.55, + "grad_norm": 0.47147271037101746, + "learning_rate": 4.4995215418795085e-05, + "loss": 0.3972, + "step": 582 + }, + { + "epoch": 0.55, + "grad_norm": 0.3915804922580719, + "learning_rate": 4.484408715596768e-05, + "loss": 0.293, + "step": 583 + }, + { + "epoch": 0.55, + "grad_norm": 0.413932204246521, + "learning_rate": 4.4693006488421915e-05, + "loss": 0.2767, + "step": 584 + }, + { + "epoch": 0.55, + "grad_norm": 0.43163156509399414, + "learning_rate": 4.454197481081422e-05, + "loss": 0.2971, + "step": 585 + }, + { + "epoch": 0.55, + "grad_norm": 0.3995768427848816, + "learning_rate": 4.439099351734878e-05, + "loss": 0.2823, + "step": 586 + }, + { + "epoch": 0.55, + "grad_norm": 0.4052314758300781, + "learning_rate": 4.4240064001764646e-05, + "loss": 0.2714, + "step": 587 + }, + { + "epoch": 0.55, + "grad_norm": 0.39411696791648865, + "learning_rate": 4.4089187657322953e-05, + "loss": 0.2645, + "step": 588 + }, + { + "epoch": 0.55, + "grad_norm": 0.3517491817474365, + "learning_rate": 4.393836587679394e-05, + "loss": 0.2443, + "step": 589 + }, + { + "epoch": 0.55, + "grad_norm": 0.46535736322402954, + "learning_rate": 4.3787600052444174e-05, + "loss": 0.3524, + "step": 590 + }, + { + "epoch": 0.55, + "grad_norm": 0.47955119609832764, + "learning_rate": 4.363689157602373e-05, + "loss": 0.4257, + "step": 591 + }, + { + "epoch": 0.56, + "grad_norm": 0.37164661288261414, + "learning_rate": 4.348624183875322e-05, + "loss": 0.2939, + "step": 592 + }, + { + "epoch": 0.56, + "grad_norm": 0.3547576367855072, + "learning_rate": 4.333565223131107e-05, + "loss": 0.2171, + "step": 593 + }, + { + "epoch": 0.56, + "grad_norm": 0.4252079427242279, + "learning_rate": 4.318512414382059e-05, + "loss": 0.3381, + "step": 594 + }, + { + "epoch": 0.56, + "grad_norm": 0.4120088517665863, + "learning_rate": 4.3034658965837255e-05, + "loss": 0.1831, + "step": 595 + }, + { + "epoch": 0.56, + "grad_norm": 0.4044457972049713, + "learning_rate": 4.288425808633575e-05, + "loss": 0.3326, + "step": 596 + }, + { + "epoch": 0.56, + "grad_norm": 0.3945488929748535, + "learning_rate": 4.273392289369723e-05, + "loss": 0.2759, + "step": 597 + }, + { + "epoch": 0.56, + "grad_norm": 0.3605932593345642, + "learning_rate": 4.258365477569648e-05, + "loss": 0.2268, + "step": 598 + }, + { + "epoch": 0.56, + "grad_norm": 0.4327079653739929, + "learning_rate": 4.2433455119489105e-05, + "loss": 0.3237, + "step": 599 + }, + { + "epoch": 0.56, + "grad_norm": 0.4783308207988739, + "learning_rate": 4.228332531159871e-05, + "loss": 0.3675, + "step": 600 + }, + { + "epoch": 0.56, + "grad_norm": 0.4235597550868988, + "learning_rate": 4.2133266737904176e-05, + "loss": 0.2976, + "step": 601 + }, + { + "epoch": 0.56, + "grad_norm": 0.4399956464767456, + "learning_rate": 4.1983280783626724e-05, + "loss": 0.3662, + "step": 602 + }, + { + "epoch": 0.57, + "grad_norm": 0.42588597536087036, + "learning_rate": 4.183336883331723e-05, + "loss": 0.3611, + "step": 603 + }, + { + "epoch": 0.57, + "grad_norm": 0.43737921118736267, + "learning_rate": 4.1683532270843504e-05, + "loss": 0.3477, + "step": 604 + }, + { + "epoch": 0.57, + "grad_norm": 0.40459975600242615, + "learning_rate": 4.153377247937732e-05, + "loss": 0.3054, + "step": 605 + }, + { + "epoch": 0.57, + "grad_norm": 0.3984995484352112, + "learning_rate": 4.138409084138185e-05, + "loss": 0.3077, + "step": 606 + }, + { + "epoch": 0.57, + "grad_norm": 0.4512704312801361, + "learning_rate": 4.1234488738598744e-05, + "loss": 0.2709, + "step": 607 + }, + { + "epoch": 0.57, + "grad_norm": 0.33205512166023254, + "learning_rate": 4.108496755203553e-05, + "loss": 0.2092, + "step": 608 + }, + { + "epoch": 0.57, + "grad_norm": 0.40625834465026855, + "learning_rate": 4.0935528661952716e-05, + "loss": 0.3054, + "step": 609 + }, + { + "epoch": 0.57, + "grad_norm": 0.41261643171310425, + "learning_rate": 4.0786173447851126e-05, + "loss": 0.3156, + "step": 610 + }, + { + "epoch": 0.57, + "grad_norm": 0.3921923041343689, + "learning_rate": 4.063690328845916e-05, + "loss": 0.2664, + "step": 611 + }, + { + "epoch": 0.57, + "grad_norm": 0.34725549817085266, + "learning_rate": 4.04877195617201e-05, + "loss": 0.2566, + "step": 612 + }, + { + "epoch": 0.58, + "grad_norm": 0.4461452066898346, + "learning_rate": 4.033862364477927e-05, + "loss": 0.3574, + "step": 613 + }, + { + "epoch": 0.58, + "grad_norm": 0.34110385179519653, + "learning_rate": 4.0189616913971484e-05, + "loss": 0.2254, + "step": 614 + }, + { + "epoch": 0.58, + "grad_norm": 0.41009894013404846, + "learning_rate": 4.0040700744808204e-05, + "loss": 0.3814, + "step": 615 + }, + { + "epoch": 0.58, + "grad_norm": 0.3577955663204193, + "learning_rate": 3.989187651196493e-05, + "loss": 0.2292, + "step": 616 + }, + { + "epoch": 0.58, + "grad_norm": 0.4174287021160126, + "learning_rate": 3.974314558926844e-05, + "loss": 0.3083, + "step": 617 + }, + { + "epoch": 0.58, + "grad_norm": 0.3687293529510498, + "learning_rate": 3.9594509349684216e-05, + "loss": 0.3006, + "step": 618 + }, + { + "epoch": 0.58, + "grad_norm": 0.3968195617198944, + "learning_rate": 3.9445969165303647e-05, + "loss": 0.2811, + "step": 619 + }, + { + "epoch": 0.58, + "grad_norm": 0.43467170000076294, + "learning_rate": 3.929752640733141e-05, + "loss": 0.3129, + "step": 620 + }, + { + "epoch": 0.58, + "grad_norm": 0.4192885160446167, + "learning_rate": 3.914918244607287e-05, + "loss": 0.2833, + "step": 621 + }, + { + "epoch": 0.58, + "grad_norm": 0.3982818126678467, + "learning_rate": 3.900093865092134e-05, + "loss": 0.2464, + "step": 622 + }, + { + "epoch": 0.58, + "grad_norm": 0.37380045652389526, + "learning_rate": 3.885279639034546e-05, + "loss": 0.2454, + "step": 623 + }, + { + "epoch": 0.59, + "grad_norm": 0.47077518701553345, + "learning_rate": 3.870475703187667e-05, + "loss": 0.265, + "step": 624 + }, + { + "epoch": 0.59, + "grad_norm": 0.47373396158218384, + "learning_rate": 3.855682194209639e-05, + "loss": 0.2975, + "step": 625 + }, + { + "epoch": 0.59, + "grad_norm": 0.30882200598716736, + "learning_rate": 3.840899248662358e-05, + "loss": 0.2194, + "step": 626 + }, + { + "epoch": 0.59, + "grad_norm": 0.401368111371994, + "learning_rate": 3.8261270030102084e-05, + "loss": 0.2833, + "step": 627 + }, + { + "epoch": 0.59, + "grad_norm": 0.3969556391239166, + "learning_rate": 3.8113655936187947e-05, + "loss": 0.3015, + "step": 628 + }, + { + "epoch": 0.59, + "grad_norm": 0.3934876024723053, + "learning_rate": 3.796615156753696e-05, + "loss": 0.3399, + "step": 629 + }, + { + "epoch": 0.59, + "grad_norm": 0.4074617028236389, + "learning_rate": 3.7818758285791955e-05, + "loss": 0.3068, + "step": 630 + }, + { + "epoch": 0.59, + "grad_norm": 0.3954191207885742, + "learning_rate": 3.767147745157039e-05, + "loss": 0.3337, + "step": 631 + }, + { + "epoch": 0.59, + "grad_norm": 0.47569915652275085, + "learning_rate": 3.7524310424451635e-05, + "loss": 0.3639, + "step": 632 + }, + { + "epoch": 0.59, + "grad_norm": 0.39985620975494385, + "learning_rate": 3.7377258562964454e-05, + "loss": 0.3208, + "step": 633 + }, + { + "epoch": 0.59, + "grad_norm": 0.4269794821739197, + "learning_rate": 3.723032322457458e-05, + "loss": 0.3463, + "step": 634 + }, + { + "epoch": 0.6, + "grad_norm": 0.4925236105918884, + "learning_rate": 3.708350576567204e-05, + "loss": 0.41, + "step": 635 + }, + { + "epoch": 0.6, + "grad_norm": 0.3888967037200928, + "learning_rate": 3.6936807541558674e-05, + "loss": 0.2667, + "step": 636 + }, + { + "epoch": 0.6, + "grad_norm": 0.4810844659805298, + "learning_rate": 3.6790229906435705e-05, + "loss": 0.4566, + "step": 637 + }, + { + "epoch": 0.6, + "grad_norm": 0.3134874403476715, + "learning_rate": 3.664377421339111e-05, + "loss": 0.2074, + "step": 638 + }, + { + "epoch": 0.6, + "grad_norm": 0.4054643213748932, + "learning_rate": 3.6497441814387247e-05, + "loss": 0.2916, + "step": 639 + }, + { + "epoch": 0.6, + "grad_norm": 0.3976866900920868, + "learning_rate": 3.6351234060248286e-05, + "loss": 0.2898, + "step": 640 + }, + { + "epoch": 0.6, + "grad_norm": 0.44661563634872437, + "learning_rate": 3.6205152300647784e-05, + "loss": 0.3798, + "step": 641 + }, + { + "epoch": 0.6, + "grad_norm": 0.37204062938690186, + "learning_rate": 3.605919788409621e-05, + "loss": 0.3046, + "step": 642 + }, + { + "epoch": 0.6, + "grad_norm": 0.40015891194343567, + "learning_rate": 3.591337215792852e-05, + "loss": 0.3225, + "step": 643 + }, + { + "epoch": 0.6, + "grad_norm": 0.39118486642837524, + "learning_rate": 3.5767676468291713e-05, + "loss": 0.2303, + "step": 644 + }, + { + "epoch": 0.61, + "grad_norm": 0.4793816804885864, + "learning_rate": 3.562211216013235e-05, + "loss": 0.4683, + "step": 645 + }, + { + "epoch": 0.61, + "grad_norm": 0.3733760714530945, + "learning_rate": 3.5476680577184206e-05, + "loss": 0.3253, + "step": 646 + }, + { + "epoch": 0.61, + "grad_norm": 0.339495986700058, + "learning_rate": 3.533138306195588e-05, + "loss": 0.2335, + "step": 647 + }, + { + "epoch": 0.61, + "grad_norm": 0.43037405610084534, + "learning_rate": 3.5186220955718306e-05, + "loss": 0.2914, + "step": 648 + }, + { + "epoch": 0.61, + "grad_norm": 0.3812549114227295, + "learning_rate": 3.5041195598492446e-05, + "loss": 0.3047, + "step": 649 + }, + { + "epoch": 0.61, + "grad_norm": 0.3607620298862457, + "learning_rate": 3.489630832903694e-05, + "loss": 0.2662, + "step": 650 + }, + { + "epoch": 0.61, + "grad_norm": 0.47065141797065735, + "learning_rate": 3.475156048483567e-05, + "loss": 0.3258, + "step": 651 + }, + { + "epoch": 0.61, + "grad_norm": 0.4050087034702301, + "learning_rate": 3.460695340208546e-05, + "loss": 0.3524, + "step": 652 + }, + { + "epoch": 0.61, + "grad_norm": 0.39014732837677, + "learning_rate": 3.446248841568375e-05, + "loss": 0.2748, + "step": 653 + }, + { + "epoch": 0.61, + "grad_norm": 0.4207385182380676, + "learning_rate": 3.431816685921625e-05, + "loss": 0.3261, + "step": 654 + }, + { + "epoch": 0.61, + "grad_norm": 0.3538001775741577, + "learning_rate": 3.417399006494466e-05, + "loss": 0.233, + "step": 655 + }, + { + "epoch": 0.62, + "grad_norm": 0.3425646722316742, + "learning_rate": 3.402995936379433e-05, + "loss": 0.245, + "step": 656 + }, + { + "epoch": 0.62, + "grad_norm": 0.3696725368499756, + "learning_rate": 3.3886076085341986e-05, + "loss": 0.2547, + "step": 657 + }, + { + "epoch": 0.62, + "grad_norm": 0.3415139615535736, + "learning_rate": 3.37423415578035e-05, + "loss": 0.2382, + "step": 658 + }, + { + "epoch": 0.62, + "grad_norm": 0.3608959913253784, + "learning_rate": 3.3598757108021546e-05, + "loss": 0.2394, + "step": 659 + }, + { + "epoch": 0.62, + "grad_norm": 0.3897416889667511, + "learning_rate": 3.345532406145345e-05, + "loss": 0.2854, + "step": 660 + }, + { + "epoch": 0.62, + "grad_norm": 0.34789225459098816, + "learning_rate": 3.331204374215888e-05, + "loss": 0.2357, + "step": 661 + }, + { + "epoch": 0.62, + "grad_norm": 0.37091711163520813, + "learning_rate": 3.316891747278761e-05, + "loss": 0.2424, + "step": 662 + }, + { + "epoch": 0.62, + "grad_norm": 0.47823840379714966, + "learning_rate": 3.302594657456744e-05, + "loss": 0.3736, + "step": 663 + }, + { + "epoch": 0.62, + "grad_norm": 0.45948362350463867, + "learning_rate": 3.288313236729183e-05, + "loss": 0.3395, + "step": 664 + }, + { + "epoch": 0.62, + "grad_norm": 0.43615004420280457, + "learning_rate": 3.274047616930781e-05, + "loss": 0.343, + "step": 665 + }, + { + "epoch": 0.62, + "grad_norm": 0.4098581373691559, + "learning_rate": 3.259797929750378e-05, + "loss": 0.2629, + "step": 666 + }, + { + "epoch": 0.63, + "grad_norm": 0.38520053029060364, + "learning_rate": 3.245564306729744e-05, + "loss": 0.2948, + "step": 667 + }, + { + "epoch": 0.63, + "grad_norm": 0.3814550042152405, + "learning_rate": 3.231346879262349e-05, + "loss": 0.2802, + "step": 668 + }, + { + "epoch": 0.63, + "grad_norm": 0.38907739520072937, + "learning_rate": 3.217145778592162e-05, + "loss": 0.3131, + "step": 669 + }, + { + "epoch": 0.63, + "grad_norm": 0.4280015230178833, + "learning_rate": 3.202961135812437e-05, + "loss": 0.3395, + "step": 670 + }, + { + "epoch": 0.63, + "grad_norm": 0.4221903681755066, + "learning_rate": 3.1887930818644996e-05, + "loss": 0.3522, + "step": 671 + }, + { + "epoch": 0.63, + "grad_norm": 0.4077076315879822, + "learning_rate": 3.1746417475365405e-05, + "loss": 0.2843, + "step": 672 + }, + { + "epoch": 0.63, + "grad_norm": 0.4645012319087982, + "learning_rate": 3.1605072634624125e-05, + "loss": 0.4123, + "step": 673 + }, + { + "epoch": 0.63, + "grad_norm": 0.4186570644378662, + "learning_rate": 3.146389760120416e-05, + "loss": 0.2707, + "step": 674 + }, + { + "epoch": 0.63, + "grad_norm": 0.40125572681427, + "learning_rate": 3.132289367832097e-05, + "loss": 0.2917, + "step": 675 + }, + { + "epoch": 0.63, + "grad_norm": 0.4461178183555603, + "learning_rate": 3.118206216761053e-05, + "loss": 0.3186, + "step": 676 + }, + { + "epoch": 0.64, + "grad_norm": 0.3696655035018921, + "learning_rate": 3.104140436911719e-05, + "loss": 0.1924, + "step": 677 + }, + { + "epoch": 0.64, + "grad_norm": 0.3892281949520111, + "learning_rate": 3.0900921581281725e-05, + "loss": 0.3101, + "step": 678 + }, + { + "epoch": 0.64, + "grad_norm": 0.4376410245895386, + "learning_rate": 3.076061510092935e-05, + "loss": 0.3385, + "step": 679 + }, + { + "epoch": 0.64, + "grad_norm": 0.3946727216243744, + "learning_rate": 3.062048622325779e-05, + "loss": 0.3105, + "step": 680 + }, + { + "epoch": 0.64, + "grad_norm": 0.4012112319469452, + "learning_rate": 3.0480536241825263e-05, + "loss": 0.2442, + "step": 681 + }, + { + "epoch": 0.64, + "grad_norm": 0.45755696296691895, + "learning_rate": 3.034076644853853e-05, + "loss": 0.2932, + "step": 682 + }, + { + "epoch": 0.64, + "grad_norm": 0.4443381428718567, + "learning_rate": 3.0201178133641038e-05, + "loss": 0.3799, + "step": 683 + }, + { + "epoch": 0.64, + "grad_norm": 0.3474355638027191, + "learning_rate": 3.0061772585700953e-05, + "loss": 0.2334, + "step": 684 + }, + { + "epoch": 0.64, + "grad_norm": 0.38603806495666504, + "learning_rate": 2.992255109159926e-05, + "loss": 0.2574, + "step": 685 + }, + { + "epoch": 0.64, + "grad_norm": 0.38428762555122375, + "learning_rate": 2.9783514936517965e-05, + "loss": 0.3081, + "step": 686 + }, + { + "epoch": 0.64, + "grad_norm": 0.41110917925834656, + "learning_rate": 2.9644665403928117e-05, + "loss": 0.2946, + "step": 687 + }, + { + "epoch": 0.65, + "grad_norm": 0.4486719071865082, + "learning_rate": 2.950600377557804e-05, + "loss": 0.4257, + "step": 688 + }, + { + "epoch": 0.65, + "grad_norm": 0.4164861738681793, + "learning_rate": 2.9367531331481436e-05, + "loss": 0.2622, + "step": 689 + }, + { + "epoch": 0.65, + "grad_norm": 0.40194183588027954, + "learning_rate": 2.9229249349905684e-05, + "loss": 0.3232, + "step": 690 + }, + { + "epoch": 0.65, + "grad_norm": 0.38287442922592163, + "learning_rate": 2.909115910735991e-05, + "loss": 0.2665, + "step": 691 + }, + { + "epoch": 0.65, + "grad_norm": 0.3484087288379669, + "learning_rate": 2.895326187858326e-05, + "loss": 0.2387, + "step": 692 + }, + { + "epoch": 0.65, + "grad_norm": 0.48390090465545654, + "learning_rate": 2.881555893653314e-05, + "loss": 0.478, + "step": 693 + }, + { + "epoch": 0.65, + "grad_norm": 0.3917997181415558, + "learning_rate": 2.8678051552373487e-05, + "loss": 0.2926, + "step": 694 + }, + { + "epoch": 0.65, + "grad_norm": 0.3424125611782074, + "learning_rate": 2.854074099546291e-05, + "loss": 0.2481, + "step": 695 + }, + { + "epoch": 0.65, + "grad_norm": 0.38915279507637024, + "learning_rate": 2.8403628533343206e-05, + "loss": 0.2039, + "step": 696 + }, + { + "epoch": 0.65, + "grad_norm": 0.39113256335258484, + "learning_rate": 2.826671543172738e-05, + "loss": 0.3135, + "step": 697 + }, + { + "epoch": 0.65, + "grad_norm": 0.3531236946582794, + "learning_rate": 2.8130002954488183e-05, + "loss": 0.2567, + "step": 698 + }, + { + "epoch": 0.66, + "grad_norm": 0.4020720422267914, + "learning_rate": 2.799349236364634e-05, + "loss": 0.3434, + "step": 699 + }, + { + "epoch": 0.66, + "grad_norm": 0.41817712783813477, + "learning_rate": 2.7857184919358937e-05, + "loss": 0.3242, + "step": 700 + }, + { + "epoch": 0.66, + "grad_norm": 0.3442646861076355, + "learning_rate": 2.7721081879907718e-05, + "loss": 0.2424, + "step": 701 + }, + { + "epoch": 0.66, + "grad_norm": 0.49456846714019775, + "learning_rate": 2.7585184501687577e-05, + "loss": 0.356, + "step": 702 + }, + { + "epoch": 0.66, + "grad_norm": 0.4054080545902252, + "learning_rate": 2.74494940391949e-05, + "loss": 0.3587, + "step": 703 + }, + { + "epoch": 0.66, + "grad_norm": 0.4222458302974701, + "learning_rate": 2.731401174501601e-05, + "loss": 0.317, + "step": 704 + }, + { + "epoch": 0.66, + "grad_norm": 0.36371931433677673, + "learning_rate": 2.7178738869815506e-05, + "loss": 0.2638, + "step": 705 + }, + { + "epoch": 0.66, + "grad_norm": 0.34860721230506897, + "learning_rate": 2.7043676662324878e-05, + "loss": 0.2298, + "step": 706 + }, + { + "epoch": 0.66, + "grad_norm": 0.42635902762413025, + "learning_rate": 2.69088263693309e-05, + "loss": 0.3104, + "step": 707 + }, + { + "epoch": 0.66, + "grad_norm": 0.411121129989624, + "learning_rate": 2.6774189235664026e-05, + "loss": 0.2629, + "step": 708 + }, + { + "epoch": 0.67, + "grad_norm": 0.46932339668273926, + "learning_rate": 2.663976650418715e-05, + "loss": 0.4047, + "step": 709 + }, + { + "epoch": 0.67, + "grad_norm": 0.46861177682876587, + "learning_rate": 2.650555941578381e-05, + "loss": 0.4143, + "step": 710 + }, + { + "epoch": 0.67, + "grad_norm": 0.3969976603984833, + "learning_rate": 2.6371569209347014e-05, + "loss": 0.292, + "step": 711 + }, + { + "epoch": 0.67, + "grad_norm": 0.41862115263938904, + "learning_rate": 2.6237797121767634e-05, + "loss": 0.3706, + "step": 712 + }, + { + "epoch": 0.67, + "grad_norm": 0.3939150869846344, + "learning_rate": 2.6104244387923082e-05, + "loss": 0.2726, + "step": 713 + }, + { + "epoch": 0.67, + "grad_norm": 0.33569246530532837, + "learning_rate": 2.5970912240665813e-05, + "loss": 0.2001, + "step": 714 + }, + { + "epoch": 0.67, + "grad_norm": 0.3980376422405243, + "learning_rate": 2.5837801910812053e-05, + "loss": 0.3019, + "step": 715 + }, + { + "epoch": 0.67, + "grad_norm": 0.3491917550563812, + "learning_rate": 2.5704914627130374e-05, + "loss": 0.2664, + "step": 716 + }, + { + "epoch": 0.67, + "grad_norm": 0.43504101037979126, + "learning_rate": 2.5572251616330373e-05, + "loss": 0.3871, + "step": 717 + }, + { + "epoch": 0.67, + "grad_norm": 0.41263502836227417, + "learning_rate": 2.5439814103051284e-05, + "loss": 0.2307, + "step": 718 + }, + { + "epoch": 0.67, + "grad_norm": 0.31717926263809204, + "learning_rate": 2.530760330985079e-05, + "loss": 0.1846, + "step": 719 + }, + { + "epoch": 0.68, + "grad_norm": 0.451673686504364, + "learning_rate": 2.517562045719367e-05, + "loss": 0.403, + "step": 720 + }, + { + "epoch": 0.68, + "grad_norm": 0.3923144042491913, + "learning_rate": 2.504386676344047e-05, + "loss": 0.3311, + "step": 721 + }, + { + "epoch": 0.68, + "grad_norm": 0.41828757524490356, + "learning_rate": 2.4912343444836445e-05, + "loss": 0.3399, + "step": 722 + }, + { + "epoch": 0.68, + "grad_norm": 0.44174039363861084, + "learning_rate": 2.4781051715500076e-05, + "loss": 0.3893, + "step": 723 + }, + { + "epoch": 0.68, + "grad_norm": 0.34206709265708923, + "learning_rate": 2.46499927874121e-05, + "loss": 0.2145, + "step": 724 + }, + { + "epoch": 0.68, + "grad_norm": 0.3767687678337097, + "learning_rate": 2.4519167870404125e-05, + "loss": 0.2524, + "step": 725 + }, + { + "epoch": 0.68, + "grad_norm": 0.4103870987892151, + "learning_rate": 2.4388578172147675e-05, + "loss": 0.3165, + "step": 726 + }, + { + "epoch": 0.68, + "grad_norm": 0.37169212102890015, + "learning_rate": 2.4258224898142807e-05, + "loss": 0.2359, + "step": 727 + }, + { + "epoch": 0.68, + "grad_norm": 0.42850708961486816, + "learning_rate": 2.4128109251707155e-05, + "loss": 0.3953, + "step": 728 + }, + { + "epoch": 0.68, + "grad_norm": 0.3800533711910248, + "learning_rate": 2.399823243396476e-05, + "loss": 0.2891, + "step": 729 + }, + { + "epoch": 0.68, + "grad_norm": 0.3955475091934204, + "learning_rate": 2.3868595643834995e-05, + "loss": 0.2531, + "step": 730 + }, + { + "epoch": 0.69, + "grad_norm": 0.42857804894447327, + "learning_rate": 2.373920007802144e-05, + "loss": 0.322, + "step": 731 + }, + { + "epoch": 0.69, + "grad_norm": 0.3623318672180176, + "learning_rate": 2.361004693100094e-05, + "loss": 0.2604, + "step": 732 + }, + { + "epoch": 0.69, + "grad_norm": 0.40041908621788025, + "learning_rate": 2.3481137395012513e-05, + "loss": 0.3004, + "step": 733 + }, + { + "epoch": 0.69, + "grad_norm": 0.3790295124053955, + "learning_rate": 2.3352472660046295e-05, + "loss": 0.2634, + "step": 734 + }, + { + "epoch": 0.69, + "grad_norm": 0.3655712902545929, + "learning_rate": 2.322405391383273e-05, + "loss": 0.3042, + "step": 735 + }, + { + "epoch": 0.69, + "grad_norm": 0.42415592074394226, + "learning_rate": 2.3095882341831372e-05, + "loss": 0.4077, + "step": 736 + }, + { + "epoch": 0.69, + "grad_norm": 0.37483200430870056, + "learning_rate": 2.296795912722014e-05, + "loss": 0.3072, + "step": 737 + }, + { + "epoch": 0.69, + "grad_norm": 0.35293281078338623, + "learning_rate": 2.284028545088423e-05, + "loss": 0.259, + "step": 738 + }, + { + "epoch": 0.69, + "grad_norm": 0.34485095739364624, + "learning_rate": 2.2712862491405436e-05, + "loss": 0.266, + "step": 739 + }, + { + "epoch": 0.69, + "grad_norm": 0.344819039106369, + "learning_rate": 2.258569142505098e-05, + "loss": 0.2453, + "step": 740 + }, + { + "epoch": 0.7, + "grad_norm": 0.4387754797935486, + "learning_rate": 2.2458773425762912e-05, + "loss": 0.2779, + "step": 741 + }, + { + "epoch": 0.7, + "grad_norm": 0.4539426267147064, + "learning_rate": 2.2332109665147127e-05, + "loss": 0.3791, + "step": 742 + }, + { + "epoch": 0.7, + "grad_norm": 0.41004621982574463, + "learning_rate": 2.2205701312462617e-05, + "loss": 0.3598, + "step": 743 + }, + { + "epoch": 0.7, + "grad_norm": 0.3861682415008545, + "learning_rate": 2.2079549534610606e-05, + "loss": 0.2982, + "step": 744 + }, + { + "epoch": 0.7, + "grad_norm": 0.40412721037864685, + "learning_rate": 2.1953655496123853e-05, + "loss": 0.2879, + "step": 745 + }, + { + "epoch": 0.7, + "grad_norm": 0.41516509652137756, + "learning_rate": 2.1828020359155905e-05, + "loss": 0.2866, + "step": 746 + }, + { + "epoch": 0.7, + "grad_norm": 0.35956189036369324, + "learning_rate": 2.1702645283470236e-05, + "loss": 0.2449, + "step": 747 + }, + { + "epoch": 0.7, + "grad_norm": 0.45450302958488464, + "learning_rate": 2.1577531426429782e-05, + "loss": 0.3618, + "step": 748 + }, + { + "epoch": 0.7, + "grad_norm": 0.32821208238601685, + "learning_rate": 2.1452679942985993e-05, + "loss": 0.2288, + "step": 749 + }, + { + "epoch": 0.7, + "grad_norm": 0.4255446195602417, + "learning_rate": 2.132809198566837e-05, + "loss": 0.3431, + "step": 750 + }, + { + "epoch": 0.7, + "grad_norm": 0.4676855206489563, + "learning_rate": 2.1203768704573672e-05, + "loss": 0.3602, + "step": 751 + }, + { + "epoch": 0.71, + "grad_norm": 0.37337955832481384, + "learning_rate": 2.1079711247355505e-05, + "loss": 0.2727, + "step": 752 + }, + { + "epoch": 0.71, + "grad_norm": 0.422907292842865, + "learning_rate": 2.095592075921347e-05, + "loss": 0.2386, + "step": 753 + }, + { + "epoch": 0.71, + "grad_norm": 0.3308090269565582, + "learning_rate": 2.08323983828828e-05, + "loss": 0.191, + "step": 754 + }, + { + "epoch": 0.71, + "grad_norm": 0.42989543080329895, + "learning_rate": 2.0709145258623704e-05, + "loss": 0.3802, + "step": 755 + }, + { + "epoch": 0.71, + "grad_norm": 0.414399117231369, + "learning_rate": 2.0586162524210895e-05, + "loss": 0.2893, + "step": 756 + }, + { + "epoch": 0.71, + "grad_norm": 0.4370255172252655, + "learning_rate": 2.0463451314923015e-05, + "loss": 0.3331, + "step": 757 + }, + { + "epoch": 0.71, + "grad_norm": 0.42983582615852356, + "learning_rate": 2.0341012763532243e-05, + "loss": 0.3425, + "step": 758 + }, + { + "epoch": 0.71, + "grad_norm": 0.36104878783226013, + "learning_rate": 2.021884800029379e-05, + "loss": 0.2644, + "step": 759 + }, + { + "epoch": 0.71, + "grad_norm": 0.38013267517089844, + "learning_rate": 2.009695815293548e-05, + "loss": 0.2912, + "step": 760 + }, + { + "epoch": 0.71, + "grad_norm": 0.4188981056213379, + "learning_rate": 1.9975344346647297e-05, + "loss": 0.3136, + "step": 761 + }, + { + "epoch": 0.71, + "grad_norm": 0.38251587748527527, + "learning_rate": 1.9854007704071064e-05, + "loss": 0.2962, + "step": 762 + }, + { + "epoch": 0.72, + "grad_norm": 0.32296597957611084, + "learning_rate": 1.973294934529007e-05, + "loss": 0.2121, + "step": 763 + }, + { + "epoch": 0.72, + "grad_norm": 0.3597976565361023, + "learning_rate": 1.961217038781863e-05, + "loss": 0.2609, + "step": 764 + }, + { + "epoch": 0.72, + "grad_norm": 0.44841957092285156, + "learning_rate": 1.9491671946591962e-05, + "loss": 0.3221, + "step": 765 + }, + { + "epoch": 0.72, + "grad_norm": 0.4504457414150238, + "learning_rate": 1.9371455133955675e-05, + "loss": 0.3516, + "step": 766 + }, + { + "epoch": 0.72, + "grad_norm": 0.33190175890922546, + "learning_rate": 1.925152105965567e-05, + "loss": 0.2645, + "step": 767 + }, + { + "epoch": 0.72, + "grad_norm": 0.33879125118255615, + "learning_rate": 1.9131870830827818e-05, + "loss": 0.2267, + "step": 768 + }, + { + "epoch": 0.72, + "grad_norm": 0.42365795373916626, + "learning_rate": 1.9012505551987765e-05, + "loss": 0.2736, + "step": 769 + }, + { + "epoch": 0.72, + "grad_norm": 0.37389981746673584, + "learning_rate": 1.8893426325020686e-05, + "loss": 0.2634, + "step": 770 + }, + { + "epoch": 0.72, + "grad_norm": 0.41957971453666687, + "learning_rate": 1.8774634249171185e-05, + "loss": 0.4077, + "step": 771 + }, + { + "epoch": 0.72, + "grad_norm": 0.3793964982032776, + "learning_rate": 1.8656130421033123e-05, + "loss": 0.2957, + "step": 772 + }, + { + "epoch": 0.73, + "grad_norm": 0.34412625432014465, + "learning_rate": 1.8537915934539486e-05, + "loss": 0.1677, + "step": 773 + }, + { + "epoch": 0.73, + "grad_norm": 0.3888733685016632, + "learning_rate": 1.841999188095224e-05, + "loss": 0.3111, + "step": 774 + }, + { + "epoch": 0.73, + "grad_norm": 0.4144531786441803, + "learning_rate": 1.830235934885237e-05, + "loss": 0.3651, + "step": 775 + }, + { + "epoch": 0.73, + "grad_norm": 0.3423333168029785, + "learning_rate": 1.818501942412975e-05, + "loss": 0.2843, + "step": 776 + }, + { + "epoch": 0.73, + "grad_norm": 0.3905755877494812, + "learning_rate": 1.8067973189973075e-05, + "loss": 0.3341, + "step": 777 + }, + { + "epoch": 0.73, + "grad_norm": 0.4139542579650879, + "learning_rate": 1.7951221726860045e-05, + "loss": 0.3393, + "step": 778 + }, + { + "epoch": 0.73, + "grad_norm": 0.4078083634376526, + "learning_rate": 1.7834766112547142e-05, + "loss": 0.3124, + "step": 779 + }, + { + "epoch": 0.73, + "grad_norm": 0.3850538730621338, + "learning_rate": 1.771860742205988e-05, + "loss": 0.3099, + "step": 780 + }, + { + "epoch": 0.73, + "grad_norm": 0.4191945493221283, + "learning_rate": 1.7602746727682796e-05, + "loss": 0.3639, + "step": 781 + }, + { + "epoch": 0.73, + "grad_norm": 0.3619956076145172, + "learning_rate": 1.7487185098949565e-05, + "loss": 0.2781, + "step": 782 + }, + { + "epoch": 0.73, + "grad_norm": 0.42559903860092163, + "learning_rate": 1.7371923602633078e-05, + "loss": 0.3629, + "step": 783 + }, + { + "epoch": 0.74, + "grad_norm": 0.43343913555145264, + "learning_rate": 1.725696330273575e-05, + "loss": 0.3785, + "step": 784 + }, + { + "epoch": 0.74, + "grad_norm": 0.39102593064308167, + "learning_rate": 1.7142305260479474e-05, + "loss": 0.3651, + "step": 785 + }, + { + "epoch": 0.74, + "grad_norm": 0.3920254111289978, + "learning_rate": 1.7027950534296027e-05, + "loss": 0.378, + "step": 786 + }, + { + "epoch": 0.74, + "grad_norm": 0.3507162928581238, + "learning_rate": 1.6913900179817144e-05, + "loss": 0.2284, + "step": 787 + }, + { + "epoch": 0.74, + "grad_norm": 0.385535329580307, + "learning_rate": 1.6800155249864896e-05, + "loss": 0.3308, + "step": 788 + }, + { + "epoch": 0.74, + "grad_norm": 0.37612423300743103, + "learning_rate": 1.668671679444192e-05, + "loss": 0.2848, + "step": 789 + }, + { + "epoch": 0.74, + "grad_norm": 0.4130227267742157, + "learning_rate": 1.6573585860721646e-05, + "loss": 0.3532, + "step": 790 + }, + { + "epoch": 0.74, + "grad_norm": 0.37785717844963074, + "learning_rate": 1.646076349303884e-05, + "loss": 0.2867, + "step": 791 + }, + { + "epoch": 0.74, + "grad_norm": 0.3971453309059143, + "learning_rate": 1.63482507328797e-05, + "loss": 0.2969, + "step": 792 + }, + { + "epoch": 0.74, + "grad_norm": 0.301319420337677, + "learning_rate": 1.6236048618872456e-05, + "loss": 0.2183, + "step": 793 + }, + { + "epoch": 0.74, + "grad_norm": 0.486827552318573, + "learning_rate": 1.6124158186777676e-05, + "loss": 0.3165, + "step": 794 + }, + { + "epoch": 0.75, + "grad_norm": 0.4306073784828186, + "learning_rate": 1.6012580469478743e-05, + "loss": 0.3385, + "step": 795 + }, + { + "epoch": 0.75, + "grad_norm": 0.3236197829246521, + "learning_rate": 1.5901316496972262e-05, + "loss": 0.1959, + "step": 796 + }, + { + "epoch": 0.75, + "grad_norm": 0.383569598197937, + "learning_rate": 1.5790367296358644e-05, + "loss": 0.2979, + "step": 797 + }, + { + "epoch": 0.75, + "grad_norm": 0.31395941972732544, + "learning_rate": 1.5679733891832556e-05, + "loss": 0.2087, + "step": 798 + }, + { + "epoch": 0.75, + "grad_norm": 0.3919505178928375, + "learning_rate": 1.55694173046735e-05, + "loss": 0.2757, + "step": 799 + }, + { + "epoch": 0.75, + "grad_norm": 0.3555279076099396, + "learning_rate": 1.5459418553236343e-05, + "loss": 0.2804, + "step": 800 + }, + { + "epoch": 0.75, + "grad_norm": 0.3989741802215576, + "learning_rate": 1.5349738652941968e-05, + "loss": 0.3274, + "step": 801 + }, + { + "epoch": 0.75, + "grad_norm": 0.38423246145248413, + "learning_rate": 1.5240378616267886e-05, + "loss": 0.2666, + "step": 802 + }, + { + "epoch": 0.75, + "grad_norm": 0.3812728524208069, + "learning_rate": 1.5131339452738863e-05, + "loss": 0.3723, + "step": 803 + }, + { + "epoch": 0.75, + "grad_norm": 0.37258511781692505, + "learning_rate": 1.5022622168917649e-05, + "loss": 0.2737, + "step": 804 + }, + { + "epoch": 0.76, + "grad_norm": 0.35156306624412537, + "learning_rate": 1.4914227768395595e-05, + "loss": 0.2617, + "step": 805 + }, + { + "epoch": 0.76, + "grad_norm": 0.37645992636680603, + "learning_rate": 1.4806157251783515e-05, + "loss": 0.303, + "step": 806 + }, + { + "epoch": 0.76, + "grad_norm": 0.3626248836517334, + "learning_rate": 1.4698411616702356e-05, + "loss": 0.253, + "step": 807 + }, + { + "epoch": 0.76, + "grad_norm": 0.37690359354019165, + "learning_rate": 1.4590991857774038e-05, + "loss": 0.3221, + "step": 808 + }, + { + "epoch": 0.76, + "grad_norm": 0.416437029838562, + "learning_rate": 1.4483898966612209e-05, + "loss": 0.3789, + "step": 809 + }, + { + "epoch": 0.76, + "grad_norm": 0.3635142743587494, + "learning_rate": 1.437713393181317e-05, + "loss": 0.2786, + "step": 810 + }, + { + "epoch": 0.76, + "grad_norm": 0.4015759527683258, + "learning_rate": 1.4270697738946704e-05, + "loss": 0.2582, + "step": 811 + }, + { + "epoch": 0.76, + "grad_norm": 0.3174329698085785, + "learning_rate": 1.4164591370547004e-05, + "loss": 0.2301, + "step": 812 + }, + { + "epoch": 0.76, + "grad_norm": 0.38481318950653076, + "learning_rate": 1.4058815806103542e-05, + "loss": 0.2948, + "step": 813 + }, + { + "epoch": 0.76, + "grad_norm": 0.4067227840423584, + "learning_rate": 1.3953372022052107e-05, + "loss": 0.3354, + "step": 814 + }, + { + "epoch": 0.76, + "grad_norm": 0.3462645411491394, + "learning_rate": 1.3848260991765755e-05, + "loss": 0.294, + "step": 815 + }, + { + "epoch": 0.77, + "grad_norm": 0.4777897000312805, + "learning_rate": 1.3743483685545811e-05, + "loss": 0.4079, + "step": 816 + }, + { + "epoch": 0.77, + "grad_norm": 0.4179569184780121, + "learning_rate": 1.363904107061294e-05, + "loss": 0.3217, + "step": 817 + }, + { + "epoch": 0.77, + "grad_norm": 0.4066832661628723, + "learning_rate": 1.3534934111098179e-05, + "loss": 0.337, + "step": 818 + }, + { + "epoch": 0.77, + "grad_norm": 0.3682103455066681, + "learning_rate": 1.3431163768034077e-05, + "loss": 0.3227, + "step": 819 + }, + { + "epoch": 0.77, + "grad_norm": 0.3473730683326721, + "learning_rate": 1.3327730999345817e-05, + "loss": 0.2303, + "step": 820 + }, + { + "epoch": 0.77, + "grad_norm": 0.35099923610687256, + "learning_rate": 1.3224636759842363e-05, + "loss": 0.2924, + "step": 821 + }, + { + "epoch": 0.77, + "grad_norm": 0.3207169473171234, + "learning_rate": 1.3121882001207614e-05, + "loss": 0.2208, + "step": 822 + }, + { + "epoch": 0.77, + "grad_norm": 0.3512991666793823, + "learning_rate": 1.3019467671991692e-05, + "loss": 0.2392, + "step": 823 + }, + { + "epoch": 0.77, + "grad_norm": 0.4191690981388092, + "learning_rate": 1.2917394717602121e-05, + "loss": 0.3623, + "step": 824 + }, + { + "epoch": 0.77, + "grad_norm": 0.34676387906074524, + "learning_rate": 1.2815664080295159e-05, + "loss": 0.252, + "step": 825 + }, + { + "epoch": 0.77, + "grad_norm": 0.34901660680770874, + "learning_rate": 1.2714276699166994e-05, + "loss": 0.2367, + "step": 826 + }, + { + "epoch": 0.78, + "grad_norm": 0.3845536708831787, + "learning_rate": 1.261323351014525e-05, + "loss": 0.2552, + "step": 827 + }, + { + "epoch": 0.78, + "grad_norm": 0.3202722370624542, + "learning_rate": 1.251253544598014e-05, + "loss": 0.2054, + "step": 828 + }, + { + "epoch": 0.78, + "grad_norm": 0.35958996415138245, + "learning_rate": 1.241218343623602e-05, + "loss": 0.3064, + "step": 829 + }, + { + "epoch": 0.78, + "grad_norm": 0.37509864568710327, + "learning_rate": 1.2312178407282749e-05, + "loss": 0.3027, + "step": 830 + }, + { + "epoch": 0.78, + "grad_norm": 0.3806156814098358, + "learning_rate": 1.2212521282287092e-05, + "loss": 0.305, + "step": 831 + }, + { + "epoch": 0.78, + "grad_norm": 0.43754497170448303, + "learning_rate": 1.2113212981204292e-05, + "loss": 0.3643, + "step": 832 + }, + { + "epoch": 0.78, + "grad_norm": 0.41764360666275024, + "learning_rate": 1.2014254420769466e-05, + "loss": 0.3289, + "step": 833 + }, + { + "epoch": 0.78, + "grad_norm": 0.3798913061618805, + "learning_rate": 1.1915646514489292e-05, + "loss": 0.2902, + "step": 834 + }, + { + "epoch": 0.78, + "grad_norm": 0.38450661301612854, + "learning_rate": 1.1817390172633403e-05, + "loss": 0.3193, + "step": 835 + }, + { + "epoch": 0.78, + "grad_norm": 0.3482416570186615, + "learning_rate": 1.1719486302226118e-05, + "loss": 0.243, + "step": 836 + }, + { + "epoch": 0.79, + "grad_norm": 0.5255814790725708, + "learning_rate": 1.1621935807038003e-05, + "loss": 0.4848, + "step": 837 + }, + { + "epoch": 0.79, + "grad_norm": 0.41135770082473755, + "learning_rate": 1.152473958757756e-05, + "loss": 0.3362, + "step": 838 + }, + { + "epoch": 0.79, + "grad_norm": 0.3493039309978485, + "learning_rate": 1.1427898541082855e-05, + "loss": 0.2379, + "step": 839 + }, + { + "epoch": 0.79, + "grad_norm": 0.4206957519054413, + "learning_rate": 1.133141356151336e-05, + "loss": 0.3234, + "step": 840 + }, + { + "epoch": 0.79, + "grad_norm": 0.4049489498138428, + "learning_rate": 1.123528553954154e-05, + "loss": 0.3105, + "step": 841 + }, + { + "epoch": 0.79, + "grad_norm": 0.3023333251476288, + "learning_rate": 1.1139515362544755e-05, + "loss": 0.2024, + "step": 842 + }, + { + "epoch": 0.79, + "grad_norm": 0.4536070227622986, + "learning_rate": 1.1044103914597031e-05, + "loss": 0.3822, + "step": 843 + }, + { + "epoch": 0.79, + "grad_norm": 0.5311452150344849, + "learning_rate": 1.0949052076460853e-05, + "loss": 0.4121, + "step": 844 + }, + { + "epoch": 0.79, + "grad_norm": 0.4352911412715912, + "learning_rate": 1.085436072557911e-05, + "loss": 0.3284, + "step": 845 + }, + { + "epoch": 0.79, + "grad_norm": 0.3684670031070709, + "learning_rate": 1.0760030736066951e-05, + "loss": 0.2503, + "step": 846 + }, + { + "epoch": 0.79, + "grad_norm": 0.38302332162857056, + "learning_rate": 1.0666062978703733e-05, + "loss": 0.2825, + "step": 847 + }, + { + "epoch": 0.8, + "grad_norm": 0.36409202218055725, + "learning_rate": 1.0572458320924943e-05, + "loss": 0.2636, + "step": 848 + }, + { + "epoch": 0.8, + "grad_norm": 0.44230782985687256, + "learning_rate": 1.0479217626814253e-05, + "loss": 0.3134, + "step": 849 + }, + { + "epoch": 0.8, + "grad_norm": 0.32951366901397705, + "learning_rate": 1.0386341757095502e-05, + "loss": 0.198, + "step": 850 + }, + { + "epoch": 0.8, + "grad_norm": 0.39737051725387573, + "learning_rate": 1.0293831569124774e-05, + "loss": 0.3287, + "step": 851 + }, + { + "epoch": 0.8, + "grad_norm": 0.40820056200027466, + "learning_rate": 1.0201687916882418e-05, + "loss": 0.349, + "step": 852 + }, + { + "epoch": 0.8, + "grad_norm": 0.3584843575954437, + "learning_rate": 1.0109911650965314e-05, + "loss": 0.2142, + "step": 853 + }, + { + "epoch": 0.8, + "grad_norm": 0.5511006116867065, + "learning_rate": 1.0018503618578818e-05, + "loss": 0.5335, + "step": 854 + }, + { + "epoch": 0.8, + "grad_norm": 0.40527984499931335, + "learning_rate": 9.927464663529118e-06, + "loss": 0.2826, + "step": 855 + }, + { + "epoch": 0.8, + "grad_norm": 0.4436098337173462, + "learning_rate": 9.836795626215356e-06, + "loss": 0.3708, + "step": 856 + }, + { + "epoch": 0.8, + "grad_norm": 0.3601125180721283, + "learning_rate": 9.746497343621857e-06, + "loss": 0.2946, + "step": 857 + }, + { + "epoch": 0.8, + "grad_norm": 0.3351758122444153, + "learning_rate": 9.656570649310481e-06, + "loss": 0.2355, + "step": 858 + }, + { + "epoch": 0.81, + "grad_norm": 0.3661154806613922, + "learning_rate": 9.567016373412857e-06, + "loss": 0.3093, + "step": 859 + }, + { + "epoch": 0.81, + "grad_norm": 0.4545595347881317, + "learning_rate": 9.477835342622759e-06, + "loss": 0.3889, + "step": 860 + }, + { + "epoch": 0.81, + "grad_norm": 0.4591468274593353, + "learning_rate": 9.389028380188419e-06, + "loss": 0.3956, + "step": 861 + }, + { + "epoch": 0.81, + "grad_norm": 0.3811791241168976, + "learning_rate": 9.300596305905013e-06, + "loss": 0.2942, + "step": 862 + }, + { + "epoch": 0.81, + "grad_norm": 0.3912218511104584, + "learning_rate": 9.212539936107029e-06, + "loss": 0.2768, + "step": 863 + }, + { + "epoch": 0.81, + "grad_norm": 0.3407474458217621, + "learning_rate": 9.124860083660769e-06, + "loss": 0.2694, + "step": 864 + }, + { + "epoch": 0.81, + "grad_norm": 0.3362795412540436, + "learning_rate": 9.037557557956766e-06, + "loss": 0.2052, + "step": 865 + }, + { + "epoch": 0.81, + "grad_norm": 0.4200963079929352, + "learning_rate": 8.950633164902467e-06, + "loss": 0.3292, + "step": 866 + }, + { + "epoch": 0.81, + "grad_norm": 0.40156418085098267, + "learning_rate": 8.86408770691462e-06, + "loss": 0.2826, + "step": 867 + }, + { + "epoch": 0.81, + "grad_norm": 0.3317236304283142, + "learning_rate": 8.777921982911996e-06, + "loss": 0.24, + "step": 868 + }, + { + "epoch": 0.82, + "grad_norm": 0.37346014380455017, + "learning_rate": 8.692136788307903e-06, + "loss": 0.2693, + "step": 869 + }, + { + "epoch": 0.82, + "grad_norm": 0.32652783393859863, + "learning_rate": 8.606732915003002e-06, + "loss": 0.2489, + "step": 870 + }, + { + "epoch": 0.82, + "grad_norm": 0.36683058738708496, + "learning_rate": 8.521711151377803e-06, + "loss": 0.2614, + "step": 871 + }, + { + "epoch": 0.82, + "grad_norm": 0.39762526750564575, + "learning_rate": 8.437072282285535e-06, + "loss": 0.3236, + "step": 872 + }, + { + "epoch": 0.82, + "grad_norm": 0.3494615852832794, + "learning_rate": 8.35281708904485e-06, + "loss": 0.2425, + "step": 873 + }, + { + "epoch": 0.82, + "grad_norm": 0.38728398084640503, + "learning_rate": 8.268946349432582e-06, + "loss": 0.3143, + "step": 874 + }, + { + "epoch": 0.82, + "grad_norm": 0.41258835792541504, + "learning_rate": 8.185460837676612e-06, + "loss": 0.2985, + "step": 875 + }, + { + "epoch": 0.82, + "grad_norm": 0.4381546974182129, + "learning_rate": 8.102361324448715e-06, + "loss": 0.2868, + "step": 876 + }, + { + "epoch": 0.82, + "grad_norm": 0.41193243861198425, + "learning_rate": 8.019648576857425e-06, + "loss": 0.3096, + "step": 877 + }, + { + "epoch": 0.82, + "grad_norm": 0.4166334271430969, + "learning_rate": 7.937323358440935e-06, + "loss": 0.3443, + "step": 878 + }, + { + "epoch": 0.82, + "grad_norm": 0.3498817980289459, + "learning_rate": 7.85538642916015e-06, + "loss": 0.2727, + "step": 879 + }, + { + "epoch": 0.83, + "grad_norm": 0.33879581093788147, + "learning_rate": 7.773838545391515e-06, + "loss": 0.2048, + "step": 880 + }, + { + "epoch": 0.83, + "grad_norm": 0.3829089403152466, + "learning_rate": 7.692680459920188e-06, + "loss": 0.2927, + "step": 881 + }, + { + "epoch": 0.83, + "grad_norm": 0.3402465879917145, + "learning_rate": 7.6119129219329395e-06, + "loss": 0.2253, + "step": 882 + }, + { + "epoch": 0.83, + "grad_norm": 0.424730122089386, + "learning_rate": 7.5315366770114195e-06, + "loss": 0.335, + "step": 883 + }, + { + "epoch": 0.83, + "grad_norm": 0.4533472955226898, + "learning_rate": 7.4515524671250725e-06, + "loss": 0.2951, + "step": 884 + }, + { + "epoch": 0.83, + "grad_norm": 0.38933825492858887, + "learning_rate": 7.371961030624452e-06, + "loss": 0.257, + "step": 885 + }, + { + "epoch": 0.83, + "grad_norm": 0.33994266390800476, + "learning_rate": 7.292763102234329e-06, + "loss": 0.2736, + "step": 886 + }, + { + "epoch": 0.83, + "grad_norm": 0.356049507856369, + "learning_rate": 7.213959413046894e-06, + "loss": 0.2118, + "step": 887 + }, + { + "epoch": 0.83, + "grad_norm": 0.35656169056892395, + "learning_rate": 7.135550690515052e-06, + "loss": 0.2757, + "step": 888 + }, + { + "epoch": 0.83, + "grad_norm": 0.3579034209251404, + "learning_rate": 7.057537658445701e-06, + "loss": 0.2367, + "step": 889 + }, + { + "epoch": 0.83, + "grad_norm": 0.2979848086833954, + "learning_rate": 6.979921036993042e-06, + "loss": 0.1987, + "step": 890 + }, + { + "epoch": 0.84, + "grad_norm": 0.42152541875839233, + "learning_rate": 6.902701542651874e-06, + "loss": 0.2744, + "step": 891 + }, + { + "epoch": 0.84, + "grad_norm": 0.3124920725822449, + "learning_rate": 6.825879888251135e-06, + "loss": 0.1845, + "step": 892 + }, + { + "epoch": 0.84, + "grad_norm": 0.3792358934879303, + "learning_rate": 6.749456782947122e-06, + "loss": 0.2557, + "step": 893 + }, + { + "epoch": 0.84, + "grad_norm": 0.39623990654945374, + "learning_rate": 6.6734329322171165e-06, + "loss": 0.3049, + "step": 894 + }, + { + "epoch": 0.84, + "grad_norm": 0.3868322968482971, + "learning_rate": 6.597809037852726e-06, + "loss": 0.3433, + "step": 895 + }, + { + "epoch": 0.84, + "grad_norm": 0.3803116977214813, + "learning_rate": 6.522585797953579e-06, + "loss": 0.2398, + "step": 896 + }, + { + "epoch": 0.84, + "grad_norm": 0.4614750146865845, + "learning_rate": 6.447763906920679e-06, + "loss": 0.4015, + "step": 897 + }, + { + "epoch": 0.84, + "grad_norm": 0.4475559592247009, + "learning_rate": 6.373344055450165e-06, + "loss": 0.336, + "step": 898 + }, + { + "epoch": 0.84, + "grad_norm": 0.366828054189682, + "learning_rate": 6.2993269305268495e-06, + "loss": 0.2346, + "step": 899 + }, + { + "epoch": 0.84, + "grad_norm": 0.41806674003601074, + "learning_rate": 6.2257132154178665e-06, + "loss": 0.3423, + "step": 900 + }, + { + "epoch": 0.85, + "grad_norm": 0.4186936020851135, + "learning_rate": 6.152503589666425e-06, + "loss": 0.3321, + "step": 901 + }, + { + "epoch": 0.85, + "grad_norm": 0.4075973331928253, + "learning_rate": 6.079698729085498e-06, + "loss": 0.306, + "step": 902 + }, + { + "epoch": 0.85, + "grad_norm": 0.3074958920478821, + "learning_rate": 6.007299305751585e-06, + "loss": 0.185, + "step": 903 + }, + { + "epoch": 0.85, + "grad_norm": 0.3523412048816681, + "learning_rate": 5.935305987998496e-06, + "loss": 0.2477, + "step": 904 + }, + { + "epoch": 0.85, + "grad_norm": 0.3624049425125122, + "learning_rate": 5.863719440411214e-06, + "loss": 0.2392, + "step": 905 + }, + { + "epoch": 0.85, + "grad_norm": 0.3795306086540222, + "learning_rate": 5.792540323819751e-06, + "loss": 0.3398, + "step": 906 + }, + { + "epoch": 0.85, + "grad_norm": 0.3727322220802307, + "learning_rate": 5.721769295293034e-06, + "loss": 0.2522, + "step": 907 + }, + { + "epoch": 0.85, + "grad_norm": 0.44352471828460693, + "learning_rate": 5.651407008132809e-06, + "loss": 0.3619, + "step": 908 + }, + { + "epoch": 0.85, + "grad_norm": 0.3567368984222412, + "learning_rate": 5.5814541118677284e-06, + "loss": 0.2106, + "step": 909 + }, + { + "epoch": 0.85, + "grad_norm": 0.4105832874774933, + "learning_rate": 5.5119112522471924e-06, + "loss": 0.3556, + "step": 910 + }, + { + "epoch": 0.85, + "grad_norm": 0.37798091769218445, + "learning_rate": 5.442779071235516e-06, + "loss": 0.2559, + "step": 911 + }, + { + "epoch": 0.86, + "grad_norm": 0.3410490155220032, + "learning_rate": 5.374058207005944e-06, + "loss": 0.2455, + "step": 912 + }, + { + "epoch": 0.86, + "grad_norm": 0.378336101770401, + "learning_rate": 5.305749293934764e-06, + "loss": 0.2541, + "step": 913 + }, + { + "epoch": 0.86, + "grad_norm": 0.4319799840450287, + "learning_rate": 5.237852962595469e-06, + "loss": 0.3012, + "step": 914 + }, + { + "epoch": 0.86, + "grad_norm": 0.33922573924064636, + "learning_rate": 5.170369839752925e-06, + "loss": 0.2038, + "step": 915 + }, + { + "epoch": 0.86, + "grad_norm": 0.3704172372817993, + "learning_rate": 5.1033005483575925e-06, + "loss": 0.2578, + "step": 916 + }, + { + "epoch": 0.86, + "grad_norm": 0.41262710094451904, + "learning_rate": 5.036645707539745e-06, + "loss": 0.2624, + "step": 917 + }, + { + "epoch": 0.86, + "grad_norm": 0.40159058570861816, + "learning_rate": 4.9704059326038055e-06, + "loss": 0.2647, + "step": 918 + }, + { + "epoch": 0.86, + "grad_norm": 0.38587838411331177, + "learning_rate": 4.90458183502262e-06, + "loss": 0.2894, + "step": 919 + }, + { + "epoch": 0.86, + "grad_norm": 0.4216020107269287, + "learning_rate": 4.839174022431858e-06, + "loss": 0.3768, + "step": 920 + }, + { + "epoch": 0.86, + "grad_norm": 0.41159510612487793, + "learning_rate": 4.7741830986243356e-06, + "loss": 0.3103, + "step": 921 + }, + { + "epoch": 0.86, + "grad_norm": 0.3718778192996979, + "learning_rate": 4.709609663544534e-06, + "loss": 0.2877, + "step": 922 + }, + { + "epoch": 0.87, + "grad_norm": 0.3785461187362671, + "learning_rate": 4.645454313282965e-06, + "loss": 0.2953, + "step": 923 + }, + { + "epoch": 0.87, + "grad_norm": 0.4794873893260956, + "learning_rate": 4.581717640070743e-06, + "loss": 0.3446, + "step": 924 + }, + { + "epoch": 0.87, + "grad_norm": 0.40924200415611267, + "learning_rate": 4.5184002322740785e-06, + "loss": 0.3418, + "step": 925 + }, + { + "epoch": 0.87, + "grad_norm": 0.3224068880081177, + "learning_rate": 4.455502674388873e-06, + "loss": 0.2382, + "step": 926 + }, + { + "epoch": 0.87, + "grad_norm": 0.46128109097480774, + "learning_rate": 4.3930255470352736e-06, + "loss": 0.35, + "step": 927 + }, + { + "epoch": 0.87, + "grad_norm": 0.4323473572731018, + "learning_rate": 4.330969426952375e-06, + "loss": 0.3307, + "step": 928 + }, + { + "epoch": 0.87, + "grad_norm": 0.40308627486228943, + "learning_rate": 4.269334886992876e-06, + "loss": 0.2882, + "step": 929 + }, + { + "epoch": 0.87, + "grad_norm": 0.5716599822044373, + "learning_rate": 4.208122496117744e-06, + "loss": 0.2227, + "step": 930 + }, + { + "epoch": 0.87, + "grad_norm": 0.38822227716445923, + "learning_rate": 4.147332819391048e-06, + "loss": 0.2759, + "step": 931 + }, + { + "epoch": 0.87, + "grad_norm": 0.40629640221595764, + "learning_rate": 4.0869664179746694e-06, + "loss": 0.2665, + "step": 932 + }, + { + "epoch": 0.88, + "grad_norm": 0.35057899355888367, + "learning_rate": 4.027023849123157e-06, + "loss": 0.2333, + "step": 933 + }, + { + "epoch": 0.88, + "grad_norm": 0.393621563911438, + "learning_rate": 3.967505666178556e-06, + "loss": 0.2696, + "step": 934 + }, + { + "epoch": 0.88, + "grad_norm": 0.36038023233413696, + "learning_rate": 3.908412418565371e-06, + "loss": 0.2546, + "step": 935 + }, + { + "epoch": 0.88, + "grad_norm": 0.34687674045562744, + "learning_rate": 3.849744651785381e-06, + "loss": 0.2528, + "step": 936 + }, + { + "epoch": 0.88, + "grad_norm": 0.3856310546398163, + "learning_rate": 3.7915029074126974e-06, + "loss": 0.3603, + "step": 937 + }, + { + "epoch": 0.88, + "grad_norm": 0.5023623704910278, + "learning_rate": 3.7336877230887246e-06, + "loss": 0.4284, + "step": 938 + }, + { + "epoch": 0.88, + "grad_norm": 0.4204338788986206, + "learning_rate": 3.676299632517216e-06, + "loss": 0.4169, + "step": 939 + }, + { + "epoch": 0.88, + "grad_norm": 0.5023021101951599, + "learning_rate": 3.619339165459307e-06, + "loss": 0.3397, + "step": 940 + }, + { + "epoch": 0.88, + "grad_norm": 0.341836154460907, + "learning_rate": 3.562806847728678e-06, + "loss": 0.2697, + "step": 941 + }, + { + "epoch": 0.88, + "grad_norm": 0.40847527980804443, + "learning_rate": 3.5067032011866783e-06, + "loss": 0.3658, + "step": 942 + }, + { + "epoch": 0.88, + "grad_norm": 0.3342381417751312, + "learning_rate": 3.4510287437374835e-06, + "loss": 0.1656, + "step": 943 + }, + { + "epoch": 0.89, + "grad_norm": 0.3953072428703308, + "learning_rate": 3.3957839893233536e-06, + "loss": 0.2481, + "step": 944 + }, + { + "epoch": 0.89, + "grad_norm": 0.3248569667339325, + "learning_rate": 3.340969447919873e-06, + "loss": 0.2469, + "step": 945 + }, + { + "epoch": 0.89, + "grad_norm": 0.3305179178714752, + "learning_rate": 3.286585625531241e-06, + "loss": 0.2154, + "step": 946 + }, + { + "epoch": 0.89, + "grad_norm": 0.33991092443466187, + "learning_rate": 3.232633024185583e-06, + "loss": 0.2226, + "step": 947 + }, + { + "epoch": 0.89, + "grad_norm": 0.30957362055778503, + "learning_rate": 3.1791121419303794e-06, + "loss": 0.2185, + "step": 948 + }, + { + "epoch": 0.89, + "grad_norm": 0.4127846360206604, + "learning_rate": 3.1260234728277717e-06, + "loss": 0.2835, + "step": 949 + }, + { + "epoch": 0.89, + "grad_norm": 0.39826324582099915, + "learning_rate": 3.0733675069500865e-06, + "loss": 0.2953, + "step": 950 + }, + { + "epoch": 0.89, + "grad_norm": 0.4400089979171753, + "learning_rate": 3.0211447303752695e-06, + "loss": 0.296, + "step": 951 + }, + { + "epoch": 0.89, + "grad_norm": 0.38311436772346497, + "learning_rate": 2.9693556251824185e-06, + "loss": 0.29, + "step": 952 + }, + { + "epoch": 0.89, + "grad_norm": 0.46116960048675537, + "learning_rate": 2.9180006694472906e-06, + "loss": 0.4249, + "step": 953 + }, + { + "epoch": 0.89, + "grad_norm": 0.40703484416007996, + "learning_rate": 2.867080337237954e-06, + "loss": 0.3635, + "step": 954 + }, + { + "epoch": 0.9, + "grad_norm": 0.42896768450737, + "learning_rate": 2.8165950986103805e-06, + "loss": 0.3774, + "step": 955 + }, + { + "epoch": 0.9, + "grad_norm": 0.4287061393260956, + "learning_rate": 2.7665454196040664e-06, + "loss": 0.335, + "step": 956 + }, + { + "epoch": 0.9, + "grad_norm": 0.4173295199871063, + "learning_rate": 2.716931762237801e-06, + "loss": 0.2793, + "step": 957 + }, + { + "epoch": 0.9, + "grad_norm": 0.4245130121707916, + "learning_rate": 2.667754584505372e-06, + "loss": 0.3427, + "step": 958 + }, + { + "epoch": 0.9, + "grad_norm": 0.4839573800563812, + "learning_rate": 2.6190143403713174e-06, + "loss": 0.3515, + "step": 959 + }, + { + "epoch": 0.9, + "grad_norm": 0.434766948223114, + "learning_rate": 2.5707114797667465e-06, + "loss": 0.2847, + "step": 960 + }, + { + "epoch": 0.9, + "grad_norm": 0.4157213568687439, + "learning_rate": 2.522846448585231e-06, + "loss": 0.352, + "step": 961 + }, + { + "epoch": 0.9, + "grad_norm": 0.40966856479644775, + "learning_rate": 2.4754196886785986e-06, + "loss": 0.27, + "step": 962 + }, + { + "epoch": 0.9, + "grad_norm": 0.3235345184803009, + "learning_rate": 2.4284316378529404e-06, + "loss": 0.2657, + "step": 963 + }, + { + "epoch": 0.9, + "grad_norm": 0.3809812366962433, + "learning_rate": 2.3818827298645207e-06, + "loss": 0.2526, + "step": 964 + }, + { + "epoch": 0.91, + "grad_norm": 0.33908405900001526, + "learning_rate": 2.335773394415802e-06, + "loss": 0.2192, + "step": 965 + }, + { + "epoch": 0.91, + "grad_norm": 0.3907183110713959, + "learning_rate": 2.2901040571514322e-06, + "loss": 0.2976, + "step": 966 + }, + { + "epoch": 0.91, + "grad_norm": 0.4197416305541992, + "learning_rate": 2.2448751396543787e-06, + "loss": 0.2952, + "step": 967 + }, + { + "epoch": 0.91, + "grad_norm": 0.3461250364780426, + "learning_rate": 2.2000870594419908e-06, + "loss": 0.1856, + "step": 968 + }, + { + "epoch": 0.91, + "grad_norm": 0.36310264468193054, + "learning_rate": 2.155740229962161e-06, + "loss": 0.261, + "step": 969 + }, + { + "epoch": 0.91, + "grad_norm": 0.349942147731781, + "learning_rate": 2.1118350605894955e-06, + "loss": 0.3009, + "step": 970 + }, + { + "epoch": 0.91, + "grad_norm": 0.35959210991859436, + "learning_rate": 2.068371956621562e-06, + "loss": 0.2599, + "step": 971 + }, + { + "epoch": 0.91, + "grad_norm": 0.39764103293418884, + "learning_rate": 2.0253513192751373e-06, + "loss": 0.3175, + "step": 972 + }, + { + "epoch": 0.91, + "grad_norm": 0.3876802623271942, + "learning_rate": 1.982773545682459e-06, + "loss": 0.298, + "step": 973 + }, + { + "epoch": 0.91, + "grad_norm": 0.34669044613838196, + "learning_rate": 1.9406390288876586e-06, + "loss": 0.2254, + "step": 974 + }, + { + "epoch": 0.91, + "grad_norm": 0.3759954273700714, + "learning_rate": 1.8989481578430223e-06, + "loss": 0.2477, + "step": 975 + }, + { + "epoch": 0.92, + "grad_norm": 0.42282500863075256, + "learning_rate": 1.8577013174054857e-06, + "loss": 0.3294, + "step": 976 + }, + { + "epoch": 0.92, + "grad_norm": 0.40045222640037537, + "learning_rate": 1.8168988883330185e-06, + "loss": 0.3472, + "step": 977 + }, + { + "epoch": 0.92, + "grad_norm": 0.4552788734436035, + "learning_rate": 1.7765412472811771e-06, + "loss": 0.4431, + "step": 978 + }, + { + "epoch": 0.92, + "grad_norm": 0.3732775151729584, + "learning_rate": 1.7366287667995417e-06, + "loss": 0.2226, + "step": 979 + }, + { + "epoch": 0.92, + "grad_norm": 0.3581148386001587, + "learning_rate": 1.697161815328363e-06, + "loss": 0.2576, + "step": 980 + }, + { + "epoch": 0.92, + "grad_norm": 0.3295496106147766, + "learning_rate": 1.6581407571951092e-06, + "loss": 0.2751, + "step": 981 + }, + { + "epoch": 0.92, + "grad_norm": 0.3627816438674927, + "learning_rate": 1.6195659526111185e-06, + "loss": 0.2633, + "step": 982 + }, + { + "epoch": 0.92, + "grad_norm": 0.42608195543289185, + "learning_rate": 1.5814377576682527e-06, + "loss": 0.3034, + "step": 983 + }, + { + "epoch": 0.92, + "grad_norm": 0.3478807508945465, + "learning_rate": 1.5437565243356656e-06, + "loss": 0.1997, + "step": 984 + }, + { + "epoch": 0.92, + "grad_norm": 0.41831809282302856, + "learning_rate": 1.5065226004564893e-06, + "loss": 0.3793, + "step": 985 + }, + { + "epoch": 0.92, + "grad_norm": 0.3832615911960602, + "learning_rate": 1.4697363297446477e-06, + "loss": 0.2761, + "step": 986 + }, + { + "epoch": 0.93, + "grad_norm": 0.4452967941761017, + "learning_rate": 1.4333980517817203e-06, + "loss": 0.3798, + "step": 987 + }, + { + "epoch": 0.93, + "grad_norm": 0.3327552080154419, + "learning_rate": 1.3975081020137392e-06, + "loss": 0.204, + "step": 988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5041511058807373, + "learning_rate": 1.3620668117481472e-06, + "loss": 0.3522, + "step": 989 + }, + { + "epoch": 0.93, + "grad_norm": 0.37212684750556946, + "learning_rate": 1.3270745081506997e-06, + "loss": 0.3307, + "step": 990 + }, + { + "epoch": 0.93, + "grad_norm": 0.5050026774406433, + "learning_rate": 1.292531514242501e-06, + "loss": 0.4735, + "step": 991 + }, + { + "epoch": 0.93, + "grad_norm": 0.4048091471195221, + "learning_rate": 1.2584381488969454e-06, + "loss": 0.2996, + "step": 992 + }, + { + "epoch": 0.93, + "grad_norm": 0.3142896890640259, + "learning_rate": 1.2247947268368364e-06, + "loss": 0.2256, + "step": 993 + }, + { + "epoch": 0.93, + "grad_norm": 0.37308236956596375, + "learning_rate": 1.191601558631461e-06, + "loss": 0.26, + "step": 994 + }, + { + "epoch": 0.93, + "grad_norm": 0.35677245259284973, + "learning_rate": 1.1588589506937198e-06, + "loss": 0.2756, + "step": 995 + }, + { + "epoch": 0.93, + "grad_norm": 0.5449343919754028, + "learning_rate": 1.126567205277279e-06, + "loss": 0.3556, + "step": 996 + }, + { + "epoch": 0.94, + "grad_norm": 0.3877151310443878, + "learning_rate": 1.094726620473835e-06, + "loss": 0.2792, + "step": 997 + }, + { + "epoch": 0.94, + "grad_norm": 0.3979206085205078, + "learning_rate": 1.0633374902103088e-06, + "loss": 0.3125, + "step": 998 + }, + { + "epoch": 0.94, + "grad_norm": 0.41529783606529236, + "learning_rate": 1.0324001042461395e-06, + "loss": 0.2613, + "step": 999 + }, + { + "epoch": 0.94, + "grad_norm": 0.4048457145690918, + "learning_rate": 1.0019147481706625e-06, + "loss": 0.3312, + "step": 1000 + }, + { + "epoch": 0.94, + "grad_norm": 0.4872767925262451, + "learning_rate": 9.718817034003901e-07, + "loss": 0.4226, + "step": 1001 + }, + { + "epoch": 0.94, + "grad_norm": 0.32919400930404663, + "learning_rate": 9.423012471764914e-07, + "loss": 0.1977, + "step": 1002 + }, + { + "epoch": 0.94, + "grad_norm": 0.342472106218338, + "learning_rate": 9.131736525621603e-07, + "loss": 0.2218, + "step": 1003 + }, + { + "epoch": 0.94, + "grad_norm": 0.4028159976005554, + "learning_rate": 8.844991884401854e-07, + "loss": 0.286, + "step": 1004 + }, + { + "epoch": 0.94, + "grad_norm": 0.3890497386455536, + "learning_rate": 8.56278119510362e-07, + "loss": 0.274, + "step": 1005 + }, + { + "epoch": 0.94, + "grad_norm": 0.4593218266963959, + "learning_rate": 8.285107062871333e-07, + "loss": 0.3539, + "step": 1006 + }, + { + "epoch": 0.94, + "grad_norm": 0.35354185104370117, + "learning_rate": 8.011972050971483e-07, + "loss": 0.272, + "step": 1007 + }, + { + "epoch": 0.95, + "grad_norm": 0.40247949957847595, + "learning_rate": 7.74337868076902e-07, + "loss": 0.3272, + "step": 1008 + }, + { + "epoch": 0.95, + "grad_norm": 0.4523579478263855, + "learning_rate": 7.479329431703985e-07, + "loss": 0.4128, + "step": 1009 + }, + { + "epoch": 0.95, + "grad_norm": 0.47494736313819885, + "learning_rate": 7.21982674126881e-07, + "loss": 0.3574, + "step": 1010 + }, + { + "epoch": 0.95, + "grad_norm": 0.35004284977912903, + "learning_rate": 6.964873004985717e-07, + "loss": 0.2341, + "step": 1011 + }, + { + "epoch": 0.95, + "grad_norm": 0.47898706793785095, + "learning_rate": 6.714470576384579e-07, + "loss": 0.3763, + "step": 1012 + }, + { + "epoch": 0.95, + "grad_norm": 0.3801863491535187, + "learning_rate": 6.468621766981154e-07, + "loss": 0.2777, + "step": 1013 + }, + { + "epoch": 0.95, + "grad_norm": 0.3755449056625366, + "learning_rate": 6.227328846255931e-07, + "loss": 0.2736, + "step": 1014 + }, + { + "epoch": 0.95, + "grad_norm": 0.3443692922592163, + "learning_rate": 5.990594041632991e-07, + "loss": 0.2544, + "step": 1015 + }, + { + "epoch": 0.95, + "grad_norm": 0.36282068490982056, + "learning_rate": 5.758419538459459e-07, + "loss": 0.2778, + "step": 1016 + }, + { + "epoch": 0.95, + "grad_norm": 0.45862850546836853, + "learning_rate": 5.530807479985633e-07, + "loss": 0.3454, + "step": 1017 + }, + { + "epoch": 0.95, + "grad_norm": 0.3214671313762665, + "learning_rate": 5.307759967344672e-07, + "loss": 0.2648, + "step": 1018 + }, + { + "epoch": 0.96, + "grad_norm": 0.4228048622608185, + "learning_rate": 5.089279059533658e-07, + "loss": 0.3266, + "step": 1019 + }, + { + "epoch": 0.96, + "grad_norm": 0.43997371196746826, + "learning_rate": 4.87536677339423e-07, + "loss": 0.3662, + "step": 1020 + }, + { + "epoch": 0.96, + "grad_norm": 0.3612721264362335, + "learning_rate": 4.666025083594483e-07, + "loss": 0.2639, + "step": 1021 + }, + { + "epoch": 0.96, + "grad_norm": 0.4024774432182312, + "learning_rate": 4.461255922609986e-07, + "loss": 0.3015, + "step": 1022 + }, + { + "epoch": 0.96, + "grad_norm": 0.4298073649406433, + "learning_rate": 4.261061180706627e-07, + "loss": 0.3631, + "step": 1023 + }, + { + "epoch": 0.96, + "grad_norm": 0.3973303735256195, + "learning_rate": 4.065442705922906e-07, + "loss": 0.3697, + "step": 1024 + }, + { + "epoch": 0.96, + "grad_norm": 0.4075637757778168, + "learning_rate": 3.8744023040528374e-07, + "loss": 0.3019, + "step": 1025 + }, + { + "epoch": 0.96, + "grad_norm": 0.4226757884025574, + "learning_rate": 3.687941738629186e-07, + "loss": 0.3541, + "step": 1026 + }, + { + "epoch": 0.96, + "grad_norm": 0.3980466425418854, + "learning_rate": 3.5060627309074224e-07, + "loss": 0.2771, + "step": 1027 + }, + { + "epoch": 0.96, + "grad_norm": 0.3921937048435211, + "learning_rate": 3.3287669598497383e-07, + "loss": 0.2716, + "step": 1028 + }, + { + "epoch": 0.97, + "grad_norm": 0.43594250082969666, + "learning_rate": 3.156056062109503e-07, + "loss": 0.3511, + "step": 1029 + }, + { + "epoch": 0.97, + "grad_norm": 0.33162230253219604, + "learning_rate": 2.987931632016272e-07, + "loss": 0.2191, + "step": 1030 + }, + { + "epoch": 0.97, + "grad_norm": 0.4063693881034851, + "learning_rate": 2.824395221560805e-07, + "loss": 0.2922, + "step": 1031 + }, + { + "epoch": 0.97, + "grad_norm": 0.3888567090034485, + "learning_rate": 2.665448340381016e-07, + "loss": 0.2674, + "step": 1032 + }, + { + "epoch": 0.97, + "grad_norm": 0.3596007525920868, + "learning_rate": 2.511092455747932e-07, + "loss": 0.2949, + "step": 1033 + }, + { + "epoch": 0.97, + "grad_norm": 0.3734399676322937, + "learning_rate": 2.361328992552314e-07, + "loss": 0.2538, + "step": 1034 + }, + { + "epoch": 0.97, + "grad_norm": 0.48918384313583374, + "learning_rate": 2.2161593332910013e-07, + "loss": 0.3673, + "step": 1035 + }, + { + "epoch": 0.97, + "grad_norm": 0.43824249505996704, + "learning_rate": 2.0755848180547543e-07, + "loss": 0.3915, + "step": 1036 + }, + { + "epoch": 0.97, + "grad_norm": 0.3258342146873474, + "learning_rate": 1.9396067445155986e-07, + "loss": 0.2233, + "step": 1037 + }, + { + "epoch": 0.97, + "grad_norm": 0.4354231357574463, + "learning_rate": 1.8082263679148337e-07, + "loss": 0.3945, + "step": 1038 + }, + { + "epoch": 0.97, + "grad_norm": 0.41941532492637634, + "learning_rate": 1.681444901051432e-07, + "loss": 0.3513, + "step": 1039 + }, + { + "epoch": 0.98, + "grad_norm": 0.42694929242134094, + "learning_rate": 1.5592635142709367e-07, + "loss": 0.2924, + "step": 1040 + }, + { + "epoch": 0.98, + "grad_norm": 0.377742201089859, + "learning_rate": 1.4416833354546356e-07, + "loss": 0.2663, + "step": 1041 + }, + { + "epoch": 0.98, + "grad_norm": 0.39117175340652466, + "learning_rate": 1.328705450009071e-07, + "loss": 0.2678, + "step": 1042 + }, + { + "epoch": 0.98, + "grad_norm": 0.44528332352638245, + "learning_rate": 1.2203309008561592e-07, + "loss": 0.3609, + "step": 1043 + }, + { + "epoch": 0.98, + "grad_norm": 0.464968740940094, + "learning_rate": 1.1165606884234181e-07, + "loss": 0.3227, + "step": 1044 + }, + { + "epoch": 0.98, + "grad_norm": 0.4331037700176239, + "learning_rate": 1.0173957706348659e-07, + "loss": 0.3268, + "step": 1045 + }, + { + "epoch": 0.98, + "grad_norm": 0.40670907497406006, + "learning_rate": 9.228370629019711e-08, + "loss": 0.2878, + "step": 1046 + }, + { + "epoch": 0.98, + "grad_norm": 0.4464401602745056, + "learning_rate": 8.328854381154938e-08, + "loss": 0.3512, + "step": 1047 + }, + { + "epoch": 0.98, + "grad_norm": 0.36872437596321106, + "learning_rate": 7.475417266371576e-08, + "loss": 0.2733, + "step": 1048 + }, + { + "epoch": 0.98, + "grad_norm": 0.35684168338775635, + "learning_rate": 6.668067162921566e-08, + "loss": 0.263, + "step": 1049 + }, + { + "epoch": 0.98, + "grad_norm": 0.41713425517082214, + "learning_rate": 5.906811523618272e-08, + "loss": 0.2968, + "step": 1050 + }, + { + "epoch": 0.99, + "grad_norm": 0.4490773379802704, + "learning_rate": 5.191657375767656e-08, + "loss": 0.3842, + "step": 1051 + }, + { + "epoch": 0.99, + "grad_norm": 0.39667680859565735, + "learning_rate": 4.522611321103321e-08, + "loss": 0.2798, + "step": 1052 + }, + { + "epoch": 0.99, + "grad_norm": 0.34880930185317993, + "learning_rate": 3.8996795357254535e-08, + "loss": 0.2241, + "step": 1053 + }, + { + "epoch": 0.99, + "grad_norm": 0.5244704484939575, + "learning_rate": 3.322867770044202e-08, + "loss": 0.378, + "step": 1054 + }, + { + "epoch": 0.99, + "grad_norm": 0.4561891555786133, + "learning_rate": 2.792181348726941e-08, + "loss": 0.3333, + "step": 1055 + }, + { + "epoch": 0.99, + "grad_norm": 0.422680526971817, + "learning_rate": 2.3076251706477536e-08, + "loss": 0.3322, + "step": 1056 + }, + { + "epoch": 0.99, + "grad_norm": 0.45002683997154236, + "learning_rate": 1.869203708843581e-08, + "loss": 0.3724, + "step": 1057 + }, + { + "epoch": 0.99, + "grad_norm": 0.3882070779800415, + "learning_rate": 1.476921010471477e-08, + "loss": 0.3165, + "step": 1058 + }, + { + "epoch": 0.99, + "grad_norm": 0.42735832929611206, + "learning_rate": 1.1307806967741919e-08, + "loss": 0.2957, + "step": 1059 + }, + { + "epoch": 0.99, + "grad_norm": 0.3215475380420685, + "learning_rate": 8.307859630429793e-09, + "loss": 0.2327, + "step": 1060 + }, + { + "epoch": 1.0, + "grad_norm": 0.4432857632637024, + "learning_rate": 5.7693957858984125e-09, + "loss": 0.3602, + "step": 1061 + }, + { + "epoch": 1.0, + "grad_norm": 0.41549310088157654, + "learning_rate": 3.6924388672254785e-09, + "loss": 0.3299, + "step": 1062 + }, + { + "epoch": 1.0, + "grad_norm": 0.3883119523525238, + "learning_rate": 2.0770080472298783e-09, + "loss": 0.2913, + "step": 1063 + }, + { + "epoch": 1.0, + "grad_norm": 0.5132927298545837, + "learning_rate": 9.231182382773984e-10, + "loss": 0.3904, + "step": 1064 + }, + { + "epoch": 1.0, + "grad_norm": 0.3842669427394867, + "learning_rate": 2.307800921641512e-10, + "loss": 0.2883, + "step": 1065 + }, + { + "epoch": 1.0, + "grad_norm": 0.3865910768508911, + "learning_rate": 0.0, + "loss": 0.276, + "step": 1066 + }, + { + "epoch": 1.0, + "step": 1066, + "total_flos": 1.171159722670162e+17, + "train_loss": 0.31279155516937573, + "train_runtime": 54620.1987, + "train_samples_per_second": 0.625, + "train_steps_per_second": 0.02 + } + ], + "logging_steps": 1.0, + "max_steps": 1066, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 1.171159722670162e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}