diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/README.md b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6525dbd33aec747ed5887c28753f3457677d46b8 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-si", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 256, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "q_proj", + "gate_proj", + "k_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_model.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..008415eecaa4fae968d520668761c1fda58e12b8 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ac9ca565ca2ade557fc561b6b90c95e6f09a458ad7d19ec4aecbf3e54832737 +size 692127130 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/config.json new file mode 100644 index 0000000000000000000000000000000000000000..c9865019d718e38a92edb33ebb0d1aabd0d6a120 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/config.json @@ -0,0 +1,59 @@ +{ + "_name_or_path": "/data/wiedmann/hub/models--lmms-lab--llava-onevision-qwen2-7b-si", + "add_faster_video": false, + "add_time_instruction": false, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": false, + "hidden_act": "silu", + "hidden_size": 3584, + "image_aspect_ratio": "square", + "image_crop_resolution": null, + "image_grid_pinpoints": null, + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "no_token", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": null, + "mm_tunable_parts": "mm_mlp_adapter", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava_qwen", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 8192, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_tower_pretrained": null, + "vocab_size": 152064 +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/generation_config.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/non_lora_trainables.bin b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..75d8bcd686646adc9f047cf719dd1edba5c86c56 --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414ae8b21d12eeb164ab5d091d6835a24cc2f9317ed362396a627f884a3377bf +size 33964208 diff --git a/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/trainer_state.json b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1303918eb11b84f4c3806d85a0147a291c800e7b --- /dev/null +++ b/llavanext-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-SI-nextqa_shuffled-baseline_lora_mlp/trainer_state.json @@ -0,0 +1,7495 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9996446760630108, + "eval_steps": 100, + "global_step": 1055, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 6.573533058166504, + "learning_rate": 3.125e-07, + "loss": 0.9699, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 14.963333129882812, + "learning_rate": 6.25e-07, + "loss": 0.8536, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 6.163477420806885, + "learning_rate": 9.375000000000001e-07, + "loss": 1.1326, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 6.327515125274658, + "learning_rate": 1.25e-06, + "loss": 1.0532, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 16.021699905395508, + "learning_rate": 1.5625e-06, + "loss": 0.9634, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 6.020225524902344, + "learning_rate": 1.8750000000000003e-06, + "loss": 1.0092, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 7.418478012084961, + "learning_rate": 2.1875000000000002e-06, + "loss": 1.007, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 6.422685146331787, + "learning_rate": 2.5e-06, + "loss": 0.9613, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 5.807549953460693, + "learning_rate": 2.8125e-06, + "loss": 0.8425, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 6.5089192390441895, + "learning_rate": 3.125e-06, + "loss": 0.8219, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 7.570983409881592, + "learning_rate": 3.4375e-06, + "loss": 0.8622, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 7.30479621887207, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7645, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 4.199153423309326, + "learning_rate": 4.0625000000000005e-06, + "loss": 0.7611, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 6.663885593414307, + "learning_rate": 4.3750000000000005e-06, + "loss": 0.7056, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 4.004486083984375, + "learning_rate": 4.6875000000000004e-06, + "loss": 0.8725, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 3.381334066390991, + "learning_rate": 5e-06, + "loss": 0.65, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 3.994814395904541, + "learning_rate": 5.3125e-06, + "loss": 0.757, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 3.5651938915252686, + "learning_rate": 5.625e-06, + "loss": 0.5595, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 1.94376802444458, + "learning_rate": 5.9375e-06, + "loss": 0.5499, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 1.7750961780548096, + "learning_rate": 6.25e-06, + "loss": 0.4746, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 1.8759979009628296, + "learning_rate": 6.5625e-06, + "loss": 0.5768, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 1.339124321937561, + "learning_rate": 6.875e-06, + "loss": 0.2824, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 1.5315219163894653, + "learning_rate": 7.1875e-06, + "loss": 0.4034, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 1.9613999128341675, + "learning_rate": 7.500000000000001e-06, + "loss": 0.6267, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 1.240362524986267, + "learning_rate": 7.8125e-06, + "loss": 0.3157, + "step": 25 + }, + { + "epoch": 0.02, + "grad_norm": 0.9875244498252869, + "learning_rate": 8.125000000000001e-06, + "loss": 0.3633, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 1.034967064857483, + "learning_rate": 8.4375e-06, + "loss": 0.3584, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 0.9101635217666626, + "learning_rate": 8.750000000000001e-06, + "loss": 0.4238, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 0.7796413898468018, + "learning_rate": 9.0625e-06, + "loss": 0.3641, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 0.849822461605072, + "learning_rate": 9.375000000000001e-06, + "loss": 0.3487, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.6829776763916016, + "learning_rate": 9.6875e-06, + "loss": 0.3434, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 0.8473533391952515, + "learning_rate": 1e-05, + "loss": 0.337, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 0.8275583386421204, + "learning_rate": 9.999976423021617e-06, + "loss": 0.3868, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 0.7024834752082825, + "learning_rate": 9.999905692308813e-06, + "loss": 0.3611, + "step": 34 + }, + { + "epoch": 0.03, + "grad_norm": 0.8121680617332458, + "learning_rate": 9.999787808528639e-06, + "loss": 0.4716, + "step": 35 + }, + { + "epoch": 0.03, + "grad_norm": 0.5949275493621826, + "learning_rate": 9.999622772792829e-06, + "loss": 0.2894, + "step": 36 + }, + { + "epoch": 0.04, + "grad_norm": 0.7868286371231079, + "learning_rate": 9.999410586657801e-06, + "loss": 0.3925, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 0.6609775424003601, + "learning_rate": 9.999151252124639e-06, + "loss": 0.3399, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 0.5456623435020447, + "learning_rate": 9.998844771639073e-06, + "loss": 0.2579, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 0.5806560516357422, + "learning_rate": 9.998491148091457e-06, + "loss": 0.3218, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 0.6813802123069763, + "learning_rate": 9.99809038481674e-06, + "loss": 0.3782, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 0.7458508610725403, + "learning_rate": 9.997642485594436e-06, + "loss": 0.3484, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 0.6288291811943054, + "learning_rate": 9.99714745464859e-06, + "loss": 0.2999, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 0.6087560057640076, + "learning_rate": 9.996605296647737e-06, + "loss": 0.2934, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 0.6758581399917603, + "learning_rate": 9.996016016704854e-06, + "loss": 0.4618, + "step": 45 + }, + { + "epoch": 0.04, + "grad_norm": 0.5289475321769714, + "learning_rate": 9.995379620377319e-06, + "loss": 0.2343, + "step": 46 + }, + { + "epoch": 0.04, + "grad_norm": 0.540494441986084, + "learning_rate": 9.99469611366685e-06, + "loss": 0.2364, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 0.6204575300216675, + "learning_rate": 9.993965503019457e-06, + "loss": 0.3579, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 0.6385788917541504, + "learning_rate": 9.993187795325381e-06, + "loss": 0.2899, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.5948849320411682, + "learning_rate": 9.992362997919016e-06, + "loss": 0.2543, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 0.6491946578025818, + "learning_rate": 9.991491118578856e-06, + "loss": 0.3367, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 0.6449819803237915, + "learning_rate": 9.990572165527413e-06, + "loss": 0.3244, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 0.7293139100074768, + "learning_rate": 9.98960614743114e-06, + "loss": 0.3801, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 0.6604413390159607, + "learning_rate": 9.988593073400354e-06, + "loss": 0.3575, + "step": 54 + }, + { + "epoch": 0.05, + "grad_norm": 0.6392818689346313, + "learning_rate": 9.987532952989145e-06, + "loss": 0.3391, + "step": 55 + }, + { + "epoch": 0.05, + "grad_norm": 0.6453878283500671, + "learning_rate": 9.986425796195287e-06, + "loss": 0.2703, + "step": 56 + }, + { + "epoch": 0.05, + "grad_norm": 0.6270106434822083, + "learning_rate": 9.985271613460144e-06, + "loss": 0.3559, + "step": 57 + }, + { + "epoch": 0.05, + "grad_norm": 0.6105512380599976, + "learning_rate": 9.984070415668574e-06, + "loss": 0.1861, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 0.6660328507423401, + "learning_rate": 9.98282221414882e-06, + "loss": 0.3553, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.6679720878601074, + "learning_rate": 9.981527020672413e-06, + "loss": 0.2726, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 0.6528822779655457, + "learning_rate": 9.980184847454052e-06, + "loss": 0.3589, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 0.8249501585960388, + "learning_rate": 9.978795707151492e-06, + "loss": 0.3055, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 0.6238577365875244, + "learning_rate": 9.977359612865424e-06, + "loss": 0.2546, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 0.7078774571418762, + "learning_rate": 9.975876578139355e-06, + "loss": 0.2807, + "step": 64 + }, + { + "epoch": 0.06, + "grad_norm": 0.6765748858451843, + "learning_rate": 9.974346616959476e-06, + "loss": 0.4132, + "step": 65 + }, + { + "epoch": 0.06, + "grad_norm": 0.6024010181427002, + "learning_rate": 9.972769743754532e-06, + "loss": 0.2856, + "step": 66 + }, + { + "epoch": 0.06, + "grad_norm": 0.510524332523346, + "learning_rate": 9.971145973395685e-06, + "loss": 0.1758, + "step": 67 + }, + { + "epoch": 0.06, + "grad_norm": 0.570541262626648, + "learning_rate": 9.969475321196374e-06, + "loss": 0.1721, + "step": 68 + }, + { + "epoch": 0.07, + "grad_norm": 0.5491730570793152, + "learning_rate": 9.967757802912172e-06, + "loss": 0.2676, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 0.5094174742698669, + "learning_rate": 9.965993434740634e-06, + "loss": 0.1943, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 0.5966671705245972, + "learning_rate": 9.96418223332115e-06, + "loss": 0.2785, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 0.5527228116989136, + "learning_rate": 9.962324215734782e-06, + "loss": 0.1879, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 0.627673864364624, + "learning_rate": 9.960419399504107e-06, + "loss": 0.3464, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 0.6054822206497192, + "learning_rate": 9.958467802593046e-06, + "loss": 0.222, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 0.5865452289581299, + "learning_rate": 9.956469443406707e-06, + "loss": 0.2541, + "step": 75 + }, + { + "epoch": 0.07, + "grad_norm": 0.5308388471603394, + "learning_rate": 9.954424340791195e-06, + "loss": 0.2497, + "step": 76 + }, + { + "epoch": 0.07, + "grad_norm": 0.7129713296890259, + "learning_rate": 9.952332514033449e-06, + "loss": 0.2928, + "step": 77 + }, + { + "epoch": 0.07, + "grad_norm": 0.6783535480499268, + "learning_rate": 9.950193982861048e-06, + "loss": 0.2822, + "step": 78 + }, + { + "epoch": 0.07, + "grad_norm": 0.6808055639266968, + "learning_rate": 9.948008767442034e-06, + "loss": 0.2615, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 0.5542997121810913, + "learning_rate": 9.94577688838472e-06, + "loss": 0.2276, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 0.6257948875427246, + "learning_rate": 9.943498366737487e-06, + "loss": 0.299, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 0.6514169573783875, + "learning_rate": 9.941173223988603e-06, + "loss": 0.2729, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 0.6393204927444458, + "learning_rate": 9.938801482065998e-06, + "loss": 0.2799, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 0.664059042930603, + "learning_rate": 9.93638316333708e-06, + "loss": 0.3243, + "step": 84 + }, + { + "epoch": 0.08, + "grad_norm": 0.6030569672584534, + "learning_rate": 9.93391829060851e-06, + "loss": 0.2788, + "step": 85 + }, + { + "epoch": 0.08, + "grad_norm": 0.6259936094284058, + "learning_rate": 9.93140688712598e-06, + "loss": 0.3041, + "step": 86 + }, + { + "epoch": 0.08, + "grad_norm": 0.6863194108009338, + "learning_rate": 9.92884897657402e-06, + "loss": 0.3346, + "step": 87 + }, + { + "epoch": 0.08, + "grad_norm": 0.6582688689231873, + "learning_rate": 9.926244583075748e-06, + "loss": 0.2774, + "step": 88 + }, + { + "epoch": 0.08, + "grad_norm": 0.649311363697052, + "learning_rate": 9.923593731192655e-06, + "loss": 0.3058, + "step": 89 + }, + { + "epoch": 0.09, + "grad_norm": 0.6434114575386047, + "learning_rate": 9.920896445924372e-06, + "loss": 0.2798, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 0.5461995601654053, + "learning_rate": 9.918152752708437e-06, + "loss": 0.1668, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 0.646434485912323, + "learning_rate": 9.915362677420045e-06, + "loss": 0.3334, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 0.728742241859436, + "learning_rate": 9.912526246371815e-06, + "loss": 0.3727, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 0.6656957864761353, + "learning_rate": 9.909643486313533e-06, + "loss": 0.3009, + "step": 94 + }, + { + "epoch": 0.09, + "grad_norm": 0.6820513606071472, + "learning_rate": 9.906714424431914e-06, + "loss": 0.3038, + "step": 95 + }, + { + "epoch": 0.09, + "grad_norm": 0.5719671845436096, + "learning_rate": 9.903739088350325e-06, + "loss": 0.2252, + "step": 96 + }, + { + "epoch": 0.09, + "grad_norm": 0.681084156036377, + "learning_rate": 9.90071750612854e-06, + "loss": 0.2547, + "step": 97 + }, + { + "epoch": 0.09, + "grad_norm": 0.5969502925872803, + "learning_rate": 9.897649706262474e-06, + "loss": 0.2529, + "step": 98 + }, + { + "epoch": 0.09, + "grad_norm": 0.7019534707069397, + "learning_rate": 9.894535717683902e-06, + "loss": 0.3078, + "step": 99 + }, + { + "epoch": 0.09, + "grad_norm": 0.6780735850334167, + "learning_rate": 9.891375569760205e-06, + "loss": 0.3005, + "step": 100 + }, + { + "epoch": 0.09, + "eval_loss": 0.2637310326099396, + "eval_runtime": 327.5464, + "eval_samples_per_second": 1.041, + "eval_steps_per_second": 0.263, + "step": 100 + }, + { + "epoch": 0.1, + "grad_norm": 0.6152581572532654, + "learning_rate": 9.888169292294077e-06, + "loss": 0.2279, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 0.6035614013671875, + "learning_rate": 9.88491691552325e-06, + "loss": 0.2279, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 0.7469601035118103, + "learning_rate": 9.881618470120216e-06, + "loss": 0.2789, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 0.7219517827033997, + "learning_rate": 9.87827398719192e-06, + "loss": 0.3135, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 0.6131671667098999, + "learning_rate": 9.874883498279485e-06, + "loss": 0.1929, + "step": 105 + }, + { + "epoch": 0.1, + "grad_norm": 0.6404304504394531, + "learning_rate": 9.871447035357903e-06, + "loss": 0.2558, + "step": 106 + }, + { + "epoch": 0.1, + "grad_norm": 0.622728705406189, + "learning_rate": 9.867964630835742e-06, + "loss": 0.2261, + "step": 107 + }, + { + "epoch": 0.1, + "grad_norm": 0.5448305010795593, + "learning_rate": 9.86443631755483e-06, + "loss": 0.2026, + "step": 108 + }, + { + "epoch": 0.1, + "grad_norm": 0.6098849773406982, + "learning_rate": 9.860862128789954e-06, + "loss": 0.2017, + "step": 109 + }, + { + "epoch": 0.1, + "grad_norm": 0.6181161403656006, + "learning_rate": 9.857242098248543e-06, + "loss": 0.1947, + "step": 110 + }, + { + "epoch": 0.11, + "grad_norm": 0.7358696460723877, + "learning_rate": 9.853576260070348e-06, + "loss": 0.2749, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 0.6237342953681946, + "learning_rate": 9.849864648827126e-06, + "loss": 0.2232, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 0.8139705657958984, + "learning_rate": 9.846107299522305e-06, + "loss": 0.336, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 0.7234876751899719, + "learning_rate": 9.842304247590668e-06, + "loss": 0.3555, + "step": 114 + }, + { + "epoch": 0.11, + "grad_norm": 0.6199493408203125, + "learning_rate": 9.838455528897998e-06, + "loss": 0.2127, + "step": 115 + }, + { + "epoch": 0.11, + "grad_norm": 0.5883256196975708, + "learning_rate": 9.834561179740763e-06, + "loss": 0.1799, + "step": 116 + }, + { + "epoch": 0.11, + "grad_norm": 0.6576983332633972, + "learning_rate": 9.830621236845755e-06, + "loss": 0.2601, + "step": 117 + }, + { + "epoch": 0.11, + "grad_norm": 0.6341786980628967, + "learning_rate": 9.826635737369752e-06, + "loss": 0.2503, + "step": 118 + }, + { + "epoch": 0.11, + "grad_norm": 0.6072645783424377, + "learning_rate": 9.82260471889917e-06, + "loss": 0.2027, + "step": 119 + }, + { + "epoch": 0.11, + "grad_norm": 0.8956173062324524, + "learning_rate": 9.818528219449705e-06, + "loss": 0.2389, + "step": 120 + }, + { + "epoch": 0.11, + "grad_norm": 0.614122211933136, + "learning_rate": 9.814406277465969e-06, + "loss": 0.197, + "step": 121 + }, + { + "epoch": 0.12, + "grad_norm": 0.7290271520614624, + "learning_rate": 9.810238931821139e-06, + "loss": 0.2665, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 0.6730625629425049, + "learning_rate": 9.806026221816582e-06, + "loss": 0.3386, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 0.5942097902297974, + "learning_rate": 9.801768187181487e-06, + "loss": 0.2707, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 0.6023765802383423, + "learning_rate": 9.797464868072489e-06, + "loss": 0.2218, + "step": 125 + }, + { + "epoch": 0.12, + "grad_norm": 0.7326840162277222, + "learning_rate": 9.793116305073292e-06, + "loss": 0.2333, + "step": 126 + }, + { + "epoch": 0.12, + "grad_norm": 0.6956497430801392, + "learning_rate": 9.788722539194291e-06, + "loss": 0.3369, + "step": 127 + }, + { + "epoch": 0.12, + "grad_norm": 0.6568793058395386, + "learning_rate": 9.78428361187217e-06, + "loss": 0.2236, + "step": 128 + }, + { + "epoch": 0.12, + "grad_norm": 0.7054308652877808, + "learning_rate": 9.77979956496953e-06, + "loss": 0.2579, + "step": 129 + }, + { + "epoch": 0.12, + "grad_norm": 0.7843027114868164, + "learning_rate": 9.775270440774481e-06, + "loss": 0.4016, + "step": 130 + }, + { + "epoch": 0.12, + "grad_norm": 0.7426701784133911, + "learning_rate": 9.770696282000245e-06, + "loss": 0.3249, + "step": 131 + }, + { + "epoch": 0.13, + "grad_norm": 0.7212030291557312, + "learning_rate": 9.766077131784764e-06, + "loss": 0.3782, + "step": 132 + }, + { + "epoch": 0.13, + "grad_norm": 0.6892423629760742, + "learning_rate": 9.761413033690276e-06, + "loss": 0.2803, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 0.7250350117683411, + "learning_rate": 9.756704031702919e-06, + "loss": 0.2911, + "step": 134 + }, + { + "epoch": 0.13, + "grad_norm": 0.5780990123748779, + "learning_rate": 9.75195017023231e-06, + "loss": 0.2263, + "step": 135 + }, + { + "epoch": 0.13, + "grad_norm": 0.6007035374641418, + "learning_rate": 9.74715149411112e-06, + "loss": 0.267, + "step": 136 + }, + { + "epoch": 0.13, + "grad_norm": 0.7782276272773743, + "learning_rate": 9.742308048594665e-06, + "loss": 0.3348, + "step": 137 + }, + { + "epoch": 0.13, + "grad_norm": 0.4514017105102539, + "learning_rate": 9.737419879360471e-06, + "loss": 0.1335, + "step": 138 + }, + { + "epoch": 0.13, + "grad_norm": 0.5577970743179321, + "learning_rate": 9.732487032507837e-06, + "loss": 0.1962, + "step": 139 + }, + { + "epoch": 0.13, + "grad_norm": 0.8191975355148315, + "learning_rate": 9.727509554557416e-06, + "loss": 0.359, + "step": 140 + }, + { + "epoch": 0.13, + "grad_norm": 0.6398234963417053, + "learning_rate": 9.722487492450764e-06, + "loss": 0.1976, + "step": 141 + }, + { + "epoch": 0.13, + "grad_norm": 0.7978889346122742, + "learning_rate": 9.717420893549902e-06, + "loss": 0.3241, + "step": 142 + }, + { + "epoch": 0.14, + "grad_norm": 0.6350768804550171, + "learning_rate": 9.712309805636863e-06, + "loss": 0.2473, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 0.6668251752853394, + "learning_rate": 9.707154276913255e-06, + "loss": 0.3091, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 0.714043915271759, + "learning_rate": 9.701954355999791e-06, + "loss": 0.2983, + "step": 145 + }, + { + "epoch": 0.14, + "grad_norm": 0.6359931230545044, + "learning_rate": 9.696710091935842e-06, + "loss": 0.2758, + "step": 146 + }, + { + "epoch": 0.14, + "grad_norm": 0.5917684435844421, + "learning_rate": 9.691421534178966e-06, + "loss": 0.2057, + "step": 147 + }, + { + "epoch": 0.14, + "grad_norm": 0.6024724841117859, + "learning_rate": 9.68608873260445e-06, + "loss": 0.2429, + "step": 148 + }, + { + "epoch": 0.14, + "grad_norm": 0.6073502898216248, + "learning_rate": 9.680711737504832e-06, + "loss": 0.1956, + "step": 149 + }, + { + "epoch": 0.14, + "grad_norm": 0.696204662322998, + "learning_rate": 9.675290599589429e-06, + "loss": 0.3212, + "step": 150 + }, + { + "epoch": 0.14, + "grad_norm": 0.6330821514129639, + "learning_rate": 9.669825369983865e-06, + "loss": 0.1942, + "step": 151 + }, + { + "epoch": 0.14, + "grad_norm": 0.666441023349762, + "learning_rate": 9.664316100229578e-06, + "loss": 0.2903, + "step": 152 + }, + { + "epoch": 0.14, + "grad_norm": 0.6631059050559998, + "learning_rate": 9.658762842283343e-06, + "loss": 0.2909, + "step": 153 + }, + { + "epoch": 0.15, + "grad_norm": 0.5578485727310181, + "learning_rate": 9.653165648516777e-06, + "loss": 0.2069, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 0.8304031491279602, + "learning_rate": 9.647524571715843e-06, + "loss": 0.3124, + "step": 155 + }, + { + "epoch": 0.15, + "grad_norm": 0.6066958904266357, + "learning_rate": 9.641839665080363e-06, + "loss": 0.2585, + "step": 156 + }, + { + "epoch": 0.15, + "grad_norm": 0.6744480133056641, + "learning_rate": 9.636110982223505e-06, + "loss": 0.232, + "step": 157 + }, + { + "epoch": 0.15, + "grad_norm": 0.7409882545471191, + "learning_rate": 9.630338577171282e-06, + "loss": 0.2774, + "step": 158 + }, + { + "epoch": 0.15, + "grad_norm": 0.595603883266449, + "learning_rate": 9.624522504362039e-06, + "loss": 0.2175, + "step": 159 + }, + { + "epoch": 0.15, + "grad_norm": 0.647714376449585, + "learning_rate": 9.618662818645949e-06, + "loss": 0.2964, + "step": 160 + }, + { + "epoch": 0.15, + "grad_norm": 0.5570324063301086, + "learning_rate": 9.612759575284483e-06, + "loss": 0.1763, + "step": 161 + }, + { + "epoch": 0.15, + "grad_norm": 0.7239076495170593, + "learning_rate": 9.606812829949896e-06, + "loss": 0.2865, + "step": 162 + }, + { + "epoch": 0.15, + "grad_norm": 0.6651351451873779, + "learning_rate": 9.600822638724704e-06, + "loss": 0.2219, + "step": 163 + }, + { + "epoch": 0.16, + "grad_norm": 0.6308421492576599, + "learning_rate": 9.594789058101154e-06, + "loss": 0.2315, + "step": 164 + }, + { + "epoch": 0.16, + "grad_norm": 0.6480094194412231, + "learning_rate": 9.588712144980681e-06, + "loss": 0.2858, + "step": 165 + }, + { + "epoch": 0.16, + "grad_norm": 0.7034019827842712, + "learning_rate": 9.582591956673387e-06, + "loss": 0.2438, + "step": 166 + }, + { + "epoch": 0.16, + "grad_norm": 0.6165547370910645, + "learning_rate": 9.57642855089749e-06, + "loss": 0.2009, + "step": 167 + }, + { + "epoch": 0.16, + "grad_norm": 0.8720855712890625, + "learning_rate": 9.570221985778785e-06, + "loss": 0.2984, + "step": 168 + }, + { + "epoch": 0.16, + "grad_norm": 0.6759960651397705, + "learning_rate": 9.563972319850092e-06, + "loss": 0.2318, + "step": 169 + }, + { + "epoch": 0.16, + "grad_norm": 0.567792534828186, + "learning_rate": 9.557679612050708e-06, + "loss": 0.181, + "step": 170 + }, + { + "epoch": 0.16, + "grad_norm": 0.7541651129722595, + "learning_rate": 9.551343921725844e-06, + "loss": 0.3476, + "step": 171 + }, + { + "epoch": 0.16, + "grad_norm": 0.7993423342704773, + "learning_rate": 9.544965308626075e-06, + "loss": 0.3679, + "step": 172 + }, + { + "epoch": 0.16, + "grad_norm": 0.6110410690307617, + "learning_rate": 9.538543832906773e-06, + "loss": 0.2089, + "step": 173 + }, + { + "epoch": 0.16, + "grad_norm": 0.77556973695755, + "learning_rate": 9.532079555127532e-06, + "loss": 0.285, + "step": 174 + }, + { + "epoch": 0.17, + "grad_norm": 0.7493922114372253, + "learning_rate": 9.525572536251608e-06, + "loss": 0.3202, + "step": 175 + }, + { + "epoch": 0.17, + "grad_norm": 0.6218746900558472, + "learning_rate": 9.519022837645337e-06, + "loss": 0.2713, + "step": 176 + }, + { + "epoch": 0.17, + "grad_norm": 0.6792969703674316, + "learning_rate": 9.512430521077565e-06, + "loss": 0.2558, + "step": 177 + }, + { + "epoch": 0.17, + "grad_norm": 0.6580852270126343, + "learning_rate": 9.505795648719049e-06, + "loss": 0.2697, + "step": 178 + }, + { + "epoch": 0.17, + "grad_norm": 0.7115439176559448, + "learning_rate": 9.499118283141887e-06, + "loss": 0.3133, + "step": 179 + }, + { + "epoch": 0.17, + "grad_norm": 0.5898191928863525, + "learning_rate": 9.492398487318922e-06, + "loss": 0.259, + "step": 180 + }, + { + "epoch": 0.17, + "grad_norm": 0.6370326280593872, + "learning_rate": 9.485636324623147e-06, + "loss": 0.2238, + "step": 181 + }, + { + "epoch": 0.17, + "grad_norm": 0.7449048161506653, + "learning_rate": 9.478831858827105e-06, + "loss": 0.2949, + "step": 182 + }, + { + "epoch": 0.17, + "grad_norm": 0.7421145439147949, + "learning_rate": 9.471985154102292e-06, + "loss": 0.2105, + "step": 183 + }, + { + "epoch": 0.17, + "grad_norm": 0.7774351239204407, + "learning_rate": 9.465096275018556e-06, + "loss": 0.3231, + "step": 184 + }, + { + "epoch": 0.18, + "grad_norm": 0.6269638538360596, + "learning_rate": 9.458165286543477e-06, + "loss": 0.2886, + "step": 185 + }, + { + "epoch": 0.18, + "grad_norm": 0.699456512928009, + "learning_rate": 9.451192254041759e-06, + "loss": 0.2803, + "step": 186 + }, + { + "epoch": 0.18, + "grad_norm": 0.7909825444221497, + "learning_rate": 9.444177243274619e-06, + "loss": 0.3949, + "step": 187 + }, + { + "epoch": 0.18, + "grad_norm": 0.7326850891113281, + "learning_rate": 9.437120320399158e-06, + "loss": 0.2864, + "step": 188 + }, + { + "epoch": 0.18, + "grad_norm": 0.7978874444961548, + "learning_rate": 9.430021551967745e-06, + "loss": 0.2661, + "step": 189 + }, + { + "epoch": 0.18, + "grad_norm": 0.6970043778419495, + "learning_rate": 9.422881004927383e-06, + "loss": 0.2223, + "step": 190 + }, + { + "epoch": 0.18, + "grad_norm": 0.645399272441864, + "learning_rate": 9.41569874661908e-06, + "loss": 0.2777, + "step": 191 + }, + { + "epoch": 0.18, + "grad_norm": 0.6742087602615356, + "learning_rate": 9.408474844777218e-06, + "loss": 0.2067, + "step": 192 + }, + { + "epoch": 0.18, + "grad_norm": 0.7820308208465576, + "learning_rate": 9.401209367528907e-06, + "loss": 0.4041, + "step": 193 + }, + { + "epoch": 0.18, + "grad_norm": 0.692047119140625, + "learning_rate": 9.393902383393347e-06, + "loss": 0.2648, + "step": 194 + }, + { + "epoch": 0.18, + "grad_norm": 0.8333490490913391, + "learning_rate": 9.386553961281179e-06, + "loss": 0.3695, + "step": 195 + }, + { + "epoch": 0.19, + "grad_norm": 0.7040037512779236, + "learning_rate": 9.379164170493844e-06, + "loss": 0.3136, + "step": 196 + }, + { + "epoch": 0.19, + "grad_norm": 0.8036830425262451, + "learning_rate": 9.371733080722911e-06, + "loss": 0.2963, + "step": 197 + }, + { + "epoch": 0.19, + "grad_norm": 0.5481095314025879, + "learning_rate": 9.36426076204944e-06, + "loss": 0.1438, + "step": 198 + }, + { + "epoch": 0.19, + "grad_norm": 0.7464080452919006, + "learning_rate": 9.35674728494331e-06, + "loss": 0.2957, + "step": 199 + }, + { + "epoch": 0.19, + "grad_norm": 0.6098821759223938, + "learning_rate": 9.349192720262556e-06, + "loss": 0.2428, + "step": 200 + }, + { + "epoch": 0.19, + "eval_loss": 0.2540014684200287, + "eval_runtime": 327.1693, + "eval_samples_per_second": 1.042, + "eval_steps_per_second": 0.263, + "step": 200 + }, + { + "epoch": 0.19, + "grad_norm": 0.6470658779144287, + "learning_rate": 9.341597139252698e-06, + "loss": 0.269, + "step": 201 + }, + { + "epoch": 0.19, + "grad_norm": 0.6855325102806091, + "learning_rate": 9.333960613546079e-06, + "loss": 0.2732, + "step": 202 + }, + { + "epoch": 0.19, + "grad_norm": 0.6597403287887573, + "learning_rate": 9.326283215161177e-06, + "loss": 0.2185, + "step": 203 + }, + { + "epoch": 0.19, + "grad_norm": 0.4843915104866028, + "learning_rate": 9.31856501650194e-06, + "loss": 0.117, + "step": 204 + }, + { + "epoch": 0.19, + "grad_norm": 0.6510450839996338, + "learning_rate": 9.310806090357083e-06, + "loss": 0.3108, + "step": 205 + }, + { + "epoch": 0.2, + "grad_norm": 0.5514283180236816, + "learning_rate": 9.30300650989942e-06, + "loss": 0.1462, + "step": 206 + }, + { + "epoch": 0.2, + "grad_norm": 0.7147676348686218, + "learning_rate": 9.295166348685169e-06, + "loss": 0.2668, + "step": 207 + }, + { + "epoch": 0.2, + "grad_norm": 0.7435725331306458, + "learning_rate": 9.287285680653254e-06, + "loss": 0.2922, + "step": 208 + }, + { + "epoch": 0.2, + "grad_norm": 0.6453981399536133, + "learning_rate": 9.279364580124615e-06, + "loss": 0.2574, + "step": 209 + }, + { + "epoch": 0.2, + "grad_norm": 0.6430134177207947, + "learning_rate": 9.271403121801492e-06, + "loss": 0.1975, + "step": 210 + }, + { + "epoch": 0.2, + "grad_norm": 0.89022296667099, + "learning_rate": 9.263401380766739e-06, + "loss": 0.3669, + "step": 211 + }, + { + "epoch": 0.2, + "grad_norm": 0.682579755783081, + "learning_rate": 9.255359432483106e-06, + "loss": 0.2956, + "step": 212 + }, + { + "epoch": 0.2, + "grad_norm": 0.6273362636566162, + "learning_rate": 9.247277352792534e-06, + "loss": 0.2152, + "step": 213 + }, + { + "epoch": 0.2, + "grad_norm": 0.7749226689338684, + "learning_rate": 9.239155217915422e-06, + "loss": 0.2967, + "step": 214 + }, + { + "epoch": 0.2, + "grad_norm": 0.5863397717475891, + "learning_rate": 9.23099310444994e-06, + "loss": 0.1697, + "step": 215 + }, + { + "epoch": 0.2, + "grad_norm": 0.6522620320320129, + "learning_rate": 9.222791089371266e-06, + "loss": 0.2507, + "step": 216 + }, + { + "epoch": 0.21, + "grad_norm": 0.64266037940979, + "learning_rate": 9.214549250030899e-06, + "loss": 0.2172, + "step": 217 + }, + { + "epoch": 0.21, + "grad_norm": 0.726273238658905, + "learning_rate": 9.206267664155906e-06, + "loss": 0.3365, + "step": 218 + }, + { + "epoch": 0.21, + "grad_norm": 0.5961911678314209, + "learning_rate": 9.197946409848196e-06, + "loss": 0.2067, + "step": 219 + }, + { + "epoch": 0.21, + "grad_norm": 0.6414331793785095, + "learning_rate": 9.189585565583779e-06, + "loss": 0.2531, + "step": 220 + }, + { + "epoch": 0.21, + "grad_norm": 0.5866371989250183, + "learning_rate": 9.181185210212034e-06, + "loss": 0.2646, + "step": 221 + }, + { + "epoch": 0.21, + "grad_norm": 0.6715410947799683, + "learning_rate": 9.172745422954961e-06, + "loss": 0.2473, + "step": 222 + }, + { + "epoch": 0.21, + "grad_norm": 0.6602498888969421, + "learning_rate": 9.164266283406433e-06, + "loss": 0.2778, + "step": 223 + }, + { + "epoch": 0.21, + "grad_norm": 0.7695605158805847, + "learning_rate": 9.155747871531444e-06, + "loss": 0.259, + "step": 224 + }, + { + "epoch": 0.21, + "grad_norm": 0.7421642541885376, + "learning_rate": 9.147190267665361e-06, + "loss": 0.2916, + "step": 225 + }, + { + "epoch": 0.21, + "grad_norm": 0.6979179978370667, + "learning_rate": 9.13859355251316e-06, + "loss": 0.2285, + "step": 226 + }, + { + "epoch": 0.22, + "grad_norm": 0.6683458089828491, + "learning_rate": 9.129957807148666e-06, + "loss": 0.2239, + "step": 227 + }, + { + "epoch": 0.22, + "grad_norm": 0.8766207098960876, + "learning_rate": 9.121283113013794e-06, + "loss": 0.3639, + "step": 228 + }, + { + "epoch": 0.22, + "grad_norm": 0.5637826323509216, + "learning_rate": 9.112569551917773e-06, + "loss": 0.1856, + "step": 229 + }, + { + "epoch": 0.22, + "grad_norm": 0.6588501334190369, + "learning_rate": 9.103817206036383e-06, + "loss": 0.3037, + "step": 230 + }, + { + "epoch": 0.22, + "grad_norm": 0.8170912861824036, + "learning_rate": 9.095026157911166e-06, + "loss": 0.2881, + "step": 231 + }, + { + "epoch": 0.22, + "grad_norm": 0.6367344856262207, + "learning_rate": 9.086196490448668e-06, + "loss": 0.1722, + "step": 232 + }, + { + "epoch": 0.22, + "grad_norm": 0.7489650249481201, + "learning_rate": 9.077328286919638e-06, + "loss": 0.3079, + "step": 233 + }, + { + "epoch": 0.22, + "grad_norm": 0.7185512185096741, + "learning_rate": 9.068421630958254e-06, + "loss": 0.2906, + "step": 234 + }, + { + "epoch": 0.22, + "grad_norm": 0.61532062292099, + "learning_rate": 9.059476606561328e-06, + "loss": 0.1991, + "step": 235 + }, + { + "epoch": 0.22, + "grad_norm": 0.7276179790496826, + "learning_rate": 9.050493298087523e-06, + "loss": 0.272, + "step": 236 + }, + { + "epoch": 0.22, + "grad_norm": 0.6644300818443298, + "learning_rate": 9.041471790256543e-06, + "loss": 0.2747, + "step": 237 + }, + { + "epoch": 0.23, + "grad_norm": 0.685745120048523, + "learning_rate": 9.032412168148345e-06, + "loss": 0.2481, + "step": 238 + }, + { + "epoch": 0.23, + "grad_norm": 0.7370786666870117, + "learning_rate": 9.023314517202341e-06, + "loss": 0.3041, + "step": 239 + }, + { + "epoch": 0.23, + "grad_norm": 0.6082615852355957, + "learning_rate": 9.014178923216572e-06, + "loss": 0.1761, + "step": 240 + }, + { + "epoch": 0.23, + "grad_norm": 0.594700038433075, + "learning_rate": 9.005005472346923e-06, + "loss": 0.2002, + "step": 241 + }, + { + "epoch": 0.23, + "grad_norm": 0.7096708416938782, + "learning_rate": 8.995794251106295e-06, + "loss": 0.3118, + "step": 242 + }, + { + "epoch": 0.23, + "grad_norm": 0.6614163517951965, + "learning_rate": 8.986545346363792e-06, + "loss": 0.2483, + "step": 243 + }, + { + "epoch": 0.23, + "grad_norm": 0.6781965494155884, + "learning_rate": 8.977258845343904e-06, + "loss": 0.271, + "step": 244 + }, + { + "epoch": 0.23, + "grad_norm": 0.7122507095336914, + "learning_rate": 8.96793483562569e-06, + "loss": 0.254, + "step": 245 + }, + { + "epoch": 0.23, + "grad_norm": 0.6902074217796326, + "learning_rate": 8.958573405141932e-06, + "loss": 0.2693, + "step": 246 + }, + { + "epoch": 0.23, + "grad_norm": 0.7376943230628967, + "learning_rate": 8.949174642178333e-06, + "loss": 0.2949, + "step": 247 + }, + { + "epoch": 0.23, + "grad_norm": 0.7361817955970764, + "learning_rate": 8.939738635372664e-06, + "loss": 0.2617, + "step": 248 + }, + { + "epoch": 0.24, + "grad_norm": 0.8066937327384949, + "learning_rate": 8.930265473713939e-06, + "loss": 0.3163, + "step": 249 + }, + { + "epoch": 0.24, + "grad_norm": 0.7305431962013245, + "learning_rate": 8.920755246541563e-06, + "loss": 0.2936, + "step": 250 + }, + { + "epoch": 0.24, + "grad_norm": 0.7218648195266724, + "learning_rate": 8.911208043544513e-06, + "loss": 0.2888, + "step": 251 + }, + { + "epoch": 0.24, + "grad_norm": 0.6873829960823059, + "learning_rate": 8.90162395476046e-06, + "loss": 0.2173, + "step": 252 + }, + { + "epoch": 0.24, + "grad_norm": 0.8219372630119324, + "learning_rate": 8.89200307057495e-06, + "loss": 0.3976, + "step": 253 + }, + { + "epoch": 0.24, + "grad_norm": 0.7167593836784363, + "learning_rate": 8.882345481720533e-06, + "loss": 0.302, + "step": 254 + }, + { + "epoch": 0.24, + "grad_norm": 0.7573580145835876, + "learning_rate": 8.872651279275917e-06, + "loss": 0.2741, + "step": 255 + }, + { + "epoch": 0.24, + "grad_norm": 0.7510555982589722, + "learning_rate": 8.862920554665098e-06, + "loss": 0.2862, + "step": 256 + }, + { + "epoch": 0.24, + "grad_norm": 0.678774356842041, + "learning_rate": 8.853153399656513e-06, + "loss": 0.2724, + "step": 257 + }, + { + "epoch": 0.24, + "grad_norm": 0.7355273365974426, + "learning_rate": 8.843349906362163e-06, + "loss": 0.2834, + "step": 258 + }, + { + "epoch": 0.25, + "grad_norm": 0.8037059903144836, + "learning_rate": 8.833510167236747e-06, + "loss": 0.335, + "step": 259 + }, + { + "epoch": 0.25, + "grad_norm": 0.695874810218811, + "learning_rate": 8.823634275076792e-06, + "loss": 0.2839, + "step": 260 + }, + { + "epoch": 0.25, + "grad_norm": 0.7235242128372192, + "learning_rate": 8.813722323019774e-06, + "loss": 0.318, + "step": 261 + }, + { + "epoch": 0.25, + "grad_norm": 0.6236990690231323, + "learning_rate": 8.803774404543246e-06, + "loss": 0.2083, + "step": 262 + }, + { + "epoch": 0.25, + "grad_norm": 0.6476077437400818, + "learning_rate": 8.793790613463956e-06, + "loss": 0.2701, + "step": 263 + }, + { + "epoch": 0.25, + "grad_norm": 0.4958818554878235, + "learning_rate": 8.783771043936949e-06, + "loss": 0.1821, + "step": 264 + }, + { + "epoch": 0.25, + "grad_norm": 0.7664861679077148, + "learning_rate": 8.773715790454695e-06, + "loss": 0.2629, + "step": 265 + }, + { + "epoch": 0.25, + "grad_norm": 0.6198388338088989, + "learning_rate": 8.763624947846195e-06, + "loss": 0.1988, + "step": 266 + }, + { + "epoch": 0.25, + "grad_norm": 0.7904531359672546, + "learning_rate": 8.75349861127608e-06, + "loss": 0.307, + "step": 267 + }, + { + "epoch": 0.25, + "grad_norm": 0.7708582282066345, + "learning_rate": 8.743336876243712e-06, + "loss": 0.2798, + "step": 268 + }, + { + "epoch": 0.25, + "grad_norm": 0.7881653308868408, + "learning_rate": 8.733139838582299e-06, + "loss": 0.3482, + "step": 269 + }, + { + "epoch": 0.26, + "grad_norm": 0.7888725399971008, + "learning_rate": 8.722907594457975e-06, + "loss": 0.3321, + "step": 270 + }, + { + "epoch": 0.26, + "grad_norm": 0.7690854668617249, + "learning_rate": 8.712640240368899e-06, + "loss": 0.3307, + "step": 271 + }, + { + "epoch": 0.26, + "grad_norm": 0.7235127687454224, + "learning_rate": 8.702337873144343e-06, + "loss": 0.2644, + "step": 272 + }, + { + "epoch": 0.26, + "grad_norm": 0.6718965172767639, + "learning_rate": 8.692000589943785e-06, + "loss": 0.2135, + "step": 273 + }, + { + "epoch": 0.26, + "grad_norm": 0.7318533062934875, + "learning_rate": 8.681628488255986e-06, + "loss": 0.311, + "step": 274 + }, + { + "epoch": 0.26, + "grad_norm": 0.6941997408866882, + "learning_rate": 8.671221665898074e-06, + "loss": 0.2215, + "step": 275 + }, + { + "epoch": 0.26, + "grad_norm": 0.7051814794540405, + "learning_rate": 8.660780221014617e-06, + "loss": 0.2967, + "step": 276 + }, + { + "epoch": 0.26, + "grad_norm": 0.6508485674858093, + "learning_rate": 8.650304252076704e-06, + "loss": 0.2548, + "step": 277 + }, + { + "epoch": 0.26, + "grad_norm": 0.5858379006385803, + "learning_rate": 8.63979385788101e-06, + "loss": 0.202, + "step": 278 + }, + { + "epoch": 0.26, + "grad_norm": 0.7965670228004456, + "learning_rate": 8.629249137548873e-06, + "loss": 0.3318, + "step": 279 + }, + { + "epoch": 0.27, + "grad_norm": 0.6064817905426025, + "learning_rate": 8.61867019052535e-06, + "loss": 0.1971, + "step": 280 + }, + { + "epoch": 0.27, + "grad_norm": 0.6326867938041687, + "learning_rate": 8.608057116578283e-06, + "loss": 0.2048, + "step": 281 + }, + { + "epoch": 0.27, + "grad_norm": 0.7318657040596008, + "learning_rate": 8.597410015797358e-06, + "loss": 0.3125, + "step": 282 + }, + { + "epoch": 0.27, + "grad_norm": 0.5439643263816833, + "learning_rate": 8.586728988593158e-06, + "loss": 0.2009, + "step": 283 + }, + { + "epoch": 0.27, + "grad_norm": 0.6920150518417358, + "learning_rate": 8.576014135696227e-06, + "loss": 0.2299, + "step": 284 + }, + { + "epoch": 0.27, + "grad_norm": 0.6073454022407532, + "learning_rate": 8.565265558156101e-06, + "loss": 0.2028, + "step": 285 + }, + { + "epoch": 0.27, + "grad_norm": 0.8154141902923584, + "learning_rate": 8.554483357340379e-06, + "loss": 0.3168, + "step": 286 + }, + { + "epoch": 0.27, + "grad_norm": 0.5363025069236755, + "learning_rate": 8.543667634933743e-06, + "loss": 0.2054, + "step": 287 + }, + { + "epoch": 0.27, + "grad_norm": 0.6466490626335144, + "learning_rate": 8.532818492937014e-06, + "loss": 0.2164, + "step": 288 + }, + { + "epoch": 0.27, + "grad_norm": 0.860065221786499, + "learning_rate": 8.521936033666187e-06, + "loss": 0.258, + "step": 289 + }, + { + "epoch": 0.27, + "grad_norm": 0.7522770762443542, + "learning_rate": 8.511020359751467e-06, + "loss": 0.2297, + "step": 290 + }, + { + "epoch": 0.28, + "grad_norm": 0.5030247569084167, + "learning_rate": 8.500071574136297e-06, + "loss": 0.149, + "step": 291 + }, + { + "epoch": 0.28, + "grad_norm": 0.6503915786743164, + "learning_rate": 8.489089780076387e-06, + "loss": 0.2363, + "step": 292 + }, + { + "epoch": 0.28, + "grad_norm": 0.7013444900512695, + "learning_rate": 8.478075081138746e-06, + "loss": 0.2444, + "step": 293 + }, + { + "epoch": 0.28, + "grad_norm": 0.7805238366127014, + "learning_rate": 8.467027581200702e-06, + "loss": 0.3497, + "step": 294 + }, + { + "epoch": 0.28, + "grad_norm": 0.72954261302948, + "learning_rate": 8.455947384448926e-06, + "loss": 0.2815, + "step": 295 + }, + { + "epoch": 0.28, + "grad_norm": 0.6791868209838867, + "learning_rate": 8.444834595378434e-06, + "loss": 0.1917, + "step": 296 + }, + { + "epoch": 0.28, + "grad_norm": 0.786886990070343, + "learning_rate": 8.433689318791628e-06, + "loss": 0.2849, + "step": 297 + }, + { + "epoch": 0.28, + "grad_norm": 0.7145736813545227, + "learning_rate": 8.42251165979728e-06, + "loss": 0.2544, + "step": 298 + }, + { + "epoch": 0.28, + "grad_norm": 0.6851241588592529, + "learning_rate": 8.411301723809563e-06, + "loss": 0.2292, + "step": 299 + }, + { + "epoch": 0.28, + "grad_norm": 0.7301487922668457, + "learning_rate": 8.400059616547046e-06, + "loss": 0.2454, + "step": 300 + }, + { + "epoch": 0.28, + "eval_loss": 0.24792622029781342, + "eval_runtime": 335.2342, + "eval_samples_per_second": 1.017, + "eval_steps_per_second": 0.257, + "step": 300 + }, + { + "epoch": 0.29, + "grad_norm": 0.6980499029159546, + "learning_rate": 8.388785444031695e-06, + "loss": 0.2847, + "step": 301 + }, + { + "epoch": 0.29, + "grad_norm": 0.6795322895050049, + "learning_rate": 8.37747931258788e-06, + "loss": 0.2189, + "step": 302 + }, + { + "epoch": 0.29, + "grad_norm": 0.6904706954956055, + "learning_rate": 8.366141328841367e-06, + "loss": 0.3023, + "step": 303 + }, + { + "epoch": 0.29, + "grad_norm": 0.6439961791038513, + "learning_rate": 8.354771599718313e-06, + "loss": 0.1895, + "step": 304 + }, + { + "epoch": 0.29, + "grad_norm": 0.7503674030303955, + "learning_rate": 8.34337023244426e-06, + "loss": 0.2955, + "step": 305 + }, + { + "epoch": 0.29, + "grad_norm": 0.7018287777900696, + "learning_rate": 8.331937334543132e-06, + "loss": 0.251, + "step": 306 + }, + { + "epoch": 0.29, + "grad_norm": 0.821648895740509, + "learning_rate": 8.320473013836197e-06, + "loss": 0.3457, + "step": 307 + }, + { + "epoch": 0.29, + "grad_norm": 0.7417198419570923, + "learning_rate": 8.308977378441072e-06, + "loss": 0.3358, + "step": 308 + }, + { + "epoch": 0.29, + "grad_norm": 0.6748823523521423, + "learning_rate": 8.297450536770697e-06, + "loss": 0.2351, + "step": 309 + }, + { + "epoch": 0.29, + "grad_norm": 0.7352398633956909, + "learning_rate": 8.285892597532311e-06, + "loss": 0.2514, + "step": 310 + }, + { + "epoch": 0.29, + "grad_norm": 0.7595478892326355, + "learning_rate": 8.274303669726427e-06, + "loss": 0.2877, + "step": 311 + }, + { + "epoch": 0.3, + "grad_norm": 0.6498991847038269, + "learning_rate": 8.262683862645804e-06, + "loss": 0.2528, + "step": 312 + }, + { + "epoch": 0.3, + "grad_norm": 0.7450823783874512, + "learning_rate": 8.25103328587442e-06, + "loss": 0.269, + "step": 313 + }, + { + "epoch": 0.3, + "grad_norm": 0.7365174293518066, + "learning_rate": 8.239352049286435e-06, + "loss": 0.2763, + "step": 314 + }, + { + "epoch": 0.3, + "grad_norm": 0.6681867837905884, + "learning_rate": 8.22764026304515e-06, + "loss": 0.2462, + "step": 315 + }, + { + "epoch": 0.3, + "grad_norm": 0.7543831467628479, + "learning_rate": 8.215898037601981e-06, + "loss": 0.294, + "step": 316 + }, + { + "epoch": 0.3, + "grad_norm": 0.784321129322052, + "learning_rate": 8.204125483695403e-06, + "loss": 0.2842, + "step": 317 + }, + { + "epoch": 0.3, + "grad_norm": 0.6397828459739685, + "learning_rate": 8.192322712349917e-06, + "loss": 0.2023, + "step": 318 + }, + { + "epoch": 0.3, + "grad_norm": 0.815455436706543, + "learning_rate": 8.180489834875e-06, + "loss": 0.2827, + "step": 319 + }, + { + "epoch": 0.3, + "grad_norm": 0.6677094101905823, + "learning_rate": 8.168626962864045e-06, + "loss": 0.2163, + "step": 320 + }, + { + "epoch": 0.3, + "grad_norm": 0.7844486832618713, + "learning_rate": 8.156734208193327e-06, + "loss": 0.261, + "step": 321 + }, + { + "epoch": 0.31, + "grad_norm": 0.7194960713386536, + "learning_rate": 8.144811683020932e-06, + "loss": 0.2892, + "step": 322 + }, + { + "epoch": 0.31, + "grad_norm": 0.6769686341285706, + "learning_rate": 8.132859499785708e-06, + "loss": 0.2651, + "step": 323 + }, + { + "epoch": 0.31, + "grad_norm": 0.7005006074905396, + "learning_rate": 8.120877771206201e-06, + "loss": 0.268, + "step": 324 + }, + { + "epoch": 0.31, + "grad_norm": 0.8526335954666138, + "learning_rate": 8.108866610279595e-06, + "loss": 0.3551, + "step": 325 + }, + { + "epoch": 0.31, + "grad_norm": 0.6851252317428589, + "learning_rate": 8.09682613028064e-06, + "loss": 0.2551, + "step": 326 + }, + { + "epoch": 0.31, + "grad_norm": 0.7191782593727112, + "learning_rate": 8.08475644476059e-06, + "loss": 0.3117, + "step": 327 + }, + { + "epoch": 0.31, + "grad_norm": 0.7180798053741455, + "learning_rate": 8.072657667546136e-06, + "loss": 0.3054, + "step": 328 + }, + { + "epoch": 0.31, + "grad_norm": 0.6469159722328186, + "learning_rate": 8.060529912738316e-06, + "loss": 0.2902, + "step": 329 + }, + { + "epoch": 0.31, + "grad_norm": 0.7266992330551147, + "learning_rate": 8.048373294711455e-06, + "loss": 0.2849, + "step": 330 + }, + { + "epoch": 0.31, + "grad_norm": 0.6951152086257935, + "learning_rate": 8.036187928112087e-06, + "loss": 0.2612, + "step": 331 + }, + { + "epoch": 0.31, + "grad_norm": 0.8010523915290833, + "learning_rate": 8.023973927857857e-06, + "loss": 0.3538, + "step": 332 + }, + { + "epoch": 0.32, + "grad_norm": 0.7954149842262268, + "learning_rate": 8.011731409136454e-06, + "loss": 0.305, + "step": 333 + }, + { + "epoch": 0.32, + "grad_norm": 0.7332515716552734, + "learning_rate": 7.99946048740452e-06, + "loss": 0.232, + "step": 334 + }, + { + "epoch": 0.32, + "grad_norm": 0.799700140953064, + "learning_rate": 7.987161278386555e-06, + "loss": 0.3147, + "step": 335 + }, + { + "epoch": 0.32, + "grad_norm": 0.6455225348472595, + "learning_rate": 7.974833898073832e-06, + "loss": 0.2652, + "step": 336 + }, + { + "epoch": 0.32, + "grad_norm": 0.6520224809646606, + "learning_rate": 7.962478462723306e-06, + "loss": 0.2673, + "step": 337 + }, + { + "epoch": 0.32, + "grad_norm": 0.6774414777755737, + "learning_rate": 7.950095088856509e-06, + "loss": 0.287, + "step": 338 + }, + { + "epoch": 0.32, + "grad_norm": 0.5675615072250366, + "learning_rate": 7.937683893258454e-06, + "loss": 0.1705, + "step": 339 + }, + { + "epoch": 0.32, + "grad_norm": 0.7145261764526367, + "learning_rate": 7.925244992976538e-06, + "loss": 0.2907, + "step": 340 + }, + { + "epoch": 0.32, + "grad_norm": 0.6485288143157959, + "learning_rate": 7.912778505319436e-06, + "loss": 0.2452, + "step": 341 + }, + { + "epoch": 0.32, + "grad_norm": 0.5317831635475159, + "learning_rate": 7.900284547855992e-06, + "loss": 0.1613, + "step": 342 + }, + { + "epoch": 0.33, + "grad_norm": 0.7434039115905762, + "learning_rate": 7.88776323841411e-06, + "loss": 0.262, + "step": 343 + }, + { + "epoch": 0.33, + "grad_norm": 0.9552202224731445, + "learning_rate": 7.875214695079647e-06, + "loss": 0.3407, + "step": 344 + }, + { + "epoch": 0.33, + "grad_norm": 0.6228988766670227, + "learning_rate": 7.862639036195298e-06, + "loss": 0.1854, + "step": 345 + }, + { + "epoch": 0.33, + "grad_norm": 0.8214723467826843, + "learning_rate": 7.850036380359479e-06, + "loss": 0.3061, + "step": 346 + }, + { + "epoch": 0.33, + "grad_norm": 0.7715755105018616, + "learning_rate": 7.837406846425205e-06, + "loss": 0.3337, + "step": 347 + }, + { + "epoch": 0.33, + "grad_norm": 0.7069976329803467, + "learning_rate": 7.824750553498977e-06, + "loss": 0.3405, + "step": 348 + }, + { + "epoch": 0.33, + "grad_norm": 0.7876215577125549, + "learning_rate": 7.812067620939653e-06, + "loss": 0.2986, + "step": 349 + }, + { + "epoch": 0.33, + "grad_norm": 0.6568417549133301, + "learning_rate": 7.799358168357323e-06, + "loss": 0.2195, + "step": 350 + }, + { + "epoch": 0.33, + "grad_norm": 0.7959166765213013, + "learning_rate": 7.786622315612182e-06, + "loss": 0.2963, + "step": 351 + }, + { + "epoch": 0.33, + "grad_norm": 0.6602903604507446, + "learning_rate": 7.773860182813404e-06, + "loss": 0.1999, + "step": 352 + }, + { + "epoch": 0.33, + "grad_norm": 0.7031603455543518, + "learning_rate": 7.761071890317994e-06, + "loss": 0.2623, + "step": 353 + }, + { + "epoch": 0.34, + "grad_norm": 0.6833639740943909, + "learning_rate": 7.748257558729677e-06, + "loss": 0.2387, + "step": 354 + }, + { + "epoch": 0.34, + "grad_norm": 0.6679258942604065, + "learning_rate": 7.735417308897737e-06, + "loss": 0.2951, + "step": 355 + }, + { + "epoch": 0.34, + "grad_norm": 0.6927437782287598, + "learning_rate": 7.72255126191589e-06, + "loss": 0.2516, + "step": 356 + }, + { + "epoch": 0.34, + "grad_norm": 0.7067151665687561, + "learning_rate": 7.709659539121144e-06, + "loss": 0.2554, + "step": 357 + }, + { + "epoch": 0.34, + "grad_norm": 0.6713438630104065, + "learning_rate": 7.696742262092643e-06, + "loss": 0.2542, + "step": 358 + }, + { + "epoch": 0.34, + "grad_norm": 0.5410385131835938, + "learning_rate": 7.683799552650534e-06, + "loss": 0.1874, + "step": 359 + }, + { + "epoch": 0.34, + "grad_norm": 0.6991607546806335, + "learning_rate": 7.670831532854811e-06, + "loss": 0.1918, + "step": 360 + }, + { + "epoch": 0.34, + "grad_norm": 0.5746275782585144, + "learning_rate": 7.65783832500416e-06, + "loss": 0.185, + "step": 361 + }, + { + "epoch": 0.34, + "grad_norm": 0.6405538320541382, + "learning_rate": 7.644820051634813e-06, + "loss": 0.2428, + "step": 362 + }, + { + "epoch": 0.34, + "grad_norm": 0.7153875827789307, + "learning_rate": 7.63177683551939e-06, + "loss": 0.2754, + "step": 363 + }, + { + "epoch": 0.34, + "grad_norm": 0.6949155330657959, + "learning_rate": 7.618708799665745e-06, + "loss": 0.238, + "step": 364 + }, + { + "epoch": 0.35, + "grad_norm": 0.761519730091095, + "learning_rate": 7.605616067315793e-06, + "loss": 0.2771, + "step": 365 + }, + { + "epoch": 0.35, + "grad_norm": 0.7326161861419678, + "learning_rate": 7.592498761944363e-06, + "loss": 0.177, + "step": 366 + }, + { + "epoch": 0.35, + "grad_norm": 0.5456448793411255, + "learning_rate": 7.579357007258022e-06, + "loss": 0.1321, + "step": 367 + }, + { + "epoch": 0.35, + "grad_norm": 0.6762471199035645, + "learning_rate": 7.56619092719392e-06, + "loss": 0.2127, + "step": 368 + }, + { + "epoch": 0.35, + "grad_norm": 0.673669695854187, + "learning_rate": 7.5530006459186115e-06, + "loss": 0.1863, + "step": 369 + }, + { + "epoch": 0.35, + "grad_norm": 0.5625087022781372, + "learning_rate": 7.539786287826885e-06, + "loss": 0.1529, + "step": 370 + }, + { + "epoch": 0.35, + "grad_norm": 0.6047581434249878, + "learning_rate": 7.526547977540592e-06, + "loss": 0.1931, + "step": 371 + }, + { + "epoch": 0.35, + "grad_norm": 0.6393082737922668, + "learning_rate": 7.51328583990748e-06, + "loss": 0.2069, + "step": 372 + }, + { + "epoch": 0.35, + "grad_norm": 0.7750303745269775, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2369, + "step": 373 + }, + { + "epoch": 0.35, + "grad_norm": 0.6093407273292542, + "learning_rate": 7.486690583114137e-06, + "loss": 0.185, + "step": 374 + }, + { + "epoch": 0.36, + "grad_norm": 0.7837387919425964, + "learning_rate": 7.473357714768222e-06, + "loss": 0.3191, + "step": 375 + }, + { + "epoch": 0.36, + "grad_norm": 0.8034994602203369, + "learning_rate": 7.460001520701756e-06, + "loss": 0.3518, + "step": 376 + }, + { + "epoch": 0.36, + "grad_norm": 0.7601478099822998, + "learning_rate": 7.446622126874219e-06, + "loss": 0.2954, + "step": 377 + }, + { + "epoch": 0.36, + "grad_norm": 0.7800700068473816, + "learning_rate": 7.4332196594638815e-06, + "loss": 0.2649, + "step": 378 + }, + { + "epoch": 0.36, + "grad_norm": 0.8061292171478271, + "learning_rate": 7.419794244866619e-06, + "loss": 0.2895, + "step": 379 + }, + { + "epoch": 0.36, + "grad_norm": 0.6951506733894348, + "learning_rate": 7.406346009694713e-06, + "loss": 0.2163, + "step": 380 + }, + { + "epoch": 0.36, + "grad_norm": 0.575623631477356, + "learning_rate": 7.3928750807756656e-06, + "loss": 0.207, + "step": 381 + }, + { + "epoch": 0.36, + "grad_norm": 0.7265523672103882, + "learning_rate": 7.379381585150997e-06, + "loss": 0.2537, + "step": 382 + }, + { + "epoch": 0.36, + "grad_norm": 0.6455715298652649, + "learning_rate": 7.365865650075046e-06, + "loss": 0.2211, + "step": 383 + }, + { + "epoch": 0.36, + "grad_norm": 0.5819538831710815, + "learning_rate": 7.352327403013779e-06, + "loss": 0.1808, + "step": 384 + }, + { + "epoch": 0.36, + "grad_norm": 0.7239487171173096, + "learning_rate": 7.338766971643579e-06, + "loss": 0.2434, + "step": 385 + }, + { + "epoch": 0.37, + "grad_norm": 0.7268589735031128, + "learning_rate": 7.325184483850043e-06, + "loss": 0.2462, + "step": 386 + }, + { + "epoch": 0.37, + "grad_norm": 0.6222095489501953, + "learning_rate": 7.311580067726783e-06, + "loss": 0.192, + "step": 387 + }, + { + "epoch": 0.37, + "grad_norm": 0.6951655745506287, + "learning_rate": 7.297953851574207e-06, + "loss": 0.2471, + "step": 388 + }, + { + "epoch": 0.37, + "grad_norm": 0.8148622512817383, + "learning_rate": 7.284305963898315e-06, + "loss": 0.3615, + "step": 389 + }, + { + "epoch": 0.37, + "grad_norm": 0.6384522318840027, + "learning_rate": 7.270636533409491e-06, + "loss": 0.2832, + "step": 390 + }, + { + "epoch": 0.37, + "grad_norm": 0.7164002656936646, + "learning_rate": 7.25694568902128e-06, + "loss": 0.2781, + "step": 391 + }, + { + "epoch": 0.37, + "grad_norm": 0.617064893245697, + "learning_rate": 7.243233559849179e-06, + "loss": 0.1868, + "step": 392 + }, + { + "epoch": 0.37, + "grad_norm": 0.702542245388031, + "learning_rate": 7.229500275209418e-06, + "loss": 0.2169, + "step": 393 + }, + { + "epoch": 0.37, + "grad_norm": 0.7396453022956848, + "learning_rate": 7.215745964617737e-06, + "loss": 0.2465, + "step": 394 + }, + { + "epoch": 0.37, + "grad_norm": 0.7013355493545532, + "learning_rate": 7.201970757788172e-06, + "loss": 0.2857, + "step": 395 + }, + { + "epoch": 0.38, + "grad_norm": 0.6131812334060669, + "learning_rate": 7.188174784631824e-06, + "loss": 0.2012, + "step": 396 + }, + { + "epoch": 0.38, + "grad_norm": 0.7121132612228394, + "learning_rate": 7.174358175255636e-06, + "loss": 0.2295, + "step": 397 + }, + { + "epoch": 0.38, + "grad_norm": 0.6988712549209595, + "learning_rate": 7.160521059961169e-06, + "loss": 0.2304, + "step": 398 + }, + { + "epoch": 0.38, + "grad_norm": 0.5755978226661682, + "learning_rate": 7.14666356924337e-06, + "loss": 0.1861, + "step": 399 + }, + { + "epoch": 0.38, + "grad_norm": 0.7269181609153748, + "learning_rate": 7.132785833789344e-06, + "loss": 0.3132, + "step": 400 + }, + { + "epoch": 0.38, + "eval_loss": 0.24499449133872986, + "eval_runtime": 349.1649, + "eval_samples_per_second": 0.977, + "eval_steps_per_second": 0.246, + "step": 400 + }, + { + "epoch": 0.38, + "grad_norm": 0.6201484799385071, + "learning_rate": 7.118887984477116e-06, + "loss": 0.2389, + "step": 401 + }, + { + "epoch": 0.38, + "grad_norm": 0.629365086555481, + "learning_rate": 7.104970152374405e-06, + "loss": 0.2177, + "step": 402 + }, + { + "epoch": 0.38, + "grad_norm": 0.6488037109375, + "learning_rate": 7.091032468737382e-06, + "loss": 0.2393, + "step": 403 + }, + { + "epoch": 0.38, + "grad_norm": 0.8756319284439087, + "learning_rate": 7.0770750650094335e-06, + "loss": 0.2161, + "step": 404 + }, + { + "epoch": 0.38, + "grad_norm": 0.6746820211410522, + "learning_rate": 7.063098072819919e-06, + "loss": 0.2629, + "step": 405 + }, + { + "epoch": 0.38, + "grad_norm": 0.6890723705291748, + "learning_rate": 7.049101623982938e-06, + "loss": 0.2121, + "step": 406 + }, + { + "epoch": 0.39, + "grad_norm": 0.6225735545158386, + "learning_rate": 7.035085850496079e-06, + "loss": 0.2051, + "step": 407 + }, + { + "epoch": 0.39, + "grad_norm": 0.7883545756340027, + "learning_rate": 7.021050884539178e-06, + "loss": 0.2666, + "step": 408 + }, + { + "epoch": 0.39, + "grad_norm": 0.6250419020652771, + "learning_rate": 7.006996858473068e-06, + "loss": 0.2177, + "step": 409 + }, + { + "epoch": 0.39, + "grad_norm": 0.678107738494873, + "learning_rate": 6.992923904838341e-06, + "loss": 0.2276, + "step": 410 + }, + { + "epoch": 0.39, + "grad_norm": 0.6031367778778076, + "learning_rate": 6.97883215635408e-06, + "loss": 0.2049, + "step": 411 + }, + { + "epoch": 0.39, + "grad_norm": 0.6895279288291931, + "learning_rate": 6.96472174591663e-06, + "loss": 0.2755, + "step": 412 + }, + { + "epoch": 0.39, + "grad_norm": 0.6867420077323914, + "learning_rate": 6.9505928065983275e-06, + "loss": 0.2522, + "step": 413 + }, + { + "epoch": 0.39, + "grad_norm": 0.5972843766212463, + "learning_rate": 6.936445471646249e-06, + "loss": 0.1986, + "step": 414 + }, + { + "epoch": 0.39, + "grad_norm": 0.6085368394851685, + "learning_rate": 6.922279874480959e-06, + "loss": 0.2766, + "step": 415 + }, + { + "epoch": 0.39, + "grad_norm": 0.6487507224082947, + "learning_rate": 6.908096148695251e-06, + "loss": 0.2687, + "step": 416 + }, + { + "epoch": 0.4, + "grad_norm": 0.6461302638053894, + "learning_rate": 6.893894428052881e-06, + "loss": 0.2268, + "step": 417 + }, + { + "epoch": 0.4, + "grad_norm": 0.6626706719398499, + "learning_rate": 6.879674846487314e-06, + "loss": 0.2386, + "step": 418 + }, + { + "epoch": 0.4, + "grad_norm": 0.7022949457168579, + "learning_rate": 6.865437538100456e-06, + "loss": 0.2843, + "step": 419 + }, + { + "epoch": 0.4, + "grad_norm": 0.6765186786651611, + "learning_rate": 6.8511826371613955e-06, + "loss": 0.1828, + "step": 420 + }, + { + "epoch": 0.4, + "grad_norm": 0.661516547203064, + "learning_rate": 6.836910278105124e-06, + "loss": 0.2638, + "step": 421 + }, + { + "epoch": 0.4, + "grad_norm": 0.7024251818656921, + "learning_rate": 6.822620595531286e-06, + "loss": 0.2964, + "step": 422 + }, + { + "epoch": 0.4, + "grad_norm": 0.6266246438026428, + "learning_rate": 6.808313724202894e-06, + "loss": 0.2143, + "step": 423 + }, + { + "epoch": 0.4, + "grad_norm": 0.5504724979400635, + "learning_rate": 6.793989799045067e-06, + "loss": 0.1904, + "step": 424 + }, + { + "epoch": 0.4, + "grad_norm": 0.7041710019111633, + "learning_rate": 6.779648955143754e-06, + "loss": 0.2613, + "step": 425 + }, + { + "epoch": 0.4, + "grad_norm": 0.7407920360565186, + "learning_rate": 6.765291327744463e-06, + "loss": 0.2751, + "step": 426 + }, + { + "epoch": 0.4, + "grad_norm": 0.6743589043617249, + "learning_rate": 6.750917052250981e-06, + "loss": 0.2073, + "step": 427 + }, + { + "epoch": 0.41, + "grad_norm": 0.6536813378334045, + "learning_rate": 6.736526264224101e-06, + "loss": 0.2596, + "step": 428 + }, + { + "epoch": 0.41, + "grad_norm": 0.669067919254303, + "learning_rate": 6.722119099380345e-06, + "loss": 0.2643, + "step": 429 + }, + { + "epoch": 0.41, + "grad_norm": 0.6680570840835571, + "learning_rate": 6.7076956935906756e-06, + "loss": 0.188, + "step": 430 + }, + { + "epoch": 0.41, + "grad_norm": 0.5632251501083374, + "learning_rate": 6.693256182879224e-06, + "loss": 0.1931, + "step": 431 + }, + { + "epoch": 0.41, + "grad_norm": 0.7128719091415405, + "learning_rate": 6.678800703422004e-06, + "loss": 0.3227, + "step": 432 + }, + { + "epoch": 0.41, + "grad_norm": 0.7528271675109863, + "learning_rate": 6.664329391545625e-06, + "loss": 0.3102, + "step": 433 + }, + { + "epoch": 0.41, + "grad_norm": 0.7777692079544067, + "learning_rate": 6.649842383726011e-06, + "loss": 0.3419, + "step": 434 + }, + { + "epoch": 0.41, + "grad_norm": 0.6767750978469849, + "learning_rate": 6.635339816587109e-06, + "loss": 0.2562, + "step": 435 + }, + { + "epoch": 0.41, + "grad_norm": 0.8037603497505188, + "learning_rate": 6.620821826899606e-06, + "loss": 0.3355, + "step": 436 + }, + { + "epoch": 0.41, + "grad_norm": 0.7408990859985352, + "learning_rate": 6.606288551579629e-06, + "loss": 0.2344, + "step": 437 + }, + { + "epoch": 0.42, + "grad_norm": 0.7058804035186768, + "learning_rate": 6.59174012768747e-06, + "loss": 0.3084, + "step": 438 + }, + { + "epoch": 0.42, + "grad_norm": 0.6666316390037537, + "learning_rate": 6.5771766924262795e-06, + "loss": 0.276, + "step": 439 + }, + { + "epoch": 0.42, + "grad_norm": 0.7084954977035522, + "learning_rate": 6.562598383140773e-06, + "loss": 0.2705, + "step": 440 + }, + { + "epoch": 0.42, + "grad_norm": 0.6333549618721008, + "learning_rate": 6.548005337315943e-06, + "loss": 0.2308, + "step": 441 + }, + { + "epoch": 0.42, + "grad_norm": 0.5120007991790771, + "learning_rate": 6.533397692575766e-06, + "loss": 0.1273, + "step": 442 + }, + { + "epoch": 0.42, + "grad_norm": 0.8390111327171326, + "learning_rate": 6.518775586681887e-06, + "loss": 0.3499, + "step": 443 + }, + { + "epoch": 0.42, + "grad_norm": 0.647445797920227, + "learning_rate": 6.504139157532338e-06, + "loss": 0.2331, + "step": 444 + }, + { + "epoch": 0.42, + "grad_norm": 0.6378818154335022, + "learning_rate": 6.489488543160225e-06, + "loss": 0.1831, + "step": 445 + }, + { + "epoch": 0.42, + "grad_norm": 0.6484649777412415, + "learning_rate": 6.4748238817324395e-06, + "loss": 0.2106, + "step": 446 + }, + { + "epoch": 0.42, + "grad_norm": 1.051329255104065, + "learning_rate": 6.460145311548341e-06, + "loss": 0.3166, + "step": 447 + }, + { + "epoch": 0.42, + "grad_norm": 0.7134484648704529, + "learning_rate": 6.445452971038464e-06, + "loss": 0.2727, + "step": 448 + }, + { + "epoch": 0.43, + "grad_norm": 0.7991112470626831, + "learning_rate": 6.430746998763204e-06, + "loss": 0.3293, + "step": 449 + }, + { + "epoch": 0.43, + "grad_norm": 0.7408881187438965, + "learning_rate": 6.41602753341152e-06, + "loss": 0.3266, + "step": 450 + }, + { + "epoch": 0.43, + "grad_norm": 0.9650523662567139, + "learning_rate": 6.4012947137996175e-06, + "loss": 0.3552, + "step": 451 + }, + { + "epoch": 0.43, + "grad_norm": 0.6928345561027527, + "learning_rate": 6.386548678869644e-06, + "loss": 0.2326, + "step": 452 + }, + { + "epoch": 0.43, + "grad_norm": 0.5790258646011353, + "learning_rate": 6.37178956768838e-06, + "loss": 0.1952, + "step": 453 + }, + { + "epoch": 0.43, + "grad_norm": 0.6764933466911316, + "learning_rate": 6.3570175194459205e-06, + "loss": 0.2527, + "step": 454 + }, + { + "epoch": 0.43, + "grad_norm": 0.6727444529533386, + "learning_rate": 6.342232673454371e-06, + "loss": 0.2522, + "step": 455 + }, + { + "epoch": 0.43, + "grad_norm": 0.5731246471405029, + "learning_rate": 6.3274351691465305e-06, + "loss": 0.1655, + "step": 456 + }, + { + "epoch": 0.43, + "grad_norm": 0.7887910604476929, + "learning_rate": 6.312625146074574e-06, + "loss": 0.2716, + "step": 457 + }, + { + "epoch": 0.43, + "grad_norm": 0.6423369646072388, + "learning_rate": 6.2978027439087405e-06, + "loss": 0.2245, + "step": 458 + }, + { + "epoch": 0.43, + "grad_norm": 0.7198876738548279, + "learning_rate": 6.28296810243601e-06, + "loss": 0.2578, + "step": 459 + }, + { + "epoch": 0.44, + "grad_norm": 0.5674285888671875, + "learning_rate": 6.268121361558792e-06, + "loss": 0.1801, + "step": 460 + }, + { + "epoch": 0.44, + "grad_norm": 0.7052492499351501, + "learning_rate": 6.2532626612936035e-06, + "loss": 0.3003, + "step": 461 + }, + { + "epoch": 0.44, + "grad_norm": 0.6291772127151489, + "learning_rate": 6.238392141769743e-06, + "loss": 0.1771, + "step": 462 + }, + { + "epoch": 0.44, + "grad_norm": 0.7111222743988037, + "learning_rate": 6.22350994322798e-06, + "loss": 0.3175, + "step": 463 + }, + { + "epoch": 0.44, + "grad_norm": 0.5738310217857361, + "learning_rate": 6.208616206019225e-06, + "loss": 0.1926, + "step": 464 + }, + { + "epoch": 0.44, + "grad_norm": 0.6483380198478699, + "learning_rate": 6.193711070603202e-06, + "loss": 0.2267, + "step": 465 + }, + { + "epoch": 0.44, + "grad_norm": 0.6434422135353088, + "learning_rate": 6.178794677547138e-06, + "loss": 0.215, + "step": 466 + }, + { + "epoch": 0.44, + "grad_norm": 0.7429325580596924, + "learning_rate": 6.163867167524419e-06, + "loss": 0.2847, + "step": 467 + }, + { + "epoch": 0.44, + "grad_norm": 0.692653477191925, + "learning_rate": 6.14892868131328e-06, + "loss": 0.2467, + "step": 468 + }, + { + "epoch": 0.44, + "grad_norm": 0.594670295715332, + "learning_rate": 6.1339793597954675e-06, + "loss": 0.187, + "step": 469 + }, + { + "epoch": 0.45, + "grad_norm": 0.7211235165596008, + "learning_rate": 6.119019343954914e-06, + "loss": 0.2295, + "step": 470 + }, + { + "epoch": 0.45, + "grad_norm": 0.6931540369987488, + "learning_rate": 6.104048774876407e-06, + "loss": 0.2345, + "step": 471 + }, + { + "epoch": 0.45, + "grad_norm": 0.7969695925712585, + "learning_rate": 6.089067793744258e-06, + "loss": 0.3256, + "step": 472 + }, + { + "epoch": 0.45, + "grad_norm": 0.8951885104179382, + "learning_rate": 6.074076541840978e-06, + "loss": 0.3994, + "step": 473 + }, + { + "epoch": 0.45, + "grad_norm": 0.5864188075065613, + "learning_rate": 6.059075160545933e-06, + "loss": 0.1807, + "step": 474 + }, + { + "epoch": 0.45, + "grad_norm": 0.7101827263832092, + "learning_rate": 6.044063791334023e-06, + "loss": 0.1999, + "step": 475 + }, + { + "epoch": 0.45, + "grad_norm": 0.689575731754303, + "learning_rate": 6.029042575774334e-06, + "loss": 0.2511, + "step": 476 + }, + { + "epoch": 0.45, + "grad_norm": 0.7901589274406433, + "learning_rate": 6.01401165552882e-06, + "loss": 0.2774, + "step": 477 + }, + { + "epoch": 0.45, + "grad_norm": 0.7189447283744812, + "learning_rate": 5.998971172350953e-06, + "loss": 0.2859, + "step": 478 + }, + { + "epoch": 0.45, + "grad_norm": 0.6440539360046387, + "learning_rate": 5.9839212680843925e-06, + "loss": 0.1909, + "step": 479 + }, + { + "epoch": 0.45, + "grad_norm": 0.8534595370292664, + "learning_rate": 5.968862084661643e-06, + "loss": 0.3271, + "step": 480 + }, + { + "epoch": 0.46, + "grad_norm": 0.5987100005149841, + "learning_rate": 5.9537937641027225e-06, + "loss": 0.2241, + "step": 481 + }, + { + "epoch": 0.46, + "grad_norm": 0.6581840515136719, + "learning_rate": 5.938716448513819e-06, + "loss": 0.2265, + "step": 482 + }, + { + "epoch": 0.46, + "grad_norm": 0.9717352390289307, + "learning_rate": 5.923630280085948e-06, + "loss": 0.371, + "step": 483 + }, + { + "epoch": 0.46, + "grad_norm": 0.7156874537467957, + "learning_rate": 5.908535401093618e-06, + "loss": 0.3291, + "step": 484 + }, + { + "epoch": 0.46, + "grad_norm": 0.6929928064346313, + "learning_rate": 5.893431953893483e-06, + "loss": 0.2386, + "step": 485 + }, + { + "epoch": 0.46, + "grad_norm": 0.6523422002792358, + "learning_rate": 5.878320080923001e-06, + "loss": 0.205, + "step": 486 + }, + { + "epoch": 0.46, + "grad_norm": 0.8255969882011414, + "learning_rate": 5.8631999246990954e-06, + "loss": 0.2516, + "step": 487 + }, + { + "epoch": 0.46, + "grad_norm": 0.8311260342597961, + "learning_rate": 5.848071627816804e-06, + "loss": 0.3562, + "step": 488 + }, + { + "epoch": 0.46, + "grad_norm": 0.7574479579925537, + "learning_rate": 5.832935332947937e-06, + "loss": 0.2694, + "step": 489 + }, + { + "epoch": 0.46, + "grad_norm": 0.6098451614379883, + "learning_rate": 5.817791182839734e-06, + "loss": 0.2251, + "step": 490 + }, + { + "epoch": 0.47, + "grad_norm": 0.7459378838539124, + "learning_rate": 5.8026393203135145e-06, + "loss": 0.2995, + "step": 491 + }, + { + "epoch": 0.47, + "grad_norm": 0.7321155071258545, + "learning_rate": 5.787479888263333e-06, + "loss": 0.2579, + "step": 492 + }, + { + "epoch": 0.47, + "grad_norm": 0.7155848145484924, + "learning_rate": 5.772313029654631e-06, + "loss": 0.2231, + "step": 493 + }, + { + "epoch": 0.47, + "grad_norm": 0.7132194638252258, + "learning_rate": 5.757138887522884e-06, + "loss": 0.2826, + "step": 494 + }, + { + "epoch": 0.47, + "grad_norm": 0.6572800874710083, + "learning_rate": 5.741957604972264e-06, + "loss": 0.2229, + "step": 495 + }, + { + "epoch": 0.47, + "grad_norm": 0.6408301591873169, + "learning_rate": 5.726769325174279e-06, + "loss": 0.237, + "step": 496 + }, + { + "epoch": 0.47, + "grad_norm": 0.7460103631019592, + "learning_rate": 5.711574191366427e-06, + "loss": 0.2454, + "step": 497 + }, + { + "epoch": 0.47, + "grad_norm": 0.7643551826477051, + "learning_rate": 5.696372346850842e-06, + "loss": 0.3264, + "step": 498 + }, + { + "epoch": 0.47, + "grad_norm": 0.6984466314315796, + "learning_rate": 5.68116393499295e-06, + "loss": 0.2581, + "step": 499 + }, + { + "epoch": 0.47, + "grad_norm": 0.5989054441452026, + "learning_rate": 5.66594909922011e-06, + "loss": 0.1823, + "step": 500 + }, + { + "epoch": 0.47, + "eval_loss": 0.24097025394439697, + "eval_runtime": 381.7737, + "eval_samples_per_second": 0.893, + "eval_steps_per_second": 0.225, + "step": 500 + }, + { + "epoch": 0.47, + "grad_norm": 0.8489783406257629, + "learning_rate": 5.650727983020262e-06, + "loss": 0.3858, + "step": 501 + }, + { + "epoch": 0.48, + "grad_norm": 0.5319824814796448, + "learning_rate": 5.635500729940578e-06, + "loss": 0.1428, + "step": 502 + }, + { + "epoch": 0.48, + "grad_norm": 0.7632309198379517, + "learning_rate": 5.6202674835861045e-06, + "loss": 0.3202, + "step": 503 + }, + { + "epoch": 0.48, + "grad_norm": 0.7297234535217285, + "learning_rate": 5.605028387618412e-06, + "loss": 0.2656, + "step": 504 + }, + { + "epoch": 0.48, + "grad_norm": 0.8261827230453491, + "learning_rate": 5.5897835857542315e-06, + "loss": 0.3094, + "step": 505 + }, + { + "epoch": 0.48, + "grad_norm": 0.6307250261306763, + "learning_rate": 5.574533221764109e-06, + "loss": 0.2165, + "step": 506 + }, + { + "epoch": 0.48, + "grad_norm": 0.6916859745979309, + "learning_rate": 5.559277439471047e-06, + "loss": 0.2101, + "step": 507 + }, + { + "epoch": 0.48, + "grad_norm": 0.6984620690345764, + "learning_rate": 5.544016382749146e-06, + "loss": 0.2769, + "step": 508 + }, + { + "epoch": 0.48, + "grad_norm": 0.6447324156761169, + "learning_rate": 5.528750195522244e-06, + "loss": 0.211, + "step": 509 + }, + { + "epoch": 0.48, + "grad_norm": 0.6922134757041931, + "learning_rate": 5.513479021762573e-06, + "loss": 0.2479, + "step": 510 + }, + { + "epoch": 0.48, + "grad_norm": 0.6607391238212585, + "learning_rate": 5.498203005489378e-06, + "loss": 0.1672, + "step": 511 + }, + { + "epoch": 0.49, + "grad_norm": 0.7346232533454895, + "learning_rate": 5.4829222907675895e-06, + "loss": 0.2102, + "step": 512 + }, + { + "epoch": 0.49, + "grad_norm": 0.6713115572929382, + "learning_rate": 5.467637021706438e-06, + "loss": 0.2303, + "step": 513 + }, + { + "epoch": 0.49, + "grad_norm": 0.8098154067993164, + "learning_rate": 5.4523473424581045e-06, + "loss": 0.2906, + "step": 514 + }, + { + "epoch": 0.49, + "grad_norm": 0.8032150864601135, + "learning_rate": 5.437053397216364e-06, + "loss": 0.2855, + "step": 515 + }, + { + "epoch": 0.49, + "grad_norm": 0.7085696458816528, + "learning_rate": 5.421755330215223e-06, + "loss": 0.2357, + "step": 516 + }, + { + "epoch": 0.49, + "grad_norm": 0.8494563102722168, + "learning_rate": 5.4064532857275645e-06, + "loss": 0.2773, + "step": 517 + }, + { + "epoch": 0.49, + "grad_norm": 0.7258232235908508, + "learning_rate": 5.3911474080637705e-06, + "loss": 0.2357, + "step": 518 + }, + { + "epoch": 0.49, + "grad_norm": 0.6795754432678223, + "learning_rate": 5.3758378415703825e-06, + "loss": 0.1905, + "step": 519 + }, + { + "epoch": 0.49, + "grad_norm": 0.8729014992713928, + "learning_rate": 5.3605247306287275e-06, + "loss": 0.3643, + "step": 520 + }, + { + "epoch": 0.49, + "grad_norm": 0.6851757764816284, + "learning_rate": 5.345208219653562e-06, + "loss": 0.2186, + "step": 521 + }, + { + "epoch": 0.49, + "grad_norm": 0.7230455875396729, + "learning_rate": 5.329888453091701e-06, + "loss": 0.2538, + "step": 522 + }, + { + "epoch": 0.5, + "grad_norm": 0.6683774590492249, + "learning_rate": 5.314565575420671e-06, + "loss": 0.1929, + "step": 523 + }, + { + "epoch": 0.5, + "grad_norm": 0.6849011778831482, + "learning_rate": 5.299239731147332e-06, + "loss": 0.1494, + "step": 524 + }, + { + "epoch": 0.5, + "grad_norm": 0.8139472603797913, + "learning_rate": 5.283911064806522e-06, + "loss": 0.2992, + "step": 525 + }, + { + "epoch": 0.5, + "grad_norm": 0.709804117679596, + "learning_rate": 5.268579720959698e-06, + "loss": 0.2426, + "step": 526 + }, + { + "epoch": 0.5, + "grad_norm": 0.7699583172798157, + "learning_rate": 5.253245844193564e-06, + "loss": 0.2775, + "step": 527 + }, + { + "epoch": 0.5, + "grad_norm": 0.880550742149353, + "learning_rate": 5.237909579118713e-06, + "loss": 0.274, + "step": 528 + }, + { + "epoch": 0.5, + "grad_norm": 0.7186005115509033, + "learning_rate": 5.222571070368258e-06, + "loss": 0.2439, + "step": 529 + }, + { + "epoch": 0.5, + "grad_norm": 0.6252779364585876, + "learning_rate": 5.2072304625964785e-06, + "loss": 0.1902, + "step": 530 + }, + { + "epoch": 0.5, + "grad_norm": 0.7325754761695862, + "learning_rate": 5.191887900477444e-06, + "loss": 0.2849, + "step": 531 + }, + { + "epoch": 0.5, + "grad_norm": 0.7283653616905212, + "learning_rate": 5.176543528703657e-06, + "loss": 0.241, + "step": 532 + }, + { + "epoch": 0.51, + "grad_norm": 0.6465723514556885, + "learning_rate": 5.161197491984684e-06, + "loss": 0.2358, + "step": 533 + }, + { + "epoch": 0.51, + "grad_norm": 0.5979306697845459, + "learning_rate": 5.1458499350458e-06, + "loss": 0.1647, + "step": 534 + }, + { + "epoch": 0.51, + "grad_norm": 0.8471933007240295, + "learning_rate": 5.130501002626609e-06, + "loss": 0.4086, + "step": 535 + }, + { + "epoch": 0.51, + "grad_norm": 0.6678841710090637, + "learning_rate": 5.11515083947969e-06, + "loss": 0.2123, + "step": 536 + }, + { + "epoch": 0.51, + "grad_norm": 0.7951354384422302, + "learning_rate": 5.099799590369231e-06, + "loss": 0.251, + "step": 537 + }, + { + "epoch": 0.51, + "grad_norm": 0.7659385800361633, + "learning_rate": 5.084447400069656e-06, + "loss": 0.305, + "step": 538 + }, + { + "epoch": 0.51, + "grad_norm": 0.703631579875946, + "learning_rate": 5.069094413364272e-06, + "loss": 0.2384, + "step": 539 + }, + { + "epoch": 0.51, + "grad_norm": 0.8557327389717102, + "learning_rate": 5.053740775043891e-06, + "loss": 0.2994, + "step": 540 + }, + { + "epoch": 0.51, + "grad_norm": 0.78694087266922, + "learning_rate": 5.038386629905475e-06, + "loss": 0.2705, + "step": 541 + }, + { + "epoch": 0.51, + "grad_norm": 0.7444816827774048, + "learning_rate": 5.0230321227507595e-06, + "loss": 0.2931, + "step": 542 + }, + { + "epoch": 0.51, + "grad_norm": 0.6637994050979614, + "learning_rate": 5.007677398384902e-06, + "loss": 0.2023, + "step": 543 + }, + { + "epoch": 0.52, + "grad_norm": 0.795734167098999, + "learning_rate": 4.992322601615101e-06, + "loss": 0.2727, + "step": 544 + }, + { + "epoch": 0.52, + "grad_norm": 0.7429494261741638, + "learning_rate": 4.976967877249242e-06, + "loss": 0.2611, + "step": 545 + }, + { + "epoch": 0.52, + "grad_norm": 0.6305847764015198, + "learning_rate": 4.961613370094526e-06, + "loss": 0.2316, + "step": 546 + }, + { + "epoch": 0.52, + "grad_norm": 0.7047324776649475, + "learning_rate": 4.9462592249561095e-06, + "loss": 0.289, + "step": 547 + }, + { + "epoch": 0.52, + "grad_norm": 0.7027899622917175, + "learning_rate": 4.93090558663573e-06, + "loss": 0.1984, + "step": 548 + }, + { + "epoch": 0.52, + "grad_norm": 0.7407359480857849, + "learning_rate": 4.915552599930345e-06, + "loss": 0.2794, + "step": 549 + }, + { + "epoch": 0.52, + "grad_norm": 0.7558469772338867, + "learning_rate": 4.900200409630771e-06, + "loss": 0.2329, + "step": 550 + }, + { + "epoch": 0.52, + "grad_norm": 0.7046175599098206, + "learning_rate": 4.884849160520311e-06, + "loss": 0.2033, + "step": 551 + }, + { + "epoch": 0.52, + "grad_norm": 0.762986421585083, + "learning_rate": 4.869498997373393e-06, + "loss": 0.2441, + "step": 552 + }, + { + "epoch": 0.52, + "grad_norm": 0.7733058333396912, + "learning_rate": 4.854150064954201e-06, + "loss": 0.2922, + "step": 553 + }, + { + "epoch": 0.52, + "grad_norm": 0.7784504294395447, + "learning_rate": 4.838802508015316e-06, + "loss": 0.2629, + "step": 554 + }, + { + "epoch": 0.53, + "grad_norm": 0.7874659895896912, + "learning_rate": 4.8234564712963445e-06, + "loss": 0.2723, + "step": 555 + }, + { + "epoch": 0.53, + "grad_norm": 0.6290090680122375, + "learning_rate": 4.808112099522558e-06, + "loss": 0.2316, + "step": 556 + }, + { + "epoch": 0.53, + "grad_norm": 0.921485424041748, + "learning_rate": 4.792769537403523e-06, + "loss": 0.3212, + "step": 557 + }, + { + "epoch": 0.53, + "grad_norm": 0.8012232184410095, + "learning_rate": 4.777428929631743e-06, + "loss": 0.3222, + "step": 558 + }, + { + "epoch": 0.53, + "grad_norm": 0.8099982738494873, + "learning_rate": 4.762090420881289e-06, + "loss": 0.3166, + "step": 559 + }, + { + "epoch": 0.53, + "grad_norm": 0.9401763677597046, + "learning_rate": 4.746754155806437e-06, + "loss": 0.3724, + "step": 560 + }, + { + "epoch": 0.53, + "grad_norm": 0.6871545910835266, + "learning_rate": 4.731420279040303e-06, + "loss": 0.2512, + "step": 561 + }, + { + "epoch": 0.53, + "grad_norm": 0.6168978810310364, + "learning_rate": 4.716088935193479e-06, + "loss": 0.1568, + "step": 562 + }, + { + "epoch": 0.53, + "grad_norm": 0.7992426753044128, + "learning_rate": 4.700760268852669e-06, + "loss": 0.2947, + "step": 563 + }, + { + "epoch": 0.53, + "grad_norm": 0.5769720673561096, + "learning_rate": 4.68543442457933e-06, + "loss": 0.1636, + "step": 564 + }, + { + "epoch": 0.54, + "grad_norm": 0.6425333023071289, + "learning_rate": 4.670111546908299e-06, + "loss": 0.2066, + "step": 565 + }, + { + "epoch": 0.54, + "grad_norm": 0.7271445989608765, + "learning_rate": 4.65479178034644e-06, + "loss": 0.1569, + "step": 566 + }, + { + "epoch": 0.54, + "grad_norm": 0.7024662494659424, + "learning_rate": 4.639475269371273e-06, + "loss": 0.2402, + "step": 567 + }, + { + "epoch": 0.54, + "grad_norm": 0.8755499124526978, + "learning_rate": 4.624162158429618e-06, + "loss": 0.2204, + "step": 568 + }, + { + "epoch": 0.54, + "grad_norm": 0.8764632344245911, + "learning_rate": 4.608852591936231e-06, + "loss": 0.3024, + "step": 569 + }, + { + "epoch": 0.54, + "grad_norm": 0.7827917337417603, + "learning_rate": 4.593546714272438e-06, + "loss": 0.2817, + "step": 570 + }, + { + "epoch": 0.54, + "grad_norm": 0.8511632084846497, + "learning_rate": 4.5782446697847775e-06, + "loss": 0.3221, + "step": 571 + }, + { + "epoch": 0.54, + "grad_norm": 0.6528637409210205, + "learning_rate": 4.562946602783637e-06, + "loss": 0.1777, + "step": 572 + }, + { + "epoch": 0.54, + "grad_norm": 0.8783658146858215, + "learning_rate": 4.547652657541897e-06, + "loss": 0.4047, + "step": 573 + }, + { + "epoch": 0.54, + "grad_norm": 0.7391313910484314, + "learning_rate": 4.532362978293564e-06, + "loss": 0.2846, + "step": 574 + }, + { + "epoch": 0.54, + "grad_norm": 0.5948435068130493, + "learning_rate": 4.517077709232411e-06, + "loss": 0.1937, + "step": 575 + }, + { + "epoch": 0.55, + "grad_norm": 0.7957102656364441, + "learning_rate": 4.5017969945106225e-06, + "loss": 0.3197, + "step": 576 + }, + { + "epoch": 0.55, + "grad_norm": 0.572628915309906, + "learning_rate": 4.486520978237431e-06, + "loss": 0.17, + "step": 577 + }, + { + "epoch": 0.55, + "grad_norm": 0.6679872274398804, + "learning_rate": 4.471249804477758e-06, + "loss": 0.1744, + "step": 578 + }, + { + "epoch": 0.55, + "grad_norm": 0.6934377551078796, + "learning_rate": 4.455983617250857e-06, + "loss": 0.2471, + "step": 579 + }, + { + "epoch": 0.55, + "grad_norm": 0.6016349792480469, + "learning_rate": 4.440722560528955e-06, + "loss": 0.1866, + "step": 580 + }, + { + "epoch": 0.55, + "grad_norm": 0.5812397003173828, + "learning_rate": 4.4254667782358925e-06, + "loss": 0.1788, + "step": 581 + }, + { + "epoch": 0.55, + "grad_norm": 0.7885786890983582, + "learning_rate": 4.410216414245771e-06, + "loss": 0.3005, + "step": 582 + }, + { + "epoch": 0.55, + "grad_norm": 0.7440980672836304, + "learning_rate": 4.394971612381591e-06, + "loss": 0.2512, + "step": 583 + }, + { + "epoch": 0.55, + "grad_norm": 0.6813129186630249, + "learning_rate": 4.379732516413897e-06, + "loss": 0.216, + "step": 584 + }, + { + "epoch": 0.55, + "grad_norm": 0.7254709005355835, + "learning_rate": 4.364499270059423e-06, + "loss": 0.249, + "step": 585 + }, + { + "epoch": 0.56, + "grad_norm": 0.7485551238059998, + "learning_rate": 4.34927201697974e-06, + "loss": 0.2576, + "step": 586 + }, + { + "epoch": 0.56, + "grad_norm": 0.6328112483024597, + "learning_rate": 4.334050900779893e-06, + "loss": 0.1902, + "step": 587 + }, + { + "epoch": 0.56, + "grad_norm": 0.8314217329025269, + "learning_rate": 4.318836065007052e-06, + "loss": 0.2998, + "step": 588 + }, + { + "epoch": 0.56, + "grad_norm": 0.6509906053543091, + "learning_rate": 4.303627653149159e-06, + "loss": 0.1919, + "step": 589 + }, + { + "epoch": 0.56, + "grad_norm": 0.7075145840644836, + "learning_rate": 4.2884258086335755e-06, + "loss": 0.2028, + "step": 590 + }, + { + "epoch": 0.56, + "grad_norm": 0.7177186608314514, + "learning_rate": 4.2732306748257226e-06, + "loss": 0.2312, + "step": 591 + }, + { + "epoch": 0.56, + "grad_norm": 0.6522433161735535, + "learning_rate": 4.258042395027738e-06, + "loss": 0.1844, + "step": 592 + }, + { + "epoch": 0.56, + "grad_norm": 0.7007840871810913, + "learning_rate": 4.2428611124771184e-06, + "loss": 0.2224, + "step": 593 + }, + { + "epoch": 0.56, + "grad_norm": 0.7517430782318115, + "learning_rate": 4.227686970345373e-06, + "loss": 0.267, + "step": 594 + }, + { + "epoch": 0.56, + "grad_norm": 0.6153454184532166, + "learning_rate": 4.21252011173667e-06, + "loss": 0.2004, + "step": 595 + }, + { + "epoch": 0.56, + "grad_norm": 0.7708460092544556, + "learning_rate": 4.197360679686489e-06, + "loss": 0.2246, + "step": 596 + }, + { + "epoch": 0.57, + "grad_norm": 0.7786543369293213, + "learning_rate": 4.182208817160269e-06, + "loss": 0.3394, + "step": 597 + }, + { + "epoch": 0.57, + "grad_norm": 0.6301801800727844, + "learning_rate": 4.1670646670520656e-06, + "loss": 0.1996, + "step": 598 + }, + { + "epoch": 0.57, + "grad_norm": 0.7560685276985168, + "learning_rate": 4.151928372183198e-06, + "loss": 0.2335, + "step": 599 + }, + { + "epoch": 0.57, + "grad_norm": 0.835745096206665, + "learning_rate": 4.136800075300906e-06, + "loss": 0.2274, + "step": 600 + }, + { + "epoch": 0.57, + "eval_loss": 0.23925696313381195, + "eval_runtime": 549.7058, + "eval_samples_per_second": 0.62, + "eval_steps_per_second": 0.156, + "step": 600 + }, + { + "epoch": 0.57, + "grad_norm": 0.8754226565361023, + "learning_rate": 4.121679919077001e-06, + "loss": 0.3683, + "step": 601 + }, + { + "epoch": 0.57, + "grad_norm": 0.6908857226371765, + "learning_rate": 4.10656804610652e-06, + "loss": 0.203, + "step": 602 + }, + { + "epoch": 0.57, + "grad_norm": 0.696713387966156, + "learning_rate": 4.091464598906385e-06, + "loss": 0.228, + "step": 603 + }, + { + "epoch": 0.57, + "grad_norm": 0.676516592502594, + "learning_rate": 4.076369719914055e-06, + "loss": 0.2243, + "step": 604 + }, + { + "epoch": 0.57, + "grad_norm": 0.8648337721824646, + "learning_rate": 4.061283551486185e-06, + "loss": 0.3647, + "step": 605 + }, + { + "epoch": 0.57, + "grad_norm": 0.8069350719451904, + "learning_rate": 4.04620623589728e-06, + "loss": 0.3533, + "step": 606 + }, + { + "epoch": 0.58, + "grad_norm": 0.8564375638961792, + "learning_rate": 4.03113791533836e-06, + "loss": 0.3385, + "step": 607 + }, + { + "epoch": 0.58, + "grad_norm": 0.7426589131355286, + "learning_rate": 4.016078731915608e-06, + "loss": 0.2596, + "step": 608 + }, + { + "epoch": 0.58, + "grad_norm": 0.7388270497322083, + "learning_rate": 4.001028827649046e-06, + "loss": 0.2639, + "step": 609 + }, + { + "epoch": 0.58, + "grad_norm": 0.6917795538902283, + "learning_rate": 3.9859883444711795e-06, + "loss": 0.2463, + "step": 610 + }, + { + "epoch": 0.58, + "grad_norm": 0.695612370967865, + "learning_rate": 3.970957424225666e-06, + "loss": 0.2193, + "step": 611 + }, + { + "epoch": 0.58, + "grad_norm": 0.6280950307846069, + "learning_rate": 3.955936208665979e-06, + "loss": 0.2037, + "step": 612 + }, + { + "epoch": 0.58, + "grad_norm": 0.744050920009613, + "learning_rate": 3.940924839454067e-06, + "loss": 0.2406, + "step": 613 + }, + { + "epoch": 0.58, + "grad_norm": 0.8406124711036682, + "learning_rate": 3.925923458159023e-06, + "loss": 0.3304, + "step": 614 + }, + { + "epoch": 0.58, + "grad_norm": 0.7352477312088013, + "learning_rate": 3.910932206255742e-06, + "loss": 0.2496, + "step": 615 + }, + { + "epoch": 0.58, + "grad_norm": 0.6688184142112732, + "learning_rate": 3.895951225123595e-06, + "loss": 0.182, + "step": 616 + }, + { + "epoch": 0.58, + "grad_norm": 0.7015490531921387, + "learning_rate": 3.880980656045087e-06, + "loss": 0.2518, + "step": 617 + }, + { + "epoch": 0.59, + "grad_norm": 0.7293862700462341, + "learning_rate": 3.866020640204533e-06, + "loss": 0.2363, + "step": 618 + }, + { + "epoch": 0.59, + "grad_norm": 0.6916101574897766, + "learning_rate": 3.851071318686721e-06, + "loss": 0.2221, + "step": 619 + }, + { + "epoch": 0.59, + "grad_norm": 0.6372233629226685, + "learning_rate": 3.836132832475583e-06, + "loss": 0.1852, + "step": 620 + }, + { + "epoch": 0.59, + "grad_norm": 0.899456262588501, + "learning_rate": 3.821205322452863e-06, + "loss": 0.2606, + "step": 621 + }, + { + "epoch": 0.59, + "grad_norm": 0.8286627531051636, + "learning_rate": 3.806288929396798e-06, + "loss": 0.3659, + "step": 622 + }, + { + "epoch": 0.59, + "grad_norm": 0.7856718301773071, + "learning_rate": 3.7913837939807763e-06, + "loss": 0.2654, + "step": 623 + }, + { + "epoch": 0.59, + "grad_norm": 0.647342324256897, + "learning_rate": 3.77649005677202e-06, + "loss": 0.1858, + "step": 624 + }, + { + "epoch": 0.59, + "grad_norm": 0.8734769225120544, + "learning_rate": 3.7616078582302575e-06, + "loss": 0.35, + "step": 625 + }, + { + "epoch": 0.59, + "grad_norm": 0.7445953488349915, + "learning_rate": 3.7467373387063973e-06, + "loss": 0.2204, + "step": 626 + }, + { + "epoch": 0.59, + "grad_norm": 0.7629478573799133, + "learning_rate": 3.7318786384412076e-06, + "loss": 0.3257, + "step": 627 + }, + { + "epoch": 0.6, + "grad_norm": 0.764764130115509, + "learning_rate": 3.7170318975639902e-06, + "loss": 0.2477, + "step": 628 + }, + { + "epoch": 0.6, + "grad_norm": 0.6604840159416199, + "learning_rate": 3.70219725609126e-06, + "loss": 0.2454, + "step": 629 + }, + { + "epoch": 0.6, + "grad_norm": 0.714688241481781, + "learning_rate": 3.687374853925425e-06, + "loss": 0.2072, + "step": 630 + }, + { + "epoch": 0.6, + "grad_norm": 0.7985005974769592, + "learning_rate": 3.67256483085347e-06, + "loss": 0.2882, + "step": 631 + }, + { + "epoch": 0.6, + "grad_norm": 0.7973790764808655, + "learning_rate": 3.6577673265456296e-06, + "loss": 0.2818, + "step": 632 + }, + { + "epoch": 0.6, + "grad_norm": 0.829279899597168, + "learning_rate": 3.6429824805540816e-06, + "loss": 0.3016, + "step": 633 + }, + { + "epoch": 0.6, + "grad_norm": 0.6215798258781433, + "learning_rate": 3.628210432311621e-06, + "loss": 0.1855, + "step": 634 + }, + { + "epoch": 0.6, + "grad_norm": 0.7925015091896057, + "learning_rate": 3.6134513211303555e-06, + "loss": 0.3206, + "step": 635 + }, + { + "epoch": 0.6, + "grad_norm": 0.7273108959197998, + "learning_rate": 3.5987052862003824e-06, + "loss": 0.2378, + "step": 636 + }, + { + "epoch": 0.6, + "grad_norm": 0.7947001457214355, + "learning_rate": 3.58397246658848e-06, + "loss": 0.3151, + "step": 637 + }, + { + "epoch": 0.6, + "grad_norm": 1.0714225769042969, + "learning_rate": 3.569253001236795e-06, + "loss": 0.2715, + "step": 638 + }, + { + "epoch": 0.61, + "grad_norm": 0.784624457359314, + "learning_rate": 3.554547028961537e-06, + "loss": 0.2564, + "step": 639 + }, + { + "epoch": 0.61, + "grad_norm": 0.6738251447677612, + "learning_rate": 3.5398546884516606e-06, + "loss": 0.1875, + "step": 640 + }, + { + "epoch": 0.61, + "grad_norm": 0.7179457545280457, + "learning_rate": 3.5251761182675626e-06, + "loss": 0.2904, + "step": 641 + }, + { + "epoch": 0.61, + "grad_norm": 0.740993082523346, + "learning_rate": 3.510511456839777e-06, + "loss": 0.2648, + "step": 642 + }, + { + "epoch": 0.61, + "grad_norm": 0.7999202013015747, + "learning_rate": 3.495860842467664e-06, + "loss": 0.2863, + "step": 643 + }, + { + "epoch": 0.61, + "grad_norm": 0.6961380839347839, + "learning_rate": 3.481224413318114e-06, + "loss": 0.2768, + "step": 644 + }, + { + "epoch": 0.61, + "grad_norm": 0.7229123115539551, + "learning_rate": 3.4666023074242356e-06, + "loss": 0.298, + "step": 645 + }, + { + "epoch": 0.61, + "grad_norm": 0.8005563020706177, + "learning_rate": 3.451994662684057e-06, + "loss": 0.3037, + "step": 646 + }, + { + "epoch": 0.61, + "grad_norm": 0.7264557480812073, + "learning_rate": 3.4374016168592296e-06, + "loss": 0.2452, + "step": 647 + }, + { + "epoch": 0.61, + "grad_norm": 0.8246781826019287, + "learning_rate": 3.4228233075737225e-06, + "loss": 0.2857, + "step": 648 + }, + { + "epoch": 0.61, + "grad_norm": 0.7802700996398926, + "learning_rate": 3.4082598723125303e-06, + "loss": 0.3002, + "step": 649 + }, + { + "epoch": 0.62, + "grad_norm": 0.6945613622665405, + "learning_rate": 3.393711448420372e-06, + "loss": 0.2183, + "step": 650 + }, + { + "epoch": 0.62, + "grad_norm": 0.7643357515335083, + "learning_rate": 3.379178173100396e-06, + "loss": 0.2716, + "step": 651 + }, + { + "epoch": 0.62, + "grad_norm": 0.6494430899620056, + "learning_rate": 3.3646601834128924e-06, + "loss": 0.174, + "step": 652 + }, + { + "epoch": 0.62, + "grad_norm": 0.7811049818992615, + "learning_rate": 3.3501576162739903e-06, + "loss": 0.2065, + "step": 653 + }, + { + "epoch": 0.62, + "grad_norm": 0.7266541123390198, + "learning_rate": 3.3356706084543766e-06, + "loss": 0.2717, + "step": 654 + }, + { + "epoch": 0.62, + "grad_norm": 0.5529820322990417, + "learning_rate": 3.3211992965779984e-06, + "loss": 0.1486, + "step": 655 + }, + { + "epoch": 0.62, + "grad_norm": 0.8447984457015991, + "learning_rate": 3.306743817120777e-06, + "loss": 0.2957, + "step": 656 + }, + { + "epoch": 0.62, + "grad_norm": 0.6964268684387207, + "learning_rate": 3.2923043064093252e-06, + "loss": 0.2753, + "step": 657 + }, + { + "epoch": 0.62, + "grad_norm": 0.7075220942497253, + "learning_rate": 3.2778809006196564e-06, + "loss": 0.2483, + "step": 658 + }, + { + "epoch": 0.62, + "grad_norm": 0.6105390787124634, + "learning_rate": 3.2634737357758994e-06, + "loss": 0.1621, + "step": 659 + }, + { + "epoch": 0.63, + "grad_norm": 0.6852737665176392, + "learning_rate": 3.2490829477490194e-06, + "loss": 0.2133, + "step": 660 + }, + { + "epoch": 0.63, + "grad_norm": 0.6731998920440674, + "learning_rate": 3.2347086722555382e-06, + "loss": 0.2016, + "step": 661 + }, + { + "epoch": 0.63, + "grad_norm": 0.6744810342788696, + "learning_rate": 3.220351044856247e-06, + "loss": 0.2303, + "step": 662 + }, + { + "epoch": 0.63, + "grad_norm": 0.6960051655769348, + "learning_rate": 3.206010200954935e-06, + "loss": 0.2265, + "step": 663 + }, + { + "epoch": 0.63, + "grad_norm": 0.7880598902702332, + "learning_rate": 3.191686275797107e-06, + "loss": 0.3208, + "step": 664 + }, + { + "epoch": 0.63, + "grad_norm": 0.7482811212539673, + "learning_rate": 3.177379404468715e-06, + "loss": 0.2257, + "step": 665 + }, + { + "epoch": 0.63, + "grad_norm": 0.5179324746131897, + "learning_rate": 3.1630897218948765e-06, + "loss": 0.1486, + "step": 666 + }, + { + "epoch": 0.63, + "grad_norm": 0.7205661535263062, + "learning_rate": 3.1488173628386066e-06, + "loss": 0.2822, + "step": 667 + }, + { + "epoch": 0.63, + "grad_norm": 0.7530344724655151, + "learning_rate": 3.1345624618995444e-06, + "loss": 0.2819, + "step": 668 + }, + { + "epoch": 0.63, + "grad_norm": 0.7841753959655762, + "learning_rate": 3.1203251535126867e-06, + "loss": 0.2162, + "step": 669 + }, + { + "epoch": 0.63, + "grad_norm": 0.7488176822662354, + "learning_rate": 3.10610557194712e-06, + "loss": 0.2414, + "step": 670 + }, + { + "epoch": 0.64, + "grad_norm": 0.8237447142601013, + "learning_rate": 3.0919038513047507e-06, + "loss": 0.3725, + "step": 671 + }, + { + "epoch": 0.64, + "grad_norm": 0.6755101084709167, + "learning_rate": 3.077720125519042e-06, + "loss": 0.2308, + "step": 672 + }, + { + "epoch": 0.64, + "grad_norm": 0.8368008732795715, + "learning_rate": 3.0635545283537523e-06, + "loss": 0.3018, + "step": 673 + }, + { + "epoch": 0.64, + "grad_norm": 0.6606637835502625, + "learning_rate": 3.0494071934016737e-06, + "loss": 0.2167, + "step": 674 + }, + { + "epoch": 0.64, + "grad_norm": 0.7647143006324768, + "learning_rate": 3.03527825408337e-06, + "loss": 0.3043, + "step": 675 + }, + { + "epoch": 0.64, + "grad_norm": 0.6645731329917908, + "learning_rate": 3.0211678436459214e-06, + "loss": 0.2039, + "step": 676 + }, + { + "epoch": 0.64, + "grad_norm": 0.7168276309967041, + "learning_rate": 3.007076095161662e-06, + "loss": 0.2454, + "step": 677 + }, + { + "epoch": 0.64, + "grad_norm": 0.7966533899307251, + "learning_rate": 2.9930031415269327e-06, + "loss": 0.2565, + "step": 678 + }, + { + "epoch": 0.64, + "grad_norm": 0.594175398349762, + "learning_rate": 2.978949115460824e-06, + "loss": 0.152, + "step": 679 + }, + { + "epoch": 0.64, + "grad_norm": 0.7202569842338562, + "learning_rate": 2.9649141495039225e-06, + "loss": 0.2299, + "step": 680 + }, + { + "epoch": 0.65, + "grad_norm": 0.818361222743988, + "learning_rate": 2.950898376017064e-06, + "loss": 0.2468, + "step": 681 + }, + { + "epoch": 0.65, + "grad_norm": 0.6550590395927429, + "learning_rate": 2.9369019271800827e-06, + "loss": 0.1583, + "step": 682 + }, + { + "epoch": 0.65, + "grad_norm": 0.6880379915237427, + "learning_rate": 2.9229249349905686e-06, + "loss": 0.2737, + "step": 683 + }, + { + "epoch": 0.65, + "grad_norm": 0.6679242253303528, + "learning_rate": 2.908967531262618e-06, + "loss": 0.1807, + "step": 684 + }, + { + "epoch": 0.65, + "grad_norm": 0.7928236126899719, + "learning_rate": 2.895029847625595e-06, + "loss": 0.2728, + "step": 685 + }, + { + "epoch": 0.65, + "grad_norm": 0.9944189190864563, + "learning_rate": 2.8811120155228843e-06, + "loss": 0.3776, + "step": 686 + }, + { + "epoch": 0.65, + "grad_norm": 0.7912566661834717, + "learning_rate": 2.8672141662106577e-06, + "loss": 0.2994, + "step": 687 + }, + { + "epoch": 0.65, + "grad_norm": 0.8261896371841431, + "learning_rate": 2.8533364307566313e-06, + "loss": 0.271, + "step": 688 + }, + { + "epoch": 0.65, + "grad_norm": 0.8267972469329834, + "learning_rate": 2.839478940038833e-06, + "loss": 0.3058, + "step": 689 + }, + { + "epoch": 0.65, + "grad_norm": 0.7223398089408875, + "learning_rate": 2.8256418247443664e-06, + "loss": 0.256, + "step": 690 + }, + { + "epoch": 0.65, + "grad_norm": 0.6547335386276245, + "learning_rate": 2.811825215368179e-06, + "loss": 0.2153, + "step": 691 + }, + { + "epoch": 0.66, + "grad_norm": 0.8268930912017822, + "learning_rate": 2.7980292422118282e-06, + "loss": 0.3194, + "step": 692 + }, + { + "epoch": 0.66, + "grad_norm": 0.6824688911437988, + "learning_rate": 2.7842540353822634e-06, + "loss": 0.2299, + "step": 693 + }, + { + "epoch": 0.66, + "grad_norm": 0.7297709584236145, + "learning_rate": 2.770499724790584e-06, + "loss": 0.2297, + "step": 694 + }, + { + "epoch": 0.66, + "grad_norm": 0.7365825772285461, + "learning_rate": 2.7567664401508225e-06, + "loss": 0.2711, + "step": 695 + }, + { + "epoch": 0.66, + "grad_norm": 0.870194673538208, + "learning_rate": 2.743054310978722e-06, + "loss": 0.3355, + "step": 696 + }, + { + "epoch": 0.66, + "grad_norm": 0.6595069766044617, + "learning_rate": 2.729363466590511e-06, + "loss": 0.1812, + "step": 697 + }, + { + "epoch": 0.66, + "grad_norm": 0.7013744711875916, + "learning_rate": 2.7156940361016864e-06, + "loss": 0.238, + "step": 698 + }, + { + "epoch": 0.66, + "grad_norm": 0.745682954788208, + "learning_rate": 2.7020461484257952e-06, + "loss": 0.2412, + "step": 699 + }, + { + "epoch": 0.66, + "grad_norm": 0.820225179195404, + "learning_rate": 2.6884199322732192e-06, + "loss": 0.3275, + "step": 700 + }, + { + "epoch": 0.66, + "eval_loss": 0.23962344229221344, + "eval_runtime": 340.0624, + "eval_samples_per_second": 1.003, + "eval_steps_per_second": 0.253, + "step": 700 + }, + { + "epoch": 0.66, + "grad_norm": 0.8120321035385132, + "learning_rate": 2.6748155161499568e-06, + "loss": 0.2871, + "step": 701 + }, + { + "epoch": 0.67, + "grad_norm": 0.643677294254303, + "learning_rate": 2.6612330283564226e-06, + "loss": 0.1843, + "step": 702 + }, + { + "epoch": 0.67, + "grad_norm": 0.7032471895217896, + "learning_rate": 2.6476725969862227e-06, + "loss": 0.1753, + "step": 703 + }, + { + "epoch": 0.67, + "grad_norm": 0.6656165719032288, + "learning_rate": 2.634134349924956e-06, + "loss": 0.1874, + "step": 704 + }, + { + "epoch": 0.67, + "grad_norm": 0.6445679068565369, + "learning_rate": 2.6206184148490066e-06, + "loss": 0.2454, + "step": 705 + }, + { + "epoch": 0.67, + "grad_norm": 0.7996844053268433, + "learning_rate": 2.6071249192243365e-06, + "loss": 0.279, + "step": 706 + }, + { + "epoch": 0.67, + "grad_norm": 0.7802110910415649, + "learning_rate": 2.5936539903052893e-06, + "loss": 0.318, + "step": 707 + }, + { + "epoch": 0.67, + "grad_norm": 0.8619838356971741, + "learning_rate": 2.580205755133384e-06, + "loss": 0.3339, + "step": 708 + }, + { + "epoch": 0.67, + "grad_norm": 0.7842369079589844, + "learning_rate": 2.5667803405361214e-06, + "loss": 0.3304, + "step": 709 + }, + { + "epoch": 0.67, + "grad_norm": 0.8101242184638977, + "learning_rate": 2.5533778731257824e-06, + "loss": 0.3354, + "step": 710 + }, + { + "epoch": 0.67, + "grad_norm": 0.7099646329879761, + "learning_rate": 2.5399984792982457e-06, + "loss": 0.2381, + "step": 711 + }, + { + "epoch": 0.67, + "grad_norm": 0.7360900640487671, + "learning_rate": 2.5266422852317796e-06, + "loss": 0.2642, + "step": 712 + }, + { + "epoch": 0.68, + "grad_norm": 0.7768506407737732, + "learning_rate": 2.513309416885865e-06, + "loss": 0.3074, + "step": 713 + }, + { + "epoch": 0.68, + "grad_norm": 0.8049039244651794, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2184, + "step": 714 + }, + { + "epoch": 0.68, + "grad_norm": 0.6561914086341858, + "learning_rate": 2.4867141600925214e-06, + "loss": 0.2148, + "step": 715 + }, + { + "epoch": 0.68, + "grad_norm": 0.8116360902786255, + "learning_rate": 2.4734520224594094e-06, + "loss": 0.3025, + "step": 716 + }, + { + "epoch": 0.68, + "grad_norm": 0.757860541343689, + "learning_rate": 2.4602137121731195e-06, + "loss": 0.2799, + "step": 717 + }, + { + "epoch": 0.68, + "grad_norm": 0.7722039222717285, + "learning_rate": 2.44699935408139e-06, + "loss": 0.2626, + "step": 718 + }, + { + "epoch": 0.68, + "grad_norm": 0.52679443359375, + "learning_rate": 2.4338090728060808e-06, + "loss": 0.1255, + "step": 719 + }, + { + "epoch": 0.68, + "grad_norm": 0.6381962299346924, + "learning_rate": 2.4206429927419795e-06, + "loss": 0.2139, + "step": 720 + }, + { + "epoch": 0.68, + "grad_norm": 0.7625266313552856, + "learning_rate": 2.40750123805564e-06, + "loss": 0.2465, + "step": 721 + }, + { + "epoch": 0.68, + "grad_norm": 0.8177068829536438, + "learning_rate": 2.3943839326842096e-06, + "loss": 0.2712, + "step": 722 + }, + { + "epoch": 0.69, + "grad_norm": 0.6748345494270325, + "learning_rate": 2.381291200334257e-06, + "loss": 0.2079, + "step": 723 + }, + { + "epoch": 0.69, + "grad_norm": 0.8563744425773621, + "learning_rate": 2.368223164480611e-06, + "loss": 0.3, + "step": 724 + }, + { + "epoch": 0.69, + "grad_norm": 0.7199714779853821, + "learning_rate": 2.3551799483651894e-06, + "loss": 0.2661, + "step": 725 + }, + { + "epoch": 0.69, + "grad_norm": 0.860863447189331, + "learning_rate": 2.342161674995843e-06, + "loss": 0.3407, + "step": 726 + }, + { + "epoch": 0.69, + "grad_norm": 0.7349135875701904, + "learning_rate": 2.3291684671451905e-06, + "loss": 0.2679, + "step": 727 + }, + { + "epoch": 0.69, + "grad_norm": 0.6539340615272522, + "learning_rate": 2.316200447349466e-06, + "loss": 0.2213, + "step": 728 + }, + { + "epoch": 0.69, + "grad_norm": 0.7105304002761841, + "learning_rate": 2.3032577379073577e-06, + "loss": 0.2413, + "step": 729 + }, + { + "epoch": 0.69, + "grad_norm": 0.623697817325592, + "learning_rate": 2.2903404608788582e-06, + "loss": 0.1846, + "step": 730 + }, + { + "epoch": 0.69, + "grad_norm": 0.7190108299255371, + "learning_rate": 2.2774487380841116e-06, + "loss": 0.26, + "step": 731 + }, + { + "epoch": 0.69, + "grad_norm": 0.7020860910415649, + "learning_rate": 2.2645826911022656e-06, + "loss": 0.2464, + "step": 732 + }, + { + "epoch": 0.69, + "grad_norm": 0.7273522615432739, + "learning_rate": 2.2517424412703256e-06, + "loss": 0.2234, + "step": 733 + }, + { + "epoch": 0.7, + "grad_norm": 0.6023182272911072, + "learning_rate": 2.2389281096820077e-06, + "loss": 0.2128, + "step": 734 + }, + { + "epoch": 0.7, + "grad_norm": 0.6555721163749695, + "learning_rate": 2.2261398171865976e-06, + "loss": 0.2629, + "step": 735 + }, + { + "epoch": 0.7, + "grad_norm": 0.6597947478294373, + "learning_rate": 2.2133776843878185e-06, + "loss": 0.1795, + "step": 736 + }, + { + "epoch": 0.7, + "grad_norm": 0.7203525304794312, + "learning_rate": 2.2006418316426773e-06, + "loss": 0.2305, + "step": 737 + }, + { + "epoch": 0.7, + "grad_norm": 0.6759414076805115, + "learning_rate": 2.187932379060348e-06, + "loss": 0.2382, + "step": 738 + }, + { + "epoch": 0.7, + "grad_norm": 0.7028172612190247, + "learning_rate": 2.175249446501024e-06, + "loss": 0.1931, + "step": 739 + }, + { + "epoch": 0.7, + "grad_norm": 0.7945227026939392, + "learning_rate": 2.1625931535747964e-06, + "loss": 0.277, + "step": 740 + }, + { + "epoch": 0.7, + "grad_norm": 0.7901543378829956, + "learning_rate": 2.1499636196405225e-06, + "loss": 0.2847, + "step": 741 + }, + { + "epoch": 0.7, + "grad_norm": 0.7968934178352356, + "learning_rate": 2.1373609638047033e-06, + "loss": 0.2271, + "step": 742 + }, + { + "epoch": 0.7, + "grad_norm": 0.7947246432304382, + "learning_rate": 2.1247853049203543e-06, + "loss": 0.2448, + "step": 743 + }, + { + "epoch": 0.7, + "grad_norm": 0.7870792150497437, + "learning_rate": 2.112236761585892e-06, + "loss": 0.2432, + "step": 744 + }, + { + "epoch": 0.71, + "grad_norm": 0.7615368962287903, + "learning_rate": 2.09971545214401e-06, + "loss": 0.265, + "step": 745 + }, + { + "epoch": 0.71, + "grad_norm": 0.8390629887580872, + "learning_rate": 2.087221494680563e-06, + "loss": 0.3023, + "step": 746 + }, + { + "epoch": 0.71, + "grad_norm": 0.7150559425354004, + "learning_rate": 2.074755007023461e-06, + "loss": 0.2097, + "step": 747 + }, + { + "epoch": 0.71, + "grad_norm": 0.5334554314613342, + "learning_rate": 2.0623161067415463e-06, + "loss": 0.1123, + "step": 748 + }, + { + "epoch": 0.71, + "grad_norm": 0.7979438304901123, + "learning_rate": 2.0499049111434922e-06, + "loss": 0.2728, + "step": 749 + }, + { + "epoch": 0.71, + "grad_norm": 0.7618849873542786, + "learning_rate": 2.0375215372766944e-06, + "loss": 0.2587, + "step": 750 + }, + { + "epoch": 0.71, + "grad_norm": 0.9019395112991333, + "learning_rate": 2.025166101926168e-06, + "loss": 0.3588, + "step": 751 + }, + { + "epoch": 0.71, + "grad_norm": 0.5421817302703857, + "learning_rate": 2.012838721613447e-06, + "loss": 0.1394, + "step": 752 + }, + { + "epoch": 0.71, + "grad_norm": 0.7142716646194458, + "learning_rate": 2.0005395125954814e-06, + "loss": 0.3148, + "step": 753 + }, + { + "epoch": 0.71, + "grad_norm": 0.6308157444000244, + "learning_rate": 1.988268590863546e-06, + "loss": 0.2071, + "step": 754 + }, + { + "epoch": 0.72, + "grad_norm": 0.7796697020530701, + "learning_rate": 1.9760260721421426e-06, + "loss": 0.2666, + "step": 755 + }, + { + "epoch": 0.72, + "grad_norm": 0.7846043109893799, + "learning_rate": 1.9638120718879133e-06, + "loss": 0.2956, + "step": 756 + }, + { + "epoch": 0.72, + "grad_norm": 0.713147759437561, + "learning_rate": 1.951626705288544e-06, + "loss": 0.2859, + "step": 757 + }, + { + "epoch": 0.72, + "grad_norm": 0.646619975566864, + "learning_rate": 1.9394700872616856e-06, + "loss": 0.1585, + "step": 758 + }, + { + "epoch": 0.72, + "grad_norm": 0.8083282113075256, + "learning_rate": 1.927342332453866e-06, + "loss": 0.3169, + "step": 759 + }, + { + "epoch": 0.72, + "grad_norm": 0.6538665890693665, + "learning_rate": 1.9152435552394105e-06, + "loss": 0.2333, + "step": 760 + }, + { + "epoch": 0.72, + "grad_norm": 0.6046538949012756, + "learning_rate": 1.9031738697193618e-06, + "loss": 0.1602, + "step": 761 + }, + { + "epoch": 0.72, + "grad_norm": 0.6781079769134521, + "learning_rate": 1.8911333897204071e-06, + "loss": 0.2269, + "step": 762 + }, + { + "epoch": 0.72, + "grad_norm": 0.7626420259475708, + "learning_rate": 1.8791222287937983e-06, + "loss": 0.2841, + "step": 763 + }, + { + "epoch": 0.72, + "grad_norm": 0.7151882648468018, + "learning_rate": 1.8671405002142918e-06, + "loss": 0.1745, + "step": 764 + }, + { + "epoch": 0.72, + "grad_norm": 0.7661588191986084, + "learning_rate": 1.855188316979068e-06, + "loss": 0.2579, + "step": 765 + }, + { + "epoch": 0.73, + "grad_norm": 0.7726594805717468, + "learning_rate": 1.8432657918066732e-06, + "loss": 0.2371, + "step": 766 + }, + { + "epoch": 0.73, + "grad_norm": 0.79924476146698, + "learning_rate": 1.831373037135955e-06, + "loss": 0.3382, + "step": 767 + }, + { + "epoch": 0.73, + "grad_norm": 0.5692100524902344, + "learning_rate": 1.819510165125002e-06, + "loss": 0.1561, + "step": 768 + }, + { + "epoch": 0.73, + "grad_norm": 0.9645766615867615, + "learning_rate": 1.8076772876500831e-06, + "loss": 0.2402, + "step": 769 + }, + { + "epoch": 0.73, + "grad_norm": 0.6940447688102722, + "learning_rate": 1.7958745163045987e-06, + "loss": 0.261, + "step": 770 + }, + { + "epoch": 0.73, + "grad_norm": 0.7400657534599304, + "learning_rate": 1.7841019623980215e-06, + "loss": 0.2289, + "step": 771 + }, + { + "epoch": 0.73, + "grad_norm": 0.6116971969604492, + "learning_rate": 1.77235973695485e-06, + "loss": 0.168, + "step": 772 + }, + { + "epoch": 0.73, + "grad_norm": 0.7953826189041138, + "learning_rate": 1.760647950713566e-06, + "loss": 0.2794, + "step": 773 + }, + { + "epoch": 0.73, + "grad_norm": 0.6571024656295776, + "learning_rate": 1.7489667141255801e-06, + "loss": 0.2146, + "step": 774 + }, + { + "epoch": 0.73, + "grad_norm": 0.8735598921775818, + "learning_rate": 1.7373161373541968e-06, + "loss": 0.3331, + "step": 775 + }, + { + "epoch": 0.74, + "grad_norm": 0.692879319190979, + "learning_rate": 1.7256963302735752e-06, + "loss": 0.2537, + "step": 776 + }, + { + "epoch": 0.74, + "grad_norm": 0.7424703240394592, + "learning_rate": 1.7141074024676913e-06, + "loss": 0.2342, + "step": 777 + }, + { + "epoch": 0.74, + "grad_norm": 0.6320103406906128, + "learning_rate": 1.702549463229305e-06, + "loss": 0.1437, + "step": 778 + }, + { + "epoch": 0.74, + "grad_norm": 0.7434752583503723, + "learning_rate": 1.6910226215589303e-06, + "loss": 0.2942, + "step": 779 + }, + { + "epoch": 0.74, + "grad_norm": 0.7751021385192871, + "learning_rate": 1.6795269861638041e-06, + "loss": 0.2776, + "step": 780 + }, + { + "epoch": 0.74, + "grad_norm": 0.674355149269104, + "learning_rate": 1.6680626654568688e-06, + "loss": 0.1999, + "step": 781 + }, + { + "epoch": 0.74, + "grad_norm": 0.8401548862457275, + "learning_rate": 1.6566297675557392e-06, + "loss": 0.3487, + "step": 782 + }, + { + "epoch": 0.74, + "grad_norm": 0.8457766771316528, + "learning_rate": 1.6452284002816893e-06, + "loss": 0.2785, + "step": 783 + }, + { + "epoch": 0.74, + "grad_norm": 0.7536783814430237, + "learning_rate": 1.6338586711586358e-06, + "loss": 0.2188, + "step": 784 + }, + { + "epoch": 0.74, + "grad_norm": 0.7402835488319397, + "learning_rate": 1.6225206874121219e-06, + "loss": 0.2805, + "step": 785 + }, + { + "epoch": 0.74, + "grad_norm": 0.7845516800880432, + "learning_rate": 1.6112145559683057e-06, + "loss": 0.2494, + "step": 786 + }, + { + "epoch": 0.75, + "grad_norm": 0.8195182085037231, + "learning_rate": 1.5999403834529549e-06, + "loss": 0.3057, + "step": 787 + }, + { + "epoch": 0.75, + "grad_norm": 0.6405972242355347, + "learning_rate": 1.588698276190438e-06, + "loss": 0.2197, + "step": 788 + }, + { + "epoch": 0.75, + "grad_norm": 0.7488459348678589, + "learning_rate": 1.5774883402027208e-06, + "loss": 0.2605, + "step": 789 + }, + { + "epoch": 0.75, + "grad_norm": 0.8009891510009766, + "learning_rate": 1.5663106812083746e-06, + "loss": 0.2756, + "step": 790 + }, + { + "epoch": 0.75, + "grad_norm": 0.6462413668632507, + "learning_rate": 1.555165404621567e-06, + "loss": 0.2111, + "step": 791 + }, + { + "epoch": 0.75, + "grad_norm": 0.7553801536560059, + "learning_rate": 1.5440526155510766e-06, + "loss": 0.2879, + "step": 792 + }, + { + "epoch": 0.75, + "grad_norm": 0.8148736357688904, + "learning_rate": 1.5329724187992983e-06, + "loss": 0.3281, + "step": 793 + }, + { + "epoch": 0.75, + "grad_norm": 0.7564266324043274, + "learning_rate": 1.5219249188612556e-06, + "loss": 0.2453, + "step": 794 + }, + { + "epoch": 0.75, + "grad_norm": 0.7308322787284851, + "learning_rate": 1.5109102199236152e-06, + "loss": 0.2592, + "step": 795 + }, + { + "epoch": 0.75, + "grad_norm": 0.8379185199737549, + "learning_rate": 1.4999284258637054e-06, + "loss": 0.2998, + "step": 796 + }, + { + "epoch": 0.76, + "grad_norm": 0.7561792135238647, + "learning_rate": 1.488979640248534e-06, + "loss": 0.2468, + "step": 797 + }, + { + "epoch": 0.76, + "grad_norm": 0.6872095465660095, + "learning_rate": 1.4780639663338125e-06, + "loss": 0.217, + "step": 798 + }, + { + "epoch": 0.76, + "grad_norm": 0.5847635865211487, + "learning_rate": 1.467181507062987e-06, + "loss": 0.1827, + "step": 799 + }, + { + "epoch": 0.76, + "grad_norm": 0.6254853010177612, + "learning_rate": 1.4563323650662586e-06, + "loss": 0.1564, + "step": 800 + }, + { + "epoch": 0.76, + "eval_loss": 0.23893068730831146, + "eval_runtime": 334.7721, + "eval_samples_per_second": 1.019, + "eval_steps_per_second": 0.257, + "step": 800 + }, + { + "epoch": 0.76, + "grad_norm": 0.6680090427398682, + "learning_rate": 1.4455166426596222e-06, + "loss": 0.1686, + "step": 801 + }, + { + "epoch": 0.76, + "grad_norm": 0.9229381084442139, + "learning_rate": 1.434734441843899e-06, + "loss": 0.4003, + "step": 802 + }, + { + "epoch": 0.76, + "grad_norm": 0.6827018857002258, + "learning_rate": 1.4239858643037753e-06, + "loss": 0.2393, + "step": 803 + }, + { + "epoch": 0.76, + "grad_norm": 0.6769413352012634, + "learning_rate": 1.4132710114068427e-06, + "loss": 0.2169, + "step": 804 + }, + { + "epoch": 0.76, + "grad_norm": 0.6936076283454895, + "learning_rate": 1.4025899842026442e-06, + "loss": 0.1547, + "step": 805 + }, + { + "epoch": 0.76, + "grad_norm": 0.6686022281646729, + "learning_rate": 1.3919428834217163e-06, + "loss": 0.198, + "step": 806 + }, + { + "epoch": 0.76, + "grad_norm": 0.6601407527923584, + "learning_rate": 1.3813298094746491e-06, + "loss": 0.1894, + "step": 807 + }, + { + "epoch": 0.77, + "grad_norm": 0.6349535584449768, + "learning_rate": 1.3707508624511263e-06, + "loss": 0.18, + "step": 808 + }, + { + "epoch": 0.77, + "grad_norm": 0.7586642503738403, + "learning_rate": 1.3602061421189899e-06, + "loss": 0.2731, + "step": 809 + }, + { + "epoch": 0.77, + "grad_norm": 0.7111486196517944, + "learning_rate": 1.349695747923298e-06, + "loss": 0.2387, + "step": 810 + }, + { + "epoch": 0.77, + "grad_norm": 0.7135524749755859, + "learning_rate": 1.339219778985385e-06, + "loss": 0.2516, + "step": 811 + }, + { + "epoch": 0.77, + "grad_norm": 0.7469358444213867, + "learning_rate": 1.3287783341019278e-06, + "loss": 0.2263, + "step": 812 + }, + { + "epoch": 0.77, + "grad_norm": 0.8230369091033936, + "learning_rate": 1.3183715117440143e-06, + "loss": 0.2903, + "step": 813 + }, + { + "epoch": 0.77, + "grad_norm": 0.83302241563797, + "learning_rate": 1.307999410056216e-06, + "loss": 0.2903, + "step": 814 + }, + { + "epoch": 0.77, + "grad_norm": 0.7573346495628357, + "learning_rate": 1.2976621268556571e-06, + "loss": 0.2714, + "step": 815 + }, + { + "epoch": 0.77, + "grad_norm": 0.649545431137085, + "learning_rate": 1.2873597596311026e-06, + "loss": 0.2145, + "step": 816 + }, + { + "epoch": 0.77, + "grad_norm": 0.9165211915969849, + "learning_rate": 1.2770924055420258e-06, + "loss": 0.2811, + "step": 817 + }, + { + "epoch": 0.78, + "grad_norm": 0.717332124710083, + "learning_rate": 1.2668601614177017e-06, + "loss": 0.3172, + "step": 818 + }, + { + "epoch": 0.78, + "grad_norm": 0.7683383822441101, + "learning_rate": 1.2566631237562894e-06, + "loss": 0.207, + "step": 819 + }, + { + "epoch": 0.78, + "grad_norm": 0.6939813494682312, + "learning_rate": 1.246501388723923e-06, + "loss": 0.2164, + "step": 820 + }, + { + "epoch": 0.78, + "grad_norm": 0.7078372836112976, + "learning_rate": 1.2363750521538064e-06, + "loss": 0.2329, + "step": 821 + }, + { + "epoch": 0.78, + "grad_norm": 0.8150206208229065, + "learning_rate": 1.2262842095453065e-06, + "loss": 0.3304, + "step": 822 + }, + { + "epoch": 0.78, + "grad_norm": 0.6956215500831604, + "learning_rate": 1.2162289560630524e-06, + "loss": 0.2567, + "step": 823 + }, + { + "epoch": 0.78, + "grad_norm": 0.8311361074447632, + "learning_rate": 1.2062093865360458e-06, + "loss": 0.3153, + "step": 824 + }, + { + "epoch": 0.78, + "grad_norm": 0.6261569261550903, + "learning_rate": 1.1962255954567537e-06, + "loss": 0.1892, + "step": 825 + }, + { + "epoch": 0.78, + "grad_norm": 0.7640008926391602, + "learning_rate": 1.1862776769802275e-06, + "loss": 0.2267, + "step": 826 + }, + { + "epoch": 0.78, + "grad_norm": 0.7890690565109253, + "learning_rate": 1.1763657249232107e-06, + "loss": 0.2429, + "step": 827 + }, + { + "epoch": 0.78, + "grad_norm": 0.6995456218719482, + "learning_rate": 1.1664898327632552e-06, + "loss": 0.1979, + "step": 828 + }, + { + "epoch": 0.79, + "grad_norm": 0.8067217469215393, + "learning_rate": 1.1566500936378389e-06, + "loss": 0.2707, + "step": 829 + }, + { + "epoch": 0.79, + "grad_norm": 0.7893016934394836, + "learning_rate": 1.146846600343488e-06, + "loss": 0.3009, + "step": 830 + }, + { + "epoch": 0.79, + "grad_norm": 0.880754292011261, + "learning_rate": 1.1370794453349039e-06, + "loss": 0.3307, + "step": 831 + }, + { + "epoch": 0.79, + "grad_norm": 0.8914408087730408, + "learning_rate": 1.1273487207240845e-06, + "loss": 0.2634, + "step": 832 + }, + { + "epoch": 0.79, + "grad_norm": 0.5336819291114807, + "learning_rate": 1.1176545182794674e-06, + "loss": 0.1236, + "step": 833 + }, + { + "epoch": 0.79, + "grad_norm": 0.7172911167144775, + "learning_rate": 1.1079969294250515e-06, + "loss": 0.2377, + "step": 834 + }, + { + "epoch": 0.79, + "grad_norm": 0.8779481053352356, + "learning_rate": 1.0983760452395415e-06, + "loss": 0.2652, + "step": 835 + }, + { + "epoch": 0.79, + "grad_norm": 0.8408273458480835, + "learning_rate": 1.0887919564554893e-06, + "loss": 0.2963, + "step": 836 + }, + { + "epoch": 0.79, + "grad_norm": 0.8103682994842529, + "learning_rate": 1.079244753458437e-06, + "loss": 0.2976, + "step": 837 + }, + { + "epoch": 0.79, + "grad_norm": 0.7215389609336853, + "learning_rate": 1.0697345262860638e-06, + "loss": 0.3125, + "step": 838 + }, + { + "epoch": 0.79, + "grad_norm": 0.7416812181472778, + "learning_rate": 1.0602613646273374e-06, + "loss": 0.3058, + "step": 839 + }, + { + "epoch": 0.8, + "grad_norm": 0.8230550289154053, + "learning_rate": 1.0508253578216693e-06, + "loss": 0.3375, + "step": 840 + }, + { + "epoch": 0.8, + "grad_norm": 0.8321605324745178, + "learning_rate": 1.0414265948580694e-06, + "loss": 0.3159, + "step": 841 + }, + { + "epoch": 0.8, + "grad_norm": 0.7492278218269348, + "learning_rate": 1.0320651643743128e-06, + "loss": 0.2286, + "step": 842 + }, + { + "epoch": 0.8, + "grad_norm": 0.6947314143180847, + "learning_rate": 1.0227411546560962e-06, + "loss": 0.1957, + "step": 843 + }, + { + "epoch": 0.8, + "grad_norm": 0.7169224619865417, + "learning_rate": 1.0134546536362099e-06, + "loss": 0.2468, + "step": 844 + }, + { + "epoch": 0.8, + "grad_norm": 0.794872522354126, + "learning_rate": 1.0042057488937067e-06, + "loss": 0.2482, + "step": 845 + }, + { + "epoch": 0.8, + "grad_norm": 0.8405777812004089, + "learning_rate": 9.949945276530782e-07, + "loss": 0.3541, + "step": 846 + }, + { + "epoch": 0.8, + "grad_norm": 0.6393839716911316, + "learning_rate": 9.858210767834292e-07, + "loss": 0.2073, + "step": 847 + }, + { + "epoch": 0.8, + "grad_norm": 0.8626486659049988, + "learning_rate": 9.76685482797662e-07, + "loss": 0.2682, + "step": 848 + }, + { + "epoch": 0.8, + "grad_norm": 0.667521059513092, + "learning_rate": 9.675878318516546e-07, + "loss": 0.2127, + "step": 849 + }, + { + "epoch": 0.81, + "grad_norm": 0.9174708724021912, + "learning_rate": 9.58528209743459e-07, + "loss": 0.3151, + "step": 850 + }, + { + "epoch": 0.81, + "grad_norm": 0.6799070239067078, + "learning_rate": 9.495067019124793e-07, + "loss": 0.27, + "step": 851 + }, + { + "epoch": 0.81, + "grad_norm": 0.7855434417724609, + "learning_rate": 9.405233934386726e-07, + "loss": 0.2487, + "step": 852 + }, + { + "epoch": 0.81, + "grad_norm": 0.7704946398735046, + "learning_rate": 9.315783690417479e-07, + "loss": 0.2218, + "step": 853 + }, + { + "epoch": 0.81, + "grad_norm": 0.6957180500030518, + "learning_rate": 9.226717130803636e-07, + "loss": 0.2129, + "step": 854 + }, + { + "epoch": 0.81, + "grad_norm": 0.7627095580101013, + "learning_rate": 9.138035095513337e-07, + "loss": 0.2735, + "step": 855 + }, + { + "epoch": 0.81, + "grad_norm": 0.6550484895706177, + "learning_rate": 9.049738420888349e-07, + "loss": 0.229, + "step": 856 + }, + { + "epoch": 0.81, + "grad_norm": 0.7645315527915955, + "learning_rate": 8.961827939636198e-07, + "loss": 0.2348, + "step": 857 + }, + { + "epoch": 0.81, + "grad_norm": 0.5948243737220764, + "learning_rate": 8.874304480822271e-07, + "loss": 0.1143, + "step": 858 + }, + { + "epoch": 0.81, + "grad_norm": 0.6549612283706665, + "learning_rate": 8.787168869862067e-07, + "loss": 0.1665, + "step": 859 + }, + { + "epoch": 0.81, + "grad_norm": 0.7662985920906067, + "learning_rate": 8.700421928513353e-07, + "loss": 0.2988, + "step": 860 + }, + { + "epoch": 0.82, + "grad_norm": 0.738470196723938, + "learning_rate": 8.614064474868423e-07, + "loss": 0.2364, + "step": 861 + }, + { + "epoch": 0.82, + "grad_norm": 0.7509949803352356, + "learning_rate": 8.528097323346408e-07, + "loss": 0.2596, + "step": 862 + }, + { + "epoch": 0.82, + "grad_norm": 0.7045740485191345, + "learning_rate": 8.442521284685573e-07, + "loss": 0.1838, + "step": 863 + }, + { + "epoch": 0.82, + "grad_norm": 0.6679222583770752, + "learning_rate": 8.357337165935675e-07, + "loss": 0.215, + "step": 864 + }, + { + "epoch": 0.82, + "grad_norm": 0.7682536840438843, + "learning_rate": 8.27254577045039e-07, + "loss": 0.2566, + "step": 865 + }, + { + "epoch": 0.82, + "grad_norm": 0.7528423070907593, + "learning_rate": 8.188147897879667e-07, + "loss": 0.2169, + "step": 866 + }, + { + "epoch": 0.82, + "grad_norm": 0.7058762907981873, + "learning_rate": 8.104144344162229e-07, + "loss": 0.1921, + "step": 867 + }, + { + "epoch": 0.82, + "grad_norm": 0.7042251825332642, + "learning_rate": 8.02053590151805e-07, + "loss": 0.239, + "step": 868 + }, + { + "epoch": 0.82, + "grad_norm": 0.7134466767311096, + "learning_rate": 7.937323358440935e-07, + "loss": 0.2534, + "step": 869 + }, + { + "epoch": 0.82, + "grad_norm": 0.569417417049408, + "learning_rate": 7.854507499691006e-07, + "loss": 0.1404, + "step": 870 + }, + { + "epoch": 0.83, + "grad_norm": 0.731052577495575, + "learning_rate": 7.772089106287345e-07, + "loss": 0.2561, + "step": 871 + }, + { + "epoch": 0.83, + "grad_norm": 0.8122794032096863, + "learning_rate": 7.690068955500623e-07, + "loss": 0.3047, + "step": 872 + }, + { + "epoch": 0.83, + "grad_norm": 0.6678743362426758, + "learning_rate": 7.608447820845771e-07, + "loss": 0.1985, + "step": 873 + }, + { + "epoch": 0.83, + "grad_norm": 0.6880447864532471, + "learning_rate": 7.527226472074678e-07, + "loss": 0.2263, + "step": 874 + }, + { + "epoch": 0.83, + "grad_norm": 0.7170695662498474, + "learning_rate": 7.446405675168938e-07, + "loss": 0.2498, + "step": 875 + }, + { + "epoch": 0.83, + "grad_norm": 0.7539134621620178, + "learning_rate": 7.365986192332624e-07, + "loss": 0.1666, + "step": 876 + }, + { + "epoch": 0.83, + "grad_norm": 0.7251168489456177, + "learning_rate": 7.285968781985093e-07, + "loss": 0.2157, + "step": 877 + }, + { + "epoch": 0.83, + "grad_norm": 0.5736756324768066, + "learning_rate": 7.206354198753862e-07, + "loss": 0.1379, + "step": 878 + }, + { + "epoch": 0.83, + "grad_norm": 0.876884400844574, + "learning_rate": 7.127143193467445e-07, + "loss": 0.3385, + "step": 879 + }, + { + "epoch": 0.83, + "grad_norm": 0.7779631614685059, + "learning_rate": 7.048336513148307e-07, + "loss": 0.303, + "step": 880 + }, + { + "epoch": 0.83, + "grad_norm": 0.8376134037971497, + "learning_rate": 6.969934901005809e-07, + "loss": 0.3363, + "step": 881 + }, + { + "epoch": 0.84, + "grad_norm": 0.7117416262626648, + "learning_rate": 6.89193909642919e-07, + "loss": 0.2705, + "step": 882 + }, + { + "epoch": 0.84, + "grad_norm": 0.6771072149276733, + "learning_rate": 6.814349834980622e-07, + "loss": 0.2274, + "step": 883 + }, + { + "epoch": 0.84, + "grad_norm": 0.8444775342941284, + "learning_rate": 6.737167848388227e-07, + "loss": 0.3234, + "step": 884 + }, + { + "epoch": 0.84, + "grad_norm": 0.7159910798072815, + "learning_rate": 6.660393864539222e-07, + "loss": 0.2191, + "step": 885 + }, + { + "epoch": 0.84, + "grad_norm": 0.6470720171928406, + "learning_rate": 6.584028607473019e-07, + "loss": 0.1555, + "step": 886 + }, + { + "epoch": 0.84, + "grad_norm": 0.8595316410064697, + "learning_rate": 6.508072797374454e-07, + "loss": 0.305, + "step": 887 + }, + { + "epoch": 0.84, + "grad_norm": 0.7818759679794312, + "learning_rate": 6.432527150566903e-07, + "loss": 0.2768, + "step": 888 + }, + { + "epoch": 0.84, + "grad_norm": 0.7317049503326416, + "learning_rate": 6.3573923795056e-07, + "loss": 0.2567, + "step": 889 + }, + { + "epoch": 0.84, + "grad_norm": 0.7605761289596558, + "learning_rate": 6.282669192770896e-07, + "loss": 0.2877, + "step": 890 + }, + { + "epoch": 0.84, + "grad_norm": 0.7579327821731567, + "learning_rate": 6.208358295061572e-07, + "loss": 0.2585, + "step": 891 + }, + { + "epoch": 0.85, + "grad_norm": 0.6664072275161743, + "learning_rate": 6.134460387188207e-07, + "loss": 0.213, + "step": 892 + }, + { + "epoch": 0.85, + "grad_norm": 0.7787521481513977, + "learning_rate": 6.060976166066546e-07, + "loss": 0.2172, + "step": 893 + }, + { + "epoch": 0.85, + "grad_norm": 0.7663528919219971, + "learning_rate": 5.98790632471094e-07, + "loss": 0.2807, + "step": 894 + }, + { + "epoch": 0.85, + "grad_norm": 0.6902214884757996, + "learning_rate": 5.91525155222783e-07, + "loss": 0.2408, + "step": 895 + }, + { + "epoch": 0.85, + "grad_norm": 0.9152516722679138, + "learning_rate": 5.843012533809211e-07, + "loss": 0.3191, + "step": 896 + }, + { + "epoch": 0.85, + "grad_norm": 0.7439894676208496, + "learning_rate": 5.771189950726191e-07, + "loss": 0.2269, + "step": 897 + }, + { + "epoch": 0.85, + "grad_norm": 0.8530980348587036, + "learning_rate": 5.699784480322568e-07, + "loss": 0.3409, + "step": 898 + }, + { + "epoch": 0.85, + "grad_norm": 0.7167938351631165, + "learning_rate": 5.628796796008435e-07, + "loss": 0.2531, + "step": 899 + }, + { + "epoch": 0.85, + "grad_norm": 0.7698455452919006, + "learning_rate": 5.558227567253832e-07, + "loss": 0.2512, + "step": 900 + }, + { + "epoch": 0.85, + "eval_loss": 0.23878981173038483, + "eval_runtime": 335.0563, + "eval_samples_per_second": 1.018, + "eval_steps_per_second": 0.257, + "step": 900 + }, + { + "epoch": 0.85, + "grad_norm": 0.7810540795326233, + "learning_rate": 5.488077459582425e-07, + "loss": 0.2887, + "step": 901 + }, + { + "epoch": 0.85, + "grad_norm": 0.7419719696044922, + "learning_rate": 5.418347134565249e-07, + "loss": 0.2431, + "step": 902 + }, + { + "epoch": 0.86, + "grad_norm": 0.7212926149368286, + "learning_rate": 5.349037249814443e-07, + "loss": 0.2713, + "step": 903 + }, + { + "epoch": 0.86, + "grad_norm": 0.740754246711731, + "learning_rate": 5.28014845897708e-07, + "loss": 0.2437, + "step": 904 + }, + { + "epoch": 0.86, + "grad_norm": 0.7760072350502014, + "learning_rate": 5.211681411728969e-07, + "loss": 0.3061, + "step": 905 + }, + { + "epoch": 0.86, + "grad_norm": 0.7475609183311462, + "learning_rate": 5.14363675376855e-07, + "loss": 0.2418, + "step": 906 + }, + { + "epoch": 0.86, + "grad_norm": 0.6755508780479431, + "learning_rate": 5.076015126810784e-07, + "loss": 0.2594, + "step": 907 + }, + { + "epoch": 0.86, + "grad_norm": 0.7507731318473816, + "learning_rate": 5.008817168581137e-07, + "loss": 0.232, + "step": 908 + }, + { + "epoch": 0.86, + "grad_norm": 0.8247950077056885, + "learning_rate": 4.94204351280953e-07, + "loss": 0.275, + "step": 909 + }, + { + "epoch": 0.86, + "grad_norm": 0.7758000493049622, + "learning_rate": 4.875694789224372e-07, + "loss": 0.2721, + "step": 910 + }, + { + "epoch": 0.86, + "grad_norm": 0.7328912615776062, + "learning_rate": 4.809771623546627e-07, + "loss": 0.1906, + "step": 911 + }, + { + "epoch": 0.86, + "grad_norm": 0.6297558546066284, + "learning_rate": 4.7442746374839363e-07, + "loss": 0.1976, + "step": 912 + }, + { + "epoch": 0.87, + "grad_norm": 0.7924368977546692, + "learning_rate": 4.6792044487247003e-07, + "loss": 0.2643, + "step": 913 + }, + { + "epoch": 0.87, + "grad_norm": 0.6739974617958069, + "learning_rate": 4.614561670932288e-07, + "loss": 0.2081, + "step": 914 + }, + { + "epoch": 0.87, + "grad_norm": 0.733456552028656, + "learning_rate": 4.5503469137392565e-07, + "loss": 0.2125, + "step": 915 + }, + { + "epoch": 0.87, + "grad_norm": 0.7064236998558044, + "learning_rate": 4.486560782741578e-07, + "loss": 0.1929, + "step": 916 + }, + { + "epoch": 0.87, + "grad_norm": 0.6808232069015503, + "learning_rate": 4.423203879492943e-07, + "loss": 0.1682, + "step": 917 + }, + { + "epoch": 0.87, + "grad_norm": 0.9870191216468811, + "learning_rate": 4.360276801499086e-07, + "loss": 0.309, + "step": 918 + }, + { + "epoch": 0.87, + "grad_norm": 0.7222360968589783, + "learning_rate": 4.29778014221216e-07, + "loss": 0.2207, + "step": 919 + }, + { + "epoch": 0.87, + "grad_norm": 0.7426356673240662, + "learning_rate": 4.2357144910251003e-07, + "loss": 0.2496, + "step": 920 + }, + { + "epoch": 0.87, + "grad_norm": 0.7929947376251221, + "learning_rate": 4.1740804332661365e-07, + "loss": 0.2641, + "step": 921 + }, + { + "epoch": 0.87, + "grad_norm": 0.7465423345565796, + "learning_rate": 4.1128785501931947e-07, + "loss": 0.2528, + "step": 922 + }, + { + "epoch": 0.87, + "grad_norm": 0.6530787348747253, + "learning_rate": 4.05210941898847e-07, + "loss": 0.1902, + "step": 923 + }, + { + "epoch": 0.88, + "grad_norm": 0.5978170037269592, + "learning_rate": 3.9917736127529525e-07, + "loss": 0.1877, + "step": 924 + }, + { + "epoch": 0.88, + "grad_norm": 0.8100348114967346, + "learning_rate": 3.9318717005010496e-07, + "loss": 0.2846, + "step": 925 + }, + { + "epoch": 0.88, + "grad_norm": 0.7248678803443909, + "learning_rate": 3.8724042471551925e-07, + "loss": 0.2635, + "step": 926 + }, + { + "epoch": 0.88, + "grad_norm": 0.7893044352531433, + "learning_rate": 3.8133718135405283e-07, + "loss": 0.3017, + "step": 927 + }, + { + "epoch": 0.88, + "grad_norm": 0.6711196899414062, + "learning_rate": 3.7547749563796144e-07, + "loss": 0.2334, + "step": 928 + }, + { + "epoch": 0.88, + "grad_norm": 0.797402024269104, + "learning_rate": 3.6966142282871873e-07, + "loss": 0.2679, + "step": 929 + }, + { + "epoch": 0.88, + "grad_norm": 0.7578075528144836, + "learning_rate": 3.638890177764948e-07, + "loss": 0.2445, + "step": 930 + }, + { + "epoch": 0.88, + "grad_norm": 0.7537961602210999, + "learning_rate": 3.581603349196372e-07, + "loss": 0.2642, + "step": 931 + }, + { + "epoch": 0.88, + "grad_norm": 0.7909368872642517, + "learning_rate": 3.524754282841575e-07, + "loss": 0.339, + "step": 932 + }, + { + "epoch": 0.88, + "grad_norm": 0.6665784120559692, + "learning_rate": 3.468343514832251e-07, + "loss": 0.2178, + "step": 933 + }, + { + "epoch": 0.88, + "grad_norm": 0.672397255897522, + "learning_rate": 3.4123715771665786e-07, + "loss": 0.176, + "step": 934 + }, + { + "epoch": 0.89, + "grad_norm": 0.7691673040390015, + "learning_rate": 3.356838997704226e-07, + "loss": 0.3186, + "step": 935 + }, + { + "epoch": 0.89, + "grad_norm": 0.7448713183403015, + "learning_rate": 3.3017463001613625e-07, + "loss": 0.2278, + "step": 936 + }, + { + "epoch": 0.89, + "grad_norm": 0.6707292795181274, + "learning_rate": 3.247094004105711e-07, + "loss": 0.189, + "step": 937 + }, + { + "epoch": 0.89, + "grad_norm": 0.6711617708206177, + "learning_rate": 3.1928826249516984e-07, + "loss": 0.1772, + "step": 938 + }, + { + "epoch": 0.89, + "grad_norm": 0.7171783447265625, + "learning_rate": 3.1391126739555134e-07, + "loss": 0.2374, + "step": 939 + }, + { + "epoch": 0.89, + "grad_norm": 0.7142137885093689, + "learning_rate": 3.0857846582103504e-07, + "loss": 0.27, + "step": 940 + }, + { + "epoch": 0.89, + "grad_norm": 0.7926027774810791, + "learning_rate": 3.0328990806415935e-07, + "loss": 0.2567, + "step": 941 + }, + { + "epoch": 0.89, + "grad_norm": 0.8226751089096069, + "learning_rate": 2.9804564400021e-07, + "loss": 0.2975, + "step": 942 + }, + { + "epoch": 0.89, + "grad_norm": 0.835600733757019, + "learning_rate": 2.92845723086746e-07, + "loss": 0.3123, + "step": 943 + }, + { + "epoch": 0.89, + "grad_norm": 0.7174431681632996, + "learning_rate": 2.876901943631372e-07, + "loss": 0.2407, + "step": 944 + }, + { + "epoch": 0.9, + "grad_norm": 0.665255069732666, + "learning_rate": 2.8257910645009935e-07, + "loss": 0.2518, + "step": 945 + }, + { + "epoch": 0.9, + "grad_norm": 0.8344467878341675, + "learning_rate": 2.7751250754923574e-07, + "loss": 0.261, + "step": 946 + }, + { + "epoch": 0.9, + "grad_norm": 0.8135045766830444, + "learning_rate": 2.724904454425836e-07, + "loss": 0.3022, + "step": 947 + }, + { + "epoch": 0.9, + "grad_norm": 0.8465341329574585, + "learning_rate": 2.6751296749216395e-07, + "loss": 0.3494, + "step": 948 + }, + { + "epoch": 0.9, + "grad_norm": 0.953181266784668, + "learning_rate": 2.625801206395312e-07, + "loss": 0.3343, + "step": 949 + }, + { + "epoch": 0.9, + "grad_norm": 0.7392805814743042, + "learning_rate": 2.5769195140533556e-07, + "loss": 0.2137, + "step": 950 + }, + { + "epoch": 0.9, + "grad_norm": 0.8524776101112366, + "learning_rate": 2.528485058888813e-07, + "loss": 0.3187, + "step": 951 + }, + { + "epoch": 0.9, + "grad_norm": 0.7331002354621887, + "learning_rate": 2.4804982976769197e-07, + "loss": 0.2828, + "step": 952 + }, + { + "epoch": 0.9, + "grad_norm": 0.9640606641769409, + "learning_rate": 2.4329596829708145e-07, + "loss": 0.3604, + "step": 953 + }, + { + "epoch": 0.9, + "grad_norm": 0.8314491510391235, + "learning_rate": 2.385869663097251e-07, + "loss": 0.2484, + "step": 954 + }, + { + "epoch": 0.9, + "grad_norm": 0.7713232040405273, + "learning_rate": 2.3392286821523723e-07, + "loss": 0.3094, + "step": 955 + }, + { + "epoch": 0.91, + "grad_norm": 0.7395336031913757, + "learning_rate": 2.2930371799975593e-07, + "loss": 0.2047, + "step": 956 + }, + { + "epoch": 0.91, + "grad_norm": 0.7347779273986816, + "learning_rate": 2.2472955922552164e-07, + "loss": 0.2186, + "step": 957 + }, + { + "epoch": 0.91, + "grad_norm": 0.8460179567337036, + "learning_rate": 2.202004350304715e-07, + "loss": 0.2971, + "step": 958 + }, + { + "epoch": 0.91, + "grad_norm": 0.8869367241859436, + "learning_rate": 2.1571638812783125e-07, + "loss": 0.3091, + "step": 959 + }, + { + "epoch": 0.91, + "grad_norm": 0.8569523692131042, + "learning_rate": 2.112774608057111e-07, + "loss": 0.259, + "step": 960 + }, + { + "epoch": 0.91, + "grad_norm": 0.7300274968147278, + "learning_rate": 2.068836949267089e-07, + "loss": 0.2597, + "step": 961 + }, + { + "epoch": 0.91, + "grad_norm": 0.7471948266029358, + "learning_rate": 2.0253513192751374e-07, + "loss": 0.2235, + "step": 962 + }, + { + "epoch": 0.91, + "grad_norm": 0.7210529446601868, + "learning_rate": 1.9823181281851513e-07, + "loss": 0.2374, + "step": 963 + }, + { + "epoch": 0.91, + "grad_norm": 0.7990643382072449, + "learning_rate": 1.9397377818341945e-07, + "loss": 0.2601, + "step": 964 + }, + { + "epoch": 0.91, + "grad_norm": 0.8323106169700623, + "learning_rate": 1.8976106817886197e-07, + "loss": 0.334, + "step": 965 + }, + { + "epoch": 0.92, + "grad_norm": 0.8063402771949768, + "learning_rate": 1.8559372253403152e-07, + "loss": 0.2553, + "step": 966 + }, + { + "epoch": 0.92, + "grad_norm": 0.7140501737594604, + "learning_rate": 1.814717805502958e-07, + "loss": 0.2324, + "step": 967 + }, + { + "epoch": 0.92, + "grad_norm": 0.6649307012557983, + "learning_rate": 1.7739528110083003e-07, + "loss": 0.2295, + "step": 968 + }, + { + "epoch": 0.92, + "grad_norm": 0.7434363961219788, + "learning_rate": 1.7336426263024896e-07, + "loss": 0.2461, + "step": 969 + }, + { + "epoch": 0.92, + "grad_norm": 0.5857141613960266, + "learning_rate": 1.6937876315424707e-07, + "loss": 0.1591, + "step": 970 + }, + { + "epoch": 0.92, + "grad_norm": 0.6685411930084229, + "learning_rate": 1.6543882025923884e-07, + "loss": 0.2452, + "step": 971 + }, + { + "epoch": 0.92, + "grad_norm": 0.775608479976654, + "learning_rate": 1.6154447110200256e-07, + "loss": 0.3133, + "step": 972 + }, + { + "epoch": 0.92, + "grad_norm": 0.7381249070167542, + "learning_rate": 1.5769575240933422e-07, + "loss": 0.2296, + "step": 973 + }, + { + "epoch": 0.92, + "grad_norm": 1.0521715879440308, + "learning_rate": 1.5389270047769578e-07, + "loss": 0.3918, + "step": 974 + }, + { + "epoch": 0.92, + "grad_norm": 0.767364501953125, + "learning_rate": 1.5013535117287648e-07, + "loss": 0.2897, + "step": 975 + }, + { + "epoch": 0.92, + "grad_norm": 0.598799467086792, + "learning_rate": 1.4642373992965365e-07, + "loss": 0.1702, + "step": 976 + }, + { + "epoch": 0.93, + "grad_norm": 0.6769062280654907, + "learning_rate": 1.427579017514591e-07, + "loss": 0.2071, + "step": 977 + }, + { + "epoch": 0.93, + "grad_norm": 0.6977760791778564, + "learning_rate": 1.3913787121004717e-07, + "loss": 0.2158, + "step": 978 + }, + { + "epoch": 0.93, + "grad_norm": 0.9182886481285095, + "learning_rate": 1.3556368244517116e-07, + "loss": 0.3066, + "step": 979 + }, + { + "epoch": 0.93, + "grad_norm": 0.6467658877372742, + "learning_rate": 1.3203536916425842e-07, + "loss": 0.2063, + "step": 980 + }, + { + "epoch": 0.93, + "grad_norm": 0.747478723526001, + "learning_rate": 1.2855296464209687e-07, + "loss": 0.2604, + "step": 981 + }, + { + "epoch": 0.93, + "grad_norm": 0.8544607758522034, + "learning_rate": 1.2511650172051636e-07, + "loss": 0.3624, + "step": 982 + }, + { + "epoch": 0.93, + "grad_norm": 0.7253915667533875, + "learning_rate": 1.217260128080816e-07, + "loss": 0.2207, + "step": 983 + }, + { + "epoch": 0.93, + "grad_norm": 0.7420864701271057, + "learning_rate": 1.183815298797858e-07, + "loss": 0.2552, + "step": 984 + }, + { + "epoch": 0.93, + "grad_norm": 0.7849740982055664, + "learning_rate": 1.1508308447674977e-07, + "loss": 0.2586, + "step": 985 + }, + { + "epoch": 0.93, + "grad_norm": 0.8122086524963379, + "learning_rate": 1.1183070770592442e-07, + "loss": 0.2471, + "step": 986 + }, + { + "epoch": 0.94, + "grad_norm": 0.6283002495765686, + "learning_rate": 1.0862443023979651e-07, + "loss": 0.2434, + "step": 987 + }, + { + "epoch": 0.94, + "grad_norm": 0.6364373564720154, + "learning_rate": 1.0546428231609896e-07, + "loss": 0.2017, + "step": 988 + }, + { + "epoch": 0.94, + "grad_norm": 0.8534179329872131, + "learning_rate": 1.0235029373752758e-07, + "loss": 0.363, + "step": 989 + }, + { + "epoch": 0.94, + "grad_norm": 0.6653255820274353, + "learning_rate": 9.928249387145983e-08, + "loss": 0.2208, + "step": 990 + }, + { + "epoch": 0.94, + "grad_norm": 0.7650781869888306, + "learning_rate": 9.626091164967599e-08, + "loss": 0.2101, + "step": 991 + }, + { + "epoch": 0.94, + "grad_norm": 0.6862877607345581, + "learning_rate": 9.32855755680867e-08, + "loss": 0.1838, + "step": 992 + }, + { + "epoch": 0.94, + "grad_norm": 0.8111743927001953, + "learning_rate": 9.035651368646647e-08, + "loss": 0.2743, + "step": 993 + }, + { + "epoch": 0.94, + "grad_norm": 0.931341290473938, + "learning_rate": 8.747375362818667e-08, + "loss": 0.3782, + "step": 994 + }, + { + "epoch": 0.94, + "grad_norm": 0.7730108499526978, + "learning_rate": 8.463732257995571e-08, + "loss": 0.2632, + "step": 995 + }, + { + "epoch": 0.94, + "grad_norm": 0.6024267673492432, + "learning_rate": 8.184724729156379e-08, + "loss": 0.1676, + "step": 996 + }, + { + "epoch": 0.94, + "grad_norm": 0.7430551648139954, + "learning_rate": 7.910355407562742e-08, + "loss": 0.231, + "step": 997 + }, + { + "epoch": 0.95, + "grad_norm": 0.7887012958526611, + "learning_rate": 7.640626880734581e-08, + "loss": 0.188, + "step": 998 + }, + { + "epoch": 0.95, + "grad_norm": 0.6349278688430786, + "learning_rate": 7.375541692425325e-08, + "loss": 0.1936, + "step": 999 + }, + { + "epoch": 0.95, + "grad_norm": 0.7530270218849182, + "learning_rate": 7.115102342598101e-08, + "loss": 0.2884, + "step": 1000 + }, + { + "epoch": 0.95, + "eval_loss": 0.23875625431537628, + "eval_runtime": 336.9275, + "eval_samples_per_second": 1.012, + "eval_steps_per_second": 0.255, + "step": 1000 + }, + { + "epoch": 0.95, + "grad_norm": 0.7549338340759277, + "learning_rate": 6.859311287402081e-08, + "loss": 0.1935, + "step": 1001 + }, + { + "epoch": 0.95, + "grad_norm": 0.6894304752349854, + "learning_rate": 6.608170939149283e-08, + "loss": 0.163, + "step": 1002 + }, + { + "epoch": 0.95, + "grad_norm": 0.7224317193031311, + "learning_rate": 6.361683666291973e-08, + "loss": 0.2729, + "step": 1003 + }, + { + "epoch": 0.95, + "grad_norm": 0.9411391615867615, + "learning_rate": 6.119851793400188e-08, + "loss": 0.3002, + "step": 1004 + }, + { + "epoch": 0.95, + "grad_norm": 0.7301786541938782, + "learning_rate": 5.882677601139919e-08, + "loss": 0.193, + "step": 1005 + }, + { + "epoch": 0.95, + "grad_norm": 0.7360510230064392, + "learning_rate": 5.6501633262513454e-08, + "loss": 0.2268, + "step": 1006 + }, + { + "epoch": 0.95, + "grad_norm": 0.8280237913131714, + "learning_rate": 5.4223111615281935e-08, + "loss": 0.3276, + "step": 1007 + }, + { + "epoch": 0.96, + "grad_norm": 0.780082643032074, + "learning_rate": 5.1991232557966344e-08, + "loss": 0.2664, + "step": 1008 + }, + { + "epoch": 0.96, + "grad_norm": 0.5786715149879456, + "learning_rate": 4.9806017138953053e-08, + "loss": 0.1433, + "step": 1009 + }, + { + "epoch": 0.96, + "grad_norm": 0.6797437071800232, + "learning_rate": 4.766748596655268e-08, + "loss": 0.1986, + "step": 1010 + }, + { + "epoch": 0.96, + "grad_norm": 0.7584928870201111, + "learning_rate": 4.55756592088058e-08, + "loss": 0.2975, + "step": 1011 + }, + { + "epoch": 0.96, + "grad_norm": 0.8755053281784058, + "learning_rate": 4.3530556593294194e-08, + "loss": 0.3136, + "step": 1012 + }, + { + "epoch": 0.96, + "grad_norm": 0.8384301066398621, + "learning_rate": 4.1532197406954357e-08, + "loss": 0.2797, + "step": 1013 + }, + { + "epoch": 0.96, + "grad_norm": 0.8307690620422363, + "learning_rate": 3.958060049589485e-08, + "loss": 0.3254, + "step": 1014 + }, + { + "epoch": 0.96, + "grad_norm": 0.6803908348083496, + "learning_rate": 3.767578426521923e-08, + "loss": 0.2112, + "step": 1015 + }, + { + "epoch": 0.96, + "grad_norm": 0.7312712073326111, + "learning_rate": 3.581776667885062e-08, + "loss": 0.2179, + "step": 1016 + }, + { + "epoch": 0.96, + "grad_norm": 0.6420474052429199, + "learning_rate": 3.40065652593663e-08, + "loss": 0.2204, + "step": 1017 + }, + { + "epoch": 0.96, + "grad_norm": 0.7175133228302002, + "learning_rate": 3.2242197087828944e-08, + "loss": 0.2561, + "step": 1018 + }, + { + "epoch": 0.97, + "grad_norm": 0.7940815091133118, + "learning_rate": 3.052467880362675e-08, + "loss": 0.3655, + "step": 1019 + }, + { + "epoch": 0.97, + "grad_norm": 0.6603592038154602, + "learning_rate": 2.8854026604315798e-08, + "loss": 0.2561, + "step": 1020 + }, + { + "epoch": 0.97, + "grad_norm": 0.7610006928443909, + "learning_rate": 2.723025624546849e-08, + "loss": 0.3093, + "step": 1021 + }, + { + "epoch": 0.97, + "grad_norm": 0.7235128879547119, + "learning_rate": 2.5653383040524228e-08, + "loss": 0.285, + "step": 1022 + }, + { + "epoch": 0.97, + "grad_norm": 0.7884017825126648, + "learning_rate": 2.4123421860645646e-08, + "loss": 0.2275, + "step": 1023 + }, + { + "epoch": 0.97, + "grad_norm": 0.8262652158737183, + "learning_rate": 2.264038713457706e-08, + "loss": 0.3359, + "step": 1024 + }, + { + "epoch": 0.97, + "grad_norm": 0.6753684878349304, + "learning_rate": 2.1204292848509557e-08, + "loss": 0.2222, + "step": 1025 + }, + { + "epoch": 0.97, + "grad_norm": 0.831423282623291, + "learning_rate": 1.98151525459489e-08, + "loss": 0.2273, + "step": 1026 + }, + { + "epoch": 0.97, + "grad_norm": 0.6862256526947021, + "learning_rate": 1.8472979327587292e-08, + "loss": 0.1598, + "step": 1027 + }, + { + "epoch": 0.97, + "grad_norm": 0.7676153182983398, + "learning_rate": 1.7177785851180127e-08, + "loss": 0.2834, + "step": 1028 + }, + { + "epoch": 0.98, + "grad_norm": 0.8516740798950195, + "learning_rate": 1.5929584331427218e-08, + "loss": 0.2838, + "step": 1029 + }, + { + "epoch": 0.98, + "grad_norm": 0.9321697354316711, + "learning_rate": 1.4728386539856754e-08, + "loss": 0.3405, + "step": 1030 + }, + { + "epoch": 0.98, + "grad_norm": 0.9343318939208984, + "learning_rate": 1.3574203804713748e-08, + "loss": 0.3623, + "step": 1031 + }, + { + "epoch": 0.98, + "grad_norm": 0.7499484419822693, + "learning_rate": 1.2467047010855659e-08, + "loss": 0.2538, + "step": 1032 + }, + { + "epoch": 0.98, + "grad_norm": 0.7069028615951538, + "learning_rate": 1.1406926599646373e-08, + "loss": 0.2288, + "step": 1033 + }, + { + "epoch": 0.98, + "grad_norm": 0.7008345723152161, + "learning_rate": 1.0393852568860718e-08, + "loss": 0.2133, + "step": 1034 + }, + { + "epoch": 0.98, + "grad_norm": 0.6172625422477722, + "learning_rate": 9.427834472588992e-09, + "loss": 0.1254, + "step": 1035 + }, + { + "epoch": 0.98, + "grad_norm": 0.7789463996887207, + "learning_rate": 8.508881421145366e-09, + "loss": 0.2299, + "step": 1036 + }, + { + "epoch": 0.98, + "grad_norm": 0.7931715846061707, + "learning_rate": 7.637002080985167e-09, + "loss": 0.2224, + "step": 1037 + }, + { + "epoch": 0.98, + "grad_norm": 0.8010485172271729, + "learning_rate": 6.81220467461996e-09, + "loss": 0.2829, + "step": 1038 + }, + { + "epoch": 0.98, + "grad_norm": 0.7054150700569153, + "learning_rate": 6.034496980542037e-09, + "loss": 0.202, + "step": 1039 + }, + { + "epoch": 0.99, + "grad_norm": 0.7824262380599976, + "learning_rate": 5.303886333151154e-09, + "loss": 0.2324, + "step": 1040 + }, + { + "epoch": 0.99, + "grad_norm": 0.7127040028572083, + "learning_rate": 4.620379622682358e-09, + "loss": 0.2468, + "step": 1041 + }, + { + "epoch": 0.99, + "grad_norm": 0.7531187534332275, + "learning_rate": 3.983983295146599e-09, + "loss": 0.2103, + "step": 1042 + }, + { + "epoch": 0.99, + "grad_norm": 0.6774247288703918, + "learning_rate": 3.394703352263551e-09, + "loss": 0.2367, + "step": 1043 + }, + { + "epoch": 0.99, + "grad_norm": 0.8376418352127075, + "learning_rate": 2.8525453514099966e-09, + "loss": 0.2763, + "step": 1044 + }, + { + "epoch": 0.99, + "grad_norm": 0.6632216572761536, + "learning_rate": 2.3575144055643094e-09, + "loss": 0.1538, + "step": 1045 + }, + { + "epoch": 0.99, + "grad_norm": 0.6601029634475708, + "learning_rate": 1.9096151832609378e-09, + "loss": 0.1739, + "step": 1046 + }, + { + "epoch": 0.99, + "grad_norm": 0.7332072854042053, + "learning_rate": 1.5088519085437736e-09, + "loss": 0.26, + "step": 1047 + }, + { + "epoch": 0.99, + "grad_norm": 0.7107292413711548, + "learning_rate": 1.1552283609272962e-09, + "loss": 0.2047, + "step": 1048 + }, + { + "epoch": 0.99, + "grad_norm": 0.8022828698158264, + "learning_rate": 8.487478753615997e-10, + "loss": 0.2998, + "step": 1049 + }, + { + "epoch": 0.99, + "grad_norm": 0.8682080507278442, + "learning_rate": 5.894133422001957e-10, + "loss": 0.3446, + "step": 1050 + }, + { + "epoch": 1.0, + "grad_norm": 0.6473788619041443, + "learning_rate": 3.772272071722594e-10, + "loss": 0.1819, + "step": 1051 + }, + { + "epoch": 1.0, + "grad_norm": 0.738074004650116, + "learning_rate": 2.1219147136264383e-10, + "loss": 0.2266, + "step": 1052 + }, + { + "epoch": 1.0, + "grad_norm": 0.805428683757782, + "learning_rate": 9.43076911874563e-11, + "loss": 0.3078, + "step": 1053 + }, + { + "epoch": 1.0, + "grad_norm": 0.8224419355392456, + "learning_rate": 2.3576978384065585e-11, + "loss": 0.2958, + "step": 1054 + }, + { + "epoch": 1.0, + "grad_norm": 0.6432749032974243, + "learning_rate": 0.0, + "loss": 0.187, + "step": 1055 + }, + { + "epoch": 1.0, + "step": 1055, + "total_flos": 1.1458310767955149e+17, + "train_loss": 0.26917854962049503, + "train_runtime": 48943.092, + "train_samples_per_second": 0.69, + "train_steps_per_second": 0.022 + } + ], + "logging_steps": 1.0, + "max_steps": 1055, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 1.1458310767955149e+17, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}