diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4199 @@ +{ + "best_global_step": 2375, + "best_metric": 0.3474566638469696, + "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_1756729607/checkpoint-2375", + "epoch": 10.0, + "eval_steps": 125, + "global_step": 2490, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.020080321285140562, + "grad_norm": 208.77610778808594, + "learning_rate": 8.032128514056225e-07, + "loss": 11.8262, + "num_input_tokens_seen": 832, + "step": 5 + }, + { + "epoch": 0.040160642570281124, + "grad_norm": 188.124267578125, + "learning_rate": 1.8072289156626506e-06, + "loss": 10.3109, + "num_input_tokens_seen": 1760, + "step": 10 + }, + { + "epoch": 0.060240963855421686, + "grad_norm": 118.94561767578125, + "learning_rate": 2.811244979919679e-06, + "loss": 8.5244, + "num_input_tokens_seen": 2608, + "step": 15 + }, + { + "epoch": 0.08032128514056225, + "grad_norm": 93.1993637084961, + "learning_rate": 3.8152610441767074e-06, + "loss": 6.312, + "num_input_tokens_seen": 3536, + "step": 20 + }, + { + "epoch": 0.10040160642570281, + "grad_norm": 80.68607330322266, + "learning_rate": 4.819277108433735e-06, + "loss": 4.854, + "num_input_tokens_seen": 4496, + "step": 25 + }, + { + "epoch": 0.12048192771084337, + "grad_norm": 53.525691986083984, + "learning_rate": 5.823293172690764e-06, + "loss": 2.9588, + "num_input_tokens_seen": 5424, + "step": 30 + }, + { + "epoch": 0.14056224899598393, + "grad_norm": 74.08545684814453, + "learning_rate": 6.827309236947792e-06, + "loss": 1.8179, + "num_input_tokens_seen": 6304, + "step": 35 + }, + { + "epoch": 0.1606425702811245, + "grad_norm": 39.353179931640625, + "learning_rate": 7.83132530120482e-06, + "loss": 0.9304, + "num_input_tokens_seen": 7072, + "step": 40 + }, + { + "epoch": 0.18072289156626506, + "grad_norm": 34.21826934814453, + "learning_rate": 8.835341365461847e-06, + "loss": 0.7447, + "num_input_tokens_seen": 7856, + "step": 45 + }, + { + "epoch": 0.20080321285140562, + "grad_norm": 46.137203216552734, + "learning_rate": 9.839357429718876e-06, + "loss": 0.4663, + "num_input_tokens_seen": 8880, + "step": 50 + }, + { + "epoch": 0.22088353413654618, + "grad_norm": 21.007869720458984, + "learning_rate": 1.0843373493975904e-05, + "loss": 0.419, + "num_input_tokens_seen": 9680, + "step": 55 + }, + { + "epoch": 0.24096385542168675, + "grad_norm": 32.040428161621094, + "learning_rate": 1.1847389558232933e-05, + "loss": 0.3886, + "num_input_tokens_seen": 10576, + "step": 60 + }, + { + "epoch": 0.26104417670682734, + "grad_norm": 33.35049819946289, + "learning_rate": 1.285140562248996e-05, + "loss": 0.3719, + "num_input_tokens_seen": 11424, + "step": 65 + }, + { + "epoch": 0.28112449799196787, + "grad_norm": 27.942962646484375, + "learning_rate": 1.3855421686746989e-05, + "loss": 0.4068, + "num_input_tokens_seen": 12224, + "step": 70 + }, + { + "epoch": 0.30120481927710846, + "grad_norm": 48.38943862915039, + "learning_rate": 1.4859437751004016e-05, + "loss": 0.4348, + "num_input_tokens_seen": 13168, + "step": 75 + }, + { + "epoch": 0.321285140562249, + "grad_norm": 54.91352462768555, + "learning_rate": 1.5863453815261046e-05, + "loss": 0.4399, + "num_input_tokens_seen": 14080, + "step": 80 + }, + { + "epoch": 0.3413654618473896, + "grad_norm": 100.44646453857422, + "learning_rate": 1.6867469879518073e-05, + "loss": 0.873, + "num_input_tokens_seen": 15056, + "step": 85 + }, + { + "epoch": 0.3614457831325301, + "grad_norm": 26.445947647094727, + "learning_rate": 1.78714859437751e-05, + "loss": 0.5697, + "num_input_tokens_seen": 15904, + "step": 90 + }, + { + "epoch": 0.3815261044176707, + "grad_norm": 8.536154747009277, + "learning_rate": 1.8875502008032127e-05, + "loss": 0.318, + "num_input_tokens_seen": 16688, + "step": 95 + }, + { + "epoch": 0.40160642570281124, + "grad_norm": 0.5681027173995972, + "learning_rate": 1.9879518072289157e-05, + "loss": 0.0212, + "num_input_tokens_seen": 17552, + "step": 100 + }, + { + "epoch": 0.42168674698795183, + "grad_norm": 43.08428955078125, + "learning_rate": 2.0883534136546184e-05, + "loss": 1.5137, + "num_input_tokens_seen": 18400, + "step": 105 + }, + { + "epoch": 0.44176706827309237, + "grad_norm": 32.574432373046875, + "learning_rate": 2.1887550200803214e-05, + "loss": 0.9558, + "num_input_tokens_seen": 19456, + "step": 110 + }, + { + "epoch": 0.46184738955823296, + "grad_norm": 12.149736404418945, + "learning_rate": 2.289156626506024e-05, + "loss": 0.2867, + "num_input_tokens_seen": 20288, + "step": 115 + }, + { + "epoch": 0.4819277108433735, + "grad_norm": 59.057838439941406, + "learning_rate": 2.389558232931727e-05, + "loss": 0.7216, + "num_input_tokens_seen": 21328, + "step": 120 + }, + { + "epoch": 0.5020080321285141, + "grad_norm": 38.10276794433594, + "learning_rate": 2.48995983935743e-05, + "loss": 0.5349, + "num_input_tokens_seen": 22304, + "step": 125 + }, + { + "epoch": 0.5020080321285141, + "eval_loss": 1.2113524675369263, + "eval_runtime": 1.2329, + "eval_samples_per_second": 45.42, + "eval_steps_per_second": 22.71, + "num_input_tokens_seen": 22304, + "step": 125 + }, + { + "epoch": 0.5220883534136547, + "grad_norm": 51.98005676269531, + "learning_rate": 2.5903614457831325e-05, + "loss": 1.3225, + "num_input_tokens_seen": 23056, + "step": 130 + }, + { + "epoch": 0.5421686746987951, + "grad_norm": 16.78879165649414, + "learning_rate": 2.6907630522088356e-05, + "loss": 0.3651, + "num_input_tokens_seen": 23840, + "step": 135 + }, + { + "epoch": 0.5622489959839357, + "grad_norm": 7.14707088470459, + "learning_rate": 2.791164658634538e-05, + "loss": 0.5608, + "num_input_tokens_seen": 24832, + "step": 140 + }, + { + "epoch": 0.5823293172690763, + "grad_norm": 2.345181465148926, + "learning_rate": 2.891566265060241e-05, + "loss": 0.4016, + "num_input_tokens_seen": 25648, + "step": 145 + }, + { + "epoch": 0.6024096385542169, + "grad_norm": 13.92953109741211, + "learning_rate": 2.991967871485944e-05, + "loss": 0.4123, + "num_input_tokens_seen": 26496, + "step": 150 + }, + { + "epoch": 0.6224899598393574, + "grad_norm": 13.532402038574219, + "learning_rate": 3.092369477911647e-05, + "loss": 0.538, + "num_input_tokens_seen": 27392, + "step": 155 + }, + { + "epoch": 0.642570281124498, + "grad_norm": 11.975309371948242, + "learning_rate": 3.192771084337349e-05, + "loss": 0.3054, + "num_input_tokens_seen": 28272, + "step": 160 + }, + { + "epoch": 0.6626506024096386, + "grad_norm": 10.431339263916016, + "learning_rate": 3.2931726907630524e-05, + "loss": 0.4957, + "num_input_tokens_seen": 29184, + "step": 165 + }, + { + "epoch": 0.6827309236947792, + "grad_norm": 2.0895514488220215, + "learning_rate": 3.393574297188755e-05, + "loss": 0.3453, + "num_input_tokens_seen": 30128, + "step": 170 + }, + { + "epoch": 0.7028112449799196, + "grad_norm": 3.5245909690856934, + "learning_rate": 3.4939759036144585e-05, + "loss": 0.4688, + "num_input_tokens_seen": 30976, + "step": 175 + }, + { + "epoch": 0.7228915662650602, + "grad_norm": 5.512556552886963, + "learning_rate": 3.5943775100401605e-05, + "loss": 0.4289, + "num_input_tokens_seen": 31776, + "step": 180 + }, + { + "epoch": 0.7429718875502008, + "grad_norm": 3.1914124488830566, + "learning_rate": 3.694779116465863e-05, + "loss": 0.3237, + "num_input_tokens_seen": 32608, + "step": 185 + }, + { + "epoch": 0.7630522088353414, + "grad_norm": 12.990925788879395, + "learning_rate": 3.7951807228915666e-05, + "loss": 0.5221, + "num_input_tokens_seen": 33360, + "step": 190 + }, + { + "epoch": 0.7831325301204819, + "grad_norm": 0.20752322673797607, + "learning_rate": 3.895582329317269e-05, + "loss": 0.2575, + "num_input_tokens_seen": 34176, + "step": 195 + }, + { + "epoch": 0.8032128514056225, + "grad_norm": 3.7412514686584473, + "learning_rate": 3.995983935742972e-05, + "loss": 1.23, + "num_input_tokens_seen": 34992, + "step": 200 + }, + { + "epoch": 0.8232931726907631, + "grad_norm": 2.3068435192108154, + "learning_rate": 4.0963855421686746e-05, + "loss": 0.4066, + "num_input_tokens_seen": 35888, + "step": 205 + }, + { + "epoch": 0.8433734939759037, + "grad_norm": 2.8328449726104736, + "learning_rate": 4.196787148594378e-05, + "loss": 0.3846, + "num_input_tokens_seen": 36848, + "step": 210 + }, + { + "epoch": 0.8634538152610441, + "grad_norm": 2.105517625808716, + "learning_rate": 4.297188755020081e-05, + "loss": 0.3303, + "num_input_tokens_seen": 37888, + "step": 215 + }, + { + "epoch": 0.8835341365461847, + "grad_norm": 2.6428239345550537, + "learning_rate": 4.3975903614457834e-05, + "loss": 0.4356, + "num_input_tokens_seen": 38768, + "step": 220 + }, + { + "epoch": 0.9036144578313253, + "grad_norm": 6.220267295837402, + "learning_rate": 4.497991967871486e-05, + "loss": 0.4328, + "num_input_tokens_seen": 39488, + "step": 225 + }, + { + "epoch": 0.9236947791164659, + "grad_norm": 4.502399444580078, + "learning_rate": 4.598393574297189e-05, + "loss": 0.6742, + "num_input_tokens_seen": 40336, + "step": 230 + }, + { + "epoch": 0.9437751004016064, + "grad_norm": 4.788579940795898, + "learning_rate": 4.698795180722892e-05, + "loss": 0.3987, + "num_input_tokens_seen": 41328, + "step": 235 + }, + { + "epoch": 0.963855421686747, + "grad_norm": 3.492311954498291, + "learning_rate": 4.799196787148594e-05, + "loss": 0.3388, + "num_input_tokens_seen": 42176, + "step": 240 + }, + { + "epoch": 0.9839357429718876, + "grad_norm": 2.018967628479004, + "learning_rate": 4.8995983935742975e-05, + "loss": 0.8551, + "num_input_tokens_seen": 43312, + "step": 245 + }, + { + "epoch": 1.0040160642570282, + "grad_norm": 1.773940086364746, + "learning_rate": 5e-05, + "loss": 0.4306, + "num_input_tokens_seen": 44064, + "step": 250 + }, + { + "epoch": 1.0040160642570282, + "eval_loss": 0.4651975631713867, + "eval_runtime": 1.2232, + "eval_samples_per_second": 45.781, + "eval_steps_per_second": 22.891, + "num_input_tokens_seen": 44064, + "step": 250 + }, + { + "epoch": 1.0240963855421688, + "grad_norm": 13.694576263427734, + "learning_rate": 4.9999385864396127e-05, + "loss": 0.7501, + "num_input_tokens_seen": 44816, + "step": 255 + }, + { + "epoch": 1.0441767068273093, + "grad_norm": 7.658875942230225, + "learning_rate": 4.99975434877575e-05, + "loss": 1.1828, + "num_input_tokens_seen": 45776, + "step": 260 + }, + { + "epoch": 1.0642570281124497, + "grad_norm": 1.0638893842697144, + "learning_rate": 4.999447296060165e-05, + "loss": 0.7021, + "num_input_tokens_seen": 46592, + "step": 265 + }, + { + "epoch": 1.0843373493975903, + "grad_norm": 4.289158344268799, + "learning_rate": 4.999017443378618e-05, + "loss": 0.37, + "num_input_tokens_seen": 47536, + "step": 270 + }, + { + "epoch": 1.104417670682731, + "grad_norm": 3.189358711242676, + "learning_rate": 4.998464811850137e-05, + "loss": 0.3415, + "num_input_tokens_seen": 48320, + "step": 275 + }, + { + "epoch": 1.1244979919678715, + "grad_norm": 2.700920581817627, + "learning_rate": 4.997789428625975e-05, + "loss": 0.381, + "num_input_tokens_seen": 49216, + "step": 280 + }, + { + "epoch": 1.144578313253012, + "grad_norm": 2.0674901008605957, + "learning_rate": 4.996991326888286e-05, + "loss": 0.3487, + "num_input_tokens_seen": 50048, + "step": 285 + }, + { + "epoch": 1.1646586345381527, + "grad_norm": 2.0477442741394043, + "learning_rate": 4.996070545848484e-05, + "loss": 0.346, + "num_input_tokens_seen": 50832, + "step": 290 + }, + { + "epoch": 1.1847389558232932, + "grad_norm": 2.814277410507202, + "learning_rate": 4.995027130745321e-05, + "loss": 0.3439, + "num_input_tokens_seen": 51824, + "step": 295 + }, + { + "epoch": 1.2048192771084336, + "grad_norm": 1.5188792943954468, + "learning_rate": 4.9938611328426685e-05, + "loss": 0.5375, + "num_input_tokens_seen": 52608, + "step": 300 + }, + { + "epoch": 1.2248995983935742, + "grad_norm": 0.27745991945266724, + "learning_rate": 4.992572609426992e-05, + "loss": 0.3537, + "num_input_tokens_seen": 53440, + "step": 305 + }, + { + "epoch": 1.2449799196787148, + "grad_norm": 0.40650656819343567, + "learning_rate": 4.99116162380454e-05, + "loss": 0.3549, + "num_input_tokens_seen": 54320, + "step": 310 + }, + { + "epoch": 1.2650602409638554, + "grad_norm": 1.8287845849990845, + "learning_rate": 4.989628245298233e-05, + "loss": 0.3352, + "num_input_tokens_seen": 55072, + "step": 315 + }, + { + "epoch": 1.285140562248996, + "grad_norm": 0.5225471258163452, + "learning_rate": 4.987972549244257e-05, + "loss": 0.3695, + "num_input_tokens_seen": 56224, + "step": 320 + }, + { + "epoch": 1.3052208835341366, + "grad_norm": 1.361476182937622, + "learning_rate": 4.986194616988364e-05, + "loss": 0.281, + "num_input_tokens_seen": 56912, + "step": 325 + }, + { + "epoch": 1.3253012048192772, + "grad_norm": 1.0196197032928467, + "learning_rate": 4.984294535881875e-05, + "loss": 0.488, + "num_input_tokens_seen": 57648, + "step": 330 + }, + { + "epoch": 1.3453815261044177, + "grad_norm": 0.7983678579330444, + "learning_rate": 4.982272399277386e-05, + "loss": 0.3598, + "num_input_tokens_seen": 58608, + "step": 335 + }, + { + "epoch": 1.3654618473895583, + "grad_norm": 0.21441864967346191, + "learning_rate": 4.980128306524183e-05, + "loss": 0.3973, + "num_input_tokens_seen": 59424, + "step": 340 + }, + { + "epoch": 1.3855421686746987, + "grad_norm": 1.7802695035934448, + "learning_rate": 4.9778623629633635e-05, + "loss": 0.3078, + "num_input_tokens_seen": 60272, + "step": 345 + }, + { + "epoch": 1.4056224899598393, + "grad_norm": 2.1653060913085938, + "learning_rate": 4.975474679922655e-05, + "loss": 0.4871, + "num_input_tokens_seen": 61056, + "step": 350 + }, + { + "epoch": 1.4257028112449799, + "grad_norm": 1.056137204170227, + "learning_rate": 4.972965374710952e-05, + "loss": 0.283, + "num_input_tokens_seen": 61968, + "step": 355 + }, + { + "epoch": 1.4457831325301205, + "grad_norm": 0.806865394115448, + "learning_rate": 4.9703345706125485e-05, + "loss": 0.3467, + "num_input_tokens_seen": 62800, + "step": 360 + }, + { + "epoch": 1.465863453815261, + "grad_norm": 0.4291660189628601, + "learning_rate": 4.96758239688108e-05, + "loss": 0.4493, + "num_input_tokens_seen": 63824, + "step": 365 + }, + { + "epoch": 1.4859437751004017, + "grad_norm": 0.9207323789596558, + "learning_rate": 4.964708988733178e-05, + "loss": 0.3217, + "num_input_tokens_seen": 64800, + "step": 370 + }, + { + "epoch": 1.5060240963855422, + "grad_norm": 0.7873408198356628, + "learning_rate": 4.961714487341822e-05, + "loss": 0.3766, + "num_input_tokens_seen": 65808, + "step": 375 + }, + { + "epoch": 1.5060240963855422, + "eval_loss": 0.38677313923835754, + "eval_runtime": 1.2517, + "eval_samples_per_second": 44.74, + "eval_steps_per_second": 22.37, + "num_input_tokens_seen": 65808, + "step": 375 + }, + { + "epoch": 1.5261044176706826, + "grad_norm": 0.23284302651882172, + "learning_rate": 4.9585990398294043e-05, + "loss": 0.4091, + "num_input_tokens_seen": 66752, + "step": 380 + }, + { + "epoch": 1.5461847389558234, + "grad_norm": 0.18355536460876465, + "learning_rate": 4.9553627992605066e-05, + "loss": 0.3531, + "num_input_tokens_seen": 67632, + "step": 385 + }, + { + "epoch": 1.5662650602409638, + "grad_norm": 0.12290728837251663, + "learning_rate": 4.952005924634372e-05, + "loss": 0.3506, + "num_input_tokens_seen": 68400, + "step": 390 + }, + { + "epoch": 1.5863453815261044, + "grad_norm": 0.8354266881942749, + "learning_rate": 4.948528580877099e-05, + "loss": 0.3255, + "num_input_tokens_seen": 69408, + "step": 395 + }, + { + "epoch": 1.606425702811245, + "grad_norm": 0.3600679337978363, + "learning_rate": 4.944930938833535e-05, + "loss": 0.3689, + "num_input_tokens_seen": 70352, + "step": 400 + }, + { + "epoch": 1.6265060240963856, + "grad_norm": 0.2819594442844391, + "learning_rate": 4.9412131752588874e-05, + "loss": 0.374, + "num_input_tokens_seen": 71184, + "step": 405 + }, + { + "epoch": 1.6465863453815262, + "grad_norm": 0.9768486618995667, + "learning_rate": 4.937375472810033e-05, + "loss": 0.3785, + "num_input_tokens_seen": 72272, + "step": 410 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.11346471309661865, + "learning_rate": 4.9334180200365486e-05, + "loss": 0.3645, + "num_input_tokens_seen": 73136, + "step": 415 + }, + { + "epoch": 1.6867469879518073, + "grad_norm": 0.15687525272369385, + "learning_rate": 4.929341011371448e-05, + "loss": 0.3477, + "num_input_tokens_seen": 73872, + "step": 420 + }, + { + "epoch": 1.7068273092369477, + "grad_norm": 0.31356531381607056, + "learning_rate": 4.9251446471216226e-05, + "loss": 0.3495, + "num_input_tokens_seen": 74784, + "step": 425 + }, + { + "epoch": 1.7269076305220885, + "grad_norm": 0.2755882441997528, + "learning_rate": 4.9208291334580104e-05, + "loss": 0.3477, + "num_input_tokens_seen": 75664, + "step": 430 + }, + { + "epoch": 1.7469879518072289, + "grad_norm": 1.0474437475204468, + "learning_rate": 4.9163946824054574e-05, + "loss": 0.4005, + "num_input_tokens_seen": 76592, + "step": 435 + }, + { + "epoch": 1.7670682730923695, + "grad_norm": 0.9340880513191223, + "learning_rate": 4.911841511832305e-05, + "loss": 0.3454, + "num_input_tokens_seen": 77408, + "step": 440 + }, + { + "epoch": 1.78714859437751, + "grad_norm": 0.7397641539573669, + "learning_rate": 4.907169845439688e-05, + "loss": 0.3494, + "num_input_tokens_seen": 78272, + "step": 445 + }, + { + "epoch": 1.8072289156626506, + "grad_norm": 0.32739967107772827, + "learning_rate": 4.902379912750537e-05, + "loss": 0.3211, + "num_input_tokens_seen": 79200, + "step": 450 + }, + { + "epoch": 1.8273092369477912, + "grad_norm": 0.6281508207321167, + "learning_rate": 4.897471949098309e-05, + "loss": 0.3843, + "num_input_tokens_seen": 80112, + "step": 455 + }, + { + "epoch": 1.8473895582329316, + "grad_norm": 0.6183443069458008, + "learning_rate": 4.892446195615423e-05, + "loss": 0.3143, + "num_input_tokens_seen": 81168, + "step": 460 + }, + { + "epoch": 1.8674698795180724, + "grad_norm": 0.6530241966247559, + "learning_rate": 4.88730289922141e-05, + "loss": 0.3845, + "num_input_tokens_seen": 82112, + "step": 465 + }, + { + "epoch": 1.8875502008032128, + "grad_norm": 0.2493213266134262, + "learning_rate": 4.8820423126107845e-05, + "loss": 0.3467, + "num_input_tokens_seen": 83072, + "step": 470 + }, + { + "epoch": 1.9076305220883534, + "grad_norm": 0.7678804993629456, + "learning_rate": 4.87666469424063e-05, + "loss": 0.3683, + "num_input_tokens_seen": 83920, + "step": 475 + }, + { + "epoch": 1.927710843373494, + "grad_norm": 0.8148710131645203, + "learning_rate": 4.8711703083178986e-05, + "loss": 0.3512, + "num_input_tokens_seen": 84768, + "step": 480 + }, + { + "epoch": 1.9477911646586346, + "grad_norm": 0.8058724999427795, + "learning_rate": 4.865559424786432e-05, + "loss": 0.3478, + "num_input_tokens_seen": 85616, + "step": 485 + }, + { + "epoch": 1.9678714859437751, + "grad_norm": 0.1543155312538147, + "learning_rate": 4.859832319313697e-05, + "loss": 0.3477, + "num_input_tokens_seen": 86400, + "step": 490 + }, + { + "epoch": 1.9879518072289155, + "grad_norm": 1.0387096405029297, + "learning_rate": 4.8539892732772455e-05, + "loss": 0.3753, + "num_input_tokens_seen": 87216, + "step": 495 + }, + { + "epoch": 2.0080321285140563, + "grad_norm": 0.2593827545642853, + "learning_rate": 4.848030573750885e-05, + "loss": 0.3159, + "num_input_tokens_seen": 88048, + "step": 500 + }, + { + "epoch": 2.0080321285140563, + "eval_loss": 0.3829512596130371, + "eval_runtime": 1.2272, + "eval_samples_per_second": 45.632, + "eval_steps_per_second": 22.816, + "num_input_tokens_seen": 88048, + "step": 500 + }, + { + "epoch": 2.0281124497991967, + "grad_norm": 0.6328459978103638, + "learning_rate": 4.841956513490577e-05, + "loss": 0.3501, + "num_input_tokens_seen": 88896, + "step": 505 + }, + { + "epoch": 2.0481927710843375, + "grad_norm": 0.17319746315479279, + "learning_rate": 4.8357673909200563e-05, + "loss": 0.3452, + "num_input_tokens_seen": 89744, + "step": 510 + }, + { + "epoch": 2.068273092369478, + "grad_norm": 0.13001570105552673, + "learning_rate": 4.8294635101161645e-05, + "loss": 0.3738, + "num_input_tokens_seen": 90528, + "step": 515 + }, + { + "epoch": 2.0883534136546187, + "grad_norm": 0.1256828010082245, + "learning_rate": 4.8230451807939135e-05, + "loss": 0.3347, + "num_input_tokens_seen": 91360, + "step": 520 + }, + { + "epoch": 2.108433734939759, + "grad_norm": 0.7363554835319519, + "learning_rate": 4.816512718291267e-05, + "loss": 0.346, + "num_input_tokens_seen": 92176, + "step": 525 + }, + { + "epoch": 2.1285140562248994, + "grad_norm": 0.19434113800525665, + "learning_rate": 4.80986644355365e-05, + "loss": 0.341, + "num_input_tokens_seen": 93104, + "step": 530 + }, + { + "epoch": 2.1485943775100402, + "grad_norm": 0.1456967294216156, + "learning_rate": 4.803106683118177e-05, + "loss": 0.3588, + "num_input_tokens_seen": 93984, + "step": 535 + }, + { + "epoch": 2.1686746987951806, + "grad_norm": 0.9451452493667603, + "learning_rate": 4.796233769097615e-05, + "loss": 0.3438, + "num_input_tokens_seen": 94896, + "step": 540 + }, + { + "epoch": 2.1887550200803214, + "grad_norm": 0.1497451663017273, + "learning_rate": 4.789248039164058e-05, + "loss": 0.375, + "num_input_tokens_seen": 95824, + "step": 545 + }, + { + "epoch": 2.208835341365462, + "grad_norm": 0.8181028366088867, + "learning_rate": 4.782149836532345e-05, + "loss": 0.3607, + "num_input_tokens_seen": 96688, + "step": 550 + }, + { + "epoch": 2.2289156626506026, + "grad_norm": 0.6427181959152222, + "learning_rate": 4.7749395099431924e-05, + "loss": 0.3312, + "num_input_tokens_seen": 97488, + "step": 555 + }, + { + "epoch": 2.248995983935743, + "grad_norm": 1.3496628999710083, + "learning_rate": 4.7676174136460625e-05, + "loss": 0.4083, + "num_input_tokens_seen": 98288, + "step": 560 + }, + { + "epoch": 2.2690763052208833, + "grad_norm": 0.2799893915653229, + "learning_rate": 4.760183907381757e-05, + "loss": 0.3447, + "num_input_tokens_seen": 99200, + "step": 565 + }, + { + "epoch": 2.289156626506024, + "grad_norm": 0.22829043865203857, + "learning_rate": 4.752639356364744e-05, + "loss": 0.3228, + "num_input_tokens_seen": 99984, + "step": 570 + }, + { + "epoch": 2.3092369477911645, + "grad_norm": 0.17503726482391357, + "learning_rate": 4.7449841312652166e-05, + "loss": 0.3781, + "num_input_tokens_seen": 100784, + "step": 575 + }, + { + "epoch": 2.3293172690763053, + "grad_norm": 0.8941397666931152, + "learning_rate": 4.737218608190878e-05, + "loss": 0.367, + "num_input_tokens_seen": 101584, + "step": 580 + }, + { + "epoch": 2.3493975903614457, + "grad_norm": 0.7553220987319946, + "learning_rate": 4.729343168668463e-05, + "loss": 0.3603, + "num_input_tokens_seen": 102480, + "step": 585 + }, + { + "epoch": 2.3694779116465865, + "grad_norm": 0.2362431138753891, + "learning_rate": 4.721358199624997e-05, + "loss": 0.3631, + "num_input_tokens_seen": 103408, + "step": 590 + }, + { + "epoch": 2.389558232931727, + "grad_norm": 0.2296190857887268, + "learning_rate": 4.713264093368783e-05, + "loss": 0.3911, + "num_input_tokens_seen": 104160, + "step": 595 + }, + { + "epoch": 2.4096385542168672, + "grad_norm": 0.16303616762161255, + "learning_rate": 4.705061247570128e-05, + "loss": 0.3406, + "num_input_tokens_seen": 105040, + "step": 600 + }, + { + "epoch": 2.429718875502008, + "grad_norm": 0.15550056099891663, + "learning_rate": 4.6967500652418034e-05, + "loss": 0.3582, + "num_input_tokens_seen": 105856, + "step": 605 + }, + { + "epoch": 2.4497991967871484, + "grad_norm": 0.9154328107833862, + "learning_rate": 4.6883309547192476e-05, + "loss": 0.3701, + "num_input_tokens_seen": 106928, + "step": 610 + }, + { + "epoch": 2.4698795180722892, + "grad_norm": 0.5483267903327942, + "learning_rate": 4.679804329640505e-05, + "loss": 0.3423, + "num_input_tokens_seen": 107808, + "step": 615 + }, + { + "epoch": 2.4899598393574296, + "grad_norm": 0.5274845957756042, + "learning_rate": 4.6711706089258955e-05, + "loss": 0.3104, + "num_input_tokens_seen": 108656, + "step": 620 + }, + { + "epoch": 2.5100401606425704, + "grad_norm": 0.23020295798778534, + "learning_rate": 4.6624302167574436e-05, + "loss": 0.3958, + "num_input_tokens_seen": 109696, + "step": 625 + }, + { + "epoch": 2.5100401606425704, + "eval_loss": 0.3675794303417206, + "eval_runtime": 1.213, + "eval_samples_per_second": 46.168, + "eval_steps_per_second": 23.084, + "num_input_tokens_seen": 109696, + "step": 625 + }, + { + "epoch": 2.5301204819277108, + "grad_norm": 0.13925790786743164, + "learning_rate": 4.653583582558031e-05, + "loss": 0.3587, + "num_input_tokens_seen": 110576, + "step": 630 + }, + { + "epoch": 2.550200803212851, + "grad_norm": 0.9564950466156006, + "learning_rate": 4.6446311409703006e-05, + "loss": 0.365, + "num_input_tokens_seen": 111440, + "step": 635 + }, + { + "epoch": 2.570281124497992, + "grad_norm": 0.2103191763162613, + "learning_rate": 4.635573331835302e-05, + "loss": 0.3339, + "num_input_tokens_seen": 112192, + "step": 640 + }, + { + "epoch": 2.5903614457831328, + "grad_norm": 0.230157271027565, + "learning_rate": 4.6264106001708824e-05, + "loss": 0.3631, + "num_input_tokens_seen": 113024, + "step": 645 + }, + { + "epoch": 2.610441767068273, + "grad_norm": 0.19389480352401733, + "learning_rate": 4.61714339614982e-05, + "loss": 0.3795, + "num_input_tokens_seen": 113952, + "step": 650 + }, + { + "epoch": 2.6305220883534135, + "grad_norm": 0.14889311790466309, + "learning_rate": 4.607772175077711e-05, + "loss": 0.3586, + "num_input_tokens_seen": 114928, + "step": 655 + }, + { + "epoch": 2.6506024096385543, + "grad_norm": 0.10515565425157547, + "learning_rate": 4.598297397370596e-05, + "loss": 0.3726, + "num_input_tokens_seen": 115728, + "step": 660 + }, + { + "epoch": 2.6706827309236947, + "grad_norm": 0.6394887566566467, + "learning_rate": 4.588719528532342e-05, + "loss": 0.3549, + "num_input_tokens_seen": 116544, + "step": 665 + }, + { + "epoch": 2.6907630522088355, + "grad_norm": 0.14106032252311707, + "learning_rate": 4.5790390391317675e-05, + "loss": 0.3379, + "num_input_tokens_seen": 117568, + "step": 670 + }, + { + "epoch": 2.710843373493976, + "grad_norm": 0.08746648579835892, + "learning_rate": 4.5692564047795316e-05, + "loss": 0.3688, + "num_input_tokens_seen": 118368, + "step": 675 + }, + { + "epoch": 2.7309236947791167, + "grad_norm": 0.5902343392372131, + "learning_rate": 4.5593721061047576e-05, + "loss": 0.3455, + "num_input_tokens_seen": 119120, + "step": 680 + }, + { + "epoch": 2.751004016064257, + "grad_norm": 0.57841557264328, + "learning_rate": 4.549386628731425e-05, + "loss": 0.3575, + "num_input_tokens_seen": 120064, + "step": 685 + }, + { + "epoch": 2.7710843373493974, + "grad_norm": 0.10715785622596741, + "learning_rate": 4.5393004632545064e-05, + "loss": 0.3721, + "num_input_tokens_seen": 120960, + "step": 690 + }, + { + "epoch": 2.791164658634538, + "grad_norm": 0.09298089146614075, + "learning_rate": 4.529114105215869e-05, + "loss": 0.3545, + "num_input_tokens_seen": 121760, + "step": 695 + }, + { + "epoch": 2.8112449799196786, + "grad_norm": 0.659833550453186, + "learning_rate": 4.518828055079925e-05, + "loss": 0.3675, + "num_input_tokens_seen": 122720, + "step": 700 + }, + { + "epoch": 2.8313253012048194, + "grad_norm": 0.600629985332489, + "learning_rate": 4.508442818209042e-05, + "loss": 0.3543, + "num_input_tokens_seen": 123712, + "step": 705 + }, + { + "epoch": 2.8514056224899598, + "grad_norm": 0.1273653358221054, + "learning_rate": 4.4979589048387186e-05, + "loss": 0.3561, + "num_input_tokens_seen": 124624, + "step": 710 + }, + { + "epoch": 2.8714859437751006, + "grad_norm": 0.5190867185592651, + "learning_rate": 4.487376830052511e-05, + "loss": 0.3474, + "num_input_tokens_seen": 125696, + "step": 715 + }, + { + "epoch": 2.891566265060241, + "grad_norm": 0.5262147784233093, + "learning_rate": 4.476697113756731e-05, + "loss": 0.2977, + "num_input_tokens_seen": 126480, + "step": 720 + }, + { + "epoch": 2.9116465863453813, + "grad_norm": 0.9929115772247314, + "learning_rate": 4.465920280654901e-05, + "loss": 0.3658, + "num_input_tokens_seen": 127312, + "step": 725 + }, + { + "epoch": 2.931726907630522, + "grad_norm": 0.26060280203819275, + "learning_rate": 4.4550468602219716e-05, + "loss": 0.3475, + "num_input_tokens_seen": 128352, + "step": 730 + }, + { + "epoch": 2.9518072289156625, + "grad_norm": 0.2218063920736313, + "learning_rate": 4.4440773866783136e-05, + "loss": 0.4262, + "num_input_tokens_seen": 129232, + "step": 735 + }, + { + "epoch": 2.9718875502008033, + "grad_norm": 0.6310736536979675, + "learning_rate": 4.433012398963468e-05, + "loss": 0.4037, + "num_input_tokens_seen": 130080, + "step": 740 + }, + { + "epoch": 2.9919678714859437, + "grad_norm": 0.5500450134277344, + "learning_rate": 4.421852440709666e-05, + "loss": 0.3459, + "num_input_tokens_seen": 130880, + "step": 745 + }, + { + "epoch": 3.0120481927710845, + "grad_norm": 0.5436108112335205, + "learning_rate": 4.4105980602151256e-05, + "loss": 0.3521, + "num_input_tokens_seen": 131872, + "step": 750 + }, + { + "epoch": 3.0120481927710845, + "eval_loss": 0.35073402523994446, + "eval_runtime": 1.2195, + "eval_samples_per_second": 45.921, + "eval_steps_per_second": 22.961, + "num_input_tokens_seen": 131872, + "step": 750 + }, + { + "epoch": 3.032128514056225, + "grad_norm": 0.5504011511802673, + "learning_rate": 4.399249810417108e-05, + "loss": 0.354, + "num_input_tokens_seen": 132656, + "step": 755 + }, + { + "epoch": 3.0522088353413657, + "grad_norm": 0.09489892423152924, + "learning_rate": 4.387808248864751e-05, + "loss": 0.3708, + "num_input_tokens_seen": 133472, + "step": 760 + }, + { + "epoch": 3.072289156626506, + "grad_norm": 0.1529596596956253, + "learning_rate": 4.376273937691681e-05, + "loss": 0.3463, + "num_input_tokens_seen": 134416, + "step": 765 + }, + { + "epoch": 3.0923694779116464, + "grad_norm": 0.08475496619939804, + "learning_rate": 4.364647443588389e-05, + "loss": 0.3485, + "num_input_tokens_seen": 135344, + "step": 770 + }, + { + "epoch": 3.112449799196787, + "grad_norm": 0.12930436432361603, + "learning_rate": 4.352929337774395e-05, + "loss": 0.3382, + "num_input_tokens_seen": 136240, + "step": 775 + }, + { + "epoch": 3.1325301204819276, + "grad_norm": 1.1815729141235352, + "learning_rate": 4.341120195970178e-05, + "loss": 0.3559, + "num_input_tokens_seen": 137120, + "step": 780 + }, + { + "epoch": 3.1526104417670684, + "grad_norm": 0.3863106071949005, + "learning_rate": 4.3292205983688905e-05, + "loss": 0.36, + "num_input_tokens_seen": 138112, + "step": 785 + }, + { + "epoch": 3.1726907630522088, + "grad_norm": 0.4967614412307739, + "learning_rate": 4.3172311296078595e-05, + "loss": 0.3472, + "num_input_tokens_seen": 138960, + "step": 790 + }, + { + "epoch": 3.1927710843373496, + "grad_norm": 0.13387857377529144, + "learning_rate": 4.305152378739855e-05, + "loss": 0.3646, + "num_input_tokens_seen": 140016, + "step": 795 + }, + { + "epoch": 3.21285140562249, + "grad_norm": 0.11857640743255615, + "learning_rate": 4.292984939204155e-05, + "loss": 0.3357, + "num_input_tokens_seen": 140768, + "step": 800 + }, + { + "epoch": 3.2329317269076308, + "grad_norm": 0.12806545197963715, + "learning_rate": 4.2807294087973834e-05, + "loss": 0.3444, + "num_input_tokens_seen": 141664, + "step": 805 + }, + { + "epoch": 3.253012048192771, + "grad_norm": 0.11283908039331436, + "learning_rate": 4.2683863896441475e-05, + "loss": 0.3541, + "num_input_tokens_seen": 142448, + "step": 810 + }, + { + "epoch": 3.2730923694779115, + "grad_norm": 0.06589579582214355, + "learning_rate": 4.255956488167449e-05, + "loss": 0.3619, + "num_input_tokens_seen": 143408, + "step": 815 + }, + { + "epoch": 3.2931726907630523, + "grad_norm": 0.6529943346977234, + "learning_rate": 4.2434403150588895e-05, + "loss": 0.3449, + "num_input_tokens_seen": 144256, + "step": 820 + }, + { + "epoch": 3.3132530120481927, + "grad_norm": 0.618222177028656, + "learning_rate": 4.230838485248674e-05, + "loss": 0.3504, + "num_input_tokens_seen": 145120, + "step": 825 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.566098690032959, + "learning_rate": 4.21815161787539e-05, + "loss": 0.3445, + "num_input_tokens_seen": 146080, + "step": 830 + }, + { + "epoch": 3.353413654618474, + "grad_norm": 0.13646817207336426, + "learning_rate": 4.205380336255594e-05, + "loss": 0.3418, + "num_input_tokens_seen": 146912, + "step": 835 + }, + { + "epoch": 3.3734939759036147, + "grad_norm": 0.21328403055667877, + "learning_rate": 4.192525267853188e-05, + "loss": 0.2934, + "num_input_tokens_seen": 147776, + "step": 840 + }, + { + "epoch": 3.393574297188755, + "grad_norm": 0.3965797424316406, + "learning_rate": 4.179587044248585e-05, + "loss": 0.2829, + "num_input_tokens_seen": 148768, + "step": 845 + }, + { + "epoch": 3.4136546184738954, + "grad_norm": 0.24930249154567719, + "learning_rate": 4.166566301107687e-05, + "loss": 0.5387, + "num_input_tokens_seen": 149728, + "step": 850 + }, + { + "epoch": 3.433734939759036, + "grad_norm": 0.7255300879478455, + "learning_rate": 4.153463678150651e-05, + "loss": 0.3639, + "num_input_tokens_seen": 150784, + "step": 855 + }, + { + "epoch": 3.4538152610441766, + "grad_norm": 0.11454490572214127, + "learning_rate": 4.140279819120457e-05, + "loss": 0.3721, + "num_input_tokens_seen": 151728, + "step": 860 + }, + { + "epoch": 3.4738955823293174, + "grad_norm": 0.7094687819480896, + "learning_rate": 4.127015371751284e-05, + "loss": 0.3656, + "num_input_tokens_seen": 152640, + "step": 865 + }, + { + "epoch": 3.4939759036144578, + "grad_norm": 0.5258346796035767, + "learning_rate": 4.1136709877366844e-05, + "loss": 0.3193, + "num_input_tokens_seen": 153424, + "step": 870 + }, + { + "epoch": 3.5140562248995986, + "grad_norm": 0.5166998505592346, + "learning_rate": 4.100247322697562e-05, + "loss": 0.3677, + "num_input_tokens_seen": 154416, + "step": 875 + }, + { + "epoch": 3.5140562248995986, + "eval_loss": 0.3535325825214386, + "eval_runtime": 1.2211, + "eval_samples_per_second": 45.861, + "eval_steps_per_second": 22.931, + "num_input_tokens_seen": 154416, + "step": 875 + }, + { + "epoch": 3.534136546184739, + "grad_norm": 0.49516451358795166, + "learning_rate": 4.08674503614997e-05, + "loss": 0.3907, + "num_input_tokens_seen": 155184, + "step": 880 + }, + { + "epoch": 3.5542168674698793, + "grad_norm": 0.0980529636144638, + "learning_rate": 4.0731647914727004e-05, + "loss": 0.3941, + "num_input_tokens_seen": 156000, + "step": 885 + }, + { + "epoch": 3.57429718875502, + "grad_norm": 0.5644952058792114, + "learning_rate": 4.059507255874694e-05, + "loss": 0.345, + "num_input_tokens_seen": 156976, + "step": 890 + }, + { + "epoch": 3.5943775100401605, + "grad_norm": 0.5101115703582764, + "learning_rate": 4.0457731003622606e-05, + "loss": 0.3331, + "num_input_tokens_seen": 157904, + "step": 895 + }, + { + "epoch": 3.6144578313253013, + "grad_norm": 0.4910569190979004, + "learning_rate": 4.0319629997061116e-05, + "loss": 0.3339, + "num_input_tokens_seen": 158864, + "step": 900 + }, + { + "epoch": 3.6345381526104417, + "grad_norm": 0.48415863513946533, + "learning_rate": 4.018077632408207e-05, + "loss": 0.2827, + "num_input_tokens_seen": 159744, + "step": 905 + }, + { + "epoch": 3.6546184738955825, + "grad_norm": 0.4711949825286865, + "learning_rate": 4.004117680668422e-05, + "loss": 0.3838, + "num_input_tokens_seen": 160608, + "step": 910 + }, + { + "epoch": 3.674698795180723, + "grad_norm": 0.935171902179718, + "learning_rate": 3.990083830351027e-05, + "loss": 0.3816, + "num_input_tokens_seen": 161488, + "step": 915 + }, + { + "epoch": 3.694779116465863, + "grad_norm": 0.48552215099334717, + "learning_rate": 3.975976770950994e-05, + "loss": 0.4066, + "num_input_tokens_seen": 162224, + "step": 920 + }, + { + "epoch": 3.714859437751004, + "grad_norm": 0.5080327391624451, + "learning_rate": 3.961797195560118e-05, + "loss": 0.3183, + "num_input_tokens_seen": 163056, + "step": 925 + }, + { + "epoch": 3.734939759036145, + "grad_norm": 0.606795072555542, + "learning_rate": 3.947545800832967e-05, + "loss": 0.3641, + "num_input_tokens_seen": 163856, + "step": 930 + }, + { + "epoch": 3.755020080321285, + "grad_norm": 0.5324833989143372, + "learning_rate": 3.9332232869526534e-05, + "loss": 0.3394, + "num_input_tokens_seen": 164768, + "step": 935 + }, + { + "epoch": 3.7751004016064256, + "grad_norm": 0.10697121173143387, + "learning_rate": 3.918830357596434e-05, + "loss": 0.3368, + "num_input_tokens_seen": 165600, + "step": 940 + }, + { + "epoch": 3.7951807228915664, + "grad_norm": 0.13268576562404633, + "learning_rate": 3.9043677199011364e-05, + "loss": 0.3511, + "num_input_tokens_seen": 166400, + "step": 945 + }, + { + "epoch": 3.8152610441767068, + "grad_norm": 0.12882153689861298, + "learning_rate": 3.889836084428422e-05, + "loss": 0.328, + "num_input_tokens_seen": 167296, + "step": 950 + }, + { + "epoch": 3.835341365461847, + "grad_norm": 0.14181359112262726, + "learning_rate": 3.8752361651298675e-05, + "loss": 0.369, + "num_input_tokens_seen": 168208, + "step": 955 + }, + { + "epoch": 3.855421686746988, + "grad_norm": 0.4742559492588043, + "learning_rate": 3.860568679311893e-05, + "loss": 0.3657, + "num_input_tokens_seen": 169056, + "step": 960 + }, + { + "epoch": 3.8755020080321287, + "grad_norm": 0.1299924999475479, + "learning_rate": 3.8458343476005196e-05, + "loss": 0.3849, + "num_input_tokens_seen": 169888, + "step": 965 + }, + { + "epoch": 3.895582329317269, + "grad_norm": 0.08048601448535919, + "learning_rate": 3.8310338939059644e-05, + "loss": 0.3541, + "num_input_tokens_seen": 170704, + "step": 970 + }, + { + "epoch": 3.9156626506024095, + "grad_norm": 0.6629543304443359, + "learning_rate": 3.8161680453870715e-05, + "loss": 0.3558, + "num_input_tokens_seen": 171600, + "step": 975 + }, + { + "epoch": 3.9357429718875503, + "grad_norm": 0.15418274700641632, + "learning_rate": 3.8012375324155904e-05, + "loss": 0.3131, + "num_input_tokens_seen": 172480, + "step": 980 + }, + { + "epoch": 3.9558232931726907, + "grad_norm": 0.4927317202091217, + "learning_rate": 3.7862430885402876e-05, + "loss": 0.3661, + "num_input_tokens_seen": 173504, + "step": 985 + }, + { + "epoch": 3.9759036144578315, + "grad_norm": 0.4768475890159607, + "learning_rate": 3.7711854504509135e-05, + "loss": 0.3373, + "num_input_tokens_seen": 174288, + "step": 990 + }, + { + "epoch": 3.995983935742972, + "grad_norm": 0.7225349545478821, + "learning_rate": 3.756065357941999e-05, + "loss": 0.3623, + "num_input_tokens_seen": 175104, + "step": 995 + }, + { + "epoch": 4.016064257028113, + "grad_norm": 0.7921448945999146, + "learning_rate": 3.740883553876515e-05, + "loss": 0.3426, + "num_input_tokens_seen": 176048, + "step": 1000 + }, + { + "epoch": 4.016064257028113, + "eval_loss": 0.35071006417274475, + "eval_runtime": 1.2197, + "eval_samples_per_second": 45.913, + "eval_steps_per_second": 22.957, + "num_input_tokens_seen": 176048, + "step": 1000 + }, + { + "epoch": 4.036144578313253, + "grad_norm": 0.11272845417261124, + "learning_rate": 3.725640784149375e-05, + "loss": 0.4204, + "num_input_tokens_seen": 176880, + "step": 1005 + }, + { + "epoch": 4.056224899598393, + "grad_norm": 0.08953544497489929, + "learning_rate": 3.710337797650787e-05, + "loss": 0.339, + "num_input_tokens_seen": 177680, + "step": 1010 + }, + { + "epoch": 4.076305220883534, + "grad_norm": 0.4361952543258667, + "learning_rate": 3.694975346229458e-05, + "loss": 0.3311, + "num_input_tokens_seen": 178608, + "step": 1015 + }, + { + "epoch": 4.096385542168675, + "grad_norm": 0.09541574120521545, + "learning_rate": 3.679554184655659e-05, + "loss": 0.3611, + "num_input_tokens_seen": 179600, + "step": 1020 + }, + { + "epoch": 4.116465863453815, + "grad_norm": 0.5854984521865845, + "learning_rate": 3.6640750705841405e-05, + "loss": 0.3403, + "num_input_tokens_seen": 180464, + "step": 1025 + }, + { + "epoch": 4.136546184738956, + "grad_norm": 0.10651904344558716, + "learning_rate": 3.6485387645169064e-05, + "loss": 0.3243, + "num_input_tokens_seen": 181344, + "step": 1030 + }, + { + "epoch": 4.156626506024097, + "grad_norm": 0.5942978262901306, + "learning_rate": 3.632946029765856e-05, + "loss": 0.3965, + "num_input_tokens_seen": 182080, + "step": 1035 + }, + { + "epoch": 4.176706827309237, + "grad_norm": 0.07312840223312378, + "learning_rate": 3.617297632415273e-05, + "loss": 0.3719, + "num_input_tokens_seen": 182848, + "step": 1040 + }, + { + "epoch": 4.196787148594377, + "grad_norm": 0.5075451135635376, + "learning_rate": 3.601594341284195e-05, + "loss": 0.3512, + "num_input_tokens_seen": 183840, + "step": 1045 + }, + { + "epoch": 4.216867469879518, + "grad_norm": 0.047960445284843445, + "learning_rate": 3.5858369278886354e-05, + "loss": 0.3388, + "num_input_tokens_seen": 184720, + "step": 1050 + }, + { + "epoch": 4.236947791164659, + "grad_norm": 0.08333683758974075, + "learning_rate": 3.5700261664036827e-05, + "loss": 0.3457, + "num_input_tokens_seen": 185504, + "step": 1055 + }, + { + "epoch": 4.257028112449799, + "grad_norm": 0.0653541311621666, + "learning_rate": 3.55416283362546e-05, + "loss": 0.3588, + "num_input_tokens_seen": 186272, + "step": 1060 + }, + { + "epoch": 4.27710843373494, + "grad_norm": 0.5113236308097839, + "learning_rate": 3.5382477089329646e-05, + "loss": 0.3579, + "num_input_tokens_seen": 187296, + "step": 1065 + }, + { + "epoch": 4.2971887550200805, + "grad_norm": 0.07462375611066818, + "learning_rate": 3.522281574249774e-05, + "loss": 0.348, + "num_input_tokens_seen": 188320, + "step": 1070 + }, + { + "epoch": 4.317269076305221, + "grad_norm": 0.11710739135742188, + "learning_rate": 3.5062652140056275e-05, + "loss": 0.3282, + "num_input_tokens_seen": 189248, + "step": 1075 + }, + { + "epoch": 4.337349397590361, + "grad_norm": 0.15031148493289948, + "learning_rate": 3.490199415097892e-05, + "loss": 0.3005, + "num_input_tokens_seen": 190432, + "step": 1080 + }, + { + "epoch": 4.357429718875502, + "grad_norm": 0.6503745913505554, + "learning_rate": 3.474084966852897e-05, + "loss": 0.4539, + "num_input_tokens_seen": 191296, + "step": 1085 + }, + { + "epoch": 4.377510040160643, + "grad_norm": 0.14889173209667206, + "learning_rate": 3.457922660987155e-05, + "loss": 0.3682, + "num_input_tokens_seen": 192368, + "step": 1090 + }, + { + "epoch": 4.397590361445783, + "grad_norm": 0.4457005560398102, + "learning_rate": 3.441713291568462e-05, + "loss": 0.3338, + "num_input_tokens_seen": 193232, + "step": 1095 + }, + { + "epoch": 4.417670682730924, + "grad_norm": 0.5409120321273804, + "learning_rate": 3.42545765497689e-05, + "loss": 0.3587, + "num_input_tokens_seen": 194128, + "step": 1100 + }, + { + "epoch": 4.437751004016064, + "grad_norm": 0.085002101957798, + "learning_rate": 3.409156549865654e-05, + "loss": 0.3609, + "num_input_tokens_seen": 194944, + "step": 1105 + }, + { + "epoch": 4.457831325301205, + "grad_norm": 0.49231743812561035, + "learning_rate": 3.392810777121876e-05, + "loss": 0.3477, + "num_input_tokens_seen": 195840, + "step": 1110 + }, + { + "epoch": 4.477911646586345, + "grad_norm": 0.5549922585487366, + "learning_rate": 3.376421139827237e-05, + "loss": 0.3871, + "num_input_tokens_seen": 196640, + "step": 1115 + }, + { + "epoch": 4.497991967871486, + "grad_norm": 0.06657743453979492, + "learning_rate": 3.3599884432185225e-05, + "loss": 0.3481, + "num_input_tokens_seen": 197440, + "step": 1120 + }, + { + "epoch": 4.518072289156627, + "grad_norm": 0.13579045236110687, + "learning_rate": 3.343513494648055e-05, + "loss": 0.3393, + "num_input_tokens_seen": 198432, + "step": 1125 + }, + { + "epoch": 4.518072289156627, + "eval_loss": 0.3545505702495575, + "eval_runtime": 1.4272, + "eval_samples_per_second": 39.238, + "eval_steps_per_second": 19.619, + "num_input_tokens_seen": 198432, + "step": 1125 + }, + { + "epoch": 4.538152610441767, + "grad_norm": 0.4401510953903198, + "learning_rate": 3.326997103544035e-05, + "loss": 0.3349, + "num_input_tokens_seen": 199232, + "step": 1130 + }, + { + "epoch": 4.5582329317269075, + "grad_norm": 0.14849136769771576, + "learning_rate": 3.310440081370767e-05, + "loss": 0.3373, + "num_input_tokens_seen": 200144, + "step": 1135 + }, + { + "epoch": 4.578313253012048, + "grad_norm": 0.621387243270874, + "learning_rate": 3.2938432415887984e-05, + "loss": 0.3213, + "num_input_tokens_seen": 200896, + "step": 1140 + }, + { + "epoch": 4.598393574297189, + "grad_norm": 0.7517121434211731, + "learning_rate": 3.2772073996149435e-05, + "loss": 0.3475, + "num_input_tokens_seen": 201760, + "step": 1145 + }, + { + "epoch": 4.618473895582329, + "grad_norm": 0.41686856746673584, + "learning_rate": 3.260533372782234e-05, + "loss": 0.4032, + "num_input_tokens_seen": 202688, + "step": 1150 + }, + { + "epoch": 4.63855421686747, + "grad_norm": 0.6020703315734863, + "learning_rate": 3.24382198029975e-05, + "loss": 0.3564, + "num_input_tokens_seen": 203392, + "step": 1155 + }, + { + "epoch": 4.658634538152611, + "grad_norm": 0.40914788842201233, + "learning_rate": 3.227074043212383e-05, + "loss": 0.322, + "num_input_tokens_seen": 204080, + "step": 1160 + }, + { + "epoch": 4.678714859437751, + "grad_norm": 0.09926328808069229, + "learning_rate": 3.2102903843604885e-05, + "loss": 0.373, + "num_input_tokens_seen": 204816, + "step": 1165 + }, + { + "epoch": 4.698795180722891, + "grad_norm": 0.08548900485038757, + "learning_rate": 3.1934718283394646e-05, + "loss": 0.3587, + "num_input_tokens_seen": 205616, + "step": 1170 + }, + { + "epoch": 4.718875502008032, + "grad_norm": 0.11746017634868622, + "learning_rate": 3.1766192014592344e-05, + "loss": 0.3571, + "num_input_tokens_seen": 206512, + "step": 1175 + }, + { + "epoch": 4.738955823293173, + "grad_norm": 0.4761631190776825, + "learning_rate": 3.1597333317036545e-05, + "loss": 0.3507, + "num_input_tokens_seen": 207424, + "step": 1180 + }, + { + "epoch": 4.759036144578313, + "grad_norm": 0.5010347366333008, + "learning_rate": 3.142815048689828e-05, + "loss": 0.3575, + "num_input_tokens_seen": 208464, + "step": 1185 + }, + { + "epoch": 4.779116465863454, + "grad_norm": 0.07341606169939041, + "learning_rate": 3.125865183627354e-05, + "loss": 0.3579, + "num_input_tokens_seen": 209280, + "step": 1190 + }, + { + "epoch": 4.7991967871485945, + "grad_norm": 0.43029800057411194, + "learning_rate": 3.10888456927748e-05, + "loss": 0.3327, + "num_input_tokens_seen": 210080, + "step": 1195 + }, + { + "epoch": 4.8192771084337345, + "grad_norm": 0.49401140213012695, + "learning_rate": 3.091874039912195e-05, + "loss": 0.3619, + "num_input_tokens_seen": 210960, + "step": 1200 + }, + { + "epoch": 4.839357429718875, + "grad_norm": 0.07773241400718689, + "learning_rate": 3.074834431273236e-05, + "loss": 0.3488, + "num_input_tokens_seen": 211776, + "step": 1205 + }, + { + "epoch": 4.859437751004016, + "grad_norm": 0.4646616280078888, + "learning_rate": 3.057766580531031e-05, + "loss": 0.3542, + "num_input_tokens_seen": 212576, + "step": 1210 + }, + { + "epoch": 4.879518072289157, + "grad_norm": 0.4323027431964874, + "learning_rate": 3.0406713262435656e-05, + "loss": 0.3362, + "num_input_tokens_seen": 213360, + "step": 1215 + }, + { + "epoch": 4.899598393574297, + "grad_norm": 0.502923846244812, + "learning_rate": 3.0235495083151844e-05, + "loss": 0.3814, + "num_input_tokens_seen": 214304, + "step": 1220 + }, + { + "epoch": 4.919678714859438, + "grad_norm": 0.43908852338790894, + "learning_rate": 3.0064019679553274e-05, + "loss": 0.3492, + "num_input_tokens_seen": 215072, + "step": 1225 + }, + { + "epoch": 4.9397590361445785, + "grad_norm": 0.07500998675823212, + "learning_rate": 2.9892295476371988e-05, + "loss": 0.3542, + "num_input_tokens_seen": 215904, + "step": 1230 + }, + { + "epoch": 4.959839357429718, + "grad_norm": 0.10780856758356094, + "learning_rate": 2.9720330910563772e-05, + "loss": 0.3543, + "num_input_tokens_seen": 216864, + "step": 1235 + }, + { + "epoch": 4.979919678714859, + "grad_norm": 0.0808030292391777, + "learning_rate": 2.9548134430893604e-05, + "loss": 0.3387, + "num_input_tokens_seen": 217856, + "step": 1240 + }, + { + "epoch": 5.0, + "grad_norm": 0.0589243620634079, + "learning_rate": 2.9375714497520623e-05, + "loss": 0.339, + "num_input_tokens_seen": 218864, + "step": 1245 + }, + { + "epoch": 5.020080321285141, + "grad_norm": 0.05416898429393768, + "learning_rate": 2.920307958158241e-05, + "loss": 0.3601, + "num_input_tokens_seen": 219680, + "step": 1250 + }, + { + "epoch": 5.020080321285141, + "eval_loss": 0.3591695725917816, + "eval_runtime": 1.2186, + "eval_samples_per_second": 45.953, + "eval_steps_per_second": 22.976, + "num_input_tokens_seen": 219680, + "step": 1250 + }, + { + "epoch": 5.040160642570281, + "grad_norm": 0.40490707755088806, + "learning_rate": 2.903023816477885e-05, + "loss": 0.3239, + "num_input_tokens_seen": 220560, + "step": 1255 + }, + { + "epoch": 5.0602409638554215, + "grad_norm": 0.12411481887102127, + "learning_rate": 2.885719873895536e-05, + "loss": 0.3419, + "num_input_tokens_seen": 221440, + "step": 1260 + }, + { + "epoch": 5.080321285140562, + "grad_norm": 0.5238391160964966, + "learning_rate": 2.868396980568572e-05, + "loss": 0.348, + "num_input_tokens_seen": 222304, + "step": 1265 + }, + { + "epoch": 5.100401606425703, + "grad_norm": 0.358173131942749, + "learning_rate": 2.8510559875854377e-05, + "loss": 0.2762, + "num_input_tokens_seen": 223248, + "step": 1270 + }, + { + "epoch": 5.120481927710843, + "grad_norm": 0.3846758306026459, + "learning_rate": 2.833697746923829e-05, + "loss": 0.2662, + "num_input_tokens_seen": 224000, + "step": 1275 + }, + { + "epoch": 5.140562248995984, + "grad_norm": 0.233295738697052, + "learning_rate": 2.816323111408835e-05, + "loss": 0.3421, + "num_input_tokens_seen": 224880, + "step": 1280 + }, + { + "epoch": 5.160642570281125, + "grad_norm": 0.7878521084785461, + "learning_rate": 2.7989329346710375e-05, + "loss": 0.4232, + "num_input_tokens_seen": 225776, + "step": 1285 + }, + { + "epoch": 5.180722891566265, + "grad_norm": 0.3618873655796051, + "learning_rate": 2.7815280711045717e-05, + "loss": 0.3838, + "num_input_tokens_seen": 226576, + "step": 1290 + }, + { + "epoch": 5.2008032128514055, + "grad_norm": 0.14807634055614471, + "learning_rate": 2.7641093758251497e-05, + "loss": 0.3104, + "num_input_tokens_seen": 227360, + "step": 1295 + }, + { + "epoch": 5.220883534136546, + "grad_norm": 0.3595626950263977, + "learning_rate": 2.7466777046280457e-05, + "loss": 0.3105, + "num_input_tokens_seen": 228112, + "step": 1300 + }, + { + "epoch": 5.240963855421687, + "grad_norm": 0.6282910108566284, + "learning_rate": 2.7292339139460556e-05, + "loss": 0.3474, + "num_input_tokens_seen": 228992, + "step": 1305 + }, + { + "epoch": 5.261044176706827, + "grad_norm": 0.357793390750885, + "learning_rate": 2.71177886080741e-05, + "loss": 0.3076, + "num_input_tokens_seen": 229872, + "step": 1310 + }, + { + "epoch": 5.281124497991968, + "grad_norm": 0.1209518164396286, + "learning_rate": 2.69431340279368e-05, + "loss": 0.4231, + "num_input_tokens_seen": 230720, + "step": 1315 + }, + { + "epoch": 5.301204819277109, + "grad_norm": 0.09653452038764954, + "learning_rate": 2.676838397997633e-05, + "loss": 0.3725, + "num_input_tokens_seen": 231568, + "step": 1320 + }, + { + "epoch": 5.321285140562249, + "grad_norm": 0.4242056608200073, + "learning_rate": 2.659354704981078e-05, + "loss": 0.3237, + "num_input_tokens_seen": 232368, + "step": 1325 + }, + { + "epoch": 5.341365461847389, + "grad_norm": 0.10253465920686722, + "learning_rate": 2.6418631827326857e-05, + "loss": 0.3534, + "num_input_tokens_seen": 233184, + "step": 1330 + }, + { + "epoch": 5.36144578313253, + "grad_norm": 0.08719359338283539, + "learning_rate": 2.6243646906257806e-05, + "loss": 0.338, + "num_input_tokens_seen": 233984, + "step": 1335 + }, + { + "epoch": 5.381526104417671, + "grad_norm": 0.5312795639038086, + "learning_rate": 2.606860088376126e-05, + "loss": 0.3687, + "num_input_tokens_seen": 234848, + "step": 1340 + }, + { + "epoch": 5.401606425702811, + "grad_norm": 0.47353821992874146, + "learning_rate": 2.5893502359996786e-05, + "loss": 0.3449, + "num_input_tokens_seen": 235760, + "step": 1345 + }, + { + "epoch": 5.421686746987952, + "grad_norm": 0.49804431200027466, + "learning_rate": 2.5718359937703408e-05, + "loss": 0.3504, + "num_input_tokens_seen": 236640, + "step": 1350 + }, + { + "epoch": 5.4417670682730925, + "grad_norm": 0.5166244506835938, + "learning_rate": 2.554318222177689e-05, + "loss": 0.3538, + "num_input_tokens_seen": 237616, + "step": 1355 + }, + { + "epoch": 5.461847389558233, + "grad_norm": 0.1073295921087265, + "learning_rate": 2.5367977818847034e-05, + "loss": 0.3354, + "num_input_tokens_seen": 238528, + "step": 1360 + }, + { + "epoch": 5.481927710843373, + "grad_norm": 0.49680086970329285, + "learning_rate": 2.519275533685477e-05, + "loss": 0.3354, + "num_input_tokens_seen": 239424, + "step": 1365 + }, + { + "epoch": 5.502008032128514, + "grad_norm": 0.11628666520118713, + "learning_rate": 2.5017523384629298e-05, + "loss": 0.354, + "num_input_tokens_seen": 240272, + "step": 1370 + }, + { + "epoch": 5.522088353413655, + "grad_norm": 0.11434385180473328, + "learning_rate": 2.484229057146507e-05, + "loss": 0.3422, + "num_input_tokens_seen": 241136, + "step": 1375 + }, + { + "epoch": 5.522088353413655, + "eval_loss": 0.35063984990119934, + "eval_runtime": 1.4257, + "eval_samples_per_second": 39.28, + "eval_steps_per_second": 19.64, + "num_input_tokens_seen": 241136, + "step": 1375 + }, + { + "epoch": 5.542168674698795, + "grad_norm": 0.5793833136558533, + "learning_rate": 2.466706550669886e-05, + "loss": 0.3574, + "num_input_tokens_seen": 241936, + "step": 1380 + }, + { + "epoch": 5.562248995983936, + "grad_norm": 0.5316616296768188, + "learning_rate": 2.449185679928672e-05, + "loss": 0.3748, + "num_input_tokens_seen": 242672, + "step": 1385 + }, + { + "epoch": 5.582329317269076, + "grad_norm": 0.10326164960861206, + "learning_rate": 2.431667305738112e-05, + "loss": 0.3507, + "num_input_tokens_seen": 243808, + "step": 1390 + }, + { + "epoch": 5.602409638554217, + "grad_norm": 0.46083107590675354, + "learning_rate": 2.414152288790787e-05, + "loss": 0.3506, + "num_input_tokens_seen": 244688, + "step": 1395 + }, + { + "epoch": 5.622489959839357, + "grad_norm": 0.42732155323028564, + "learning_rate": 2.3966414896143385e-05, + "loss": 0.3386, + "num_input_tokens_seen": 245696, + "step": 1400 + }, + { + "epoch": 5.642570281124498, + "grad_norm": 0.4096117317676544, + "learning_rate": 2.3791357685291863e-05, + "loss": 0.3298, + "num_input_tokens_seen": 246544, + "step": 1405 + }, + { + "epoch": 5.662650602409639, + "grad_norm": 0.39851030707359314, + "learning_rate": 2.361635985606256e-05, + "loss": 0.3413, + "num_input_tokens_seen": 247744, + "step": 1410 + }, + { + "epoch": 5.682730923694779, + "grad_norm": 0.385628879070282, + "learning_rate": 2.344143000624729e-05, + "loss": 0.3623, + "num_input_tokens_seen": 248480, + "step": 1415 + }, + { + "epoch": 5.7028112449799195, + "grad_norm": 0.38799935579299927, + "learning_rate": 2.3266576730297956e-05, + "loss": 0.3284, + "num_input_tokens_seen": 249312, + "step": 1420 + }, + { + "epoch": 5.72289156626506, + "grad_norm": 0.11895764619112015, + "learning_rate": 2.3091808618904352e-05, + "loss": 0.3679, + "num_input_tokens_seen": 250304, + "step": 1425 + }, + { + "epoch": 5.742971887550201, + "grad_norm": 0.3756169080734253, + "learning_rate": 2.2917134258572038e-05, + "loss": 0.3506, + "num_input_tokens_seen": 251216, + "step": 1430 + }, + { + "epoch": 5.763052208835341, + "grad_norm": 0.5232722163200378, + "learning_rate": 2.274256223120051e-05, + "loss": 0.3512, + "num_input_tokens_seen": 251952, + "step": 1435 + }, + { + "epoch": 5.783132530120482, + "grad_norm": 0.06907851248979568, + "learning_rate": 2.2568101113661577e-05, + "loss": 0.3292, + "num_input_tokens_seen": 253072, + "step": 1440 + }, + { + "epoch": 5.803212851405623, + "grad_norm": 0.08446797728538513, + "learning_rate": 2.239375947737793e-05, + "loss": 0.3499, + "num_input_tokens_seen": 253840, + "step": 1445 + }, + { + "epoch": 5.823293172690763, + "grad_norm": 0.06622636318206787, + "learning_rate": 2.221954588790206e-05, + "loss": 0.3647, + "num_input_tokens_seen": 254640, + "step": 1450 + }, + { + "epoch": 5.843373493975903, + "grad_norm": 0.03167250007390976, + "learning_rate": 2.2045468904495415e-05, + "loss": 0.3518, + "num_input_tokens_seen": 255456, + "step": 1455 + }, + { + "epoch": 5.863453815261044, + "grad_norm": 0.07110590487718582, + "learning_rate": 2.1871537079707833e-05, + "loss": 0.354, + "num_input_tokens_seen": 256304, + "step": 1460 + }, + { + "epoch": 5.883534136546185, + "grad_norm": 0.5281655788421631, + "learning_rate": 2.1697758958957448e-05, + "loss": 0.3385, + "num_input_tokens_seen": 257104, + "step": 1465 + }, + { + "epoch": 5.903614457831325, + "grad_norm": 0.5472500920295715, + "learning_rate": 2.1524143080110716e-05, + "loss": 0.3532, + "num_input_tokens_seen": 258080, + "step": 1470 + }, + { + "epoch": 5.923694779116466, + "grad_norm": 0.5764107704162598, + "learning_rate": 2.135069797306308e-05, + "loss": 0.3701, + "num_input_tokens_seen": 259056, + "step": 1475 + }, + { + "epoch": 5.943775100401607, + "grad_norm": 0.5297293663024902, + "learning_rate": 2.1177432159319754e-05, + "loss": 0.3721, + "num_input_tokens_seen": 260000, + "step": 1480 + }, + { + "epoch": 5.9638554216867465, + "grad_norm": 0.03520062938332558, + "learning_rate": 2.100435415157718e-05, + "loss": 0.3517, + "num_input_tokens_seen": 260768, + "step": 1485 + }, + { + "epoch": 5.983935742971887, + "grad_norm": 0.451326847076416, + "learning_rate": 2.083147245330468e-05, + "loss": 0.3572, + "num_input_tokens_seen": 261760, + "step": 1490 + }, + { + "epoch": 6.004016064257028, + "grad_norm": 0.4301218092441559, + "learning_rate": 2.0658795558326743e-05, + "loss": 0.3476, + "num_input_tokens_seen": 262752, + "step": 1495 + }, + { + "epoch": 6.024096385542169, + "grad_norm": 0.09133653342723846, + "learning_rate": 2.048633195040572e-05, + "loss": 0.3609, + "num_input_tokens_seen": 263616, + "step": 1500 + }, + { + "epoch": 6.024096385542169, + "eval_loss": 0.3502369225025177, + "eval_runtime": 1.2111, + "eval_samples_per_second": 46.238, + "eval_steps_per_second": 23.119, + "num_input_tokens_seen": 263616, + "step": 1500 + }, + { + "epoch": 6.044176706827309, + "grad_norm": 0.4608069062232971, + "learning_rate": 2.0314090102824963e-05, + "loss": 0.3669, + "num_input_tokens_seen": 264432, + "step": 1505 + }, + { + "epoch": 6.06425702811245, + "grad_norm": 0.08459752053022385, + "learning_rate": 2.014207847797256e-05, + "loss": 0.3542, + "num_input_tokens_seen": 265184, + "step": 1510 + }, + { + "epoch": 6.0843373493975905, + "grad_norm": 0.04717332869768143, + "learning_rate": 1.997030552692556e-05, + "loss": 0.3509, + "num_input_tokens_seen": 266064, + "step": 1515 + }, + { + "epoch": 6.104417670682731, + "grad_norm": 0.07227271795272827, + "learning_rate": 1.9798779689034757e-05, + "loss": 0.3483, + "num_input_tokens_seen": 266928, + "step": 1520 + }, + { + "epoch": 6.124497991967871, + "grad_norm": 0.4734782874584198, + "learning_rate": 1.9627509391510086e-05, + "loss": 0.3542, + "num_input_tokens_seen": 267824, + "step": 1525 + }, + { + "epoch": 6.144578313253012, + "grad_norm": 0.430178701877594, + "learning_rate": 1.9456503049006542e-05, + "loss": 0.3479, + "num_input_tokens_seen": 268608, + "step": 1530 + }, + { + "epoch": 6.164658634538153, + "grad_norm": 0.4436280131340027, + "learning_rate": 1.9285769063210812e-05, + "loss": 0.3477, + "num_input_tokens_seen": 269696, + "step": 1535 + }, + { + "epoch": 6.184738955823293, + "grad_norm": 0.44335198402404785, + "learning_rate": 1.9115315822428437e-05, + "loss": 0.351, + "num_input_tokens_seen": 270704, + "step": 1540 + }, + { + "epoch": 6.204819277108434, + "grad_norm": 0.0994335189461708, + "learning_rate": 1.8945151701171755e-05, + "loss": 0.3447, + "num_input_tokens_seen": 271568, + "step": 1545 + }, + { + "epoch": 6.224899598393574, + "grad_norm": 0.42669767141342163, + "learning_rate": 1.877528505974838e-05, + "loss": 0.3386, + "num_input_tokens_seen": 272304, + "step": 1550 + }, + { + "epoch": 6.244979919678715, + "grad_norm": 0.091577909886837, + "learning_rate": 1.8605724243850502e-05, + "loss": 0.3302, + "num_input_tokens_seen": 273152, + "step": 1555 + }, + { + "epoch": 6.265060240963855, + "grad_norm": 0.5046164393424988, + "learning_rate": 1.8436477584144863e-05, + "loss": 0.3962, + "num_input_tokens_seen": 274112, + "step": 1560 + }, + { + "epoch": 6.285140562248996, + "grad_norm": 0.1121777817606926, + "learning_rate": 1.826755339586341e-05, + "loss": 0.3337, + "num_input_tokens_seen": 274944, + "step": 1565 + }, + { + "epoch": 6.305220883534137, + "grad_norm": 0.40517058968544006, + "learning_rate": 1.809895997839482e-05, + "loss": 0.3484, + "num_input_tokens_seen": 275712, + "step": 1570 + }, + { + "epoch": 6.325301204819277, + "grad_norm": 0.09261249750852585, + "learning_rate": 1.793070561487672e-05, + "loss": 0.3391, + "num_input_tokens_seen": 276560, + "step": 1575 + }, + { + "epoch": 6.3453815261044175, + "grad_norm": 0.49878165125846863, + "learning_rate": 1.7762798571788707e-05, + "loss": 0.3948, + "num_input_tokens_seen": 277456, + "step": 1580 + }, + { + "epoch": 6.365461847389558, + "grad_norm": 0.415039598941803, + "learning_rate": 1.759524709854626e-05, + "loss": 0.3246, + "num_input_tokens_seen": 278352, + "step": 1585 + }, + { + "epoch": 6.385542168674699, + "grad_norm": 0.11119506508111954, + "learning_rate": 1.742805942709538e-05, + "loss": 0.3468, + "num_input_tokens_seen": 279264, + "step": 1590 + }, + { + "epoch": 6.405622489959839, + "grad_norm": 0.10621926933526993, + "learning_rate": 1.7261243771508208e-05, + "loss": 0.3428, + "num_input_tokens_seen": 280144, + "step": 1595 + }, + { + "epoch": 6.42570281124498, + "grad_norm": 0.10251349955797195, + "learning_rate": 1.70948083275794e-05, + "loss": 0.3439, + "num_input_tokens_seen": 281008, + "step": 1600 + }, + { + "epoch": 6.445783132530121, + "grad_norm": 0.4213389456272125, + "learning_rate": 1.6928761272423522e-05, + "loss": 0.3717, + "num_input_tokens_seen": 281792, + "step": 1605 + }, + { + "epoch": 6.4658634538152615, + "grad_norm": 0.08103923499584198, + "learning_rate": 1.6763110764073235e-05, + "loss": 0.3517, + "num_input_tokens_seen": 282560, + "step": 1610 + }, + { + "epoch": 6.485943775100401, + "grad_norm": 0.46979421377182007, + "learning_rate": 1.6597864941078552e-05, + "loss": 0.3423, + "num_input_tokens_seen": 283440, + "step": 1615 + }, + { + "epoch": 6.506024096385542, + "grad_norm": 0.12765252590179443, + "learning_rate": 1.643303192210693e-05, + "loss": 0.358, + "num_input_tokens_seen": 284592, + "step": 1620 + }, + { + "epoch": 6.526104417670683, + "grad_norm": 0.06467333436012268, + "learning_rate": 1.626861980554441e-05, + "loss": 0.3457, + "num_input_tokens_seen": 285424, + "step": 1625 + }, + { + "epoch": 6.526104417670683, + "eval_loss": 0.3553968071937561, + "eval_runtime": 1.2158, + "eval_samples_per_second": 46.06, + "eval_steps_per_second": 23.03, + "num_input_tokens_seen": 285424, + "step": 1625 + }, + { + "epoch": 6.546184738955823, + "grad_norm": 0.41862520575523376, + "learning_rate": 1.6104636669097776e-05, + "loss": 0.3518, + "num_input_tokens_seen": 286272, + "step": 1630 + }, + { + "epoch": 6.566265060240964, + "grad_norm": 0.0668744370341301, + "learning_rate": 1.5941090569397616e-05, + "loss": 0.3512, + "num_input_tokens_seen": 287200, + "step": 1635 + }, + { + "epoch": 6.586345381526105, + "grad_norm": 0.08563435077667236, + "learning_rate": 1.5777989541602533e-05, + "loss": 0.348, + "num_input_tokens_seen": 288224, + "step": 1640 + }, + { + "epoch": 6.606425702811245, + "grad_norm": 0.44376978278160095, + "learning_rate": 1.561534159900441e-05, + "loss": 0.3353, + "num_input_tokens_seen": 289216, + "step": 1645 + }, + { + "epoch": 6.626506024096385, + "grad_norm": 0.12750263512134552, + "learning_rate": 1.5453154732634616e-05, + "loss": 0.3476, + "num_input_tokens_seen": 290080, + "step": 1650 + }, + { + "epoch": 6.646586345381526, + "grad_norm": 0.4618399739265442, + "learning_rate": 1.52914369108715e-05, + "loss": 0.351, + "num_input_tokens_seen": 290880, + "step": 1655 + }, + { + "epoch": 6.666666666666667, + "grad_norm": 0.07694806158542633, + "learning_rate": 1.513019607904882e-05, + "loss": 0.3607, + "num_input_tokens_seen": 291728, + "step": 1660 + }, + { + "epoch": 6.686746987951807, + "grad_norm": 0.09036281704902649, + "learning_rate": 1.4969440159065439e-05, + "loss": 0.3573, + "num_input_tokens_seen": 292624, + "step": 1665 + }, + { + "epoch": 6.706827309236948, + "grad_norm": 0.08298249542713165, + "learning_rate": 1.4809177048996064e-05, + "loss": 0.3476, + "num_input_tokens_seen": 293488, + "step": 1670 + }, + { + "epoch": 6.7269076305220885, + "grad_norm": 0.4784527122974396, + "learning_rate": 1.464941462270325e-05, + "loss": 0.3477, + "num_input_tokens_seen": 294400, + "step": 1675 + }, + { + "epoch": 6.746987951807229, + "grad_norm": 0.45142361521720886, + "learning_rate": 1.449016072945053e-05, + "loss": 0.357, + "num_input_tokens_seen": 295184, + "step": 1680 + }, + { + "epoch": 6.767068273092369, + "grad_norm": 0.4861927330493927, + "learning_rate": 1.4331423193516768e-05, + "loss": 0.3575, + "num_input_tokens_seen": 296176, + "step": 1685 + }, + { + "epoch": 6.78714859437751, + "grad_norm": 0.41130194067955017, + "learning_rate": 1.4173209813811788e-05, + "loss": 0.3358, + "num_input_tokens_seen": 297072, + "step": 1690 + }, + { + "epoch": 6.807228915662651, + "grad_norm": 0.40937137603759766, + "learning_rate": 1.4015528363493125e-05, + "loss": 0.3491, + "num_input_tokens_seen": 297856, + "step": 1695 + }, + { + "epoch": 6.827309236947791, + "grad_norm": 0.40058666467666626, + "learning_rate": 1.3858386589584187e-05, + "loss": 0.3253, + "num_input_tokens_seen": 298896, + "step": 1700 + }, + { + "epoch": 6.847389558232932, + "grad_norm": 0.11114007234573364, + "learning_rate": 1.3701792212593662e-05, + "loss": 0.3302, + "num_input_tokens_seen": 299712, + "step": 1705 + }, + { + "epoch": 6.867469879518072, + "grad_norm": 0.11977384239435196, + "learning_rate": 1.354575292613611e-05, + "loss": 0.3882, + "num_input_tokens_seen": 300720, + "step": 1710 + }, + { + "epoch": 6.887550200803213, + "grad_norm": 0.5076762437820435, + "learning_rate": 1.3390276396554052e-05, + "loss": 0.3658, + "num_input_tokens_seen": 301552, + "step": 1715 + }, + { + "epoch": 6.907630522088353, + "grad_norm": 0.07989758253097534, + "learning_rate": 1.3235370262541272e-05, + "loss": 0.3388, + "num_input_tokens_seen": 302352, + "step": 1720 + }, + { + "epoch": 6.927710843373494, + "grad_norm": 0.38454264402389526, + "learning_rate": 1.3081042134767554e-05, + "loss": 0.3335, + "num_input_tokens_seen": 303232, + "step": 1725 + }, + { + "epoch": 6.947791164658635, + "grad_norm": 0.07989054918289185, + "learning_rate": 1.292729959550473e-05, + "loss": 0.3262, + "num_input_tokens_seen": 304016, + "step": 1730 + }, + { + "epoch": 6.967871485943775, + "grad_norm": 0.3905481994152069, + "learning_rate": 1.277415019825417e-05, + "loss": 0.3396, + "num_input_tokens_seen": 304944, + "step": 1735 + }, + { + "epoch": 6.9879518072289155, + "grad_norm": 0.39736074209213257, + "learning_rate": 1.2621601467375684e-05, + "loss": 0.3422, + "num_input_tokens_seen": 305984, + "step": 1740 + }, + { + "epoch": 7.008032128514056, + "grad_norm": 0.1431533843278885, + "learning_rate": 1.2469660897717816e-05, + "loss": 0.3182, + "num_input_tokens_seen": 306992, + "step": 1745 + }, + { + "epoch": 7.028112449799197, + "grad_norm": 0.39490222930908203, + "learning_rate": 1.2318335954249669e-05, + "loss": 0.315, + "num_input_tokens_seen": 307792, + "step": 1750 + }, + { + "epoch": 7.028112449799197, + "eval_loss": 0.36506387591362, + "eval_runtime": 1.2174, + "eval_samples_per_second": 45.998, + "eval_steps_per_second": 22.999, + "num_input_tokens_seen": 307792, + "step": 1750 + }, + { + "epoch": 7.048192771084337, + "grad_norm": 0.13177447021007538, + "learning_rate": 1.2167634071694081e-05, + "loss": 0.3174, + "num_input_tokens_seen": 308624, + "step": 1755 + }, + { + "epoch": 7.068273092369478, + "grad_norm": 0.38232582807540894, + "learning_rate": 1.2017562654162357e-05, + "loss": 0.2887, + "num_input_tokens_seen": 309680, + "step": 1760 + }, + { + "epoch": 7.088353413654619, + "grad_norm": 0.19812007248401642, + "learning_rate": 1.1868129074790577e-05, + "loss": 0.3394, + "num_input_tokens_seen": 310544, + "step": 1765 + }, + { + "epoch": 7.108433734939759, + "grad_norm": 0.2020581066608429, + "learning_rate": 1.1719340675377252e-05, + "loss": 0.3113, + "num_input_tokens_seen": 311568, + "step": 1770 + }, + { + "epoch": 7.128514056224899, + "grad_norm": 0.16722743213176727, + "learning_rate": 1.1571204766022665e-05, + "loss": 0.4907, + "num_input_tokens_seen": 312432, + "step": 1775 + }, + { + "epoch": 7.14859437751004, + "grad_norm": 0.14364704489707947, + "learning_rate": 1.1423728624769695e-05, + "loss": 0.3627, + "num_input_tokens_seen": 313168, + "step": 1780 + }, + { + "epoch": 7.168674698795181, + "grad_norm": 0.11236248165369034, + "learning_rate": 1.1276919497246288e-05, + "loss": 0.3648, + "num_input_tokens_seen": 313968, + "step": 1785 + }, + { + "epoch": 7.188755020080321, + "grad_norm": 0.11342489719390869, + "learning_rate": 1.1130784596309409e-05, + "loss": 0.3585, + "num_input_tokens_seen": 314736, + "step": 1790 + }, + { + "epoch": 7.208835341365462, + "grad_norm": 0.09296204149723053, + "learning_rate": 1.098533110169071e-05, + "loss": 0.3485, + "num_input_tokens_seen": 315664, + "step": 1795 + }, + { + "epoch": 7.228915662650603, + "grad_norm": 0.1172434464097023, + "learning_rate": 1.084056615964377e-05, + "loss": 0.3442, + "num_input_tokens_seen": 316704, + "step": 1800 + }, + { + "epoch": 7.2489959839357425, + "grad_norm": 0.0936344638466835, + "learning_rate": 1.069649688259299e-05, + "loss": 0.388, + "num_input_tokens_seen": 317520, + "step": 1805 + }, + { + "epoch": 7.269076305220883, + "grad_norm": 0.4171283543109894, + "learning_rate": 1.0553130348784182e-05, + "loss": 0.3306, + "num_input_tokens_seen": 318496, + "step": 1810 + }, + { + "epoch": 7.289156626506024, + "grad_norm": 0.403551310300827, + "learning_rate": 1.0410473601936765e-05, + "loss": 0.3181, + "num_input_tokens_seen": 319344, + "step": 1815 + }, + { + "epoch": 7.309236947791165, + "grad_norm": 0.08753710985183716, + "learning_rate": 1.026853365089773e-05, + "loss": 0.3494, + "num_input_tokens_seen": 320224, + "step": 1820 + }, + { + "epoch": 7.329317269076305, + "grad_norm": 0.393200546503067, + "learning_rate": 1.0127317469297277e-05, + "loss": 0.3193, + "num_input_tokens_seen": 320976, + "step": 1825 + }, + { + "epoch": 7.349397590361446, + "grad_norm": 0.1304997056722641, + "learning_rate": 9.986831995206195e-06, + "loss": 0.3271, + "num_input_tokens_seen": 321808, + "step": 1830 + }, + { + "epoch": 7.3694779116465865, + "grad_norm": 0.37172961235046387, + "learning_rate": 9.847084130795028e-06, + "loss": 0.3504, + "num_input_tokens_seen": 322624, + "step": 1835 + }, + { + "epoch": 7.389558232931727, + "grad_norm": 0.3792823255062103, + "learning_rate": 9.708080741994868e-06, + "loss": 0.3165, + "num_input_tokens_seen": 323696, + "step": 1840 + }, + { + "epoch": 7.409638554216867, + "grad_norm": 0.365556925535202, + "learning_rate": 9.569828658160158e-06, + "loss": 0.3184, + "num_input_tokens_seen": 324496, + "step": 1845 + }, + { + "epoch": 7.429718875502008, + "grad_norm": 0.10406666994094849, + "learning_rate": 9.432334671733039e-06, + "loss": 0.3824, + "num_input_tokens_seen": 325328, + "step": 1850 + }, + { + "epoch": 7.449799196787149, + "grad_norm": 0.3579881191253662, + "learning_rate": 9.295605537909708e-06, + "loss": 0.3336, + "num_input_tokens_seen": 326304, + "step": 1855 + }, + { + "epoch": 7.469879518072289, + "grad_norm": 0.171662375330925, + "learning_rate": 9.159647974308494e-06, + "loss": 0.3148, + "num_input_tokens_seen": 327120, + "step": 1860 + }, + { + "epoch": 7.48995983935743, + "grad_norm": 0.5380930304527283, + "learning_rate": 9.024468660639826e-06, + "loss": 0.3811, + "num_input_tokens_seen": 328128, + "step": 1865 + }, + { + "epoch": 7.51004016064257, + "grad_norm": 0.11611666530370712, + "learning_rate": 8.890074238378074e-06, + "loss": 0.351, + "num_input_tokens_seen": 329008, + "step": 1870 + }, + { + "epoch": 7.530120481927711, + "grad_norm": 0.10266629606485367, + "learning_rate": 8.756471310435204e-06, + "loss": 0.3149, + "num_input_tokens_seen": 329840, + "step": 1875 + }, + { + "epoch": 7.530120481927711, + "eval_loss": 0.3625907897949219, + "eval_runtime": 1.2148, + "eval_samples_per_second": 46.1, + "eval_steps_per_second": 23.05, + "num_input_tokens_seen": 329840, + "step": 1875 + }, + { + "epoch": 7.550200803212851, + "grad_norm": 0.15210554003715515, + "learning_rate": 8.623666440836404e-06, + "loss": 0.3623, + "num_input_tokens_seen": 330624, + "step": 1880 + }, + { + "epoch": 7.570281124497992, + "grad_norm": 0.09614621847867966, + "learning_rate": 8.491666154397573e-06, + "loss": 0.3149, + "num_input_tokens_seen": 331440, + "step": 1885 + }, + { + "epoch": 7.590361445783133, + "grad_norm": 0.11491771787405014, + "learning_rate": 8.360476936404754e-06, + "loss": 0.3897, + "num_input_tokens_seen": 332192, + "step": 1890 + }, + { + "epoch": 7.610441767068274, + "grad_norm": 0.09532292187213898, + "learning_rate": 8.230105232295538e-06, + "loss": 0.3736, + "num_input_tokens_seen": 333168, + "step": 1895 + }, + { + "epoch": 7.6305220883534135, + "grad_norm": 0.5067830681800842, + "learning_rate": 8.100557447342327e-06, + "loss": 0.3618, + "num_input_tokens_seen": 334080, + "step": 1900 + }, + { + "epoch": 7.650602409638554, + "grad_norm": 0.13660845160484314, + "learning_rate": 7.971839946337698e-06, + "loss": 0.3533, + "num_input_tokens_seen": 335040, + "step": 1905 + }, + { + "epoch": 7.670682730923695, + "grad_norm": 0.41254377365112305, + "learning_rate": 7.843959053281663e-06, + "loss": 0.3532, + "num_input_tokens_seen": 335824, + "step": 1910 + }, + { + "epoch": 7.690763052208835, + "grad_norm": 0.07652189582586288, + "learning_rate": 7.71692105107098e-06, + "loss": 0.3362, + "num_input_tokens_seen": 336656, + "step": 1915 + }, + { + "epoch": 7.710843373493976, + "grad_norm": 0.0759321078658104, + "learning_rate": 7.590732181190482e-06, + "loss": 0.3608, + "num_input_tokens_seen": 337488, + "step": 1920 + }, + { + "epoch": 7.730923694779117, + "grad_norm": 0.0837617963552475, + "learning_rate": 7.465398643406366e-06, + "loss": 0.342, + "num_input_tokens_seen": 338400, + "step": 1925 + }, + { + "epoch": 7.7510040160642575, + "grad_norm": 0.09123997390270233, + "learning_rate": 7.340926595461687e-06, + "loss": 0.3573, + "num_input_tokens_seen": 339248, + "step": 1930 + }, + { + "epoch": 7.771084337349397, + "grad_norm": 0.09809573739767075, + "learning_rate": 7.217322152773742e-06, + "loss": 0.3539, + "num_input_tokens_seen": 340112, + "step": 1935 + }, + { + "epoch": 7.791164658634538, + "grad_norm": 0.4673003554344177, + "learning_rate": 7.094591388133659e-06, + "loss": 0.3471, + "num_input_tokens_seen": 340896, + "step": 1940 + }, + { + "epoch": 7.811244979919679, + "grad_norm": 0.46651169657707214, + "learning_rate": 6.972740331408015e-06, + "loss": 0.3599, + "num_input_tokens_seen": 341760, + "step": 1945 + }, + { + "epoch": 7.831325301204819, + "grad_norm": 0.09857732057571411, + "learning_rate": 6.851774969242589e-06, + "loss": 0.3382, + "num_input_tokens_seen": 342608, + "step": 1950 + }, + { + "epoch": 7.85140562248996, + "grad_norm": 0.14156574010849, + "learning_rate": 6.731701244768254e-06, + "loss": 0.338, + "num_input_tokens_seen": 343632, + "step": 1955 + }, + { + "epoch": 7.871485943775101, + "grad_norm": 0.08120275288820267, + "learning_rate": 6.612525057308949e-06, + "loss": 0.3473, + "num_input_tokens_seen": 344528, + "step": 1960 + }, + { + "epoch": 7.891566265060241, + "grad_norm": 0.4519568085670471, + "learning_rate": 6.494252262091857e-06, + "loss": 0.3505, + "num_input_tokens_seen": 345568, + "step": 1965 + }, + { + "epoch": 7.911646586345381, + "grad_norm": 0.10098995268344879, + "learning_rate": 6.3768886699597436e-06, + "loss": 0.3443, + "num_input_tokens_seen": 346496, + "step": 1970 + }, + { + "epoch": 7.931726907630522, + "grad_norm": 0.44386422634124756, + "learning_rate": 6.260440047085439e-06, + "loss": 0.3473, + "num_input_tokens_seen": 347360, + "step": 1975 + }, + { + "epoch": 7.951807228915663, + "grad_norm": 0.06753702461719513, + "learning_rate": 6.1449121146885894e-06, + "loss": 0.3445, + "num_input_tokens_seen": 348128, + "step": 1980 + }, + { + "epoch": 7.971887550200803, + "grad_norm": 0.42976370453834534, + "learning_rate": 6.030310548754506e-06, + "loss": 0.3509, + "num_input_tokens_seen": 348960, + "step": 1985 + }, + { + "epoch": 7.991967871485944, + "grad_norm": 0.4281879961490631, + "learning_rate": 5.9166409797553415e-06, + "loss": 0.3477, + "num_input_tokens_seen": 349856, + "step": 1990 + }, + { + "epoch": 8.012048192771084, + "grad_norm": 0.4621274471282959, + "learning_rate": 5.803908992373449e-06, + "loss": 0.338, + "num_input_tokens_seen": 350784, + "step": 1995 + }, + { + "epoch": 8.032128514056225, + "grad_norm": 0.08658602088689804, + "learning_rate": 5.692120125226993e-06, + "loss": 0.3441, + "num_input_tokens_seen": 351552, + "step": 2000 + }, + { + "epoch": 8.032128514056225, + "eval_loss": 0.3484961986541748, + "eval_runtime": 1.2167, + "eval_samples_per_second": 46.027, + "eval_steps_per_second": 23.014, + "num_input_tokens_seen": 351552, + "step": 2000 + }, + { + "epoch": 8.052208835341366, + "grad_norm": 0.0772676169872284, + "learning_rate": 5.581279870597867e-06, + "loss": 0.3537, + "num_input_tokens_seen": 352592, + "step": 2005 + }, + { + "epoch": 8.072289156626505, + "grad_norm": 0.08750884979963303, + "learning_rate": 5.4713936741617845e-06, + "loss": 0.3441, + "num_input_tokens_seen": 353392, + "step": 2010 + }, + { + "epoch": 8.092369477911646, + "grad_norm": 0.4653143882751465, + "learning_rate": 5.3624669347208085e-06, + "loss": 0.3473, + "num_input_tokens_seen": 354176, + "step": 2015 + }, + { + "epoch": 8.112449799196787, + "grad_norm": 0.07544849812984467, + "learning_rate": 5.254505003938043e-06, + "loss": 0.335, + "num_input_tokens_seen": 355040, + "step": 2020 + }, + { + "epoch": 8.132530120481928, + "grad_norm": 0.4536716043949127, + "learning_rate": 5.147513186074751e-06, + "loss": 0.3445, + "num_input_tokens_seen": 355984, + "step": 2025 + }, + { + "epoch": 8.152610441767068, + "grad_norm": 0.06866519898176193, + "learning_rate": 5.041496737729687e-06, + "loss": 0.3443, + "num_input_tokens_seen": 356768, + "step": 2030 + }, + { + "epoch": 8.17269076305221, + "grad_norm": 0.07859829813241959, + "learning_rate": 4.936460867580889e-06, + "loss": 0.3604, + "num_input_tokens_seen": 357648, + "step": 2035 + }, + { + "epoch": 8.19277108433735, + "grad_norm": 0.48313626646995544, + "learning_rate": 4.832410736129778e-06, + "loss": 0.3509, + "num_input_tokens_seen": 358464, + "step": 2040 + }, + { + "epoch": 8.21285140562249, + "grad_norm": 0.4338780343532562, + "learning_rate": 4.729351455447573e-06, + "loss": 0.3421, + "num_input_tokens_seen": 359520, + "step": 2045 + }, + { + "epoch": 8.23293172690763, + "grad_norm": 0.4279603660106659, + "learning_rate": 4.627288088924156e-06, + "loss": 0.3447, + "num_input_tokens_seen": 360256, + "step": 2050 + }, + { + "epoch": 8.25301204819277, + "grad_norm": 0.09719569236040115, + "learning_rate": 4.526225651019309e-06, + "loss": 0.3479, + "num_input_tokens_seen": 361184, + "step": 2055 + }, + { + "epoch": 8.273092369477911, + "grad_norm": 0.11000480502843857, + "learning_rate": 4.4261691070163316e-06, + "loss": 0.3447, + "num_input_tokens_seen": 362064, + "step": 2060 + }, + { + "epoch": 8.293172690763052, + "grad_norm": 0.11856409162282944, + "learning_rate": 4.327123372778122e-06, + "loss": 0.3415, + "num_input_tokens_seen": 362928, + "step": 2065 + }, + { + "epoch": 8.313253012048193, + "grad_norm": 0.06177099421620369, + "learning_rate": 4.229093314505619e-06, + "loss": 0.336, + "num_input_tokens_seen": 363888, + "step": 2070 + }, + { + "epoch": 8.333333333333334, + "grad_norm": 0.07357407361268997, + "learning_rate": 4.132083748498744e-06, + "loss": 0.3572, + "num_input_tokens_seen": 364800, + "step": 2075 + }, + { + "epoch": 8.353413654618475, + "grad_norm": 0.1254434585571289, + "learning_rate": 4.036099440919763e-06, + "loss": 0.3479, + "num_input_tokens_seen": 365680, + "step": 2080 + }, + { + "epoch": 8.373493975903614, + "grad_norm": 0.12399672716856003, + "learning_rate": 3.9411451075591464e-06, + "loss": 0.3477, + "num_input_tokens_seen": 366560, + "step": 2085 + }, + { + "epoch": 8.393574297188755, + "grad_norm": 0.4643438458442688, + "learning_rate": 3.847225413603839e-06, + "loss": 0.3449, + "num_input_tokens_seen": 367424, + "step": 2090 + }, + { + "epoch": 8.413654618473895, + "grad_norm": 0.09002748876810074, + "learning_rate": 3.754344973408064e-06, + "loss": 0.3538, + "num_input_tokens_seen": 368272, + "step": 2095 + }, + { + "epoch": 8.433734939759036, + "grad_norm": 0.06885527074337006, + "learning_rate": 3.6625083502666554e-06, + "loss": 0.3472, + "num_input_tokens_seen": 369040, + "step": 2100 + }, + { + "epoch": 8.453815261044177, + "grad_norm": 0.4609389007091522, + "learning_rate": 3.5717200561908026e-06, + "loss": 0.3411, + "num_input_tokens_seen": 369808, + "step": 2105 + }, + { + "epoch": 8.473895582329318, + "grad_norm": 0.09589619934558868, + "learning_rate": 3.481984551686429e-06, + "loss": 0.3383, + "num_input_tokens_seen": 370672, + "step": 2110 + }, + { + "epoch": 8.493975903614459, + "grad_norm": 0.11223804205656052, + "learning_rate": 3.3933062455349744e-06, + "loss": 0.3417, + "num_input_tokens_seen": 371520, + "step": 2115 + }, + { + "epoch": 8.514056224899598, + "grad_norm": 0.47606217861175537, + "learning_rate": 3.305689494576847e-06, + "loss": 0.36, + "num_input_tokens_seen": 372368, + "step": 2120 + }, + { + "epoch": 8.534136546184738, + "grad_norm": 0.5172960162162781, + "learning_rate": 3.2191386034973627e-06, + "loss": 0.3574, + "num_input_tokens_seen": 373424, + "step": 2125 + }, + { + "epoch": 8.534136546184738, + "eval_loss": 0.3515996038913727, + "eval_runtime": 1.2192, + "eval_samples_per_second": 45.93, + "eval_steps_per_second": 22.965, + "num_input_tokens_seen": 373424, + "step": 2125 + }, + { + "epoch": 8.55421686746988, + "grad_norm": 0.4503559172153473, + "learning_rate": 3.1336578246152103e-06, + "loss": 0.3443, + "num_input_tokens_seen": 374240, + "step": 2130 + }, + { + "epoch": 8.57429718875502, + "grad_norm": 0.11173038929700851, + "learning_rate": 3.049251357673577e-06, + "loss": 0.3383, + "num_input_tokens_seen": 375104, + "step": 2135 + }, + { + "epoch": 8.594377510040161, + "grad_norm": 0.09678234905004501, + "learning_rate": 2.9659233496337786e-06, + "loss": 0.3476, + "num_input_tokens_seen": 376080, + "step": 2140 + }, + { + "epoch": 8.614457831325302, + "grad_norm": 0.08732342720031738, + "learning_rate": 2.8836778944715454e-06, + "loss": 0.3415, + "num_input_tokens_seen": 376928, + "step": 2145 + }, + { + "epoch": 8.634538152610443, + "grad_norm": 0.12217875570058823, + "learning_rate": 2.802519032975859e-06, + "loss": 0.351, + "num_input_tokens_seen": 377856, + "step": 2150 + }, + { + "epoch": 8.654618473895582, + "grad_norm": 0.4865402579307556, + "learning_rate": 2.722450752550429e-06, + "loss": 0.3417, + "num_input_tokens_seen": 378784, + "step": 2155 + }, + { + "epoch": 8.674698795180722, + "grad_norm": 0.4950112998485565, + "learning_rate": 2.6434769870177985e-06, + "loss": 0.3604, + "num_input_tokens_seen": 379696, + "step": 2160 + }, + { + "epoch": 8.694779116465863, + "grad_norm": 0.07212843000888824, + "learning_rate": 2.5656016164260554e-06, + "loss": 0.3447, + "num_input_tokens_seen": 380512, + "step": 2165 + }, + { + "epoch": 8.714859437751004, + "grad_norm": 0.09282089024782181, + "learning_rate": 2.4888284668582285e-06, + "loss": 0.3445, + "num_input_tokens_seen": 381520, + "step": 2170 + }, + { + "epoch": 8.734939759036145, + "grad_norm": 0.08465081453323364, + "learning_rate": 2.4131613102442857e-06, + "loss": 0.3354, + "num_input_tokens_seen": 382480, + "step": 2175 + }, + { + "epoch": 8.755020080321286, + "grad_norm": 0.45736274123191833, + "learning_rate": 2.3386038641758063e-06, + "loss": 0.3383, + "num_input_tokens_seen": 383440, + "step": 2180 + }, + { + "epoch": 8.775100401606426, + "grad_norm": 0.46960991621017456, + "learning_rate": 2.265159791723373e-06, + "loss": 0.3508, + "num_input_tokens_seen": 384240, + "step": 2185 + }, + { + "epoch": 8.795180722891565, + "grad_norm": 0.4292510449886322, + "learning_rate": 2.1928327012565696e-06, + "loss": 0.3483, + "num_input_tokens_seen": 385120, + "step": 2190 + }, + { + "epoch": 8.815261044176706, + "grad_norm": 0.07858365774154663, + "learning_rate": 2.121626146266706e-06, + "loss": 0.3546, + "num_input_tokens_seen": 385984, + "step": 2195 + }, + { + "epoch": 8.835341365461847, + "grad_norm": 0.5046152472496033, + "learning_rate": 2.051543625192226e-06, + "loss": 0.3609, + "num_input_tokens_seen": 386896, + "step": 2200 + }, + { + "epoch": 8.855421686746988, + "grad_norm": 0.43499457836151123, + "learning_rate": 1.9825885812468524e-06, + "loss": 0.33, + "num_input_tokens_seen": 387776, + "step": 2205 + }, + { + "epoch": 8.875502008032129, + "grad_norm": 0.4260156750679016, + "learning_rate": 1.914764402250385e-06, + "loss": 0.3487, + "num_input_tokens_seen": 388704, + "step": 2210 + }, + { + "epoch": 8.89558232931727, + "grad_norm": 0.43153613805770874, + "learning_rate": 1.8480744204622757e-06, + "loss": 0.3512, + "num_input_tokens_seen": 389456, + "step": 2215 + }, + { + "epoch": 8.91566265060241, + "grad_norm": 0.4854128956794739, + "learning_rate": 1.7825219124179004e-06, + "loss": 0.3522, + "num_input_tokens_seen": 390304, + "step": 2220 + }, + { + "epoch": 8.93574297188755, + "grad_norm": 0.08903124928474426, + "learning_rate": 1.7181100987675862e-06, + "loss": 0.3356, + "num_input_tokens_seen": 391104, + "step": 2225 + }, + { + "epoch": 8.95582329317269, + "grad_norm": 0.49092555046081543, + "learning_rate": 1.6548421441183875e-06, + "loss": 0.3516, + "num_input_tokens_seen": 392112, + "step": 2230 + }, + { + "epoch": 8.975903614457831, + "grad_norm": 0.08050279319286346, + "learning_rate": 1.5927211568785878e-06, + "loss": 0.3449, + "num_input_tokens_seen": 392880, + "step": 2235 + }, + { + "epoch": 8.995983935742972, + "grad_norm": 0.4535515308380127, + "learning_rate": 1.5317501891049719e-06, + "loss": 0.3302, + "num_input_tokens_seen": 393728, + "step": 2240 + }, + { + "epoch": 9.016064257028113, + "grad_norm": 0.48401492834091187, + "learning_rate": 1.4719322363529242e-06, + "loss": 0.3487, + "num_input_tokens_seen": 394688, + "step": 2245 + }, + { + "epoch": 9.036144578313253, + "grad_norm": 0.4877987205982208, + "learning_rate": 1.4132702375291989e-06, + "loss": 0.3673, + "num_input_tokens_seen": 395616, + "step": 2250 + }, + { + "epoch": 9.036144578313253, + "eval_loss": 0.35450634360313416, + "eval_runtime": 1.2106, + "eval_samples_per_second": 46.258, + "eval_steps_per_second": 23.129, + "num_input_tokens_seen": 395616, + "step": 2250 + }, + { + "epoch": 9.056224899598394, + "grad_norm": 0.4849400520324707, + "learning_rate": 1.3557670747475714e-06, + "loss": 0.3455, + "num_input_tokens_seen": 396560, + "step": 2255 + }, + { + "epoch": 9.076305220883533, + "grad_norm": 0.4335307478904724, + "learning_rate": 1.2994255731871963e-06, + "loss": 0.3489, + "num_input_tokens_seen": 397456, + "step": 2260 + }, + { + "epoch": 9.096385542168674, + "grad_norm": 0.10791157931089401, + "learning_rate": 1.244248500953854e-06, + "loss": 0.3648, + "num_input_tokens_seen": 398448, + "step": 2265 + }, + { + "epoch": 9.116465863453815, + "grad_norm": 0.4895195960998535, + "learning_rate": 1.1902385689439022e-06, + "loss": 0.3544, + "num_input_tokens_seen": 399248, + "step": 2270 + }, + { + "epoch": 9.136546184738956, + "grad_norm": 0.45951637625694275, + "learning_rate": 1.137398430711123e-06, + "loss": 0.3574, + "num_input_tokens_seen": 400096, + "step": 2275 + }, + { + "epoch": 9.156626506024097, + "grad_norm": 0.44542670249938965, + "learning_rate": 1.085730682336325e-06, + "loss": 0.3477, + "num_input_tokens_seen": 401024, + "step": 2280 + }, + { + "epoch": 9.176706827309237, + "grad_norm": 0.10208380967378616, + "learning_rate": 1.0352378622998204e-06, + "loss": 0.3506, + "num_input_tokens_seen": 401856, + "step": 2285 + }, + { + "epoch": 9.196787148594378, + "grad_norm": 0.43247994780540466, + "learning_rate": 9.85922451356694e-07, + "loss": 0.3388, + "num_input_tokens_seen": 402736, + "step": 2290 + }, + { + "epoch": 9.216867469879517, + "grad_norm": 0.13525407016277313, + "learning_rate": 9.377868724149197e-07, + "loss": 0.3413, + "num_input_tokens_seen": 403696, + "step": 2295 + }, + { + "epoch": 9.236947791164658, + "grad_norm": 0.4610027074813843, + "learning_rate": 8.908334904163207e-07, + "loss": 0.3445, + "num_input_tokens_seen": 404480, + "step": 2300 + }, + { + "epoch": 9.257028112449799, + "grad_norm": 0.4158003330230713, + "learning_rate": 8.450646122203865e-07, + "loss": 0.3233, + "num_input_tokens_seen": 405536, + "step": 2305 + }, + { + "epoch": 9.27710843373494, + "grad_norm": 0.4854465425014496, + "learning_rate": 8.004824864909277e-07, + "loss": 0.3513, + "num_input_tokens_seen": 406368, + "step": 2310 + }, + { + "epoch": 9.29718875502008, + "grad_norm": 0.42681699991226196, + "learning_rate": 7.570893035856091e-07, + "loss": 0.3417, + "num_input_tokens_seen": 407184, + "step": 2315 + }, + { + "epoch": 9.317269076305221, + "grad_norm": 0.48750439286231995, + "learning_rate": 7.148871954483105e-07, + "loss": 0.3542, + "num_input_tokens_seen": 407904, + "step": 2320 + }, + { + "epoch": 9.337349397590362, + "grad_norm": 0.11087116599082947, + "learning_rate": 6.738782355044049e-07, + "loss": 0.3387, + "num_input_tokens_seen": 408736, + "step": 2325 + }, + { + "epoch": 9.357429718875501, + "grad_norm": 0.4181234538555145, + "learning_rate": 6.340644385588846e-07, + "loss": 0.33, + "num_input_tokens_seen": 409664, + "step": 2330 + }, + { + "epoch": 9.377510040160642, + "grad_norm": 0.1247372180223465, + "learning_rate": 5.954477606973679e-07, + "loss": 0.3643, + "num_input_tokens_seen": 410736, + "step": 2335 + }, + { + "epoch": 9.397590361445783, + "grad_norm": 0.09054780006408691, + "learning_rate": 5.580300991899989e-07, + "loss": 0.3612, + "num_input_tokens_seen": 411680, + "step": 2340 + }, + { + "epoch": 9.417670682730924, + "grad_norm": 0.44483572244644165, + "learning_rate": 5.218132923982267e-07, + "loss": 0.3417, + "num_input_tokens_seen": 412480, + "step": 2345 + }, + { + "epoch": 9.437751004016064, + "grad_norm": 0.07839091122150421, + "learning_rate": 4.867991196844918e-07, + "loss": 0.3352, + "num_input_tokens_seen": 413248, + "step": 2350 + }, + { + "epoch": 9.457831325301205, + "grad_norm": 0.12925294041633606, + "learning_rate": 4.5298930132480213e-07, + "loss": 0.3606, + "num_input_tokens_seen": 414080, + "step": 2355 + }, + { + "epoch": 9.477911646586346, + "grad_norm": 0.15520651638507843, + "learning_rate": 4.203854984242195e-07, + "loss": 0.3481, + "num_input_tokens_seen": 414928, + "step": 2360 + }, + { + "epoch": 9.497991967871485, + "grad_norm": 0.4360347092151642, + "learning_rate": 3.8898931283523344e-07, + "loss": 0.364, + "num_input_tokens_seen": 415728, + "step": 2365 + }, + { + "epoch": 9.518072289156626, + "grad_norm": 0.4386354088783264, + "learning_rate": 3.5880228707907417e-07, + "loss": 0.3336, + "num_input_tokens_seen": 416672, + "step": 2370 + }, + { + "epoch": 9.538152610441767, + "grad_norm": 0.48464497923851013, + "learning_rate": 3.2982590426993145e-07, + "loss": 0.3419, + "num_input_tokens_seen": 417520, + "step": 2375 + }, + { + "epoch": 9.538152610441767, + "eval_loss": 0.3474566638469696, + "eval_runtime": 1.2124, + "eval_samples_per_second": 46.19, + "eval_steps_per_second": 23.095, + "num_input_tokens_seen": 417520, + "step": 2375 + }, + { + "epoch": 9.558232931726907, + "grad_norm": 0.43182554841041565, + "learning_rate": 3.020615880420713e-07, + "loss": 0.3326, + "num_input_tokens_seen": 418400, + "step": 2380 + }, + { + "epoch": 9.578313253012048, + "grad_norm": 0.09252151101827621, + "learning_rate": 2.7551070247990305e-07, + "loss": 0.3449, + "num_input_tokens_seen": 419248, + "step": 2385 + }, + { + "epoch": 9.598393574297189, + "grad_norm": 0.44276589155197144, + "learning_rate": 2.501745520509552e-07, + "loss": 0.3481, + "num_input_tokens_seen": 420096, + "step": 2390 + }, + { + "epoch": 9.61847389558233, + "grad_norm": 0.07001478224992752, + "learning_rate": 2.2605438154179038e-07, + "loss": 0.3386, + "num_input_tokens_seen": 420848, + "step": 2395 + }, + { + "epoch": 9.638554216867469, + "grad_norm": 0.11785867065191269, + "learning_rate": 2.0315137599685174e-07, + "loss": 0.3294, + "num_input_tokens_seen": 421728, + "step": 2400 + }, + { + "epoch": 9.65863453815261, + "grad_norm": 0.08471374958753586, + "learning_rate": 1.814666606602261e-07, + "loss": 0.3604, + "num_input_tokens_seen": 422656, + "step": 2405 + }, + { + "epoch": 9.67871485943775, + "grad_norm": 0.4377971589565277, + "learning_rate": 1.6100130092037703e-07, + "loss": 0.3457, + "num_input_tokens_seen": 423600, + "step": 2410 + }, + { + "epoch": 9.698795180722891, + "grad_norm": 0.12307439744472504, + "learning_rate": 1.4175630225778947e-07, + "loss": 0.3447, + "num_input_tokens_seen": 424448, + "step": 2415 + }, + { + "epoch": 9.718875502008032, + "grad_norm": 0.11983578652143478, + "learning_rate": 1.237326101955677e-07, + "loss": 0.3544, + "num_input_tokens_seen": 425632, + "step": 2420 + }, + { + "epoch": 9.738955823293173, + "grad_norm": 0.1070198193192482, + "learning_rate": 1.0693111025300017e-07, + "loss": 0.3385, + "num_input_tokens_seen": 426576, + "step": 2425 + }, + { + "epoch": 9.759036144578314, + "grad_norm": 0.44310370087623596, + "learning_rate": 9.13526279020277e-08, + "loss": 0.3296, + "num_input_tokens_seen": 427376, + "step": 2430 + }, + { + "epoch": 9.779116465863455, + "grad_norm": 0.4734679162502289, + "learning_rate": 7.699792852670362e-08, + "loss": 0.3514, + "num_input_tokens_seen": 428256, + "step": 2435 + }, + { + "epoch": 9.799196787148594, + "grad_norm": 0.13541147112846375, + "learning_rate": 6.386771738558506e-08, + "loss": 0.3389, + "num_input_tokens_seen": 429216, + "step": 2440 + }, + { + "epoch": 9.819277108433734, + "grad_norm": 0.48132413625717163, + "learning_rate": 5.196263957708836e-08, + "loss": 0.3542, + "num_input_tokens_seen": 430208, + "step": 2445 + }, + { + "epoch": 9.839357429718875, + "grad_norm": 0.4347268044948578, + "learning_rate": 4.1283280007778366e-08, + "loss": 0.3292, + "num_input_tokens_seen": 430960, + "step": 2450 + }, + { + "epoch": 9.859437751004016, + "grad_norm": 0.43290168046951294, + "learning_rate": 3.1830163363655296e-08, + "loss": 0.355, + "num_input_tokens_seen": 431936, + "step": 2455 + }, + { + "epoch": 9.879518072289157, + "grad_norm": 0.10507988184690475, + "learning_rate": 2.3603754084358663e-08, + "loss": 0.3425, + "num_input_tokens_seen": 432912, + "step": 2460 + }, + { + "epoch": 9.899598393574298, + "grad_norm": 0.4859294593334198, + "learning_rate": 1.6604456340352235e-08, + "loss": 0.3449, + "num_input_tokens_seen": 433696, + "step": 2465 + }, + { + "epoch": 9.919678714859439, + "grad_norm": 0.1426219940185547, + "learning_rate": 1.0832614013073228e-08, + "loss": 0.3513, + "num_input_tokens_seen": 434528, + "step": 2470 + }, + { + "epoch": 9.939759036144578, + "grad_norm": 0.08813058584928513, + "learning_rate": 6.288510678031934e-09, + "loss": 0.3633, + "num_input_tokens_seen": 435280, + "step": 2475 + }, + { + "epoch": 9.959839357429718, + "grad_norm": 0.4306187033653259, + "learning_rate": 2.972369590878432e-09, + "loss": 0.3483, + "num_input_tokens_seen": 436096, + "step": 2480 + }, + { + "epoch": 9.97991967871486, + "grad_norm": 0.07171786576509476, + "learning_rate": 8.843536764419069e-10, + "loss": 0.3521, + "num_input_tokens_seen": 437008, + "step": 2485 + }, + { + "epoch": 10.0, + "grad_norm": 0.0965154618024826, + "learning_rate": 2.4565520709285417e-11, + "loss": 0.3294, + "num_input_tokens_seen": 437760, + "step": 2490 + }, + { + "epoch": 10.0, + "num_input_tokens_seen": 437760, + "step": 2490, + "total_flos": 1.971213494648832e+16, + "train_loss": 0.4594599806580199, + "train_runtime": 258.0589, + "train_samples_per_second": 19.298, + "train_steps_per_second": 9.649 + } + ], + "logging_steps": 5, + "max_steps": 2490, + "num_input_tokens_seen": 437760, + "num_train_epochs": 10, + "save_steps": 125, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.971213494648832e+16, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}