{ "best_global_step": 2375, "best_metric": 0.3474566638469696, "best_model_checkpoint": "saves/prefix-tuning/llama-3-8b-instruct/train_wsc_1756729607/checkpoint-2375", "epoch": 10.0, "eval_steps": 125, "global_step": 2490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020080321285140562, "grad_norm": 208.77610778808594, "learning_rate": 8.032128514056225e-07, "loss": 11.8262, "num_input_tokens_seen": 832, "step": 5 }, { "epoch": 0.040160642570281124, "grad_norm": 188.124267578125, "learning_rate": 1.8072289156626506e-06, "loss": 10.3109, "num_input_tokens_seen": 1760, "step": 10 }, { "epoch": 0.060240963855421686, "grad_norm": 118.94561767578125, "learning_rate": 2.811244979919679e-06, "loss": 8.5244, "num_input_tokens_seen": 2608, "step": 15 }, { "epoch": 0.08032128514056225, "grad_norm": 93.1993637084961, "learning_rate": 3.8152610441767074e-06, "loss": 6.312, "num_input_tokens_seen": 3536, "step": 20 }, { "epoch": 0.10040160642570281, "grad_norm": 80.68607330322266, "learning_rate": 4.819277108433735e-06, "loss": 4.854, "num_input_tokens_seen": 4496, "step": 25 }, { "epoch": 0.12048192771084337, "grad_norm": 53.525691986083984, "learning_rate": 5.823293172690764e-06, "loss": 2.9588, "num_input_tokens_seen": 5424, "step": 30 }, { "epoch": 0.14056224899598393, "grad_norm": 74.08545684814453, "learning_rate": 6.827309236947792e-06, "loss": 1.8179, "num_input_tokens_seen": 6304, "step": 35 }, { "epoch": 0.1606425702811245, "grad_norm": 39.353179931640625, "learning_rate": 7.83132530120482e-06, "loss": 0.9304, "num_input_tokens_seen": 7072, "step": 40 }, { "epoch": 0.18072289156626506, "grad_norm": 34.21826934814453, "learning_rate": 8.835341365461847e-06, "loss": 0.7447, "num_input_tokens_seen": 7856, "step": 45 }, { "epoch": 0.20080321285140562, "grad_norm": 46.137203216552734, "learning_rate": 9.839357429718876e-06, "loss": 0.4663, "num_input_tokens_seen": 8880, "step": 50 }, { "epoch": 0.22088353413654618, "grad_norm": 21.007869720458984, "learning_rate": 1.0843373493975904e-05, "loss": 0.419, "num_input_tokens_seen": 9680, "step": 55 }, { "epoch": 0.24096385542168675, "grad_norm": 32.040428161621094, "learning_rate": 1.1847389558232933e-05, "loss": 0.3886, "num_input_tokens_seen": 10576, "step": 60 }, { "epoch": 0.26104417670682734, "grad_norm": 33.35049819946289, "learning_rate": 1.285140562248996e-05, "loss": 0.3719, "num_input_tokens_seen": 11424, "step": 65 }, { "epoch": 0.28112449799196787, "grad_norm": 27.942962646484375, "learning_rate": 1.3855421686746989e-05, "loss": 0.4068, "num_input_tokens_seen": 12224, "step": 70 }, { "epoch": 0.30120481927710846, "grad_norm": 48.38943862915039, "learning_rate": 1.4859437751004016e-05, "loss": 0.4348, "num_input_tokens_seen": 13168, "step": 75 }, { "epoch": 0.321285140562249, "grad_norm": 54.91352462768555, "learning_rate": 1.5863453815261046e-05, "loss": 0.4399, "num_input_tokens_seen": 14080, "step": 80 }, { "epoch": 0.3413654618473896, "grad_norm": 100.44646453857422, "learning_rate": 1.6867469879518073e-05, "loss": 0.873, "num_input_tokens_seen": 15056, "step": 85 }, { "epoch": 0.3614457831325301, "grad_norm": 26.445947647094727, "learning_rate": 1.78714859437751e-05, "loss": 0.5697, "num_input_tokens_seen": 15904, "step": 90 }, { "epoch": 0.3815261044176707, "grad_norm": 8.536154747009277, "learning_rate": 1.8875502008032127e-05, "loss": 0.318, "num_input_tokens_seen": 16688, "step": 95 }, { "epoch": 0.40160642570281124, "grad_norm": 0.5681027173995972, "learning_rate": 1.9879518072289157e-05, "loss": 0.0212, "num_input_tokens_seen": 17552, "step": 100 }, { "epoch": 0.42168674698795183, "grad_norm": 43.08428955078125, "learning_rate": 2.0883534136546184e-05, "loss": 1.5137, "num_input_tokens_seen": 18400, "step": 105 }, { "epoch": 0.44176706827309237, "grad_norm": 32.574432373046875, "learning_rate": 2.1887550200803214e-05, "loss": 0.9558, "num_input_tokens_seen": 19456, "step": 110 }, { "epoch": 0.46184738955823296, "grad_norm": 12.149736404418945, "learning_rate": 2.289156626506024e-05, "loss": 0.2867, "num_input_tokens_seen": 20288, "step": 115 }, { "epoch": 0.4819277108433735, "grad_norm": 59.057838439941406, "learning_rate": 2.389558232931727e-05, "loss": 0.7216, "num_input_tokens_seen": 21328, "step": 120 }, { "epoch": 0.5020080321285141, "grad_norm": 38.10276794433594, "learning_rate": 2.48995983935743e-05, "loss": 0.5349, "num_input_tokens_seen": 22304, "step": 125 }, { "epoch": 0.5020080321285141, "eval_loss": 1.2113524675369263, "eval_runtime": 1.2329, "eval_samples_per_second": 45.42, "eval_steps_per_second": 22.71, "num_input_tokens_seen": 22304, "step": 125 }, { "epoch": 0.5220883534136547, "grad_norm": 51.98005676269531, "learning_rate": 2.5903614457831325e-05, "loss": 1.3225, "num_input_tokens_seen": 23056, "step": 130 }, { "epoch": 0.5421686746987951, "grad_norm": 16.78879165649414, "learning_rate": 2.6907630522088356e-05, "loss": 0.3651, "num_input_tokens_seen": 23840, "step": 135 }, { "epoch": 0.5622489959839357, "grad_norm": 7.14707088470459, "learning_rate": 2.791164658634538e-05, "loss": 0.5608, "num_input_tokens_seen": 24832, "step": 140 }, { "epoch": 0.5823293172690763, "grad_norm": 2.345181465148926, "learning_rate": 2.891566265060241e-05, "loss": 0.4016, "num_input_tokens_seen": 25648, "step": 145 }, { "epoch": 0.6024096385542169, "grad_norm": 13.92953109741211, "learning_rate": 2.991967871485944e-05, "loss": 0.4123, "num_input_tokens_seen": 26496, "step": 150 }, { "epoch": 0.6224899598393574, "grad_norm": 13.532402038574219, "learning_rate": 3.092369477911647e-05, "loss": 0.538, "num_input_tokens_seen": 27392, "step": 155 }, { "epoch": 0.642570281124498, "grad_norm": 11.975309371948242, "learning_rate": 3.192771084337349e-05, "loss": 0.3054, "num_input_tokens_seen": 28272, "step": 160 }, { "epoch": 0.6626506024096386, "grad_norm": 10.431339263916016, "learning_rate": 3.2931726907630524e-05, "loss": 0.4957, "num_input_tokens_seen": 29184, "step": 165 }, { "epoch": 0.6827309236947792, "grad_norm": 2.0895514488220215, "learning_rate": 3.393574297188755e-05, "loss": 0.3453, "num_input_tokens_seen": 30128, "step": 170 }, { "epoch": 0.7028112449799196, "grad_norm": 3.5245909690856934, "learning_rate": 3.4939759036144585e-05, "loss": 0.4688, "num_input_tokens_seen": 30976, "step": 175 }, { "epoch": 0.7228915662650602, "grad_norm": 5.512556552886963, "learning_rate": 3.5943775100401605e-05, "loss": 0.4289, "num_input_tokens_seen": 31776, "step": 180 }, { "epoch": 0.7429718875502008, "grad_norm": 3.1914124488830566, "learning_rate": 3.694779116465863e-05, "loss": 0.3237, "num_input_tokens_seen": 32608, "step": 185 }, { "epoch": 0.7630522088353414, "grad_norm": 12.990925788879395, "learning_rate": 3.7951807228915666e-05, "loss": 0.5221, "num_input_tokens_seen": 33360, "step": 190 }, { "epoch": 0.7831325301204819, "grad_norm": 0.20752322673797607, "learning_rate": 3.895582329317269e-05, "loss": 0.2575, "num_input_tokens_seen": 34176, "step": 195 }, { "epoch": 0.8032128514056225, "grad_norm": 3.7412514686584473, "learning_rate": 3.995983935742972e-05, "loss": 1.23, "num_input_tokens_seen": 34992, "step": 200 }, { "epoch": 0.8232931726907631, "grad_norm": 2.3068435192108154, "learning_rate": 4.0963855421686746e-05, "loss": 0.4066, "num_input_tokens_seen": 35888, "step": 205 }, { "epoch": 0.8433734939759037, "grad_norm": 2.8328449726104736, "learning_rate": 4.196787148594378e-05, "loss": 0.3846, "num_input_tokens_seen": 36848, "step": 210 }, { "epoch": 0.8634538152610441, "grad_norm": 2.105517625808716, "learning_rate": 4.297188755020081e-05, "loss": 0.3303, "num_input_tokens_seen": 37888, "step": 215 }, { "epoch": 0.8835341365461847, "grad_norm": 2.6428239345550537, "learning_rate": 4.3975903614457834e-05, "loss": 0.4356, "num_input_tokens_seen": 38768, "step": 220 }, { "epoch": 0.9036144578313253, "grad_norm": 6.220267295837402, "learning_rate": 4.497991967871486e-05, "loss": 0.4328, "num_input_tokens_seen": 39488, "step": 225 }, { "epoch": 0.9236947791164659, "grad_norm": 4.502399444580078, "learning_rate": 4.598393574297189e-05, "loss": 0.6742, "num_input_tokens_seen": 40336, "step": 230 }, { "epoch": 0.9437751004016064, "grad_norm": 4.788579940795898, "learning_rate": 4.698795180722892e-05, "loss": 0.3987, "num_input_tokens_seen": 41328, "step": 235 }, { "epoch": 0.963855421686747, "grad_norm": 3.492311954498291, "learning_rate": 4.799196787148594e-05, "loss": 0.3388, "num_input_tokens_seen": 42176, "step": 240 }, { "epoch": 0.9839357429718876, "grad_norm": 2.018967628479004, "learning_rate": 4.8995983935742975e-05, "loss": 0.8551, "num_input_tokens_seen": 43312, "step": 245 }, { "epoch": 1.0040160642570282, "grad_norm": 1.773940086364746, "learning_rate": 5e-05, "loss": 0.4306, "num_input_tokens_seen": 44064, "step": 250 }, { "epoch": 1.0040160642570282, "eval_loss": 0.4651975631713867, "eval_runtime": 1.2232, "eval_samples_per_second": 45.781, "eval_steps_per_second": 22.891, "num_input_tokens_seen": 44064, "step": 250 }, { "epoch": 1.0240963855421688, "grad_norm": 13.694576263427734, "learning_rate": 4.9999385864396127e-05, "loss": 0.7501, "num_input_tokens_seen": 44816, "step": 255 }, { "epoch": 1.0441767068273093, "grad_norm": 7.658875942230225, "learning_rate": 4.99975434877575e-05, "loss": 1.1828, "num_input_tokens_seen": 45776, "step": 260 }, { "epoch": 1.0642570281124497, "grad_norm": 1.0638893842697144, "learning_rate": 4.999447296060165e-05, "loss": 0.7021, "num_input_tokens_seen": 46592, "step": 265 }, { "epoch": 1.0843373493975903, "grad_norm": 4.289158344268799, "learning_rate": 4.999017443378618e-05, "loss": 0.37, "num_input_tokens_seen": 47536, "step": 270 }, { "epoch": 1.104417670682731, "grad_norm": 3.189358711242676, "learning_rate": 4.998464811850137e-05, "loss": 0.3415, "num_input_tokens_seen": 48320, "step": 275 }, { "epoch": 1.1244979919678715, "grad_norm": 2.700920581817627, "learning_rate": 4.997789428625975e-05, "loss": 0.381, "num_input_tokens_seen": 49216, "step": 280 }, { "epoch": 1.144578313253012, "grad_norm": 2.0674901008605957, "learning_rate": 4.996991326888286e-05, "loss": 0.3487, "num_input_tokens_seen": 50048, "step": 285 }, { "epoch": 1.1646586345381527, "grad_norm": 2.0477442741394043, "learning_rate": 4.996070545848484e-05, "loss": 0.346, "num_input_tokens_seen": 50832, "step": 290 }, { "epoch": 1.1847389558232932, "grad_norm": 2.814277410507202, "learning_rate": 4.995027130745321e-05, "loss": 0.3439, "num_input_tokens_seen": 51824, "step": 295 }, { "epoch": 1.2048192771084336, "grad_norm": 1.5188792943954468, "learning_rate": 4.9938611328426685e-05, "loss": 0.5375, "num_input_tokens_seen": 52608, "step": 300 }, { "epoch": 1.2248995983935742, "grad_norm": 0.27745991945266724, "learning_rate": 4.992572609426992e-05, "loss": 0.3537, "num_input_tokens_seen": 53440, "step": 305 }, { "epoch": 1.2449799196787148, "grad_norm": 0.40650656819343567, "learning_rate": 4.99116162380454e-05, "loss": 0.3549, "num_input_tokens_seen": 54320, "step": 310 }, { "epoch": 1.2650602409638554, "grad_norm": 1.8287845849990845, "learning_rate": 4.989628245298233e-05, "loss": 0.3352, "num_input_tokens_seen": 55072, "step": 315 }, { "epoch": 1.285140562248996, "grad_norm": 0.5225471258163452, "learning_rate": 4.987972549244257e-05, "loss": 0.3695, "num_input_tokens_seen": 56224, "step": 320 }, { "epoch": 1.3052208835341366, "grad_norm": 1.361476182937622, "learning_rate": 4.986194616988364e-05, "loss": 0.281, "num_input_tokens_seen": 56912, "step": 325 }, { "epoch": 1.3253012048192772, "grad_norm": 1.0196197032928467, "learning_rate": 4.984294535881875e-05, "loss": 0.488, "num_input_tokens_seen": 57648, "step": 330 }, { "epoch": 1.3453815261044177, "grad_norm": 0.7983678579330444, "learning_rate": 4.982272399277386e-05, "loss": 0.3598, "num_input_tokens_seen": 58608, "step": 335 }, { "epoch": 1.3654618473895583, "grad_norm": 0.21441864967346191, "learning_rate": 4.980128306524183e-05, "loss": 0.3973, "num_input_tokens_seen": 59424, "step": 340 }, { "epoch": 1.3855421686746987, "grad_norm": 1.7802695035934448, "learning_rate": 4.9778623629633635e-05, "loss": 0.3078, "num_input_tokens_seen": 60272, "step": 345 }, { "epoch": 1.4056224899598393, "grad_norm": 2.1653060913085938, "learning_rate": 4.975474679922655e-05, "loss": 0.4871, "num_input_tokens_seen": 61056, "step": 350 }, { "epoch": 1.4257028112449799, "grad_norm": 1.056137204170227, "learning_rate": 4.972965374710952e-05, "loss": 0.283, "num_input_tokens_seen": 61968, "step": 355 }, { "epoch": 1.4457831325301205, "grad_norm": 0.806865394115448, "learning_rate": 4.9703345706125485e-05, "loss": 0.3467, "num_input_tokens_seen": 62800, "step": 360 }, { "epoch": 1.465863453815261, "grad_norm": 0.4291660189628601, "learning_rate": 4.96758239688108e-05, "loss": 0.4493, "num_input_tokens_seen": 63824, "step": 365 }, { "epoch": 1.4859437751004017, "grad_norm": 0.9207323789596558, "learning_rate": 4.964708988733178e-05, "loss": 0.3217, "num_input_tokens_seen": 64800, "step": 370 }, { "epoch": 1.5060240963855422, "grad_norm": 0.7873408198356628, "learning_rate": 4.961714487341822e-05, "loss": 0.3766, "num_input_tokens_seen": 65808, "step": 375 }, { "epoch": 1.5060240963855422, "eval_loss": 0.38677313923835754, "eval_runtime": 1.2517, "eval_samples_per_second": 44.74, "eval_steps_per_second": 22.37, "num_input_tokens_seen": 65808, "step": 375 }, { "epoch": 1.5261044176706826, "grad_norm": 0.23284302651882172, "learning_rate": 4.9585990398294043e-05, "loss": 0.4091, "num_input_tokens_seen": 66752, "step": 380 }, { "epoch": 1.5461847389558234, "grad_norm": 0.18355536460876465, "learning_rate": 4.9553627992605066e-05, "loss": 0.3531, "num_input_tokens_seen": 67632, "step": 385 }, { "epoch": 1.5662650602409638, "grad_norm": 0.12290728837251663, "learning_rate": 4.952005924634372e-05, "loss": 0.3506, "num_input_tokens_seen": 68400, "step": 390 }, { "epoch": 1.5863453815261044, "grad_norm": 0.8354266881942749, "learning_rate": 4.948528580877099e-05, "loss": 0.3255, "num_input_tokens_seen": 69408, "step": 395 }, { "epoch": 1.606425702811245, "grad_norm": 0.3600679337978363, "learning_rate": 4.944930938833535e-05, "loss": 0.3689, "num_input_tokens_seen": 70352, "step": 400 }, { "epoch": 1.6265060240963856, "grad_norm": 0.2819594442844391, "learning_rate": 4.9412131752588874e-05, "loss": 0.374, "num_input_tokens_seen": 71184, "step": 405 }, { "epoch": 1.6465863453815262, "grad_norm": 0.9768486618995667, "learning_rate": 4.937375472810033e-05, "loss": 0.3785, "num_input_tokens_seen": 72272, "step": 410 }, { "epoch": 1.6666666666666665, "grad_norm": 0.11346471309661865, "learning_rate": 4.9334180200365486e-05, "loss": 0.3645, "num_input_tokens_seen": 73136, "step": 415 }, { "epoch": 1.6867469879518073, "grad_norm": 0.15687525272369385, "learning_rate": 4.929341011371448e-05, "loss": 0.3477, "num_input_tokens_seen": 73872, "step": 420 }, { "epoch": 1.7068273092369477, "grad_norm": 0.31356531381607056, "learning_rate": 4.9251446471216226e-05, "loss": 0.3495, "num_input_tokens_seen": 74784, "step": 425 }, { "epoch": 1.7269076305220885, "grad_norm": 0.2755882441997528, "learning_rate": 4.9208291334580104e-05, "loss": 0.3477, "num_input_tokens_seen": 75664, "step": 430 }, { "epoch": 1.7469879518072289, "grad_norm": 1.0474437475204468, "learning_rate": 4.9163946824054574e-05, "loss": 0.4005, "num_input_tokens_seen": 76592, "step": 435 }, { "epoch": 1.7670682730923695, "grad_norm": 0.9340880513191223, "learning_rate": 4.911841511832305e-05, "loss": 0.3454, "num_input_tokens_seen": 77408, "step": 440 }, { "epoch": 1.78714859437751, "grad_norm": 0.7397641539573669, "learning_rate": 4.907169845439688e-05, "loss": 0.3494, "num_input_tokens_seen": 78272, "step": 445 }, { "epoch": 1.8072289156626506, "grad_norm": 0.32739967107772827, "learning_rate": 4.902379912750537e-05, "loss": 0.3211, "num_input_tokens_seen": 79200, "step": 450 }, { "epoch": 1.8273092369477912, "grad_norm": 0.6281508207321167, "learning_rate": 4.897471949098309e-05, "loss": 0.3843, "num_input_tokens_seen": 80112, "step": 455 }, { "epoch": 1.8473895582329316, "grad_norm": 0.6183443069458008, "learning_rate": 4.892446195615423e-05, "loss": 0.3143, "num_input_tokens_seen": 81168, "step": 460 }, { "epoch": 1.8674698795180724, "grad_norm": 0.6530241966247559, "learning_rate": 4.88730289922141e-05, "loss": 0.3845, "num_input_tokens_seen": 82112, "step": 465 }, { "epoch": 1.8875502008032128, "grad_norm": 0.2493213266134262, "learning_rate": 4.8820423126107845e-05, "loss": 0.3467, "num_input_tokens_seen": 83072, "step": 470 }, { "epoch": 1.9076305220883534, "grad_norm": 0.7678804993629456, "learning_rate": 4.87666469424063e-05, "loss": 0.3683, "num_input_tokens_seen": 83920, "step": 475 }, { "epoch": 1.927710843373494, "grad_norm": 0.8148710131645203, "learning_rate": 4.8711703083178986e-05, "loss": 0.3512, "num_input_tokens_seen": 84768, "step": 480 }, { "epoch": 1.9477911646586346, "grad_norm": 0.8058724999427795, "learning_rate": 4.865559424786432e-05, "loss": 0.3478, "num_input_tokens_seen": 85616, "step": 485 }, { "epoch": 1.9678714859437751, "grad_norm": 0.1543155312538147, "learning_rate": 4.859832319313697e-05, "loss": 0.3477, "num_input_tokens_seen": 86400, "step": 490 }, { "epoch": 1.9879518072289155, "grad_norm": 1.0387096405029297, "learning_rate": 4.8539892732772455e-05, "loss": 0.3753, "num_input_tokens_seen": 87216, "step": 495 }, { "epoch": 2.0080321285140563, "grad_norm": 0.2593827545642853, "learning_rate": 4.848030573750885e-05, "loss": 0.3159, "num_input_tokens_seen": 88048, "step": 500 }, { "epoch": 2.0080321285140563, "eval_loss": 0.3829512596130371, "eval_runtime": 1.2272, "eval_samples_per_second": 45.632, "eval_steps_per_second": 22.816, "num_input_tokens_seen": 88048, "step": 500 }, { "epoch": 2.0281124497991967, "grad_norm": 0.6328459978103638, "learning_rate": 4.841956513490577e-05, "loss": 0.3501, "num_input_tokens_seen": 88896, "step": 505 }, { "epoch": 2.0481927710843375, "grad_norm": 0.17319746315479279, "learning_rate": 4.8357673909200563e-05, "loss": 0.3452, "num_input_tokens_seen": 89744, "step": 510 }, { "epoch": 2.068273092369478, "grad_norm": 0.13001570105552673, "learning_rate": 4.8294635101161645e-05, "loss": 0.3738, "num_input_tokens_seen": 90528, "step": 515 }, { "epoch": 2.0883534136546187, "grad_norm": 0.1256828010082245, "learning_rate": 4.8230451807939135e-05, "loss": 0.3347, "num_input_tokens_seen": 91360, "step": 520 }, { "epoch": 2.108433734939759, "grad_norm": 0.7363554835319519, "learning_rate": 4.816512718291267e-05, "loss": 0.346, "num_input_tokens_seen": 92176, "step": 525 }, { "epoch": 2.1285140562248994, "grad_norm": 0.19434113800525665, "learning_rate": 4.80986644355365e-05, "loss": 0.341, "num_input_tokens_seen": 93104, "step": 530 }, { "epoch": 2.1485943775100402, "grad_norm": 0.1456967294216156, "learning_rate": 4.803106683118177e-05, "loss": 0.3588, "num_input_tokens_seen": 93984, "step": 535 }, { "epoch": 2.1686746987951806, "grad_norm": 0.9451452493667603, "learning_rate": 4.796233769097615e-05, "loss": 0.3438, "num_input_tokens_seen": 94896, "step": 540 }, { "epoch": 2.1887550200803214, "grad_norm": 0.1497451663017273, "learning_rate": 4.789248039164058e-05, "loss": 0.375, "num_input_tokens_seen": 95824, "step": 545 }, { "epoch": 2.208835341365462, "grad_norm": 0.8181028366088867, "learning_rate": 4.782149836532345e-05, "loss": 0.3607, "num_input_tokens_seen": 96688, "step": 550 }, { "epoch": 2.2289156626506026, "grad_norm": 0.6427181959152222, "learning_rate": 4.7749395099431924e-05, "loss": 0.3312, "num_input_tokens_seen": 97488, "step": 555 }, { "epoch": 2.248995983935743, "grad_norm": 1.3496628999710083, "learning_rate": 4.7676174136460625e-05, "loss": 0.4083, "num_input_tokens_seen": 98288, "step": 560 }, { "epoch": 2.2690763052208833, "grad_norm": 0.2799893915653229, "learning_rate": 4.760183907381757e-05, "loss": 0.3447, "num_input_tokens_seen": 99200, "step": 565 }, { "epoch": 2.289156626506024, "grad_norm": 0.22829043865203857, "learning_rate": 4.752639356364744e-05, "loss": 0.3228, "num_input_tokens_seen": 99984, "step": 570 }, { "epoch": 2.3092369477911645, "grad_norm": 0.17503726482391357, "learning_rate": 4.7449841312652166e-05, "loss": 0.3781, "num_input_tokens_seen": 100784, "step": 575 }, { "epoch": 2.3293172690763053, "grad_norm": 0.8941397666931152, "learning_rate": 4.737218608190878e-05, "loss": 0.367, "num_input_tokens_seen": 101584, "step": 580 }, { "epoch": 2.3493975903614457, "grad_norm": 0.7553220987319946, "learning_rate": 4.729343168668463e-05, "loss": 0.3603, "num_input_tokens_seen": 102480, "step": 585 }, { "epoch": 2.3694779116465865, "grad_norm": 0.2362431138753891, "learning_rate": 4.721358199624997e-05, "loss": 0.3631, "num_input_tokens_seen": 103408, "step": 590 }, { "epoch": 2.389558232931727, "grad_norm": 0.2296190857887268, "learning_rate": 4.713264093368783e-05, "loss": 0.3911, "num_input_tokens_seen": 104160, "step": 595 }, { "epoch": 2.4096385542168672, "grad_norm": 0.16303616762161255, "learning_rate": 4.705061247570128e-05, "loss": 0.3406, "num_input_tokens_seen": 105040, "step": 600 }, { "epoch": 2.429718875502008, "grad_norm": 0.15550056099891663, "learning_rate": 4.6967500652418034e-05, "loss": 0.3582, "num_input_tokens_seen": 105856, "step": 605 }, { "epoch": 2.4497991967871484, "grad_norm": 0.9154328107833862, "learning_rate": 4.6883309547192476e-05, "loss": 0.3701, "num_input_tokens_seen": 106928, "step": 610 }, { "epoch": 2.4698795180722892, "grad_norm": 0.5483267903327942, "learning_rate": 4.679804329640505e-05, "loss": 0.3423, "num_input_tokens_seen": 107808, "step": 615 }, { "epoch": 2.4899598393574296, "grad_norm": 0.5274845957756042, "learning_rate": 4.6711706089258955e-05, "loss": 0.3104, "num_input_tokens_seen": 108656, "step": 620 }, { "epoch": 2.5100401606425704, "grad_norm": 0.23020295798778534, "learning_rate": 4.6624302167574436e-05, "loss": 0.3958, "num_input_tokens_seen": 109696, "step": 625 }, { "epoch": 2.5100401606425704, "eval_loss": 0.3675794303417206, "eval_runtime": 1.213, "eval_samples_per_second": 46.168, "eval_steps_per_second": 23.084, "num_input_tokens_seen": 109696, "step": 625 }, { "epoch": 2.5301204819277108, "grad_norm": 0.13925790786743164, "learning_rate": 4.653583582558031e-05, "loss": 0.3587, "num_input_tokens_seen": 110576, "step": 630 }, { "epoch": 2.550200803212851, "grad_norm": 0.9564950466156006, "learning_rate": 4.6446311409703006e-05, "loss": 0.365, "num_input_tokens_seen": 111440, "step": 635 }, { "epoch": 2.570281124497992, "grad_norm": 0.2103191763162613, "learning_rate": 4.635573331835302e-05, "loss": 0.3339, "num_input_tokens_seen": 112192, "step": 640 }, { "epoch": 2.5903614457831328, "grad_norm": 0.230157271027565, "learning_rate": 4.6264106001708824e-05, "loss": 0.3631, "num_input_tokens_seen": 113024, "step": 645 }, { "epoch": 2.610441767068273, "grad_norm": 0.19389480352401733, "learning_rate": 4.61714339614982e-05, "loss": 0.3795, "num_input_tokens_seen": 113952, "step": 650 }, { "epoch": 2.6305220883534135, "grad_norm": 0.14889311790466309, "learning_rate": 4.607772175077711e-05, "loss": 0.3586, "num_input_tokens_seen": 114928, "step": 655 }, { "epoch": 2.6506024096385543, "grad_norm": 0.10515565425157547, "learning_rate": 4.598297397370596e-05, "loss": 0.3726, "num_input_tokens_seen": 115728, "step": 660 }, { "epoch": 2.6706827309236947, "grad_norm": 0.6394887566566467, "learning_rate": 4.588719528532342e-05, "loss": 0.3549, "num_input_tokens_seen": 116544, "step": 665 }, { "epoch": 2.6907630522088355, "grad_norm": 0.14106032252311707, "learning_rate": 4.5790390391317675e-05, "loss": 0.3379, "num_input_tokens_seen": 117568, "step": 670 }, { "epoch": 2.710843373493976, "grad_norm": 0.08746648579835892, "learning_rate": 4.5692564047795316e-05, "loss": 0.3688, "num_input_tokens_seen": 118368, "step": 675 }, { "epoch": 2.7309236947791167, "grad_norm": 0.5902343392372131, "learning_rate": 4.5593721061047576e-05, "loss": 0.3455, "num_input_tokens_seen": 119120, "step": 680 }, { "epoch": 2.751004016064257, "grad_norm": 0.57841557264328, "learning_rate": 4.549386628731425e-05, "loss": 0.3575, "num_input_tokens_seen": 120064, "step": 685 }, { "epoch": 2.7710843373493974, "grad_norm": 0.10715785622596741, "learning_rate": 4.5393004632545064e-05, "loss": 0.3721, "num_input_tokens_seen": 120960, "step": 690 }, { "epoch": 2.791164658634538, "grad_norm": 0.09298089146614075, "learning_rate": 4.529114105215869e-05, "loss": 0.3545, "num_input_tokens_seen": 121760, "step": 695 }, { "epoch": 2.8112449799196786, "grad_norm": 0.659833550453186, "learning_rate": 4.518828055079925e-05, "loss": 0.3675, "num_input_tokens_seen": 122720, "step": 700 }, { "epoch": 2.8313253012048194, "grad_norm": 0.600629985332489, "learning_rate": 4.508442818209042e-05, "loss": 0.3543, "num_input_tokens_seen": 123712, "step": 705 }, { "epoch": 2.8514056224899598, "grad_norm": 0.1273653358221054, "learning_rate": 4.4979589048387186e-05, "loss": 0.3561, "num_input_tokens_seen": 124624, "step": 710 }, { "epoch": 2.8714859437751006, "grad_norm": 0.5190867185592651, "learning_rate": 4.487376830052511e-05, "loss": 0.3474, "num_input_tokens_seen": 125696, "step": 715 }, { "epoch": 2.891566265060241, "grad_norm": 0.5262147784233093, "learning_rate": 4.476697113756731e-05, "loss": 0.2977, "num_input_tokens_seen": 126480, "step": 720 }, { "epoch": 2.9116465863453813, "grad_norm": 0.9929115772247314, "learning_rate": 4.465920280654901e-05, "loss": 0.3658, "num_input_tokens_seen": 127312, "step": 725 }, { "epoch": 2.931726907630522, "grad_norm": 0.26060280203819275, "learning_rate": 4.4550468602219716e-05, "loss": 0.3475, "num_input_tokens_seen": 128352, "step": 730 }, { "epoch": 2.9518072289156625, "grad_norm": 0.2218063920736313, "learning_rate": 4.4440773866783136e-05, "loss": 0.4262, "num_input_tokens_seen": 129232, "step": 735 }, { "epoch": 2.9718875502008033, "grad_norm": 0.6310736536979675, "learning_rate": 4.433012398963468e-05, "loss": 0.4037, "num_input_tokens_seen": 130080, "step": 740 }, { "epoch": 2.9919678714859437, "grad_norm": 0.5500450134277344, "learning_rate": 4.421852440709666e-05, "loss": 0.3459, "num_input_tokens_seen": 130880, "step": 745 }, { "epoch": 3.0120481927710845, "grad_norm": 0.5436108112335205, "learning_rate": 4.4105980602151256e-05, "loss": 0.3521, "num_input_tokens_seen": 131872, "step": 750 }, { "epoch": 3.0120481927710845, "eval_loss": 0.35073402523994446, "eval_runtime": 1.2195, "eval_samples_per_second": 45.921, "eval_steps_per_second": 22.961, "num_input_tokens_seen": 131872, "step": 750 }, { "epoch": 3.032128514056225, "grad_norm": 0.5504011511802673, "learning_rate": 4.399249810417108e-05, "loss": 0.354, "num_input_tokens_seen": 132656, "step": 755 }, { "epoch": 3.0522088353413657, "grad_norm": 0.09489892423152924, "learning_rate": 4.387808248864751e-05, "loss": 0.3708, "num_input_tokens_seen": 133472, "step": 760 }, { "epoch": 3.072289156626506, "grad_norm": 0.1529596596956253, "learning_rate": 4.376273937691681e-05, "loss": 0.3463, "num_input_tokens_seen": 134416, "step": 765 }, { "epoch": 3.0923694779116464, "grad_norm": 0.08475496619939804, "learning_rate": 4.364647443588389e-05, "loss": 0.3485, "num_input_tokens_seen": 135344, "step": 770 }, { "epoch": 3.112449799196787, "grad_norm": 0.12930436432361603, "learning_rate": 4.352929337774395e-05, "loss": 0.3382, "num_input_tokens_seen": 136240, "step": 775 }, { "epoch": 3.1325301204819276, "grad_norm": 1.1815729141235352, "learning_rate": 4.341120195970178e-05, "loss": 0.3559, "num_input_tokens_seen": 137120, "step": 780 }, { "epoch": 3.1526104417670684, "grad_norm": 0.3863106071949005, "learning_rate": 4.3292205983688905e-05, "loss": 0.36, "num_input_tokens_seen": 138112, "step": 785 }, { "epoch": 3.1726907630522088, "grad_norm": 0.4967614412307739, "learning_rate": 4.3172311296078595e-05, "loss": 0.3472, "num_input_tokens_seen": 138960, "step": 790 }, { "epoch": 3.1927710843373496, "grad_norm": 0.13387857377529144, "learning_rate": 4.305152378739855e-05, "loss": 0.3646, "num_input_tokens_seen": 140016, "step": 795 }, { "epoch": 3.21285140562249, "grad_norm": 0.11857640743255615, "learning_rate": 4.292984939204155e-05, "loss": 0.3357, "num_input_tokens_seen": 140768, "step": 800 }, { "epoch": 3.2329317269076308, "grad_norm": 0.12806545197963715, "learning_rate": 4.2807294087973834e-05, "loss": 0.3444, "num_input_tokens_seen": 141664, "step": 805 }, { "epoch": 3.253012048192771, "grad_norm": 0.11283908039331436, "learning_rate": 4.2683863896441475e-05, "loss": 0.3541, "num_input_tokens_seen": 142448, "step": 810 }, { "epoch": 3.2730923694779115, "grad_norm": 0.06589579582214355, "learning_rate": 4.255956488167449e-05, "loss": 0.3619, "num_input_tokens_seen": 143408, "step": 815 }, { "epoch": 3.2931726907630523, "grad_norm": 0.6529943346977234, "learning_rate": 4.2434403150588895e-05, "loss": 0.3449, "num_input_tokens_seen": 144256, "step": 820 }, { "epoch": 3.3132530120481927, "grad_norm": 0.618222177028656, "learning_rate": 4.230838485248674e-05, "loss": 0.3504, "num_input_tokens_seen": 145120, "step": 825 }, { "epoch": 3.3333333333333335, "grad_norm": 0.566098690032959, "learning_rate": 4.21815161787539e-05, "loss": 0.3445, "num_input_tokens_seen": 146080, "step": 830 }, { "epoch": 3.353413654618474, "grad_norm": 0.13646817207336426, "learning_rate": 4.205380336255594e-05, "loss": 0.3418, "num_input_tokens_seen": 146912, "step": 835 }, { "epoch": 3.3734939759036147, "grad_norm": 0.21328403055667877, "learning_rate": 4.192525267853188e-05, "loss": 0.2934, "num_input_tokens_seen": 147776, "step": 840 }, { "epoch": 3.393574297188755, "grad_norm": 0.3965797424316406, "learning_rate": 4.179587044248585e-05, "loss": 0.2829, "num_input_tokens_seen": 148768, "step": 845 }, { "epoch": 3.4136546184738954, "grad_norm": 0.24930249154567719, "learning_rate": 4.166566301107687e-05, "loss": 0.5387, "num_input_tokens_seen": 149728, "step": 850 }, { "epoch": 3.433734939759036, "grad_norm": 0.7255300879478455, "learning_rate": 4.153463678150651e-05, "loss": 0.3639, "num_input_tokens_seen": 150784, "step": 855 }, { "epoch": 3.4538152610441766, "grad_norm": 0.11454490572214127, "learning_rate": 4.140279819120457e-05, "loss": 0.3721, "num_input_tokens_seen": 151728, "step": 860 }, { "epoch": 3.4738955823293174, "grad_norm": 0.7094687819480896, "learning_rate": 4.127015371751284e-05, "loss": 0.3656, "num_input_tokens_seen": 152640, "step": 865 }, { "epoch": 3.4939759036144578, "grad_norm": 0.5258346796035767, "learning_rate": 4.1136709877366844e-05, "loss": 0.3193, "num_input_tokens_seen": 153424, "step": 870 }, { "epoch": 3.5140562248995986, "grad_norm": 0.5166998505592346, "learning_rate": 4.100247322697562e-05, "loss": 0.3677, "num_input_tokens_seen": 154416, "step": 875 }, { "epoch": 3.5140562248995986, "eval_loss": 0.3535325825214386, "eval_runtime": 1.2211, "eval_samples_per_second": 45.861, "eval_steps_per_second": 22.931, "num_input_tokens_seen": 154416, "step": 875 }, { "epoch": 3.534136546184739, "grad_norm": 0.49516451358795166, "learning_rate": 4.08674503614997e-05, "loss": 0.3907, "num_input_tokens_seen": 155184, "step": 880 }, { "epoch": 3.5542168674698793, "grad_norm": 0.0980529636144638, "learning_rate": 4.0731647914727004e-05, "loss": 0.3941, "num_input_tokens_seen": 156000, "step": 885 }, { "epoch": 3.57429718875502, "grad_norm": 0.5644952058792114, "learning_rate": 4.059507255874694e-05, "loss": 0.345, "num_input_tokens_seen": 156976, "step": 890 }, { "epoch": 3.5943775100401605, "grad_norm": 0.5101115703582764, "learning_rate": 4.0457731003622606e-05, "loss": 0.3331, "num_input_tokens_seen": 157904, "step": 895 }, { "epoch": 3.6144578313253013, "grad_norm": 0.4910569190979004, "learning_rate": 4.0319629997061116e-05, "loss": 0.3339, "num_input_tokens_seen": 158864, "step": 900 }, { "epoch": 3.6345381526104417, "grad_norm": 0.48415863513946533, "learning_rate": 4.018077632408207e-05, "loss": 0.2827, "num_input_tokens_seen": 159744, "step": 905 }, { "epoch": 3.6546184738955825, "grad_norm": 0.4711949825286865, "learning_rate": 4.004117680668422e-05, "loss": 0.3838, "num_input_tokens_seen": 160608, "step": 910 }, { "epoch": 3.674698795180723, "grad_norm": 0.935171902179718, "learning_rate": 3.990083830351027e-05, "loss": 0.3816, "num_input_tokens_seen": 161488, "step": 915 }, { "epoch": 3.694779116465863, "grad_norm": 0.48552215099334717, "learning_rate": 3.975976770950994e-05, "loss": 0.4066, "num_input_tokens_seen": 162224, "step": 920 }, { "epoch": 3.714859437751004, "grad_norm": 0.5080327391624451, "learning_rate": 3.961797195560118e-05, "loss": 0.3183, "num_input_tokens_seen": 163056, "step": 925 }, { "epoch": 3.734939759036145, "grad_norm": 0.606795072555542, "learning_rate": 3.947545800832967e-05, "loss": 0.3641, "num_input_tokens_seen": 163856, "step": 930 }, { "epoch": 3.755020080321285, "grad_norm": 0.5324833989143372, "learning_rate": 3.9332232869526534e-05, "loss": 0.3394, "num_input_tokens_seen": 164768, "step": 935 }, { "epoch": 3.7751004016064256, "grad_norm": 0.10697121173143387, "learning_rate": 3.918830357596434e-05, "loss": 0.3368, "num_input_tokens_seen": 165600, "step": 940 }, { "epoch": 3.7951807228915664, "grad_norm": 0.13268576562404633, "learning_rate": 3.9043677199011364e-05, "loss": 0.3511, "num_input_tokens_seen": 166400, "step": 945 }, { "epoch": 3.8152610441767068, "grad_norm": 0.12882153689861298, "learning_rate": 3.889836084428422e-05, "loss": 0.328, "num_input_tokens_seen": 167296, "step": 950 }, { "epoch": 3.835341365461847, "grad_norm": 0.14181359112262726, "learning_rate": 3.8752361651298675e-05, "loss": 0.369, "num_input_tokens_seen": 168208, "step": 955 }, { "epoch": 3.855421686746988, "grad_norm": 0.4742559492588043, "learning_rate": 3.860568679311893e-05, "loss": 0.3657, "num_input_tokens_seen": 169056, "step": 960 }, { "epoch": 3.8755020080321287, "grad_norm": 0.1299924999475479, "learning_rate": 3.8458343476005196e-05, "loss": 0.3849, "num_input_tokens_seen": 169888, "step": 965 }, { "epoch": 3.895582329317269, "grad_norm": 0.08048601448535919, "learning_rate": 3.8310338939059644e-05, "loss": 0.3541, "num_input_tokens_seen": 170704, "step": 970 }, { "epoch": 3.9156626506024095, "grad_norm": 0.6629543304443359, "learning_rate": 3.8161680453870715e-05, "loss": 0.3558, "num_input_tokens_seen": 171600, "step": 975 }, { "epoch": 3.9357429718875503, "grad_norm": 0.15418274700641632, "learning_rate": 3.8012375324155904e-05, "loss": 0.3131, "num_input_tokens_seen": 172480, "step": 980 }, { "epoch": 3.9558232931726907, "grad_norm": 0.4927317202091217, "learning_rate": 3.7862430885402876e-05, "loss": 0.3661, "num_input_tokens_seen": 173504, "step": 985 }, { "epoch": 3.9759036144578315, "grad_norm": 0.4768475890159607, "learning_rate": 3.7711854504509135e-05, "loss": 0.3373, "num_input_tokens_seen": 174288, "step": 990 }, { "epoch": 3.995983935742972, "grad_norm": 0.7225349545478821, "learning_rate": 3.756065357941999e-05, "loss": 0.3623, "num_input_tokens_seen": 175104, "step": 995 }, { "epoch": 4.016064257028113, "grad_norm": 0.7921448945999146, "learning_rate": 3.740883553876515e-05, "loss": 0.3426, "num_input_tokens_seen": 176048, "step": 1000 }, { "epoch": 4.016064257028113, "eval_loss": 0.35071006417274475, "eval_runtime": 1.2197, "eval_samples_per_second": 45.913, "eval_steps_per_second": 22.957, "num_input_tokens_seen": 176048, "step": 1000 }, { "epoch": 4.036144578313253, "grad_norm": 0.11272845417261124, "learning_rate": 3.725640784149375e-05, "loss": 0.4204, "num_input_tokens_seen": 176880, "step": 1005 }, { "epoch": 4.056224899598393, "grad_norm": 0.08953544497489929, "learning_rate": 3.710337797650787e-05, "loss": 0.339, "num_input_tokens_seen": 177680, "step": 1010 }, { "epoch": 4.076305220883534, "grad_norm": 0.4361952543258667, "learning_rate": 3.694975346229458e-05, "loss": 0.3311, "num_input_tokens_seen": 178608, "step": 1015 }, { "epoch": 4.096385542168675, "grad_norm": 0.09541574120521545, "learning_rate": 3.679554184655659e-05, "loss": 0.3611, "num_input_tokens_seen": 179600, "step": 1020 }, { "epoch": 4.116465863453815, "grad_norm": 0.5854984521865845, "learning_rate": 3.6640750705841405e-05, "loss": 0.3403, "num_input_tokens_seen": 180464, "step": 1025 }, { "epoch": 4.136546184738956, "grad_norm": 0.10651904344558716, "learning_rate": 3.6485387645169064e-05, "loss": 0.3243, "num_input_tokens_seen": 181344, "step": 1030 }, { "epoch": 4.156626506024097, "grad_norm": 0.5942978262901306, "learning_rate": 3.632946029765856e-05, "loss": 0.3965, "num_input_tokens_seen": 182080, "step": 1035 }, { "epoch": 4.176706827309237, "grad_norm": 0.07312840223312378, "learning_rate": 3.617297632415273e-05, "loss": 0.3719, "num_input_tokens_seen": 182848, "step": 1040 }, { "epoch": 4.196787148594377, "grad_norm": 0.5075451135635376, "learning_rate": 3.601594341284195e-05, "loss": 0.3512, "num_input_tokens_seen": 183840, "step": 1045 }, { "epoch": 4.216867469879518, "grad_norm": 0.047960445284843445, "learning_rate": 3.5858369278886354e-05, "loss": 0.3388, "num_input_tokens_seen": 184720, "step": 1050 }, { "epoch": 4.236947791164659, "grad_norm": 0.08333683758974075, "learning_rate": 3.5700261664036827e-05, "loss": 0.3457, "num_input_tokens_seen": 185504, "step": 1055 }, { "epoch": 4.257028112449799, "grad_norm": 0.0653541311621666, "learning_rate": 3.55416283362546e-05, "loss": 0.3588, "num_input_tokens_seen": 186272, "step": 1060 }, { "epoch": 4.27710843373494, "grad_norm": 0.5113236308097839, "learning_rate": 3.5382477089329646e-05, "loss": 0.3579, "num_input_tokens_seen": 187296, "step": 1065 }, { "epoch": 4.2971887550200805, "grad_norm": 0.07462375611066818, "learning_rate": 3.522281574249774e-05, "loss": 0.348, "num_input_tokens_seen": 188320, "step": 1070 }, { "epoch": 4.317269076305221, "grad_norm": 0.11710739135742188, "learning_rate": 3.5062652140056275e-05, "loss": 0.3282, "num_input_tokens_seen": 189248, "step": 1075 }, { "epoch": 4.337349397590361, "grad_norm": 0.15031148493289948, "learning_rate": 3.490199415097892e-05, "loss": 0.3005, "num_input_tokens_seen": 190432, "step": 1080 }, { "epoch": 4.357429718875502, "grad_norm": 0.6503745913505554, "learning_rate": 3.474084966852897e-05, "loss": 0.4539, "num_input_tokens_seen": 191296, "step": 1085 }, { "epoch": 4.377510040160643, "grad_norm": 0.14889173209667206, "learning_rate": 3.457922660987155e-05, "loss": 0.3682, "num_input_tokens_seen": 192368, "step": 1090 }, { "epoch": 4.397590361445783, "grad_norm": 0.4457005560398102, "learning_rate": 3.441713291568462e-05, "loss": 0.3338, "num_input_tokens_seen": 193232, "step": 1095 }, { "epoch": 4.417670682730924, "grad_norm": 0.5409120321273804, "learning_rate": 3.42545765497689e-05, "loss": 0.3587, "num_input_tokens_seen": 194128, "step": 1100 }, { "epoch": 4.437751004016064, "grad_norm": 0.085002101957798, "learning_rate": 3.409156549865654e-05, "loss": 0.3609, "num_input_tokens_seen": 194944, "step": 1105 }, { "epoch": 4.457831325301205, "grad_norm": 0.49231743812561035, "learning_rate": 3.392810777121876e-05, "loss": 0.3477, "num_input_tokens_seen": 195840, "step": 1110 }, { "epoch": 4.477911646586345, "grad_norm": 0.5549922585487366, "learning_rate": 3.376421139827237e-05, "loss": 0.3871, "num_input_tokens_seen": 196640, "step": 1115 }, { "epoch": 4.497991967871486, "grad_norm": 0.06657743453979492, "learning_rate": 3.3599884432185225e-05, "loss": 0.3481, "num_input_tokens_seen": 197440, "step": 1120 }, { "epoch": 4.518072289156627, "grad_norm": 0.13579045236110687, "learning_rate": 3.343513494648055e-05, "loss": 0.3393, "num_input_tokens_seen": 198432, "step": 1125 }, { "epoch": 4.518072289156627, "eval_loss": 0.3545505702495575, "eval_runtime": 1.4272, "eval_samples_per_second": 39.238, "eval_steps_per_second": 19.619, "num_input_tokens_seen": 198432, "step": 1125 }, { "epoch": 4.538152610441767, "grad_norm": 0.4401510953903198, "learning_rate": 3.326997103544035e-05, "loss": 0.3349, "num_input_tokens_seen": 199232, "step": 1130 }, { "epoch": 4.5582329317269075, "grad_norm": 0.14849136769771576, "learning_rate": 3.310440081370767e-05, "loss": 0.3373, "num_input_tokens_seen": 200144, "step": 1135 }, { "epoch": 4.578313253012048, "grad_norm": 0.621387243270874, "learning_rate": 3.2938432415887984e-05, "loss": 0.3213, "num_input_tokens_seen": 200896, "step": 1140 }, { "epoch": 4.598393574297189, "grad_norm": 0.7517121434211731, "learning_rate": 3.2772073996149435e-05, "loss": 0.3475, "num_input_tokens_seen": 201760, "step": 1145 }, { "epoch": 4.618473895582329, "grad_norm": 0.41686856746673584, "learning_rate": 3.260533372782234e-05, "loss": 0.4032, "num_input_tokens_seen": 202688, "step": 1150 }, { "epoch": 4.63855421686747, "grad_norm": 0.6020703315734863, "learning_rate": 3.24382198029975e-05, "loss": 0.3564, "num_input_tokens_seen": 203392, "step": 1155 }, { "epoch": 4.658634538152611, "grad_norm": 0.40914788842201233, "learning_rate": 3.227074043212383e-05, "loss": 0.322, "num_input_tokens_seen": 204080, "step": 1160 }, { "epoch": 4.678714859437751, "grad_norm": 0.09926328808069229, "learning_rate": 3.2102903843604885e-05, "loss": 0.373, "num_input_tokens_seen": 204816, "step": 1165 }, { "epoch": 4.698795180722891, "grad_norm": 0.08548900485038757, "learning_rate": 3.1934718283394646e-05, "loss": 0.3587, "num_input_tokens_seen": 205616, "step": 1170 }, { "epoch": 4.718875502008032, "grad_norm": 0.11746017634868622, "learning_rate": 3.1766192014592344e-05, "loss": 0.3571, "num_input_tokens_seen": 206512, "step": 1175 }, { "epoch": 4.738955823293173, "grad_norm": 0.4761631190776825, "learning_rate": 3.1597333317036545e-05, "loss": 0.3507, "num_input_tokens_seen": 207424, "step": 1180 }, { "epoch": 4.759036144578313, "grad_norm": 0.5010347366333008, "learning_rate": 3.142815048689828e-05, "loss": 0.3575, "num_input_tokens_seen": 208464, "step": 1185 }, { "epoch": 4.779116465863454, "grad_norm": 0.07341606169939041, "learning_rate": 3.125865183627354e-05, "loss": 0.3579, "num_input_tokens_seen": 209280, "step": 1190 }, { "epoch": 4.7991967871485945, "grad_norm": 0.43029800057411194, "learning_rate": 3.10888456927748e-05, "loss": 0.3327, "num_input_tokens_seen": 210080, "step": 1195 }, { "epoch": 4.8192771084337345, "grad_norm": 0.49401140213012695, "learning_rate": 3.091874039912195e-05, "loss": 0.3619, "num_input_tokens_seen": 210960, "step": 1200 }, { "epoch": 4.839357429718875, "grad_norm": 0.07773241400718689, "learning_rate": 3.074834431273236e-05, "loss": 0.3488, "num_input_tokens_seen": 211776, "step": 1205 }, { "epoch": 4.859437751004016, "grad_norm": 0.4646616280078888, "learning_rate": 3.057766580531031e-05, "loss": 0.3542, "num_input_tokens_seen": 212576, "step": 1210 }, { "epoch": 4.879518072289157, "grad_norm": 0.4323027431964874, "learning_rate": 3.0406713262435656e-05, "loss": 0.3362, "num_input_tokens_seen": 213360, "step": 1215 }, { "epoch": 4.899598393574297, "grad_norm": 0.502923846244812, "learning_rate": 3.0235495083151844e-05, "loss": 0.3814, "num_input_tokens_seen": 214304, "step": 1220 }, { "epoch": 4.919678714859438, "grad_norm": 0.43908852338790894, "learning_rate": 3.0064019679553274e-05, "loss": 0.3492, "num_input_tokens_seen": 215072, "step": 1225 }, { "epoch": 4.9397590361445785, "grad_norm": 0.07500998675823212, "learning_rate": 2.9892295476371988e-05, "loss": 0.3542, "num_input_tokens_seen": 215904, "step": 1230 }, { "epoch": 4.959839357429718, "grad_norm": 0.10780856758356094, "learning_rate": 2.9720330910563772e-05, "loss": 0.3543, "num_input_tokens_seen": 216864, "step": 1235 }, { "epoch": 4.979919678714859, "grad_norm": 0.0808030292391777, "learning_rate": 2.9548134430893604e-05, "loss": 0.3387, "num_input_tokens_seen": 217856, "step": 1240 }, { "epoch": 5.0, "grad_norm": 0.0589243620634079, "learning_rate": 2.9375714497520623e-05, "loss": 0.339, "num_input_tokens_seen": 218864, "step": 1245 }, { "epoch": 5.020080321285141, "grad_norm": 0.05416898429393768, "learning_rate": 2.920307958158241e-05, "loss": 0.3601, "num_input_tokens_seen": 219680, "step": 1250 }, { "epoch": 5.020080321285141, "eval_loss": 0.3591695725917816, "eval_runtime": 1.2186, "eval_samples_per_second": 45.953, "eval_steps_per_second": 22.976, "num_input_tokens_seen": 219680, "step": 1250 }, { "epoch": 5.040160642570281, "grad_norm": 0.40490707755088806, "learning_rate": 2.903023816477885e-05, "loss": 0.3239, "num_input_tokens_seen": 220560, "step": 1255 }, { "epoch": 5.0602409638554215, "grad_norm": 0.12411481887102127, "learning_rate": 2.885719873895536e-05, "loss": 0.3419, "num_input_tokens_seen": 221440, "step": 1260 }, { "epoch": 5.080321285140562, "grad_norm": 0.5238391160964966, "learning_rate": 2.868396980568572e-05, "loss": 0.348, "num_input_tokens_seen": 222304, "step": 1265 }, { "epoch": 5.100401606425703, "grad_norm": 0.358173131942749, "learning_rate": 2.8510559875854377e-05, "loss": 0.2762, "num_input_tokens_seen": 223248, "step": 1270 }, { "epoch": 5.120481927710843, "grad_norm": 0.3846758306026459, "learning_rate": 2.833697746923829e-05, "loss": 0.2662, "num_input_tokens_seen": 224000, "step": 1275 }, { "epoch": 5.140562248995984, "grad_norm": 0.233295738697052, "learning_rate": 2.816323111408835e-05, "loss": 0.3421, "num_input_tokens_seen": 224880, "step": 1280 }, { "epoch": 5.160642570281125, "grad_norm": 0.7878521084785461, "learning_rate": 2.7989329346710375e-05, "loss": 0.4232, "num_input_tokens_seen": 225776, "step": 1285 }, { "epoch": 5.180722891566265, "grad_norm": 0.3618873655796051, "learning_rate": 2.7815280711045717e-05, "loss": 0.3838, "num_input_tokens_seen": 226576, "step": 1290 }, { "epoch": 5.2008032128514055, "grad_norm": 0.14807634055614471, "learning_rate": 2.7641093758251497e-05, "loss": 0.3104, "num_input_tokens_seen": 227360, "step": 1295 }, { "epoch": 5.220883534136546, "grad_norm": 0.3595626950263977, "learning_rate": 2.7466777046280457e-05, "loss": 0.3105, "num_input_tokens_seen": 228112, "step": 1300 }, { "epoch": 5.240963855421687, "grad_norm": 0.6282910108566284, "learning_rate": 2.7292339139460556e-05, "loss": 0.3474, "num_input_tokens_seen": 228992, "step": 1305 }, { "epoch": 5.261044176706827, "grad_norm": 0.357793390750885, "learning_rate": 2.71177886080741e-05, "loss": 0.3076, "num_input_tokens_seen": 229872, "step": 1310 }, { "epoch": 5.281124497991968, "grad_norm": 0.1209518164396286, "learning_rate": 2.69431340279368e-05, "loss": 0.4231, "num_input_tokens_seen": 230720, "step": 1315 }, { "epoch": 5.301204819277109, "grad_norm": 0.09653452038764954, "learning_rate": 2.676838397997633e-05, "loss": 0.3725, "num_input_tokens_seen": 231568, "step": 1320 }, { "epoch": 5.321285140562249, "grad_norm": 0.4242056608200073, "learning_rate": 2.659354704981078e-05, "loss": 0.3237, "num_input_tokens_seen": 232368, "step": 1325 }, { "epoch": 5.341365461847389, "grad_norm": 0.10253465920686722, "learning_rate": 2.6418631827326857e-05, "loss": 0.3534, "num_input_tokens_seen": 233184, "step": 1330 }, { "epoch": 5.36144578313253, "grad_norm": 0.08719359338283539, "learning_rate": 2.6243646906257806e-05, "loss": 0.338, "num_input_tokens_seen": 233984, "step": 1335 }, { "epoch": 5.381526104417671, "grad_norm": 0.5312795639038086, "learning_rate": 2.606860088376126e-05, "loss": 0.3687, "num_input_tokens_seen": 234848, "step": 1340 }, { "epoch": 5.401606425702811, "grad_norm": 0.47353821992874146, "learning_rate": 2.5893502359996786e-05, "loss": 0.3449, "num_input_tokens_seen": 235760, "step": 1345 }, { "epoch": 5.421686746987952, "grad_norm": 0.49804431200027466, "learning_rate": 2.5718359937703408e-05, "loss": 0.3504, "num_input_tokens_seen": 236640, "step": 1350 }, { "epoch": 5.4417670682730925, "grad_norm": 0.5166244506835938, "learning_rate": 2.554318222177689e-05, "loss": 0.3538, "num_input_tokens_seen": 237616, "step": 1355 }, { "epoch": 5.461847389558233, "grad_norm": 0.1073295921087265, "learning_rate": 2.5367977818847034e-05, "loss": 0.3354, "num_input_tokens_seen": 238528, "step": 1360 }, { "epoch": 5.481927710843373, "grad_norm": 0.49680086970329285, "learning_rate": 2.519275533685477e-05, "loss": 0.3354, "num_input_tokens_seen": 239424, "step": 1365 }, { "epoch": 5.502008032128514, "grad_norm": 0.11628666520118713, "learning_rate": 2.5017523384629298e-05, "loss": 0.354, "num_input_tokens_seen": 240272, "step": 1370 }, { "epoch": 5.522088353413655, "grad_norm": 0.11434385180473328, "learning_rate": 2.484229057146507e-05, "loss": 0.3422, "num_input_tokens_seen": 241136, "step": 1375 }, { "epoch": 5.522088353413655, "eval_loss": 0.35063984990119934, "eval_runtime": 1.4257, "eval_samples_per_second": 39.28, "eval_steps_per_second": 19.64, "num_input_tokens_seen": 241136, "step": 1375 }, { "epoch": 5.542168674698795, "grad_norm": 0.5793833136558533, "learning_rate": 2.466706550669886e-05, "loss": 0.3574, "num_input_tokens_seen": 241936, "step": 1380 }, { "epoch": 5.562248995983936, "grad_norm": 0.5316616296768188, "learning_rate": 2.449185679928672e-05, "loss": 0.3748, "num_input_tokens_seen": 242672, "step": 1385 }, { "epoch": 5.582329317269076, "grad_norm": 0.10326164960861206, "learning_rate": 2.431667305738112e-05, "loss": 0.3507, "num_input_tokens_seen": 243808, "step": 1390 }, { "epoch": 5.602409638554217, "grad_norm": 0.46083107590675354, "learning_rate": 2.414152288790787e-05, "loss": 0.3506, "num_input_tokens_seen": 244688, "step": 1395 }, { "epoch": 5.622489959839357, "grad_norm": 0.42732155323028564, "learning_rate": 2.3966414896143385e-05, "loss": 0.3386, "num_input_tokens_seen": 245696, "step": 1400 }, { "epoch": 5.642570281124498, "grad_norm": 0.4096117317676544, "learning_rate": 2.3791357685291863e-05, "loss": 0.3298, "num_input_tokens_seen": 246544, "step": 1405 }, { "epoch": 5.662650602409639, "grad_norm": 0.39851030707359314, "learning_rate": 2.361635985606256e-05, "loss": 0.3413, "num_input_tokens_seen": 247744, "step": 1410 }, { "epoch": 5.682730923694779, "grad_norm": 0.385628879070282, "learning_rate": 2.344143000624729e-05, "loss": 0.3623, "num_input_tokens_seen": 248480, "step": 1415 }, { "epoch": 5.7028112449799195, "grad_norm": 0.38799935579299927, "learning_rate": 2.3266576730297956e-05, "loss": 0.3284, "num_input_tokens_seen": 249312, "step": 1420 }, { "epoch": 5.72289156626506, "grad_norm": 0.11895764619112015, "learning_rate": 2.3091808618904352e-05, "loss": 0.3679, "num_input_tokens_seen": 250304, "step": 1425 }, { "epoch": 5.742971887550201, "grad_norm": 0.3756169080734253, "learning_rate": 2.2917134258572038e-05, "loss": 0.3506, "num_input_tokens_seen": 251216, "step": 1430 }, { "epoch": 5.763052208835341, "grad_norm": 0.5232722163200378, "learning_rate": 2.274256223120051e-05, "loss": 0.3512, "num_input_tokens_seen": 251952, "step": 1435 }, { "epoch": 5.783132530120482, "grad_norm": 0.06907851248979568, "learning_rate": 2.2568101113661577e-05, "loss": 0.3292, "num_input_tokens_seen": 253072, "step": 1440 }, { "epoch": 5.803212851405623, "grad_norm": 0.08446797728538513, "learning_rate": 2.239375947737793e-05, "loss": 0.3499, "num_input_tokens_seen": 253840, "step": 1445 }, { "epoch": 5.823293172690763, "grad_norm": 0.06622636318206787, "learning_rate": 2.221954588790206e-05, "loss": 0.3647, "num_input_tokens_seen": 254640, "step": 1450 }, { "epoch": 5.843373493975903, "grad_norm": 0.03167250007390976, "learning_rate": 2.2045468904495415e-05, "loss": 0.3518, "num_input_tokens_seen": 255456, "step": 1455 }, { "epoch": 5.863453815261044, "grad_norm": 0.07110590487718582, "learning_rate": 2.1871537079707833e-05, "loss": 0.354, "num_input_tokens_seen": 256304, "step": 1460 }, { "epoch": 5.883534136546185, "grad_norm": 0.5281655788421631, "learning_rate": 2.1697758958957448e-05, "loss": 0.3385, "num_input_tokens_seen": 257104, "step": 1465 }, { "epoch": 5.903614457831325, "grad_norm": 0.5472500920295715, "learning_rate": 2.1524143080110716e-05, "loss": 0.3532, "num_input_tokens_seen": 258080, "step": 1470 }, { "epoch": 5.923694779116466, "grad_norm": 0.5764107704162598, "learning_rate": 2.135069797306308e-05, "loss": 0.3701, "num_input_tokens_seen": 259056, "step": 1475 }, { "epoch": 5.943775100401607, "grad_norm": 0.5297293663024902, "learning_rate": 2.1177432159319754e-05, "loss": 0.3721, "num_input_tokens_seen": 260000, "step": 1480 }, { "epoch": 5.9638554216867465, "grad_norm": 0.03520062938332558, "learning_rate": 2.100435415157718e-05, "loss": 0.3517, "num_input_tokens_seen": 260768, "step": 1485 }, { "epoch": 5.983935742971887, "grad_norm": 0.451326847076416, "learning_rate": 2.083147245330468e-05, "loss": 0.3572, "num_input_tokens_seen": 261760, "step": 1490 }, { "epoch": 6.004016064257028, "grad_norm": 0.4301218092441559, "learning_rate": 2.0658795558326743e-05, "loss": 0.3476, "num_input_tokens_seen": 262752, "step": 1495 }, { "epoch": 6.024096385542169, "grad_norm": 0.09133653342723846, "learning_rate": 2.048633195040572e-05, "loss": 0.3609, "num_input_tokens_seen": 263616, "step": 1500 }, { "epoch": 6.024096385542169, "eval_loss": 0.3502369225025177, "eval_runtime": 1.2111, "eval_samples_per_second": 46.238, "eval_steps_per_second": 23.119, "num_input_tokens_seen": 263616, "step": 1500 }, { "epoch": 6.044176706827309, "grad_norm": 0.4608069062232971, "learning_rate": 2.0314090102824963e-05, "loss": 0.3669, "num_input_tokens_seen": 264432, "step": 1505 }, { "epoch": 6.06425702811245, "grad_norm": 0.08459752053022385, "learning_rate": 2.014207847797256e-05, "loss": 0.3542, "num_input_tokens_seen": 265184, "step": 1510 }, { "epoch": 6.0843373493975905, "grad_norm": 0.04717332869768143, "learning_rate": 1.997030552692556e-05, "loss": 0.3509, "num_input_tokens_seen": 266064, "step": 1515 }, { "epoch": 6.104417670682731, "grad_norm": 0.07227271795272827, "learning_rate": 1.9798779689034757e-05, "loss": 0.3483, "num_input_tokens_seen": 266928, "step": 1520 }, { "epoch": 6.124497991967871, "grad_norm": 0.4734782874584198, "learning_rate": 1.9627509391510086e-05, "loss": 0.3542, "num_input_tokens_seen": 267824, "step": 1525 }, { "epoch": 6.144578313253012, "grad_norm": 0.430178701877594, "learning_rate": 1.9456503049006542e-05, "loss": 0.3479, "num_input_tokens_seen": 268608, "step": 1530 }, { "epoch": 6.164658634538153, "grad_norm": 0.4436280131340027, "learning_rate": 1.9285769063210812e-05, "loss": 0.3477, "num_input_tokens_seen": 269696, "step": 1535 }, { "epoch": 6.184738955823293, "grad_norm": 0.44335198402404785, "learning_rate": 1.9115315822428437e-05, "loss": 0.351, "num_input_tokens_seen": 270704, "step": 1540 }, { "epoch": 6.204819277108434, "grad_norm": 0.0994335189461708, "learning_rate": 1.8945151701171755e-05, "loss": 0.3447, "num_input_tokens_seen": 271568, "step": 1545 }, { "epoch": 6.224899598393574, "grad_norm": 0.42669767141342163, "learning_rate": 1.877528505974838e-05, "loss": 0.3386, "num_input_tokens_seen": 272304, "step": 1550 }, { "epoch": 6.244979919678715, "grad_norm": 0.091577909886837, "learning_rate": 1.8605724243850502e-05, "loss": 0.3302, "num_input_tokens_seen": 273152, "step": 1555 }, { "epoch": 6.265060240963855, "grad_norm": 0.5046164393424988, "learning_rate": 1.8436477584144863e-05, "loss": 0.3962, "num_input_tokens_seen": 274112, "step": 1560 }, { "epoch": 6.285140562248996, "grad_norm": 0.1121777817606926, "learning_rate": 1.826755339586341e-05, "loss": 0.3337, "num_input_tokens_seen": 274944, "step": 1565 }, { "epoch": 6.305220883534137, "grad_norm": 0.40517058968544006, "learning_rate": 1.809895997839482e-05, "loss": 0.3484, "num_input_tokens_seen": 275712, "step": 1570 }, { "epoch": 6.325301204819277, "grad_norm": 0.09261249750852585, "learning_rate": 1.793070561487672e-05, "loss": 0.3391, "num_input_tokens_seen": 276560, "step": 1575 }, { "epoch": 6.3453815261044175, "grad_norm": 0.49878165125846863, "learning_rate": 1.7762798571788707e-05, "loss": 0.3948, "num_input_tokens_seen": 277456, "step": 1580 }, { "epoch": 6.365461847389558, "grad_norm": 0.415039598941803, "learning_rate": 1.759524709854626e-05, "loss": 0.3246, "num_input_tokens_seen": 278352, "step": 1585 }, { "epoch": 6.385542168674699, "grad_norm": 0.11119506508111954, "learning_rate": 1.742805942709538e-05, "loss": 0.3468, "num_input_tokens_seen": 279264, "step": 1590 }, { "epoch": 6.405622489959839, "grad_norm": 0.10621926933526993, "learning_rate": 1.7261243771508208e-05, "loss": 0.3428, "num_input_tokens_seen": 280144, "step": 1595 }, { "epoch": 6.42570281124498, "grad_norm": 0.10251349955797195, "learning_rate": 1.70948083275794e-05, "loss": 0.3439, "num_input_tokens_seen": 281008, "step": 1600 }, { "epoch": 6.445783132530121, "grad_norm": 0.4213389456272125, "learning_rate": 1.6928761272423522e-05, "loss": 0.3717, "num_input_tokens_seen": 281792, "step": 1605 }, { "epoch": 6.4658634538152615, "grad_norm": 0.08103923499584198, "learning_rate": 1.6763110764073235e-05, "loss": 0.3517, "num_input_tokens_seen": 282560, "step": 1610 }, { "epoch": 6.485943775100401, "grad_norm": 0.46979421377182007, "learning_rate": 1.6597864941078552e-05, "loss": 0.3423, "num_input_tokens_seen": 283440, "step": 1615 }, { "epoch": 6.506024096385542, "grad_norm": 0.12765252590179443, "learning_rate": 1.643303192210693e-05, "loss": 0.358, "num_input_tokens_seen": 284592, "step": 1620 }, { "epoch": 6.526104417670683, "grad_norm": 0.06467333436012268, "learning_rate": 1.626861980554441e-05, "loss": 0.3457, "num_input_tokens_seen": 285424, "step": 1625 }, { "epoch": 6.526104417670683, "eval_loss": 0.3553968071937561, "eval_runtime": 1.2158, "eval_samples_per_second": 46.06, "eval_steps_per_second": 23.03, "num_input_tokens_seen": 285424, "step": 1625 }, { "epoch": 6.546184738955823, "grad_norm": 0.41862520575523376, "learning_rate": 1.6104636669097776e-05, "loss": 0.3518, "num_input_tokens_seen": 286272, "step": 1630 }, { "epoch": 6.566265060240964, "grad_norm": 0.0668744370341301, "learning_rate": 1.5941090569397616e-05, "loss": 0.3512, "num_input_tokens_seen": 287200, "step": 1635 }, { "epoch": 6.586345381526105, "grad_norm": 0.08563435077667236, "learning_rate": 1.5777989541602533e-05, "loss": 0.348, "num_input_tokens_seen": 288224, "step": 1640 }, { "epoch": 6.606425702811245, "grad_norm": 0.44376978278160095, "learning_rate": 1.561534159900441e-05, "loss": 0.3353, "num_input_tokens_seen": 289216, "step": 1645 }, { "epoch": 6.626506024096385, "grad_norm": 0.12750263512134552, "learning_rate": 1.5453154732634616e-05, "loss": 0.3476, "num_input_tokens_seen": 290080, "step": 1650 }, { "epoch": 6.646586345381526, "grad_norm": 0.4618399739265442, "learning_rate": 1.52914369108715e-05, "loss": 0.351, "num_input_tokens_seen": 290880, "step": 1655 }, { "epoch": 6.666666666666667, "grad_norm": 0.07694806158542633, "learning_rate": 1.513019607904882e-05, "loss": 0.3607, "num_input_tokens_seen": 291728, "step": 1660 }, { "epoch": 6.686746987951807, "grad_norm": 0.09036281704902649, "learning_rate": 1.4969440159065439e-05, "loss": 0.3573, "num_input_tokens_seen": 292624, "step": 1665 }, { "epoch": 6.706827309236948, "grad_norm": 0.08298249542713165, "learning_rate": 1.4809177048996064e-05, "loss": 0.3476, "num_input_tokens_seen": 293488, "step": 1670 }, { "epoch": 6.7269076305220885, "grad_norm": 0.4784527122974396, "learning_rate": 1.464941462270325e-05, "loss": 0.3477, "num_input_tokens_seen": 294400, "step": 1675 }, { "epoch": 6.746987951807229, "grad_norm": 0.45142361521720886, "learning_rate": 1.449016072945053e-05, "loss": 0.357, "num_input_tokens_seen": 295184, "step": 1680 }, { "epoch": 6.767068273092369, "grad_norm": 0.4861927330493927, "learning_rate": 1.4331423193516768e-05, "loss": 0.3575, "num_input_tokens_seen": 296176, "step": 1685 }, { "epoch": 6.78714859437751, "grad_norm": 0.41130194067955017, "learning_rate": 1.4173209813811788e-05, "loss": 0.3358, "num_input_tokens_seen": 297072, "step": 1690 }, { "epoch": 6.807228915662651, "grad_norm": 0.40937137603759766, "learning_rate": 1.4015528363493125e-05, "loss": 0.3491, "num_input_tokens_seen": 297856, "step": 1695 }, { "epoch": 6.827309236947791, "grad_norm": 0.40058666467666626, "learning_rate": 1.3858386589584187e-05, "loss": 0.3253, "num_input_tokens_seen": 298896, "step": 1700 }, { "epoch": 6.847389558232932, "grad_norm": 0.11114007234573364, "learning_rate": 1.3701792212593662e-05, "loss": 0.3302, "num_input_tokens_seen": 299712, "step": 1705 }, { "epoch": 6.867469879518072, "grad_norm": 0.11977384239435196, "learning_rate": 1.354575292613611e-05, "loss": 0.3882, "num_input_tokens_seen": 300720, "step": 1710 }, { "epoch": 6.887550200803213, "grad_norm": 0.5076762437820435, "learning_rate": 1.3390276396554052e-05, "loss": 0.3658, "num_input_tokens_seen": 301552, "step": 1715 }, { "epoch": 6.907630522088353, "grad_norm": 0.07989758253097534, "learning_rate": 1.3235370262541272e-05, "loss": 0.3388, "num_input_tokens_seen": 302352, "step": 1720 }, { "epoch": 6.927710843373494, "grad_norm": 0.38454264402389526, "learning_rate": 1.3081042134767554e-05, "loss": 0.3335, "num_input_tokens_seen": 303232, "step": 1725 }, { "epoch": 6.947791164658635, "grad_norm": 0.07989054918289185, "learning_rate": 1.292729959550473e-05, "loss": 0.3262, "num_input_tokens_seen": 304016, "step": 1730 }, { "epoch": 6.967871485943775, "grad_norm": 0.3905481994152069, "learning_rate": 1.277415019825417e-05, "loss": 0.3396, "num_input_tokens_seen": 304944, "step": 1735 }, { "epoch": 6.9879518072289155, "grad_norm": 0.39736074209213257, "learning_rate": 1.2621601467375684e-05, "loss": 0.3422, "num_input_tokens_seen": 305984, "step": 1740 }, { "epoch": 7.008032128514056, "grad_norm": 0.1431533843278885, "learning_rate": 1.2469660897717816e-05, "loss": 0.3182, "num_input_tokens_seen": 306992, "step": 1745 }, { "epoch": 7.028112449799197, "grad_norm": 0.39490222930908203, "learning_rate": 1.2318335954249669e-05, "loss": 0.315, "num_input_tokens_seen": 307792, "step": 1750 }, { "epoch": 7.028112449799197, "eval_loss": 0.36506387591362, "eval_runtime": 1.2174, "eval_samples_per_second": 45.998, "eval_steps_per_second": 22.999, "num_input_tokens_seen": 307792, "step": 1750 }, { "epoch": 7.048192771084337, "grad_norm": 0.13177447021007538, "learning_rate": 1.2167634071694081e-05, "loss": 0.3174, "num_input_tokens_seen": 308624, "step": 1755 }, { "epoch": 7.068273092369478, "grad_norm": 0.38232582807540894, "learning_rate": 1.2017562654162357e-05, "loss": 0.2887, "num_input_tokens_seen": 309680, "step": 1760 }, { "epoch": 7.088353413654619, "grad_norm": 0.19812007248401642, "learning_rate": 1.1868129074790577e-05, "loss": 0.3394, "num_input_tokens_seen": 310544, "step": 1765 }, { "epoch": 7.108433734939759, "grad_norm": 0.2020581066608429, "learning_rate": 1.1719340675377252e-05, "loss": 0.3113, "num_input_tokens_seen": 311568, "step": 1770 }, { "epoch": 7.128514056224899, "grad_norm": 0.16722743213176727, "learning_rate": 1.1571204766022665e-05, "loss": 0.4907, "num_input_tokens_seen": 312432, "step": 1775 }, { "epoch": 7.14859437751004, "grad_norm": 0.14364704489707947, "learning_rate": 1.1423728624769695e-05, "loss": 0.3627, "num_input_tokens_seen": 313168, "step": 1780 }, { "epoch": 7.168674698795181, "grad_norm": 0.11236248165369034, "learning_rate": 1.1276919497246288e-05, "loss": 0.3648, "num_input_tokens_seen": 313968, "step": 1785 }, { "epoch": 7.188755020080321, "grad_norm": 0.11342489719390869, "learning_rate": 1.1130784596309409e-05, "loss": 0.3585, "num_input_tokens_seen": 314736, "step": 1790 }, { "epoch": 7.208835341365462, "grad_norm": 0.09296204149723053, "learning_rate": 1.098533110169071e-05, "loss": 0.3485, "num_input_tokens_seen": 315664, "step": 1795 }, { "epoch": 7.228915662650603, "grad_norm": 0.1172434464097023, "learning_rate": 1.084056615964377e-05, "loss": 0.3442, "num_input_tokens_seen": 316704, "step": 1800 }, { "epoch": 7.2489959839357425, "grad_norm": 0.0936344638466835, "learning_rate": 1.069649688259299e-05, "loss": 0.388, "num_input_tokens_seen": 317520, "step": 1805 }, { "epoch": 7.269076305220883, "grad_norm": 0.4171283543109894, "learning_rate": 1.0553130348784182e-05, "loss": 0.3306, "num_input_tokens_seen": 318496, "step": 1810 }, { "epoch": 7.289156626506024, "grad_norm": 0.403551310300827, "learning_rate": 1.0410473601936765e-05, "loss": 0.3181, "num_input_tokens_seen": 319344, "step": 1815 }, { "epoch": 7.309236947791165, "grad_norm": 0.08753710985183716, "learning_rate": 1.026853365089773e-05, "loss": 0.3494, "num_input_tokens_seen": 320224, "step": 1820 }, { "epoch": 7.329317269076305, "grad_norm": 0.393200546503067, "learning_rate": 1.0127317469297277e-05, "loss": 0.3193, "num_input_tokens_seen": 320976, "step": 1825 }, { "epoch": 7.349397590361446, "grad_norm": 0.1304997056722641, "learning_rate": 9.986831995206195e-06, "loss": 0.3271, "num_input_tokens_seen": 321808, "step": 1830 }, { "epoch": 7.3694779116465865, "grad_norm": 0.37172961235046387, "learning_rate": 9.847084130795028e-06, "loss": 0.3504, "num_input_tokens_seen": 322624, "step": 1835 }, { "epoch": 7.389558232931727, "grad_norm": 0.3792823255062103, "learning_rate": 9.708080741994868e-06, "loss": 0.3165, "num_input_tokens_seen": 323696, "step": 1840 }, { "epoch": 7.409638554216867, "grad_norm": 0.365556925535202, "learning_rate": 9.569828658160158e-06, "loss": 0.3184, "num_input_tokens_seen": 324496, "step": 1845 }, { "epoch": 7.429718875502008, "grad_norm": 0.10406666994094849, "learning_rate": 9.432334671733039e-06, "loss": 0.3824, "num_input_tokens_seen": 325328, "step": 1850 }, { "epoch": 7.449799196787149, "grad_norm": 0.3579881191253662, "learning_rate": 9.295605537909708e-06, "loss": 0.3336, "num_input_tokens_seen": 326304, "step": 1855 }, { "epoch": 7.469879518072289, "grad_norm": 0.171662375330925, "learning_rate": 9.159647974308494e-06, "loss": 0.3148, "num_input_tokens_seen": 327120, "step": 1860 }, { "epoch": 7.48995983935743, "grad_norm": 0.5380930304527283, "learning_rate": 9.024468660639826e-06, "loss": 0.3811, "num_input_tokens_seen": 328128, "step": 1865 }, { "epoch": 7.51004016064257, "grad_norm": 0.11611666530370712, "learning_rate": 8.890074238378074e-06, "loss": 0.351, "num_input_tokens_seen": 329008, "step": 1870 }, { "epoch": 7.530120481927711, "grad_norm": 0.10266629606485367, "learning_rate": 8.756471310435204e-06, "loss": 0.3149, "num_input_tokens_seen": 329840, "step": 1875 }, { "epoch": 7.530120481927711, "eval_loss": 0.3625907897949219, "eval_runtime": 1.2148, "eval_samples_per_second": 46.1, "eval_steps_per_second": 23.05, "num_input_tokens_seen": 329840, "step": 1875 }, { "epoch": 7.550200803212851, "grad_norm": 0.15210554003715515, "learning_rate": 8.623666440836404e-06, "loss": 0.3623, "num_input_tokens_seen": 330624, "step": 1880 }, { "epoch": 7.570281124497992, "grad_norm": 0.09614621847867966, "learning_rate": 8.491666154397573e-06, "loss": 0.3149, "num_input_tokens_seen": 331440, "step": 1885 }, { "epoch": 7.590361445783133, "grad_norm": 0.11491771787405014, "learning_rate": 8.360476936404754e-06, "loss": 0.3897, "num_input_tokens_seen": 332192, "step": 1890 }, { "epoch": 7.610441767068274, "grad_norm": 0.09532292187213898, "learning_rate": 8.230105232295538e-06, "loss": 0.3736, "num_input_tokens_seen": 333168, "step": 1895 }, { "epoch": 7.6305220883534135, "grad_norm": 0.5067830681800842, "learning_rate": 8.100557447342327e-06, "loss": 0.3618, "num_input_tokens_seen": 334080, "step": 1900 }, { "epoch": 7.650602409638554, "grad_norm": 0.13660845160484314, "learning_rate": 7.971839946337698e-06, "loss": 0.3533, "num_input_tokens_seen": 335040, "step": 1905 }, { "epoch": 7.670682730923695, "grad_norm": 0.41254377365112305, "learning_rate": 7.843959053281663e-06, "loss": 0.3532, "num_input_tokens_seen": 335824, "step": 1910 }, { "epoch": 7.690763052208835, "grad_norm": 0.07652189582586288, "learning_rate": 7.71692105107098e-06, "loss": 0.3362, "num_input_tokens_seen": 336656, "step": 1915 }, { "epoch": 7.710843373493976, "grad_norm": 0.0759321078658104, "learning_rate": 7.590732181190482e-06, "loss": 0.3608, "num_input_tokens_seen": 337488, "step": 1920 }, { "epoch": 7.730923694779117, "grad_norm": 0.0837617963552475, "learning_rate": 7.465398643406366e-06, "loss": 0.342, "num_input_tokens_seen": 338400, "step": 1925 }, { "epoch": 7.7510040160642575, "grad_norm": 0.09123997390270233, "learning_rate": 7.340926595461687e-06, "loss": 0.3573, "num_input_tokens_seen": 339248, "step": 1930 }, { "epoch": 7.771084337349397, "grad_norm": 0.09809573739767075, "learning_rate": 7.217322152773742e-06, "loss": 0.3539, "num_input_tokens_seen": 340112, "step": 1935 }, { "epoch": 7.791164658634538, "grad_norm": 0.4673003554344177, "learning_rate": 7.094591388133659e-06, "loss": 0.3471, "num_input_tokens_seen": 340896, "step": 1940 }, { "epoch": 7.811244979919679, "grad_norm": 0.46651169657707214, "learning_rate": 6.972740331408015e-06, "loss": 0.3599, "num_input_tokens_seen": 341760, "step": 1945 }, { "epoch": 7.831325301204819, "grad_norm": 0.09857732057571411, "learning_rate": 6.851774969242589e-06, "loss": 0.3382, "num_input_tokens_seen": 342608, "step": 1950 }, { "epoch": 7.85140562248996, "grad_norm": 0.14156574010849, "learning_rate": 6.731701244768254e-06, "loss": 0.338, "num_input_tokens_seen": 343632, "step": 1955 }, { "epoch": 7.871485943775101, "grad_norm": 0.08120275288820267, "learning_rate": 6.612525057308949e-06, "loss": 0.3473, "num_input_tokens_seen": 344528, "step": 1960 }, { "epoch": 7.891566265060241, "grad_norm": 0.4519568085670471, "learning_rate": 6.494252262091857e-06, "loss": 0.3505, "num_input_tokens_seen": 345568, "step": 1965 }, { "epoch": 7.911646586345381, "grad_norm": 0.10098995268344879, "learning_rate": 6.3768886699597436e-06, "loss": 0.3443, "num_input_tokens_seen": 346496, "step": 1970 }, { "epoch": 7.931726907630522, "grad_norm": 0.44386422634124756, "learning_rate": 6.260440047085439e-06, "loss": 0.3473, "num_input_tokens_seen": 347360, "step": 1975 }, { "epoch": 7.951807228915663, "grad_norm": 0.06753702461719513, "learning_rate": 6.1449121146885894e-06, "loss": 0.3445, "num_input_tokens_seen": 348128, "step": 1980 }, { "epoch": 7.971887550200803, "grad_norm": 0.42976370453834534, "learning_rate": 6.030310548754506e-06, "loss": 0.3509, "num_input_tokens_seen": 348960, "step": 1985 }, { "epoch": 7.991967871485944, "grad_norm": 0.4281879961490631, "learning_rate": 5.9166409797553415e-06, "loss": 0.3477, "num_input_tokens_seen": 349856, "step": 1990 }, { "epoch": 8.012048192771084, "grad_norm": 0.4621274471282959, "learning_rate": 5.803908992373449e-06, "loss": 0.338, "num_input_tokens_seen": 350784, "step": 1995 }, { "epoch": 8.032128514056225, "grad_norm": 0.08658602088689804, "learning_rate": 5.692120125226993e-06, "loss": 0.3441, "num_input_tokens_seen": 351552, "step": 2000 }, { "epoch": 8.032128514056225, "eval_loss": 0.3484961986541748, "eval_runtime": 1.2167, "eval_samples_per_second": 46.027, "eval_steps_per_second": 23.014, "num_input_tokens_seen": 351552, "step": 2000 }, { "epoch": 8.052208835341366, "grad_norm": 0.0772676169872284, "learning_rate": 5.581279870597867e-06, "loss": 0.3537, "num_input_tokens_seen": 352592, "step": 2005 }, { "epoch": 8.072289156626505, "grad_norm": 0.08750884979963303, "learning_rate": 5.4713936741617845e-06, "loss": 0.3441, "num_input_tokens_seen": 353392, "step": 2010 }, { "epoch": 8.092369477911646, "grad_norm": 0.4653143882751465, "learning_rate": 5.3624669347208085e-06, "loss": 0.3473, "num_input_tokens_seen": 354176, "step": 2015 }, { "epoch": 8.112449799196787, "grad_norm": 0.07544849812984467, "learning_rate": 5.254505003938043e-06, "loss": 0.335, "num_input_tokens_seen": 355040, "step": 2020 }, { "epoch": 8.132530120481928, "grad_norm": 0.4536716043949127, "learning_rate": 5.147513186074751e-06, "loss": 0.3445, "num_input_tokens_seen": 355984, "step": 2025 }, { "epoch": 8.152610441767068, "grad_norm": 0.06866519898176193, "learning_rate": 5.041496737729687e-06, "loss": 0.3443, "num_input_tokens_seen": 356768, "step": 2030 }, { "epoch": 8.17269076305221, "grad_norm": 0.07859829813241959, "learning_rate": 4.936460867580889e-06, "loss": 0.3604, "num_input_tokens_seen": 357648, "step": 2035 }, { "epoch": 8.19277108433735, "grad_norm": 0.48313626646995544, "learning_rate": 4.832410736129778e-06, "loss": 0.3509, "num_input_tokens_seen": 358464, "step": 2040 }, { "epoch": 8.21285140562249, "grad_norm": 0.4338780343532562, "learning_rate": 4.729351455447573e-06, "loss": 0.3421, "num_input_tokens_seen": 359520, "step": 2045 }, { "epoch": 8.23293172690763, "grad_norm": 0.4279603660106659, "learning_rate": 4.627288088924156e-06, "loss": 0.3447, "num_input_tokens_seen": 360256, "step": 2050 }, { "epoch": 8.25301204819277, "grad_norm": 0.09719569236040115, "learning_rate": 4.526225651019309e-06, "loss": 0.3479, "num_input_tokens_seen": 361184, "step": 2055 }, { "epoch": 8.273092369477911, "grad_norm": 0.11000480502843857, "learning_rate": 4.4261691070163316e-06, "loss": 0.3447, "num_input_tokens_seen": 362064, "step": 2060 }, { "epoch": 8.293172690763052, "grad_norm": 0.11856409162282944, "learning_rate": 4.327123372778122e-06, "loss": 0.3415, "num_input_tokens_seen": 362928, "step": 2065 }, { "epoch": 8.313253012048193, "grad_norm": 0.06177099421620369, "learning_rate": 4.229093314505619e-06, "loss": 0.336, "num_input_tokens_seen": 363888, "step": 2070 }, { "epoch": 8.333333333333334, "grad_norm": 0.07357407361268997, "learning_rate": 4.132083748498744e-06, "loss": 0.3572, "num_input_tokens_seen": 364800, "step": 2075 }, { "epoch": 8.353413654618475, "grad_norm": 0.1254434585571289, "learning_rate": 4.036099440919763e-06, "loss": 0.3479, "num_input_tokens_seen": 365680, "step": 2080 }, { "epoch": 8.373493975903614, "grad_norm": 0.12399672716856003, "learning_rate": 3.9411451075591464e-06, "loss": 0.3477, "num_input_tokens_seen": 366560, "step": 2085 }, { "epoch": 8.393574297188755, "grad_norm": 0.4643438458442688, "learning_rate": 3.847225413603839e-06, "loss": 0.3449, "num_input_tokens_seen": 367424, "step": 2090 }, { "epoch": 8.413654618473895, "grad_norm": 0.09002748876810074, "learning_rate": 3.754344973408064e-06, "loss": 0.3538, "num_input_tokens_seen": 368272, "step": 2095 }, { "epoch": 8.433734939759036, "grad_norm": 0.06885527074337006, "learning_rate": 3.6625083502666554e-06, "loss": 0.3472, "num_input_tokens_seen": 369040, "step": 2100 }, { "epoch": 8.453815261044177, "grad_norm": 0.4609389007091522, "learning_rate": 3.5717200561908026e-06, "loss": 0.3411, "num_input_tokens_seen": 369808, "step": 2105 }, { "epoch": 8.473895582329318, "grad_norm": 0.09589619934558868, "learning_rate": 3.481984551686429e-06, "loss": 0.3383, "num_input_tokens_seen": 370672, "step": 2110 }, { "epoch": 8.493975903614459, "grad_norm": 0.11223804205656052, "learning_rate": 3.3933062455349744e-06, "loss": 0.3417, "num_input_tokens_seen": 371520, "step": 2115 }, { "epoch": 8.514056224899598, "grad_norm": 0.47606217861175537, "learning_rate": 3.305689494576847e-06, "loss": 0.36, "num_input_tokens_seen": 372368, "step": 2120 }, { "epoch": 8.534136546184738, "grad_norm": 0.5172960162162781, "learning_rate": 3.2191386034973627e-06, "loss": 0.3574, "num_input_tokens_seen": 373424, "step": 2125 }, { "epoch": 8.534136546184738, "eval_loss": 0.3515996038913727, "eval_runtime": 1.2192, "eval_samples_per_second": 45.93, "eval_steps_per_second": 22.965, "num_input_tokens_seen": 373424, "step": 2125 }, { "epoch": 8.55421686746988, "grad_norm": 0.4503559172153473, "learning_rate": 3.1336578246152103e-06, "loss": 0.3443, "num_input_tokens_seen": 374240, "step": 2130 }, { "epoch": 8.57429718875502, "grad_norm": 0.11173038929700851, "learning_rate": 3.049251357673577e-06, "loss": 0.3383, "num_input_tokens_seen": 375104, "step": 2135 }, { "epoch": 8.594377510040161, "grad_norm": 0.09678234905004501, "learning_rate": 2.9659233496337786e-06, "loss": 0.3476, "num_input_tokens_seen": 376080, "step": 2140 }, { "epoch": 8.614457831325302, "grad_norm": 0.08732342720031738, "learning_rate": 2.8836778944715454e-06, "loss": 0.3415, "num_input_tokens_seen": 376928, "step": 2145 }, { "epoch": 8.634538152610443, "grad_norm": 0.12217875570058823, "learning_rate": 2.802519032975859e-06, "loss": 0.351, "num_input_tokens_seen": 377856, "step": 2150 }, { "epoch": 8.654618473895582, "grad_norm": 0.4865402579307556, "learning_rate": 2.722450752550429e-06, "loss": 0.3417, "num_input_tokens_seen": 378784, "step": 2155 }, { "epoch": 8.674698795180722, "grad_norm": 0.4950112998485565, "learning_rate": 2.6434769870177985e-06, "loss": 0.3604, "num_input_tokens_seen": 379696, "step": 2160 }, { "epoch": 8.694779116465863, "grad_norm": 0.07212843000888824, "learning_rate": 2.5656016164260554e-06, "loss": 0.3447, "num_input_tokens_seen": 380512, "step": 2165 }, { "epoch": 8.714859437751004, "grad_norm": 0.09282089024782181, "learning_rate": 2.4888284668582285e-06, "loss": 0.3445, "num_input_tokens_seen": 381520, "step": 2170 }, { "epoch": 8.734939759036145, "grad_norm": 0.08465081453323364, "learning_rate": 2.4131613102442857e-06, "loss": 0.3354, "num_input_tokens_seen": 382480, "step": 2175 }, { "epoch": 8.755020080321286, "grad_norm": 0.45736274123191833, "learning_rate": 2.3386038641758063e-06, "loss": 0.3383, "num_input_tokens_seen": 383440, "step": 2180 }, { "epoch": 8.775100401606426, "grad_norm": 0.46960991621017456, "learning_rate": 2.265159791723373e-06, "loss": 0.3508, "num_input_tokens_seen": 384240, "step": 2185 }, { "epoch": 8.795180722891565, "grad_norm": 0.4292510449886322, "learning_rate": 2.1928327012565696e-06, "loss": 0.3483, "num_input_tokens_seen": 385120, "step": 2190 }, { "epoch": 8.815261044176706, "grad_norm": 0.07858365774154663, "learning_rate": 2.121626146266706e-06, "loss": 0.3546, "num_input_tokens_seen": 385984, "step": 2195 }, { "epoch": 8.835341365461847, "grad_norm": 0.5046152472496033, "learning_rate": 2.051543625192226e-06, "loss": 0.3609, "num_input_tokens_seen": 386896, "step": 2200 }, { "epoch": 8.855421686746988, "grad_norm": 0.43499457836151123, "learning_rate": 1.9825885812468524e-06, "loss": 0.33, "num_input_tokens_seen": 387776, "step": 2205 }, { "epoch": 8.875502008032129, "grad_norm": 0.4260156750679016, "learning_rate": 1.914764402250385e-06, "loss": 0.3487, "num_input_tokens_seen": 388704, "step": 2210 }, { "epoch": 8.89558232931727, "grad_norm": 0.43153613805770874, "learning_rate": 1.8480744204622757e-06, "loss": 0.3512, "num_input_tokens_seen": 389456, "step": 2215 }, { "epoch": 8.91566265060241, "grad_norm": 0.4854128956794739, "learning_rate": 1.7825219124179004e-06, "loss": 0.3522, "num_input_tokens_seen": 390304, "step": 2220 }, { "epoch": 8.93574297188755, "grad_norm": 0.08903124928474426, "learning_rate": 1.7181100987675862e-06, "loss": 0.3356, "num_input_tokens_seen": 391104, "step": 2225 }, { "epoch": 8.95582329317269, "grad_norm": 0.49092555046081543, "learning_rate": 1.6548421441183875e-06, "loss": 0.3516, "num_input_tokens_seen": 392112, "step": 2230 }, { "epoch": 8.975903614457831, "grad_norm": 0.08050279319286346, "learning_rate": 1.5927211568785878e-06, "loss": 0.3449, "num_input_tokens_seen": 392880, "step": 2235 }, { "epoch": 8.995983935742972, "grad_norm": 0.4535515308380127, "learning_rate": 1.5317501891049719e-06, "loss": 0.3302, "num_input_tokens_seen": 393728, "step": 2240 }, { "epoch": 9.016064257028113, "grad_norm": 0.48401492834091187, "learning_rate": 1.4719322363529242e-06, "loss": 0.3487, "num_input_tokens_seen": 394688, "step": 2245 }, { "epoch": 9.036144578313253, "grad_norm": 0.4877987205982208, "learning_rate": 1.4132702375291989e-06, "loss": 0.3673, "num_input_tokens_seen": 395616, "step": 2250 }, { "epoch": 9.036144578313253, "eval_loss": 0.35450634360313416, "eval_runtime": 1.2106, "eval_samples_per_second": 46.258, "eval_steps_per_second": 23.129, "num_input_tokens_seen": 395616, "step": 2250 }, { "epoch": 9.056224899598394, "grad_norm": 0.4849400520324707, "learning_rate": 1.3557670747475714e-06, "loss": 0.3455, "num_input_tokens_seen": 396560, "step": 2255 }, { "epoch": 9.076305220883533, "grad_norm": 0.4335307478904724, "learning_rate": 1.2994255731871963e-06, "loss": 0.3489, "num_input_tokens_seen": 397456, "step": 2260 }, { "epoch": 9.096385542168674, "grad_norm": 0.10791157931089401, "learning_rate": 1.244248500953854e-06, "loss": 0.3648, "num_input_tokens_seen": 398448, "step": 2265 }, { "epoch": 9.116465863453815, "grad_norm": 0.4895195960998535, "learning_rate": 1.1902385689439022e-06, "loss": 0.3544, "num_input_tokens_seen": 399248, "step": 2270 }, { "epoch": 9.136546184738956, "grad_norm": 0.45951637625694275, "learning_rate": 1.137398430711123e-06, "loss": 0.3574, "num_input_tokens_seen": 400096, "step": 2275 }, { "epoch": 9.156626506024097, "grad_norm": 0.44542670249938965, "learning_rate": 1.085730682336325e-06, "loss": 0.3477, "num_input_tokens_seen": 401024, "step": 2280 }, { "epoch": 9.176706827309237, "grad_norm": 0.10208380967378616, "learning_rate": 1.0352378622998204e-06, "loss": 0.3506, "num_input_tokens_seen": 401856, "step": 2285 }, { "epoch": 9.196787148594378, "grad_norm": 0.43247994780540466, "learning_rate": 9.85922451356694e-07, "loss": 0.3388, "num_input_tokens_seen": 402736, "step": 2290 }, { "epoch": 9.216867469879517, "grad_norm": 0.13525407016277313, "learning_rate": 9.377868724149197e-07, "loss": 0.3413, "num_input_tokens_seen": 403696, "step": 2295 }, { "epoch": 9.236947791164658, "grad_norm": 0.4610027074813843, "learning_rate": 8.908334904163207e-07, "loss": 0.3445, "num_input_tokens_seen": 404480, "step": 2300 }, { "epoch": 9.257028112449799, "grad_norm": 0.4158003330230713, "learning_rate": 8.450646122203865e-07, "loss": 0.3233, "num_input_tokens_seen": 405536, "step": 2305 }, { "epoch": 9.27710843373494, "grad_norm": 0.4854465425014496, "learning_rate": 8.004824864909277e-07, "loss": 0.3513, "num_input_tokens_seen": 406368, "step": 2310 }, { "epoch": 9.29718875502008, "grad_norm": 0.42681699991226196, "learning_rate": 7.570893035856091e-07, "loss": 0.3417, "num_input_tokens_seen": 407184, "step": 2315 }, { "epoch": 9.317269076305221, "grad_norm": 0.48750439286231995, "learning_rate": 7.148871954483105e-07, "loss": 0.3542, "num_input_tokens_seen": 407904, "step": 2320 }, { "epoch": 9.337349397590362, "grad_norm": 0.11087116599082947, "learning_rate": 6.738782355044049e-07, "loss": 0.3387, "num_input_tokens_seen": 408736, "step": 2325 }, { "epoch": 9.357429718875501, "grad_norm": 0.4181234538555145, "learning_rate": 6.340644385588846e-07, "loss": 0.33, "num_input_tokens_seen": 409664, "step": 2330 }, { "epoch": 9.377510040160642, "grad_norm": 0.1247372180223465, "learning_rate": 5.954477606973679e-07, "loss": 0.3643, "num_input_tokens_seen": 410736, "step": 2335 }, { "epoch": 9.397590361445783, "grad_norm": 0.09054780006408691, "learning_rate": 5.580300991899989e-07, "loss": 0.3612, "num_input_tokens_seen": 411680, "step": 2340 }, { "epoch": 9.417670682730924, "grad_norm": 0.44483572244644165, "learning_rate": 5.218132923982267e-07, "loss": 0.3417, "num_input_tokens_seen": 412480, "step": 2345 }, { "epoch": 9.437751004016064, "grad_norm": 0.07839091122150421, "learning_rate": 4.867991196844918e-07, "loss": 0.3352, "num_input_tokens_seen": 413248, "step": 2350 }, { "epoch": 9.457831325301205, "grad_norm": 0.12925294041633606, "learning_rate": 4.5298930132480213e-07, "loss": 0.3606, "num_input_tokens_seen": 414080, "step": 2355 }, { "epoch": 9.477911646586346, "grad_norm": 0.15520651638507843, "learning_rate": 4.203854984242195e-07, "loss": 0.3481, "num_input_tokens_seen": 414928, "step": 2360 }, { "epoch": 9.497991967871485, "grad_norm": 0.4360347092151642, "learning_rate": 3.8898931283523344e-07, "loss": 0.364, "num_input_tokens_seen": 415728, "step": 2365 }, { "epoch": 9.518072289156626, "grad_norm": 0.4386354088783264, "learning_rate": 3.5880228707907417e-07, "loss": 0.3336, "num_input_tokens_seen": 416672, "step": 2370 }, { "epoch": 9.538152610441767, "grad_norm": 0.48464497923851013, "learning_rate": 3.2982590426993145e-07, "loss": 0.3419, "num_input_tokens_seen": 417520, "step": 2375 }, { "epoch": 9.538152610441767, "eval_loss": 0.3474566638469696, "eval_runtime": 1.2124, "eval_samples_per_second": 46.19, "eval_steps_per_second": 23.095, "num_input_tokens_seen": 417520, "step": 2375 }, { "epoch": 9.558232931726907, "grad_norm": 0.43182554841041565, "learning_rate": 3.020615880420713e-07, "loss": 0.3326, "num_input_tokens_seen": 418400, "step": 2380 }, { "epoch": 9.578313253012048, "grad_norm": 0.09252151101827621, "learning_rate": 2.7551070247990305e-07, "loss": 0.3449, "num_input_tokens_seen": 419248, "step": 2385 }, { "epoch": 9.598393574297189, "grad_norm": 0.44276589155197144, "learning_rate": 2.501745520509552e-07, "loss": 0.3481, "num_input_tokens_seen": 420096, "step": 2390 }, { "epoch": 9.61847389558233, "grad_norm": 0.07001478224992752, "learning_rate": 2.2605438154179038e-07, "loss": 0.3386, "num_input_tokens_seen": 420848, "step": 2395 }, { "epoch": 9.638554216867469, "grad_norm": 0.11785867065191269, "learning_rate": 2.0315137599685174e-07, "loss": 0.3294, "num_input_tokens_seen": 421728, "step": 2400 }, { "epoch": 9.65863453815261, "grad_norm": 0.08471374958753586, "learning_rate": 1.814666606602261e-07, "loss": 0.3604, "num_input_tokens_seen": 422656, "step": 2405 }, { "epoch": 9.67871485943775, "grad_norm": 0.4377971589565277, "learning_rate": 1.6100130092037703e-07, "loss": 0.3457, "num_input_tokens_seen": 423600, "step": 2410 }, { "epoch": 9.698795180722891, "grad_norm": 0.12307439744472504, "learning_rate": 1.4175630225778947e-07, "loss": 0.3447, "num_input_tokens_seen": 424448, "step": 2415 }, { "epoch": 9.718875502008032, "grad_norm": 0.11983578652143478, "learning_rate": 1.237326101955677e-07, "loss": 0.3544, "num_input_tokens_seen": 425632, "step": 2420 }, { "epoch": 9.738955823293173, "grad_norm": 0.1070198193192482, "learning_rate": 1.0693111025300017e-07, "loss": 0.3385, "num_input_tokens_seen": 426576, "step": 2425 }, { "epoch": 9.759036144578314, "grad_norm": 0.44310370087623596, "learning_rate": 9.13526279020277e-08, "loss": 0.3296, "num_input_tokens_seen": 427376, "step": 2430 }, { "epoch": 9.779116465863455, "grad_norm": 0.4734679162502289, "learning_rate": 7.699792852670362e-08, "loss": 0.3514, "num_input_tokens_seen": 428256, "step": 2435 }, { "epoch": 9.799196787148594, "grad_norm": 0.13541147112846375, "learning_rate": 6.386771738558506e-08, "loss": 0.3389, "num_input_tokens_seen": 429216, "step": 2440 }, { "epoch": 9.819277108433734, "grad_norm": 0.48132413625717163, "learning_rate": 5.196263957708836e-08, "loss": 0.3542, "num_input_tokens_seen": 430208, "step": 2445 }, { "epoch": 9.839357429718875, "grad_norm": 0.4347268044948578, "learning_rate": 4.1283280007778366e-08, "loss": 0.3292, "num_input_tokens_seen": 430960, "step": 2450 }, { "epoch": 9.859437751004016, "grad_norm": 0.43290168046951294, "learning_rate": 3.1830163363655296e-08, "loss": 0.355, "num_input_tokens_seen": 431936, "step": 2455 }, { "epoch": 9.879518072289157, "grad_norm": 0.10507988184690475, "learning_rate": 2.3603754084358663e-08, "loss": 0.3425, "num_input_tokens_seen": 432912, "step": 2460 }, { "epoch": 9.899598393574298, "grad_norm": 0.4859294593334198, "learning_rate": 1.6604456340352235e-08, "loss": 0.3449, "num_input_tokens_seen": 433696, "step": 2465 }, { "epoch": 9.919678714859439, "grad_norm": 0.1426219940185547, "learning_rate": 1.0832614013073228e-08, "loss": 0.3513, "num_input_tokens_seen": 434528, "step": 2470 }, { "epoch": 9.939759036144578, "grad_norm": 0.08813058584928513, "learning_rate": 6.288510678031934e-09, "loss": 0.3633, "num_input_tokens_seen": 435280, "step": 2475 }, { "epoch": 9.959839357429718, "grad_norm": 0.4306187033653259, "learning_rate": 2.972369590878432e-09, "loss": 0.3483, "num_input_tokens_seen": 436096, "step": 2480 }, { "epoch": 9.97991967871486, "grad_norm": 0.07171786576509476, "learning_rate": 8.843536764419069e-10, "loss": 0.3521, "num_input_tokens_seen": 437008, "step": 2485 }, { "epoch": 10.0, "grad_norm": 0.0965154618024826, "learning_rate": 2.4565520709285417e-11, "loss": 0.3294, "num_input_tokens_seen": 437760, "step": 2490 }, { "epoch": 10.0, "num_input_tokens_seen": 437760, "step": 2490, "total_flos": 1.971213494648832e+16, "train_loss": 0.4594599806580199, "train_runtime": 258.0589, "train_samples_per_second": 19.298, "train_steps_per_second": 9.649 } ], "logging_steps": 5, "max_steps": 2490, "num_input_tokens_seen": 437760, "num_train_epochs": 10, "save_steps": 125, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.971213494648832e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }