diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5617 @@ +{ + "best_metric": 0.03459745645523071, + "best_model_checkpoint": "saves/psy-course/Llama-3.1-8B-Instruct/train/fold10/checkpoint-1700", + "epoch": 9.989417989417989, + "eval_steps": 50, + "global_step": 6490, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.015392015392015393, + "grad_norm": 4.528165340423584, + "learning_rate": 1.5408320493066258e-06, + "loss": 1.5277, + "step": 10 + }, + { + "epoch": 0.030784030784030785, + "grad_norm": 4.624378681182861, + "learning_rate": 3.0816640986132515e-06, + "loss": 1.6084, + "step": 20 + }, + { + "epoch": 0.046176046176046176, + "grad_norm": 4.4105544090271, + "learning_rate": 4.622496147919877e-06, + "loss": 1.5518, + "step": 30 + }, + { + "epoch": 0.06156806156806157, + "grad_norm": 5.693728923797607, + "learning_rate": 6.163328197226503e-06, + "loss": 1.4121, + "step": 40 + }, + { + "epoch": 0.07696007696007696, + "grad_norm": 1.919281244277954, + "learning_rate": 7.704160246533127e-06, + "loss": 0.9905, + "step": 50 + }, + { + "epoch": 0.07696007696007696, + "eval_loss": 0.6973788738250732, + "eval_runtime": 153.3898, + "eval_samples_per_second": 7.53, + "eval_steps_per_second": 7.53, + "step": 50 + }, + { + "epoch": 0.09235209235209235, + "grad_norm": 1.9437800645828247, + "learning_rate": 9.244992295839754e-06, + "loss": 0.789, + "step": 60 + }, + { + "epoch": 0.10774410774410774, + "grad_norm": 1.445709466934204, + "learning_rate": 1.078582434514638e-05, + "loss": 0.5089, + "step": 70 + }, + { + "epoch": 0.12313612313612314, + "grad_norm": 1.1087522506713867, + "learning_rate": 1.2326656394453006e-05, + "loss": 0.4015, + "step": 80 + }, + { + "epoch": 0.13852813852813853, + "grad_norm": 1.0083191394805908, + "learning_rate": 1.386748844375963e-05, + "loss": 0.2548, + "step": 90 + }, + { + "epoch": 0.15392015392015393, + "grad_norm": 0.9103021025657654, + "learning_rate": 1.5408320493066255e-05, + "loss": 0.177, + "step": 100 + }, + { + "epoch": 0.15392015392015393, + "eval_loss": 0.14521953463554382, + "eval_runtime": 153.7007, + "eval_samples_per_second": 7.515, + "eval_steps_per_second": 7.515, + "step": 100 + }, + { + "epoch": 0.1693121693121693, + "grad_norm": 1.0953843593597412, + "learning_rate": 1.694915254237288e-05, + "loss": 0.1578, + "step": 110 + }, + { + "epoch": 0.1847041847041847, + "grad_norm": 1.0190010070800781, + "learning_rate": 1.8489984591679507e-05, + "loss": 0.1082, + "step": 120 + }, + { + "epoch": 0.2000962000962001, + "grad_norm": 1.2451119422912598, + "learning_rate": 2.0030816640986133e-05, + "loss": 0.0967, + "step": 130 + }, + { + "epoch": 0.21548821548821548, + "grad_norm": 1.397735357284546, + "learning_rate": 2.157164869029276e-05, + "loss": 0.0928, + "step": 140 + }, + { + "epoch": 0.23088023088023088, + "grad_norm": 1.1243575811386108, + "learning_rate": 2.3112480739599386e-05, + "loss": 0.0969, + "step": 150 + }, + { + "epoch": 0.23088023088023088, + "eval_loss": 0.07209072262048721, + "eval_runtime": 153.6706, + "eval_samples_per_second": 7.516, + "eval_steps_per_second": 7.516, + "step": 150 + }, + { + "epoch": 0.24627224627224628, + "grad_norm": 0.799592137336731, + "learning_rate": 2.4653312788906012e-05, + "loss": 0.0553, + "step": 160 + }, + { + "epoch": 0.26166426166426165, + "grad_norm": 0.6200801134109497, + "learning_rate": 2.6194144838212635e-05, + "loss": 0.0758, + "step": 170 + }, + { + "epoch": 0.27705627705627706, + "grad_norm": 0.5058329105377197, + "learning_rate": 2.773497688751926e-05, + "loss": 0.0683, + "step": 180 + }, + { + "epoch": 0.29244829244829246, + "grad_norm": 1.4177403450012207, + "learning_rate": 2.9275808936825887e-05, + "loss": 0.0683, + "step": 190 + }, + { + "epoch": 0.30784030784030786, + "grad_norm": 0.865456223487854, + "learning_rate": 3.081664098613251e-05, + "loss": 0.066, + "step": 200 + }, + { + "epoch": 0.30784030784030786, + "eval_loss": 0.058082256466150284, + "eval_runtime": 153.7515, + "eval_samples_per_second": 7.512, + "eval_steps_per_second": 7.512, + "step": 200 + }, + { + "epoch": 0.32323232323232326, + "grad_norm": 1.4013773202896118, + "learning_rate": 3.235747303543914e-05, + "loss": 0.066, + "step": 210 + }, + { + "epoch": 0.3386243386243386, + "grad_norm": 0.7052648067474365, + "learning_rate": 3.389830508474576e-05, + "loss": 0.0719, + "step": 220 + }, + { + "epoch": 0.354016354016354, + "grad_norm": 2.0154361724853516, + "learning_rate": 3.543913713405239e-05, + "loss": 0.0612, + "step": 230 + }, + { + "epoch": 0.3694083694083694, + "grad_norm": 0.7605541348457336, + "learning_rate": 3.6979969183359015e-05, + "loss": 0.068, + "step": 240 + }, + { + "epoch": 0.3848003848003848, + "grad_norm": 1.1585204601287842, + "learning_rate": 3.852080123266564e-05, + "loss": 0.0733, + "step": 250 + }, + { + "epoch": 0.3848003848003848, + "eval_loss": 0.06001831218600273, + "eval_runtime": 153.7629, + "eval_samples_per_second": 7.512, + "eval_steps_per_second": 7.512, + "step": 250 + }, + { + "epoch": 0.4001924001924002, + "grad_norm": 0.8206828236579895, + "learning_rate": 4.006163328197227e-05, + "loss": 0.0614, + "step": 260 + }, + { + "epoch": 0.4155844155844156, + "grad_norm": 0.5927722454071045, + "learning_rate": 4.160246533127889e-05, + "loss": 0.0591, + "step": 270 + }, + { + "epoch": 0.43097643097643096, + "grad_norm": 0.42531347274780273, + "learning_rate": 4.314329738058552e-05, + "loss": 0.042, + "step": 280 + }, + { + "epoch": 0.44636844636844636, + "grad_norm": 0.8253109455108643, + "learning_rate": 4.4684129429892145e-05, + "loss": 0.0609, + "step": 290 + }, + { + "epoch": 0.46176046176046176, + "grad_norm": 0.6391673684120178, + "learning_rate": 4.622496147919877e-05, + "loss": 0.0617, + "step": 300 + }, + { + "epoch": 0.46176046176046176, + "eval_loss": 0.051416847854852676, + "eval_runtime": 153.6283, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 7.518, + "step": 300 + }, + { + "epoch": 0.47715247715247716, + "grad_norm": 0.4509883224964142, + "learning_rate": 4.776579352850539e-05, + "loss": 0.0506, + "step": 310 + }, + { + "epoch": 0.49254449254449256, + "grad_norm": 0.9591169357299805, + "learning_rate": 4.9306625577812024e-05, + "loss": 0.0676, + "step": 320 + }, + { + "epoch": 0.5079365079365079, + "grad_norm": 0.40753549337387085, + "learning_rate": 5.0847457627118643e-05, + "loss": 0.0654, + "step": 330 + }, + { + "epoch": 0.5233285233285233, + "grad_norm": 0.4310671389102936, + "learning_rate": 5.238828967642527e-05, + "loss": 0.0498, + "step": 340 + }, + { + "epoch": 0.5387205387205387, + "grad_norm": 1.1829637289047241, + "learning_rate": 5.39291217257319e-05, + "loss": 0.0509, + "step": 350 + }, + { + "epoch": 0.5387205387205387, + "eval_loss": 0.046002235263586044, + "eval_runtime": 153.8025, + "eval_samples_per_second": 7.51, + "eval_steps_per_second": 7.51, + "step": 350 + }, + { + "epoch": 0.5541125541125541, + "grad_norm": 0.6535453200340271, + "learning_rate": 5.546995377503852e-05, + "loss": 0.044, + "step": 360 + }, + { + "epoch": 0.5695045695045695, + "grad_norm": 0.7604301571846008, + "learning_rate": 5.701078582434515e-05, + "loss": 0.0481, + "step": 370 + }, + { + "epoch": 0.5848965848965849, + "grad_norm": 0.8156557679176331, + "learning_rate": 5.8551617873651774e-05, + "loss": 0.0434, + "step": 380 + }, + { + "epoch": 0.6002886002886003, + "grad_norm": 1.127022624015808, + "learning_rate": 6.009244992295841e-05, + "loss": 0.0433, + "step": 390 + }, + { + "epoch": 0.6156806156806157, + "grad_norm": 0.4061015844345093, + "learning_rate": 6.163328197226502e-05, + "loss": 0.0501, + "step": 400 + }, + { + "epoch": 0.6156806156806157, + "eval_loss": 0.04338805004954338, + "eval_runtime": 153.6359, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 7.518, + "step": 400 + }, + { + "epoch": 0.6310726310726311, + "grad_norm": 0.8806648254394531, + "learning_rate": 6.317411402157165e-05, + "loss": 0.0614, + "step": 410 + }, + { + "epoch": 0.6464646464646465, + "grad_norm": 0.882322371006012, + "learning_rate": 6.471494607087829e-05, + "loss": 0.043, + "step": 420 + }, + { + "epoch": 0.6618566618566618, + "grad_norm": 0.6951248049736023, + "learning_rate": 6.625577812018491e-05, + "loss": 0.0476, + "step": 430 + }, + { + "epoch": 0.6772486772486772, + "grad_norm": 0.5838789343833923, + "learning_rate": 6.779661016949152e-05, + "loss": 0.0511, + "step": 440 + }, + { + "epoch": 0.6926406926406926, + "grad_norm": 0.3744179904460907, + "learning_rate": 6.933744221879815e-05, + "loss": 0.0517, + "step": 450 + }, + { + "epoch": 0.6926406926406926, + "eval_loss": 0.05035843327641487, + "eval_runtime": 153.8129, + "eval_samples_per_second": 7.509, + "eval_steps_per_second": 7.509, + "step": 450 + }, + { + "epoch": 0.708032708032708, + "grad_norm": 0.6012084484100342, + "learning_rate": 7.087827426810478e-05, + "loss": 0.0567, + "step": 460 + }, + { + "epoch": 0.7234247234247234, + "grad_norm": 0.46988222002983093, + "learning_rate": 7.24191063174114e-05, + "loss": 0.0574, + "step": 470 + }, + { + "epoch": 0.7388167388167388, + "grad_norm": 0.7519162893295288, + "learning_rate": 7.395993836671803e-05, + "loss": 0.0482, + "step": 480 + }, + { + "epoch": 0.7542087542087542, + "grad_norm": 0.6140932440757751, + "learning_rate": 7.550077041602466e-05, + "loss": 0.0465, + "step": 490 + }, + { + "epoch": 0.7696007696007696, + "grad_norm": 0.648766279220581, + "learning_rate": 7.704160246533128e-05, + "loss": 0.0623, + "step": 500 + }, + { + "epoch": 0.7696007696007696, + "eval_loss": 0.047655683010816574, + "eval_runtime": 153.6294, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 7.518, + "step": 500 + }, + { + "epoch": 0.784992784992785, + "grad_norm": 0.5732603073120117, + "learning_rate": 7.858243451463791e-05, + "loss": 0.0543, + "step": 510 + }, + { + "epoch": 0.8003848003848004, + "grad_norm": 0.45404285192489624, + "learning_rate": 8.012326656394453e-05, + "loss": 0.0487, + "step": 520 + }, + { + "epoch": 0.8157768157768158, + "grad_norm": 0.46479329466819763, + "learning_rate": 8.166409861325116e-05, + "loss": 0.0369, + "step": 530 + }, + { + "epoch": 0.8311688311688312, + "grad_norm": 0.5144111514091492, + "learning_rate": 8.320493066255779e-05, + "loss": 0.0483, + "step": 540 + }, + { + "epoch": 0.8465608465608465, + "grad_norm": 0.29511991143226624, + "learning_rate": 8.474576271186441e-05, + "loss": 0.0485, + "step": 550 + }, + { + "epoch": 0.8465608465608465, + "eval_loss": 0.04674944281578064, + "eval_runtime": 153.7672, + "eval_samples_per_second": 7.511, + "eval_steps_per_second": 7.511, + "step": 550 + }, + { + "epoch": 0.8619528619528619, + "grad_norm": 0.38855424523353577, + "learning_rate": 8.628659476117104e-05, + "loss": 0.0571, + "step": 560 + }, + { + "epoch": 0.8773448773448773, + "grad_norm": 0.8409668803215027, + "learning_rate": 8.782742681047766e-05, + "loss": 0.0572, + "step": 570 + }, + { + "epoch": 0.8927368927368927, + "grad_norm": 0.385643869638443, + "learning_rate": 8.936825885978429e-05, + "loss": 0.04, + "step": 580 + }, + { + "epoch": 0.9081289081289081, + "grad_norm": 0.31636983156204224, + "learning_rate": 9.090909090909092e-05, + "loss": 0.0505, + "step": 590 + }, + { + "epoch": 0.9235209235209235, + "grad_norm": 0.3808170258998871, + "learning_rate": 9.244992295839754e-05, + "loss": 0.0535, + "step": 600 + }, + { + "epoch": 0.9235209235209235, + "eval_loss": 0.04125829413533211, + "eval_runtime": 153.6957, + "eval_samples_per_second": 7.515, + "eval_steps_per_second": 7.515, + "step": 600 + }, + { + "epoch": 0.9389129389129389, + "grad_norm": 0.4853918254375458, + "learning_rate": 9.399075500770417e-05, + "loss": 0.0453, + "step": 610 + }, + { + "epoch": 0.9543049543049543, + "grad_norm": 0.29903852939605713, + "learning_rate": 9.553158705701078e-05, + "loss": 0.0397, + "step": 620 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.38780346512794495, + "learning_rate": 9.707241910631741e-05, + "loss": 0.0652, + "step": 630 + }, + { + "epoch": 0.9850889850889851, + "grad_norm": 0.2700529098510742, + "learning_rate": 9.861325115562405e-05, + "loss": 0.0399, + "step": 640 + }, + { + "epoch": 1.0004810004810005, + "grad_norm": 0.2887946665287018, + "learning_rate": 9.999999276788487e-05, + "loss": 0.0576, + "step": 650 + }, + { + "epoch": 1.0004810004810005, + "eval_loss": 0.03968934714794159, + "eval_runtime": 153.7815, + "eval_samples_per_second": 7.511, + "eval_steps_per_second": 7.511, + "step": 650 + }, + { + "epoch": 1.0158730158730158, + "grad_norm": 0.23010389506816864, + "learning_rate": 9.999912491660088e-05, + "loss": 0.0419, + "step": 660 + }, + { + "epoch": 1.0312650312650313, + "grad_norm": 0.26416629552841187, + "learning_rate": 9.999681067105788e-05, + "loss": 0.039, + "step": 670 + }, + { + "epoch": 1.0466570466570466, + "grad_norm": 0.2325461506843567, + "learning_rate": 9.999305009820327e-05, + "loss": 0.0372, + "step": 680 + }, + { + "epoch": 1.0620490620490621, + "grad_norm": 0.35149532556533813, + "learning_rate": 9.998784330682439e-05, + "loss": 0.0372, + "step": 690 + }, + { + "epoch": 1.0774410774410774, + "grad_norm": 0.4586612284183502, + "learning_rate": 9.99811904475453e-05, + "loss": 0.0376, + "step": 700 + }, + { + "epoch": 1.0774410774410774, + "eval_loss": 0.04443621635437012, + "eval_runtime": 153.813, + "eval_samples_per_second": 7.509, + "eval_steps_per_second": 7.509, + "step": 700 + }, + { + "epoch": 1.092833092833093, + "grad_norm": 1.4532707929611206, + "learning_rate": 9.997309171282256e-05, + "loss": 0.0404, + "step": 710 + }, + { + "epoch": 1.1082251082251082, + "grad_norm": 1.28654944896698, + "learning_rate": 9.996354733693952e-05, + "loss": 0.043, + "step": 720 + }, + { + "epoch": 1.1236171236171235, + "grad_norm": 0.22073368728160858, + "learning_rate": 9.995255759599961e-05, + "loss": 0.0459, + "step": 730 + }, + { + "epoch": 1.139009139009139, + "grad_norm": 0.49542248249053955, + "learning_rate": 9.994012280791838e-05, + "loss": 0.0466, + "step": 740 + }, + { + "epoch": 1.1544011544011543, + "grad_norm": 0.48700040578842163, + "learning_rate": 9.992624333241425e-05, + "loss": 0.0331, + "step": 750 + }, + { + "epoch": 1.1544011544011543, + "eval_loss": 0.03974996879696846, + "eval_runtime": 153.8765, + "eval_samples_per_second": 7.506, + "eval_steps_per_second": 7.506, + "step": 750 + }, + { + "epoch": 1.1697931697931698, + "grad_norm": 0.128897562623024, + "learning_rate": 9.991091957099808e-05, + "loss": 0.038, + "step": 760 + }, + { + "epoch": 1.1851851851851851, + "grad_norm": 0.11212223768234253, + "learning_rate": 9.98941519669617e-05, + "loss": 0.0308, + "step": 770 + }, + { + "epoch": 1.2005772005772006, + "grad_norm": 0.2181469202041626, + "learning_rate": 9.987594100536486e-05, + "loss": 0.0377, + "step": 780 + }, + { + "epoch": 1.215969215969216, + "grad_norm": 0.5143812298774719, + "learning_rate": 9.985628721302143e-05, + "loss": 0.048, + "step": 790 + }, + { + "epoch": 1.2313612313612314, + "grad_norm": 0.20458604395389557, + "learning_rate": 9.983519115848402e-05, + "loss": 0.0306, + "step": 800 + }, + { + "epoch": 1.2313612313612314, + "eval_loss": 0.03764142468571663, + "eval_runtime": 153.8073, + "eval_samples_per_second": 7.509, + "eval_steps_per_second": 7.509, + "step": 800 + }, + { + "epoch": 1.2467532467532467, + "grad_norm": 0.4800724983215332, + "learning_rate": 9.98126534520275e-05, + "loss": 0.0499, + "step": 810 + }, + { + "epoch": 1.2621452621452622, + "grad_norm": 0.29265281558036804, + "learning_rate": 9.978867474563152e-05, + "loss": 0.0429, + "step": 820 + }, + { + "epoch": 1.2775372775372775, + "grad_norm": 0.17428182065486908, + "learning_rate": 9.976325573296143e-05, + "loss": 0.0402, + "step": 830 + }, + { + "epoch": 1.2929292929292928, + "grad_norm": 0.38409313559532166, + "learning_rate": 9.973639714934844e-05, + "loss": 0.032, + "step": 840 + }, + { + "epoch": 1.3083213083213083, + "grad_norm": 0.7570776343345642, + "learning_rate": 9.970809977176813e-05, + "loss": 0.0328, + "step": 850 + }, + { + "epoch": 1.3083213083213083, + "eval_loss": 0.03785460442304611, + "eval_runtime": 153.6707, + "eval_samples_per_second": 7.516, + "eval_steps_per_second": 7.516, + "step": 850 + }, + { + "epoch": 1.3237133237133238, + "grad_norm": 0.36016738414764404, + "learning_rate": 9.967836441881815e-05, + "loss": 0.0305, + "step": 860 + }, + { + "epoch": 1.3391053391053391, + "grad_norm": 0.3494383990764618, + "learning_rate": 9.964719195069438e-05, + "loss": 0.0426, + "step": 870 + }, + { + "epoch": 1.3544973544973544, + "grad_norm": 0.2895767092704773, + "learning_rate": 9.961458326916624e-05, + "loss": 0.0417, + "step": 880 + }, + { + "epoch": 1.36988936988937, + "grad_norm": 0.24187473952770233, + "learning_rate": 9.958053931755038e-05, + "loss": 0.038, + "step": 890 + }, + { + "epoch": 1.3852813852813852, + "grad_norm": 0.314855694770813, + "learning_rate": 9.95450610806836e-05, + "loss": 0.044, + "step": 900 + }, + { + "epoch": 1.3852813852813852, + "eval_loss": 0.03801371157169342, + "eval_runtime": 153.6363, + "eval_samples_per_second": 7.518, + "eval_steps_per_second": 7.518, + "step": 900 + }, + { + "epoch": 1.4006734006734007, + "grad_norm": 0.33298107981681824, + "learning_rate": 9.95081495848942e-05, + "loss": 0.0452, + "step": 910 + }, + { + "epoch": 1.416065416065416, + "grad_norm": 0.2790783941745758, + "learning_rate": 9.946980589797241e-05, + "loss": 0.0386, + "step": 920 + }, + { + "epoch": 1.4314574314574315, + "grad_norm": 0.3105545938014984, + "learning_rate": 9.943003112913937e-05, + "loss": 0.0432, + "step": 930 + }, + { + "epoch": 1.4468494468494468, + "grad_norm": 0.5141786336898804, + "learning_rate": 9.938882642901522e-05, + "loss": 0.0443, + "step": 940 + }, + { + "epoch": 1.4622414622414621, + "grad_norm": 0.3150622248649597, + "learning_rate": 9.934619298958561e-05, + "loss": 0.0392, + "step": 950 + }, + { + "epoch": 1.4622414622414621, + "eval_loss": 0.036707017570734024, + "eval_runtime": 153.6837, + "eval_samples_per_second": 7.515, + "eval_steps_per_second": 7.515, + "step": 950 + }, + { + "epoch": 1.4776334776334776, + "grad_norm": 0.4975491762161255, + "learning_rate": 9.930213204416738e-05, + "loss": 0.0366, + "step": 960 + }, + { + "epoch": 1.4930254930254931, + "grad_norm": 0.5039690732955933, + "learning_rate": 9.925664486737281e-05, + "loss": 0.0424, + "step": 970 + }, + { + "epoch": 1.5084175084175084, + "grad_norm": 0.3383561968803406, + "learning_rate": 9.920973277507276e-05, + "loss": 0.0444, + "step": 980 + }, + { + "epoch": 1.5238095238095237, + "grad_norm": 0.2685425877571106, + "learning_rate": 9.916139712435859e-05, + "loss": 0.031, + "step": 990 + }, + { + "epoch": 1.5392015392015392, + "grad_norm": 0.3437824547290802, + "learning_rate": 9.911163931350296e-05, + "loss": 0.0314, + "step": 1000 + }, + { + "epoch": 1.5392015392015392, + "eval_loss": 0.041356153786182404, + "eval_runtime": 153.6183, + "eval_samples_per_second": 7.519, + "eval_steps_per_second": 7.519, + "step": 1000 + }, + { + "epoch": 1.5545935545935547, + "grad_norm": 0.27056369185447693, + "learning_rate": 9.906046078191924e-05, + "loss": 0.0385, + "step": 1010 + }, + { + "epoch": 1.56998556998557, + "grad_norm": 0.2805034816265106, + "learning_rate": 9.900786301012008e-05, + "loss": 0.0323, + "step": 1020 + }, + { + "epoch": 1.5853775853775853, + "grad_norm": 0.370043009519577, + "learning_rate": 9.895384751967441e-05, + "loss": 0.0268, + "step": 1030 + }, + { + "epoch": 1.6007696007696008, + "grad_norm": 0.5167951583862305, + "learning_rate": 9.889841587316344e-05, + "loss": 0.0433, + "step": 1040 + }, + { + "epoch": 1.6161616161616161, + "grad_norm": 0.17235764861106873, + "learning_rate": 9.884156967413558e-05, + "loss": 0.0412, + "step": 1050 + }, + { + "epoch": 1.6161616161616161, + "eval_loss": 0.03594866394996643, + "eval_runtime": 153.6739, + "eval_samples_per_second": 7.516, + "eval_steps_per_second": 7.516, + "step": 1050 + }, + { + "epoch": 1.6315536315536314, + "grad_norm": 0.2157604843378067, + "learning_rate": 9.878331056705992e-05, + "loss": 0.0399, + "step": 1060 + }, + { + "epoch": 1.646945646945647, + "grad_norm": 0.317499577999115, + "learning_rate": 9.872364023727873e-05, + "loss": 0.046, + "step": 1070 + }, + { + "epoch": 1.6623376623376624, + "grad_norm": 0.4339614808559418, + "learning_rate": 9.866256041095863e-05, + "loss": 0.0375, + "step": 1080 + }, + { + "epoch": 1.6777296777296777, + "grad_norm": 0.35777151584625244, + "learning_rate": 9.860007285504079e-05, + "loss": 0.0271, + "step": 1090 + }, + { + "epoch": 1.693121693121693, + "grad_norm": 0.3131728172302246, + "learning_rate": 9.853617937718966e-05, + "loss": 0.0384, + "step": 1100 + }, + { + "epoch": 1.693121693121693, + "eval_loss": 0.03828589618206024, + "eval_runtime": 153.7103, + "eval_samples_per_second": 7.514, + "eval_steps_per_second": 7.514, + "step": 1100 + }, + { + "epoch": 1.7085137085137085, + "grad_norm": 0.4421572983264923, + "learning_rate": 9.84708818257408e-05, + "loss": 0.0477, + "step": 1110 + }, + { + "epoch": 1.723905723905724, + "grad_norm": 0.35871562361717224, + "learning_rate": 9.840418208964732e-05, + "loss": 0.0294, + "step": 1120 + }, + { + "epoch": 1.7392977392977393, + "grad_norm": 0.3327915668487549, + "learning_rate": 9.833608209842534e-05, + "loss": 0.0288, + "step": 1130 + }, + { + "epoch": 1.7546897546897546, + "grad_norm": 0.35908883810043335, + "learning_rate": 9.826658382209799e-05, + "loss": 0.0313, + "step": 1140 + }, + { + "epoch": 1.7700817700817701, + "grad_norm": 0.33278927206993103, + "learning_rate": 9.819568927113869e-05, + "loss": 0.0308, + "step": 1150 + }, + { + "epoch": 1.7700817700817701, + "eval_loss": 0.03695591911673546, + "eval_runtime": 153.9078, + "eval_samples_per_second": 7.504, + "eval_steps_per_second": 7.504, + "step": 1150 + }, + { + "epoch": 1.7854737854737854, + "grad_norm": 0.3940715491771698, + "learning_rate": 9.812340049641274e-05, + "loss": 0.0387, + "step": 1160 + }, + { + "epoch": 1.8008658008658007, + "grad_norm": 0.2702500820159912, + "learning_rate": 9.804971958911808e-05, + "loss": 0.03, + "step": 1170 + }, + { + "epoch": 1.8162578162578162, + "grad_norm": 0.3114793002605438, + "learning_rate": 9.797464868072488e-05, + "loss": 0.033, + "step": 1180 + }, + { + "epoch": 1.8316498316498318, + "grad_norm": 0.5738898515701294, + "learning_rate": 9.789818994291375e-05, + "loss": 0.0462, + "step": 1190 + }, + { + "epoch": 1.847041847041847, + "grad_norm": 0.5608885288238525, + "learning_rate": 9.7820345587513e-05, + "loss": 0.0437, + "step": 1200 + }, + { + "epoch": 1.847041847041847, + "eval_loss": 0.03845854848623276, + "eval_runtime": 153.7901, + "eval_samples_per_second": 7.51, + "eval_steps_per_second": 7.51, + "step": 1200 + }, + { + "epoch": 1.8624338624338623, + "grad_norm": 0.3220265507698059, + "learning_rate": 9.77411178664346e-05, + "loss": 0.0302, + "step": 1210 + }, + { + "epoch": 1.8778258778258778, + "grad_norm": 0.12242457270622253, + "learning_rate": 9.766050907160911e-05, + "loss": 0.0353, + "step": 1220 + }, + { + "epoch": 1.8932178932178934, + "grad_norm": 0.2093583047389984, + "learning_rate": 9.757852153491926e-05, + "loss": 0.0293, + "step": 1230 + }, + { + "epoch": 1.9086099086099086, + "grad_norm": 0.3392092287540436, + "learning_rate": 9.749515762813265e-05, + "loss": 0.0414, + "step": 1240 + }, + { + "epoch": 1.924001924001924, + "grad_norm": 0.3785693645477295, + "learning_rate": 9.741041976283298e-05, + "loss": 0.037, + "step": 1250 + }, + { + "epoch": 1.924001924001924, + "eval_loss": 0.03727955371141434, + "eval_runtime": 153.9794, + "eval_samples_per_second": 7.501, + "eval_steps_per_second": 7.501, + "step": 1250 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.17241697013378143, + "learning_rate": 9.732431039035042e-05, + "loss": 0.035, + "step": 1260 + }, + { + "epoch": 1.9547859547859547, + "grad_norm": 0.16306449472904205, + "learning_rate": 9.723683200169058e-05, + "loss": 0.0337, + "step": 1270 + }, + { + "epoch": 1.97017797017797, + "grad_norm": 0.3089943826198578, + "learning_rate": 9.714798712746253e-05, + "loss": 0.0314, + "step": 1280 + }, + { + "epoch": 1.9855699855699855, + "grad_norm": 0.14751343429088593, + "learning_rate": 9.70577783378056e-05, + "loss": 0.0456, + "step": 1290 + }, + { + "epoch": 2.000962000962001, + "grad_norm": 0.21892204880714417, + "learning_rate": 9.696620824231496e-05, + "loss": 0.0375, + "step": 1300 + }, + { + "epoch": 2.000962000962001, + "eval_loss": 0.038531381636857986, + "eval_runtime": 153.7803, + "eval_samples_per_second": 7.511, + "eval_steps_per_second": 7.511, + "step": 1300 + }, + { + "epoch": 2.0163540163540166, + "grad_norm": 0.2808768153190613, + "learning_rate": 9.687327948996617e-05, + "loss": 0.0323, + "step": 1310 + }, + { + "epoch": 2.0317460317460316, + "grad_norm": 0.18679580092430115, + "learning_rate": 9.677899476903857e-05, + "loss": 0.0255, + "step": 1320 + }, + { + "epoch": 2.047138047138047, + "grad_norm": 0.3120351731777191, + "learning_rate": 9.668335680703746e-05, + "loss": 0.0275, + "step": 1330 + }, + { + "epoch": 2.0625300625300627, + "grad_norm": 0.22421617805957794, + "learning_rate": 9.658636837061526e-05, + "loss": 0.024, + "step": 1340 + }, + { + "epoch": 2.0779220779220777, + "grad_norm": 0.30565041303634644, + "learning_rate": 9.648803226549141e-05, + "loss": 0.0241, + "step": 1350 + }, + { + "epoch": 2.0779220779220777, + "eval_loss": 0.036000799387693405, + "eval_runtime": 153.8946, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 7.505, + "step": 1350 + }, + { + "epoch": 2.0933140933140932, + "grad_norm": 0.2510288953781128, + "learning_rate": 9.638835133637129e-05, + "loss": 0.0207, + "step": 1360 + }, + { + "epoch": 2.1087061087061088, + "grad_norm": 0.29693618416786194, + "learning_rate": 9.62873284668638e-05, + "loss": 0.0233, + "step": 1370 + }, + { + "epoch": 2.1240981240981243, + "grad_norm": 0.2484658658504486, + "learning_rate": 9.618496657939809e-05, + "loss": 0.0271, + "step": 1380 + }, + { + "epoch": 2.1394901394901393, + "grad_norm": 0.19184599816799164, + "learning_rate": 9.608126863513889e-05, + "loss": 0.0237, + "step": 1390 + }, + { + "epoch": 2.154882154882155, + "grad_norm": 0.2674761116504669, + "learning_rate": 9.597623763390094e-05, + "loss": 0.0304, + "step": 1400 + }, + { + "epoch": 2.154882154882155, + "eval_loss": 0.03596730902791023, + "eval_runtime": 153.7406, + "eval_samples_per_second": 7.513, + "eval_steps_per_second": 7.513, + "step": 1400 + }, + { + "epoch": 2.1702741702741704, + "grad_norm": 0.3518555164337158, + "learning_rate": 9.586987661406213e-05, + "loss": 0.0238, + "step": 1410 + }, + { + "epoch": 2.185666185666186, + "grad_norm": 0.19268417358398438, + "learning_rate": 9.576218865247573e-05, + "loss": 0.0248, + "step": 1420 + }, + { + "epoch": 2.201058201058201, + "grad_norm": 0.15123145282268524, + "learning_rate": 9.565317686438123e-05, + "loss": 0.0351, + "step": 1430 + }, + { + "epoch": 2.2164502164502164, + "grad_norm": 0.2907862067222595, + "learning_rate": 9.55428444033143e-05, + "loss": 0.021, + "step": 1440 + }, + { + "epoch": 2.231842231842232, + "grad_norm": 0.308349072933197, + "learning_rate": 9.543119446101556e-05, + "loss": 0.0248, + "step": 1450 + }, + { + "epoch": 2.231842231842232, + "eval_loss": 0.03539415821433067, + "eval_runtime": 153.917, + "eval_samples_per_second": 7.504, + "eval_steps_per_second": 7.504, + "step": 1450 + }, + { + "epoch": 2.247234247234247, + "grad_norm": 0.2561543881893158, + "learning_rate": 9.531823026733826e-05, + "loss": 0.028, + "step": 1460 + }, + { + "epoch": 2.2626262626262625, + "grad_norm": 0.2734358012676239, + "learning_rate": 9.520395509015483e-05, + "loss": 0.0244, + "step": 1470 + }, + { + "epoch": 2.278018278018278, + "grad_norm": 0.42557621002197266, + "learning_rate": 9.508837223526232e-05, + "loss": 0.026, + "step": 1480 + }, + { + "epoch": 2.2934102934102936, + "grad_norm": 0.20696111023426056, + "learning_rate": 9.49714850462868e-05, + "loss": 0.0299, + "step": 1490 + }, + { + "epoch": 2.3088023088023086, + "grad_norm": 0.27246183156967163, + "learning_rate": 9.485329690458664e-05, + "loss": 0.0283, + "step": 1500 + }, + { + "epoch": 2.3088023088023086, + "eval_loss": 0.03620808944106102, + "eval_runtime": 153.8969, + "eval_samples_per_second": 7.505, + "eval_steps_per_second": 7.505, + "step": 1500 + }, + { + "epoch": 2.324194324194324, + "grad_norm": 0.2641438841819763, + "learning_rate": 9.473381122915465e-05, + "loss": 0.0263, + "step": 1510 + }, + { + "epoch": 2.3395863395863397, + "grad_norm": 0.15598271787166595, + "learning_rate": 9.461303147651926e-05, + "loss": 0.0214, + "step": 1520 + }, + { + "epoch": 2.354978354978355, + "grad_norm": 0.20637468993663788, + "learning_rate": 9.449096114064442e-05, + "loss": 0.0341, + "step": 1530 + }, + { + "epoch": 2.3703703703703702, + "grad_norm": 0.27665892243385315, + "learning_rate": 9.436760375282859e-05, + "loss": 0.0384, + "step": 1540 + }, + { + "epoch": 2.3857623857623858, + "grad_norm": 0.3268972933292389, + "learning_rate": 9.424296288160257e-05, + "loss": 0.0281, + "step": 1550 + }, + { + "epoch": 2.3857623857623858, + "eval_loss": 0.03699302673339844, + "eval_runtime": 154.0015, + "eval_samples_per_second": 7.5, + "eval_steps_per_second": 7.5, + "step": 1550 + }, + { + "epoch": 2.4011544011544013, + "grad_norm": 0.22946308553218842, + "learning_rate": 9.411704213262629e-05, + "loss": 0.0285, + "step": 1560 + }, + { + "epoch": 2.4165464165464163, + "grad_norm": 0.12260106205940247, + "learning_rate": 9.398984514858446e-05, + "loss": 0.0237, + "step": 1570 + }, + { + "epoch": 2.431938431938432, + "grad_norm": 0.22740353643894196, + "learning_rate": 9.386137560908122e-05, + "loss": 0.0189, + "step": 1580 + }, + { + "epoch": 2.4473304473304474, + "grad_norm": 0.03466098755598068, + "learning_rate": 9.373163723053374e-05, + "loss": 0.0207, + "step": 1590 + }, + { + "epoch": 2.462722462722463, + "grad_norm": 0.2721256613731384, + "learning_rate": 9.360063376606458e-05, + "loss": 0.0331, + "step": 1600 + }, + { + "epoch": 2.462722462722463, + "eval_loss": 0.03893834352493286, + "eval_runtime": 154.0381, + "eval_samples_per_second": 7.498, + "eval_steps_per_second": 7.498, + "step": 1600 + }, + { + "epoch": 2.478114478114478, + "grad_norm": 0.2785675823688507, + "learning_rate": 9.346836900539329e-05, + "loss": 0.0247, + "step": 1610 + }, + { + "epoch": 2.4935064935064934, + "grad_norm": 0.2126145213842392, + "learning_rate": 9.333484677472659e-05, + "loss": 0.0257, + "step": 1620 + }, + { + "epoch": 2.508898508898509, + "grad_norm": 0.38729798793792725, + "learning_rate": 9.320007093664789e-05, + "loss": 0.0362, + "step": 1630 + }, + { + "epoch": 2.5242905242905245, + "grad_norm": 0.280361145734787, + "learning_rate": 9.306404539000537e-05, + "loss": 0.0293, + "step": 1640 + }, + { + "epoch": 2.5396825396825395, + "grad_norm": 0.42176496982574463, + "learning_rate": 9.292677406979929e-05, + "loss": 0.0338, + "step": 1650 + }, + { + "epoch": 2.5396825396825395, + "eval_loss": 0.035883817821741104, + "eval_runtime": 154.0368, + "eval_samples_per_second": 7.498, + "eval_steps_per_second": 7.498, + "step": 1650 + }, + { + "epoch": 2.555074555074555, + "grad_norm": 0.12095620483160019, + "learning_rate": 9.278826094706815e-05, + "loss": 0.0388, + "step": 1660 + }, + { + "epoch": 2.5704665704665706, + "grad_norm": 0.18758133053779602, + "learning_rate": 9.26485100287738e-05, + "loss": 0.0355, + "step": 1670 + }, + { + "epoch": 2.5858585858585856, + "grad_norm": 0.2807534635066986, + "learning_rate": 9.250752535768548e-05, + "loss": 0.0273, + "step": 1680 + }, + { + "epoch": 2.601250601250601, + "grad_norm": 0.27200889587402344, + "learning_rate": 9.236531101226298e-05, + "loss": 0.0203, + "step": 1690 + }, + { + "epoch": 2.6166426166426167, + "grad_norm": 0.43716514110565186, + "learning_rate": 9.222187110653853e-05, + "loss": 0.033, + "step": 1700 + }, + { + "epoch": 2.6166426166426167, + "eval_loss": 0.03459745645523071, + "eval_runtime": 154.1283, + "eval_samples_per_second": 7.494, + "eval_steps_per_second": 7.494, + "step": 1700 + }, + { + "epoch": 2.632034632034632, + "grad_norm": 0.1480332612991333, + "learning_rate": 9.20772097899979e-05, + "loss": 0.0238, + "step": 1710 + }, + { + "epoch": 2.6474266474266477, + "grad_norm": 0.20187431573867798, + "learning_rate": 9.193133124746029e-05, + "loss": 0.0248, + "step": 1720 + }, + { + "epoch": 2.6628186628186628, + "grad_norm": 0.2907063066959381, + "learning_rate": 9.178423969895726e-05, + "loss": 0.0295, + "step": 1730 + }, + { + "epoch": 2.6782106782106783, + "grad_norm": 0.12126291543245316, + "learning_rate": 9.163593939961071e-05, + "loss": 0.0244, + "step": 1740 + }, + { + "epoch": 2.6936026936026938, + "grad_norm": 0.3435959815979004, + "learning_rate": 9.148643463950979e-05, + "loss": 0.0226, + "step": 1750 + }, + { + "epoch": 2.6936026936026938, + "eval_loss": 0.03571978211402893, + "eval_runtime": 154.3427, + "eval_samples_per_second": 7.483, + "eval_steps_per_second": 7.483, + "step": 1750 + }, + { + "epoch": 2.708994708994709, + "grad_norm": 0.27285709977149963, + "learning_rate": 9.133572974358669e-05, + "loss": 0.022, + "step": 1760 + }, + { + "epoch": 2.7243867243867244, + "grad_norm": 0.28213655948638916, + "learning_rate": 9.118382907149165e-05, + "loss": 0.0293, + "step": 1770 + }, + { + "epoch": 2.73977873977874, + "grad_norm": 0.24500757455825806, + "learning_rate": 9.103073701746678e-05, + "loss": 0.0317, + "step": 1780 + }, + { + "epoch": 2.755170755170755, + "grad_norm": 0.22210805118083954, + "learning_rate": 9.087645801021895e-05, + "loss": 0.0209, + "step": 1790 + }, + { + "epoch": 2.7705627705627704, + "grad_norm": 0.4315476715564728, + "learning_rate": 9.072099651279166e-05, + "loss": 0.0281, + "step": 1800 + }, + { + "epoch": 2.7705627705627704, + "eval_loss": 0.03848813474178314, + "eval_runtime": 154.6093, + "eval_samples_per_second": 7.47, + "eval_steps_per_second": 7.47, + "step": 1800 + }, + { + "epoch": 2.785954785954786, + "grad_norm": 0.3140532672405243, + "learning_rate": 9.056435702243601e-05, + "loss": 0.0271, + "step": 1810 + }, + { + "epoch": 2.8013468013468015, + "grad_norm": 0.16396714746952057, + "learning_rate": 9.040654407048046e-05, + "loss": 0.0306, + "step": 1820 + }, + { + "epoch": 2.816738816738817, + "grad_norm": 0.23600676655769348, + "learning_rate": 9.024756222219987e-05, + "loss": 0.0309, + "step": 1830 + }, + { + "epoch": 2.832130832130832, + "grad_norm": 0.41101738810539246, + "learning_rate": 9.008741607668341e-05, + "loss": 0.0286, + "step": 1840 + }, + { + "epoch": 2.8475228475228476, + "grad_norm": 0.3991663455963135, + "learning_rate": 8.992611026670143e-05, + "loss": 0.0181, + "step": 1850 + }, + { + "epoch": 2.8475228475228476, + "eval_loss": 0.04208507016301155, + "eval_runtime": 154.4869, + "eval_samples_per_second": 7.476, + "eval_steps_per_second": 7.476, + "step": 1850 + }, + { + "epoch": 2.862914862914863, + "grad_norm": 0.31233450770378113, + "learning_rate": 8.976364945857157e-05, + "loss": 0.0255, + "step": 1860 + }, + { + "epoch": 2.878306878306878, + "grad_norm": 0.38498422503471375, + "learning_rate": 8.96000383520237e-05, + "loss": 0.0348, + "step": 1870 + }, + { + "epoch": 2.8936988936988937, + "grad_norm": 0.2639101445674896, + "learning_rate": 8.943528168006394e-05, + "loss": 0.0253, + "step": 1880 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.13839489221572876, + "learning_rate": 8.926938420883779e-05, + "loss": 0.0245, + "step": 1890 + }, + { + "epoch": 2.9244829244829242, + "grad_norm": 0.10831055045127869, + "learning_rate": 8.910235073749227e-05, + "loss": 0.0234, + "step": 1900 + }, + { + "epoch": 2.9244829244829242, + "eval_loss": 0.03652135282754898, + "eval_runtime": 154.8495, + "eval_samples_per_second": 7.459, + "eval_steps_per_second": 7.459, + "step": 1900 + }, + { + "epoch": 2.9398749398749398, + "grad_norm": 0.2650848925113678, + "learning_rate": 8.893418609803699e-05, + "loss": 0.0273, + "step": 1910 + }, + { + "epoch": 2.9552669552669553, + "grad_norm": 0.31554996967315674, + "learning_rate": 8.876489515520452e-05, + "loss": 0.0272, + "step": 1920 + }, + { + "epoch": 2.9706589706589708, + "grad_norm": 0.4010063409805298, + "learning_rate": 8.859448280630948e-05, + "loss": 0.0348, + "step": 1930 + }, + { + "epoch": 2.9860509860509863, + "grad_norm": 0.29273927211761475, + "learning_rate": 8.842295398110704e-05, + "loss": 0.0315, + "step": 1940 + }, + { + "epoch": 3.0014430014430014, + "grad_norm": 0.35062375664711, + "learning_rate": 8.82503136416502e-05, + "loss": 0.0285, + "step": 1950 + }, + { + "epoch": 3.0014430014430014, + "eval_loss": 0.03746026009321213, + "eval_runtime": 155.0206, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 7.451, + "step": 1950 + }, + { + "epoch": 3.016835016835017, + "grad_norm": 0.09749991446733475, + "learning_rate": 8.807656678214628e-05, + "loss": 0.0201, + "step": 1960 + }, + { + "epoch": 3.0322270322270324, + "grad_norm": 0.2139156609773636, + "learning_rate": 8.790171842881245e-05, + "loss": 0.0175, + "step": 1970 + }, + { + "epoch": 3.0476190476190474, + "grad_norm": 0.21557338535785675, + "learning_rate": 8.772577363973033e-05, + "loss": 0.0138, + "step": 1980 + }, + { + "epoch": 3.063011063011063, + "grad_norm": 0.14884543418884277, + "learning_rate": 8.754873750469964e-05, + "loss": 0.0166, + "step": 1990 + }, + { + "epoch": 3.0784030784030785, + "grad_norm": 0.25472021102905273, + "learning_rate": 8.737061514509102e-05, + "loss": 0.019, + "step": 2000 + }, + { + "epoch": 3.0784030784030785, + "eval_loss": 0.04241637885570526, + "eval_runtime": 155.0191, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 7.451, + "step": 2000 + }, + { + "epoch": 3.093795093795094, + "grad_norm": 0.42326387763023376, + "learning_rate": 8.71914117136978e-05, + "loss": 0.0184, + "step": 2010 + }, + { + "epoch": 3.109187109187109, + "grad_norm": 0.1640986055135727, + "learning_rate": 8.701113239458703e-05, + "loss": 0.0187, + "step": 2020 + }, + { + "epoch": 3.1245791245791246, + "grad_norm": 0.6786589622497559, + "learning_rate": 8.682978240294939e-05, + "loss": 0.0182, + "step": 2030 + }, + { + "epoch": 3.13997113997114, + "grad_norm": 0.3853350281715393, + "learning_rate": 8.664736698494844e-05, + "loss": 0.0132, + "step": 2040 + }, + { + "epoch": 3.1553631553631556, + "grad_norm": 0.2559121251106262, + "learning_rate": 8.646389141756882e-05, + "loss": 0.0201, + "step": 2050 + }, + { + "epoch": 3.1553631553631556, + "eval_loss": 0.04055478423833847, + "eval_runtime": 155.0132, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 7.451, + "step": 2050 + }, + { + "epoch": 3.1707551707551707, + "grad_norm": 0.3448712229728699, + "learning_rate": 8.627936100846355e-05, + "loss": 0.0128, + "step": 2060 + }, + { + "epoch": 3.186147186147186, + "grad_norm": 1.0832544565200806, + "learning_rate": 8.609378109580058e-05, + "loss": 0.0142, + "step": 2070 + }, + { + "epoch": 3.2015392015392017, + "grad_norm": 0.654077410697937, + "learning_rate": 8.590715704810822e-05, + "loss": 0.0189, + "step": 2080 + }, + { + "epoch": 3.2169312169312168, + "grad_norm": 0.29627299308776855, + "learning_rate": 8.571949426412001e-05, + "loss": 0.0166, + "step": 2090 + }, + { + "epoch": 3.2323232323232323, + "grad_norm": 0.2167317122220993, + "learning_rate": 8.553079817261844e-05, + "loss": 0.0229, + "step": 2100 + }, + { + "epoch": 3.2323232323232323, + "eval_loss": 0.038315802812576294, + "eval_runtime": 155.0609, + "eval_samples_per_second": 7.449, + "eval_steps_per_second": 7.449, + "step": 2100 + }, + { + "epoch": 3.2477152477152478, + "grad_norm": 0.2702988088130951, + "learning_rate": 8.53410742322779e-05, + "loss": 0.0145, + "step": 2110 + }, + { + "epoch": 3.2631072631072633, + "grad_norm": 0.46758317947387695, + "learning_rate": 8.515032793150682e-05, + "loss": 0.0265, + "step": 2120 + }, + { + "epoch": 3.2784992784992784, + "grad_norm": 0.19082292914390564, + "learning_rate": 8.495856478828883e-05, + "loss": 0.0204, + "step": 2130 + }, + { + "epoch": 3.293891293891294, + "grad_norm": 0.4500688314437866, + "learning_rate": 8.476579035002323e-05, + "loss": 0.0172, + "step": 2140 + }, + { + "epoch": 3.3092833092833094, + "grad_norm": 0.3187370300292969, + "learning_rate": 8.457201019336445e-05, + "loss": 0.0182, + "step": 2150 + }, + { + "epoch": 3.3092833092833094, + "eval_loss": 0.039496615529060364, + "eval_runtime": 155.0309, + "eval_samples_per_second": 7.45, + "eval_steps_per_second": 7.45, + "step": 2150 + }, + { + "epoch": 3.324675324675325, + "grad_norm": 0.19123513996601105, + "learning_rate": 8.43772299240607e-05, + "loss": 0.015, + "step": 2160 + }, + { + "epoch": 3.34006734006734, + "grad_norm": 0.12203314155340195, + "learning_rate": 8.418145517679187e-05, + "loss": 0.0138, + "step": 2170 + }, + { + "epoch": 3.3554593554593555, + "grad_norm": 0.38870304822921753, + "learning_rate": 8.39846916150065e-05, + "loss": 0.0189, + "step": 2180 + }, + { + "epoch": 3.370851370851371, + "grad_norm": 0.2655882239341736, + "learning_rate": 8.378694493075792e-05, + "loss": 0.0241, + "step": 2190 + }, + { + "epoch": 3.386243386243386, + "grad_norm": 0.4333752393722534, + "learning_rate": 8.358822084453965e-05, + "loss": 0.0189, + "step": 2200 + }, + { + "epoch": 3.386243386243386, + "eval_loss": 0.04004070907831192, + "eval_runtime": 155.013, + "eval_samples_per_second": 7.451, + "eval_steps_per_second": 7.451, + "step": 2200 + }, + { + "epoch": 3.4016354016354016, + "grad_norm": 0.2670556604862213, + "learning_rate": 8.338852510511981e-05, + "loss": 0.0169, + "step": 2210 + }, + { + "epoch": 3.417027417027417, + "grad_norm": 0.17558638751506805, + "learning_rate": 8.318786348937493e-05, + "loss": 0.0196, + "step": 2220 + }, + { + "epoch": 3.4324194324194326, + "grad_norm": 0.2259896844625473, + "learning_rate": 8.298624180212282e-05, + "loss": 0.0187, + "step": 2230 + }, + { + "epoch": 3.4478114478114477, + "grad_norm": 0.1544823795557022, + "learning_rate": 8.278366587595456e-05, + "loss": 0.0149, + "step": 2240 + }, + { + "epoch": 3.463203463203463, + "grad_norm": 0.4848611354827881, + "learning_rate": 8.258014157106584e-05, + "loss": 0.0148, + "step": 2250 + }, + { + "epoch": 3.463203463203463, + "eval_loss": 0.044121548533439636, + "eval_runtime": 155.1082, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 7.446, + "step": 2250 + }, + { + "epoch": 3.4785954785954787, + "grad_norm": 0.07561318576335907, + "learning_rate": 8.237567477508745e-05, + "loss": 0.0254, + "step": 2260 + }, + { + "epoch": 3.493987493987494, + "grad_norm": 0.11404462903738022, + "learning_rate": 8.217027140291485e-05, + "loss": 0.0209, + "step": 2270 + }, + { + "epoch": 3.5093795093795093, + "grad_norm": 0.4680187404155731, + "learning_rate": 8.196393739653727e-05, + "loss": 0.0243, + "step": 2280 + }, + { + "epoch": 3.5247715247715248, + "grad_norm": 0.22974807024002075, + "learning_rate": 8.175667872486558e-05, + "loss": 0.0138, + "step": 2290 + }, + { + "epoch": 3.5401635401635403, + "grad_norm": 0.49656739830970764, + "learning_rate": 8.154850138355978e-05, + "loss": 0.0184, + "step": 2300 + }, + { + "epoch": 3.5401635401635403, + "eval_loss": 0.04184039309620857, + "eval_runtime": 155.073, + "eval_samples_per_second": 7.448, + "eval_steps_per_second": 7.448, + "step": 2300 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 0.27048709988594055, + "learning_rate": 8.133941139485551e-05, + "loss": 0.0145, + "step": 2310 + }, + { + "epoch": 3.570947570947571, + "grad_norm": 0.3945915699005127, + "learning_rate": 8.11294148073898e-05, + "loss": 0.0216, + "step": 2320 + }, + { + "epoch": 3.5863395863395864, + "grad_norm": 0.36210399866104126, + "learning_rate": 8.091851769602614e-05, + "loss": 0.0265, + "step": 2330 + }, + { + "epoch": 3.601731601731602, + "grad_norm": 0.34212827682495117, + "learning_rate": 8.070672616167873e-05, + "loss": 0.0205, + "step": 2340 + }, + { + "epoch": 3.617123617123617, + "grad_norm": 0.2602589726448059, + "learning_rate": 8.0494046331136e-05, + "loss": 0.0186, + "step": 2350 + }, + { + "epoch": 3.617123617123617, + "eval_loss": 0.037504736334085464, + "eval_runtime": 155.1298, + "eval_samples_per_second": 7.445, + "eval_steps_per_second": 7.445, + "step": 2350 + }, + { + "epoch": 3.6325156325156325, + "grad_norm": 0.07770682126283646, + "learning_rate": 8.028048435688333e-05, + "loss": 0.0128, + "step": 2360 + }, + { + "epoch": 3.647907647907648, + "grad_norm": 0.07664130628108978, + "learning_rate": 8.006604641692513e-05, + "loss": 0.0114, + "step": 2370 + }, + { + "epoch": 3.6632996632996635, + "grad_norm": 0.25726816058158875, + "learning_rate": 7.985073871460606e-05, + "loss": 0.0159, + "step": 2380 + }, + { + "epoch": 3.6786916786916786, + "grad_norm": 0.31828832626342773, + "learning_rate": 7.963456747843163e-05, + "loss": 0.0136, + "step": 2390 + }, + { + "epoch": 3.694083694083694, + "grad_norm": 0.32144877314567566, + "learning_rate": 7.941753896188799e-05, + "loss": 0.0178, + "step": 2400 + }, + { + "epoch": 3.694083694083694, + "eval_loss": 0.04591238498687744, + "eval_runtime": 155.0804, + "eval_samples_per_second": 7.448, + "eval_steps_per_second": 7.448, + "step": 2400 + }, + { + "epoch": 3.7094757094757096, + "grad_norm": 0.6328426003456116, + "learning_rate": 7.919965944326101e-05, + "loss": 0.0251, + "step": 2410 + }, + { + "epoch": 3.7248677248677247, + "grad_norm": 0.15846994519233704, + "learning_rate": 7.898093522545472e-05, + "loss": 0.0188, + "step": 2420 + }, + { + "epoch": 3.74025974025974, + "grad_norm": 0.188494473695755, + "learning_rate": 7.876137263580889e-05, + "loss": 0.0186, + "step": 2430 + }, + { + "epoch": 3.7556517556517557, + "grad_norm": 0.3505702316761017, + "learning_rate": 7.85409780259161e-05, + "loss": 0.019, + "step": 2440 + }, + { + "epoch": 3.771043771043771, + "grad_norm": 0.2287614494562149, + "learning_rate": 7.831975777143789e-05, + "loss": 0.0165, + "step": 2450 + }, + { + "epoch": 3.771043771043771, + "eval_loss": 0.04134778305888176, + "eval_runtime": 155.1612, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 7.444, + "step": 2450 + }, + { + "epoch": 3.7864357864357867, + "grad_norm": 0.05540940538048744, + "learning_rate": 7.809771827192037e-05, + "loss": 0.0141, + "step": 2460 + }, + { + "epoch": 3.801827801827802, + "grad_norm": 0.18242235481739044, + "learning_rate": 7.787486595060913e-05, + "loss": 0.0168, + "step": 2470 + }, + { + "epoch": 3.8172198172198173, + "grad_norm": 0.43631306290626526, + "learning_rate": 7.765120725426333e-05, + "loss": 0.0178, + "step": 2480 + }, + { + "epoch": 3.832611832611833, + "grad_norm": 0.17660892009735107, + "learning_rate": 7.742674865296934e-05, + "loss": 0.0199, + "step": 2490 + }, + { + "epoch": 3.848003848003848, + "grad_norm": 0.27444711327552795, + "learning_rate": 7.72014966399534e-05, + "loss": 0.0272, + "step": 2500 + }, + { + "epoch": 3.848003848003848, + "eval_loss": 0.040123943239450455, + "eval_runtime": 155.1062, + "eval_samples_per_second": 7.447, + "eval_steps_per_second": 7.447, + "step": 2500 + }, + { + "epoch": 3.8633958633958634, + "grad_norm": 0.18719252943992615, + "learning_rate": 7.697545773139397e-05, + "loss": 0.0244, + "step": 2510 + }, + { + "epoch": 3.878787878787879, + "grad_norm": 0.32853636145591736, + "learning_rate": 7.674863846623304e-05, + "loss": 0.0198, + "step": 2520 + }, + { + "epoch": 3.894179894179894, + "grad_norm": 0.2294764220714569, + "learning_rate": 7.652104540598712e-05, + "loss": 0.0124, + "step": 2530 + }, + { + "epoch": 3.9095719095719095, + "grad_norm": 0.21580101549625397, + "learning_rate": 7.629268513455739e-05, + "loss": 0.0153, + "step": 2540 + }, + { + "epoch": 3.924963924963925, + "grad_norm": 0.9243898391723633, + "learning_rate": 7.606356425803913e-05, + "loss": 0.0154, + "step": 2550 + }, + { + "epoch": 3.924963924963925, + "eval_loss": 0.04535938426852226, + "eval_runtime": 155.2794, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 7.438, + "step": 2550 + }, + { + "epoch": 3.9403559403559405, + "grad_norm": 0.2209847867488861, + "learning_rate": 7.583368940453077e-05, + "loss": 0.0196, + "step": 2560 + }, + { + "epoch": 3.955747955747956, + "grad_norm": 0.5092231631278992, + "learning_rate": 7.560306722394209e-05, + "loss": 0.0217, + "step": 2570 + }, + { + "epoch": 3.971139971139971, + "grad_norm": 0.37853744626045227, + "learning_rate": 7.537170438780177e-05, + "loss": 0.015, + "step": 2580 + }, + { + "epoch": 3.9865319865319866, + "grad_norm": 0.15276499092578888, + "learning_rate": 7.513960758906452e-05, + "loss": 0.021, + "step": 2590 + }, + { + "epoch": 4.001924001924002, + "grad_norm": 0.10910890996456146, + "learning_rate": 7.490678354191739e-05, + "loss": 0.0197, + "step": 2600 + }, + { + "epoch": 4.001924001924002, + "eval_loss": 0.03905920311808586, + "eval_runtime": 155.0851, + "eval_samples_per_second": 7.448, + "eval_steps_per_second": 7.448, + "step": 2600 + }, + { + "epoch": 4.017316017316017, + "grad_norm": 0.39457622170448303, + "learning_rate": 7.467323898158559e-05, + "loss": 0.01, + "step": 2610 + }, + { + "epoch": 4.032708032708033, + "grad_norm": 0.486274778842926, + "learning_rate": 7.443898066413755e-05, + "loss": 0.0116, + "step": 2620 + }, + { + "epoch": 4.048100048100048, + "grad_norm": 0.2256522923707962, + "learning_rate": 7.420401536628962e-05, + "loss": 0.0092, + "step": 2630 + }, + { + "epoch": 4.063492063492063, + "grad_norm": 0.29507842659950256, + "learning_rate": 7.396834988520988e-05, + "loss": 0.0132, + "step": 2640 + }, + { + "epoch": 4.078884078884079, + "grad_norm": 0.16445063054561615, + "learning_rate": 7.373199103832167e-05, + "loss": 0.0104, + "step": 2650 + }, + { + "epoch": 4.078884078884079, + "eval_loss": 0.04517115280032158, + "eval_runtime": 155.1625, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 7.444, + "step": 2650 + }, + { + "epoch": 4.094276094276094, + "grad_norm": 0.042906515300273895, + "learning_rate": 7.349494566310618e-05, + "loss": 0.0102, + "step": 2660 + }, + { + "epoch": 4.109668109668109, + "grad_norm": 0.7587229013442993, + "learning_rate": 7.325722061690489e-05, + "loss": 0.0092, + "step": 2670 + }, + { + "epoch": 4.125060125060125, + "grad_norm": 0.24704445898532867, + "learning_rate": 7.301882277672094e-05, + "loss": 0.0182, + "step": 2680 + }, + { + "epoch": 4.14045214045214, + "grad_norm": 0.19122077524662018, + "learning_rate": 7.277975903902043e-05, + "loss": 0.0114, + "step": 2690 + }, + { + "epoch": 4.1558441558441555, + "grad_norm": 0.20493003726005554, + "learning_rate": 7.25400363195327e-05, + "loss": 0.0096, + "step": 2700 + }, + { + "epoch": 4.1558441558441555, + "eval_loss": 0.04710530862212181, + "eval_runtime": 155.1449, + "eval_samples_per_second": 7.445, + "eval_steps_per_second": 7.445, + "step": 2700 + }, + { + "epoch": 4.171236171236171, + "grad_norm": 0.32011181116104126, + "learning_rate": 7.229966155305047e-05, + "loss": 0.0095, + "step": 2710 + }, + { + "epoch": 4.1866281866281865, + "grad_norm": 0.595345675945282, + "learning_rate": 7.205864169322905e-05, + "loss": 0.0073, + "step": 2720 + }, + { + "epoch": 4.202020202020202, + "grad_norm": 0.36714866757392883, + "learning_rate": 7.181698371238533e-05, + "loss": 0.0153, + "step": 2730 + }, + { + "epoch": 4.2174122174122175, + "grad_norm": 0.1129726693034172, + "learning_rate": 7.157469460129597e-05, + "loss": 0.0086, + "step": 2740 + }, + { + "epoch": 4.232804232804233, + "grad_norm": 0.358989953994751, + "learning_rate": 7.133178136899522e-05, + "loss": 0.0114, + "step": 2750 + }, + { + "epoch": 4.232804232804233, + "eval_loss": 0.049211855977773666, + "eval_runtime": 155.1076, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 7.446, + "step": 2750 + }, + { + "epoch": 4.2481962481962485, + "grad_norm": 0.06870854645967484, + "learning_rate": 7.108825104257215e-05, + "loss": 0.0091, + "step": 2760 + }, + { + "epoch": 4.263588263588264, + "grad_norm": 0.20162701606750488, + "learning_rate": 7.084411066696741e-05, + "loss": 0.0115, + "step": 2770 + }, + { + "epoch": 4.278980278980279, + "grad_norm": 0.022406499832868576, + "learning_rate": 7.059936730476933e-05, + "loss": 0.0117, + "step": 2780 + }, + { + "epoch": 4.294372294372295, + "grad_norm": 0.5594321489334106, + "learning_rate": 7.035402803600971e-05, + "loss": 0.0108, + "step": 2790 + }, + { + "epoch": 4.30976430976431, + "grad_norm": 0.41700586676597595, + "learning_rate": 7.010809995795897e-05, + "loss": 0.0093, + "step": 2800 + }, + { + "epoch": 4.30976430976431, + "eval_loss": 0.050402212888002396, + "eval_runtime": 155.1101, + "eval_samples_per_second": 7.446, + "eval_steps_per_second": 7.446, + "step": 2800 + }, + { + "epoch": 4.325156325156325, + "grad_norm": 0.46149563789367676, + "learning_rate": 6.986159018492082e-05, + "loss": 0.0118, + "step": 2810 + }, + { + "epoch": 4.340548340548341, + "grad_norm": 0.2322218269109726, + "learning_rate": 6.961450584802649e-05, + "loss": 0.0058, + "step": 2820 + }, + { + "epoch": 4.355940355940356, + "grad_norm": 0.42568182945251465, + "learning_rate": 6.936685409502838e-05, + "loss": 0.0064, + "step": 2830 + }, + { + "epoch": 4.371332371332372, + "grad_norm": 1.2661809921264648, + "learning_rate": 6.911864209009337e-05, + "loss": 0.0074, + "step": 2840 + }, + { + "epoch": 4.386724386724387, + "grad_norm": 0.43955183029174805, + "learning_rate": 6.886987701359552e-05, + "loss": 0.0113, + "step": 2850 + }, + { + "epoch": 4.386724386724387, + "eval_loss": 0.057221509516239166, + "eval_runtime": 155.2072, + "eval_samples_per_second": 7.442, + "eval_steps_per_second": 7.442, + "step": 2850 + }, + { + "epoch": 4.402116402116402, + "grad_norm": 0.45764613151550293, + "learning_rate": 6.86205660619083e-05, + "loss": 0.009, + "step": 2860 + }, + { + "epoch": 4.417508417508418, + "grad_norm": 0.950131893157959, + "learning_rate": 6.837071644719657e-05, + "loss": 0.016, + "step": 2870 + }, + { + "epoch": 4.432900432900433, + "grad_norm": 0.5738692283630371, + "learning_rate": 6.812033539720776e-05, + "loss": 0.0089, + "step": 2880 + }, + { + "epoch": 4.448292448292448, + "grad_norm": 0.3087545931339264, + "learning_rate": 6.786943015506292e-05, + "loss": 0.0106, + "step": 2890 + }, + { + "epoch": 4.463684463684464, + "grad_norm": 0.44009312987327576, + "learning_rate": 6.761800797904711e-05, + "loss": 0.0118, + "step": 2900 + }, + { + "epoch": 4.463684463684464, + "eval_loss": 0.047569435089826584, + "eval_runtime": 155.1688, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 7.444, + "step": 2900 + }, + { + "epoch": 4.479076479076479, + "grad_norm": 0.6292593479156494, + "learning_rate": 6.736607614239947e-05, + "loss": 0.0068, + "step": 2910 + }, + { + "epoch": 4.494468494468494, + "grad_norm": 0.019609015434980392, + "learning_rate": 6.711364193310282e-05, + "loss": 0.004, + "step": 2920 + }, + { + "epoch": 4.50986050986051, + "grad_norm": 0.09694305062294006, + "learning_rate": 6.686071265367273e-05, + "loss": 0.0112, + "step": 2930 + }, + { + "epoch": 4.525252525252525, + "grad_norm": 0.1770501583814621, + "learning_rate": 6.660729562094644e-05, + "loss": 0.0096, + "step": 2940 + }, + { + "epoch": 4.540644540644541, + "grad_norm": 0.31653764843940735, + "learning_rate": 6.635339816587109e-05, + "loss": 0.0116, + "step": 2950 + }, + { + "epoch": 4.540644540644541, + "eval_loss": 0.0523294098675251, + "eval_runtime": 155.1716, + "eval_samples_per_second": 7.443, + "eval_steps_per_second": 7.443, + "step": 2950 + }, + { + "epoch": 4.556036556036556, + "grad_norm": 0.1391102522611618, + "learning_rate": 6.609902763329164e-05, + "loss": 0.0081, + "step": 2960 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.5930052995681763, + "learning_rate": 6.584419138173845e-05, + "loss": 0.0104, + "step": 2970 + }, + { + "epoch": 4.586820586820587, + "grad_norm": 0.08531355112791061, + "learning_rate": 6.558889678321436e-05, + "loss": 0.0085, + "step": 2980 + }, + { + "epoch": 4.602212602212602, + "grad_norm": 0.2185678333044052, + "learning_rate": 6.533315122298146e-05, + "loss": 0.0095, + "step": 2990 + }, + { + "epoch": 4.617604617604617, + "grad_norm": 0.5350275635719299, + "learning_rate": 6.507696209934741e-05, + "loss": 0.0196, + "step": 3000 + }, + { + "epoch": 4.617604617604617, + "eval_loss": 0.04244374856352806, + "eval_runtime": 155.1502, + "eval_samples_per_second": 7.444, + "eval_steps_per_second": 7.444, + "step": 3000 + }, + { + "epoch": 4.632996632996633, + "grad_norm": 0.4110289216041565, + "learning_rate": 6.482033682345153e-05, + "loss": 0.0133, + "step": 3010 + }, + { + "epoch": 4.648388648388648, + "grad_norm": 0.16000980138778687, + "learning_rate": 6.45632828190502e-05, + "loss": 0.0135, + "step": 3020 + }, + { + "epoch": 4.663780663780663, + "grad_norm": 0.2741226851940155, + "learning_rate": 6.430580752230232e-05, + "loss": 0.0104, + "step": 3030 + }, + { + "epoch": 4.679172679172679, + "grad_norm": 0.2901691198348999, + "learning_rate": 6.404791838155406e-05, + "loss": 0.0103, + "step": 3040 + }, + { + "epoch": 4.694564694564694, + "grad_norm": 0.680422306060791, + "learning_rate": 6.378962285712346e-05, + "loss": 0.0199, + "step": 3050 + }, + { + "epoch": 4.694564694564694, + "eval_loss": 0.05109880864620209, + "eval_runtime": 155.2395, + "eval_samples_per_second": 7.44, + "eval_steps_per_second": 7.44, + "step": 3050 + }, + { + "epoch": 4.70995670995671, + "grad_norm": 0.2926085591316223, + "learning_rate": 6.353092842108453e-05, + "loss": 0.011, + "step": 3060 + }, + { + "epoch": 4.725348725348725, + "grad_norm": 0.5540523529052734, + "learning_rate": 6.327184255705123e-05, + "loss": 0.0111, + "step": 3070 + }, + { + "epoch": 4.7407407407407405, + "grad_norm": 0.31537291407585144, + "learning_rate": 6.301237275996082e-05, + "loss": 0.0105, + "step": 3080 + }, + { + "epoch": 4.756132756132756, + "grad_norm": 0.173953577876091, + "learning_rate": 6.275252653585722e-05, + "loss": 0.0163, + "step": 3090 + }, + { + "epoch": 4.7715247715247715, + "grad_norm": 0.13173459470272064, + "learning_rate": 6.249231140167365e-05, + "loss": 0.0106, + "step": 3100 + }, + { + "epoch": 4.7715247715247715, + "eval_loss": 0.050595082342624664, + "eval_runtime": 155.2929, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 7.438, + "step": 3100 + }, + { + "epoch": 4.786916786916787, + "grad_norm": 0.8345432281494141, + "learning_rate": 6.223173488501545e-05, + "loss": 0.0116, + "step": 3110 + }, + { + "epoch": 4.8023088023088025, + "grad_norm": 0.5431621074676514, + "learning_rate": 6.197080452394207e-05, + "loss": 0.0128, + "step": 3120 + }, + { + "epoch": 4.817700817700818, + "grad_norm": 0.2182980328798294, + "learning_rate": 6.170952786674915e-05, + "loss": 0.0125, + "step": 3130 + }, + { + "epoch": 4.833092833092833, + "grad_norm": 0.08524614572525024, + "learning_rate": 6.14479124717501e-05, + "loss": 0.0109, + "step": 3140 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.09413493424654007, + "learning_rate": 6.118596590705751e-05, + "loss": 0.0091, + "step": 3150 + }, + { + "epoch": 4.848484848484849, + "eval_loss": 0.04569848254323006, + "eval_runtime": 155.444, + "eval_samples_per_second": 7.43, + "eval_steps_per_second": 7.43, + "step": 3150 + }, + { + "epoch": 4.863876863876864, + "grad_norm": 0.3124259114265442, + "learning_rate": 6.0923695750364106e-05, + "loss": 0.0078, + "step": 3160 + }, + { + "epoch": 4.87926887926888, + "grad_norm": 0.08396346867084503, + "learning_rate": 6.0661109588723696e-05, + "loss": 0.0107, + "step": 3170 + }, + { + "epoch": 4.894660894660895, + "grad_norm": 0.44696280360221863, + "learning_rate": 6.039821501833153e-05, + "loss": 0.008, + "step": 3180 + }, + { + "epoch": 4.91005291005291, + "grad_norm": 0.416126012802124, + "learning_rate": 6.013501964430468e-05, + "loss": 0.0096, + "step": 3190 + }, + { + "epoch": 4.925444925444926, + "grad_norm": 0.012737839482724667, + "learning_rate": 5.987153108046194e-05, + "loss": 0.009, + "step": 3200 + }, + { + "epoch": 4.925444925444926, + "eval_loss": 0.0524786151945591, + "eval_runtime": 155.279, + "eval_samples_per_second": 7.438, + "eval_steps_per_second": 7.438, + "step": 3200 + }, + { + "epoch": 4.940836940836941, + "grad_norm": 0.3531193137168884, + "learning_rate": 5.960775694910365e-05, + "loss": 0.0238, + "step": 3210 + }, + { + "epoch": 4.956228956228956, + "grad_norm": 0.12316201627254486, + "learning_rate": 5.934370488079115e-05, + "loss": 0.0147, + "step": 3220 + }, + { + "epoch": 4.971620971620972, + "grad_norm": 0.1998698115348816, + "learning_rate": 5.907938251412603e-05, + "loss": 0.0143, + "step": 3230 + }, + { + "epoch": 4.987012987012987, + "grad_norm": 0.13598302006721497, + "learning_rate": 5.881479749552916e-05, + "loss": 0.0092, + "step": 3240 + }, + { + "epoch": 5.002405002405002, + "grad_norm": 0.14183315634727478, + "learning_rate": 5.854995747901958e-05, + "loss": 0.0114, + "step": 3250 + }, + { + "epoch": 5.002405002405002, + "eval_loss": 0.0509212426841259, + "eval_runtime": 155.3123, + "eval_samples_per_second": 7.437, + "eval_steps_per_second": 7.437, + "step": 3250 + }, + { + "epoch": 5.017797017797018, + "grad_norm": 0.029530184343457222, + "learning_rate": 5.828487012599294e-05, + "loss": 0.0118, + "step": 3260 + }, + { + "epoch": 5.033189033189033, + "grad_norm": 0.16333474218845367, + "learning_rate": 5.8019543104999986e-05, + "loss": 0.0077, + "step": 3270 + }, + { + "epoch": 5.048581048581049, + "grad_norm": 0.2968755066394806, + "learning_rate": 5.7753984091524626e-05, + "loss": 0.0069, + "step": 3280 + }, + { + "epoch": 5.063973063973064, + "grad_norm": 0.6200087070465088, + "learning_rate": 5.748820076776198e-05, + "loss": 0.0086, + "step": 3290 + }, + { + "epoch": 5.079365079365079, + "grad_norm": 0.07885055989027023, + "learning_rate": 5.7222200822396075e-05, + "loss": 0.0094, + "step": 3300 + }, + { + "epoch": 5.079365079365079, + "eval_loss": 0.05441857501864433, + "eval_runtime": 155.3308, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 7.436, + "step": 3300 + }, + { + "epoch": 5.094757094757095, + "grad_norm": 0.21745358407497406, + "learning_rate": 5.6955991950377476e-05, + "loss": 0.0035, + "step": 3310 + }, + { + "epoch": 5.11014911014911, + "grad_norm": 0.030182110145688057, + "learning_rate": 5.6689581852700636e-05, + "loss": 0.0053, + "step": 3320 + }, + { + "epoch": 5.125541125541125, + "grad_norm": 0.02843502163887024, + "learning_rate": 5.642297823618114e-05, + "loss": 0.0041, + "step": 3330 + }, + { + "epoch": 5.140933140933141, + "grad_norm": 0.06287218630313873, + "learning_rate": 5.61561888132328e-05, + "loss": 0.0048, + "step": 3340 + }, + { + "epoch": 5.156325156325156, + "grad_norm": 0.3266278803348541, + "learning_rate": 5.5889221301644476e-05, + "loss": 0.0068, + "step": 3350 + }, + { + "epoch": 5.156325156325156, + "eval_loss": 0.05760101601481438, + "eval_runtime": 155.4089, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 7.432, + "step": 3350 + }, + { + "epoch": 5.171717171717171, + "grad_norm": 0.20232290029525757, + "learning_rate": 5.562208342435685e-05, + "loss": 0.0038, + "step": 3360 + }, + { + "epoch": 5.187109187109187, + "grad_norm": 0.9380260705947876, + "learning_rate": 5.5354782909239025e-05, + "loss": 0.0054, + "step": 3370 + }, + { + "epoch": 5.202501202501202, + "grad_norm": 0.18466781079769135, + "learning_rate": 5.508732748886493e-05, + "loss": 0.0027, + "step": 3380 + }, + { + "epoch": 5.217893217893218, + "grad_norm": 0.1192750334739685, + "learning_rate": 5.481972490028968e-05, + "loss": 0.0054, + "step": 3390 + }, + { + "epoch": 5.233285233285233, + "grad_norm": 1.2839610576629639, + "learning_rate": 5.4551982884825693e-05, + "loss": 0.008, + "step": 3400 + }, + { + "epoch": 5.233285233285233, + "eval_loss": 0.062122732400894165, + "eval_runtime": 155.5247, + "eval_samples_per_second": 7.426, + "eval_steps_per_second": 7.426, + "step": 3400 + }, + { + "epoch": 5.248677248677248, + "grad_norm": 0.33902907371520996, + "learning_rate": 5.4284109187818845e-05, + "loss": 0.0088, + "step": 3410 + }, + { + "epoch": 5.264069264069264, + "grad_norm": 0.08099062740802765, + "learning_rate": 5.401611155842429e-05, + "loss": 0.0065, + "step": 3420 + }, + { + "epoch": 5.279461279461279, + "grad_norm": 0.176478773355484, + "learning_rate": 5.374799774938236e-05, + "loss": 0.0073, + "step": 3430 + }, + { + "epoch": 5.2948532948532945, + "grad_norm": 0.2959177494049072, + "learning_rate": 5.3479775516794284e-05, + "loss": 0.0052, + "step": 3440 + }, + { + "epoch": 5.31024531024531, + "grad_norm": 0.09551749378442764, + "learning_rate": 5.321145261989781e-05, + "loss": 0.0043, + "step": 3450 + }, + { + "epoch": 5.31024531024531, + "eval_loss": 0.056223295629024506, + "eval_runtime": 155.3613, + "eval_samples_per_second": 7.434, + "eval_steps_per_second": 7.434, + "step": 3450 + }, + { + "epoch": 5.3256373256373255, + "grad_norm": 0.06893710792064667, + "learning_rate": 5.294303682084274e-05, + "loss": 0.0074, + "step": 3460 + }, + { + "epoch": 5.341029341029341, + "grad_norm": 0.08062580227851868, + "learning_rate": 5.2674535884466404e-05, + "loss": 0.003, + "step": 3470 + }, + { + "epoch": 5.3564213564213565, + "grad_norm": 0.07167403399944305, + "learning_rate": 5.2405957578068967e-05, + "loss": 0.0036, + "step": 3480 + }, + { + "epoch": 5.371813371813372, + "grad_norm": 0.39387157559394836, + "learning_rate": 5.213730967118886e-05, + "loss": 0.0047, + "step": 3490 + }, + { + "epoch": 5.3872053872053876, + "grad_norm": 0.03161190077662468, + "learning_rate": 5.186859993537787e-05, + "loss": 0.0052, + "step": 3500 + }, + { + "epoch": 5.3872053872053876, + "eval_loss": 0.06362296640872955, + "eval_runtime": 155.4204, + "eval_samples_per_second": 7.431, + "eval_steps_per_second": 7.431, + "step": 3500 + }, + { + "epoch": 5.402597402597403, + "grad_norm": 0.02227042242884636, + "learning_rate": 5.159983614397644e-05, + "loss": 0.0074, + "step": 3510 + }, + { + "epoch": 5.417989417989418, + "grad_norm": 0.015060101635754108, + "learning_rate": 5.133102607188874e-05, + "loss": 0.0025, + "step": 3520 + }, + { + "epoch": 5.433381433381434, + "grad_norm": 0.09409883618354797, + "learning_rate": 5.1062177495357774e-05, + "loss": 0.0036, + "step": 3530 + }, + { + "epoch": 5.448773448773449, + "grad_norm": 0.20783928036689758, + "learning_rate": 5.0793298191740404e-05, + "loss": 0.0043, + "step": 3540 + }, + { + "epoch": 5.464165464165464, + "grad_norm": 0.12817227840423584, + "learning_rate": 5.052439593928239e-05, + "loss": 0.0106, + "step": 3550 + }, + { + "epoch": 5.464165464165464, + "eval_loss": 0.062369879335165024, + "eval_runtime": 155.3813, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 3550 + }, + { + "epoch": 5.47955747955748, + "grad_norm": 0.11789727956056595, + "learning_rate": 5.025547851689334e-05, + "loss": 0.0046, + "step": 3560 + }, + { + "epoch": 5.494949494949495, + "grad_norm": 0.5952243804931641, + "learning_rate": 4.998655370392174e-05, + "loss": 0.0089, + "step": 3570 + }, + { + "epoch": 5.51034151034151, + "grad_norm": 0.49004054069519043, + "learning_rate": 4.971762927992989e-05, + "loss": 0.0056, + "step": 3580 + }, + { + "epoch": 5.525733525733526, + "grad_norm": 0.017842531204223633, + "learning_rate": 4.944871302446874e-05, + "loss": 0.0057, + "step": 3590 + }, + { + "epoch": 5.541125541125541, + "grad_norm": 0.20063471794128418, + "learning_rate": 4.917981271685305e-05, + "loss": 0.0071, + "step": 3600 + }, + { + "epoch": 5.541125541125541, + "eval_loss": 0.05501287430524826, + "eval_runtime": 155.4811, + "eval_samples_per_second": 7.429, + "eval_steps_per_second": 7.429, + "step": 3600 + }, + { + "epoch": 5.556517556517557, + "grad_norm": 0.7454602718353271, + "learning_rate": 4.8910936135936155e-05, + "loss": 0.0068, + "step": 3610 + }, + { + "epoch": 5.571909571909572, + "grad_norm": 0.052781131118535995, + "learning_rate": 4.864209105988508e-05, + "loss": 0.0042, + "step": 3620 + }, + { + "epoch": 5.587301587301587, + "grad_norm": 0.36434969305992126, + "learning_rate": 4.8373285265955395e-05, + "loss": 0.0038, + "step": 3630 + }, + { + "epoch": 5.602693602693603, + "grad_norm": 0.04938585311174393, + "learning_rate": 4.810452653026635e-05, + "loss": 0.009, + "step": 3640 + }, + { + "epoch": 5.618085618085618, + "grad_norm": 0.07580199837684631, + "learning_rate": 4.783582262757588e-05, + "loss": 0.0071, + "step": 3650 + }, + { + "epoch": 5.618085618085618, + "eval_loss": 0.05748259276151657, + "eval_runtime": 155.3926, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 3650 + }, + { + "epoch": 5.633477633477634, + "grad_norm": 0.1322399526834488, + "learning_rate": 4.7567181331055674e-05, + "loss": 0.0026, + "step": 3660 + }, + { + "epoch": 5.648869648869649, + "grad_norm": 0.4781762361526489, + "learning_rate": 4.729861041206629e-05, + "loss": 0.003, + "step": 3670 + }, + { + "epoch": 5.664261664261664, + "grad_norm": 0.2590942084789276, + "learning_rate": 4.703011763993244e-05, + "loss": 0.0046, + "step": 3680 + }, + { + "epoch": 5.679653679653679, + "grad_norm": 0.2909308969974518, + "learning_rate": 4.676171078171814e-05, + "loss": 0.0057, + "step": 3690 + }, + { + "epoch": 5.695045695045695, + "grad_norm": 0.11341127008199692, + "learning_rate": 4.649339760200206e-05, + "loss": 0.0037, + "step": 3700 + }, + { + "epoch": 5.695045695045695, + "eval_loss": 0.06757444888353348, + "eval_runtime": 155.3869, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 3700 + }, + { + "epoch": 5.71043771043771, + "grad_norm": 0.3195706903934479, + "learning_rate": 4.62251858626529e-05, + "loss": 0.0034, + "step": 3710 + }, + { + "epoch": 5.725829725829726, + "grad_norm": 0.15774591267108917, + "learning_rate": 4.595708332260484e-05, + "loss": 0.0117, + "step": 3720 + }, + { + "epoch": 5.741221741221741, + "grad_norm": 0.10123348236083984, + "learning_rate": 4.568909773763313e-05, + "loss": 0.0086, + "step": 3730 + }, + { + "epoch": 5.756613756613756, + "grad_norm": 0.17369748651981354, + "learning_rate": 4.5421236860129686e-05, + "loss": 0.0051, + "step": 3740 + }, + { + "epoch": 5.772005772005772, + "grad_norm": 0.5327238440513611, + "learning_rate": 4.515350843887881e-05, + "loss": 0.0048, + "step": 3750 + }, + { + "epoch": 5.772005772005772, + "eval_loss": 0.058464374393224716, + "eval_runtime": 155.4919, + "eval_samples_per_second": 7.428, + "eval_steps_per_second": 7.428, + "step": 3750 + }, + { + "epoch": 5.787397787397787, + "grad_norm": 0.4068518280982971, + "learning_rate": 4.4885920218833094e-05, + "loss": 0.0116, + "step": 3760 + }, + { + "epoch": 5.802789802789803, + "grad_norm": 0.20335321128368378, + "learning_rate": 4.461847994088933e-05, + "loss": 0.0063, + "step": 3770 + }, + { + "epoch": 5.818181818181818, + "grad_norm": 0.012666774913668633, + "learning_rate": 4.4351195341664586e-05, + "loss": 0.0061, + "step": 3780 + }, + { + "epoch": 5.833573833573833, + "grad_norm": 0.0807795375585556, + "learning_rate": 4.408407415327234e-05, + "loss": 0.005, + "step": 3790 + }, + { + "epoch": 5.8489658489658485, + "grad_norm": 0.10903661698102951, + "learning_rate": 4.381712410309894e-05, + "loss": 0.0095, + "step": 3800 + }, + { + "epoch": 5.8489658489658485, + "eval_loss": 0.0627235621213913, + "eval_runtime": 155.3884, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 3800 + }, + { + "epoch": 5.864357864357864, + "grad_norm": 0.18351446092128754, + "learning_rate": 4.355035291357993e-05, + "loss": 0.0062, + "step": 3810 + }, + { + "epoch": 5.8797498797498795, + "grad_norm": 0.1731889843940735, + "learning_rate": 4.328376830197672e-05, + "loss": 0.0035, + "step": 3820 + }, + { + "epoch": 5.8951418951418955, + "grad_norm": 0.07248035818338394, + "learning_rate": 4.301737798015329e-05, + "loss": 0.0041, + "step": 3830 + }, + { + "epoch": 5.9105339105339105, + "grad_norm": 1.05582857131958, + "learning_rate": 4.2751189654353174e-05, + "loss": 0.005, + "step": 3840 + }, + { + "epoch": 5.925925925925926, + "grad_norm": 0.08837444335222244, + "learning_rate": 4.2485211024976496e-05, + "loss": 0.0044, + "step": 3850 + }, + { + "epoch": 5.925925925925926, + "eval_loss": 0.06223612278699875, + "eval_runtime": 155.2566, + "eval_samples_per_second": 7.439, + "eval_steps_per_second": 7.439, + "step": 3850 + }, + { + "epoch": 5.9413179413179416, + "grad_norm": 0.05066361278295517, + "learning_rate": 4.221944978635711e-05, + "loss": 0.0069, + "step": 3860 + }, + { + "epoch": 5.956709956709957, + "grad_norm": 0.12386852502822876, + "learning_rate": 4.195391362654021e-05, + "loss": 0.0102, + "step": 3870 + }, + { + "epoch": 5.972101972101973, + "grad_norm": 0.24510745704174042, + "learning_rate": 4.168861022705975e-05, + "loss": 0.01, + "step": 3880 + }, + { + "epoch": 5.987493987493988, + "grad_norm": 0.1872347891330719, + "learning_rate": 4.142354726271638e-05, + "loss": 0.0045, + "step": 3890 + }, + { + "epoch": 6.002886002886003, + "grad_norm": 0.1695815473794937, + "learning_rate": 4.1158732401355236e-05, + "loss": 0.0059, + "step": 3900 + }, + { + "epoch": 6.002886002886003, + "eval_loss": 0.05374613031744957, + "eval_runtime": 155.2416, + "eval_samples_per_second": 7.44, + "eval_steps_per_second": 7.44, + "step": 3900 + }, + { + "epoch": 6.018278018278019, + "grad_norm": 0.012538016773760319, + "learning_rate": 4.0894173303644335e-05, + "loss": 0.0034, + "step": 3910 + }, + { + "epoch": 6.033670033670034, + "grad_norm": 0.06368619948625565, + "learning_rate": 4.06298776228528e-05, + "loss": 0.0016, + "step": 3920 + }, + { + "epoch": 6.049062049062049, + "grad_norm": 0.05364130809903145, + "learning_rate": 4.036585300462959e-05, + "loss": 0.0022, + "step": 3930 + }, + { + "epoch": 6.064454064454065, + "grad_norm": 0.08409163355827332, + "learning_rate": 4.010210708678217e-05, + "loss": 0.0046, + "step": 3940 + }, + { + "epoch": 6.07984607984608, + "grad_norm": 0.06317105144262314, + "learning_rate": 3.9838647499055716e-05, + "loss": 0.0027, + "step": 3950 + }, + { + "epoch": 6.07984607984608, + "eval_loss": 0.061136141419410706, + "eval_runtime": 155.3821, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 3950 + }, + { + "epoch": 6.095238095238095, + "grad_norm": 0.06427408754825592, + "learning_rate": 3.957548186291233e-05, + "loss": 0.0015, + "step": 3960 + }, + { + "epoch": 6.110630110630111, + "grad_norm": 0.025733016431331635, + "learning_rate": 3.931261779131058e-05, + "loss": 0.0013, + "step": 3970 + }, + { + "epoch": 6.126022126022126, + "grad_norm": 0.08047357946634293, + "learning_rate": 3.905006288848519e-05, + "loss": 0.0025, + "step": 3980 + }, + { + "epoch": 6.141414141414141, + "grad_norm": 0.020656704902648926, + "learning_rate": 3.8787824749727185e-05, + "loss": 0.0031, + "step": 3990 + }, + { + "epoch": 6.156806156806157, + "grad_norm": 0.00342270964756608, + "learning_rate": 3.852591096116412e-05, + "loss": 0.0014, + "step": 4000 + }, + { + "epoch": 6.156806156806157, + "eval_loss": 0.0680951476097107, + "eval_runtime": 155.3016, + "eval_samples_per_second": 7.437, + "eval_steps_per_second": 7.437, + "step": 4000 + }, + { + "epoch": 6.172198172198172, + "grad_norm": 0.021745314821600914, + "learning_rate": 3.82643290995406e-05, + "loss": 0.0009, + "step": 4010 + }, + { + "epoch": 6.187590187590188, + "grad_norm": 0.007515972014516592, + "learning_rate": 3.8003086731999114e-05, + "loss": 0.0019, + "step": 4020 + }, + { + "epoch": 6.202982202982203, + "grad_norm": 0.016504747793078423, + "learning_rate": 3.774219141586113e-05, + "loss": 0.0015, + "step": 4030 + }, + { + "epoch": 6.218374218374218, + "grad_norm": 0.009938733652234077, + "learning_rate": 3.748165069840849e-05, + "loss": 0.0046, + "step": 4040 + }, + { + "epoch": 6.233766233766234, + "grad_norm": 0.14640678465366364, + "learning_rate": 3.722147211666509e-05, + "loss": 0.0027, + "step": 4050 + }, + { + "epoch": 6.233766233766234, + "eval_loss": 0.07209355384111404, + "eval_runtime": 155.3025, + "eval_samples_per_second": 7.437, + "eval_steps_per_second": 7.437, + "step": 4050 + }, + { + "epoch": 6.249158249158249, + "grad_norm": 0.26223722100257874, + "learning_rate": 3.696166319717876e-05, + "loss": 0.0031, + "step": 4060 + }, + { + "epoch": 6.264550264550264, + "grad_norm": 0.011222557164728642, + "learning_rate": 3.670223145580366e-05, + "loss": 0.0019, + "step": 4070 + }, + { + "epoch": 6.27994227994228, + "grad_norm": 0.00779169425368309, + "learning_rate": 3.6443184397482746e-05, + "loss": 0.0057, + "step": 4080 + }, + { + "epoch": 6.295334295334295, + "grad_norm": 0.7065511345863342, + "learning_rate": 3.618452951603075e-05, + "loss": 0.0042, + "step": 4090 + }, + { + "epoch": 6.310726310726311, + "grad_norm": 0.08722670376300812, + "learning_rate": 3.592627429391732e-05, + "loss": 0.0015, + "step": 4100 + }, + { + "epoch": 6.310726310726311, + "eval_loss": 0.0669177994132042, + "eval_runtime": 155.3268, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 7.436, + "step": 4100 + }, + { + "epoch": 6.326118326118326, + "grad_norm": 0.25067102909088135, + "learning_rate": 3.566842620205064e-05, + "loss": 0.0028, + "step": 4110 + }, + { + "epoch": 6.341510341510341, + "grad_norm": 0.40399977564811707, + "learning_rate": 3.541099269956123e-05, + "loss": 0.0026, + "step": 4120 + }, + { + "epoch": 6.356902356902357, + "grad_norm": 0.08952571451663971, + "learning_rate": 3.515398123358627e-05, + "loss": 0.002, + "step": 4130 + }, + { + "epoch": 6.372294372294372, + "grad_norm": 0.08081924170255661, + "learning_rate": 3.4897399239054015e-05, + "loss": 0.0023, + "step": 4140 + }, + { + "epoch": 6.387686387686387, + "grad_norm": 0.20503763854503632, + "learning_rate": 3.464125413846886e-05, + "loss": 0.0052, + "step": 4150 + }, + { + "epoch": 6.387686387686387, + "eval_loss": 0.06801678240299225, + "eval_runtime": 155.3945, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 4150 + }, + { + "epoch": 6.403078403078403, + "grad_norm": 0.20953159034252167, + "learning_rate": 3.4385553341696563e-05, + "loss": 0.0036, + "step": 4160 + }, + { + "epoch": 6.418470418470418, + "grad_norm": 0.12193531543016434, + "learning_rate": 3.413030424574989e-05, + "loss": 0.001, + "step": 4170 + }, + { + "epoch": 6.4338624338624335, + "grad_norm": 0.014033235609531403, + "learning_rate": 3.3875514234574556e-05, + "loss": 0.0027, + "step": 4180 + }, + { + "epoch": 6.4492544492544495, + "grad_norm": 0.023747866973280907, + "learning_rate": 3.362119067883581e-05, + "loss": 0.0027, + "step": 4190 + }, + { + "epoch": 6.4646464646464645, + "grad_norm": 0.20930615067481995, + "learning_rate": 3.336734093570498e-05, + "loss": 0.0043, + "step": 4200 + }, + { + "epoch": 6.4646464646464645, + "eval_loss": 0.063514344394207, + "eval_runtime": 155.3203, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 7.436, + "step": 4200 + }, + { + "epoch": 6.4800384800384805, + "grad_norm": 0.178164541721344, + "learning_rate": 3.311397234864687e-05, + "loss": 0.0035, + "step": 4210 + }, + { + "epoch": 6.4954304954304956, + "grad_norm": 0.1388297826051712, + "learning_rate": 3.2861092247207095e-05, + "loss": 0.0024, + "step": 4220 + }, + { + "epoch": 6.510822510822511, + "grad_norm": 0.038345132023096085, + "learning_rate": 3.260870794680025e-05, + "loss": 0.0022, + "step": 4230 + }, + { + "epoch": 6.526214526214527, + "grad_norm": 0.04468591883778572, + "learning_rate": 3.235682674849818e-05, + "loss": 0.0018, + "step": 4240 + }, + { + "epoch": 6.541606541606542, + "grad_norm": 0.023503240197896957, + "learning_rate": 3.210545593881882e-05, + "loss": 0.0038, + "step": 4250 + }, + { + "epoch": 6.541606541606542, + "eval_loss": 0.06662000715732574, + "eval_runtime": 155.3285, + "eval_samples_per_second": 7.436, + "eval_steps_per_second": 7.436, + "step": 4250 + }, + { + "epoch": 6.556998556998557, + "grad_norm": 0.04609577730298042, + "learning_rate": 3.1854602789515314e-05, + "loss": 0.0007, + "step": 4260 + }, + { + "epoch": 6.572390572390573, + "grad_norm": 0.052642837166786194, + "learning_rate": 3.1604274557365786e-05, + "loss": 0.0052, + "step": 4270 + }, + { + "epoch": 6.587782587782588, + "grad_norm": 0.11532415449619293, + "learning_rate": 3.1354478483963346e-05, + "loss": 0.005, + "step": 4280 + }, + { + "epoch": 6.603174603174603, + "grad_norm": 0.09213144332170486, + "learning_rate": 3.1105221795506584e-05, + "loss": 0.0015, + "step": 4290 + }, + { + "epoch": 6.618566618566619, + "grad_norm": 0.007917952723801136, + "learning_rate": 3.0856511702590555e-05, + "loss": 0.0044, + "step": 4300 + }, + { + "epoch": 6.618566618566619, + "eval_loss": 0.06719578802585602, + "eval_runtime": 155.5158, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 7.427, + "step": 4300 + }, + { + "epoch": 6.633958633958634, + "grad_norm": 0.02404402382671833, + "learning_rate": 3.060835539999819e-05, + "loss": 0.0026, + "step": 4310 + }, + { + "epoch": 6.64935064935065, + "grad_norm": 0.07480927556753159, + "learning_rate": 3.0360760066492167e-05, + "loss": 0.002, + "step": 4320 + }, + { + "epoch": 6.664742664742665, + "grad_norm": 0.060346685349941254, + "learning_rate": 3.0113732864607235e-05, + "loss": 0.004, + "step": 4330 + }, + { + "epoch": 6.68013468013468, + "grad_norm": 0.4563782513141632, + "learning_rate": 2.9867280940442954e-05, + "loss": 0.0037, + "step": 4340 + }, + { + "epoch": 6.695526695526696, + "grad_norm": 0.011966399848461151, + "learning_rate": 2.962141142345709e-05, + "loss": 0.0012, + "step": 4350 + }, + { + "epoch": 6.695526695526696, + "eval_loss": 0.0658504068851471, + "eval_runtime": 155.6114, + "eval_samples_per_second": 7.422, + "eval_steps_per_second": 7.422, + "step": 4350 + }, + { + "epoch": 6.710918710918711, + "grad_norm": 0.08539020270109177, + "learning_rate": 2.937613142625929e-05, + "loss": 0.0042, + "step": 4360 + }, + { + "epoch": 6.726310726310726, + "grad_norm": 0.014089019037783146, + "learning_rate": 2.913144804440534e-05, + "loss": 0.0044, + "step": 4370 + }, + { + "epoch": 6.741702741702742, + "grad_norm": 0.24594922363758087, + "learning_rate": 2.888736835619188e-05, + "loss": 0.0044, + "step": 4380 + }, + { + "epoch": 6.757094757094757, + "grad_norm": 0.06746606528759003, + "learning_rate": 2.8643899422451658e-05, + "loss": 0.0016, + "step": 4390 + }, + { + "epoch": 6.772486772486772, + "grad_norm": 0.006381847895681858, + "learning_rate": 2.8401048286349353e-05, + "loss": 0.0017, + "step": 4400 + }, + { + "epoch": 6.772486772486772, + "eval_loss": 0.06882518529891968, + "eval_runtime": 155.6065, + "eval_samples_per_second": 7.423, + "eval_steps_per_second": 7.423, + "step": 4400 + }, + { + "epoch": 6.787878787878788, + "grad_norm": 0.20728610455989838, + "learning_rate": 2.8158821973177674e-05, + "loss": 0.0046, + "step": 4410 + }, + { + "epoch": 6.803270803270803, + "grad_norm": 0.12275005877017975, + "learning_rate": 2.791722749015424e-05, + "loss": 0.0015, + "step": 4420 + }, + { + "epoch": 6.818662818662819, + "grad_norm": 0.09544987231492996, + "learning_rate": 2.767627182621881e-05, + "loss": 0.0023, + "step": 4430 + }, + { + "epoch": 6.834054834054834, + "grad_norm": 0.1116814985871315, + "learning_rate": 2.74359619518312e-05, + "loss": 0.003, + "step": 4440 + }, + { + "epoch": 6.849446849446849, + "grad_norm": 0.03937378153204918, + "learning_rate": 2.7196304818769508e-05, + "loss": 0.0072, + "step": 4450 + }, + { + "epoch": 6.849446849446849, + "eval_loss": 0.06910678744316101, + "eval_runtime": 155.4694, + "eval_samples_per_second": 7.429, + "eval_steps_per_second": 7.429, + "step": 4450 + }, + { + "epoch": 6.864838864838865, + "grad_norm": 1.0658265352249146, + "learning_rate": 2.6957307359929103e-05, + "loss": 0.0034, + "step": 4460 + }, + { + "epoch": 6.88023088023088, + "grad_norm": 0.16542820632457733, + "learning_rate": 2.671897648912204e-05, + "loss": 0.0015, + "step": 4470 + }, + { + "epoch": 6.895622895622895, + "grad_norm": 0.13427330553531647, + "learning_rate": 2.648131910087701e-05, + "loss": 0.0029, + "step": 4480 + }, + { + "epoch": 6.911014911014911, + "grad_norm": 0.06726118177175522, + "learning_rate": 2.624434207024007e-05, + "loss": 0.0023, + "step": 4490 + }, + { + "epoch": 6.926406926406926, + "grad_norm": 0.5540192723274231, + "learning_rate": 2.6008052252575428e-05, + "loss": 0.0036, + "step": 4500 + }, + { + "epoch": 6.926406926406926, + "eval_loss": 0.06586867570877075, + "eval_runtime": 155.4027, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 7.432, + "step": 4500 + }, + { + "epoch": 6.941798941798941, + "grad_norm": 0.013956030830740929, + "learning_rate": 2.5772456483367497e-05, + "loss": 0.0028, + "step": 4510 + }, + { + "epoch": 6.957190957190957, + "grad_norm": 0.020251085981726646, + "learning_rate": 2.55375615780229e-05, + "loss": 0.0045, + "step": 4520 + }, + { + "epoch": 6.972582972582972, + "grad_norm": 0.031177794560790062, + "learning_rate": 2.5303374331673414e-05, + "loss": 0.0014, + "step": 4530 + }, + { + "epoch": 6.987974987974988, + "grad_norm": 0.010427170433104038, + "learning_rate": 2.5069901518979382e-05, + "loss": 0.0021, + "step": 4540 + }, + { + "epoch": 7.0033670033670035, + "grad_norm": 0.007624060846865177, + "learning_rate": 2.4837149893933708e-05, + "loss": 0.0006, + "step": 4550 + }, + { + "epoch": 7.0033670033670035, + "eval_loss": 0.0666525736451149, + "eval_runtime": 155.4956, + "eval_samples_per_second": 7.428, + "eval_steps_per_second": 7.428, + "step": 4550 + }, + { + "epoch": 7.0187590187590185, + "grad_norm": 0.040518421679735184, + "learning_rate": 2.4605126189666554e-05, + "loss": 0.0006, + "step": 4560 + }, + { + "epoch": 7.0341510341510345, + "grad_norm": 0.1422734409570694, + "learning_rate": 2.4373837118250453e-05, + "loss": 0.0009, + "step": 4570 + }, + { + "epoch": 7.0495430495430496, + "grad_norm": 0.025685759261250496, + "learning_rate": 2.4143289370506207e-05, + "loss": 0.0012, + "step": 4580 + }, + { + "epoch": 7.064935064935065, + "grad_norm": 0.01645013317465782, + "learning_rate": 2.3913489615809287e-05, + "loss": 0.0017, + "step": 4590 + }, + { + "epoch": 7.080327080327081, + "grad_norm": 0.0011862561805173755, + "learning_rate": 2.3684444501897013e-05, + "loss": 0.0012, + "step": 4600 + }, + { + "epoch": 7.080327080327081, + "eval_loss": 0.07158812135457993, + "eval_runtime": 155.4503, + "eval_samples_per_second": 7.43, + "eval_steps_per_second": 7.43, + "step": 4600 + }, + { + "epoch": 7.095719095719096, + "grad_norm": 0.02576514147222042, + "learning_rate": 2.3456160654676073e-05, + "loss": 0.0011, + "step": 4610 + }, + { + "epoch": 7.111111111111111, + "grad_norm": 0.014289550483226776, + "learning_rate": 2.322864467803097e-05, + "loss": 0.0008, + "step": 4620 + }, + { + "epoch": 7.126503126503127, + "grad_norm": 0.07458142936229706, + "learning_rate": 2.3001903153632955e-05, + "loss": 0.0015, + "step": 4630 + }, + { + "epoch": 7.141895141895142, + "grad_norm": 0.04169384390115738, + "learning_rate": 2.277594264074957e-05, + "loss": 0.0006, + "step": 4640 + }, + { + "epoch": 7.157287157287158, + "grad_norm": 0.0025861808098852634, + "learning_rate": 2.255076967605506e-05, + "loss": 0.0018, + "step": 4650 + }, + { + "epoch": 7.157287157287158, + "eval_loss": 0.07346295565366745, + "eval_runtime": 155.5035, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 7.427, + "step": 4650 + }, + { + "epoch": 7.172679172679173, + "grad_norm": 0.20995664596557617, + "learning_rate": 2.232639077344102e-05, + "loss": 0.001, + "step": 4660 + }, + { + "epoch": 7.188071188071188, + "grad_norm": 0.005601596087217331, + "learning_rate": 2.210281242382824e-05, + "loss": 0.0014, + "step": 4670 + }, + { + "epoch": 7.203463203463204, + "grad_norm": 0.06737088412046432, + "learning_rate": 2.1880041094978705e-05, + "loss": 0.0013, + "step": 4680 + }, + { + "epoch": 7.218855218855219, + "grad_norm": 0.07896935194730759, + "learning_rate": 2.1658083231308677e-05, + "loss": 0.0009, + "step": 4690 + }, + { + "epoch": 7.234247234247234, + "grad_norm": 0.003924542106688023, + "learning_rate": 2.1436945253702063e-05, + "loss": 0.002, + "step": 4700 + }, + { + "epoch": 7.234247234247234, + "eval_loss": 0.07609633356332779, + "eval_runtime": 155.3997, + "eval_samples_per_second": 7.432, + "eval_steps_per_second": 7.432, + "step": 4700 + }, + { + "epoch": 7.24963924963925, + "grad_norm": 0.4797951579093933, + "learning_rate": 2.1216633559324894e-05, + "loss": 0.0027, + "step": 4710 + }, + { + "epoch": 7.265031265031265, + "grad_norm": 0.0018229541601613164, + "learning_rate": 2.09971545214401e-05, + "loss": 0.0011, + "step": 4720 + }, + { + "epoch": 7.28042328042328, + "grad_norm": 0.02090558409690857, + "learning_rate": 2.0778514489223204e-05, + "loss": 0.0002, + "step": 4730 + }, + { + "epoch": 7.295815295815296, + "grad_norm": 0.002939369063824415, + "learning_rate": 2.0560719787578654e-05, + "loss": 0.001, + "step": 4740 + }, + { + "epoch": 7.311207311207311, + "grad_norm": 0.01444339007139206, + "learning_rate": 2.0343776716956826e-05, + "loss": 0.0006, + "step": 4750 + }, + { + "epoch": 7.311207311207311, + "eval_loss": 0.076209157705307, + "eval_runtime": 155.4293, + "eval_samples_per_second": 7.431, + "eval_steps_per_second": 7.431, + "step": 4750 + }, + { + "epoch": 7.326599326599327, + "grad_norm": 0.20151416957378387, + "learning_rate": 2.0127691553171823e-05, + "loss": 0.0008, + "step": 4760 + }, + { + "epoch": 7.341991341991342, + "grad_norm": 0.0074240039102733135, + "learning_rate": 1.9912470547219843e-05, + "loss": 0.0009, + "step": 4770 + }, + { + "epoch": 7.357383357383357, + "grad_norm": 0.018797826021909714, + "learning_rate": 1.9698119925098396e-05, + "loss": 0.0007, + "step": 4780 + }, + { + "epoch": 7.372775372775373, + "grad_norm": 0.008177504874765873, + "learning_rate": 1.9484645887626174e-05, + "loss": 0.0019, + "step": 4790 + }, + { + "epoch": 7.388167388167388, + "grad_norm": 0.011816039681434631, + "learning_rate": 1.927205461026374e-05, + "loss": 0.0002, + "step": 4800 + }, + { + "epoch": 7.388167388167388, + "eval_loss": 0.07652729749679565, + "eval_runtime": 155.4827, + "eval_samples_per_second": 7.428, + "eval_steps_per_second": 7.428, + "step": 4800 + }, + { + "epoch": 7.403559403559403, + "grad_norm": 0.017783937975764275, + "learning_rate": 1.9060352242934775e-05, + "loss": 0.0004, + "step": 4810 + }, + { + "epoch": 7.418951418951419, + "grad_norm": 0.27515709400177, + "learning_rate": 1.8849544909848233e-05, + "loss": 0.0015, + "step": 4820 + }, + { + "epoch": 7.434343434343434, + "grad_norm": 0.1167323887348175, + "learning_rate": 1.8639638709321173e-05, + "loss": 0.0009, + "step": 4830 + }, + { + "epoch": 7.449735449735449, + "grad_norm": 0.006972416769713163, + "learning_rate": 1.8430639713602316e-05, + "loss": 0.0002, + "step": 4840 + }, + { + "epoch": 7.465127465127465, + "grad_norm": 0.030542902648448944, + "learning_rate": 1.8222553968696483e-05, + "loss": 0.0003, + "step": 4850 + }, + { + "epoch": 7.465127465127465, + "eval_loss": 0.08020766079425812, + "eval_runtime": 155.5184, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 7.427, + "step": 4850 + }, + { + "epoch": 7.48051948051948, + "grad_norm": 0.06428160518407822, + "learning_rate": 1.8015387494189496e-05, + "loss": 0.0015, + "step": 4860 + }, + { + "epoch": 7.495911495911496, + "grad_norm": 0.012726989574730396, + "learning_rate": 1.780914628307428e-05, + "loss": 0.001, + "step": 4870 + }, + { + "epoch": 7.511303511303511, + "grad_norm": 0.04011881723999977, + "learning_rate": 1.7603836301577296e-05, + "loss": 0.0018, + "step": 4880 + }, + { + "epoch": 7.526695526695526, + "grad_norm": 0.019626803696155548, + "learning_rate": 1.7399463488986107e-05, + "loss": 0.0005, + "step": 4890 + }, + { + "epoch": 7.542087542087542, + "grad_norm": 0.04123581200838089, + "learning_rate": 1.7196033757477387e-05, + "loss": 0.0002, + "step": 4900 + }, + { + "epoch": 7.542087542087542, + "eval_loss": 0.07892713695764542, + "eval_runtime": 155.5475, + "eval_samples_per_second": 7.425, + "eval_steps_per_second": 7.425, + "step": 4900 + }, + { + "epoch": 7.5574795574795575, + "grad_norm": 0.2327321320772171, + "learning_rate": 1.6993552991946083e-05, + "loss": 0.0008, + "step": 4910 + }, + { + "epoch": 7.5728715728715725, + "grad_norm": 0.026135364547371864, + "learning_rate": 1.679202704983504e-05, + "loss": 0.0016, + "step": 4920 + }, + { + "epoch": 7.5882635882635885, + "grad_norm": 0.13880804181098938, + "learning_rate": 1.65914617609656e-05, + "loss": 0.0003, + "step": 4930 + }, + { + "epoch": 7.603655603655604, + "grad_norm": 0.0015883224550634623, + "learning_rate": 1.639186292736895e-05, + "loss": 0.0009, + "step": 4940 + }, + { + "epoch": 7.619047619047619, + "grad_norm": 0.3299875259399414, + "learning_rate": 1.6193236323118283e-05, + "loss": 0.0025, + "step": 4950 + }, + { + "epoch": 7.619047619047619, + "eval_loss": 0.0804843008518219, + "eval_runtime": 155.4589, + "eval_samples_per_second": 7.43, + "eval_steps_per_second": 7.43, + "step": 4950 + }, + { + "epoch": 7.634439634439635, + "grad_norm": 0.0029539340175688267, + "learning_rate": 1.599558769416179e-05, + "loss": 0.001, + "step": 4960 + }, + { + "epoch": 7.64983164983165, + "grad_norm": 0.0016157279023900628, + "learning_rate": 1.5798922758156364e-05, + "loss": 0.0013, + "step": 4970 + }, + { + "epoch": 7.665223665223666, + "grad_norm": 0.0009673562017269433, + "learning_rate": 1.560324720430227e-05, + "loss": 0.0009, + "step": 4980 + }, + { + "epoch": 7.680615680615681, + "grad_norm": 0.023053240031003952, + "learning_rate": 1.540856669317851e-05, + "loss": 0.0016, + "step": 4990 + }, + { + "epoch": 7.696007696007696, + "grad_norm": 0.006223414093255997, + "learning_rate": 1.5214886856579163e-05, + "loss": 0.0003, + "step": 5000 + }, + { + "epoch": 7.696007696007696, + "eval_loss": 0.07501552253961563, + "eval_runtime": 155.6712, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 7.419, + "step": 5000 + }, + { + "epoch": 7.711399711399712, + "grad_norm": 0.06261438876390457, + "learning_rate": 1.5022213297350336e-05, + "loss": 0.0002, + "step": 5010 + }, + { + "epoch": 7.726791726791727, + "grad_norm": 0.09084452688694, + "learning_rate": 1.4830551589228181e-05, + "loss": 0.0018, + "step": 5020 + }, + { + "epoch": 7.742183742183742, + "grad_norm": 0.020666206255555153, + "learning_rate": 1.4639907276677628e-05, + "loss": 0.0014, + "step": 5030 + }, + { + "epoch": 7.757575757575758, + "grad_norm": 0.11565666645765305, + "learning_rate": 1.4450285874731962e-05, + "loss": 0.0009, + "step": 5040 + }, + { + "epoch": 7.772967772967773, + "grad_norm": 0.0210795346647501, + "learning_rate": 1.4261692868833376e-05, + "loss": 0.0006, + "step": 5050 + }, + { + "epoch": 7.772967772967773, + "eval_loss": 0.07412903010845184, + "eval_runtime": 155.6862, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 7.419, + "step": 5050 + }, + { + "epoch": 7.788359788359788, + "grad_norm": 0.3153684735298157, + "learning_rate": 1.4074133714674115e-05, + "loss": 0.0036, + "step": 5060 + }, + { + "epoch": 7.803751803751804, + "grad_norm": 0.010926426388323307, + "learning_rate": 1.3887613838038854e-05, + "loss": 0.0013, + "step": 5070 + }, + { + "epoch": 7.819143819143819, + "grad_norm": 0.3632314205169678, + "learning_rate": 1.3702138634647587e-05, + "loss": 0.0025, + "step": 5080 + }, + { + "epoch": 7.834535834535835, + "grad_norm": 0.2929563522338867, + "learning_rate": 1.3517713469999638e-05, + "loss": 0.0013, + "step": 5090 + }, + { + "epoch": 7.84992784992785, + "grad_norm": 0.03482995927333832, + "learning_rate": 1.3334343679218314e-05, + "loss": 0.0011, + "step": 5100 + }, + { + "epoch": 7.84992784992785, + "eval_loss": 0.074626624584198, + "eval_runtime": 155.7207, + "eval_samples_per_second": 7.417, + "eval_steps_per_second": 7.417, + "step": 5100 + }, + { + "epoch": 7.865319865319865, + "grad_norm": 0.1680263876914978, + "learning_rate": 1.3152034566896754e-05, + "loss": 0.0027, + "step": 5110 + }, + { + "epoch": 7.880711880711881, + "grad_norm": 0.37238502502441406, + "learning_rate": 1.2970791406944304e-05, + "loss": 0.0029, + "step": 5120 + }, + { + "epoch": 7.896103896103896, + "grad_norm": 0.02305671013891697, + "learning_rate": 1.279061944243406e-05, + "loss": 0.0018, + "step": 5130 + }, + { + "epoch": 7.911495911495911, + "grad_norm": 0.0066406638361513615, + "learning_rate": 1.2611523885451137e-05, + "loss": 0.0014, + "step": 5140 + }, + { + "epoch": 7.926887926887927, + "grad_norm": 0.03207942098379135, + "learning_rate": 1.2433509916941905e-05, + "loss": 0.0002, + "step": 5150 + }, + { + "epoch": 7.926887926887927, + "eval_loss": 0.07476626336574554, + "eval_runtime": 155.7545, + "eval_samples_per_second": 7.416, + "eval_steps_per_second": 7.416, + "step": 5150 + }, + { + "epoch": 7.942279942279942, + "grad_norm": 0.9030601382255554, + "learning_rate": 1.2256582686564183e-05, + "loss": 0.0022, + "step": 5160 + }, + { + "epoch": 7.957671957671957, + "grad_norm": 0.0008251300314441323, + "learning_rate": 1.2080747312538083e-05, + "loss": 0.0003, + "step": 5170 + }, + { + "epoch": 7.973063973063973, + "grad_norm": 0.011682317592203617, + "learning_rate": 1.1906008881498192e-05, + "loss": 0.0003, + "step": 5180 + }, + { + "epoch": 7.988455988455988, + "grad_norm": 0.16692648828029633, + "learning_rate": 1.1732372448346224e-05, + "loss": 0.0026, + "step": 5190 + }, + { + "epoch": 8.003848003848004, + "grad_norm": 0.0023031621240079403, + "learning_rate": 1.1559843036104928e-05, + "loss": 0.0016, + "step": 5200 + }, + { + "epoch": 8.003848003848004, + "eval_loss": 0.0746072456240654, + "eval_runtime": 155.6768, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 7.419, + "step": 5200 + }, + { + "epoch": 8.01924001924002, + "grad_norm": 0.04431803897023201, + "learning_rate": 1.1388425635772626e-05, + "loss": 0.001, + "step": 5210 + }, + { + "epoch": 8.034632034632034, + "grad_norm": 0.004026742186397314, + "learning_rate": 1.1218125206179019e-05, + "loss": 0.002, + "step": 5220 + }, + { + "epoch": 8.05002405002405, + "grad_norm": 0.014678643085062504, + "learning_rate": 1.1048946673841599e-05, + "loss": 0.0001, + "step": 5230 + }, + { + "epoch": 8.065416065416066, + "grad_norm": 0.005510509479790926, + "learning_rate": 1.0880894932823176e-05, + "loss": 0.0008, + "step": 5240 + }, + { + "epoch": 8.080808080808081, + "grad_norm": 0.01404597144573927, + "learning_rate": 1.0713974844590297e-05, + "loss": 0.0011, + "step": 5250 + }, + { + "epoch": 8.080808080808081, + "eval_loss": 0.07546179741621017, + "eval_runtime": 155.6851, + "eval_samples_per_second": 7.419, + "eval_steps_per_second": 7.419, + "step": 5250 + }, + { + "epoch": 8.096200096200096, + "grad_norm": 0.0014472113689407706, + "learning_rate": 1.0548191237872607e-05, + "loss": 0.0007, + "step": 5260 + }, + { + "epoch": 8.111592111592111, + "grad_norm": 0.003951767459511757, + "learning_rate": 1.0383548908523205e-05, + "loss": 0.0002, + "step": 5270 + }, + { + "epoch": 8.126984126984127, + "grad_norm": 0.006296288687735796, + "learning_rate": 1.0220052619379843e-05, + "loss": 0.0008, + "step": 5280 + }, + { + "epoch": 8.142376142376142, + "grad_norm": 0.007426737807691097, + "learning_rate": 1.0057707100127178e-05, + "loss": 0.0005, + "step": 5290 + }, + { + "epoch": 8.157768157768158, + "grad_norm": 0.0049497466534376144, + "learning_rate": 9.896517047159925e-06, + "loss": 0.0002, + "step": 5300 + }, + { + "epoch": 8.157768157768158, + "eval_loss": 0.0791458860039711, + "eval_runtime": 155.7218, + "eval_samples_per_second": 7.417, + "eval_steps_per_second": 7.417, + "step": 5300 + }, + { + "epoch": 8.173160173160174, + "grad_norm": 0.07586673647165298, + "learning_rate": 9.73648712344707e-06, + "loss": 0.0006, + "step": 5310 + }, + { + "epoch": 8.188552188552189, + "grad_norm": 0.0025019501335918903, + "learning_rate": 9.577621958396875e-06, + "loss": 0.0001, + "step": 5320 + }, + { + "epoch": 8.203944203944204, + "grad_norm": 0.003655059961602092, + "learning_rate": 9.419926147723034e-06, + "loss": 0.0004, + "step": 5330 + }, + { + "epoch": 8.219336219336219, + "grad_norm": 0.003684364492073655, + "learning_rate": 9.263404253311685e-06, + "loss": 0.0001, + "step": 5340 + }, + { + "epoch": 8.234728234728236, + "grad_norm": 0.007365832105278969, + "learning_rate": 9.108060803089458e-06, + "loss": 0.0001, + "step": 5350 + }, + { + "epoch": 8.234728234728236, + "eval_loss": 0.08233842253684998, + "eval_runtime": 155.6457, + "eval_samples_per_second": 7.421, + "eval_steps_per_second": 7.421, + "step": 5350 + }, + { + "epoch": 8.25012025012025, + "grad_norm": 0.16903477907180786, + "learning_rate": 8.953900290892524e-06, + "loss": 0.0026, + "step": 5360 + }, + { + "epoch": 8.265512265512266, + "grad_norm": 0.024330507963895798, + "learning_rate": 8.800927176336487e-06, + "loss": 0.0002, + "step": 5370 + }, + { + "epoch": 8.28090428090428, + "grad_norm": 0.0027705500833690166, + "learning_rate": 8.64914588468752e-06, + "loss": 0.0006, + "step": 5380 + }, + { + "epoch": 8.296296296296296, + "grad_norm": 0.0548093244433403, + "learning_rate": 8.498560806734229e-06, + "loss": 0.0002, + "step": 5390 + }, + { + "epoch": 8.311688311688311, + "grad_norm": 0.0016956565668806434, + "learning_rate": 8.349176298660738e-06, + "loss": 0.0, + "step": 5400 + }, + { + "epoch": 8.311688311688311, + "eval_loss": 0.08344623446464539, + "eval_runtime": 155.5992, + "eval_samples_per_second": 7.423, + "eval_steps_per_second": 7.423, + "step": 5400 + }, + { + "epoch": 8.327080327080328, + "grad_norm": 0.0021419841796159744, + "learning_rate": 8.200996681920519e-06, + "loss": 0.0003, + "step": 5410 + }, + { + "epoch": 8.342472342472343, + "grad_norm": 0.010396492667496204, + "learning_rate": 8.054026243111574e-06, + "loss": 0.0002, + "step": 5420 + }, + { + "epoch": 8.357864357864358, + "grad_norm": 0.0035029638092964888, + "learning_rate": 7.908269233852284e-06, + "loss": 0.0009, + "step": 5430 + }, + { + "epoch": 8.373256373256373, + "grad_norm": 0.0016604536212980747, + "learning_rate": 7.763729870658466e-06, + "loss": 0.0003, + "step": 5440 + }, + { + "epoch": 8.388648388648388, + "grad_norm": 0.009655708447098732, + "learning_rate": 7.6204123348214085e-06, + "loss": 0.0001, + "step": 5450 + }, + { + "epoch": 8.388648388648388, + "eval_loss": 0.08563042432069778, + "eval_runtime": 155.6695, + "eval_samples_per_second": 7.42, + "eval_steps_per_second": 7.42, + "step": 5450 + }, + { + "epoch": 8.404040404040405, + "grad_norm": 0.007650343701243401, + "learning_rate": 7.478320772286879e-06, + "loss": 0.0004, + "step": 5460 + }, + { + "epoch": 8.41943241943242, + "grad_norm": 0.0007566296844743192, + "learning_rate": 7.337459293535248e-06, + "loss": 0.0007, + "step": 5470 + }, + { + "epoch": 8.434824434824435, + "grad_norm": 0.04448939859867096, + "learning_rate": 7.197831973462499e-06, + "loss": 0.0001, + "step": 5480 + }, + { + "epoch": 8.45021645021645, + "grad_norm": 0.0032022397499531507, + "learning_rate": 7.059442851262421e-06, + "loss": 0.0013, + "step": 5490 + }, + { + "epoch": 8.465608465608465, + "grad_norm": 0.0019638806115835905, + "learning_rate": 6.922295930309691e-06, + "loss": 0.0002, + "step": 5500 + }, + { + "epoch": 8.465608465608465, + "eval_loss": 0.08418013900518417, + "eval_runtime": 155.6308, + "eval_samples_per_second": 7.421, + "eval_steps_per_second": 7.421, + "step": 5500 + }, + { + "epoch": 8.48100048100048, + "grad_norm": 0.002686739433556795, + "learning_rate": 6.786395178044158e-06, + "loss": 0.0, + "step": 5510 + }, + { + "epoch": 8.496392496392497, + "grad_norm": 0.001246888074092567, + "learning_rate": 6.65174452585598e-06, + "loss": 0.0001, + "step": 5520 + }, + { + "epoch": 8.511784511784512, + "grad_norm": 0.0030873273499310017, + "learning_rate": 6.518347868971925e-06, + "loss": 0.0015, + "step": 5530 + }, + { + "epoch": 8.527176527176527, + "grad_norm": 0.004396612755954266, + "learning_rate": 6.386209066342707e-06, + "loss": 0.0001, + "step": 5540 + }, + { + "epoch": 8.542568542568542, + "grad_norm": 0.00302975089289248, + "learning_rate": 6.255331940531328e-06, + "loss": 0.0002, + "step": 5550 + }, + { + "epoch": 8.542568542568542, + "eval_loss": 0.0852900892496109, + "eval_runtime": 155.6402, + "eval_samples_per_second": 7.421, + "eval_steps_per_second": 7.421, + "step": 5550 + }, + { + "epoch": 8.557960557960557, + "grad_norm": 0.0011217949213460088, + "learning_rate": 6.125720277602553e-06, + "loss": 0.0001, + "step": 5560 + }, + { + "epoch": 8.573352573352574, + "grad_norm": 0.25388625264167786, + "learning_rate": 5.997377827013268e-06, + "loss": 0.0016, + "step": 5570 + }, + { + "epoch": 8.58874458874459, + "grad_norm": 0.0072586615569889545, + "learning_rate": 5.870308301504146e-06, + "loss": 0.0, + "step": 5580 + }, + { + "epoch": 8.604136604136604, + "grad_norm": 0.007417232729494572, + "learning_rate": 5.744515376992155e-06, + "loss": 0.0005, + "step": 5590 + }, + { + "epoch": 8.61952861952862, + "grad_norm": 0.0018258714117109776, + "learning_rate": 5.620002692464288e-06, + "loss": 0.0014, + "step": 5600 + }, + { + "epoch": 8.61952861952862, + "eval_loss": 0.08519508689641953, + "eval_runtime": 155.7209, + "eval_samples_per_second": 7.417, + "eval_steps_per_second": 7.417, + "step": 5600 + }, + { + "epoch": 8.634920634920634, + "grad_norm": 0.003418419975787401, + "learning_rate": 5.496773849872183e-06, + "loss": 0.0015, + "step": 5610 + }, + { + "epoch": 8.65031265031265, + "grad_norm": 0.0012940014712512493, + "learning_rate": 5.374832414028075e-06, + "loss": 0.0001, + "step": 5620 + }, + { + "epoch": 8.665704665704666, + "grad_norm": 0.0012422839645296335, + "learning_rate": 5.2541819125015435e-06, + "loss": 0.0006, + "step": 5630 + }, + { + "epoch": 8.681096681096681, + "grad_norm": 0.006399250589311123, + "learning_rate": 5.134825835517531e-06, + "loss": 0.0001, + "step": 5640 + }, + { + "epoch": 8.696488696488696, + "grad_norm": 0.0014951463090255857, + "learning_rate": 5.016767635855357e-06, + "loss": 0.0009, + "step": 5650 + }, + { + "epoch": 8.696488696488696, + "eval_loss": 0.08536998927593231, + "eval_runtime": 155.8552, + "eval_samples_per_second": 7.411, + "eval_steps_per_second": 7.411, + "step": 5650 + }, + { + "epoch": 8.711880711880712, + "grad_norm": 0.015206399373710155, + "learning_rate": 4.9000107287488276e-06, + "loss": 0.0012, + "step": 5660 + }, + { + "epoch": 8.727272727272727, + "grad_norm": 0.006114702206104994, + "learning_rate": 4.784558491787472e-06, + "loss": 0.0011, + "step": 5670 + }, + { + "epoch": 8.742664742664743, + "grad_norm": 0.02511346898972988, + "learning_rate": 4.670414264818801e-06, + "loss": 0.0005, + "step": 5680 + }, + { + "epoch": 8.758056758056759, + "grad_norm": 0.009570403955876827, + "learning_rate": 4.557581349851686e-06, + "loss": 0.0003, + "step": 5690 + }, + { + "epoch": 8.773448773448774, + "grad_norm": 0.022344782948493958, + "learning_rate": 4.4460630109608606e-06, + "loss": 0.0006, + "step": 5700 + }, + { + "epoch": 8.773448773448774, + "eval_loss": 0.08490793406963348, + "eval_runtime": 155.6955, + "eval_samples_per_second": 7.418, + "eval_steps_per_second": 7.418, + "step": 5700 + }, + { + "epoch": 8.788840788840789, + "grad_norm": 0.0016660256078466773, + "learning_rate": 4.335862474192504e-06, + "loss": 0.0012, + "step": 5710 + }, + { + "epoch": 8.804232804232804, + "grad_norm": 0.007655631750822067, + "learning_rate": 4.226982927470874e-06, + "loss": 0.0007, + "step": 5720 + }, + { + "epoch": 8.819624819624819, + "grad_norm": 0.030732352286577225, + "learning_rate": 4.119427520506125e-06, + "loss": 0.0009, + "step": 5730 + }, + { + "epoch": 8.835016835016836, + "grad_norm": 0.0030992215033620596, + "learning_rate": 4.013199364703174e-06, + "loss": 0.0001, + "step": 5740 + }, + { + "epoch": 8.85040885040885, + "grad_norm": 0.0008929058094508946, + "learning_rate": 3.908301533071684e-06, + "loss": 0.0003, + "step": 5750 + }, + { + "epoch": 8.85040885040885, + "eval_loss": 0.08484598994255066, + "eval_runtime": 155.7793, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 7.414, + "step": 5750 + }, + { + "epoch": 8.865800865800866, + "grad_norm": 0.00011168943456141278, + "learning_rate": 3.804737060137231e-06, + "loss": 0.0005, + "step": 5760 + }, + { + "epoch": 8.88119288119288, + "grad_norm": 0.004064339213073254, + "learning_rate": 3.702508941853383e-06, + "loss": 0.0, + "step": 5770 + }, + { + "epoch": 8.896584896584896, + "grad_norm": 0.026621725410223007, + "learning_rate": 3.601620135515199e-06, + "loss": 0.0001, + "step": 5780 + }, + { + "epoch": 8.911976911976913, + "grad_norm": 0.0005972280050627887, + "learning_rate": 3.502073559673558e-06, + "loss": 0.0, + "step": 5790 + }, + { + "epoch": 8.927368927368928, + "grad_norm": 0.001488787354901433, + "learning_rate": 3.4038720940507694e-06, + "loss": 0.0007, + "step": 5800 + }, + { + "epoch": 8.927368927368928, + "eval_loss": 0.08573105931282043, + "eval_runtime": 155.7568, + "eval_samples_per_second": 7.415, + "eval_steps_per_second": 7.415, + "step": 5800 + }, + { + "epoch": 8.942760942760943, + "grad_norm": 0.004728777799755335, + "learning_rate": 3.307018579457266e-06, + "loss": 0.0007, + "step": 5810 + }, + { + "epoch": 8.958152958152958, + "grad_norm": 0.001370166428387165, + "learning_rate": 3.2115158177094597e-06, + "loss": 0.0001, + "step": 5820 + }, + { + "epoch": 8.973544973544973, + "grad_norm": 0.16003435850143433, + "learning_rate": 3.117366571548608e-06, + "loss": 0.0002, + "step": 5830 + }, + { + "epoch": 8.988936988936988, + "grad_norm": 0.16123169660568237, + "learning_rate": 3.024573564560973e-06, + "loss": 0.0011, + "step": 5840 + }, + { + "epoch": 9.004329004329005, + "grad_norm": 0.0031955912709236145, + "learning_rate": 2.9331394810989786e-06, + "loss": 0.0, + "step": 5850 + }, + { + "epoch": 9.004329004329005, + "eval_loss": 0.08653371781110764, + "eval_runtime": 155.868, + "eval_samples_per_second": 7.41, + "eval_steps_per_second": 7.41, + "step": 5850 + }, + { + "epoch": 9.01972101972102, + "grad_norm": 0.004821439739316702, + "learning_rate": 2.843066966203578e-06, + "loss": 0.0, + "step": 5860 + }, + { + "epoch": 9.035113035113035, + "grad_norm": 0.13105858862400055, + "learning_rate": 2.754358625527764e-06, + "loss": 0.0004, + "step": 5870 + }, + { + "epoch": 9.05050505050505, + "grad_norm": 0.0017742820782586932, + "learning_rate": 2.6670170252611317e-06, + "loss": 0.0004, + "step": 5880 + }, + { + "epoch": 9.065897065897065, + "grad_norm": 0.004621502943336964, + "learning_rate": 2.5810446920556842e-06, + "loss": 0.0001, + "step": 5890 + }, + { + "epoch": 9.081289081289082, + "grad_norm": 0.0015882363077253103, + "learning_rate": 2.496444112952734e-06, + "loss": 0.0, + "step": 5900 + }, + { + "epoch": 9.081289081289082, + "eval_loss": 0.08728780597448349, + "eval_runtime": 155.9019, + "eval_samples_per_second": 7.409, + "eval_steps_per_second": 7.409, + "step": 5900 + }, + { + "epoch": 9.096681096681097, + "grad_norm": 0.018675755709409714, + "learning_rate": 2.413217735310952e-06, + "loss": 0.0001, + "step": 5910 + }, + { + "epoch": 9.112073112073112, + "grad_norm": 0.007952186278998852, + "learning_rate": 2.3313679667355815e-06, + "loss": 0.0002, + "step": 5920 + }, + { + "epoch": 9.127465127465127, + "grad_norm": 0.003998554777354002, + "learning_rate": 2.250897175008748e-06, + "loss": 0.0003, + "step": 5930 + }, + { + "epoch": 9.142857142857142, + "grad_norm": 0.0017615428660064936, + "learning_rate": 2.1718076880210327e-06, + "loss": 0.0001, + "step": 5940 + }, + { + "epoch": 9.158249158249157, + "grad_norm": 0.1068064346909523, + "learning_rate": 2.094101793704073e-06, + "loss": 0.0006, + "step": 5950 + }, + { + "epoch": 9.158249158249157, + "eval_loss": 0.08758776634931564, + "eval_runtime": 155.5596, + "eval_samples_per_second": 7.425, + "eval_steps_per_second": 7.425, + "step": 5950 + }, + { + "epoch": 9.173641173641174, + "grad_norm": 0.13283948600292206, + "learning_rate": 2.017781739964414e-06, + "loss": 0.0012, + "step": 5960 + }, + { + "epoch": 9.18903318903319, + "grad_norm": 0.004606310743838549, + "learning_rate": 1.9428497346184184e-06, + "loss": 0.0001, + "step": 5970 + }, + { + "epoch": 9.204425204425204, + "grad_norm": 0.0019413360860198736, + "learning_rate": 1.8693079453284967e-06, + "loss": 0.0002, + "step": 5980 + }, + { + "epoch": 9.21981721981722, + "grad_norm": 0.002914516953751445, + "learning_rate": 1.7971584995403025e-06, + "loss": 0.0007, + "step": 5990 + }, + { + "epoch": 9.235209235209235, + "grad_norm": 0.00011632400855887681, + "learning_rate": 1.7264034844212628e-06, + "loss": 0.0003, + "step": 6000 + }, + { + "epoch": 9.235209235209235, + "eval_loss": 0.08785953372716904, + "eval_runtime": 155.2985, + "eval_samples_per_second": 7.437, + "eval_steps_per_second": 7.437, + "step": 6000 + }, + { + "epoch": 9.250601250601251, + "grad_norm": 0.02615189366042614, + "learning_rate": 1.6570449468001403e-06, + "loss": 0.0006, + "step": 6010 + }, + { + "epoch": 9.265993265993266, + "grad_norm": 0.016659028828144073, + "learning_rate": 1.5890848931078672e-06, + "loss": 0.0, + "step": 6020 + }, + { + "epoch": 9.281385281385282, + "grad_norm": 0.0022707050666213036, + "learning_rate": 1.522525289319493e-06, + "loss": 0.0001, + "step": 6030 + }, + { + "epoch": 9.296777296777297, + "grad_norm": 0.0027180383913218975, + "learning_rate": 1.4573680608972796e-06, + "loss": 0.0003, + "step": 6040 + }, + { + "epoch": 9.312169312169312, + "grad_norm": 0.013231371529400349, + "learning_rate": 1.39361509273504e-06, + "loss": 0.0002, + "step": 6050 + }, + { + "epoch": 9.312169312169312, + "eval_loss": 0.0881686881184578, + "eval_runtime": 155.3933, + "eval_samples_per_second": 7.433, + "eval_steps_per_second": 7.433, + "step": 6050 + }, + { + "epoch": 9.327561327561327, + "grad_norm": 0.00750443059951067, + "learning_rate": 1.3312682291035882e-06, + "loss": 0.0, + "step": 6060 + }, + { + "epoch": 9.342953342953344, + "grad_norm": 0.0022666684817522764, + "learning_rate": 1.2703292735974203e-06, + "loss": 0.0007, + "step": 6070 + }, + { + "epoch": 9.358345358345359, + "grad_norm": 0.00871659629046917, + "learning_rate": 1.2107999890824896e-06, + "loss": 0.0001, + "step": 6080 + }, + { + "epoch": 9.373737373737374, + "grad_norm": 0.0011352143483236432, + "learning_rate": 1.1526820976452413e-06, + "loss": 0.0009, + "step": 6090 + }, + { + "epoch": 9.389129389129389, + "grad_norm": 0.003721831599250436, + "learning_rate": 1.0959772805427871e-06, + "loss": 0.0004, + "step": 6100 + }, + { + "epoch": 9.389129389129389, + "eval_loss": 0.08848949521780014, + "eval_runtime": 155.514, + "eval_samples_per_second": 7.427, + "eval_steps_per_second": 7.427, + "step": 6100 + }, + { + "epoch": 9.404521404521404, + "grad_norm": 0.0064863171428442, + "learning_rate": 1.040687178154276e-06, + "loss": 0.0001, + "step": 6110 + }, + { + "epoch": 9.41991341991342, + "grad_norm": 0.006570219062268734, + "learning_rate": 9.868133899334443e-07, + "loss": 0.0003, + "step": 6120 + }, + { + "epoch": 9.435305435305436, + "grad_norm": 0.01222176942974329, + "learning_rate": 9.34357474362313e-07, + "loss": 0.0, + "step": 6130 + }, + { + "epoch": 9.45069745069745, + "grad_norm": 0.000440994743257761, + "learning_rate": 8.833209489061523e-07, + "loss": 0.0001, + "step": 6140 + }, + { + "epoch": 9.466089466089466, + "grad_norm": 0.009765603579580784, + "learning_rate": 8.337052899695497e-07, + "loss": 0.0003, + "step": 6150 + }, + { + "epoch": 9.466089466089466, + "eval_loss": 0.08840280771255493, + "eval_runtime": 155.7481, + "eval_samples_per_second": 7.416, + "eval_steps_per_second": 7.416, + "step": 6150 + }, + { + "epoch": 9.481481481481481, + "grad_norm": 0.007055079098790884, + "learning_rate": 7.855119328537109e-07, + "loss": 0.0, + "step": 6160 + }, + { + "epoch": 9.496873496873496, + "grad_norm": 0.00461737671867013, + "learning_rate": 7.387422717149261e-07, + "loss": 0.0001, + "step": 6170 + }, + { + "epoch": 9.512265512265513, + "grad_norm": 0.13330116868019104, + "learning_rate": 6.933976595242809e-07, + "loss": 0.0002, + "step": 6180 + }, + { + "epoch": 9.527657527657528, + "grad_norm": 0.004195301327854395, + "learning_rate": 6.494794080284527e-07, + "loss": 0.0004, + "step": 6190 + }, + { + "epoch": 9.543049543049543, + "grad_norm": 0.010475163348019123, + "learning_rate": 6.069887877118208e-07, + "loss": 0.0005, + "step": 6200 + }, + { + "epoch": 9.543049543049543, + "eval_loss": 0.08871092647314072, + "eval_runtime": 155.8579, + "eval_samples_per_second": 7.411, + "eval_steps_per_second": 7.411, + "step": 6200 + }, + { + "epoch": 9.558441558441558, + "grad_norm": 0.008875600062310696, + "learning_rate": 5.659270277596773e-07, + "loss": 0.0001, + "step": 6210 + }, + { + "epoch": 9.573833573833573, + "grad_norm": 0.024604877457022667, + "learning_rate": 5.262953160226958e-07, + "loss": 0.0006, + "step": 6220 + }, + { + "epoch": 9.58922558922559, + "grad_norm": 0.0040601021610200405, + "learning_rate": 4.880947989825413e-07, + "loss": 0.0001, + "step": 6230 + }, + { + "epoch": 9.604617604617605, + "grad_norm": 0.0021606090012937784, + "learning_rate": 4.513265817187362e-07, + "loss": 0.0003, + "step": 6240 + }, + { + "epoch": 9.62000962000962, + "grad_norm": 0.0008290394907817245, + "learning_rate": 4.159917278766523e-07, + "loss": 0.0003, + "step": 6250 + }, + { + "epoch": 9.62000962000962, + "eval_loss": 0.08868324011564255, + "eval_runtime": 155.872, + "eval_samples_per_second": 7.41, + "eval_steps_per_second": 7.41, + "step": 6250 + }, + { + "epoch": 9.635401635401635, + "grad_norm": 0.005818967241793871, + "learning_rate": 3.8209125963677985e-07, + "loss": 0.0001, + "step": 6260 + }, + { + "epoch": 9.65079365079365, + "grad_norm": 0.01783004403114319, + "learning_rate": 3.4962615768513453e-07, + "loss": 0.0002, + "step": 6270 + }, + { + "epoch": 9.666185666185665, + "grad_norm": 0.0014194652903825045, + "learning_rate": 3.1859736118489693e-07, + "loss": 0.0004, + "step": 6280 + }, + { + "epoch": 9.681577681577682, + "grad_norm": 0.004197706468403339, + "learning_rate": 2.8900576774923417e-07, + "loss": 0.0004, + "step": 6290 + }, + { + "epoch": 9.696969696969697, + "grad_norm": 0.002955689327791333, + "learning_rate": 2.608522334153485e-07, + "loss": 0.0001, + "step": 6300 + }, + { + "epoch": 9.696969696969697, + "eval_loss": 0.08897044509649277, + "eval_runtime": 155.7864, + "eval_samples_per_second": 7.414, + "eval_steps_per_second": 7.414, + "step": 6300 + }, + { + "epoch": 9.712361712361712, + "grad_norm": 0.008194908499717712, + "learning_rate": 2.341375726197026e-07, + "loss": 0.0001, + "step": 6310 + }, + { + "epoch": 9.727753727753727, + "grad_norm": 0.0005930314655415714, + "learning_rate": 2.0886255817446078e-07, + "loss": 0.0, + "step": 6320 + }, + { + "epoch": 9.743145743145742, + "grad_norm": 0.0006030787480995059, + "learning_rate": 1.8502792124513447e-07, + "loss": 0.0, + "step": 6330 + }, + { + "epoch": 9.75853775853776, + "grad_norm": 0.0014561834977939725, + "learning_rate": 1.6263435132943817e-07, + "loss": 0.0005, + "step": 6340 + }, + { + "epoch": 9.773929773929774, + "grad_norm": 0.0008178597781807184, + "learning_rate": 1.4168249623731643e-07, + "loss": 0.0004, + "step": 6350 + }, + { + "epoch": 9.773929773929774, + "eval_loss": 0.08892033249139786, + "eval_runtime": 155.9127, + "eval_samples_per_second": 7.408, + "eval_steps_per_second": 7.408, + "step": 6350 + }, + { + "epoch": 9.78932178932179, + "grad_norm": 0.009240039624273777, + "learning_rate": 1.2217296207225338e-07, + "loss": 0.0011, + "step": 6360 + }, + { + "epoch": 9.804713804713804, + "grad_norm": 0.007418003398925066, + "learning_rate": 1.0410631321366993e-07, + "loss": 0.0002, + "step": 6370 + }, + { + "epoch": 9.82010582010582, + "grad_norm": 0.0023945968132466078, + "learning_rate": 8.748307230067588e-08, + "loss": 0.0003, + "step": 6380 + }, + { + "epoch": 9.835497835497836, + "grad_norm": 0.002187097677960992, + "learning_rate": 7.230372021688193e-08, + "loss": 0.0, + "step": 6390 + }, + { + "epoch": 9.850889850889851, + "grad_norm": 0.001660968759097159, + "learning_rate": 5.8568696076527484e-08, + "loss": 0.001, + "step": 6400 + }, + { + "epoch": 9.850889850889851, + "eval_loss": 0.08894965052604675, + "eval_runtime": 155.9825, + "eval_samples_per_second": 7.405, + "eval_steps_per_second": 7.405, + "step": 6400 + }, + { + "epoch": 9.866281866281867, + "grad_norm": 0.023139866068959236, + "learning_rate": 4.627839721176863e-08, + "loss": 0.0004, + "step": 6410 + }, + { + "epoch": 9.881673881673882, + "grad_norm": 0.134952574968338, + "learning_rate": 3.543317916118172e-08, + "loss": 0.0012, + "step": 6420 + }, + { + "epoch": 9.897065897065897, + "grad_norm": 0.02697904221713543, + "learning_rate": 2.603335565948828e-08, + "loss": 0.0006, + "step": 6430 + }, + { + "epoch": 9.912457912457912, + "grad_norm": 0.0022994026076048613, + "learning_rate": 1.8079198628451198e-08, + "loss": 0.001, + "step": 6440 + }, + { + "epoch": 9.927849927849929, + "grad_norm": 0.005798928439617157, + "learning_rate": 1.1570938169042089e-08, + "loss": 0.0, + "step": 6450 + }, + { + "epoch": 9.927849927849929, + "eval_loss": 0.08900179713964462, + "eval_runtime": 155.9713, + "eval_samples_per_second": 7.405, + "eval_steps_per_second": 7.405, + "step": 6450 + }, + { + "epoch": 9.943241943241944, + "grad_norm": 0.0010307361371815205, + "learning_rate": 6.508762554768844e-09, + "loss": 0.0003, + "step": 6460 + }, + { + "epoch": 9.958633958633959, + "grad_norm": 0.013674820773303509, + "learning_rate": 2.8928182262355495e-09, + "loss": 0.0, + "step": 6470 + }, + { + "epoch": 9.974025974025974, + "grad_norm": 0.004010125994682312, + "learning_rate": 7.23209786884782e-10, + "loss": 0.0008, + "step": 6480 + }, + { + "epoch": 9.989417989417989, + "grad_norm": 0.007564207073301077, + "learning_rate": 0.0, + "loss": 0.001, + "step": 6490 + }, + { + "epoch": 9.989417989417989, + "step": 6490, + "total_flos": 1.6307842783863767e+18, + "train_loss": 0.029858223626566607, + "train_runtime": 70104.8536, + "train_samples_per_second": 1.483, + "train_steps_per_second": 0.093 + } + ], + "logging_steps": 10, + "max_steps": 6490, + "num_input_tokens_seen": 0, + "num_train_epochs": 10, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6307842783863767e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}