{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 740, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013531799729364006, "grad_norm": 1.4412583112716675, "learning_rate": 1.2903225806451614e-06, "loss": 1.3064, "step": 5 }, { "epoch": 0.02706359945872801, "grad_norm": 0.9369994401931763, "learning_rate": 2.9032258064516128e-06, "loss": 1.3176, "step": 10 }, { "epoch": 0.04059539918809202, "grad_norm": 0.6337246298789978, "learning_rate": 4.516129032258065e-06, "loss": 1.3047, "step": 15 }, { "epoch": 0.05412719891745602, "grad_norm": 0.7102669477462769, "learning_rate": 6.129032258064517e-06, "loss": 1.2226, "step": 20 }, { "epoch": 0.06765899864682003, "grad_norm": 0.5792216062545776, "learning_rate": 7.741935483870968e-06, "loss": 1.2671, "step": 25 }, { "epoch": 0.08119079837618404, "grad_norm": 0.5476118326187134, "learning_rate": 9.35483870967742e-06, "loss": 1.2585, "step": 30 }, { "epoch": 0.09472259810554803, "grad_norm": 0.6799878478050232, "learning_rate": 1.0967741935483872e-05, "loss": 1.2151, "step": 35 }, { "epoch": 0.10825439783491204, "grad_norm": 0.5542110800743103, "learning_rate": 1.2580645161290324e-05, "loss": 1.185, "step": 40 }, { "epoch": 0.12178619756427606, "grad_norm": 0.4575681686401367, "learning_rate": 1.4193548387096776e-05, "loss": 1.1688, "step": 45 }, { "epoch": 0.13531799729364005, "grad_norm": 0.5137224197387695, "learning_rate": 1.5806451612903226e-05, "loss": 1.1865, "step": 50 }, { "epoch": 0.14884979702300405, "grad_norm": 0.4554082453250885, "learning_rate": 1.741935483870968e-05, "loss": 1.163, "step": 55 }, { "epoch": 0.16238159675236807, "grad_norm": 0.5306389331817627, "learning_rate": 1.903225806451613e-05, "loss": 1.1513, "step": 60 }, { "epoch": 0.17591339648173207, "grad_norm": 0.5879004597663879, "learning_rate": 2.0645161290322582e-05, "loss": 1.1948, "step": 65 }, { "epoch": 0.18944519621109607, "grad_norm": 0.4350433647632599, "learning_rate": 2.2258064516129034e-05, "loss": 1.0976, "step": 70 }, { "epoch": 0.2029769959404601, "grad_norm": 0.49727171659469604, "learning_rate": 2.3870967741935483e-05, "loss": 1.1021, "step": 75 }, { "epoch": 0.2165087956698241, "grad_norm": 0.551313042640686, "learning_rate": 2.548387096774194e-05, "loss": 1.1962, "step": 80 }, { "epoch": 0.23004059539918809, "grad_norm": 0.4979248046875, "learning_rate": 2.7096774193548387e-05, "loss": 1.1318, "step": 85 }, { "epoch": 0.2435723951285521, "grad_norm": 0.5184155702590942, "learning_rate": 2.870967741935484e-05, "loss": 1.1268, "step": 90 }, { "epoch": 0.2571041948579161, "grad_norm": 0.6341415643692017, "learning_rate": 2.9999976021756284e-05, "loss": 1.0454, "step": 95 }, { "epoch": 0.2706359945872801, "grad_norm": 0.4915357530117035, "learning_rate": 2.9999136791275564e-05, "loss": 1.078, "step": 100 }, { "epoch": 0.28416779431664413, "grad_norm": 0.541904091835022, "learning_rate": 2.999709872526874e-05, "loss": 1.0623, "step": 105 }, { "epoch": 0.2976995940460081, "grad_norm": 0.5630597472190857, "learning_rate": 2.999386198663225e-05, "loss": 1.0517, "step": 110 }, { "epoch": 0.3112313937753721, "grad_norm": 0.7628896832466125, "learning_rate": 2.9989426834068792e-05, "loss": 1.084, "step": 115 }, { "epoch": 0.32476319350473615, "grad_norm": 0.6299301981925964, "learning_rate": 2.9983793622066668e-05, "loss": 1.0186, "step": 120 }, { "epoch": 0.3382949932341001, "grad_norm": 0.6008714437484741, "learning_rate": 2.9976962800871434e-05, "loss": 1.0228, "step": 125 }, { "epoch": 0.35182679296346414, "grad_norm": 0.6700873374938965, "learning_rate": 2.9968934916449923e-05, "loss": 0.9923, "step": 130 }, { "epoch": 0.36535859269282817, "grad_norm": 0.5688751339912415, "learning_rate": 2.9959710610446577e-05, "loss": 0.998, "step": 135 }, { "epoch": 0.37889039242219213, "grad_norm": 0.7533664703369141, "learning_rate": 2.9949290620132225e-05, "loss": 1.0353, "step": 140 }, { "epoch": 0.39242219215155616, "grad_norm": 0.7014450430870056, "learning_rate": 2.99376757783451e-05, "loss": 0.9277, "step": 145 }, { "epoch": 0.4059539918809202, "grad_norm": 0.6594902873039246, "learning_rate": 2.992486701342427e-05, "loss": 0.9636, "step": 150 }, { "epoch": 0.41948579161028415, "grad_norm": 0.7603819966316223, "learning_rate": 2.9910865349135498e-05, "loss": 0.9665, "step": 155 }, { "epoch": 0.4330175913396482, "grad_norm": 0.642242968082428, "learning_rate": 2.989567190458935e-05, "loss": 0.9616, "step": 160 }, { "epoch": 0.4465493910690122, "grad_norm": 0.6365484595298767, "learning_rate": 2.9879287894151786e-05, "loss": 0.979, "step": 165 }, { "epoch": 0.46008119079837617, "grad_norm": 0.6748781800270081, "learning_rate": 2.9861714627347076e-05, "loss": 0.9437, "step": 170 }, { "epoch": 0.4736129905277402, "grad_norm": 0.6489437222480774, "learning_rate": 2.984295350875316e-05, "loss": 0.9036, "step": 175 }, { "epoch": 0.4871447902571042, "grad_norm": 0.6802551746368408, "learning_rate": 2.9823006037889358e-05, "loss": 0.8769, "step": 180 }, { "epoch": 0.5006765899864682, "grad_norm": 0.7979145050048828, "learning_rate": 2.9801873809096543e-05, "loss": 0.9136, "step": 185 }, { "epoch": 0.5142083897158322, "grad_norm": 0.774574875831604, "learning_rate": 2.9779558511409678e-05, "loss": 0.8767, "step": 190 }, { "epoch": 0.5277401894451962, "grad_norm": 0.724077045917511, "learning_rate": 2.9756061928422857e-05, "loss": 0.913, "step": 195 }, { "epoch": 0.5412719891745602, "grad_norm": 0.7980031967163086, "learning_rate": 2.973138593814671e-05, "loss": 0.9224, "step": 200 }, { "epoch": 0.5548037889039242, "grad_norm": 0.900132417678833, "learning_rate": 2.9705532512858324e-05, "loss": 0.8389, "step": 205 }, { "epoch": 0.5683355886332883, "grad_norm": 0.8545295596122742, "learning_rate": 2.9678503718943594e-05, "loss": 0.8391, "step": 210 }, { "epoch": 0.5818673883626523, "grad_norm": 0.78533935546875, "learning_rate": 2.965030171673207e-05, "loss": 0.8716, "step": 215 }, { "epoch": 0.5953991880920162, "grad_norm": 0.9080139994621277, "learning_rate": 2.962092876032427e-05, "loss": 0.8158, "step": 220 }, { "epoch": 0.6089309878213802, "grad_norm": 0.9167734384536743, "learning_rate": 2.9590387197411547e-05, "loss": 0.8416, "step": 225 }, { "epoch": 0.6224627875507442, "grad_norm": 1.0270551443099976, "learning_rate": 2.9558679469088423e-05, "loss": 0.8628, "step": 230 }, { "epoch": 0.6359945872801083, "grad_norm": 0.8939360976219177, "learning_rate": 2.9525808109657485e-05, "loss": 0.8487, "step": 235 }, { "epoch": 0.6495263870094723, "grad_norm": 0.8669422268867493, "learning_rate": 2.949177574642682e-05, "loss": 0.8317, "step": 240 }, { "epoch": 0.6630581867388363, "grad_norm": 0.7396143674850464, "learning_rate": 2.9456585099500036e-05, "loss": 0.784, "step": 245 }, { "epoch": 0.6765899864682002, "grad_norm": 0.9191597700119019, "learning_rate": 2.942023898155885e-05, "loss": 0.8148, "step": 250 }, { "epoch": 0.6901217861975643, "grad_norm": 1.05917227268219, "learning_rate": 2.938274029763826e-05, "loss": 0.7824, "step": 255 }, { "epoch": 0.7036535859269283, "grad_norm": 0.8561118245124817, "learning_rate": 2.934409204489438e-05, "loss": 0.8054, "step": 260 }, { "epoch": 0.7171853856562923, "grad_norm": 0.7686528563499451, "learning_rate": 2.9304297312364865e-05, "loss": 0.7765, "step": 265 }, { "epoch": 0.7307171853856563, "grad_norm": 0.9166774749755859, "learning_rate": 2.926335928072203e-05, "loss": 0.7964, "step": 270 }, { "epoch": 0.7442489851150202, "grad_norm": 0.9427902102470398, "learning_rate": 2.922128122201862e-05, "loss": 0.7521, "step": 275 }, { "epoch": 0.7577807848443843, "grad_norm": 0.8347809314727783, "learning_rate": 2.9178066499426284e-05, "loss": 0.7492, "step": 280 }, { "epoch": 0.7713125845737483, "grad_norm": 0.9658071994781494, "learning_rate": 2.9133718566966773e-05, "loss": 0.6946, "step": 285 }, { "epoch": 0.7848443843031123, "grad_norm": 0.8596900105476379, "learning_rate": 2.9088240969235864e-05, "loss": 0.7395, "step": 290 }, { "epoch": 0.7983761840324763, "grad_norm": 0.9865032434463501, "learning_rate": 2.9041637341120054e-05, "loss": 0.7497, "step": 295 }, { "epoch": 0.8119079837618404, "grad_norm": 0.8877797722816467, "learning_rate": 2.8993911407506037e-05, "loss": 0.7162, "step": 300 }, { "epoch": 0.8254397834912043, "grad_norm": 0.9064735174179077, "learning_rate": 2.8945066982982984e-05, "loss": 0.6864, "step": 305 }, { "epoch": 0.8389715832205683, "grad_norm": 0.9386357665061951, "learning_rate": 2.889510797153764e-05, "loss": 0.6957, "step": 310 }, { "epoch": 0.8525033829499323, "grad_norm": 1.0564672946929932, "learning_rate": 2.8844038366242326e-05, "loss": 0.698, "step": 315 }, { "epoch": 0.8660351826792964, "grad_norm": 0.9778911471366882, "learning_rate": 2.879186224893574e-05, "loss": 0.6916, "step": 320 }, { "epoch": 0.8795669824086604, "grad_norm": 0.9041974544525146, "learning_rate": 2.8738583789896743e-05, "loss": 0.6482, "step": 325 }, { "epoch": 0.8930987821380244, "grad_norm": 1.026167392730713, "learning_rate": 2.8684207247511025e-05, "loss": 0.7138, "step": 330 }, { "epoch": 0.9066305818673883, "grad_norm": 0.8488349318504333, "learning_rate": 2.8628736967930747e-05, "loss": 0.6957, "step": 335 }, { "epoch": 0.9201623815967523, "grad_norm": 0.9476014375686646, "learning_rate": 2.8572177384727167e-05, "loss": 0.6485, "step": 340 }, { "epoch": 0.9336941813261164, "grad_norm": 0.9190114140510559, "learning_rate": 2.8514533018536286e-05, "loss": 0.6747, "step": 345 }, { "epoch": 0.9472259810554804, "grad_norm": 0.9507735371589661, "learning_rate": 2.8455808476697513e-05, "loss": 0.6732, "step": 350 }, { "epoch": 0.9607577807848444, "grad_norm": 1.0128976106643677, "learning_rate": 2.8396008452885426e-05, "loss": 0.6633, "step": 355 }, { "epoch": 0.9742895805142084, "grad_norm": 1.2091327905654907, "learning_rate": 2.8335137726734608e-05, "loss": 0.6888, "step": 360 }, { "epoch": 0.9878213802435724, "grad_norm": 0.998440682888031, "learning_rate": 2.827320116345764e-05, "loss": 0.6605, "step": 365 }, { "epoch": 1.0, "grad_norm": 1.5365850925445557, "learning_rate": 2.821020371345624e-05, "loss": 0.6687, "step": 370 }, { "epoch": 1.013531799729364, "grad_norm": 1.3685508966445923, "learning_rate": 2.8146150411925568e-05, "loss": 0.541, "step": 375 }, { "epoch": 1.027063599458728, "grad_norm": 1.0007332563400269, "learning_rate": 2.8081046378451807e-05, "loss": 0.5494, "step": 380 }, { "epoch": 1.040595399188092, "grad_norm": 1.3241759538650513, "learning_rate": 2.801489681660296e-05, "loss": 0.5614, "step": 385 }, { "epoch": 1.054127198917456, "grad_norm": 1.0600066184997559, "learning_rate": 2.7947707013512936e-05, "loss": 0.5606, "step": 390 }, { "epoch": 1.0676589986468201, "grad_norm": 0.9737664461135864, "learning_rate": 2.7879482339458974e-05, "loss": 0.5386, "step": 395 }, { "epoch": 1.0811907983761841, "grad_norm": 0.9769577980041504, "learning_rate": 2.7810228247432415e-05, "loss": 0.5397, "step": 400 }, { "epoch": 1.094722598105548, "grad_norm": 0.8770543336868286, "learning_rate": 2.7739950272702856e-05, "loss": 0.5319, "step": 405 }, { "epoch": 1.108254397834912, "grad_norm": 0.9559663534164429, "learning_rate": 2.7668654032375733e-05, "loss": 0.5952, "step": 410 }, { "epoch": 1.121786197564276, "grad_norm": 0.9816983938217163, "learning_rate": 2.7596345224943357e-05, "loss": 0.5308, "step": 415 }, { "epoch": 1.13531799729364, "grad_norm": 1.0468382835388184, "learning_rate": 2.7523029629829478e-05, "loss": 0.5199, "step": 420 }, { "epoch": 1.148849797023004, "grad_norm": 1.1081461906433105, "learning_rate": 2.744871310692731e-05, "loss": 0.5094, "step": 425 }, { "epoch": 1.162381596752368, "grad_norm": 1.0275734663009644, "learning_rate": 2.73734015961312e-05, "loss": 0.5744, "step": 430 }, { "epoch": 1.175913396481732, "grad_norm": 0.9587875604629517, "learning_rate": 2.7297101116861862e-05, "loss": 0.5095, "step": 435 }, { "epoch": 1.1894451962110961, "grad_norm": 1.1627024412155151, "learning_rate": 2.721981776758526e-05, "loss": 0.4957, "step": 440 }, { "epoch": 1.2029769959404601, "grad_norm": 0.9614389538764954, "learning_rate": 2.714155772532518e-05, "loss": 0.506, "step": 445 }, { "epoch": 1.2165087956698242, "grad_norm": 1.0214952230453491, "learning_rate": 2.7062327245169506e-05, "loss": 0.5174, "step": 450 }, { "epoch": 1.230040595399188, "grad_norm": 1.1152983903884888, "learning_rate": 2.6982132659770298e-05, "loss": 0.5119, "step": 455 }, { "epoch": 1.243572395128552, "grad_norm": 0.9631413817405701, "learning_rate": 2.6900980378837614e-05, "loss": 0.5094, "step": 460 }, { "epoch": 1.257104194857916, "grad_norm": 1.333390712738037, "learning_rate": 2.6818876888627204e-05, "loss": 0.5279, "step": 465 }, { "epoch": 1.27063599458728, "grad_norm": 1.1421831846237183, "learning_rate": 2.6735828751422117e-05, "loss": 0.5038, "step": 470 }, { "epoch": 1.284167794316644, "grad_norm": 1.093878149986267, "learning_rate": 2.6651842605008142e-05, "loss": 0.4847, "step": 475 }, { "epoch": 1.297699594046008, "grad_norm": 1.0584564208984375, "learning_rate": 2.6566925162143322e-05, "loss": 0.463, "step": 480 }, { "epoch": 1.3112313937753721, "grad_norm": 1.1284931898117065, "learning_rate": 2.6481083210021396e-05, "loss": 0.5409, "step": 485 }, { "epoch": 1.3247631935047361, "grad_norm": 1.6687848567962646, "learning_rate": 2.6394323609729317e-05, "loss": 0.5144, "step": 490 }, { "epoch": 1.3382949932341002, "grad_norm": 1.1010960340499878, "learning_rate": 2.6306653295698885e-05, "loss": 0.526, "step": 495 }, { "epoch": 1.3518267929634642, "grad_norm": 1.0190247297286987, "learning_rate": 2.6218079275152485e-05, "loss": 0.4194, "step": 500 }, { "epoch": 1.3653585926928282, "grad_norm": 1.2479708194732666, "learning_rate": 2.6128608627543012e-05, "loss": 0.4678, "step": 505 }, { "epoch": 1.3788903924221922, "grad_norm": 1.0126714706420898, "learning_rate": 2.6038248503988058e-05, "loss": 0.4241, "step": 510 }, { "epoch": 1.3924221921515563, "grad_norm": 1.1457223892211914, "learning_rate": 2.5947006126698325e-05, "loss": 0.4553, "step": 515 }, { "epoch": 1.4059539918809203, "grad_norm": 0.9630009531974792, "learning_rate": 2.5854888788400384e-05, "loss": 0.4664, "step": 520 }, { "epoch": 1.419485791610284, "grad_norm": 1.067337989807129, "learning_rate": 2.5761903851753783e-05, "loss": 0.4379, "step": 525 }, { "epoch": 1.4330175913396481, "grad_norm": 1.0295122861862183, "learning_rate": 2.5668058748762574e-05, "loss": 0.4428, "step": 530 }, { "epoch": 1.4465493910690121, "grad_norm": 0.9696286916732788, "learning_rate": 2.5573360980181297e-05, "loss": 0.4245, "step": 535 }, { "epoch": 1.4600811907983762, "grad_norm": 1.2453484535217285, "learning_rate": 2.5477818114915477e-05, "loss": 0.462, "step": 540 }, { "epoch": 1.4736129905277402, "grad_norm": 1.038318395614624, "learning_rate": 2.5381437789416643e-05, "loss": 0.4367, "step": 545 }, { "epoch": 1.4871447902571042, "grad_norm": 1.1128343343734741, "learning_rate": 2.5284227707071986e-05, "loss": 0.4184, "step": 550 }, { "epoch": 1.5006765899864682, "grad_norm": 1.2020708322525024, "learning_rate": 2.518619563758864e-05, "loss": 0.4334, "step": 555 }, { "epoch": 1.514208389715832, "grad_norm": 1.0298750400543213, "learning_rate": 2.5087349416372696e-05, "loss": 0.4125, "step": 560 }, { "epoch": 1.527740189445196, "grad_norm": 1.0778883695602417, "learning_rate": 2.49876969439029e-05, "loss": 0.4133, "step": 565 }, { "epoch": 1.54127198917456, "grad_norm": 1.1158068180084229, "learning_rate": 2.4887246185099237e-05, "loss": 0.4366, "step": 570 }, { "epoch": 1.5548037889039241, "grad_norm": 1.1676713228225708, "learning_rate": 2.4786005168686286e-05, "loss": 0.436, "step": 575 }, { "epoch": 1.5683355886332881, "grad_norm": 1.2565547227859497, "learning_rate": 2.4683981986551526e-05, "loss": 0.4557, "step": 580 }, { "epoch": 1.5818673883626522, "grad_norm": 1.133944034576416, "learning_rate": 2.458118479309857e-05, "loss": 0.4026, "step": 585 }, { "epoch": 1.5953991880920162, "grad_norm": 1.0183407068252563, "learning_rate": 2.4477621804595402e-05, "loss": 0.4533, "step": 590 }, { "epoch": 1.6089309878213802, "grad_norm": 1.26309335231781, "learning_rate": 2.4373301298517696e-05, "loss": 0.4314, "step": 595 }, { "epoch": 1.6224627875507442, "grad_norm": 1.1744800806045532, "learning_rate": 2.42682316128872e-05, "loss": 0.3831, "step": 600 }, { "epoch": 1.6359945872801083, "grad_norm": 1.1486276388168335, "learning_rate": 2.4162421145605308e-05, "loss": 0.4609, "step": 605 }, { "epoch": 1.6495263870094723, "grad_norm": 1.1308343410491943, "learning_rate": 2.4055878353781858e-05, "loss": 0.3715, "step": 610 }, { "epoch": 1.6630581867388363, "grad_norm": 1.048828125, "learning_rate": 2.3948611753059155e-05, "loss": 0.3978, "step": 615 }, { "epoch": 1.6765899864682003, "grad_norm": 1.2727230787277222, "learning_rate": 2.3840629916931362e-05, "loss": 0.3986, "step": 620 }, { "epoch": 1.6901217861975644, "grad_norm": 1.1679140329360962, "learning_rate": 2.3731941476059243e-05, "loss": 0.3896, "step": 625 }, { "epoch": 1.7036535859269284, "grad_norm": 1.1558784246444702, "learning_rate": 2.362255511758033e-05, "loss": 0.3888, "step": 630 }, { "epoch": 1.7171853856562924, "grad_norm": 1.337999939918518, "learning_rate": 2.351247958441459e-05, "loss": 0.3811, "step": 635 }, { "epoch": 1.7307171853856564, "grad_norm": 1.1977120637893677, "learning_rate": 2.340172367456564e-05, "loss": 0.3987, "step": 640 }, { "epoch": 1.7442489851150202, "grad_norm": 1.2168259620666504, "learning_rate": 2.3290296240417544e-05, "loss": 0.3497, "step": 645 }, { "epoch": 1.7577807848443843, "grad_norm": 1.1579980850219727, "learning_rate": 2.3178206188027265e-05, "loss": 0.3342, "step": 650 }, { "epoch": 1.7713125845737483, "grad_norm": 0.9696447253227234, "learning_rate": 2.3065462476412825e-05, "loss": 0.3763, "step": 655 }, { "epoch": 1.7848443843031123, "grad_norm": 1.072059154510498, "learning_rate": 2.295207411683725e-05, "loss": 0.3668, "step": 660 }, { "epoch": 1.7983761840324763, "grad_norm": 1.0907052755355835, "learning_rate": 2.283805017208834e-05, "loss": 0.369, "step": 665 }, { "epoch": 1.8119079837618404, "grad_norm": 1.1626112461090088, "learning_rate": 2.2723399755754262e-05, "loss": 0.349, "step": 670 }, { "epoch": 1.8254397834912042, "grad_norm": 1.0268486738204956, "learning_rate": 2.2608132031495184e-05, "loss": 0.3214, "step": 675 }, { "epoch": 1.8389715832205682, "grad_norm": 1.1764811277389526, "learning_rate": 2.2492256212310805e-05, "loss": 0.3133, "step": 680 }, { "epoch": 1.8525033829499322, "grad_norm": 1.0286613702774048, "learning_rate": 2.2375781559804012e-05, "loss": 0.3362, "step": 685 }, { "epoch": 1.8660351826792962, "grad_norm": 1.2203805446624756, "learning_rate": 2.2258717383440632e-05, "loss": 0.3344, "step": 690 }, { "epoch": 1.8795669824086603, "grad_norm": 1.0343270301818848, "learning_rate": 2.2141073039805344e-05, "loss": 0.3352, "step": 695 }, { "epoch": 1.8930987821380243, "grad_norm": 1.082728385925293, "learning_rate": 2.202285793185383e-05, "loss": 0.3419, "step": 700 }, { "epoch": 1.9066305818673883, "grad_norm": 1.1293714046478271, "learning_rate": 2.1904081508161236e-05, "loss": 0.3589, "step": 705 }, { "epoch": 1.9201623815967523, "grad_norm": 1.090536117553711, "learning_rate": 2.1784753262166984e-05, "loss": 0.3487, "step": 710 }, { "epoch": 1.9336941813261164, "grad_norm": 1.2215139865875244, "learning_rate": 2.166488273141597e-05, "loss": 0.3917, "step": 715 }, { "epoch": 1.9472259810554804, "grad_norm": 1.2397185564041138, "learning_rate": 2.1544479496796258e-05, "loss": 0.3715, "step": 720 }, { "epoch": 1.9607577807848444, "grad_norm": 1.0956600904464722, "learning_rate": 2.1423553181773336e-05, "loss": 0.3448, "step": 725 }, { "epoch": 1.9742895805142084, "grad_norm": 1.1384000778198242, "learning_rate": 2.130211345162091e-05, "loss": 0.3248, "step": 730 }, { "epoch": 1.9878213802435725, "grad_norm": 1.2017817497253418, "learning_rate": 2.1180170012648406e-05, "loss": 0.3016, "step": 735 }, { "epoch": 2.0, "grad_norm": 1.444764256477356, "learning_rate": 2.105773261142516e-05, "loss": 0.3309, "step": 740 } ], "logging_steps": 5, "max_steps": 1850, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0801623899381432e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }