{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 734, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013623978201634877, "grad_norm": 1.2644609212875366, "learning_rate": 1.3043478260869566e-06, "loss": 1.3371, "step": 5 }, { "epoch": 0.027247956403269755, "grad_norm": 0.9776316285133362, "learning_rate": 2.9347826086956523e-06, "loss": 1.3342, "step": 10 }, { "epoch": 0.04087193460490463, "grad_norm": 0.8481394052505493, "learning_rate": 4.565217391304348e-06, "loss": 1.3057, "step": 15 }, { "epoch": 0.05449591280653951, "grad_norm": 0.6515122652053833, "learning_rate": 6.195652173913044e-06, "loss": 1.2589, "step": 20 }, { "epoch": 0.0681198910081744, "grad_norm": 0.5874186158180237, "learning_rate": 7.826086956521738e-06, "loss": 1.3107, "step": 25 }, { "epoch": 0.08174386920980926, "grad_norm": 0.5392442345619202, "learning_rate": 9.456521739130436e-06, "loss": 1.2746, "step": 30 }, { "epoch": 0.09536784741144415, "grad_norm": 0.47512176632881165, "learning_rate": 1.108695652173913e-05, "loss": 1.187, "step": 35 }, { "epoch": 0.10899182561307902, "grad_norm": 0.4385203421115875, "learning_rate": 1.2717391304347827e-05, "loss": 1.2059, "step": 40 }, { "epoch": 0.1226158038147139, "grad_norm": 0.6362432241439819, "learning_rate": 1.4347826086956522e-05, "loss": 1.1893, "step": 45 }, { "epoch": 0.1362397820163488, "grad_norm": 0.46012377738952637, "learning_rate": 1.597826086956522e-05, "loss": 1.2839, "step": 50 }, { "epoch": 0.14986376021798364, "grad_norm": 0.5726909637451172, "learning_rate": 1.7608695652173915e-05, "loss": 1.1741, "step": 55 }, { "epoch": 0.16348773841961853, "grad_norm": 0.4440523684024811, "learning_rate": 1.9239130434782607e-05, "loss": 1.2477, "step": 60 }, { "epoch": 0.1771117166212534, "grad_norm": 0.4722035229206085, "learning_rate": 2.0869565217391306e-05, "loss": 1.1941, "step": 65 }, { "epoch": 0.1907356948228883, "grad_norm": 0.45267796516418457, "learning_rate": 2.25e-05, "loss": 1.2442, "step": 70 }, { "epoch": 0.20435967302452315, "grad_norm": 0.5661253929138184, "learning_rate": 2.4130434782608697e-05, "loss": 1.2069, "step": 75 }, { "epoch": 0.21798365122615804, "grad_norm": 0.46782588958740234, "learning_rate": 2.5760869565217392e-05, "loss": 1.1675, "step": 80 }, { "epoch": 0.23160762942779292, "grad_norm": 0.48947134613990784, "learning_rate": 2.7391304347826085e-05, "loss": 1.1297, "step": 85 }, { "epoch": 0.2452316076294278, "grad_norm": 0.5822666883468628, "learning_rate": 2.9021739130434783e-05, "loss": 1.1361, "step": 90 }, { "epoch": 0.25885558583106266, "grad_norm": 0.48768264055252075, "learning_rate": 2.9999902540146195e-05, "loss": 1.1555, "step": 95 }, { "epoch": 0.2724795640326976, "grad_norm": 0.5380802750587463, "learning_rate": 2.999880613133526e-05, "loss": 1.1335, "step": 100 }, { "epoch": 0.28610354223433243, "grad_norm": 0.5643327832221985, "learning_rate": 2.9996491578238983e-05, "loss": 1.0905, "step": 105 }, { "epoch": 0.2997275204359673, "grad_norm": 0.5208688974380493, "learning_rate": 2.9992959068836304e-05, "loss": 1.1015, "step": 110 }, { "epoch": 0.3133514986376022, "grad_norm": 0.5542858839035034, "learning_rate": 2.99882088900238e-05, "loss": 1.0951, "step": 115 }, { "epoch": 0.32697547683923706, "grad_norm": 0.5248655080795288, "learning_rate": 2.9982241427592387e-05, "loss": 1.0697, "step": 120 }, { "epoch": 0.3405994550408719, "grad_norm": 0.6014358401298523, "learning_rate": 2.997505716619599e-05, "loss": 1.0873, "step": 125 }, { "epoch": 0.3542234332425068, "grad_norm": 0.6086240410804749, "learning_rate": 2.996665668931218e-05, "loss": 1.0565, "step": 130 }, { "epoch": 0.3678474114441417, "grad_norm": 0.6661133766174316, "learning_rate": 2.9957040679194782e-05, "loss": 1.0233, "step": 135 }, { "epoch": 0.3814713896457766, "grad_norm": 0.707312285900116, "learning_rate": 2.9946209916818477e-05, "loss": 1.0613, "step": 140 }, { "epoch": 0.39509536784741145, "grad_norm": 0.5628401041030884, "learning_rate": 2.9934165281815363e-05, "loss": 0.9882, "step": 145 }, { "epoch": 0.4087193460490463, "grad_norm": 0.6965730786323547, "learning_rate": 2.9920907752403513e-05, "loss": 0.9984, "step": 150 }, { "epoch": 0.4223433242506812, "grad_norm": 0.7086770534515381, "learning_rate": 2.9906438405307548e-05, "loss": 0.9605, "step": 155 }, { "epoch": 0.4359673024523161, "grad_norm": 0.6676215529441833, "learning_rate": 2.989075841567115e-05, "loss": 1.0023, "step": 160 }, { "epoch": 0.44959128065395093, "grad_norm": 0.8493645191192627, "learning_rate": 2.987386905696167e-05, "loss": 0.9039, "step": 165 }, { "epoch": 0.46321525885558584, "grad_norm": 0.7191267609596252, "learning_rate": 2.9855771700866665e-05, "loss": 0.9652, "step": 170 }, { "epoch": 0.4768392370572207, "grad_norm": 0.7579267621040344, "learning_rate": 2.983646781718251e-05, "loss": 0.9237, "step": 175 }, { "epoch": 0.4904632152588556, "grad_norm": 0.8027481436729431, "learning_rate": 2.9815958973695034e-05, "loss": 0.9653, "step": 180 }, { "epoch": 0.5040871934604905, "grad_norm": 0.8199446797370911, "learning_rate": 2.9794246836052167e-05, "loss": 0.9864, "step": 185 }, { "epoch": 0.5177111716621253, "grad_norm": 0.7790918946266174, "learning_rate": 2.977133316762869e-05, "loss": 0.9403, "step": 190 }, { "epoch": 0.5313351498637602, "grad_norm": 0.7630456686019897, "learning_rate": 2.9747219829382997e-05, "loss": 0.9161, "step": 195 }, { "epoch": 0.5449591280653951, "grad_norm": 0.8171895742416382, "learning_rate": 2.9721908779705974e-05, "loss": 0.8556, "step": 200 }, { "epoch": 0.55858310626703, "grad_norm": 0.8095611929893494, "learning_rate": 2.969540207426193e-05, "loss": 0.9051, "step": 205 }, { "epoch": 0.5722070844686649, "grad_norm": 0.7118027806282043, "learning_rate": 2.9667701865821666e-05, "loss": 0.9304, "step": 210 }, { "epoch": 0.5858310626702997, "grad_norm": 0.897177517414093, "learning_rate": 2.9638810404087603e-05, "loss": 0.87, "step": 215 }, { "epoch": 0.5994550408719346, "grad_norm": 0.7748181819915771, "learning_rate": 2.960873003551111e-05, "loss": 0.8723, "step": 220 }, { "epoch": 0.6130790190735694, "grad_norm": 0.8167896270751953, "learning_rate": 2.9577463203101897e-05, "loss": 0.8648, "step": 225 }, { "epoch": 0.6267029972752044, "grad_norm": 0.8939515352249146, "learning_rate": 2.9545012446229613e-05, "loss": 0.7818, "step": 230 }, { "epoch": 0.6403269754768393, "grad_norm": 0.7820625305175781, "learning_rate": 2.951138040041764e-05, "loss": 0.9032, "step": 235 }, { "epoch": 0.6539509536784741, "grad_norm": 0.844601571559906, "learning_rate": 2.9476569797129e-05, "loss": 0.8736, "step": 240 }, { "epoch": 0.667574931880109, "grad_norm": 0.7247692942619324, "learning_rate": 2.944058346354454e-05, "loss": 0.8779, "step": 245 }, { "epoch": 0.6811989100817438, "grad_norm": 0.8070963621139526, "learning_rate": 2.9403424322333326e-05, "loss": 0.8503, "step": 250 }, { "epoch": 0.6948228882833788, "grad_norm": 0.8530706167221069, "learning_rate": 2.9365095391415254e-05, "loss": 0.8546, "step": 255 }, { "epoch": 0.7084468664850136, "grad_norm": 0.8497718572616577, "learning_rate": 2.932559978371596e-05, "loss": 0.78, "step": 260 }, { "epoch": 0.7220708446866485, "grad_norm": 0.9066139459609985, "learning_rate": 2.928494070691401e-05, "loss": 0.829, "step": 265 }, { "epoch": 0.7356948228882834, "grad_norm": 0.7985149621963501, "learning_rate": 2.9243121463180362e-05, "loss": 0.8262, "step": 270 }, { "epoch": 0.7493188010899182, "grad_norm": 1.0057857036590576, "learning_rate": 2.9200145448910184e-05, "loss": 0.7681, "step": 275 }, { "epoch": 0.7629427792915532, "grad_norm": 0.9126638770103455, "learning_rate": 2.915601615444703e-05, "loss": 0.8171, "step": 280 }, { "epoch": 0.776566757493188, "grad_norm": 0.8743240237236023, "learning_rate": 2.9110737163799347e-05, "loss": 0.7672, "step": 285 }, { "epoch": 0.7901907356948229, "grad_norm": 0.8928736448287964, "learning_rate": 2.9064312154349395e-05, "loss": 0.7824, "step": 290 }, { "epoch": 0.8038147138964578, "grad_norm": 1.0869660377502441, "learning_rate": 2.9016744896554606e-05, "loss": 0.7687, "step": 295 }, { "epoch": 0.8174386920980926, "grad_norm": 0.9520823955535889, "learning_rate": 2.8968039253641347e-05, "loss": 0.7603, "step": 300 }, { "epoch": 0.8310626702997275, "grad_norm": 0.9313263297080994, "learning_rate": 2.8918199181291154e-05, "loss": 0.7344, "step": 305 }, { "epoch": 0.8446866485013624, "grad_norm": 0.8860709071159363, "learning_rate": 2.8867228727319484e-05, "loss": 0.7221, "step": 310 }, { "epoch": 0.8583106267029973, "grad_norm": 1.0076904296875, "learning_rate": 2.8815132031346967e-05, "loss": 0.7163, "step": 315 }, { "epoch": 0.8719346049046321, "grad_norm": 1.0047346353530884, "learning_rate": 2.8761913324463193e-05, "loss": 0.731, "step": 320 }, { "epoch": 0.885558583106267, "grad_norm": 0.9905893802642822, "learning_rate": 2.8707576928883083e-05, "loss": 0.7234, "step": 325 }, { "epoch": 0.8991825613079019, "grad_norm": 0.8848779797554016, "learning_rate": 2.8652127257595852e-05, "loss": 0.7241, "step": 330 }, { "epoch": 0.9128065395095368, "grad_norm": 1.086945652961731, "learning_rate": 2.8595568814006618e-05, "loss": 0.7474, "step": 335 }, { "epoch": 0.9264305177111717, "grad_norm": 1.058239221572876, "learning_rate": 2.853790619157063e-05, "loss": 0.6512, "step": 340 }, { "epoch": 0.9400544959128065, "grad_norm": 0.9978867173194885, "learning_rate": 2.8479144073420237e-05, "loss": 0.6968, "step": 345 }, { "epoch": 0.9536784741144414, "grad_norm": 0.9755434989929199, "learning_rate": 2.841928723198449e-05, "loss": 0.6774, "step": 350 }, { "epoch": 0.9673024523160763, "grad_norm": 0.9939659833908081, "learning_rate": 2.835834052860162e-05, "loss": 0.6851, "step": 355 }, { "epoch": 0.9809264305177112, "grad_norm": 0.919965922832489, "learning_rate": 2.8296308913124137e-05, "loss": 0.6636, "step": 360 }, { "epoch": 0.9945504087193461, "grad_norm": 1.1084834337234497, "learning_rate": 2.8233197423516885e-05, "loss": 0.696, "step": 365 }, { "epoch": 1.008174386920981, "grad_norm": 1.0081297159194946, "learning_rate": 2.816901118544785e-05, "loss": 0.6079, "step": 370 }, { "epoch": 1.021798365122616, "grad_norm": 0.9290661811828613, "learning_rate": 2.810375541187188e-05, "loss": 0.5794, "step": 375 }, { "epoch": 1.0354223433242506, "grad_norm": 1.0772560834884644, "learning_rate": 2.80374354026073e-05, "loss": 0.5495, "step": 380 }, { "epoch": 1.0490463215258856, "grad_norm": 1.0354938507080078, "learning_rate": 2.79700565439055e-05, "loss": 0.6109, "step": 385 }, { "epoch": 1.0626702997275204, "grad_norm": 1.0381073951721191, "learning_rate": 2.7901624308013465e-05, "loss": 0.5849, "step": 390 }, { "epoch": 1.0762942779291553, "grad_norm": 1.146996021270752, "learning_rate": 2.7832144252729354e-05, "loss": 0.5798, "step": 395 }, { "epoch": 1.0899182561307903, "grad_norm": 1.0819785594940186, "learning_rate": 2.776162202095111e-05, "loss": 0.584, "step": 400 }, { "epoch": 1.103542234332425, "grad_norm": 1.090579628944397, "learning_rate": 2.7690063340218173e-05, "loss": 0.556, "step": 405 }, { "epoch": 1.11716621253406, "grad_norm": 1.034417748451233, "learning_rate": 2.7617474022246297e-05, "loss": 0.5788, "step": 410 }, { "epoch": 1.1307901907356948, "grad_norm": 1.1490421295166016, "learning_rate": 2.7543859962455576e-05, "loss": 0.5576, "step": 415 }, { "epoch": 1.1444141689373297, "grad_norm": 1.0201855897903442, "learning_rate": 2.7469227139491603e-05, "loss": 0.5886, "step": 420 }, { "epoch": 1.1580381471389645, "grad_norm": 1.01875901222229, "learning_rate": 2.7393581614739924e-05, "loss": 0.6062, "step": 425 }, { "epoch": 1.1716621253405994, "grad_norm": 1.0762981176376343, "learning_rate": 2.7316929531833775e-05, "loss": 0.5389, "step": 430 }, { "epoch": 1.1852861035422344, "grad_norm": 1.0140329599380493, "learning_rate": 2.7239277116155077e-05, "loss": 0.5462, "step": 435 }, { "epoch": 1.1989100817438691, "grad_norm": 1.1165515184402466, "learning_rate": 2.7160630674328893e-05, "loss": 0.5596, "step": 440 }, { "epoch": 1.2125340599455041, "grad_norm": 1.0689040422439575, "learning_rate": 2.7080996593711172e-05, "loss": 0.5137, "step": 445 }, { "epoch": 1.226158038147139, "grad_norm": 1.2397655248641968, "learning_rate": 2.700038134187002e-05, "loss": 0.5643, "step": 450 }, { "epoch": 1.2397820163487738, "grad_norm": 0.9885849952697754, "learning_rate": 2.691879146606043e-05, "loss": 0.5921, "step": 455 }, { "epoch": 1.2534059945504088, "grad_norm": 1.279818058013916, "learning_rate": 2.6836233592692544e-05, "loss": 0.5126, "step": 460 }, { "epoch": 1.2670299727520435, "grad_norm": 1.1189254522323608, "learning_rate": 2.675271442679346e-05, "loss": 0.5198, "step": 465 }, { "epoch": 1.2806539509536785, "grad_norm": 1.1345500946044922, "learning_rate": 2.6668240751462707e-05, "loss": 0.5117, "step": 470 }, { "epoch": 1.2942779291553133, "grad_norm": 1.1066192388534546, "learning_rate": 2.6582819427321313e-05, "loss": 0.5314, "step": 475 }, { "epoch": 1.3079019073569482, "grad_norm": 1.0839710235595703, "learning_rate": 2.649645739195464e-05, "loss": 0.5382, "step": 480 }, { "epoch": 1.3215258855585832, "grad_norm": 1.052043080329895, "learning_rate": 2.640916165934893e-05, "loss": 0.5135, "step": 485 }, { "epoch": 1.335149863760218, "grad_norm": 1.1333352327346802, "learning_rate": 2.6320939319321657e-05, "loss": 0.5359, "step": 490 }, { "epoch": 1.348773841961853, "grad_norm": 1.21845543384552, "learning_rate": 2.623179753694573e-05, "loss": 0.4853, "step": 495 }, { "epoch": 1.3623978201634879, "grad_norm": 1.142262578010559, "learning_rate": 2.614174355196754e-05, "loss": 0.4993, "step": 500 }, { "epoch": 1.3760217983651226, "grad_norm": 1.037720799446106, "learning_rate": 2.6050784678219024e-05, "loss": 0.512, "step": 505 }, { "epoch": 1.3896457765667574, "grad_norm": 1.1667251586914062, "learning_rate": 2.5958928303023634e-05, "loss": 0.4788, "step": 510 }, { "epoch": 1.4032697547683923, "grad_norm": 1.046761393547058, "learning_rate": 2.5866181886596367e-05, "loss": 0.4867, "step": 515 }, { "epoch": 1.4168937329700273, "grad_norm": 1.1029340028762817, "learning_rate": 2.5772552961437893e-05, "loss": 0.4799, "step": 520 }, { "epoch": 1.430517711171662, "grad_norm": 1.1235623359680176, "learning_rate": 2.5678049131722772e-05, "loss": 0.4752, "step": 525 }, { "epoch": 1.444141689373297, "grad_norm": 1.064278483390808, "learning_rate": 2.5582678072681903e-05, "loss": 0.5173, "step": 530 }, { "epoch": 1.457765667574932, "grad_norm": 1.13005793094635, "learning_rate": 2.5486447529979136e-05, "loss": 0.4451, "step": 535 }, { "epoch": 1.4713896457765667, "grad_norm": 1.0552195310592651, "learning_rate": 2.5389365319082226e-05, "loss": 0.4595, "step": 540 }, { "epoch": 1.4850136239782017, "grad_norm": 1.1078029870986938, "learning_rate": 2.5291439324628084e-05, "loss": 0.4693, "step": 545 }, { "epoch": 1.4986376021798364, "grad_norm": 1.090710997581482, "learning_rate": 2.5192677499782413e-05, "loss": 0.4537, "step": 550 }, { "epoch": 1.5122615803814714, "grad_norm": 1.037945032119751, "learning_rate": 2.5093087865593784e-05, "loss": 0.4556, "step": 555 }, { "epoch": 1.5258855585831061, "grad_norm": 1.1792460680007935, "learning_rate": 2.499267851034221e-05, "loss": 0.4734, "step": 560 }, { "epoch": 1.5395095367847411, "grad_norm": 1.0996177196502686, "learning_rate": 2.4891457588882238e-05, "loss": 0.4444, "step": 565 }, { "epoch": 1.553133514986376, "grad_norm": 1.13473641872406, "learning_rate": 2.478943332198062e-05, "loss": 0.4513, "step": 570 }, { "epoch": 1.5667574931880108, "grad_norm": 1.212607979774475, "learning_rate": 2.468661399564871e-05, "loss": 0.4506, "step": 575 }, { "epoch": 1.5803814713896458, "grad_norm": 1.0172079801559448, "learning_rate": 2.458300796046946e-05, "loss": 0.4238, "step": 580 }, { "epoch": 1.5940054495912808, "grad_norm": 1.3280726671218872, "learning_rate": 2.4478623630919236e-05, "loss": 0.4419, "step": 585 }, { "epoch": 1.6076294277929155, "grad_norm": 1.1108782291412354, "learning_rate": 2.437346948468441e-05, "loss": 0.3942, "step": 590 }, { "epoch": 1.6212534059945503, "grad_norm": 1.157791256904602, "learning_rate": 2.4267554061972873e-05, "loss": 0.397, "step": 595 }, { "epoch": 1.6348773841961854, "grad_norm": 1.1420488357543945, "learning_rate": 2.416088596482039e-05, "loss": 0.4849, "step": 600 }, { "epoch": 1.6485013623978202, "grad_norm": 1.264007329940796, "learning_rate": 2.405347385639202e-05, "loss": 0.4365, "step": 605 }, { "epoch": 1.662125340599455, "grad_norm": 1.0671672821044922, "learning_rate": 2.394532646027848e-05, "loss": 0.4259, "step": 610 }, { "epoch": 1.67574931880109, "grad_norm": 1.245835781097412, "learning_rate": 2.3836452559787673e-05, "loss": 0.4078, "step": 615 }, { "epoch": 1.6893732970027249, "grad_norm": 1.1798008680343628, "learning_rate": 2.3726860997231356e-05, "loss": 0.4319, "step": 620 }, { "epoch": 1.7029972752043596, "grad_norm": 1.3666014671325684, "learning_rate": 2.3616560673206984e-05, "loss": 0.4645, "step": 625 }, { "epoch": 1.7166212534059946, "grad_norm": 1.0500986576080322, "learning_rate": 2.3505560545874843e-05, "loss": 0.4543, "step": 630 }, { "epoch": 1.7302452316076296, "grad_norm": 1.4002113342285156, "learning_rate": 2.3393869630230495e-05, "loss": 0.3978, "step": 635 }, { "epoch": 1.7438692098092643, "grad_norm": 1.170268177986145, "learning_rate": 2.3281496997372625e-05, "loss": 0.4355, "step": 640 }, { "epoch": 1.757493188010899, "grad_norm": 1.0691126585006714, "learning_rate": 2.316845177376633e-05, "loss": 0.4097, "step": 645 }, { "epoch": 1.771117166212534, "grad_norm": 1.1894110441207886, "learning_rate": 2.3054743140501877e-05, "loss": 0.4025, "step": 650 }, { "epoch": 1.784741144414169, "grad_norm": 1.0559704303741455, "learning_rate": 2.2940380332549086e-05, "loss": 0.4237, "step": 655 }, { "epoch": 1.7983651226158037, "grad_norm": 1.1340513229370117, "learning_rate": 2.282537263800727e-05, "loss": 0.3523, "step": 660 }, { "epoch": 1.8119891008174387, "grad_norm": 1.1120586395263672, "learning_rate": 2.2709729397350904e-05, "loss": 0.4037, "step": 665 }, { "epoch": 1.8256130790190737, "grad_norm": 1.1562973260879517, "learning_rate": 2.2593460002671024e-05, "loss": 0.3839, "step": 670 }, { "epoch": 1.8392370572207084, "grad_norm": 1.2246955633163452, "learning_rate": 2.247657389691247e-05, "loss": 0.4188, "step": 675 }, { "epoch": 1.8528610354223434, "grad_norm": 0.9505524635314941, "learning_rate": 2.2359080573106913e-05, "loss": 0.3348, "step": 680 }, { "epoch": 1.8664850136239783, "grad_norm": 1.161018967628479, "learning_rate": 2.2240989573601902e-05, "loss": 0.3978, "step": 685 }, { "epoch": 1.880108991825613, "grad_norm": 1.076042890548706, "learning_rate": 2.212231048928587e-05, "loss": 0.3608, "step": 690 }, { "epoch": 1.8937329700272478, "grad_norm": 1.248965859413147, "learning_rate": 2.2003052958809185e-05, "loss": 0.3621, "step": 695 }, { "epoch": 1.9073569482288828, "grad_norm": 1.0826005935668945, "learning_rate": 2.1883226667801374e-05, "loss": 0.3733, "step": 700 }, { "epoch": 1.9209809264305178, "grad_norm": 1.1231858730316162, "learning_rate": 2.1762841348084425e-05, "loss": 0.3916, "step": 705 }, { "epoch": 1.9346049046321525, "grad_norm": 1.1095854043960571, "learning_rate": 2.164190677688248e-05, "loss": 0.3406, "step": 710 }, { "epoch": 1.9482288828337875, "grad_norm": 1.0947611331939697, "learning_rate": 2.1520432776027723e-05, "loss": 0.3511, "step": 715 }, { "epoch": 1.9618528610354224, "grad_norm": 1.035023808479309, "learning_rate": 2.1398429211162706e-05, "loss": 0.3894, "step": 720 }, { "epoch": 1.9754768392370572, "grad_norm": 1.166717529296875, "learning_rate": 2.127590599093909e-05, "loss": 0.3692, "step": 725 }, { "epoch": 1.989100817438692, "grad_norm": 1.1927279233932495, "learning_rate": 2.1152873066212913e-05, "loss": 0.3278, "step": 730 } ], "logging_steps": 5, "max_steps": 1835, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0749654008583946e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }