{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500.0, "global_step": 1690, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011841326228537595, "grad_norm": 219.0, "learning_rate": 3.921568627450981e-07, "loss": 1.2013294696807861, "step": 1, "token_acc": 0.8954758190327613 }, { "epoch": 0.011841326228537596, "grad_norm": 50.0, "learning_rate": 3.92156862745098e-06, "loss": 0.8140333493550619, "step": 10, "token_acc": 0.9158278375564041 }, { "epoch": 0.023682652457075192, "grad_norm": 15.0625, "learning_rate": 7.84313725490196e-06, "loss": 0.25884711742401123, "step": 20, "token_acc": 0.9479379018347185 }, { "epoch": 0.035523978685612786, "grad_norm": 4.78125, "learning_rate": 1.1764705882352942e-05, "loss": 0.13083882331848146, "step": 30, "token_acc": 0.9458783043954325 }, { "epoch": 0.047365304914150384, "grad_norm": 4.65625, "learning_rate": 1.568627450980392e-05, "loss": 0.11950666904449463, "step": 40, "token_acc": 0.953747256193164 }, { "epoch": 0.05920663114268798, "grad_norm": 3.921875, "learning_rate": 1.9607843137254903e-05, "loss": 0.10617152452468873, "step": 50, "token_acc": 0.9590588235294117 }, { "epoch": 0.07104795737122557, "grad_norm": 2.25, "learning_rate": 1.9998512057697314e-05, "loss": 0.10807085037231445, "step": 60, "token_acc": 0.9589895524715422 }, { "epoch": 0.08288928359976318, "grad_norm": 2.375, "learning_rate": 1.9993369121919753e-05, "loss": 0.10784111022949219, "step": 70, "token_acc": 0.9572008747266479 }, { "epoch": 0.09473060982830077, "grad_norm": 2.9375, "learning_rate": 1.998455471202776e-05, "loss": 0.10691288709640503, "step": 80, "token_acc": 0.9570600219401347 }, { "epoch": 0.10657193605683836, "grad_norm": 1.578125, "learning_rate": 1.9972072066356417e-05, "loss": 0.11526317596435547, "step": 90, "token_acc": 0.9496324104489285 }, { "epoch": 0.11841326228537596, "grad_norm": 3.09375, "learning_rate": 1.995592577091769e-05, "loss": 0.10205183029174805, "step": 100, "token_acc": 0.9572502348888193 }, { "epoch": 0.13025458851391356, "grad_norm": 3.1875, "learning_rate": 1.9936121757715598e-05, "loss": 0.10735645294189453, "step": 110, "token_acc": 0.9611528822055138 }, { "epoch": 0.14209591474245115, "grad_norm": 1.796875, "learning_rate": 1.991266730256683e-05, "loss": 0.10336060523986816, "step": 120, "token_acc": 0.9576736165543188 }, { "epoch": 0.15393724097098876, "grad_norm": 2.078125, "learning_rate": 1.9885571022427676e-05, "loss": 0.09967223405838013, "step": 130, "token_acc": 0.959868317918169 }, { "epoch": 0.16577856719952636, "grad_norm": 1.140625, "learning_rate": 1.9854842872228247e-05, "loss": 0.09939006567001343, "step": 140, "token_acc": 0.9603572547790661 }, { "epoch": 0.17761989342806395, "grad_norm": 1.2890625, "learning_rate": 1.98204941412151e-05, "loss": 0.0902411937713623, "step": 150, "token_acc": 0.9632122730118973 }, { "epoch": 0.18946121965660154, "grad_norm": 2.984375, "learning_rate": 1.9782537448803707e-05, "loss": 0.10655044317245484, "step": 160, "token_acc": 0.9559263340154258 }, { "epoch": 0.20130254588513913, "grad_norm": 2.65625, "learning_rate": 1.9740986739942146e-05, "loss": 0.10265426635742188, "step": 170, "token_acc": 0.9573600877880546 }, { "epoch": 0.21314387211367672, "grad_norm": 1.3046875, "learning_rate": 1.9695857279987897e-05, "loss": 0.09765652418136597, "step": 180, "token_acc": 0.9597620165962111 }, { "epoch": 0.22498519834221434, "grad_norm": 1.4140625, "learning_rate": 1.9647165649099465e-05, "loss": 0.09450024366378784, "step": 190, "token_acc": 0.963653454488485 }, { "epoch": 0.23682652457075193, "grad_norm": 1.3046875, "learning_rate": 1.9594929736144978e-05, "loss": 0.10988011360168456, "step": 200, "token_acc": 0.9540840231141652 }, { "epoch": 0.24866785079928952, "grad_norm": 1.828125, "learning_rate": 1.9539168732129977e-05, "loss": 0.09797856211662292, "step": 210, "token_acc": 0.9617614793919448 }, { "epoch": 0.2605091770278271, "grad_norm": 4.15625, "learning_rate": 1.9479903123146835e-05, "loss": 0.09065916538238525, "step": 220, "token_acc": 0.9650382394256282 }, { "epoch": 0.27235050325636473, "grad_norm": 1.9765625, "learning_rate": 1.9417154682848314e-05, "loss": 0.10036060810089112, "step": 230, "token_acc": 0.961611076148521 }, { "epoch": 0.2841918294849023, "grad_norm": 2.234375, "learning_rate": 1.935094646444815e-05, "loss": 0.09578206539154052, "step": 240, "token_acc": 0.9624119028974158 }, { "epoch": 0.2960331557134399, "grad_norm": 2.171875, "learning_rate": 1.928130279225149e-05, "loss": 0.09263083934783936, "step": 250, "token_acc": 0.963653454488485 }, { "epoch": 0.30787448194197753, "grad_norm": 1.6640625, "learning_rate": 1.920824925271838e-05, "loss": 0.09710139036178589, "step": 260, "token_acc": 0.9595754643358826 }, { "epoch": 0.3197158081705151, "grad_norm": 3.40625, "learning_rate": 1.9131812685063512e-05, "loss": 0.10172030925750733, "step": 270, "token_acc": 0.957680250783699 }, { "epoch": 0.3315571343990527, "grad_norm": 1.9609375, "learning_rate": 1.9052021171395742e-05, "loss": 0.10712752342224122, "step": 280, "token_acc": 0.9577840552416823 }, { "epoch": 0.3433984606275903, "grad_norm": 1.0703125, "learning_rate": 1.896890402640098e-05, "loss": 0.09744402766227722, "step": 290, "token_acc": 0.9596054485674025 }, { "epoch": 0.3552397868561279, "grad_norm": 2.359375, "learning_rate": 1.8882491786572226e-05, "loss": 0.09446089267730713, "step": 300, "token_acc": 0.9636648394675019 }, { "epoch": 0.36708111308466546, "grad_norm": 1.828125, "learning_rate": 1.8792816198990768e-05, "loss": 0.09970860481262207, "step": 310, "token_acc": 0.9583398590446358 }, { "epoch": 0.3789224393132031, "grad_norm": 1.5390625, "learning_rate": 1.8699910209662536e-05, "loss": 0.09670261144638062, "step": 320, "token_acc": 0.9606150949317432 }, { "epoch": 0.3907637655417407, "grad_norm": 2.828125, "learning_rate": 1.8603807951414093e-05, "loss": 0.09714120626449585, "step": 330, "token_acc": 0.9602938877598874 }, { "epoch": 0.40260509177027826, "grad_norm": 1.890625, "learning_rate": 1.850454473135249e-05, "loss": 0.09373531341552735, "step": 340, "token_acc": 0.9619166536600593 }, { "epoch": 0.4144464179988159, "grad_norm": 2.25, "learning_rate": 1.8402157017893795e-05, "loss": 0.09355499744415283, "step": 350, "token_acc": 0.9667919799498746 }, { "epoch": 0.42628774422735344, "grad_norm": 0.8828125, "learning_rate": 1.829668242736489e-05, "loss": 0.08944010734558105, "step": 360, "token_acc": 0.9638327853452325 }, { "epoch": 0.43812907045589106, "grad_norm": 1.2265625, "learning_rate": 1.8188159710183595e-05, "loss": 0.09383893013000488, "step": 370, "token_acc": 0.9663642052565707 }, { "epoch": 0.4499703966844287, "grad_norm": 2.953125, "learning_rate": 1.807662873662209e-05, "loss": 0.09152829647064209, "step": 380, "token_acc": 0.9641403069213905 }, { "epoch": 0.46181172291296624, "grad_norm": 1.6953125, "learning_rate": 1.796213048215896e-05, "loss": 0.10058202743530273, "step": 390, "token_acc": 0.961363279409455 }, { "epoch": 0.47365304914150386, "grad_norm": 1.7421875, "learning_rate": 1.7844707012425155e-05, "loss": 0.0878696620464325, "step": 400, "token_acc": 0.9662956576265872 }, { "epoch": 0.4854943753700414, "grad_norm": 1.3828125, "learning_rate": 1.772440146774945e-05, "loss": 0.09355847835540772, "step": 410, "token_acc": 0.9618928627205997 }, { "epoch": 0.49733570159857904, "grad_norm": 1.3828125, "learning_rate": 1.7601258047309096e-05, "loss": 0.09457954168319702, "step": 420, "token_acc": 0.9631430363864492 }, { "epoch": 0.5091770278271166, "grad_norm": 1.0234375, "learning_rate": 1.7475321992891417e-05, "loss": 0.09055821895599366, "step": 430, "token_acc": 0.9654251139399654 }, { "epoch": 0.5210183540556542, "grad_norm": 1.9921875, "learning_rate": 1.73466395722724e-05, "loss": 0.09674708843231201, "step": 440, "token_acc": 0.9611041405269761 }, { "epoch": 0.5328596802841918, "grad_norm": 1.671875, "learning_rate": 1.7215258062218323e-05, "loss": 0.10127317905426025, "step": 450, "token_acc": 0.9612791973663584 }, { "epoch": 0.5447010065127295, "grad_norm": 2.28125, "learning_rate": 1.708122573111669e-05, "loss": 0.08792918920516968, "step": 460, "token_acc": 0.9650962591954922 }, { "epoch": 0.5565423327412671, "grad_norm": 2.171875, "learning_rate": 1.6944591821242867e-05, "loss": 0.09947954416275025, "step": 470, "token_acc": 0.9605057758351545 }, { "epoch": 0.5683836589698046, "grad_norm": 1.46875, "learning_rate": 1.680540653066891e-05, "loss": 0.0963528037071228, "step": 480, "token_acc": 0.9614842649131048 }, { "epoch": 0.5802249851983422, "grad_norm": 1.1015625, "learning_rate": 1.6663720994821246e-05, "loss": 0.0961789608001709, "step": 490, "token_acc": 0.9619599248591109 }, { "epoch": 0.5920663114268798, "grad_norm": 2.109375, "learning_rate": 1.651958726769396e-05, "loss": 0.090640389919281, "step": 500, "token_acc": 0.963166144200627 }, { "epoch": 0.6039076376554174, "grad_norm": 0.9140625, "learning_rate": 1.6373058302724655e-05, "loss": 0.08862148523330689, "step": 510, "token_acc": 0.9642521166509878 }, { "epoch": 0.6157489638839551, "grad_norm": 1.7421875, "learning_rate": 1.6224187933339808e-05, "loss": 0.08748204708099365, "step": 520, "token_acc": 0.9620749098887321 }, { "epoch": 0.6275902901124926, "grad_norm": 1.5546875, "learning_rate": 1.6073030853176862e-05, "loss": 0.09252775907516479, "step": 530, "token_acc": 0.9616528408201597 }, { "epoch": 0.6394316163410302, "grad_norm": 1.296875, "learning_rate": 1.5919642595990275e-05, "loss": 0.08904544115066529, "step": 540, "token_acc": 0.9668594653743943 }, { "epoch": 0.6512729425695678, "grad_norm": 1.8203125, "learning_rate": 1.5764079515248922e-05, "loss": 0.08241082429885864, "step": 550, "token_acc": 0.9658628249295333 }, { "epoch": 0.6631142687981054, "grad_norm": 1.6015625, "learning_rate": 1.5606398763432318e-05, "loss": 0.0839945912361145, "step": 560, "token_acc": 0.9672131147540983 }, { "epoch": 0.6749555950266429, "grad_norm": 1.5, "learning_rate": 1.5446658271033336e-05, "loss": 0.09018040895462036, "step": 570, "token_acc": 0.9658574784651527 }, { "epoch": 0.6867969212551805, "grad_norm": 1.40625, "learning_rate": 1.528491672527504e-05, "loss": 0.08107317686080932, "step": 580, "token_acc": 0.9681967726774244 }, { "epoch": 0.6986382474837182, "grad_norm": 1.453125, "learning_rate": 1.512123354854955e-05, "loss": 0.08852046132087707, "step": 590, "token_acc": 0.9663957486714598 }, { "epoch": 0.7104795737122558, "grad_norm": 1.1875, "learning_rate": 1.4955668876586763e-05, "loss": 0.07870029807090759, "step": 600, "token_acc": 0.9683862849952816 }, { "epoch": 0.7223208999407934, "grad_norm": 1.4609375, "learning_rate": 1.4788283536361036e-05, "loss": 0.0841621994972229, "step": 610, "token_acc": 0.9685781618224666 }, { "epoch": 0.7341622261693309, "grad_norm": 1.6171875, "learning_rate": 1.4619139023743916e-05, "loss": 0.08564043045043945, "step": 620, "token_acc": 0.9654417513682565 }, { "epoch": 0.7460035523978685, "grad_norm": 1.203125, "learning_rate": 1.4448297480911086e-05, "loss": 0.09037463665008545, "step": 630, "token_acc": 0.963363081258807 }, { "epoch": 0.7578448786264061, "grad_norm": 0.9609375, "learning_rate": 1.4275821673511903e-05, "loss": 0.09671027660369873, "step": 640, "token_acc": 0.959305055564251 }, { "epoch": 0.7696862048549438, "grad_norm": 1.5078125, "learning_rate": 1.4101774967609854e-05, "loss": 0.09160791039466858, "step": 650, "token_acc": 0.9654741446648961 }, { "epoch": 0.7815275310834814, "grad_norm": 1.421875, "learning_rate": 1.392622130640243e-05, "loss": 0.095394766330719, "step": 660, "token_acc": 0.9619956208945887 }, { "epoch": 0.7933688573120189, "grad_norm": 1.78125, "learning_rate": 1.3749225186728991e-05, "loss": 0.08577767610549927, "step": 670, "token_acc": 0.966750313676286 }, { "epoch": 0.8052101835405565, "grad_norm": 2.0625, "learning_rate": 1.357085163537517e-05, "loss": 0.09209753274917602, "step": 680, "token_acc": 0.9620608899297424 }, { "epoch": 0.8170515097690941, "grad_norm": 2.390625, "learning_rate": 1.3391166185182651e-05, "loss": 0.0821334183216095, "step": 690, "token_acc": 0.9690383111806099 }, { "epoch": 0.8288928359976317, "grad_norm": 1.2109375, "learning_rate": 1.3210234850972966e-05, "loss": 0.09119898080825806, "step": 700, "token_acc": 0.9637817497648166 }, { "epoch": 0.8407341622261694, "grad_norm": 1.8671875, "learning_rate": 1.3028124105294255e-05, "loss": 0.0862145483493805, "step": 710, "token_acc": 0.9672259683236631 }, { "epoch": 0.8525754884547069, "grad_norm": 1.34375, "learning_rate": 1.2844900853999847e-05, "loss": 0.08162487745285034, "step": 720, "token_acc": 0.9676405906377631 }, { "epoch": 0.8644168146832445, "grad_norm": 1.7890625, "learning_rate": 1.2660632411667648e-05, "loss": 0.08193669319152833, "step": 730, "token_acc": 0.9653278945716975 }, { "epoch": 0.8762581409117821, "grad_norm": 1.6015625, "learning_rate": 1.2475386476869364e-05, "loss": 0.09078997969627381, "step": 740, "token_acc": 0.9639045825486503 }, { "epoch": 0.8880994671403197, "grad_norm": 1.671875, "learning_rate": 1.2289231107298672e-05, "loss": 0.09944761395454407, "step": 750, "token_acc": 0.9596546310832025 }, { "epoch": 0.8999407933688574, "grad_norm": 1.1171875, "learning_rate": 1.2102234694767401e-05, "loss": 0.0917394757270813, "step": 760, "token_acc": 0.9615505335844319 }, { "epoch": 0.9117821195973949, "grad_norm": 1.609375, "learning_rate": 1.1914465940079036e-05, "loss": 0.08656581044197083, "step": 770, "token_acc": 0.9671951028096061 }, { "epoch": 0.9236234458259325, "grad_norm": 1.28125, "learning_rate": 1.1725993827788625e-05, "loss": 0.08798307180404663, "step": 780, "token_acc": 0.9632065132299984 }, { "epoch": 0.9354647720544701, "grad_norm": 1.4765625, "learning_rate": 1.1536887600858487e-05, "loss": 0.08726394176483154, "step": 790, "token_acc": 0.9665934755332497 }, { "epoch": 0.9473060982830077, "grad_norm": 0.89453125, "learning_rate": 1.134721673521897e-05, "loss": 0.0808544933795929, "step": 800, "token_acc": 0.9646211646837821 }, { "epoch": 0.9591474245115453, "grad_norm": 1.3671875, "learning_rate": 1.1157050914243614e-05, "loss": 0.08560880422592163, "step": 810, "token_acc": 0.9667189952904238 }, { "epoch": 0.9709887507400828, "grad_norm": 1.3359375, "learning_rate": 1.0966460003148115e-05, "loss": 0.0828078031539917, "step": 820, "token_acc": 0.9668499607227022 }, { "epoch": 0.9828300769686205, "grad_norm": 1.921875, "learning_rate": 1.0775514023322444e-05, "loss": 0.09345529675483703, "step": 830, "token_acc": 0.9608886107634543 }, { "epoch": 0.9946714031971581, "grad_norm": 1.2578125, "learning_rate": 1.058428312660566e-05, "loss": 0.08514059782028198, "step": 840, "token_acc": 0.9657169693174703 }, { "epoch": 1.0059206631142688, "grad_norm": 1.015625, "learning_rate": 1.0392837569512715e-05, "loss": 0.08234425187110901, "step": 850, "token_acc": 0.9645318540931249 }, { "epoch": 1.0177619893428065, "grad_norm": 1.8359375, "learning_rate": 1.020124768742286e-05, "loss": 0.07545605897903443, "step": 860, "token_acc": 0.9709147771696638 }, { "epoch": 1.029603315571344, "grad_norm": 1.1640625, "learning_rate": 1.0009583868739053e-05, "loss": 0.07274842262268066, "step": 870, "token_acc": 0.9721873035826524 }, { "epoch": 1.0414446417998815, "grad_norm": 1.3359375, "learning_rate": 9.817916529027898e-06, "loss": 0.07491129636764526, "step": 880, "token_acc": 0.9713480507280413 }, { "epoch": 1.0532859680284192, "grad_norm": 1.15625, "learning_rate": 9.626316085149588e-06, "loss": 0.07744649052619934, "step": 890, "token_acc": 0.9709102283390679 }, { "epoch": 1.0651272942569567, "grad_norm": 1.1953125, "learning_rate": 9.43485292938739e-06, "loss": 0.07794994711875916, "step": 900, "token_acc": 0.970647931303669 }, { "epoch": 1.0769686204854945, "grad_norm": 1.0234375, "learning_rate": 9.243597403586145e-06, "loss": 0.0824435293674469, "step": 910, "token_acc": 0.9683633516053249 }, { "epoch": 1.088809946714032, "grad_norm": 0.91796875, "learning_rate": 9.052619773309318e-06, "loss": 0.07359167337417602, "step": 920, "token_acc": 0.9754111198120595 }, { "epoch": 1.1006512729425695, "grad_norm": 1.0625, "learning_rate": 8.861990202024046e-06, "loss": 0.07806094288825989, "step": 930, "token_acc": 0.9696922355881894 }, { "epoch": 1.1124925991711072, "grad_norm": 2.0, "learning_rate": 8.67177872532372e-06, "loss": 0.07662028670310975, "step": 940, "token_acc": 0.9707960433349034 }, { "epoch": 1.1243339253996447, "grad_norm": 1.5, "learning_rate": 8.482055225197532e-06, "loss": 0.07939339876174926, "step": 950, "token_acc": 0.9700156985871271 }, { "epoch": 1.1361752516281824, "grad_norm": 2.046875, "learning_rate": 8.292889404356461e-06, "loss": 0.07178534269332885, "step": 960, "token_acc": 0.9713704630788486 }, { "epoch": 1.14801657785672, "grad_norm": 1.453125, "learning_rate": 8.104350760625122e-06, "loss": 0.07578552961349487, "step": 970, "token_acc": 0.9700093720712277 }, { "epoch": 1.1598579040852575, "grad_norm": 1.3828125, "learning_rate": 7.916508561408892e-06, "loss": 0.07551709413528443, "step": 980, "token_acc": 0.9736513875896476 }, { "epoch": 1.1716992303137952, "grad_norm": 1.0625, "learning_rate": 7.729431818245678e-06, "loss": 0.06962672472000123, "step": 990, "token_acc": 0.9749726263100266 }, { "epoch": 1.1835405565423327, "grad_norm": 1.546875, "learning_rate": 7.543189261451716e-06, "loss": 0.07484488487243653, "step": 1000, "token_acc": 0.9705790297339593 }, { "epoch": 1.1953818827708704, "grad_norm": 1.328125, "learning_rate": 7.35784931487064e-06, "loss": 0.07622098922729492, "step": 1010, "token_acc": 0.970372680492749 }, { "epoch": 1.207223208999408, "grad_norm": 2.390625, "learning_rate": 7.173480070735209e-06, "loss": 0.07499848604202271, "step": 1020, "token_acc": 0.9686574146265399 }, { "epoch": 1.2190645352279454, "grad_norm": 1.2109375, "learning_rate": 6.990149264650814e-06, "loss": 0.07203071117401123, "step": 1030, "token_acc": 0.972574831531108 }, { "epoch": 1.2309058614564832, "grad_norm": 1.375, "learning_rate": 6.807924250710019e-06, "loss": 0.07002646923065185, "step": 1040, "token_acc": 0.9741379310344828 }, { "epoch": 1.2427471876850207, "grad_norm": 1.328125, "learning_rate": 6.626871976747289e-06, "loss": 0.07481561303138733, "step": 1050, "token_acc": 0.9709576138147566 }, { "epoch": 1.2545885139135584, "grad_norm": 1.2734375, "learning_rate": 6.44705895974294e-06, "loss": 0.06933027505874634, "step": 1060, "token_acc": 0.9734443746071653 }, { "epoch": 1.266429840142096, "grad_norm": 1.5625, "learning_rate": 6.268551261385414e-06, "loss": 0.0675657868385315, "step": 1070, "token_acc": 0.9746320075164422 }, { "epoch": 1.2782711663706334, "grad_norm": 1.5546875, "learning_rate": 6.091414463800789e-06, "loss": 0.07069060802459717, "step": 1080, "token_acc": 0.973655323819978 }, { "epoch": 1.2901124925991712, "grad_norm": 1.125, "learning_rate": 5.915713645458514e-06, "loss": 0.07225958108901978, "step": 1090, "token_acc": 0.9728201099764336 }, { "epoch": 1.3019538188277087, "grad_norm": 1.6171875, "learning_rate": 5.741513357262147e-06, "loss": 0.07490838170051575, "step": 1100, "token_acc": 0.970542149796302 }, { "epoch": 1.3137951450562464, "grad_norm": 1.3359375, "learning_rate": 5.568877598833935e-06, "loss": 0.07528679370880127, "step": 1110, "token_acc": 0.970496409615985 }, { "epoch": 1.325636471284784, "grad_norm": 1.453125, "learning_rate": 5.3978697950019484e-06, "loss": 0.07579593658447266, "step": 1120, "token_acc": 0.9716936625255543 }, { "epoch": 1.3374777975133214, "grad_norm": 1.6640625, "learning_rate": 5.228552772498335e-06, "loss": 0.06750929355621338, "step": 1130, "token_acc": 0.9741029641185648 }, { "epoch": 1.3493191237418591, "grad_norm": 1.6875, "learning_rate": 5.060988736877366e-06, "loss": 0.07841302156448364, "step": 1140, "token_acc": 0.9696400625978091 }, { "epoch": 1.3611604499703966, "grad_norm": 1.3671875, "learning_rate": 4.895239249661662e-06, "loss": 0.08451638221740723, "step": 1150, "token_acc": 0.967736883320282 }, { "epoch": 1.3730017761989344, "grad_norm": 1.0234375, "learning_rate": 4.731365205725056e-06, "loss": 0.074539315700531, "step": 1160, "token_acc": 0.9703715315880233 }, { "epoch": 1.3848431024274719, "grad_norm": 1.3359375, "learning_rate": 4.569426810920347e-06, "loss": 0.068775475025177, "step": 1170, "token_acc": 0.9716523101018011 }, { "epoch": 1.3966844286560094, "grad_norm": 1.2265625, "learning_rate": 4.409483559960221e-06, "loss": 0.07150940299034118, "step": 1180, "token_acc": 0.9737005913476502 }, { "epoch": 1.4085257548845471, "grad_norm": 1.890625, "learning_rate": 4.251594214559416e-06, "loss": 0.08267040252685547, "step": 1190, "token_acc": 0.9680350987151363 }, { "epoch": 1.4203670811130846, "grad_norm": 1.46875, "learning_rate": 4.095816781846219e-06, "loss": 0.0697063684463501, "step": 1200, "token_acc": 0.9751095804633688 }, { "epoch": 1.4322084073416224, "grad_norm": 1.203125, "learning_rate": 3.942208493051137e-06, "loss": 0.07361778020858764, "step": 1210, "token_acc": 0.9734901960784313 }, { "epoch": 1.4440497335701599, "grad_norm": 1.4609375, "learning_rate": 3.7908257824806814e-06, "loss": 0.07019197940826416, "step": 1220, "token_acc": 0.9710122218740207 }, { "epoch": 1.4558910597986974, "grad_norm": 1.5859375, "learning_rate": 3.6417242667838917e-06, "loss": 0.07444216012954712, "step": 1230, "token_acc": 0.9728040012503908 }, { "epoch": 1.467732386027235, "grad_norm": 1.4375, "learning_rate": 3.4949587245192983e-06, "loss": 0.06847925186157226, "step": 1240, "token_acc": 0.9746320075164422 }, { "epoch": 1.4795737122557726, "grad_norm": 1.625, "learning_rate": 3.3505830760297543e-06, "loss": 0.0696124255657196, "step": 1250, "token_acc": 0.9730534231552561 }, { "epoch": 1.4914150384843103, "grad_norm": 1.3203125, "learning_rate": 3.2086503636325895e-06, "loss": 0.07145707607269287, "step": 1260, "token_acc": 0.9749294891883422 }, { "epoch": 1.5032563647128478, "grad_norm": 1.8515625, "learning_rate": 3.069212732132345e-06, "loss": 0.07296675443649292, "step": 1270, "token_acc": 0.9725662329518734 }, { "epoch": 1.5150976909413854, "grad_norm": 2.625, "learning_rate": 2.9323214096632335e-06, "loss": 0.07637610435485839, "step": 1280, "token_acc": 0.9721566776781501 }, { "epoch": 1.526939017169923, "grad_norm": 1.4375, "learning_rate": 2.798026688868386e-06, "loss": 0.07028791308403015, "step": 1290, "token_acc": 0.9726801695713613 }, { "epoch": 1.5387803433984606, "grad_norm": 1.7578125, "learning_rate": 2.6663779084227926e-06, "loss": 0.0738570511341095, "step": 1300, "token_acc": 0.9717247879359096 }, { "epoch": 1.5506216696269983, "grad_norm": 2.046875, "learning_rate": 2.5374234349066985e-06, "loss": 0.07539566755294799, "step": 1310, "token_acc": 0.9680968096809681 }, { "epoch": 1.5624629958555358, "grad_norm": 1.09375, "learning_rate": 2.411210645036173e-06, "loss": 0.07291572093963623, "step": 1320, "token_acc": 0.972758405977584 }, { "epoch": 1.5743043220840733, "grad_norm": 1.6484375, "learning_rate": 2.2877859082573194e-06, "loss": 0.07078194618225098, "step": 1330, "token_acc": 0.9733229329173166 }, { "epoch": 1.586145648312611, "grad_norm": 1.53125, "learning_rate": 2.16719456971057e-06, "loss": 0.07727055549621582, "step": 1340, "token_acc": 0.9690154136520919 }, { "epoch": 1.5979869745411486, "grad_norm": 1.125, "learning_rate": 2.0494809335712697e-06, "loss": 0.06905415058135986, "step": 1350, "token_acc": 0.9750783699059561 }, { "epoch": 1.6098283007696863, "grad_norm": 1.8046875, "learning_rate": 1.9346882467727323e-06, "loss": 0.07434183359146118, "step": 1360, "token_acc": 0.9726091720143998 }, { "epoch": 1.6216696269982238, "grad_norm": 0.96875, "learning_rate": 1.8228586831177032e-06, "loss": 0.06618231534957886, "step": 1370, "token_acc": 0.9750900830330566 }, { "epoch": 1.6335109532267613, "grad_norm": 1.34375, "learning_rate": 1.7140333277840837e-06, "loss": 0.07258784770965576, "step": 1380, "token_acc": 0.9727699530516432 }, { "epoch": 1.6453522794552988, "grad_norm": 1.1875, "learning_rate": 1.6082521622306003e-06, "loss": 0.0752481460571289, "step": 1390, "token_acc": 0.9715364050951407 }, { "epoch": 1.6571936056838366, "grad_norm": 1.3984375, "learning_rate": 1.5055540495079802e-06, "loss": 0.06541621685028076, "step": 1400, "token_acc": 0.9767806714778788 }, { "epoch": 1.6690349319123743, "grad_norm": 1.90625, "learning_rate": 1.4059767199810125e-06, "loss": 0.0707894206047058, "step": 1410, "token_acc": 0.9731301068510371 }, { "epoch": 1.6808762581409118, "grad_norm": 1.3203125, "learning_rate": 1.3095567574667589e-06, "loss": 0.07458854913711548, "step": 1420, "token_acc": 0.9726630007855459 }, { "epoch": 1.6927175843694493, "grad_norm": 1.65625, "learning_rate": 1.216329585793975e-06, "loss": 0.06724110841751099, "step": 1430, "token_acc": 0.9734000938820216 }, { "epoch": 1.7045589105979868, "grad_norm": 1.46875, "learning_rate": 1.1263294557887216e-06, "loss": 0.07588486671447754, "step": 1440, "token_acc": 0.9710873664362036 }, { "epoch": 1.7164002368265245, "grad_norm": 2.046875, "learning_rate": 1.0395894326909163e-06, "loss": 0.07099611163139344, "step": 1450, "token_acc": 0.9723091364205256 }, { "epoch": 1.7282415630550623, "grad_norm": 1.921875, "learning_rate": 9.561413840064637e-07, "loss": 0.06974682807922364, "step": 1460, "token_acc": 0.9720609009574636 }, { "epoch": 1.7400828892835998, "grad_norm": 1.2578125, "learning_rate": 8.760159677994174e-07, "loss": 0.06880149841308594, "step": 1470, "token_acc": 0.9749019607843137 }, { "epoch": 1.7519242155121373, "grad_norm": 1.90625, "learning_rate": 7.992426214284787e-07, "loss": 0.07654795646667481, "step": 1480, "token_acc": 0.969967151572032 }, { "epoch": 1.7637655417406748, "grad_norm": 1.3671875, "learning_rate": 7.258495507319885e-07, "loss": 0.06865710020065308, "step": 1490, "token_acc": 0.9735068192506663 }, { "epoch": 1.7756068679692125, "grad_norm": 1.34375, "learning_rate": 6.558637196653372e-07, "loss": 0.06818960905075074, "step": 1500, "token_acc": 0.9739225484072455 }, { "epoch": 1.7874481941977503, "grad_norm": 1.7578125, "learning_rate": 5.893108403946634e-07, "loss": 0.07731307148933411, "step": 1510, "token_acc": 0.9705836332342357 }, { "epoch": 1.7992895204262878, "grad_norm": 1.1484375, "learning_rate": 5.262153638504286e-07, "loss": 0.07072955965995789, "step": 1520, "token_acc": 0.9747514596812372 }, { "epoch": 1.8111308466548253, "grad_norm": 1.40625, "learning_rate": 4.6660047074436945e-07, "loss": 0.07091631889343261, "step": 1530, "token_acc": 0.9746914544602406 }, { "epoch": 1.8229721728833628, "grad_norm": 1.6328125, "learning_rate": 4.10488063053105e-07, "loss": 0.062443327903747556, "step": 1540, "token_acc": 0.976577139287945 }, { "epoch": 1.8348134991119005, "grad_norm": 1.6015625, "learning_rate": 3.57898755971553e-07, "loss": 0.07588485479354859, "step": 1550, "token_acc": 0.973754100921731 }, { "epoch": 1.8466548253404382, "grad_norm": 1.4375, "learning_rate": 3.088518703390908e-07, "loss": 0.07371261715888977, "step": 1560, "token_acc": 0.9696590553644041 }, { "epoch": 1.8584961515689757, "grad_norm": 1.1484375, "learning_rate": 2.633654255412554e-07, "loss": 0.06826964616775513, "step": 1570, "token_acc": 0.9750783699059561 }, { "epoch": 1.8703374777975132, "grad_norm": 1.5078125, "learning_rate": 2.214561328895748e-07, "loss": 0.06952533721923829, "step": 1580, "token_acc": 0.9716478696741855 }, { "epoch": 1.8821788040260508, "grad_norm": 1.7890625, "learning_rate": 1.8313938948198884e-07, "loss": 0.07293472290039063, "step": 1590, "token_acc": 0.9714820009350164 }, { "epoch": 1.8940201302545885, "grad_norm": 1.625, "learning_rate": 1.484292725460934e-07, "loss": 0.07688854336738586, "step": 1600, "token_acc": 0.9702054257487847 }, { "epoch": 1.9058614564831262, "grad_norm": 1.3359375, "learning_rate": 1.173385342672917e-07, "loss": 0.07143334150314332, "step": 1610, "token_acc": 0.970491288651703 }, { "epoch": 1.9177027827116637, "grad_norm": 1.921875, "learning_rate": 8.987859710375524e-08, "loss": 0.081912100315094, "step": 1620, "token_acc": 0.9685150375939849 }, { "epoch": 1.9295441089402012, "grad_norm": 1.8203125, "learning_rate": 6.605954958991523e-08, "loss": 0.07874792218208312, "step": 1630, "token_acc": 0.9696588586700204 }, { "epoch": 1.9413854351687387, "grad_norm": 1.5546875, "learning_rate": 4.5890142630027336e-08, "loss": 0.0735186517238617, "step": 1640, "token_acc": 0.9709894934922377 }, { "epoch": 1.9532267613972765, "grad_norm": 1.9765625, "learning_rate": 2.9377786283167897e-08, "loss": 0.0773587942123413, "step": 1650, "token_acc": 0.9692741809060982 }, { "epoch": 1.9650680876258142, "grad_norm": 1.796875, "learning_rate": 1.6528547040842724e-08, "loss": 0.06999446153640747, "step": 1660, "token_acc": 0.9743669896842764 }, { "epoch": 1.9769094138543517, "grad_norm": 1.5859375, "learning_rate": 7.3471455982143665e-09, "loss": 0.07299281358718872, "step": 1670, "token_acc": 0.9729179711959924 }, { "epoch": 1.9887507400828892, "grad_norm": 1.34375, "learning_rate": 1.8369551197594538e-09, "loss": 0.067216557264328, "step": 1680, "token_acc": 0.9730407523510972 }, { "epoch": 2.0, "grad_norm": 2.65625, "learning_rate": 0.0, "loss": 0.07405292987823486, "step": 1690, "token_acc": 0.9716838024608124 } ], "logging_steps": 10, "max_steps": 1690, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7276889282044232e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }