{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1690,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0011841326228537595,
"grad_norm": 219.0,
"learning_rate": 3.921568627450981e-07,
"loss": 1.2013294696807861,
"step": 1,
"token_acc": 0.8954758190327613
},
{
"epoch": 0.011841326228537596,
"grad_norm": 50.0,
"learning_rate": 3.92156862745098e-06,
"loss": 0.8140333493550619,
"step": 10,
"token_acc": 0.9158278375564041
},
{
"epoch": 0.023682652457075192,
"grad_norm": 15.0625,
"learning_rate": 7.84313725490196e-06,
"loss": 0.25884711742401123,
"step": 20,
"token_acc": 0.9479379018347185
},
{
"epoch": 0.035523978685612786,
"grad_norm": 4.78125,
"learning_rate": 1.1764705882352942e-05,
"loss": 0.13083882331848146,
"step": 30,
"token_acc": 0.9458783043954325
},
{
"epoch": 0.047365304914150384,
"grad_norm": 4.65625,
"learning_rate": 1.568627450980392e-05,
"loss": 0.11950666904449463,
"step": 40,
"token_acc": 0.953747256193164
},
{
"epoch": 0.05920663114268798,
"grad_norm": 3.921875,
"learning_rate": 1.9607843137254903e-05,
"loss": 0.10617152452468873,
"step": 50,
"token_acc": 0.9590588235294117
},
{
"epoch": 0.07104795737122557,
"grad_norm": 2.25,
"learning_rate": 1.9998512057697314e-05,
"loss": 0.10807085037231445,
"step": 60,
"token_acc": 0.9589895524715422
},
{
"epoch": 0.08288928359976318,
"grad_norm": 2.375,
"learning_rate": 1.9993369121919753e-05,
"loss": 0.10784111022949219,
"step": 70,
"token_acc": 0.9572008747266479
},
{
"epoch": 0.09473060982830077,
"grad_norm": 2.9375,
"learning_rate": 1.998455471202776e-05,
"loss": 0.10691288709640503,
"step": 80,
"token_acc": 0.9570600219401347
},
{
"epoch": 0.10657193605683836,
"grad_norm": 1.578125,
"learning_rate": 1.9972072066356417e-05,
"loss": 0.11526317596435547,
"step": 90,
"token_acc": 0.9496324104489285
},
{
"epoch": 0.11841326228537596,
"grad_norm": 3.09375,
"learning_rate": 1.995592577091769e-05,
"loss": 0.10205183029174805,
"step": 100,
"token_acc": 0.9572502348888193
},
{
"epoch": 0.13025458851391356,
"grad_norm": 3.1875,
"learning_rate": 1.9936121757715598e-05,
"loss": 0.10735645294189453,
"step": 110,
"token_acc": 0.9611528822055138
},
{
"epoch": 0.14209591474245115,
"grad_norm": 1.796875,
"learning_rate": 1.991266730256683e-05,
"loss": 0.10336060523986816,
"step": 120,
"token_acc": 0.9576736165543188
},
{
"epoch": 0.15393724097098876,
"grad_norm": 2.078125,
"learning_rate": 1.9885571022427676e-05,
"loss": 0.09967223405838013,
"step": 130,
"token_acc": 0.959868317918169
},
{
"epoch": 0.16577856719952636,
"grad_norm": 1.140625,
"learning_rate": 1.9854842872228247e-05,
"loss": 0.09939006567001343,
"step": 140,
"token_acc": 0.9603572547790661
},
{
"epoch": 0.17761989342806395,
"grad_norm": 1.2890625,
"learning_rate": 1.98204941412151e-05,
"loss": 0.0902411937713623,
"step": 150,
"token_acc": 0.9632122730118973
},
{
"epoch": 0.18946121965660154,
"grad_norm": 2.984375,
"learning_rate": 1.9782537448803707e-05,
"loss": 0.10655044317245484,
"step": 160,
"token_acc": 0.9559263340154258
},
{
"epoch": 0.20130254588513913,
"grad_norm": 2.65625,
"learning_rate": 1.9740986739942146e-05,
"loss": 0.10265426635742188,
"step": 170,
"token_acc": 0.9573600877880546
},
{
"epoch": 0.21314387211367672,
"grad_norm": 1.3046875,
"learning_rate": 1.9695857279987897e-05,
"loss": 0.09765652418136597,
"step": 180,
"token_acc": 0.9597620165962111
},
{
"epoch": 0.22498519834221434,
"grad_norm": 1.4140625,
"learning_rate": 1.9647165649099465e-05,
"loss": 0.09450024366378784,
"step": 190,
"token_acc": 0.963653454488485
},
{
"epoch": 0.23682652457075193,
"grad_norm": 1.3046875,
"learning_rate": 1.9594929736144978e-05,
"loss": 0.10988011360168456,
"step": 200,
"token_acc": 0.9540840231141652
},
{
"epoch": 0.24866785079928952,
"grad_norm": 1.828125,
"learning_rate": 1.9539168732129977e-05,
"loss": 0.09797856211662292,
"step": 210,
"token_acc": 0.9617614793919448
},
{
"epoch": 0.2605091770278271,
"grad_norm": 4.15625,
"learning_rate": 1.9479903123146835e-05,
"loss": 0.09065916538238525,
"step": 220,
"token_acc": 0.9650382394256282
},
{
"epoch": 0.27235050325636473,
"grad_norm": 1.9765625,
"learning_rate": 1.9417154682848314e-05,
"loss": 0.10036060810089112,
"step": 230,
"token_acc": 0.961611076148521
},
{
"epoch": 0.2841918294849023,
"grad_norm": 2.234375,
"learning_rate": 1.935094646444815e-05,
"loss": 0.09578206539154052,
"step": 240,
"token_acc": 0.9624119028974158
},
{
"epoch": 0.2960331557134399,
"grad_norm": 2.171875,
"learning_rate": 1.928130279225149e-05,
"loss": 0.09263083934783936,
"step": 250,
"token_acc": 0.963653454488485
},
{
"epoch": 0.30787448194197753,
"grad_norm": 1.6640625,
"learning_rate": 1.920824925271838e-05,
"loss": 0.09710139036178589,
"step": 260,
"token_acc": 0.9595754643358826
},
{
"epoch": 0.3197158081705151,
"grad_norm": 3.40625,
"learning_rate": 1.9131812685063512e-05,
"loss": 0.10172030925750733,
"step": 270,
"token_acc": 0.957680250783699
},
{
"epoch": 0.3315571343990527,
"grad_norm": 1.9609375,
"learning_rate": 1.9052021171395742e-05,
"loss": 0.10712752342224122,
"step": 280,
"token_acc": 0.9577840552416823
},
{
"epoch": 0.3433984606275903,
"grad_norm": 1.0703125,
"learning_rate": 1.896890402640098e-05,
"loss": 0.09744402766227722,
"step": 290,
"token_acc": 0.9596054485674025
},
{
"epoch": 0.3552397868561279,
"grad_norm": 2.359375,
"learning_rate": 1.8882491786572226e-05,
"loss": 0.09446089267730713,
"step": 300,
"token_acc": 0.9636648394675019
},
{
"epoch": 0.36708111308466546,
"grad_norm": 1.828125,
"learning_rate": 1.8792816198990768e-05,
"loss": 0.09970860481262207,
"step": 310,
"token_acc": 0.9583398590446358
},
{
"epoch": 0.3789224393132031,
"grad_norm": 1.5390625,
"learning_rate": 1.8699910209662536e-05,
"loss": 0.09670261144638062,
"step": 320,
"token_acc": 0.9606150949317432
},
{
"epoch": 0.3907637655417407,
"grad_norm": 2.828125,
"learning_rate": 1.8603807951414093e-05,
"loss": 0.09714120626449585,
"step": 330,
"token_acc": 0.9602938877598874
},
{
"epoch": 0.40260509177027826,
"grad_norm": 1.890625,
"learning_rate": 1.850454473135249e-05,
"loss": 0.09373531341552735,
"step": 340,
"token_acc": 0.9619166536600593
},
{
"epoch": 0.4144464179988159,
"grad_norm": 2.25,
"learning_rate": 1.8402157017893795e-05,
"loss": 0.09355499744415283,
"step": 350,
"token_acc": 0.9667919799498746
},
{
"epoch": 0.42628774422735344,
"grad_norm": 0.8828125,
"learning_rate": 1.829668242736489e-05,
"loss": 0.08944010734558105,
"step": 360,
"token_acc": 0.9638327853452325
},
{
"epoch": 0.43812907045589106,
"grad_norm": 1.2265625,
"learning_rate": 1.8188159710183595e-05,
"loss": 0.09383893013000488,
"step": 370,
"token_acc": 0.9663642052565707
},
{
"epoch": 0.4499703966844287,
"grad_norm": 2.953125,
"learning_rate": 1.807662873662209e-05,
"loss": 0.09152829647064209,
"step": 380,
"token_acc": 0.9641403069213905
},
{
"epoch": 0.46181172291296624,
"grad_norm": 1.6953125,
"learning_rate": 1.796213048215896e-05,
"loss": 0.10058202743530273,
"step": 390,
"token_acc": 0.961363279409455
},
{
"epoch": 0.47365304914150386,
"grad_norm": 1.7421875,
"learning_rate": 1.7844707012425155e-05,
"loss": 0.0878696620464325,
"step": 400,
"token_acc": 0.9662956576265872
},
{
"epoch": 0.4854943753700414,
"grad_norm": 1.3828125,
"learning_rate": 1.772440146774945e-05,
"loss": 0.09355847835540772,
"step": 410,
"token_acc": 0.9618928627205997
},
{
"epoch": 0.49733570159857904,
"grad_norm": 1.3828125,
"learning_rate": 1.7601258047309096e-05,
"loss": 0.09457954168319702,
"step": 420,
"token_acc": 0.9631430363864492
},
{
"epoch": 0.5091770278271166,
"grad_norm": 1.0234375,
"learning_rate": 1.7475321992891417e-05,
"loss": 0.09055821895599366,
"step": 430,
"token_acc": 0.9654251139399654
},
{
"epoch": 0.5210183540556542,
"grad_norm": 1.9921875,
"learning_rate": 1.73466395722724e-05,
"loss": 0.09674708843231201,
"step": 440,
"token_acc": 0.9611041405269761
},
{
"epoch": 0.5328596802841918,
"grad_norm": 1.671875,
"learning_rate": 1.7215258062218323e-05,
"loss": 0.10127317905426025,
"step": 450,
"token_acc": 0.9612791973663584
},
{
"epoch": 0.5447010065127295,
"grad_norm": 2.28125,
"learning_rate": 1.708122573111669e-05,
"loss": 0.08792918920516968,
"step": 460,
"token_acc": 0.9650962591954922
},
{
"epoch": 0.5565423327412671,
"grad_norm": 2.171875,
"learning_rate": 1.6944591821242867e-05,
"loss": 0.09947954416275025,
"step": 470,
"token_acc": 0.9605057758351545
},
{
"epoch": 0.5683836589698046,
"grad_norm": 1.46875,
"learning_rate": 1.680540653066891e-05,
"loss": 0.0963528037071228,
"step": 480,
"token_acc": 0.9614842649131048
},
{
"epoch": 0.5802249851983422,
"grad_norm": 1.1015625,
"learning_rate": 1.6663720994821246e-05,
"loss": 0.0961789608001709,
"step": 490,
"token_acc": 0.9619599248591109
},
{
"epoch": 0.5920663114268798,
"grad_norm": 2.109375,
"learning_rate": 1.651958726769396e-05,
"loss": 0.090640389919281,
"step": 500,
"token_acc": 0.963166144200627
},
{
"epoch": 0.6039076376554174,
"grad_norm": 0.9140625,
"learning_rate": 1.6373058302724655e-05,
"loss": 0.08862148523330689,
"step": 510,
"token_acc": 0.9642521166509878
},
{
"epoch": 0.6157489638839551,
"grad_norm": 1.7421875,
"learning_rate": 1.6224187933339808e-05,
"loss": 0.08748204708099365,
"step": 520,
"token_acc": 0.9620749098887321
},
{
"epoch": 0.6275902901124926,
"grad_norm": 1.5546875,
"learning_rate": 1.6073030853176862e-05,
"loss": 0.09252775907516479,
"step": 530,
"token_acc": 0.9616528408201597
},
{
"epoch": 0.6394316163410302,
"grad_norm": 1.296875,
"learning_rate": 1.5919642595990275e-05,
"loss": 0.08904544115066529,
"step": 540,
"token_acc": 0.9668594653743943
},
{
"epoch": 0.6512729425695678,
"grad_norm": 1.8203125,
"learning_rate": 1.5764079515248922e-05,
"loss": 0.08241082429885864,
"step": 550,
"token_acc": 0.9658628249295333
},
{
"epoch": 0.6631142687981054,
"grad_norm": 1.6015625,
"learning_rate": 1.5606398763432318e-05,
"loss": 0.0839945912361145,
"step": 560,
"token_acc": 0.9672131147540983
},
{
"epoch": 0.6749555950266429,
"grad_norm": 1.5,
"learning_rate": 1.5446658271033336e-05,
"loss": 0.09018040895462036,
"step": 570,
"token_acc": 0.9658574784651527
},
{
"epoch": 0.6867969212551805,
"grad_norm": 1.40625,
"learning_rate": 1.528491672527504e-05,
"loss": 0.08107317686080932,
"step": 580,
"token_acc": 0.9681967726774244
},
{
"epoch": 0.6986382474837182,
"grad_norm": 1.453125,
"learning_rate": 1.512123354854955e-05,
"loss": 0.08852046132087707,
"step": 590,
"token_acc": 0.9663957486714598
},
{
"epoch": 0.7104795737122558,
"grad_norm": 1.1875,
"learning_rate": 1.4955668876586763e-05,
"loss": 0.07870029807090759,
"step": 600,
"token_acc": 0.9683862849952816
},
{
"epoch": 0.7223208999407934,
"grad_norm": 1.4609375,
"learning_rate": 1.4788283536361036e-05,
"loss": 0.0841621994972229,
"step": 610,
"token_acc": 0.9685781618224666
},
{
"epoch": 0.7341622261693309,
"grad_norm": 1.6171875,
"learning_rate": 1.4619139023743916e-05,
"loss": 0.08564043045043945,
"step": 620,
"token_acc": 0.9654417513682565
},
{
"epoch": 0.7460035523978685,
"grad_norm": 1.203125,
"learning_rate": 1.4448297480911086e-05,
"loss": 0.09037463665008545,
"step": 630,
"token_acc": 0.963363081258807
},
{
"epoch": 0.7578448786264061,
"grad_norm": 0.9609375,
"learning_rate": 1.4275821673511903e-05,
"loss": 0.09671027660369873,
"step": 640,
"token_acc": 0.959305055564251
},
{
"epoch": 0.7696862048549438,
"grad_norm": 1.5078125,
"learning_rate": 1.4101774967609854e-05,
"loss": 0.09160791039466858,
"step": 650,
"token_acc": 0.9654741446648961
},
{
"epoch": 0.7815275310834814,
"grad_norm": 1.421875,
"learning_rate": 1.392622130640243e-05,
"loss": 0.095394766330719,
"step": 660,
"token_acc": 0.9619956208945887
},
{
"epoch": 0.7933688573120189,
"grad_norm": 1.78125,
"learning_rate": 1.3749225186728991e-05,
"loss": 0.08577767610549927,
"step": 670,
"token_acc": 0.966750313676286
},
{
"epoch": 0.8052101835405565,
"grad_norm": 2.0625,
"learning_rate": 1.357085163537517e-05,
"loss": 0.09209753274917602,
"step": 680,
"token_acc": 0.9620608899297424
},
{
"epoch": 0.8170515097690941,
"grad_norm": 2.390625,
"learning_rate": 1.3391166185182651e-05,
"loss": 0.0821334183216095,
"step": 690,
"token_acc": 0.9690383111806099
},
{
"epoch": 0.8288928359976317,
"grad_norm": 1.2109375,
"learning_rate": 1.3210234850972966e-05,
"loss": 0.09119898080825806,
"step": 700,
"token_acc": 0.9637817497648166
},
{
"epoch": 0.8407341622261694,
"grad_norm": 1.8671875,
"learning_rate": 1.3028124105294255e-05,
"loss": 0.0862145483493805,
"step": 710,
"token_acc": 0.9672259683236631
},
{
"epoch": 0.8525754884547069,
"grad_norm": 1.34375,
"learning_rate": 1.2844900853999847e-05,
"loss": 0.08162487745285034,
"step": 720,
"token_acc": 0.9676405906377631
},
{
"epoch": 0.8644168146832445,
"grad_norm": 1.7890625,
"learning_rate": 1.2660632411667648e-05,
"loss": 0.08193669319152833,
"step": 730,
"token_acc": 0.9653278945716975
},
{
"epoch": 0.8762581409117821,
"grad_norm": 1.6015625,
"learning_rate": 1.2475386476869364e-05,
"loss": 0.09078997969627381,
"step": 740,
"token_acc": 0.9639045825486503
},
{
"epoch": 0.8880994671403197,
"grad_norm": 1.671875,
"learning_rate": 1.2289231107298672e-05,
"loss": 0.09944761395454407,
"step": 750,
"token_acc": 0.9596546310832025
},
{
"epoch": 0.8999407933688574,
"grad_norm": 1.1171875,
"learning_rate": 1.2102234694767401e-05,
"loss": 0.0917394757270813,
"step": 760,
"token_acc": 0.9615505335844319
},
{
"epoch": 0.9117821195973949,
"grad_norm": 1.609375,
"learning_rate": 1.1914465940079036e-05,
"loss": 0.08656581044197083,
"step": 770,
"token_acc": 0.9671951028096061
},
{
"epoch": 0.9236234458259325,
"grad_norm": 1.28125,
"learning_rate": 1.1725993827788625e-05,
"loss": 0.08798307180404663,
"step": 780,
"token_acc": 0.9632065132299984
},
{
"epoch": 0.9354647720544701,
"grad_norm": 1.4765625,
"learning_rate": 1.1536887600858487e-05,
"loss": 0.08726394176483154,
"step": 790,
"token_acc": 0.9665934755332497
},
{
"epoch": 0.9473060982830077,
"grad_norm": 0.89453125,
"learning_rate": 1.134721673521897e-05,
"loss": 0.0808544933795929,
"step": 800,
"token_acc": 0.9646211646837821
},
{
"epoch": 0.9591474245115453,
"grad_norm": 1.3671875,
"learning_rate": 1.1157050914243614e-05,
"loss": 0.08560880422592163,
"step": 810,
"token_acc": 0.9667189952904238
},
{
"epoch": 0.9709887507400828,
"grad_norm": 1.3359375,
"learning_rate": 1.0966460003148115e-05,
"loss": 0.0828078031539917,
"step": 820,
"token_acc": 0.9668499607227022
},
{
"epoch": 0.9828300769686205,
"grad_norm": 1.921875,
"learning_rate": 1.0775514023322444e-05,
"loss": 0.09345529675483703,
"step": 830,
"token_acc": 0.9608886107634543
},
{
"epoch": 0.9946714031971581,
"grad_norm": 1.2578125,
"learning_rate": 1.058428312660566e-05,
"loss": 0.08514059782028198,
"step": 840,
"token_acc": 0.9657169693174703
},
{
"epoch": 1.0059206631142688,
"grad_norm": 1.015625,
"learning_rate": 1.0392837569512715e-05,
"loss": 0.08234425187110901,
"step": 850,
"token_acc": 0.9645318540931249
},
{
"epoch": 1.0177619893428065,
"grad_norm": 1.8359375,
"learning_rate": 1.020124768742286e-05,
"loss": 0.07545605897903443,
"step": 860,
"token_acc": 0.9709147771696638
},
{
"epoch": 1.029603315571344,
"grad_norm": 1.1640625,
"learning_rate": 1.0009583868739053e-05,
"loss": 0.07274842262268066,
"step": 870,
"token_acc": 0.9721873035826524
},
{
"epoch": 1.0414446417998815,
"grad_norm": 1.3359375,
"learning_rate": 9.817916529027898e-06,
"loss": 0.07491129636764526,
"step": 880,
"token_acc": 0.9713480507280413
},
{
"epoch": 1.0532859680284192,
"grad_norm": 1.15625,
"learning_rate": 9.626316085149588e-06,
"loss": 0.07744649052619934,
"step": 890,
"token_acc": 0.9709102283390679
},
{
"epoch": 1.0651272942569567,
"grad_norm": 1.1953125,
"learning_rate": 9.43485292938739e-06,
"loss": 0.07794994711875916,
"step": 900,
"token_acc": 0.970647931303669
},
{
"epoch": 1.0769686204854945,
"grad_norm": 1.0234375,
"learning_rate": 9.243597403586145e-06,
"loss": 0.0824435293674469,
"step": 910,
"token_acc": 0.9683633516053249
},
{
"epoch": 1.088809946714032,
"grad_norm": 0.91796875,
"learning_rate": 9.052619773309318e-06,
"loss": 0.07359167337417602,
"step": 920,
"token_acc": 0.9754111198120595
},
{
"epoch": 1.1006512729425695,
"grad_norm": 1.0625,
"learning_rate": 8.861990202024046e-06,
"loss": 0.07806094288825989,
"step": 930,
"token_acc": 0.9696922355881894
},
{
"epoch": 1.1124925991711072,
"grad_norm": 2.0,
"learning_rate": 8.67177872532372e-06,
"loss": 0.07662028670310975,
"step": 940,
"token_acc": 0.9707960433349034
},
{
"epoch": 1.1243339253996447,
"grad_norm": 1.5,
"learning_rate": 8.482055225197532e-06,
"loss": 0.07939339876174926,
"step": 950,
"token_acc": 0.9700156985871271
},
{
"epoch": 1.1361752516281824,
"grad_norm": 2.046875,
"learning_rate": 8.292889404356461e-06,
"loss": 0.07178534269332885,
"step": 960,
"token_acc": 0.9713704630788486
},
{
"epoch": 1.14801657785672,
"grad_norm": 1.453125,
"learning_rate": 8.104350760625122e-06,
"loss": 0.07578552961349487,
"step": 970,
"token_acc": 0.9700093720712277
},
{
"epoch": 1.1598579040852575,
"grad_norm": 1.3828125,
"learning_rate": 7.916508561408892e-06,
"loss": 0.07551709413528443,
"step": 980,
"token_acc": 0.9736513875896476
},
{
"epoch": 1.1716992303137952,
"grad_norm": 1.0625,
"learning_rate": 7.729431818245678e-06,
"loss": 0.06962672472000123,
"step": 990,
"token_acc": 0.9749726263100266
},
{
"epoch": 1.1835405565423327,
"grad_norm": 1.546875,
"learning_rate": 7.543189261451716e-06,
"loss": 0.07484488487243653,
"step": 1000,
"token_acc": 0.9705790297339593
},
{
"epoch": 1.1953818827708704,
"grad_norm": 1.328125,
"learning_rate": 7.35784931487064e-06,
"loss": 0.07622098922729492,
"step": 1010,
"token_acc": 0.970372680492749
},
{
"epoch": 1.207223208999408,
"grad_norm": 2.390625,
"learning_rate": 7.173480070735209e-06,
"loss": 0.07499848604202271,
"step": 1020,
"token_acc": 0.9686574146265399
},
{
"epoch": 1.2190645352279454,
"grad_norm": 1.2109375,
"learning_rate": 6.990149264650814e-06,
"loss": 0.07203071117401123,
"step": 1030,
"token_acc": 0.972574831531108
},
{
"epoch": 1.2309058614564832,
"grad_norm": 1.375,
"learning_rate": 6.807924250710019e-06,
"loss": 0.07002646923065185,
"step": 1040,
"token_acc": 0.9741379310344828
},
{
"epoch": 1.2427471876850207,
"grad_norm": 1.328125,
"learning_rate": 6.626871976747289e-06,
"loss": 0.07481561303138733,
"step": 1050,
"token_acc": 0.9709576138147566
},
{
"epoch": 1.2545885139135584,
"grad_norm": 1.2734375,
"learning_rate": 6.44705895974294e-06,
"loss": 0.06933027505874634,
"step": 1060,
"token_acc": 0.9734443746071653
},
{
"epoch": 1.266429840142096,
"grad_norm": 1.5625,
"learning_rate": 6.268551261385414e-06,
"loss": 0.0675657868385315,
"step": 1070,
"token_acc": 0.9746320075164422
},
{
"epoch": 1.2782711663706334,
"grad_norm": 1.5546875,
"learning_rate": 6.091414463800789e-06,
"loss": 0.07069060802459717,
"step": 1080,
"token_acc": 0.973655323819978
},
{
"epoch": 1.2901124925991712,
"grad_norm": 1.125,
"learning_rate": 5.915713645458514e-06,
"loss": 0.07225958108901978,
"step": 1090,
"token_acc": 0.9728201099764336
},
{
"epoch": 1.3019538188277087,
"grad_norm": 1.6171875,
"learning_rate": 5.741513357262147e-06,
"loss": 0.07490838170051575,
"step": 1100,
"token_acc": 0.970542149796302
},
{
"epoch": 1.3137951450562464,
"grad_norm": 1.3359375,
"learning_rate": 5.568877598833935e-06,
"loss": 0.07528679370880127,
"step": 1110,
"token_acc": 0.970496409615985
},
{
"epoch": 1.325636471284784,
"grad_norm": 1.453125,
"learning_rate": 5.3978697950019484e-06,
"loss": 0.07579593658447266,
"step": 1120,
"token_acc": 0.9716936625255543
},
{
"epoch": 1.3374777975133214,
"grad_norm": 1.6640625,
"learning_rate": 5.228552772498335e-06,
"loss": 0.06750929355621338,
"step": 1130,
"token_acc": 0.9741029641185648
},
{
"epoch": 1.3493191237418591,
"grad_norm": 1.6875,
"learning_rate": 5.060988736877366e-06,
"loss": 0.07841302156448364,
"step": 1140,
"token_acc": 0.9696400625978091
},
{
"epoch": 1.3611604499703966,
"grad_norm": 1.3671875,
"learning_rate": 4.895239249661662e-06,
"loss": 0.08451638221740723,
"step": 1150,
"token_acc": 0.967736883320282
},
{
"epoch": 1.3730017761989344,
"grad_norm": 1.0234375,
"learning_rate": 4.731365205725056e-06,
"loss": 0.074539315700531,
"step": 1160,
"token_acc": 0.9703715315880233
},
{
"epoch": 1.3848431024274719,
"grad_norm": 1.3359375,
"learning_rate": 4.569426810920347e-06,
"loss": 0.068775475025177,
"step": 1170,
"token_acc": 0.9716523101018011
},
{
"epoch": 1.3966844286560094,
"grad_norm": 1.2265625,
"learning_rate": 4.409483559960221e-06,
"loss": 0.07150940299034118,
"step": 1180,
"token_acc": 0.9737005913476502
},
{
"epoch": 1.4085257548845471,
"grad_norm": 1.890625,
"learning_rate": 4.251594214559416e-06,
"loss": 0.08267040252685547,
"step": 1190,
"token_acc": 0.9680350987151363
},
{
"epoch": 1.4203670811130846,
"grad_norm": 1.46875,
"learning_rate": 4.095816781846219e-06,
"loss": 0.0697063684463501,
"step": 1200,
"token_acc": 0.9751095804633688
},
{
"epoch": 1.4322084073416224,
"grad_norm": 1.203125,
"learning_rate": 3.942208493051137e-06,
"loss": 0.07361778020858764,
"step": 1210,
"token_acc": 0.9734901960784313
},
{
"epoch": 1.4440497335701599,
"grad_norm": 1.4609375,
"learning_rate": 3.7908257824806814e-06,
"loss": 0.07019197940826416,
"step": 1220,
"token_acc": 0.9710122218740207
},
{
"epoch": 1.4558910597986974,
"grad_norm": 1.5859375,
"learning_rate": 3.6417242667838917e-06,
"loss": 0.07444216012954712,
"step": 1230,
"token_acc": 0.9728040012503908
},
{
"epoch": 1.467732386027235,
"grad_norm": 1.4375,
"learning_rate": 3.4949587245192983e-06,
"loss": 0.06847925186157226,
"step": 1240,
"token_acc": 0.9746320075164422
},
{
"epoch": 1.4795737122557726,
"grad_norm": 1.625,
"learning_rate": 3.3505830760297543e-06,
"loss": 0.0696124255657196,
"step": 1250,
"token_acc": 0.9730534231552561
},
{
"epoch": 1.4914150384843103,
"grad_norm": 1.3203125,
"learning_rate": 3.2086503636325895e-06,
"loss": 0.07145707607269287,
"step": 1260,
"token_acc": 0.9749294891883422
},
{
"epoch": 1.5032563647128478,
"grad_norm": 1.8515625,
"learning_rate": 3.069212732132345e-06,
"loss": 0.07296675443649292,
"step": 1270,
"token_acc": 0.9725662329518734
},
{
"epoch": 1.5150976909413854,
"grad_norm": 2.625,
"learning_rate": 2.9323214096632335e-06,
"loss": 0.07637610435485839,
"step": 1280,
"token_acc": 0.9721566776781501
},
{
"epoch": 1.526939017169923,
"grad_norm": 1.4375,
"learning_rate": 2.798026688868386e-06,
"loss": 0.07028791308403015,
"step": 1290,
"token_acc": 0.9726801695713613
},
{
"epoch": 1.5387803433984606,
"grad_norm": 1.7578125,
"learning_rate": 2.6663779084227926e-06,
"loss": 0.0738570511341095,
"step": 1300,
"token_acc": 0.9717247879359096
},
{
"epoch": 1.5506216696269983,
"grad_norm": 2.046875,
"learning_rate": 2.5374234349066985e-06,
"loss": 0.07539566755294799,
"step": 1310,
"token_acc": 0.9680968096809681
},
{
"epoch": 1.5624629958555358,
"grad_norm": 1.09375,
"learning_rate": 2.411210645036173e-06,
"loss": 0.07291572093963623,
"step": 1320,
"token_acc": 0.972758405977584
},
{
"epoch": 1.5743043220840733,
"grad_norm": 1.6484375,
"learning_rate": 2.2877859082573194e-06,
"loss": 0.07078194618225098,
"step": 1330,
"token_acc": 0.9733229329173166
},
{
"epoch": 1.586145648312611,
"grad_norm": 1.53125,
"learning_rate": 2.16719456971057e-06,
"loss": 0.07727055549621582,
"step": 1340,
"token_acc": 0.9690154136520919
},
{
"epoch": 1.5979869745411486,
"grad_norm": 1.125,
"learning_rate": 2.0494809335712697e-06,
"loss": 0.06905415058135986,
"step": 1350,
"token_acc": 0.9750783699059561
},
{
"epoch": 1.6098283007696863,
"grad_norm": 1.8046875,
"learning_rate": 1.9346882467727323e-06,
"loss": 0.07434183359146118,
"step": 1360,
"token_acc": 0.9726091720143998
},
{
"epoch": 1.6216696269982238,
"grad_norm": 0.96875,
"learning_rate": 1.8228586831177032e-06,
"loss": 0.06618231534957886,
"step": 1370,
"token_acc": 0.9750900830330566
},
{
"epoch": 1.6335109532267613,
"grad_norm": 1.34375,
"learning_rate": 1.7140333277840837e-06,
"loss": 0.07258784770965576,
"step": 1380,
"token_acc": 0.9727699530516432
},
{
"epoch": 1.6453522794552988,
"grad_norm": 1.1875,
"learning_rate": 1.6082521622306003e-06,
"loss": 0.0752481460571289,
"step": 1390,
"token_acc": 0.9715364050951407
},
{
"epoch": 1.6571936056838366,
"grad_norm": 1.3984375,
"learning_rate": 1.5055540495079802e-06,
"loss": 0.06541621685028076,
"step": 1400,
"token_acc": 0.9767806714778788
},
{
"epoch": 1.6690349319123743,
"grad_norm": 1.90625,
"learning_rate": 1.4059767199810125e-06,
"loss": 0.0707894206047058,
"step": 1410,
"token_acc": 0.9731301068510371
},
{
"epoch": 1.6808762581409118,
"grad_norm": 1.3203125,
"learning_rate": 1.3095567574667589e-06,
"loss": 0.07458854913711548,
"step": 1420,
"token_acc": 0.9726630007855459
},
{
"epoch": 1.6927175843694493,
"grad_norm": 1.65625,
"learning_rate": 1.216329585793975e-06,
"loss": 0.06724110841751099,
"step": 1430,
"token_acc": 0.9734000938820216
},
{
"epoch": 1.7045589105979868,
"grad_norm": 1.46875,
"learning_rate": 1.1263294557887216e-06,
"loss": 0.07588486671447754,
"step": 1440,
"token_acc": 0.9710873664362036
},
{
"epoch": 1.7164002368265245,
"grad_norm": 2.046875,
"learning_rate": 1.0395894326909163e-06,
"loss": 0.07099611163139344,
"step": 1450,
"token_acc": 0.9723091364205256
},
{
"epoch": 1.7282415630550623,
"grad_norm": 1.921875,
"learning_rate": 9.561413840064637e-07,
"loss": 0.06974682807922364,
"step": 1460,
"token_acc": 0.9720609009574636
},
{
"epoch": 1.7400828892835998,
"grad_norm": 1.2578125,
"learning_rate": 8.760159677994174e-07,
"loss": 0.06880149841308594,
"step": 1470,
"token_acc": 0.9749019607843137
},
{
"epoch": 1.7519242155121373,
"grad_norm": 1.90625,
"learning_rate": 7.992426214284787e-07,
"loss": 0.07654795646667481,
"step": 1480,
"token_acc": 0.969967151572032
},
{
"epoch": 1.7637655417406748,
"grad_norm": 1.3671875,
"learning_rate": 7.258495507319885e-07,
"loss": 0.06865710020065308,
"step": 1490,
"token_acc": 0.9735068192506663
},
{
"epoch": 1.7756068679692125,
"grad_norm": 1.34375,
"learning_rate": 6.558637196653372e-07,
"loss": 0.06818960905075074,
"step": 1500,
"token_acc": 0.9739225484072455
},
{
"epoch": 1.7874481941977503,
"grad_norm": 1.7578125,
"learning_rate": 5.893108403946634e-07,
"loss": 0.07731307148933411,
"step": 1510,
"token_acc": 0.9705836332342357
},
{
"epoch": 1.7992895204262878,
"grad_norm": 1.1484375,
"learning_rate": 5.262153638504286e-07,
"loss": 0.07072955965995789,
"step": 1520,
"token_acc": 0.9747514596812372
},
{
"epoch": 1.8111308466548253,
"grad_norm": 1.40625,
"learning_rate": 4.6660047074436945e-07,
"loss": 0.07091631889343261,
"step": 1530,
"token_acc": 0.9746914544602406
},
{
"epoch": 1.8229721728833628,
"grad_norm": 1.6328125,
"learning_rate": 4.10488063053105e-07,
"loss": 0.062443327903747556,
"step": 1540,
"token_acc": 0.976577139287945
},
{
"epoch": 1.8348134991119005,
"grad_norm": 1.6015625,
"learning_rate": 3.57898755971553e-07,
"loss": 0.07588485479354859,
"step": 1550,
"token_acc": 0.973754100921731
},
{
"epoch": 1.8466548253404382,
"grad_norm": 1.4375,
"learning_rate": 3.088518703390908e-07,
"loss": 0.07371261715888977,
"step": 1560,
"token_acc": 0.9696590553644041
},
{
"epoch": 1.8584961515689757,
"grad_norm": 1.1484375,
"learning_rate": 2.633654255412554e-07,
"loss": 0.06826964616775513,
"step": 1570,
"token_acc": 0.9750783699059561
},
{
"epoch": 1.8703374777975132,
"grad_norm": 1.5078125,
"learning_rate": 2.214561328895748e-07,
"loss": 0.06952533721923829,
"step": 1580,
"token_acc": 0.9716478696741855
},
{
"epoch": 1.8821788040260508,
"grad_norm": 1.7890625,
"learning_rate": 1.8313938948198884e-07,
"loss": 0.07293472290039063,
"step": 1590,
"token_acc": 0.9714820009350164
},
{
"epoch": 1.8940201302545885,
"grad_norm": 1.625,
"learning_rate": 1.484292725460934e-07,
"loss": 0.07688854336738586,
"step": 1600,
"token_acc": 0.9702054257487847
},
{
"epoch": 1.9058614564831262,
"grad_norm": 1.3359375,
"learning_rate": 1.173385342672917e-07,
"loss": 0.07143334150314332,
"step": 1610,
"token_acc": 0.970491288651703
},
{
"epoch": 1.9177027827116637,
"grad_norm": 1.921875,
"learning_rate": 8.987859710375524e-08,
"loss": 0.081912100315094,
"step": 1620,
"token_acc": 0.9685150375939849
},
{
"epoch": 1.9295441089402012,
"grad_norm": 1.8203125,
"learning_rate": 6.605954958991523e-08,
"loss": 0.07874792218208312,
"step": 1630,
"token_acc": 0.9696588586700204
},
{
"epoch": 1.9413854351687387,
"grad_norm": 1.5546875,
"learning_rate": 4.5890142630027336e-08,
"loss": 0.0735186517238617,
"step": 1640,
"token_acc": 0.9709894934922377
},
{
"epoch": 1.9532267613972765,
"grad_norm": 1.9765625,
"learning_rate": 2.9377786283167897e-08,
"loss": 0.0773587942123413,
"step": 1650,
"token_acc": 0.9692741809060982
},
{
"epoch": 1.9650680876258142,
"grad_norm": 1.796875,
"learning_rate": 1.6528547040842724e-08,
"loss": 0.06999446153640747,
"step": 1660,
"token_acc": 0.9743669896842764
},
{
"epoch": 1.9769094138543517,
"grad_norm": 1.5859375,
"learning_rate": 7.3471455982143665e-09,
"loss": 0.07299281358718872,
"step": 1670,
"token_acc": 0.9729179711959924
},
{
"epoch": 1.9887507400828892,
"grad_norm": 1.34375,
"learning_rate": 1.8369551197594538e-09,
"loss": 0.067216557264328,
"step": 1680,
"token_acc": 0.9730407523510972
},
{
"epoch": 2.0,
"grad_norm": 2.65625,
"learning_rate": 0.0,
"loss": 0.07405292987823486,
"step": 1690,
"token_acc": 0.9716838024608124
}
],
"logging_steps": 10,
"max_steps": 1690,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7276889282044232e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}