{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995759717314487, "eval_steps": 200, "global_step": 884, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011307420494699647, "grad_norm": 8.3125, "learning_rate": 3.3707865168539325e-07, "loss": 0.493535578250885, "step": 1, "token_acc": 0.8620689655172413 }, { "epoch": 0.005653710247349823, "grad_norm": 10.0625, "learning_rate": 1.6853932584269663e-06, "loss": 0.4733062982559204, "step": 5, "token_acc": 0.8886255924170616 }, { "epoch": 0.011307420494699646, "grad_norm": 7.8125, "learning_rate": 3.3707865168539327e-06, "loss": 0.575874137878418, "step": 10, "token_acc": 0.8524590163934426 }, { "epoch": 0.01696113074204947, "grad_norm": 10.625, "learning_rate": 5.056179775280899e-06, "loss": 0.5115116596221924, "step": 15, "token_acc": 0.8833652007648184 }, { "epoch": 0.022614840989399292, "grad_norm": 9.1875, "learning_rate": 6.741573033707865e-06, "loss": 0.5164161682128906, "step": 20, "token_acc": 0.858195211786372 }, { "epoch": 0.028268551236749116, "grad_norm": 7.28125, "learning_rate": 8.426966292134832e-06, "loss": 0.500755786895752, "step": 25, "token_acc": 0.8675623800383877 }, { "epoch": 0.03392226148409894, "grad_norm": 11.4375, "learning_rate": 1.0112359550561798e-05, "loss": 0.45065789222717284, "step": 30, "token_acc": 0.8818011257035647 }, { "epoch": 0.039575971731448764, "grad_norm": 9.375, "learning_rate": 1.1797752808988765e-05, "loss": 0.5452223300933838, "step": 35, "token_acc": 0.8604651162790697 }, { "epoch": 0.045229681978798585, "grad_norm": 12.0, "learning_rate": 1.348314606741573e-05, "loss": 0.46700444221496584, "step": 40, "token_acc": 0.8830188679245283 }, { "epoch": 0.05088339222614841, "grad_norm": 8.4375, "learning_rate": 1.5168539325842698e-05, "loss": 0.4567443370819092, "step": 45, "token_acc": 0.8964879852125693 }, { "epoch": 0.05653710247349823, "grad_norm": 13.5, "learning_rate": 1.6853932584269665e-05, "loss": 0.5110898494720459, "step": 50, "token_acc": 0.864406779661017 }, { "epoch": 0.06219081272084806, "grad_norm": 5.0, "learning_rate": 1.853932584269663e-05, "loss": 0.4209805965423584, "step": 55, "token_acc": 0.8962264150943396 }, { "epoch": 0.06784452296819787, "grad_norm": 8.125, "learning_rate": 2.0224719101123596e-05, "loss": 0.4727331638336182, "step": 60, "token_acc": 0.891941391941392 }, { "epoch": 0.0734982332155477, "grad_norm": 11.125, "learning_rate": 2.1910112359550563e-05, "loss": 0.390924072265625, "step": 65, "token_acc": 0.8983364140480592 }, { "epoch": 0.07915194346289753, "grad_norm": 7.09375, "learning_rate": 2.359550561797753e-05, "loss": 0.3485325813293457, "step": 70, "token_acc": 0.9133709981167608 }, { "epoch": 0.08480565371024736, "grad_norm": 9.875, "learning_rate": 2.5280898876404494e-05, "loss": 0.4826796531677246, "step": 75, "token_acc": 0.8802946593001841 }, { "epoch": 0.09045936395759717, "grad_norm": 8.5, "learning_rate": 2.696629213483146e-05, "loss": 0.4428101539611816, "step": 80, "token_acc": 0.8783269961977186 }, { "epoch": 0.096113074204947, "grad_norm": 9.375, "learning_rate": 2.865168539325843e-05, "loss": 0.48508028984069823, "step": 85, "token_acc": 0.886654478976234 }, { "epoch": 0.10176678445229682, "grad_norm": 13.5, "learning_rate": 2.999989459318379e-05, "loss": 0.4538856506347656, "step": 90, "token_acc": 0.9044943820224719 }, { "epoch": 0.10742049469964664, "grad_norm": 8.4375, "learning_rate": 2.999620552744515e-05, "loss": 0.4510812282562256, "step": 95, "token_acc": 0.9045801526717557 }, { "epoch": 0.11307420494699646, "grad_norm": 16.5, "learning_rate": 2.998724776679495e-05, "loss": 0.49095354080200193, "step": 100, "token_acc": 0.8860294117647058 }, { "epoch": 0.11872791519434629, "grad_norm": 9.875, "learning_rate": 2.997302480819445e-05, "loss": 0.41774497032165525, "step": 105, "token_acc": 0.8880597014925373 }, { "epoch": 0.12438162544169612, "grad_norm": 7.15625, "learning_rate": 2.9953542204050917e-05, "loss": 0.30308961868286133, "step": 110, "token_acc": 0.9306049822064056 }, { "epoch": 0.13003533568904593, "grad_norm": 16.0, "learning_rate": 2.9928807560050043e-05, "loss": 0.5130989074707031, "step": 115, "token_acc": 0.8803738317757009 }, { "epoch": 0.13568904593639575, "grad_norm": 6.84375, "learning_rate": 2.9898830532186824e-05, "loss": 0.4366453647613525, "step": 120, "token_acc": 0.886654478976234 }, { "epoch": 0.1413427561837456, "grad_norm": 10.875, "learning_rate": 2.9863622822996006e-05, "loss": 0.5397056102752685, "step": 125, "token_acc": 0.8822429906542056 }, { "epoch": 0.1469964664310954, "grad_norm": 12.5, "learning_rate": 2.982319817698363e-05, "loss": 0.5331222534179687, "step": 130, "token_acc": 0.8604651162790697 }, { "epoch": 0.15265017667844524, "grad_norm": 12.1875, "learning_rate": 2.977757237526136e-05, "loss": 0.343440842628479, "step": 135, "token_acc": 0.9131238447319778 }, { "epoch": 0.15830388692579506, "grad_norm": 12.625, "learning_rate": 2.9726763229385863e-05, "loss": 0.47114005088806155, "step": 140, "token_acc": 0.886654478976234 }, { "epoch": 0.16395759717314487, "grad_norm": 8.625, "learning_rate": 2.9670790574405432e-05, "loss": 0.4388119697570801, "step": 145, "token_acc": 0.8894927536231884 }, { "epoch": 0.1696113074204947, "grad_norm": 6.6875, "learning_rate": 2.9609676261116703e-05, "loss": 0.558556079864502, "step": 150, "token_acc": 0.8729582577132486 }, { "epoch": 0.17526501766784452, "grad_norm": 10.375, "learning_rate": 2.9543444147534497e-05, "loss": 0.4793074131011963, "step": 155, "token_acc": 0.8905660377358491 }, { "epoch": 0.18091872791519434, "grad_norm": 8.3125, "learning_rate": 2.947212008957803e-05, "loss": 0.48024735450744627, "step": 160, "token_acc": 0.8924528301886793 }, { "epoch": 0.18657243816254418, "grad_norm": 19.875, "learning_rate": 2.9395731930977187e-05, "loss": 0.36183197498321534, "step": 165, "token_acc": 0.9111969111969112 }, { "epoch": 0.192226148409894, "grad_norm": 16.5, "learning_rate": 2.9314309492402806e-05, "loss": 0.5196819305419922, "step": 170, "token_acc": 0.8822393822393823 }, { "epoch": 0.1978798586572438, "grad_norm": 11.5625, "learning_rate": 2.922788455982516e-05, "loss": 0.46062512397766114, "step": 175, "token_acc": 0.8813559322033898 }, { "epoch": 0.20353356890459365, "grad_norm": 4.65625, "learning_rate": 2.9136490872105272e-05, "loss": 0.35768780708312986, "step": 180, "token_acc": 0.9044943820224719 }, { "epoch": 0.20918727915194346, "grad_norm": 9.5625, "learning_rate": 2.904016410782379e-05, "loss": 0.49846739768981935, "step": 185, "token_acc": 0.875 }, { "epoch": 0.21484098939929328, "grad_norm": 9.625, "learning_rate": 2.8938941871352683e-05, "loss": 0.5851227760314941, "step": 190, "token_acc": 0.8588007736943907 }, { "epoch": 0.22049469964664312, "grad_norm": 14.3125, "learning_rate": 2.883286367817511e-05, "loss": 0.4418447017669678, "step": 195, "token_acc": 0.8899253731343284 }, { "epoch": 0.22614840989399293, "grad_norm": 11.4375, "learning_rate": 2.872197093945924e-05, "loss": 0.7016141414642334, "step": 200, "token_acc": 0.8148148148148148 }, { "epoch": 0.23180212014134274, "grad_norm": 17.875, "learning_rate": 2.860630694589199e-05, "loss": 0.3626969337463379, "step": 205, "token_acc": 0.9100917431192661 }, { "epoch": 0.23745583038869258, "grad_norm": 12.9375, "learning_rate": 2.8485916850779088e-05, "loss": 0.4723252296447754, "step": 210, "token_acc": 0.8835978835978836 }, { "epoch": 0.2431095406360424, "grad_norm": 11.4375, "learning_rate": 2.8360847652417973e-05, "loss": 0.5085994720458984, "step": 215, "token_acc": 0.893048128342246 }, { "epoch": 0.24876325088339224, "grad_norm": 9.9375, "learning_rate": 2.82311481757504e-05, "loss": 0.4896233081817627, "step": 220, "token_acc": 0.8727272727272727 }, { "epoch": 0.254416961130742, "grad_norm": 11.375, "learning_rate": 2.8096869053302046e-05, "loss": 0.4948256492614746, "step": 225, "token_acc": 0.8743068391866913 }, { "epoch": 0.26007067137809187, "grad_norm": 8.5625, "learning_rate": 2.7958062705416376e-05, "loss": 0.4669034481048584, "step": 230, "token_acc": 0.8822463768115942 }, { "epoch": 0.2657243816254417, "grad_norm": 9.8125, "learning_rate": 2.7814783319790595e-05, "loss": 0.4465358734130859, "step": 235, "token_acc": 0.8836772983114447 }, { "epoch": 0.2713780918727915, "grad_norm": 10.9375, "learning_rate": 2.766708683032173e-05, "loss": 0.4827105522155762, "step": 240, "token_acc": 0.8766859344894027 }, { "epoch": 0.27703180212014133, "grad_norm": 9.4375, "learning_rate": 2.75150308952709e-05, "loss": 0.40639634132385255, "step": 245, "token_acc": 0.9010791366906474 }, { "epoch": 0.2826855123674912, "grad_norm": 11.4375, "learning_rate": 2.735867487475452e-05, "loss": 0.5378179550170898, "step": 250, "token_acc": 0.874766355140187 }, { "epoch": 0.28833922261484096, "grad_norm": 19.125, "learning_rate": 2.7198079807571094e-05, "loss": 0.42697606086730955, "step": 255, "token_acc": 0.8934579439252337 }, { "epoch": 0.2939929328621908, "grad_norm": 7.625, "learning_rate": 2.7033308387372666e-05, "loss": 0.45955357551574705, "step": 260, "token_acc": 0.888045540796964 }, { "epoch": 0.29964664310954064, "grad_norm": 8.875, "learning_rate": 2.6864424938190263e-05, "loss": 0.51542067527771, "step": 265, "token_acc": 0.8812260536398467 }, { "epoch": 0.3053003533568905, "grad_norm": 10.25, "learning_rate": 2.6691495389322878e-05, "loss": 0.49557199478149416, "step": 270, "token_acc": 0.8658536585365854 }, { "epoch": 0.31095406360424027, "grad_norm": 7.125, "learning_rate": 2.651458724959973e-05, "loss": 0.4124931812286377, "step": 275, "token_acc": 0.8986615678776291 }, { "epoch": 0.3166077738515901, "grad_norm": 8.9375, "learning_rate": 2.633376958102597e-05, "loss": 0.6442465782165527, "step": 280, "token_acc": 0.8523364485981308 }, { "epoch": 0.32226148409893995, "grad_norm": 9.875, "learning_rate": 2.614911297182199e-05, "loss": 0.43247137069702146, "step": 285, "token_acc": 0.8936567164179104 }, { "epoch": 0.32791519434628974, "grad_norm": 8.1875, "learning_rate": 2.596068950886699e-05, "loss": 0.40369553565979005, "step": 290, "token_acc": 0.9065934065934066 }, { "epoch": 0.3335689045936396, "grad_norm": 6.75, "learning_rate": 2.5768572749557398e-05, "loss": 0.35304784774780273, "step": 295, "token_acc": 0.9064885496183206 }, { "epoch": 0.3392226148409894, "grad_norm": 10.625, "learning_rate": 2.5572837693091338e-05, "loss": 0.42280235290527346, "step": 300, "token_acc": 0.8916518650088809 }, { "epoch": 0.3448763250883392, "grad_norm": 15.3125, "learning_rate": 2.5373560751190164e-05, "loss": 0.41590089797973634, "step": 305, "token_acc": 0.8983050847457628 }, { "epoch": 0.35053003533568905, "grad_norm": 15.5625, "learning_rate": 2.517081971826858e-05, "loss": 0.5931228637695313, "step": 310, "token_acc": 0.8493408662900188 }, { "epoch": 0.3561837455830389, "grad_norm": 16.0, "learning_rate": 2.4964693741065e-05, "loss": 0.3908259630203247, "step": 315, "token_acc": 0.8956043956043956 }, { "epoch": 0.3618374558303887, "grad_norm": 6.9375, "learning_rate": 2.4755263287743982e-05, "loss": 0.533808708190918, "step": 320, "token_acc": 0.8736842105263158 }, { "epoch": 0.3674911660777385, "grad_norm": 11.75, "learning_rate": 2.4542610116482777e-05, "loss": 0.43399462699890134, "step": 325, "token_acc": 0.8923076923076924 }, { "epoch": 0.37314487632508836, "grad_norm": 16.375, "learning_rate": 2.43268172435543e-05, "loss": 0.5363679885864258, "step": 330, "token_acc": 0.8554006968641115 }, { "epoch": 0.37879858657243815, "grad_norm": 6.46875, "learning_rate": 2.4107968910918943e-05, "loss": 0.4685643196105957, "step": 335, "token_acc": 0.8926553672316384 }, { "epoch": 0.384452296819788, "grad_norm": 11.25, "learning_rate": 2.3886150553337925e-05, "loss": 0.4552040100097656, "step": 340, "token_acc": 0.8849557522123894 }, { "epoch": 0.3901060070671378, "grad_norm": 9.375, "learning_rate": 2.366144876502097e-05, "loss": 0.41571660041809083, "step": 345, "token_acc": 0.8919925512104283 }, { "epoch": 0.3957597173144876, "grad_norm": 16.75, "learning_rate": 2.3433951265821347e-05, "loss": 0.48478131294250487, "step": 350, "token_acc": 0.8825688073394495 }, { "epoch": 0.40141342756183745, "grad_norm": 12.875, "learning_rate": 2.320374686699154e-05, "loss": 0.48756847381591795, "step": 355, "token_acc": 0.8759398496240601 }, { "epoch": 0.4070671378091873, "grad_norm": 10.25, "learning_rate": 2.2970925436512743e-05, "loss": 0.5582265853881836, "step": 360, "token_acc": 0.8727272727272727 }, { "epoch": 0.4127208480565371, "grad_norm": 7.125, "learning_rate": 2.2735577864011946e-05, "loss": 0.40789146423339845, "step": 365, "token_acc": 0.903954802259887 }, { "epoch": 0.4183745583038869, "grad_norm": 9.0625, "learning_rate": 2.2497796025280097e-05, "loss": 0.4335779666900635, "step": 370, "token_acc": 0.8895027624309392 }, { "epoch": 0.42402826855123676, "grad_norm": 8.6875, "learning_rate": 2.2257672746405337e-05, "loss": 0.5792682647705079, "step": 375, "token_acc": 0.8518518518518519 }, { "epoch": 0.42968197879858655, "grad_norm": 9.125, "learning_rate": 2.201530176753521e-05, "loss": 0.5717463016510009, "step": 380, "token_acc": 0.8565965583173997 }, { "epoch": 0.4353356890459364, "grad_norm": 6.9375, "learning_rate": 2.17707777062821e-05, "loss": 0.3671257972717285, "step": 385, "token_acc": 0.9074410163339383 }, { "epoch": 0.44098939929328623, "grad_norm": 12.125, "learning_rate": 2.1524196020786038e-05, "loss": 0.5280078887939453, "step": 390, "token_acc": 0.8771929824561403 }, { "epoch": 0.446643109540636, "grad_norm": 21.25, "learning_rate": 2.127565297244947e-05, "loss": 0.4925088882446289, "step": 395, "token_acc": 0.8908765652951699 }, { "epoch": 0.45229681978798586, "grad_norm": 10.3125, "learning_rate": 2.1025245588358365e-05, "loss": 0.4084740161895752, "step": 400, "token_acc": 0.8884758364312267 }, { "epoch": 0.4579505300353357, "grad_norm": 11.1875, "learning_rate": 2.0773071623404486e-05, "loss": 0.6456653594970703, "step": 405, "token_acc": 0.8585461689587426 }, { "epoch": 0.4636042402826855, "grad_norm": 6.6875, "learning_rate": 2.0519229522123453e-05, "loss": 0.6197998046875, "step": 410, "token_acc": 0.8672727272727273 }, { "epoch": 0.46925795053003533, "grad_norm": 8.8125, "learning_rate": 2.026381838026368e-05, "loss": 0.4290182113647461, "step": 415, "token_acc": 0.8931860036832413 }, { "epoch": 0.47491166077738517, "grad_norm": 9.9375, "learning_rate": 2.0006937906100998e-05, "loss": 0.4530322551727295, "step": 420, "token_acc": 0.9066901408450704 }, { "epoch": 0.48056537102473496, "grad_norm": 14.5625, "learning_rate": 1.9748688381514224e-05, "loss": 0.4739545345306396, "step": 425, "token_acc": 0.8712121212121212 }, { "epoch": 0.4862190812720848, "grad_norm": 14.1875, "learning_rate": 1.9489170622836754e-05, "loss": 0.5166975498199463, "step": 430, "token_acc": 0.8643122676579925 }, { "epoch": 0.49187279151943464, "grad_norm": 7.25, "learning_rate": 1.922848594149955e-05, "loss": 0.41360926628112793, "step": 435, "token_acc": 0.8875968992248062 }, { "epoch": 0.4975265017667845, "grad_norm": 8.1875, "learning_rate": 1.896673610448085e-05, "loss": 0.3576143741607666, "step": 440, "token_acc": 0.9126559714795008 }, { "epoch": 0.5031802120141343, "grad_norm": 9.8125, "learning_rate": 1.8704023294578e-05, "loss": 0.4304816246032715, "step": 445, "token_acc": 0.8942486085343229 }, { "epoch": 0.508833922261484, "grad_norm": 9.3125, "learning_rate": 1.8440450070517e-05, "loss": 0.40277585983276365, "step": 450, "token_acc": 0.8975791433891993 }, { "epoch": 0.5144876325088339, "grad_norm": 11.25, "learning_rate": 1.817611932691528e-05, "loss": 0.4203328609466553, "step": 455, "token_acc": 0.8958333333333334 }, { "epoch": 0.5201413427561837, "grad_norm": 15.375, "learning_rate": 1.791113425411332e-05, "loss": 0.4957026481628418, "step": 460, "token_acc": 0.869811320754717 }, { "epoch": 0.5257950530035336, "grad_norm": 12.5, "learning_rate": 1.7645598297890914e-05, "loss": 0.38120887279510496, "step": 465, "token_acc": 0.8954372623574145 }, { "epoch": 0.5314487632508834, "grad_norm": 12.5625, "learning_rate": 1.7379615119083562e-05, "loss": 0.46092791557312013, "step": 470, "token_acc": 0.9040590405904059 }, { "epoch": 0.5371024734982333, "grad_norm": 8.875, "learning_rate": 1.7113288553115094e-05, "loss": 0.35474748611450196, "step": 475, "token_acc": 0.9045045045045045 }, { "epoch": 0.542756183745583, "grad_norm": 11.5, "learning_rate": 1.6846722569461957e-05, "loss": 0.6311816215515137, "step": 480, "token_acc": 0.864376130198915 }, { "epoch": 0.5484098939929328, "grad_norm": 7.46875, "learning_rate": 1.658002123106531e-05, "loss": 0.4077010154724121, "step": 485, "token_acc": 0.8971428571428571 }, { "epoch": 0.5540636042402827, "grad_norm": 9.25, "learning_rate": 1.6313288653706577e-05, "loss": 0.5004054546356201, "step": 490, "token_acc": 0.8745173745173745 }, { "epoch": 0.5597173144876325, "grad_norm": 10.875, "learning_rate": 1.6046628965362325e-05, "loss": 0.49433560371398927, "step": 495, "token_acc": 0.8756660746003553 }, { "epoch": 0.5653710247349824, "grad_norm": 13.625, "learning_rate": 1.5780146265554462e-05, "loss": 0.5154177188873291, "step": 500, "token_acc": 0.8778359511343804 }, { "epoch": 0.5710247349823322, "grad_norm": 16.0, "learning_rate": 1.5513944584711537e-05, "loss": 0.5768596172332764, "step": 505, "token_acc": 0.8682170542635659 }, { "epoch": 0.5766784452296819, "grad_norm": 11.5, "learning_rate": 1.5248127843556906e-05, "loss": 0.542631196975708, "step": 510, "token_acc": 0.8648148148148148 }, { "epoch": 0.5823321554770318, "grad_norm": 9.3125, "learning_rate": 1.4982799812539898e-05, "loss": 0.44904112815856934, "step": 515, "token_acc": 0.8768656716417911 }, { "epoch": 0.5879858657243816, "grad_norm": 12.0, "learning_rate": 1.471806407132547e-05, "loss": 0.4998485088348389, "step": 520, "token_acc": 0.8848920863309353 }, { "epoch": 0.5936395759717314, "grad_norm": 14.9375, "learning_rate": 1.445402396835848e-05, "loss": 0.4888237476348877, "step": 525, "token_acc": 0.8878676470588235 }, { "epoch": 0.5992932862190813, "grad_norm": 11.875, "learning_rate": 1.4190782580518134e-05, "loss": 0.4358950614929199, "step": 530, "token_acc": 0.8945454545454545 }, { "epoch": 0.6049469964664311, "grad_norm": 13.4375, "learning_rate": 1.3928442672878498e-05, "loss": 0.3919216632843018, "step": 535, "token_acc": 0.9005424954792043 }, { "epoch": 0.610600706713781, "grad_norm": 14.625, "learning_rate": 1.3667106658590713e-05, "loss": 0.4299191474914551, "step": 540, "token_acc": 0.8834586466165414 }, { "epoch": 0.6162544169611307, "grad_norm": 11.0, "learning_rate": 1.3406876558902596e-05, "loss": 0.4752546787261963, "step": 545, "token_acc": 0.8946395563770795 }, { "epoch": 0.6219081272084805, "grad_norm": 8.0, "learning_rate": 1.3147853963331226e-05, "loss": 0.462324857711792, "step": 550, "token_acc": 0.8884892086330936 }, { "epoch": 0.6275618374558304, "grad_norm": 14.125, "learning_rate": 1.2890139990004112e-05, "loss": 0.4813478946685791, "step": 555, "token_acc": 0.8781818181818182 }, { "epoch": 0.6332155477031802, "grad_norm": 10.375, "learning_rate": 1.2633835246184317e-05, "loss": 0.4766115188598633, "step": 560, "token_acc": 0.8956692913385826 }, { "epoch": 0.6388692579505301, "grad_norm": 13.75, "learning_rate": 1.2379039788995068e-05, "loss": 0.555994701385498, "step": 565, "token_acc": 0.8626692456479691 }, { "epoch": 0.6445229681978799, "grad_norm": 11.4375, "learning_rate": 1.2125853086359117e-05, "loss": 0.554969596862793, "step": 570, "token_acc": 0.864406779661017 }, { "epoch": 0.6501766784452296, "grad_norm": 9.375, "learning_rate": 1.1874373978168092e-05, "loss": 0.48480896949768065, "step": 575, "token_acc": 0.8574144486692015 }, { "epoch": 0.6558303886925795, "grad_norm": 10.75, "learning_rate": 1.1624700637697078e-05, "loss": 0.6284814357757569, "step": 580, "token_acc": 0.8512241054613936 }, { "epoch": 0.6614840989399293, "grad_norm": 10.1875, "learning_rate": 1.1376930533279357e-05, "loss": 0.38442087173461914, "step": 585, "token_acc": 0.9003690036900369 }, { "epoch": 0.6671378091872792, "grad_norm": 10.3125, "learning_rate": 1.1131160390256417e-05, "loss": 0.5038439750671386, "step": 590, "token_acc": 0.8627819548872181 }, { "epoch": 0.672791519434629, "grad_norm": 14.3125, "learning_rate": 1.0887486153217962e-05, "loss": 0.45342187881469725, "step": 595, "token_acc": 0.8917910447761194 }, { "epoch": 0.6784452296819788, "grad_norm": 10.3125, "learning_rate": 1.064600294854675e-05, "loss": 0.5414403915405274, "step": 600, "token_acc": 0.8771266540642723 }, { "epoch": 0.6840989399293286, "grad_norm": 12.1875, "learning_rate": 1.0406805047282826e-05, "loss": 0.44243249893188474, "step": 605, "token_acc": 0.8971428571428571 }, { "epoch": 0.6897526501766784, "grad_norm": 9.4375, "learning_rate": 1.0169985828321664e-05, "loss": 0.4240866661071777, "step": 610, "token_acc": 0.900562851782364 }, { "epoch": 0.6954063604240283, "grad_norm": 17.0, "learning_rate": 9.935637741960595e-06, "loss": 0.5268006324768066, "step": 615, "token_acc": 0.8721804511278195 }, { "epoch": 0.7010600706713781, "grad_norm": 13.75, "learning_rate": 9.703852273807745e-06, "loss": 0.5309527397155762, "step": 620, "token_acc": 0.864963503649635 }, { "epoch": 0.7067137809187279, "grad_norm": 10.625, "learning_rate": 9.474719909067592e-06, "loss": 0.46470232009887696, "step": 625, "token_acc": 0.8984674329501916 }, { "epoch": 0.7123674911660778, "grad_norm": 10.75, "learning_rate": 9.248330097216998e-06, "loss": 0.3585221290588379, "step": 630, "token_acc": 0.9072356215213359 }, { "epoch": 0.7180212014134275, "grad_norm": 10.8125, "learning_rate": 9.024771217085648e-06, "loss": 0.45208401679992677, "step": 635, "token_acc": 0.9046728971962616 }, { "epoch": 0.7236749116607774, "grad_norm": 9.0, "learning_rate": 8.804130542354423e-06, "loss": 0.40645594596862794, "step": 640, "token_acc": 0.8970588235294118 }, { "epoch": 0.7293286219081272, "grad_norm": 11.1875, "learning_rate": 8.586494207485173e-06, "loss": 0.5020310878753662, "step": 645, "token_acc": 0.8729582577132486 }, { "epoch": 0.734982332155477, "grad_norm": 26.25, "learning_rate": 8.371947174095276e-06, "loss": 0.5611002445220947, "step": 650, "token_acc": 0.8490566037735849 }, { "epoch": 0.7406360424028269, "grad_norm": 14.8125, "learning_rate": 8.160573197790034e-06, "loss": 0.504447078704834, "step": 655, "token_acc": 0.8843416370106761 }, { "epoch": 0.7462897526501767, "grad_norm": 18.375, "learning_rate": 7.952454795465847e-06, "loss": 0.41913480758666993, "step": 660, "token_acc": 0.8958333333333334 }, { "epoch": 0.7519434628975264, "grad_norm": 8.75, "learning_rate": 7.747673213097013e-06, "loss": 0.4940896511077881, "step": 665, "token_acc": 0.8772893772893773 }, { "epoch": 0.7575971731448763, "grad_norm": 13.5625, "learning_rate": 7.5463083940186235e-06, "loss": 0.47562193870544434, "step": 670, "token_acc": 0.8994614003590664 }, { "epoch": 0.7632508833922261, "grad_norm": 10.0625, "learning_rate": 7.3484389477180245e-06, "loss": 0.48333349227905276, "step": 675, "token_acc": 0.8953488372093024 }, { "epoch": 0.768904593639576, "grad_norm": 8.5, "learning_rate": 7.154142119146981e-06, "loss": 0.41202802658081056, "step": 680, "token_acc": 0.9003831417624522 }, { "epoch": 0.7745583038869258, "grad_norm": 12.9375, "learning_rate": 6.9634937585665066e-06, "loss": 0.47983555793762206, "step": 685, "token_acc": 0.8998211091234347 }, { "epoch": 0.7802120141342757, "grad_norm": 8.6875, "learning_rate": 6.776568291936193e-06, "loss": 0.4969668388366699, "step": 690, "token_acc": 0.8790786948176583 }, { "epoch": 0.7858657243816255, "grad_norm": 10.375, "learning_rate": 6.593438691859566e-06, "loss": 0.44586987495422364, "step": 695, "token_acc": 0.884469696969697 }, { "epoch": 0.7915194346289752, "grad_norm": 11.875, "learning_rate": 6.414176449096749e-06, "loss": 0.5935549736022949, "step": 700, "token_acc": 0.8718929254302104 }, { "epoch": 0.7971731448763251, "grad_norm": 16.75, "learning_rate": 6.238851544655688e-06, "loss": 0.5915599822998047, "step": 705, "token_acc": 0.8622641509433963 }, { "epoch": 0.8028268551236749, "grad_norm": 13.4375, "learning_rate": 6.067532422472728e-06, "loss": 0.45562114715576174, "step": 710, "token_acc": 0.8954372623574145 }, { "epoch": 0.8084805653710248, "grad_norm": 9.5625, "learning_rate": 5.9002859626932115e-06, "loss": 0.44912257194519045, "step": 715, "token_acc": 0.8789571694599627 }, { "epoch": 0.8141342756183746, "grad_norm": 12.6875, "learning_rate": 5.7371774555625925e-06, "loss": 0.4588914394378662, "step": 720, "token_acc": 0.8875638841567292 }, { "epoch": 0.8197879858657244, "grad_norm": 10.5625, "learning_rate": 5.578270575938212e-06, "loss": 0.42406349182128905, "step": 725, "token_acc": 0.9009009009009009 }, { "epoch": 0.8254416961130742, "grad_norm": 13.875, "learning_rate": 5.423627358431671e-06, "loss": 0.49872541427612305, "step": 730, "token_acc": 0.8799249530956847 }, { "epoch": 0.831095406360424, "grad_norm": 17.0, "learning_rate": 5.273308173191575e-06, "loss": 0.48893170356750487, "step": 735, "token_acc": 0.8968105065666041 }, { "epoch": 0.8367491166077738, "grad_norm": 10.3125, "learning_rate": 5.127371702336002e-06, "loss": 0.5636299133300782, "step": 740, "token_acc": 0.860236220472441 }, { "epoch": 0.8424028268551237, "grad_norm": 12.0, "learning_rate": 4.985874917043985e-06, "loss": 0.41251296997070314, "step": 745, "token_acc": 0.8983364140480592 }, { "epoch": 0.8480565371024735, "grad_norm": 12.3125, "learning_rate": 4.848873055314914e-06, "loss": 0.5803286552429199, "step": 750, "token_acc": 0.8612612612612612 }, { "epoch": 0.8537102473498234, "grad_norm": 9.75, "learning_rate": 4.7164196004045305e-06, "loss": 0.5717785835266114, "step": 755, "token_acc": 0.8810408921933085 }, { "epoch": 0.8593639575971731, "grad_norm": 8.9375, "learning_rate": 4.588566259945948e-06, "loss": 0.545508623123169, "step": 760, "token_acc": 0.8704761904761905 }, { "epoch": 0.8650176678445229, "grad_norm": 11.5, "learning_rate": 4.465362945763868e-06, "loss": 0.40853538513183596, "step": 765, "token_acc": 0.9005424954792043 }, { "epoch": 0.8706713780918728, "grad_norm": 15.1875, "learning_rate": 4.3468577543898026e-06, "loss": 0.469269323348999, "step": 770, "token_acc": 0.8923357664233577 }, { "epoch": 0.8763250883392226, "grad_norm": 14.875, "learning_rate": 4.233096948286008e-06, "loss": 0.485385799407959, "step": 775, "token_acc": 0.8886792452830189 }, { "epoch": 0.8819787985865725, "grad_norm": 10.25, "learning_rate": 4.124124937785375e-06, "loss": 0.5617117404937744, "step": 780, "token_acc": 0.8745318352059925 }, { "epoch": 0.8876325088339223, "grad_norm": 14.625, "learning_rate": 4.019984263754374e-06, "loss": 0.5018572807312012, "step": 785, "token_acc": 0.8718861209964412 }, { "epoch": 0.893286219081272, "grad_norm": 20.25, "learning_rate": 3.920715580985813e-06, "loss": 0.5424814224243164, "step": 790, "token_acc": 0.8679245283018868 }, { "epoch": 0.8989399293286219, "grad_norm": 7.375, "learning_rate": 3.8263576423278684e-06, "loss": 0.3595900058746338, "step": 795, "token_acc": 0.9087591240875912 }, { "epoch": 0.9045936395759717, "grad_norm": 13.9375, "learning_rate": 3.736947283555621e-06, "loss": 0.5959813117980957, "step": 800, "token_acc": 0.8745019920318725 }, { "epoch": 0.9102473498233216, "grad_norm": 9.6875, "learning_rate": 3.6525194089909827e-06, "loss": 0.6750380039215088, "step": 805, "token_acc": 0.8412098298676749 }, { "epoch": 0.9159010600706714, "grad_norm": 14.4375, "learning_rate": 3.5731069778766223e-06, "loss": 0.680885648727417, "step": 810, "token_acc": 0.8374291115311909 }, { "epoch": 0.9215547703180212, "grad_norm": 9.0, "learning_rate": 3.498740991509231e-06, "loss": 0.5743994235992431, "step": 815, "token_acc": 0.8820224719101124 }, { "epoch": 0.927208480565371, "grad_norm": 17.5, "learning_rate": 3.4294504811371234e-06, "loss": 0.5227997779846192, "step": 820, "token_acc": 0.887189292543021 }, { "epoch": 0.9328621908127208, "grad_norm": 12.8125, "learning_rate": 3.3652624966269193e-06, "loss": 0.41265015602111815, "step": 825, "token_acc": 0.8854545454545455 }, { "epoch": 0.9385159010600707, "grad_norm": 17.125, "learning_rate": 3.306202095903728e-06, "loss": 0.5101790428161621, "step": 830, "token_acc": 0.891588785046729 }, { "epoch": 0.9441696113074205, "grad_norm": 12.5, "learning_rate": 3.252292335168949e-06, "loss": 0.48482298851013184, "step": 835, "token_acc": 0.8655616942909761 }, { "epoch": 0.9498233215547703, "grad_norm": 11.0, "learning_rate": 3.2035542598995146e-06, "loss": 0.5188216209411621, "step": 840, "token_acc": 0.8686679174484052 }, { "epoch": 0.9554770318021202, "grad_norm": 16.625, "learning_rate": 3.1600068966320774e-06, "loss": 0.5178674697875977, "step": 845, "token_acc": 0.8787878787878788 }, { "epoch": 0.9611307420494699, "grad_norm": 9.375, "learning_rate": 3.1216672455353746e-06, "loss": 0.3571352958679199, "step": 850, "token_acc": 0.9163636363636364 }, { "epoch": 0.9667844522968198, "grad_norm": 19.75, "learning_rate": 3.0885502737736366e-06, "loss": 0.5291311740875244, "step": 855, "token_acc": 0.8817829457364341 }, { "epoch": 0.9724381625441696, "grad_norm": 10.625, "learning_rate": 3.0606689096636604e-06, "loss": 0.7415075778961182, "step": 860, "token_acc": 0.8426763110307414 }, { "epoch": 0.9780918727915194, "grad_norm": 11.125, "learning_rate": 3.0380340376278078e-06, "loss": 0.4753167152404785, "step": 865, "token_acc": 0.8872180451127819 }, { "epoch": 0.9837455830388693, "grad_norm": 13.75, "learning_rate": 3.0206544939449e-06, "loss": 0.5666730403900146, "step": 870, "token_acc": 0.8776595744680851 }, { "epoch": 0.9893992932862191, "grad_norm": 14.1875, "learning_rate": 3.0085370633006945e-06, "loss": 0.5065449714660645, "step": 875, "token_acc": 0.8717948717948718 }, { "epoch": 0.995053003533569, "grad_norm": 12.8125, "learning_rate": 3.0016864761392417e-06, "loss": 0.5106320858001709, "step": 880, "token_acc": 0.874031007751938 } ], "logging_steps": 5, "max_steps": 884, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.272720385457565e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }