| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9995759717314487, |
| "eval_steps": 200, |
| "global_step": 884, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.0011307420494699647, |
| "grad_norm": 8.3125, |
| "learning_rate": 3.3707865168539325e-07, |
| "loss": 0.493535578250885, |
| "step": 1, |
| "token_acc": 0.8620689655172413 |
| }, |
| { |
| "epoch": 0.005653710247349823, |
| "grad_norm": 10.0625, |
| "learning_rate": 1.6853932584269663e-06, |
| "loss": 0.4733062982559204, |
| "step": 5, |
| "token_acc": 0.8886255924170616 |
| }, |
| { |
| "epoch": 0.011307420494699646, |
| "grad_norm": 7.8125, |
| "learning_rate": 3.3707865168539327e-06, |
| "loss": 0.575874137878418, |
| "step": 10, |
| "token_acc": 0.8524590163934426 |
| }, |
| { |
| "epoch": 0.01696113074204947, |
| "grad_norm": 10.625, |
| "learning_rate": 5.056179775280899e-06, |
| "loss": 0.5115116596221924, |
| "step": 15, |
| "token_acc": 0.8833652007648184 |
| }, |
| { |
| "epoch": 0.022614840989399292, |
| "grad_norm": 9.1875, |
| "learning_rate": 6.741573033707865e-06, |
| "loss": 0.5164161682128906, |
| "step": 20, |
| "token_acc": 0.858195211786372 |
| }, |
| { |
| "epoch": 0.028268551236749116, |
| "grad_norm": 7.28125, |
| "learning_rate": 8.426966292134832e-06, |
| "loss": 0.500755786895752, |
| "step": 25, |
| "token_acc": 0.8675623800383877 |
| }, |
| { |
| "epoch": 0.03392226148409894, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.0112359550561798e-05, |
| "loss": 0.45065789222717284, |
| "step": 30, |
| "token_acc": 0.8818011257035647 |
| }, |
| { |
| "epoch": 0.039575971731448764, |
| "grad_norm": 9.375, |
| "learning_rate": 1.1797752808988765e-05, |
| "loss": 0.5452223300933838, |
| "step": 35, |
| "token_acc": 0.8604651162790697 |
| }, |
| { |
| "epoch": 0.045229681978798585, |
| "grad_norm": 12.0, |
| "learning_rate": 1.348314606741573e-05, |
| "loss": 0.46700444221496584, |
| "step": 40, |
| "token_acc": 0.8830188679245283 |
| }, |
| { |
| "epoch": 0.05088339222614841, |
| "grad_norm": 8.4375, |
| "learning_rate": 1.5168539325842698e-05, |
| "loss": 0.4567443370819092, |
| "step": 45, |
| "token_acc": 0.8964879852125693 |
| }, |
| { |
| "epoch": 0.05653710247349823, |
| "grad_norm": 13.5, |
| "learning_rate": 1.6853932584269665e-05, |
| "loss": 0.5110898494720459, |
| "step": 50, |
| "token_acc": 0.864406779661017 |
| }, |
| { |
| "epoch": 0.06219081272084806, |
| "grad_norm": 5.0, |
| "learning_rate": 1.853932584269663e-05, |
| "loss": 0.4209805965423584, |
| "step": 55, |
| "token_acc": 0.8962264150943396 |
| }, |
| { |
| "epoch": 0.06784452296819787, |
| "grad_norm": 8.125, |
| "learning_rate": 2.0224719101123596e-05, |
| "loss": 0.4727331638336182, |
| "step": 60, |
| "token_acc": 0.891941391941392 |
| }, |
| { |
| "epoch": 0.0734982332155477, |
| "grad_norm": 11.125, |
| "learning_rate": 2.1910112359550563e-05, |
| "loss": 0.390924072265625, |
| "step": 65, |
| "token_acc": 0.8983364140480592 |
| }, |
| { |
| "epoch": 0.07915194346289753, |
| "grad_norm": 7.09375, |
| "learning_rate": 2.359550561797753e-05, |
| "loss": 0.3485325813293457, |
| "step": 70, |
| "token_acc": 0.9133709981167608 |
| }, |
| { |
| "epoch": 0.08480565371024736, |
| "grad_norm": 9.875, |
| "learning_rate": 2.5280898876404494e-05, |
| "loss": 0.4826796531677246, |
| "step": 75, |
| "token_acc": 0.8802946593001841 |
| }, |
| { |
| "epoch": 0.09045936395759717, |
| "grad_norm": 8.5, |
| "learning_rate": 2.696629213483146e-05, |
| "loss": 0.4428101539611816, |
| "step": 80, |
| "token_acc": 0.8783269961977186 |
| }, |
| { |
| "epoch": 0.096113074204947, |
| "grad_norm": 9.375, |
| "learning_rate": 2.865168539325843e-05, |
| "loss": 0.48508028984069823, |
| "step": 85, |
| "token_acc": 0.886654478976234 |
| }, |
| { |
| "epoch": 0.10176678445229682, |
| "grad_norm": 13.5, |
| "learning_rate": 2.999989459318379e-05, |
| "loss": 0.4538856506347656, |
| "step": 90, |
| "token_acc": 0.9044943820224719 |
| }, |
| { |
| "epoch": 0.10742049469964664, |
| "grad_norm": 8.4375, |
| "learning_rate": 2.999620552744515e-05, |
| "loss": 0.4510812282562256, |
| "step": 95, |
| "token_acc": 0.9045801526717557 |
| }, |
| { |
| "epoch": 0.11307420494699646, |
| "grad_norm": 16.5, |
| "learning_rate": 2.998724776679495e-05, |
| "loss": 0.49095354080200193, |
| "step": 100, |
| "token_acc": 0.8860294117647058 |
| }, |
| { |
| "epoch": 0.11872791519434629, |
| "grad_norm": 9.875, |
| "learning_rate": 2.997302480819445e-05, |
| "loss": 0.41774497032165525, |
| "step": 105, |
| "token_acc": 0.8880597014925373 |
| }, |
| { |
| "epoch": 0.12438162544169612, |
| "grad_norm": 7.15625, |
| "learning_rate": 2.9953542204050917e-05, |
| "loss": 0.30308961868286133, |
| "step": 110, |
| "token_acc": 0.9306049822064056 |
| }, |
| { |
| "epoch": 0.13003533568904593, |
| "grad_norm": 16.0, |
| "learning_rate": 2.9928807560050043e-05, |
| "loss": 0.5130989074707031, |
| "step": 115, |
| "token_acc": 0.8803738317757009 |
| }, |
| { |
| "epoch": 0.13568904593639575, |
| "grad_norm": 6.84375, |
| "learning_rate": 2.9898830532186824e-05, |
| "loss": 0.4366453647613525, |
| "step": 120, |
| "token_acc": 0.886654478976234 |
| }, |
| { |
| "epoch": 0.1413427561837456, |
| "grad_norm": 10.875, |
| "learning_rate": 2.9863622822996006e-05, |
| "loss": 0.5397056102752685, |
| "step": 125, |
| "token_acc": 0.8822429906542056 |
| }, |
| { |
| "epoch": 0.1469964664310954, |
| "grad_norm": 12.5, |
| "learning_rate": 2.982319817698363e-05, |
| "loss": 0.5331222534179687, |
| "step": 130, |
| "token_acc": 0.8604651162790697 |
| }, |
| { |
| "epoch": 0.15265017667844524, |
| "grad_norm": 12.1875, |
| "learning_rate": 2.977757237526136e-05, |
| "loss": 0.343440842628479, |
| "step": 135, |
| "token_acc": 0.9131238447319778 |
| }, |
| { |
| "epoch": 0.15830388692579506, |
| "grad_norm": 12.625, |
| "learning_rate": 2.9726763229385863e-05, |
| "loss": 0.47114005088806155, |
| "step": 140, |
| "token_acc": 0.886654478976234 |
| }, |
| { |
| "epoch": 0.16395759717314487, |
| "grad_norm": 8.625, |
| "learning_rate": 2.9670790574405432e-05, |
| "loss": 0.4388119697570801, |
| "step": 145, |
| "token_acc": 0.8894927536231884 |
| }, |
| { |
| "epoch": 0.1696113074204947, |
| "grad_norm": 6.6875, |
| "learning_rate": 2.9609676261116703e-05, |
| "loss": 0.558556079864502, |
| "step": 150, |
| "token_acc": 0.8729582577132486 |
| }, |
| { |
| "epoch": 0.17526501766784452, |
| "grad_norm": 10.375, |
| "learning_rate": 2.9543444147534497e-05, |
| "loss": 0.4793074131011963, |
| "step": 155, |
| "token_acc": 0.8905660377358491 |
| }, |
| { |
| "epoch": 0.18091872791519434, |
| "grad_norm": 8.3125, |
| "learning_rate": 2.947212008957803e-05, |
| "loss": 0.48024735450744627, |
| "step": 160, |
| "token_acc": 0.8924528301886793 |
| }, |
| { |
| "epoch": 0.18657243816254418, |
| "grad_norm": 19.875, |
| "learning_rate": 2.9395731930977187e-05, |
| "loss": 0.36183197498321534, |
| "step": 165, |
| "token_acc": 0.9111969111969112 |
| }, |
| { |
| "epoch": 0.192226148409894, |
| "grad_norm": 16.5, |
| "learning_rate": 2.9314309492402806e-05, |
| "loss": 0.5196819305419922, |
| "step": 170, |
| "token_acc": 0.8822393822393823 |
| }, |
| { |
| "epoch": 0.1978798586572438, |
| "grad_norm": 11.5625, |
| "learning_rate": 2.922788455982516e-05, |
| "loss": 0.46062512397766114, |
| "step": 175, |
| "token_acc": 0.8813559322033898 |
| }, |
| { |
| "epoch": 0.20353356890459365, |
| "grad_norm": 4.65625, |
| "learning_rate": 2.9136490872105272e-05, |
| "loss": 0.35768780708312986, |
| "step": 180, |
| "token_acc": 0.9044943820224719 |
| }, |
| { |
| "epoch": 0.20918727915194346, |
| "grad_norm": 9.5625, |
| "learning_rate": 2.904016410782379e-05, |
| "loss": 0.49846739768981935, |
| "step": 185, |
| "token_acc": 0.875 |
| }, |
| { |
| "epoch": 0.21484098939929328, |
| "grad_norm": 9.625, |
| "learning_rate": 2.8938941871352683e-05, |
| "loss": 0.5851227760314941, |
| "step": 190, |
| "token_acc": 0.8588007736943907 |
| }, |
| { |
| "epoch": 0.22049469964664312, |
| "grad_norm": 14.3125, |
| "learning_rate": 2.883286367817511e-05, |
| "loss": 0.4418447017669678, |
| "step": 195, |
| "token_acc": 0.8899253731343284 |
| }, |
| { |
| "epoch": 0.22614840989399293, |
| "grad_norm": 11.4375, |
| "learning_rate": 2.872197093945924e-05, |
| "loss": 0.7016141414642334, |
| "step": 200, |
| "token_acc": 0.8148148148148148 |
| }, |
| { |
| "epoch": 0.23180212014134274, |
| "grad_norm": 17.875, |
| "learning_rate": 2.860630694589199e-05, |
| "loss": 0.3626969337463379, |
| "step": 205, |
| "token_acc": 0.9100917431192661 |
| }, |
| { |
| "epoch": 0.23745583038869258, |
| "grad_norm": 12.9375, |
| "learning_rate": 2.8485916850779088e-05, |
| "loss": 0.4723252296447754, |
| "step": 210, |
| "token_acc": 0.8835978835978836 |
| }, |
| { |
| "epoch": 0.2431095406360424, |
| "grad_norm": 11.4375, |
| "learning_rate": 2.8360847652417973e-05, |
| "loss": 0.5085994720458984, |
| "step": 215, |
| "token_acc": 0.893048128342246 |
| }, |
| { |
| "epoch": 0.24876325088339224, |
| "grad_norm": 9.9375, |
| "learning_rate": 2.82311481757504e-05, |
| "loss": 0.4896233081817627, |
| "step": 220, |
| "token_acc": 0.8727272727272727 |
| }, |
| { |
| "epoch": 0.254416961130742, |
| "grad_norm": 11.375, |
| "learning_rate": 2.8096869053302046e-05, |
| "loss": 0.4948256492614746, |
| "step": 225, |
| "token_acc": 0.8743068391866913 |
| }, |
| { |
| "epoch": 0.26007067137809187, |
| "grad_norm": 8.5625, |
| "learning_rate": 2.7958062705416376e-05, |
| "loss": 0.4669034481048584, |
| "step": 230, |
| "token_acc": 0.8822463768115942 |
| }, |
| { |
| "epoch": 0.2657243816254417, |
| "grad_norm": 9.8125, |
| "learning_rate": 2.7814783319790595e-05, |
| "loss": 0.4465358734130859, |
| "step": 235, |
| "token_acc": 0.8836772983114447 |
| }, |
| { |
| "epoch": 0.2713780918727915, |
| "grad_norm": 10.9375, |
| "learning_rate": 2.766708683032173e-05, |
| "loss": 0.4827105522155762, |
| "step": 240, |
| "token_acc": 0.8766859344894027 |
| }, |
| { |
| "epoch": 0.27703180212014133, |
| "grad_norm": 9.4375, |
| "learning_rate": 2.75150308952709e-05, |
| "loss": 0.40639634132385255, |
| "step": 245, |
| "token_acc": 0.9010791366906474 |
| }, |
| { |
| "epoch": 0.2826855123674912, |
| "grad_norm": 11.4375, |
| "learning_rate": 2.735867487475452e-05, |
| "loss": 0.5378179550170898, |
| "step": 250, |
| "token_acc": 0.874766355140187 |
| }, |
| { |
| "epoch": 0.28833922261484096, |
| "grad_norm": 19.125, |
| "learning_rate": 2.7198079807571094e-05, |
| "loss": 0.42697606086730955, |
| "step": 255, |
| "token_acc": 0.8934579439252337 |
| }, |
| { |
| "epoch": 0.2939929328621908, |
| "grad_norm": 7.625, |
| "learning_rate": 2.7033308387372666e-05, |
| "loss": 0.45955357551574705, |
| "step": 260, |
| "token_acc": 0.888045540796964 |
| }, |
| { |
| "epoch": 0.29964664310954064, |
| "grad_norm": 8.875, |
| "learning_rate": 2.6864424938190263e-05, |
| "loss": 0.51542067527771, |
| "step": 265, |
| "token_acc": 0.8812260536398467 |
| }, |
| { |
| "epoch": 0.3053003533568905, |
| "grad_norm": 10.25, |
| "learning_rate": 2.6691495389322878e-05, |
| "loss": 0.49557199478149416, |
| "step": 270, |
| "token_acc": 0.8658536585365854 |
| }, |
| { |
| "epoch": 0.31095406360424027, |
| "grad_norm": 7.125, |
| "learning_rate": 2.651458724959973e-05, |
| "loss": 0.4124931812286377, |
| "step": 275, |
| "token_acc": 0.8986615678776291 |
| }, |
| { |
| "epoch": 0.3166077738515901, |
| "grad_norm": 8.9375, |
| "learning_rate": 2.633376958102597e-05, |
| "loss": 0.6442465782165527, |
| "step": 280, |
| "token_acc": 0.8523364485981308 |
| }, |
| { |
| "epoch": 0.32226148409893995, |
| "grad_norm": 9.875, |
| "learning_rate": 2.614911297182199e-05, |
| "loss": 0.43247137069702146, |
| "step": 285, |
| "token_acc": 0.8936567164179104 |
| }, |
| { |
| "epoch": 0.32791519434628974, |
| "grad_norm": 8.1875, |
| "learning_rate": 2.596068950886699e-05, |
| "loss": 0.40369553565979005, |
| "step": 290, |
| "token_acc": 0.9065934065934066 |
| }, |
| { |
| "epoch": 0.3335689045936396, |
| "grad_norm": 6.75, |
| "learning_rate": 2.5768572749557398e-05, |
| "loss": 0.35304784774780273, |
| "step": 295, |
| "token_acc": 0.9064885496183206 |
| }, |
| { |
| "epoch": 0.3392226148409894, |
| "grad_norm": 10.625, |
| "learning_rate": 2.5572837693091338e-05, |
| "loss": 0.42280235290527346, |
| "step": 300, |
| "token_acc": 0.8916518650088809 |
| }, |
| { |
| "epoch": 0.3448763250883392, |
| "grad_norm": 15.3125, |
| "learning_rate": 2.5373560751190164e-05, |
| "loss": 0.41590089797973634, |
| "step": 305, |
| "token_acc": 0.8983050847457628 |
| }, |
| { |
| "epoch": 0.35053003533568905, |
| "grad_norm": 15.5625, |
| "learning_rate": 2.517081971826858e-05, |
| "loss": 0.5931228637695313, |
| "step": 310, |
| "token_acc": 0.8493408662900188 |
| }, |
| { |
| "epoch": 0.3561837455830389, |
| "grad_norm": 16.0, |
| "learning_rate": 2.4964693741065e-05, |
| "loss": 0.3908259630203247, |
| "step": 315, |
| "token_acc": 0.8956043956043956 |
| }, |
| { |
| "epoch": 0.3618374558303887, |
| "grad_norm": 6.9375, |
| "learning_rate": 2.4755263287743982e-05, |
| "loss": 0.533808708190918, |
| "step": 320, |
| "token_acc": 0.8736842105263158 |
| }, |
| { |
| "epoch": 0.3674911660777385, |
| "grad_norm": 11.75, |
| "learning_rate": 2.4542610116482777e-05, |
| "loss": 0.43399462699890134, |
| "step": 325, |
| "token_acc": 0.8923076923076924 |
| }, |
| { |
| "epoch": 0.37314487632508836, |
| "grad_norm": 16.375, |
| "learning_rate": 2.43268172435543e-05, |
| "loss": 0.5363679885864258, |
| "step": 330, |
| "token_acc": 0.8554006968641115 |
| }, |
| { |
| "epoch": 0.37879858657243815, |
| "grad_norm": 6.46875, |
| "learning_rate": 2.4107968910918943e-05, |
| "loss": 0.4685643196105957, |
| "step": 335, |
| "token_acc": 0.8926553672316384 |
| }, |
| { |
| "epoch": 0.384452296819788, |
| "grad_norm": 11.25, |
| "learning_rate": 2.3886150553337925e-05, |
| "loss": 0.4552040100097656, |
| "step": 340, |
| "token_acc": 0.8849557522123894 |
| }, |
| { |
| "epoch": 0.3901060070671378, |
| "grad_norm": 9.375, |
| "learning_rate": 2.366144876502097e-05, |
| "loss": 0.41571660041809083, |
| "step": 345, |
| "token_acc": 0.8919925512104283 |
| }, |
| { |
| "epoch": 0.3957597173144876, |
| "grad_norm": 16.75, |
| "learning_rate": 2.3433951265821347e-05, |
| "loss": 0.48478131294250487, |
| "step": 350, |
| "token_acc": 0.8825688073394495 |
| }, |
| { |
| "epoch": 0.40141342756183745, |
| "grad_norm": 12.875, |
| "learning_rate": 2.320374686699154e-05, |
| "loss": 0.48756847381591795, |
| "step": 355, |
| "token_acc": 0.8759398496240601 |
| }, |
| { |
| "epoch": 0.4070671378091873, |
| "grad_norm": 10.25, |
| "learning_rate": 2.2970925436512743e-05, |
| "loss": 0.5582265853881836, |
| "step": 360, |
| "token_acc": 0.8727272727272727 |
| }, |
| { |
| "epoch": 0.4127208480565371, |
| "grad_norm": 7.125, |
| "learning_rate": 2.2735577864011946e-05, |
| "loss": 0.40789146423339845, |
| "step": 365, |
| "token_acc": 0.903954802259887 |
| }, |
| { |
| "epoch": 0.4183745583038869, |
| "grad_norm": 9.0625, |
| "learning_rate": 2.2497796025280097e-05, |
| "loss": 0.4335779666900635, |
| "step": 370, |
| "token_acc": 0.8895027624309392 |
| }, |
| { |
| "epoch": 0.42402826855123676, |
| "grad_norm": 8.6875, |
| "learning_rate": 2.2257672746405337e-05, |
| "loss": 0.5792682647705079, |
| "step": 375, |
| "token_acc": 0.8518518518518519 |
| }, |
| { |
| "epoch": 0.42968197879858655, |
| "grad_norm": 9.125, |
| "learning_rate": 2.201530176753521e-05, |
| "loss": 0.5717463016510009, |
| "step": 380, |
| "token_acc": 0.8565965583173997 |
| }, |
| { |
| "epoch": 0.4353356890459364, |
| "grad_norm": 6.9375, |
| "learning_rate": 2.17707777062821e-05, |
| "loss": 0.3671257972717285, |
| "step": 385, |
| "token_acc": 0.9074410163339383 |
| }, |
| { |
| "epoch": 0.44098939929328623, |
| "grad_norm": 12.125, |
| "learning_rate": 2.1524196020786038e-05, |
| "loss": 0.5280078887939453, |
| "step": 390, |
| "token_acc": 0.8771929824561403 |
| }, |
| { |
| "epoch": 0.446643109540636, |
| "grad_norm": 21.25, |
| "learning_rate": 2.127565297244947e-05, |
| "loss": 0.4925088882446289, |
| "step": 395, |
| "token_acc": 0.8908765652951699 |
| }, |
| { |
| "epoch": 0.45229681978798586, |
| "grad_norm": 10.3125, |
| "learning_rate": 2.1025245588358365e-05, |
| "loss": 0.4084740161895752, |
| "step": 400, |
| "token_acc": 0.8884758364312267 |
| }, |
| { |
| "epoch": 0.4579505300353357, |
| "grad_norm": 11.1875, |
| "learning_rate": 2.0773071623404486e-05, |
| "loss": 0.6456653594970703, |
| "step": 405, |
| "token_acc": 0.8585461689587426 |
| }, |
| { |
| "epoch": 0.4636042402826855, |
| "grad_norm": 6.6875, |
| "learning_rate": 2.0519229522123453e-05, |
| "loss": 0.6197998046875, |
| "step": 410, |
| "token_acc": 0.8672727272727273 |
| }, |
| { |
| "epoch": 0.46925795053003533, |
| "grad_norm": 8.8125, |
| "learning_rate": 2.026381838026368e-05, |
| "loss": 0.4290182113647461, |
| "step": 415, |
| "token_acc": 0.8931860036832413 |
| }, |
| { |
| "epoch": 0.47491166077738517, |
| "grad_norm": 9.9375, |
| "learning_rate": 2.0006937906100998e-05, |
| "loss": 0.4530322551727295, |
| "step": 420, |
| "token_acc": 0.9066901408450704 |
| }, |
| { |
| "epoch": 0.48056537102473496, |
| "grad_norm": 14.5625, |
| "learning_rate": 1.9748688381514224e-05, |
| "loss": 0.4739545345306396, |
| "step": 425, |
| "token_acc": 0.8712121212121212 |
| }, |
| { |
| "epoch": 0.4862190812720848, |
| "grad_norm": 14.1875, |
| "learning_rate": 1.9489170622836754e-05, |
| "loss": 0.5166975498199463, |
| "step": 430, |
| "token_acc": 0.8643122676579925 |
| }, |
| { |
| "epoch": 0.49187279151943464, |
| "grad_norm": 7.25, |
| "learning_rate": 1.922848594149955e-05, |
| "loss": 0.41360926628112793, |
| "step": 435, |
| "token_acc": 0.8875968992248062 |
| }, |
| { |
| "epoch": 0.4975265017667845, |
| "grad_norm": 8.1875, |
| "learning_rate": 1.896673610448085e-05, |
| "loss": 0.3576143741607666, |
| "step": 440, |
| "token_acc": 0.9126559714795008 |
| }, |
| { |
| "epoch": 0.5031802120141343, |
| "grad_norm": 9.8125, |
| "learning_rate": 1.8704023294578e-05, |
| "loss": 0.4304816246032715, |
| "step": 445, |
| "token_acc": 0.8942486085343229 |
| }, |
| { |
| "epoch": 0.508833922261484, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.8440450070517e-05, |
| "loss": 0.40277585983276365, |
| "step": 450, |
| "token_acc": 0.8975791433891993 |
| }, |
| { |
| "epoch": 0.5144876325088339, |
| "grad_norm": 11.25, |
| "learning_rate": 1.817611932691528e-05, |
| "loss": 0.4203328609466553, |
| "step": 455, |
| "token_acc": 0.8958333333333334 |
| }, |
| { |
| "epoch": 0.5201413427561837, |
| "grad_norm": 15.375, |
| "learning_rate": 1.791113425411332e-05, |
| "loss": 0.4957026481628418, |
| "step": 460, |
| "token_acc": 0.869811320754717 |
| }, |
| { |
| "epoch": 0.5257950530035336, |
| "grad_norm": 12.5, |
| "learning_rate": 1.7645598297890914e-05, |
| "loss": 0.38120887279510496, |
| "step": 465, |
| "token_acc": 0.8954372623574145 |
| }, |
| { |
| "epoch": 0.5314487632508834, |
| "grad_norm": 12.5625, |
| "learning_rate": 1.7379615119083562e-05, |
| "loss": 0.46092791557312013, |
| "step": 470, |
| "token_acc": 0.9040590405904059 |
| }, |
| { |
| "epoch": 0.5371024734982333, |
| "grad_norm": 8.875, |
| "learning_rate": 1.7113288553115094e-05, |
| "loss": 0.35474748611450196, |
| "step": 475, |
| "token_acc": 0.9045045045045045 |
| }, |
| { |
| "epoch": 0.542756183745583, |
| "grad_norm": 11.5, |
| "learning_rate": 1.6846722569461957e-05, |
| "loss": 0.6311816215515137, |
| "step": 480, |
| "token_acc": 0.864376130198915 |
| }, |
| { |
| "epoch": 0.5484098939929328, |
| "grad_norm": 7.46875, |
| "learning_rate": 1.658002123106531e-05, |
| "loss": 0.4077010154724121, |
| "step": 485, |
| "token_acc": 0.8971428571428571 |
| }, |
| { |
| "epoch": 0.5540636042402827, |
| "grad_norm": 9.25, |
| "learning_rate": 1.6313288653706577e-05, |
| "loss": 0.5004054546356201, |
| "step": 490, |
| "token_acc": 0.8745173745173745 |
| }, |
| { |
| "epoch": 0.5597173144876325, |
| "grad_norm": 10.875, |
| "learning_rate": 1.6046628965362325e-05, |
| "loss": 0.49433560371398927, |
| "step": 495, |
| "token_acc": 0.8756660746003553 |
| }, |
| { |
| "epoch": 0.5653710247349824, |
| "grad_norm": 13.625, |
| "learning_rate": 1.5780146265554462e-05, |
| "loss": 0.5154177188873291, |
| "step": 500, |
| "token_acc": 0.8778359511343804 |
| }, |
| { |
| "epoch": 0.5710247349823322, |
| "grad_norm": 16.0, |
| "learning_rate": 1.5513944584711537e-05, |
| "loss": 0.5768596172332764, |
| "step": 505, |
| "token_acc": 0.8682170542635659 |
| }, |
| { |
| "epoch": 0.5766784452296819, |
| "grad_norm": 11.5, |
| "learning_rate": 1.5248127843556906e-05, |
| "loss": 0.542631196975708, |
| "step": 510, |
| "token_acc": 0.8648148148148148 |
| }, |
| { |
| "epoch": 0.5823321554770318, |
| "grad_norm": 9.3125, |
| "learning_rate": 1.4982799812539898e-05, |
| "loss": 0.44904112815856934, |
| "step": 515, |
| "token_acc": 0.8768656716417911 |
| }, |
| { |
| "epoch": 0.5879858657243816, |
| "grad_norm": 12.0, |
| "learning_rate": 1.471806407132547e-05, |
| "loss": 0.4998485088348389, |
| "step": 520, |
| "token_acc": 0.8848920863309353 |
| }, |
| { |
| "epoch": 0.5936395759717314, |
| "grad_norm": 14.9375, |
| "learning_rate": 1.445402396835848e-05, |
| "loss": 0.4888237476348877, |
| "step": 525, |
| "token_acc": 0.8878676470588235 |
| }, |
| { |
| "epoch": 0.5992932862190813, |
| "grad_norm": 11.875, |
| "learning_rate": 1.4190782580518134e-05, |
| "loss": 0.4358950614929199, |
| "step": 530, |
| "token_acc": 0.8945454545454545 |
| }, |
| { |
| "epoch": 0.6049469964664311, |
| "grad_norm": 13.4375, |
| "learning_rate": 1.3928442672878498e-05, |
| "loss": 0.3919216632843018, |
| "step": 535, |
| "token_acc": 0.9005424954792043 |
| }, |
| { |
| "epoch": 0.610600706713781, |
| "grad_norm": 14.625, |
| "learning_rate": 1.3667106658590713e-05, |
| "loss": 0.4299191474914551, |
| "step": 540, |
| "token_acc": 0.8834586466165414 |
| }, |
| { |
| "epoch": 0.6162544169611307, |
| "grad_norm": 11.0, |
| "learning_rate": 1.3406876558902596e-05, |
| "loss": 0.4752546787261963, |
| "step": 545, |
| "token_acc": 0.8946395563770795 |
| }, |
| { |
| "epoch": 0.6219081272084805, |
| "grad_norm": 8.0, |
| "learning_rate": 1.3147853963331226e-05, |
| "loss": 0.462324857711792, |
| "step": 550, |
| "token_acc": 0.8884892086330936 |
| }, |
| { |
| "epoch": 0.6275618374558304, |
| "grad_norm": 14.125, |
| "learning_rate": 1.2890139990004112e-05, |
| "loss": 0.4813478946685791, |
| "step": 555, |
| "token_acc": 0.8781818181818182 |
| }, |
| { |
| "epoch": 0.6332155477031802, |
| "grad_norm": 10.375, |
| "learning_rate": 1.2633835246184317e-05, |
| "loss": 0.4766115188598633, |
| "step": 560, |
| "token_acc": 0.8956692913385826 |
| }, |
| { |
| "epoch": 0.6388692579505301, |
| "grad_norm": 13.75, |
| "learning_rate": 1.2379039788995068e-05, |
| "loss": 0.555994701385498, |
| "step": 565, |
| "token_acc": 0.8626692456479691 |
| }, |
| { |
| "epoch": 0.6445229681978799, |
| "grad_norm": 11.4375, |
| "learning_rate": 1.2125853086359117e-05, |
| "loss": 0.554969596862793, |
| "step": 570, |
| "token_acc": 0.864406779661017 |
| }, |
| { |
| "epoch": 0.6501766784452296, |
| "grad_norm": 9.375, |
| "learning_rate": 1.1874373978168092e-05, |
| "loss": 0.48480896949768065, |
| "step": 575, |
| "token_acc": 0.8574144486692015 |
| }, |
| { |
| "epoch": 0.6558303886925795, |
| "grad_norm": 10.75, |
| "learning_rate": 1.1624700637697078e-05, |
| "loss": 0.6284814357757569, |
| "step": 580, |
| "token_acc": 0.8512241054613936 |
| }, |
| { |
| "epoch": 0.6614840989399293, |
| "grad_norm": 10.1875, |
| "learning_rate": 1.1376930533279357e-05, |
| "loss": 0.38442087173461914, |
| "step": 585, |
| "token_acc": 0.9003690036900369 |
| }, |
| { |
| "epoch": 0.6671378091872792, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.1131160390256417e-05, |
| "loss": 0.5038439750671386, |
| "step": 590, |
| "token_acc": 0.8627819548872181 |
| }, |
| { |
| "epoch": 0.672791519434629, |
| "grad_norm": 14.3125, |
| "learning_rate": 1.0887486153217962e-05, |
| "loss": 0.45342187881469725, |
| "step": 595, |
| "token_acc": 0.8917910447761194 |
| }, |
| { |
| "epoch": 0.6784452296819788, |
| "grad_norm": 10.3125, |
| "learning_rate": 1.064600294854675e-05, |
| "loss": 0.5414403915405274, |
| "step": 600, |
| "token_acc": 0.8771266540642723 |
| }, |
| { |
| "epoch": 0.6840989399293286, |
| "grad_norm": 12.1875, |
| "learning_rate": 1.0406805047282826e-05, |
| "loss": 0.44243249893188474, |
| "step": 605, |
| "token_acc": 0.8971428571428571 |
| }, |
| { |
| "epoch": 0.6897526501766784, |
| "grad_norm": 9.4375, |
| "learning_rate": 1.0169985828321664e-05, |
| "loss": 0.4240866661071777, |
| "step": 610, |
| "token_acc": 0.900562851782364 |
| }, |
| { |
| "epoch": 0.6954063604240283, |
| "grad_norm": 17.0, |
| "learning_rate": 9.935637741960595e-06, |
| "loss": 0.5268006324768066, |
| "step": 615, |
| "token_acc": 0.8721804511278195 |
| }, |
| { |
| "epoch": 0.7010600706713781, |
| "grad_norm": 13.75, |
| "learning_rate": 9.703852273807745e-06, |
| "loss": 0.5309527397155762, |
| "step": 620, |
| "token_acc": 0.864963503649635 |
| }, |
| { |
| "epoch": 0.7067137809187279, |
| "grad_norm": 10.625, |
| "learning_rate": 9.474719909067592e-06, |
| "loss": 0.46470232009887696, |
| "step": 625, |
| "token_acc": 0.8984674329501916 |
| }, |
| { |
| "epoch": 0.7123674911660778, |
| "grad_norm": 10.75, |
| "learning_rate": 9.248330097216998e-06, |
| "loss": 0.3585221290588379, |
| "step": 630, |
| "token_acc": 0.9072356215213359 |
| }, |
| { |
| "epoch": 0.7180212014134275, |
| "grad_norm": 10.8125, |
| "learning_rate": 9.024771217085648e-06, |
| "loss": 0.45208401679992677, |
| "step": 635, |
| "token_acc": 0.9046728971962616 |
| }, |
| { |
| "epoch": 0.7236749116607774, |
| "grad_norm": 9.0, |
| "learning_rate": 8.804130542354423e-06, |
| "loss": 0.40645594596862794, |
| "step": 640, |
| "token_acc": 0.8970588235294118 |
| }, |
| { |
| "epoch": 0.7293286219081272, |
| "grad_norm": 11.1875, |
| "learning_rate": 8.586494207485173e-06, |
| "loss": 0.5020310878753662, |
| "step": 645, |
| "token_acc": 0.8729582577132486 |
| }, |
| { |
| "epoch": 0.734982332155477, |
| "grad_norm": 26.25, |
| "learning_rate": 8.371947174095276e-06, |
| "loss": 0.5611002445220947, |
| "step": 650, |
| "token_acc": 0.8490566037735849 |
| }, |
| { |
| "epoch": 0.7406360424028269, |
| "grad_norm": 14.8125, |
| "learning_rate": 8.160573197790034e-06, |
| "loss": 0.504447078704834, |
| "step": 655, |
| "token_acc": 0.8843416370106761 |
| }, |
| { |
| "epoch": 0.7462897526501767, |
| "grad_norm": 18.375, |
| "learning_rate": 7.952454795465847e-06, |
| "loss": 0.41913480758666993, |
| "step": 660, |
| "token_acc": 0.8958333333333334 |
| }, |
| { |
| "epoch": 0.7519434628975264, |
| "grad_norm": 8.75, |
| "learning_rate": 7.747673213097013e-06, |
| "loss": 0.4940896511077881, |
| "step": 665, |
| "token_acc": 0.8772893772893773 |
| }, |
| { |
| "epoch": 0.7575971731448763, |
| "grad_norm": 13.5625, |
| "learning_rate": 7.5463083940186235e-06, |
| "loss": 0.47562193870544434, |
| "step": 670, |
| "token_acc": 0.8994614003590664 |
| }, |
| { |
| "epoch": 0.7632508833922261, |
| "grad_norm": 10.0625, |
| "learning_rate": 7.3484389477180245e-06, |
| "loss": 0.48333349227905276, |
| "step": 675, |
| "token_acc": 0.8953488372093024 |
| }, |
| { |
| "epoch": 0.768904593639576, |
| "grad_norm": 8.5, |
| "learning_rate": 7.154142119146981e-06, |
| "loss": 0.41202802658081056, |
| "step": 680, |
| "token_acc": 0.9003831417624522 |
| }, |
| { |
| "epoch": 0.7745583038869258, |
| "grad_norm": 12.9375, |
| "learning_rate": 6.9634937585665066e-06, |
| "loss": 0.47983555793762206, |
| "step": 685, |
| "token_acc": 0.8998211091234347 |
| }, |
| { |
| "epoch": 0.7802120141342757, |
| "grad_norm": 8.6875, |
| "learning_rate": 6.776568291936193e-06, |
| "loss": 0.4969668388366699, |
| "step": 690, |
| "token_acc": 0.8790786948176583 |
| }, |
| { |
| "epoch": 0.7858657243816255, |
| "grad_norm": 10.375, |
| "learning_rate": 6.593438691859566e-06, |
| "loss": 0.44586987495422364, |
| "step": 695, |
| "token_acc": 0.884469696969697 |
| }, |
| { |
| "epoch": 0.7915194346289752, |
| "grad_norm": 11.875, |
| "learning_rate": 6.414176449096749e-06, |
| "loss": 0.5935549736022949, |
| "step": 700, |
| "token_acc": 0.8718929254302104 |
| }, |
| { |
| "epoch": 0.7971731448763251, |
| "grad_norm": 16.75, |
| "learning_rate": 6.238851544655688e-06, |
| "loss": 0.5915599822998047, |
| "step": 705, |
| "token_acc": 0.8622641509433963 |
| }, |
| { |
| "epoch": 0.8028268551236749, |
| "grad_norm": 13.4375, |
| "learning_rate": 6.067532422472728e-06, |
| "loss": 0.45562114715576174, |
| "step": 710, |
| "token_acc": 0.8954372623574145 |
| }, |
| { |
| "epoch": 0.8084805653710248, |
| "grad_norm": 9.5625, |
| "learning_rate": 5.9002859626932115e-06, |
| "loss": 0.44912257194519045, |
| "step": 715, |
| "token_acc": 0.8789571694599627 |
| }, |
| { |
| "epoch": 0.8141342756183746, |
| "grad_norm": 12.6875, |
| "learning_rate": 5.7371774555625925e-06, |
| "loss": 0.4588914394378662, |
| "step": 720, |
| "token_acc": 0.8875638841567292 |
| }, |
| { |
| "epoch": 0.8197879858657244, |
| "grad_norm": 10.5625, |
| "learning_rate": 5.578270575938212e-06, |
| "loss": 0.42406349182128905, |
| "step": 725, |
| "token_acc": 0.9009009009009009 |
| }, |
| { |
| "epoch": 0.8254416961130742, |
| "grad_norm": 13.875, |
| "learning_rate": 5.423627358431671e-06, |
| "loss": 0.49872541427612305, |
| "step": 730, |
| "token_acc": 0.8799249530956847 |
| }, |
| { |
| "epoch": 0.831095406360424, |
| "grad_norm": 17.0, |
| "learning_rate": 5.273308173191575e-06, |
| "loss": 0.48893170356750487, |
| "step": 735, |
| "token_acc": 0.8968105065666041 |
| }, |
| { |
| "epoch": 0.8367491166077738, |
| "grad_norm": 10.3125, |
| "learning_rate": 5.127371702336002e-06, |
| "loss": 0.5636299133300782, |
| "step": 740, |
| "token_acc": 0.860236220472441 |
| }, |
| { |
| "epoch": 0.8424028268551237, |
| "grad_norm": 12.0, |
| "learning_rate": 4.985874917043985e-06, |
| "loss": 0.41251296997070314, |
| "step": 745, |
| "token_acc": 0.8983364140480592 |
| }, |
| { |
| "epoch": 0.8480565371024735, |
| "grad_norm": 12.3125, |
| "learning_rate": 4.848873055314914e-06, |
| "loss": 0.5803286552429199, |
| "step": 750, |
| "token_acc": 0.8612612612612612 |
| }, |
| { |
| "epoch": 0.8537102473498234, |
| "grad_norm": 9.75, |
| "learning_rate": 4.7164196004045305e-06, |
| "loss": 0.5717785835266114, |
| "step": 755, |
| "token_acc": 0.8810408921933085 |
| }, |
| { |
| "epoch": 0.8593639575971731, |
| "grad_norm": 8.9375, |
| "learning_rate": 4.588566259945948e-06, |
| "loss": 0.545508623123169, |
| "step": 760, |
| "token_acc": 0.8704761904761905 |
| }, |
| { |
| "epoch": 0.8650176678445229, |
| "grad_norm": 11.5, |
| "learning_rate": 4.465362945763868e-06, |
| "loss": 0.40853538513183596, |
| "step": 765, |
| "token_acc": 0.9005424954792043 |
| }, |
| { |
| "epoch": 0.8706713780918728, |
| "grad_norm": 15.1875, |
| "learning_rate": 4.3468577543898026e-06, |
| "loss": 0.469269323348999, |
| "step": 770, |
| "token_acc": 0.8923357664233577 |
| }, |
| { |
| "epoch": 0.8763250883392226, |
| "grad_norm": 14.875, |
| "learning_rate": 4.233096948286008e-06, |
| "loss": 0.485385799407959, |
| "step": 775, |
| "token_acc": 0.8886792452830189 |
| }, |
| { |
| "epoch": 0.8819787985865725, |
| "grad_norm": 10.25, |
| "learning_rate": 4.124124937785375e-06, |
| "loss": 0.5617117404937744, |
| "step": 780, |
| "token_acc": 0.8745318352059925 |
| }, |
| { |
| "epoch": 0.8876325088339223, |
| "grad_norm": 14.625, |
| "learning_rate": 4.019984263754374e-06, |
| "loss": 0.5018572807312012, |
| "step": 785, |
| "token_acc": 0.8718861209964412 |
| }, |
| { |
| "epoch": 0.893286219081272, |
| "grad_norm": 20.25, |
| "learning_rate": 3.920715580985813e-06, |
| "loss": 0.5424814224243164, |
| "step": 790, |
| "token_acc": 0.8679245283018868 |
| }, |
| { |
| "epoch": 0.8989399293286219, |
| "grad_norm": 7.375, |
| "learning_rate": 3.8263576423278684e-06, |
| "loss": 0.3595900058746338, |
| "step": 795, |
| "token_acc": 0.9087591240875912 |
| }, |
| { |
| "epoch": 0.9045936395759717, |
| "grad_norm": 13.9375, |
| "learning_rate": 3.736947283555621e-06, |
| "loss": 0.5959813117980957, |
| "step": 800, |
| "token_acc": 0.8745019920318725 |
| }, |
| { |
| "epoch": 0.9102473498233216, |
| "grad_norm": 9.6875, |
| "learning_rate": 3.6525194089909827e-06, |
| "loss": 0.6750380039215088, |
| "step": 805, |
| "token_acc": 0.8412098298676749 |
| }, |
| { |
| "epoch": 0.9159010600706714, |
| "grad_norm": 14.4375, |
| "learning_rate": 3.5731069778766223e-06, |
| "loss": 0.680885648727417, |
| "step": 810, |
| "token_acc": 0.8374291115311909 |
| }, |
| { |
| "epoch": 0.9215547703180212, |
| "grad_norm": 9.0, |
| "learning_rate": 3.498740991509231e-06, |
| "loss": 0.5743994235992431, |
| "step": 815, |
| "token_acc": 0.8820224719101124 |
| }, |
| { |
| "epoch": 0.927208480565371, |
| "grad_norm": 17.5, |
| "learning_rate": 3.4294504811371234e-06, |
| "loss": 0.5227997779846192, |
| "step": 820, |
| "token_acc": 0.887189292543021 |
| }, |
| { |
| "epoch": 0.9328621908127208, |
| "grad_norm": 12.8125, |
| "learning_rate": 3.3652624966269193e-06, |
| "loss": 0.41265015602111815, |
| "step": 825, |
| "token_acc": 0.8854545454545455 |
| }, |
| { |
| "epoch": 0.9385159010600707, |
| "grad_norm": 17.125, |
| "learning_rate": 3.306202095903728e-06, |
| "loss": 0.5101790428161621, |
| "step": 830, |
| "token_acc": 0.891588785046729 |
| }, |
| { |
| "epoch": 0.9441696113074205, |
| "grad_norm": 12.5, |
| "learning_rate": 3.252292335168949e-06, |
| "loss": 0.48482298851013184, |
| "step": 835, |
| "token_acc": 0.8655616942909761 |
| }, |
| { |
| "epoch": 0.9498233215547703, |
| "grad_norm": 11.0, |
| "learning_rate": 3.2035542598995146e-06, |
| "loss": 0.5188216209411621, |
| "step": 840, |
| "token_acc": 0.8686679174484052 |
| }, |
| { |
| "epoch": 0.9554770318021202, |
| "grad_norm": 16.625, |
| "learning_rate": 3.1600068966320774e-06, |
| "loss": 0.5178674697875977, |
| "step": 845, |
| "token_acc": 0.8787878787878788 |
| }, |
| { |
| "epoch": 0.9611307420494699, |
| "grad_norm": 9.375, |
| "learning_rate": 3.1216672455353746e-06, |
| "loss": 0.3571352958679199, |
| "step": 850, |
| "token_acc": 0.9163636363636364 |
| }, |
| { |
| "epoch": 0.9667844522968198, |
| "grad_norm": 19.75, |
| "learning_rate": 3.0885502737736366e-06, |
| "loss": 0.5291311740875244, |
| "step": 855, |
| "token_acc": 0.8817829457364341 |
| }, |
| { |
| "epoch": 0.9724381625441696, |
| "grad_norm": 10.625, |
| "learning_rate": 3.0606689096636604e-06, |
| "loss": 0.7415075778961182, |
| "step": 860, |
| "token_acc": 0.8426763110307414 |
| }, |
| { |
| "epoch": 0.9780918727915194, |
| "grad_norm": 11.125, |
| "learning_rate": 3.0380340376278078e-06, |
| "loss": 0.4753167152404785, |
| "step": 865, |
| "token_acc": 0.8872180451127819 |
| }, |
| { |
| "epoch": 0.9837455830388693, |
| "grad_norm": 13.75, |
| "learning_rate": 3.0206544939449e-06, |
| "loss": 0.5666730403900146, |
| "step": 870, |
| "token_acc": 0.8776595744680851 |
| }, |
| { |
| "epoch": 0.9893992932862191, |
| "grad_norm": 14.1875, |
| "learning_rate": 3.0085370633006945e-06, |
| "loss": 0.5065449714660645, |
| "step": 875, |
| "token_acc": 0.8717948717948718 |
| }, |
| { |
| "epoch": 0.995053003533569, |
| "grad_norm": 12.8125, |
| "learning_rate": 3.0016864761392417e-06, |
| "loss": 0.5106320858001709, |
| "step": 880, |
| "token_acc": 0.874031007751938 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 884, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 200, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.272720385457565e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|