3TF-8B / trainer_state.json
volcanos's picture
Upload folder using huggingface_hub
68e467c
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2562,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00117096018735363,
"grad_norm": 3.2623555660247803,
"learning_rate": 1.5503875968992249e-07,
"loss": 0.591235339641571,
"memory(GiB)": 130.94,
"step": 1,
"token_acc": 0.8447411307509088,
"train_speed(iter/s)": 0.019417
},
{
"epoch": 0.00585480093676815,
"grad_norm": 3.0421900749206543,
"learning_rate": 7.751937984496125e-07,
"loss": 0.5568965077400208,
"memory(GiB)": 131.94,
"step": 5,
"token_acc": 0.8460283319735964,
"train_speed(iter/s)": 0.043842
},
{
"epoch": 0.0117096018735363,
"grad_norm": 2.8613500595092773,
"learning_rate": 1.550387596899225e-06,
"loss": 0.5551010131835937,
"memory(GiB)": 131.94,
"step": 10,
"token_acc": 0.8356083810191254,
"train_speed(iter/s)": 0.052588
},
{
"epoch": 0.01756440281030445,
"grad_norm": 2.1271445751190186,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.5327572345733642,
"memory(GiB)": 132.78,
"step": 15,
"token_acc": 0.8394243615921753,
"train_speed(iter/s)": 0.057201
},
{
"epoch": 0.0234192037470726,
"grad_norm": 1.0847800970077515,
"learning_rate": 3.10077519379845e-06,
"loss": 0.46837658882141114,
"memory(GiB)": 132.78,
"step": 20,
"token_acc": 0.8431611509225723,
"train_speed(iter/s)": 0.059425
},
{
"epoch": 0.02927400468384075,
"grad_norm": 0.8750381469726562,
"learning_rate": 3.875968992248063e-06,
"loss": 0.44452896118164065,
"memory(GiB)": 132.78,
"step": 25,
"token_acc": 0.8543377731613794,
"train_speed(iter/s)": 0.060974
},
{
"epoch": 0.0351288056206089,
"grad_norm": 0.5234003663063049,
"learning_rate": 4.651162790697675e-06,
"loss": 0.42905311584472655,
"memory(GiB)": 132.78,
"step": 30,
"token_acc": 0.8542961149814849,
"train_speed(iter/s)": 0.062206
},
{
"epoch": 0.040983606557377046,
"grad_norm": 0.5018875598907471,
"learning_rate": 5.4263565891472865e-06,
"loss": 0.4269443988800049,
"memory(GiB)": 132.78,
"step": 35,
"token_acc": 0.8522571433054558,
"train_speed(iter/s)": 0.063069
},
{
"epoch": 0.0468384074941452,
"grad_norm": 0.38381680846214294,
"learning_rate": 6.2015503875969e-06,
"loss": 0.40071582794189453,
"memory(GiB)": 132.78,
"step": 40,
"token_acc": 0.8555349764923779,
"train_speed(iter/s)": 0.063736
},
{
"epoch": 0.05269320843091335,
"grad_norm": 0.3016009032726288,
"learning_rate": 6.976744186046513e-06,
"loss": 0.4113297462463379,
"memory(GiB)": 132.78,
"step": 45,
"token_acc": 0.8491327275191562,
"train_speed(iter/s)": 0.064231
},
{
"epoch": 0.0585480093676815,
"grad_norm": 0.2976464331150055,
"learning_rate": 7.751937984496126e-06,
"loss": 0.4019885540008545,
"memory(GiB)": 132.78,
"step": 50,
"token_acc": 0.8631093056438779,
"train_speed(iter/s)": 0.064655
},
{
"epoch": 0.06440281030444965,
"grad_norm": 0.24487970769405365,
"learning_rate": 8.527131782945736e-06,
"loss": 0.3938943386077881,
"memory(GiB)": 132.78,
"step": 55,
"token_acc": 0.8631337386589192,
"train_speed(iter/s)": 0.065047
},
{
"epoch": 0.0702576112412178,
"grad_norm": 0.23692984879016876,
"learning_rate": 9.30232558139535e-06,
"loss": 0.41377553939819334,
"memory(GiB)": 132.97,
"step": 60,
"token_acc": 0.8609983103219724,
"train_speed(iter/s)": 0.065302
},
{
"epoch": 0.07611241217798595,
"grad_norm": 0.23579329252243042,
"learning_rate": 1.0077519379844963e-05,
"loss": 0.3947890758514404,
"memory(GiB)": 132.97,
"step": 65,
"token_acc": 0.8729405159237655,
"train_speed(iter/s)": 0.065577
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.2210317999124527,
"learning_rate": 1.0852713178294573e-05,
"loss": 0.3936769962310791,
"memory(GiB)": 132.97,
"step": 70,
"token_acc": 0.860379465686213,
"train_speed(iter/s)": 0.065756
},
{
"epoch": 0.08782201405152225,
"grad_norm": 0.23814593255519867,
"learning_rate": 1.1627906976744187e-05,
"loss": 0.39299349784851073,
"memory(GiB)": 132.97,
"step": 75,
"token_acc": 0.8482827629927034,
"train_speed(iter/s)": 0.06599
},
{
"epoch": 0.0936768149882904,
"grad_norm": 0.24474237859249115,
"learning_rate": 1.24031007751938e-05,
"loss": 0.39170591831207274,
"memory(GiB)": 132.97,
"step": 80,
"token_acc": 0.8571129295007489,
"train_speed(iter/s)": 0.066134
},
{
"epoch": 0.09953161592505855,
"grad_norm": 0.232538640499115,
"learning_rate": 1.3178294573643412e-05,
"loss": 0.3822017669677734,
"memory(GiB)": 132.97,
"step": 85,
"token_acc": 0.8780878727095818,
"train_speed(iter/s)": 0.066366
},
{
"epoch": 0.1053864168618267,
"grad_norm": 0.22437641024589539,
"learning_rate": 1.3953488372093025e-05,
"loss": 0.38762218952178956,
"memory(GiB)": 132.97,
"step": 90,
"token_acc": 0.8614531845562612,
"train_speed(iter/s)": 0.066559
},
{
"epoch": 0.11124121779859485,
"grad_norm": 0.22185830771923065,
"learning_rate": 1.4728682170542636e-05,
"loss": 0.38779487609863283,
"memory(GiB)": 132.97,
"step": 95,
"token_acc": 0.8664786644726099,
"train_speed(iter/s)": 0.066708
},
{
"epoch": 0.117096018735363,
"grad_norm": 0.27393871545791626,
"learning_rate": 1.550387596899225e-05,
"loss": 0.3883920192718506,
"memory(GiB)": 132.97,
"step": 100,
"token_acc": 0.8665084805343176,
"train_speed(iter/s)": 0.066847
},
{
"epoch": 0.12295081967213115,
"grad_norm": 0.22931204736232758,
"learning_rate": 1.6279069767441862e-05,
"loss": 0.38483271598815916,
"memory(GiB)": 132.97,
"step": 105,
"token_acc": 0.86842660702191,
"train_speed(iter/s)": 0.066999
},
{
"epoch": 0.1288056206088993,
"grad_norm": 0.24479679763317108,
"learning_rate": 1.7054263565891473e-05,
"loss": 0.37220172882080077,
"memory(GiB)": 132.97,
"step": 110,
"token_acc": 0.867410052595701,
"train_speed(iter/s)": 0.067112
},
{
"epoch": 0.13466042154566746,
"grad_norm": 0.2648003101348877,
"learning_rate": 1.7829457364341087e-05,
"loss": 0.39059298038482665,
"memory(GiB)": 132.97,
"step": 115,
"token_acc": 0.8748554193704952,
"train_speed(iter/s)": 0.067218
},
{
"epoch": 0.1405152224824356,
"grad_norm": 0.26005980372428894,
"learning_rate": 1.86046511627907e-05,
"loss": 0.3818374156951904,
"memory(GiB)": 132.97,
"step": 120,
"token_acc": 0.8672951527027911,
"train_speed(iter/s)": 0.06731
},
{
"epoch": 0.14637002341920374,
"grad_norm": 0.25006258487701416,
"learning_rate": 1.937984496124031e-05,
"loss": 0.3956636428833008,
"memory(GiB)": 132.97,
"step": 125,
"token_acc": 0.8609716918038115,
"train_speed(iter/s)": 0.067383
},
{
"epoch": 0.1522248243559719,
"grad_norm": 0.2747514545917511,
"learning_rate": 1.9999991663467044e-05,
"loss": 0.3932375907897949,
"memory(GiB)": 132.97,
"step": 130,
"token_acc": 0.8660186100028765,
"train_speed(iter/s)": 0.06745
},
{
"epoch": 0.15807962529274006,
"grad_norm": 0.2641543745994568,
"learning_rate": 1.9999699886272926e-05,
"loss": 0.39503839015960696,
"memory(GiB)": 132.97,
"step": 135,
"token_acc": 0.8533355723899442,
"train_speed(iter/s)": 0.067497
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.2637743353843689,
"learning_rate": 1.9998991296330317e-05,
"loss": 0.39163637161254883,
"memory(GiB)": 132.97,
"step": 140,
"token_acc": 0.8673312165879645,
"train_speed(iter/s)": 0.067529
},
{
"epoch": 0.16978922716627634,
"grad_norm": 0.2526402175426483,
"learning_rate": 1.9997865923175027e-05,
"loss": 0.3822649002075195,
"memory(GiB)": 132.97,
"step": 145,
"token_acc": 0.8725527891092668,
"train_speed(iter/s)": 0.067555
},
{
"epoch": 0.1756440281030445,
"grad_norm": 0.2798239588737488,
"learning_rate": 1.999632381371545e-05,
"loss": 0.388509464263916,
"memory(GiB)": 133.05,
"step": 150,
"token_acc": 0.8570005695948406,
"train_speed(iter/s)": 0.067614
},
{
"epoch": 0.18149882903981265,
"grad_norm": 0.24978382885456085,
"learning_rate": 1.999436503223061e-05,
"loss": 0.38669638633728026,
"memory(GiB)": 133.05,
"step": 155,
"token_acc": 0.8657276078873382,
"train_speed(iter/s)": 0.067633
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.2820796072483063,
"learning_rate": 1.9991989660367463e-05,
"loss": 0.39322915077209475,
"memory(GiB)": 133.05,
"step": 160,
"token_acc": 0.8509120957934454,
"train_speed(iter/s)": 0.06766
},
{
"epoch": 0.19320843091334894,
"grad_norm": 0.25325024127960205,
"learning_rate": 1.998919779713751e-05,
"loss": 0.3963874578475952,
"memory(GiB)": 133.05,
"step": 165,
"token_acc": 0.8568015157690381,
"train_speed(iter/s)": 0.067694
},
{
"epoch": 0.1990632318501171,
"grad_norm": 0.23693059384822845,
"learning_rate": 1.998598955891266e-05,
"loss": 0.3861080169677734,
"memory(GiB)": 133.05,
"step": 170,
"token_acc": 0.8704777077082435,
"train_speed(iter/s)": 0.067738
},
{
"epoch": 0.20491803278688525,
"grad_norm": 0.24995002150535583,
"learning_rate": 1.9982365079420382e-05,
"loss": 0.3748037338256836,
"memory(GiB)": 133.05,
"step": 175,
"token_acc": 0.8639677636839712,
"train_speed(iter/s)": 0.06777
},
{
"epoch": 0.2107728337236534,
"grad_norm": 0.2528163492679596,
"learning_rate": 1.9978324509738147e-05,
"loss": 0.37778520584106445,
"memory(GiB)": 133.05,
"step": 180,
"token_acc": 0.8692558237224801,
"train_speed(iter/s)": 0.067808
},
{
"epoch": 0.21662763466042154,
"grad_norm": 0.26185593008995056,
"learning_rate": 1.9973868018287093e-05,
"loss": 0.37712826728820803,
"memory(GiB)": 133.05,
"step": 185,
"token_acc": 0.8629621624330818,
"train_speed(iter/s)": 0.067862
},
{
"epoch": 0.2224824355971897,
"grad_norm": 0.2565723955631256,
"learning_rate": 1.9968995790825048e-05,
"loss": 0.38217387199401853,
"memory(GiB)": 133.05,
"step": 190,
"token_acc": 0.8526548122357622,
"train_speed(iter/s)": 0.06787
},
{
"epoch": 0.22833723653395785,
"grad_norm": 0.24071918427944183,
"learning_rate": 1.9963708030438754e-05,
"loss": 0.38128018379211426,
"memory(GiB)": 133.05,
"step": 195,
"token_acc": 0.86564623713995,
"train_speed(iter/s)": 0.067888
},
{
"epoch": 0.234192037470726,
"grad_norm": 0.2468400001525879,
"learning_rate": 1.995800495753542e-05,
"loss": 0.38081438541412355,
"memory(GiB)": 133.05,
"step": 200,
"token_acc": 0.8573196660493942,
"train_speed(iter/s)": 0.0679
},
{
"epoch": 0.24004683840749413,
"grad_norm": 0.24025513231754303,
"learning_rate": 1.9951886809833537e-05,
"loss": 0.39122610092163085,
"memory(GiB)": 133.05,
"step": 205,
"token_acc": 0.8568699202170693,
"train_speed(iter/s)": 0.067926
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.262650728225708,
"learning_rate": 1.9945353842352943e-05,
"loss": 0.38733615875244143,
"memory(GiB)": 133.05,
"step": 210,
"token_acc": 0.8605185069498672,
"train_speed(iter/s)": 0.067945
},
{
"epoch": 0.25175644028103045,
"grad_norm": 0.2334696501493454,
"learning_rate": 1.9938406327404233e-05,
"loss": 0.38346500396728517,
"memory(GiB)": 133.05,
"step": 215,
"token_acc": 0.8689892435384466,
"train_speed(iter/s)": 0.067989
},
{
"epoch": 0.2576112412177986,
"grad_norm": 0.2296629697084427,
"learning_rate": 1.9931044554577373e-05,
"loss": 0.3805164575576782,
"memory(GiB)": 133.05,
"step": 220,
"token_acc": 0.862054141615526,
"train_speed(iter/s)": 0.068004
},
{
"epoch": 0.26346604215456676,
"grad_norm": 0.2337953746318817,
"learning_rate": 1.992326883072965e-05,
"loss": 0.38329010009765624,
"memory(GiB)": 133.05,
"step": 225,
"token_acc": 0.8576421234268423,
"train_speed(iter/s)": 0.068016
},
{
"epoch": 0.2693208430913349,
"grad_norm": 0.22751180827617645,
"learning_rate": 1.991507947997287e-05,
"loss": 0.3914541244506836,
"memory(GiB)": 133.05,
"step": 230,
"token_acc": 0.8647305257189656,
"train_speed(iter/s)": 0.068037
},
{
"epoch": 0.275175644028103,
"grad_norm": 0.23834733664989471,
"learning_rate": 1.9906476843659866e-05,
"loss": 0.3868813753128052,
"memory(GiB)": 133.05,
"step": 235,
"token_acc": 0.8718037707532127,
"train_speed(iter/s)": 0.068062
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.2157682329416275,
"learning_rate": 1.989746128037024e-05,
"loss": 0.3725996971130371,
"memory(GiB)": 133.05,
"step": 240,
"token_acc": 0.8637500196081507,
"train_speed(iter/s)": 0.068079
},
{
"epoch": 0.28688524590163933,
"grad_norm": 0.24432708323001862,
"learning_rate": 1.988803316589545e-05,
"loss": 0.38200843334198,
"memory(GiB)": 133.05,
"step": 245,
"token_acc": 0.863402893772779,
"train_speed(iter/s)": 0.068119
},
{
"epoch": 0.2927400468384075,
"grad_norm": 0.22754515707492828,
"learning_rate": 1.987819289322311e-05,
"loss": 0.38454749584198,
"memory(GiB)": 133.05,
"step": 250,
"token_acc": 0.8616220657129776,
"train_speed(iter/s)": 0.068158
},
{
"epoch": 0.29859484777517564,
"grad_norm": 0.22906067967414856,
"learning_rate": 1.9867940872520646e-05,
"loss": 0.38929970264434816,
"memory(GiB)": 133.05,
"step": 255,
"token_acc": 0.862697854653979,
"train_speed(iter/s)": 0.068143
},
{
"epoch": 0.3044496487119438,
"grad_norm": 0.2391372174024582,
"learning_rate": 1.9857277531118173e-05,
"loss": 0.38328697681427004,
"memory(GiB)": 133.05,
"step": 260,
"token_acc": 0.875577325482754,
"train_speed(iter/s)": 0.068151
},
{
"epoch": 0.31030444964871196,
"grad_norm": 0.23862990736961365,
"learning_rate": 1.9846203313490697e-05,
"loss": 0.3745781660079956,
"memory(GiB)": 133.05,
"step": 265,
"token_acc": 0.8789255692291267,
"train_speed(iter/s)": 0.068172
},
{
"epoch": 0.3161592505854801,
"grad_norm": 0.2886284291744232,
"learning_rate": 1.983471868123958e-05,
"loss": 0.37299673557281493,
"memory(GiB)": 133.05,
"step": 270,
"token_acc": 0.8619748050993121,
"train_speed(iter/s)": 0.068214
},
{
"epoch": 0.32201405152224827,
"grad_norm": 0.25015807151794434,
"learning_rate": 1.98228241130733e-05,
"loss": 0.39740839004516604,
"memory(GiB)": 133.05,
"step": 275,
"token_acc": 0.8667058589327261,
"train_speed(iter/s)": 0.068226
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.22695152461528778,
"learning_rate": 1.98105201047875e-05,
"loss": 0.3711256980895996,
"memory(GiB)": 133.05,
"step": 280,
"token_acc": 0.8709827404894823,
"train_speed(iter/s)": 0.068264
},
{
"epoch": 0.3337236533957845,
"grad_norm": 0.25948262214660645,
"learning_rate": 1.9797807169244326e-05,
"loss": 0.376755690574646,
"memory(GiB)": 133.05,
"step": 285,
"token_acc": 0.8627933786950365,
"train_speed(iter/s)": 0.068275
},
{
"epoch": 0.3395784543325527,
"grad_norm": 0.2252376824617386,
"learning_rate": 1.9784685836351045e-05,
"loss": 0.3907461166381836,
"memory(GiB)": 133.05,
"step": 290,
"token_acc": 0.8594050471419237,
"train_speed(iter/s)": 0.068273
},
{
"epoch": 0.34543325526932084,
"grad_norm": 0.2580513656139374,
"learning_rate": 1.9771156653037944e-05,
"loss": 0.38218297958374026,
"memory(GiB)": 133.05,
"step": 295,
"token_acc": 0.8619312594063512,
"train_speed(iter/s)": 0.068289
},
{
"epoch": 0.351288056206089,
"grad_norm": 0.21899765729904175,
"learning_rate": 1.975722018323556e-05,
"loss": 0.3749994277954102,
"memory(GiB)": 133.05,
"step": 300,
"token_acc": 0.8698979752198593,
"train_speed(iter/s)": 0.068281
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.2238709181547165,
"learning_rate": 1.974287700785116e-05,
"loss": 0.37110333442687987,
"memory(GiB)": 133.05,
"step": 305,
"token_acc": 0.8662300629837371,
"train_speed(iter/s)": 0.0683
},
{
"epoch": 0.3629976580796253,
"grad_norm": 0.24307382106781006,
"learning_rate": 1.9728127724744516e-05,
"loss": 0.36276865005493164,
"memory(GiB)": 133.05,
"step": 310,
"token_acc": 0.8663095601853296,
"train_speed(iter/s)": 0.068292
},
{
"epoch": 0.36885245901639346,
"grad_norm": 0.2318965196609497,
"learning_rate": 1.9712972948703006e-05,
"loss": 0.38519649505615233,
"memory(GiB)": 133.05,
"step": 315,
"token_acc": 0.8719233901258103,
"train_speed(iter/s)": 0.068305
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.22240430116653442,
"learning_rate": 1.9697413311415967e-05,
"loss": 0.3795146465301514,
"memory(GiB)": 133.05,
"step": 320,
"token_acc": 0.8447559871358541,
"train_speed(iter/s)": 0.06832
},
{
"epoch": 0.3805620608899297,
"grad_norm": 0.21727585792541504,
"learning_rate": 1.9681449461448386e-05,
"loss": 0.37623322010040283,
"memory(GiB)": 133.05,
"step": 325,
"token_acc": 0.868092485549133,
"train_speed(iter/s)": 0.068321
},
{
"epoch": 0.3864168618266979,
"grad_norm": 0.24871428310871124,
"learning_rate": 1.9665082064213856e-05,
"loss": 0.3804615497589111,
"memory(GiB)": 133.05,
"step": 330,
"token_acc": 0.8632469719807496,
"train_speed(iter/s)": 0.06834
},
{
"epoch": 0.39227166276346603,
"grad_norm": 0.2242128700017929,
"learning_rate": 1.9648311801946823e-05,
"loss": 0.37839736938476565,
"memory(GiB)": 133.05,
"step": 335,
"token_acc": 0.8620339267458229,
"train_speed(iter/s)": 0.068356
},
{
"epoch": 0.3981264637002342,
"grad_norm": 0.23243097960948944,
"learning_rate": 1.9631139373674188e-05,
"loss": 0.3759917736053467,
"memory(GiB)": 133.05,
"step": 340,
"token_acc": 0.8630340491154014,
"train_speed(iter/s)": 0.068364
},
{
"epoch": 0.40398126463700235,
"grad_norm": 0.2167743444442749,
"learning_rate": 1.9613565495186126e-05,
"loss": 0.36579113006591796,
"memory(GiB)": 133.05,
"step": 345,
"token_acc": 0.8630803983851985,
"train_speed(iter/s)": 0.068389
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.2554558515548706,
"learning_rate": 1.9595590899006288e-05,
"loss": 0.3840445280075073,
"memory(GiB)": 133.05,
"step": 350,
"token_acc": 0.8682752142033024,
"train_speed(iter/s)": 0.06839
},
{
"epoch": 0.41569086651053866,
"grad_norm": 0.23864524066448212,
"learning_rate": 1.957721633436124e-05,
"loss": 0.3817277908325195,
"memory(GiB)": 133.05,
"step": 355,
"token_acc": 0.8645090065366,
"train_speed(iter/s)": 0.068384
},
{
"epoch": 0.4215456674473068,
"grad_norm": 0.25255629420280457,
"learning_rate": 1.9558442567149244e-05,
"loss": 0.3791682720184326,
"memory(GiB)": 133.05,
"step": 360,
"token_acc": 0.8775300258130478,
"train_speed(iter/s)": 0.068404
},
{
"epoch": 0.4274004683840749,
"grad_norm": 0.2247135043144226,
"learning_rate": 1.953927037990834e-05,
"loss": 0.3860400915145874,
"memory(GiB)": 133.05,
"step": 365,
"token_acc": 0.8536377662766984,
"train_speed(iter/s)": 0.068414
},
{
"epoch": 0.4332552693208431,
"grad_norm": 0.29746949672698975,
"learning_rate": 1.9519700571783718e-05,
"loss": 0.3866363763809204,
"memory(GiB)": 133.05,
"step": 370,
"token_acc": 0.8695576843716825,
"train_speed(iter/s)": 0.0684
},
{
"epoch": 0.43911007025761123,
"grad_norm": 0.23039910197257996,
"learning_rate": 1.9499733958494405e-05,
"loss": 0.38268446922302246,
"memory(GiB)": 133.05,
"step": 375,
"token_acc": 0.8581758827531537,
"train_speed(iter/s)": 0.068407
},
{
"epoch": 0.4449648711943794,
"grad_norm": 0.23166924715042114,
"learning_rate": 1.947937137229928e-05,
"loss": 0.37559897899627687,
"memory(GiB)": 133.05,
"step": 380,
"token_acc": 0.8744556465509139,
"train_speed(iter/s)": 0.068418
},
{
"epoch": 0.45081967213114754,
"grad_norm": 0.22437815368175507,
"learning_rate": 1.9458613661962366e-05,
"loss": 0.37695770263671874,
"memory(GiB)": 133.05,
"step": 385,
"token_acc": 0.8771398753952836,
"train_speed(iter/s)": 0.068428
},
{
"epoch": 0.4566744730679157,
"grad_norm": 0.23045028746128082,
"learning_rate": 1.943746169271746e-05,
"loss": 0.37760295867919924,
"memory(GiB)": 133.05,
"step": 390,
"token_acc": 0.8759578109502548,
"train_speed(iter/s)": 0.068419
},
{
"epoch": 0.46252927400468385,
"grad_norm": 0.21340611577033997,
"learning_rate": 1.941591634623206e-05,
"loss": 0.38206305503845217,
"memory(GiB)": 133.05,
"step": 395,
"token_acc": 0.8683378180616532,
"train_speed(iter/s)": 0.068433
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.2345254123210907,
"learning_rate": 1.9393978520570638e-05,
"loss": 0.3681832790374756,
"memory(GiB)": 133.05,
"step": 400,
"token_acc": 0.8685244618395304,
"train_speed(iter/s)": 0.068457
},
{
"epoch": 0.47423887587822017,
"grad_norm": 0.23758217692375183,
"learning_rate": 1.9371649130157166e-05,
"loss": 0.36426939964294436,
"memory(GiB)": 133.05,
"step": 405,
"token_acc": 0.8676219452965636,
"train_speed(iter/s)": 0.068464
},
{
"epoch": 0.48009367681498827,
"grad_norm": 0.2363872230052948,
"learning_rate": 1.9348929105737044e-05,
"loss": 0.37017192840576174,
"memory(GiB)": 133.05,
"step": 410,
"token_acc": 0.8679473812363037,
"train_speed(iter/s)": 0.068468
},
{
"epoch": 0.4859484777517564,
"grad_norm": 0.24642601609230042,
"learning_rate": 1.932581939433827e-05,
"loss": 0.38428258895874023,
"memory(GiB)": 133.05,
"step": 415,
"token_acc": 0.8687720441289789,
"train_speed(iter/s)": 0.06847
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.2268989235162735,
"learning_rate": 1.9302320959231997e-05,
"loss": 0.37460167407989503,
"memory(GiB)": 133.05,
"step": 420,
"token_acc": 0.8672426525809843,
"train_speed(iter/s)": 0.068479
},
{
"epoch": 0.49765807962529274,
"grad_norm": 0.21185266971588135,
"learning_rate": 1.927843477989234e-05,
"loss": 0.37124834060668943,
"memory(GiB)": 133.05,
"step": 425,
"token_acc": 0.8814642777451279,
"train_speed(iter/s)": 0.068488
},
{
"epoch": 0.5035128805620609,
"grad_norm": 0.21971659362316132,
"learning_rate": 1.9254161851955587e-05,
"loss": 0.3843217849731445,
"memory(GiB)": 133.05,
"step": 430,
"token_acc": 0.8714790057188723,
"train_speed(iter/s)": 0.068499
},
{
"epoch": 0.509367681498829,
"grad_norm": 0.26225098967552185,
"learning_rate": 1.9229503187178694e-05,
"loss": 0.3771937370300293,
"memory(GiB)": 133.05,
"step": 435,
"token_acc": 0.8658792102647854,
"train_speed(iter/s)": 0.068506
},
{
"epoch": 0.5152224824355972,
"grad_norm": 0.23551629483699799,
"learning_rate": 1.920445981339708e-05,
"loss": 0.37624967098236084,
"memory(GiB)": 133.05,
"step": 440,
"token_acc": 0.8641905035935222,
"train_speed(iter/s)": 0.068518
},
{
"epoch": 0.5210772833723654,
"grad_norm": 0.25343942642211914,
"learning_rate": 1.9179032774481822e-05,
"loss": 0.37384233474731443,
"memory(GiB)": 133.05,
"step": 445,
"token_acc": 0.8723531724486548,
"train_speed(iter/s)": 0.068533
},
{
"epoch": 0.5269320843091335,
"grad_norm": 0.22508122026920319,
"learning_rate": 1.9153223130296125e-05,
"loss": 0.3715523719787598,
"memory(GiB)": 133.05,
"step": 450,
"token_acc": 0.8742618455654583,
"train_speed(iter/s)": 0.068547
},
{
"epoch": 0.5327868852459017,
"grad_norm": 0.2273603081703186,
"learning_rate": 1.9127031956651153e-05,
"loss": 0.3753758192062378,
"memory(GiB)": 133.05,
"step": 455,
"token_acc": 0.8717887326571352,
"train_speed(iter/s)": 0.068556
},
{
"epoch": 0.5386416861826698,
"grad_norm": 0.24021831154823303,
"learning_rate": 1.9100460345261175e-05,
"loss": 0.3885939598083496,
"memory(GiB)": 133.05,
"step": 460,
"token_acc": 0.8648985264452413,
"train_speed(iter/s)": 0.068545
},
{
"epoch": 0.544496487119438,
"grad_norm": 0.25094419717788696,
"learning_rate": 1.9073509403698062e-05,
"loss": 0.3836202621459961,
"memory(GiB)": 133.05,
"step": 465,
"token_acc": 0.8716216427648316,
"train_speed(iter/s)": 0.068548
},
{
"epoch": 0.550351288056206,
"grad_norm": 0.2209528684616089,
"learning_rate": 1.9046180255345142e-05,
"loss": 0.3783407688140869,
"memory(GiB)": 133.05,
"step": 470,
"token_acc": 0.871262499689834,
"train_speed(iter/s)": 0.068558
},
{
"epoch": 0.5562060889929742,
"grad_norm": 0.2333252876996994,
"learning_rate": 1.9018474039350342e-05,
"loss": 0.37140965461730957,
"memory(GiB)": 133.05,
"step": 475,
"token_acc": 0.870434477460474,
"train_speed(iter/s)": 0.068559
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.22321061789989471,
"learning_rate": 1.899039191057872e-05,
"loss": 0.3732731819152832,
"memory(GiB)": 133.05,
"step": 480,
"token_acc": 0.8651847926051782,
"train_speed(iter/s)": 0.06856
},
{
"epoch": 0.5679156908665105,
"grad_norm": 0.24292093515396118,
"learning_rate": 1.8961935039564338e-05,
"loss": 0.3720050096511841,
"memory(GiB)": 133.05,
"step": 485,
"token_acc": 0.8644098695583844,
"train_speed(iter/s)": 0.06857
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.25076785683631897,
"learning_rate": 1.8933104612461454e-05,
"loss": 0.37432427406311036,
"memory(GiB)": 133.05,
"step": 490,
"token_acc": 0.865598108538928,
"train_speed(iter/s)": 0.068571
},
{
"epoch": 0.5796252927400468,
"grad_norm": 0.2353287786245346,
"learning_rate": 1.8903901830995093e-05,
"loss": 0.37787389755249023,
"memory(GiB)": 133.05,
"step": 495,
"token_acc": 0.8628752281343229,
"train_speed(iter/s)": 0.068571
},
{
"epoch": 0.585480093676815,
"grad_norm": 0.23301288485527039,
"learning_rate": 1.8874327912410945e-05,
"loss": 0.3894960880279541,
"memory(GiB)": 133.05,
"step": 500,
"token_acc": 0.8649986209317486,
"train_speed(iter/s)": 0.068584
},
{
"epoch": 0.5913348946135831,
"grad_norm": 0.23387756943702698,
"learning_rate": 1.884438408942463e-05,
"loss": 0.37682523727416994,
"memory(GiB)": 133.05,
"step": 505,
"token_acc": 0.8542796019209774,
"train_speed(iter/s)": 0.068582
},
{
"epoch": 0.5971896955503513,
"grad_norm": 0.2101481854915619,
"learning_rate": 1.881407161017033e-05,
"loss": 0.3712585210800171,
"memory(GiB)": 133.05,
"step": 510,
"token_acc": 0.8757052407221665,
"train_speed(iter/s)": 0.068594
},
{
"epoch": 0.6030444964871194,
"grad_norm": 0.2197055220603943,
"learning_rate": 1.8783391738148738e-05,
"loss": 0.3659008026123047,
"memory(GiB)": 133.05,
"step": 515,
"token_acc": 0.8690927312016535,
"train_speed(iter/s)": 0.068604
},
{
"epoch": 0.6088992974238876,
"grad_norm": 0.2129889726638794,
"learning_rate": 1.875234575217441e-05,
"loss": 0.36564500331878663,
"memory(GiB)": 133.05,
"step": 520,
"token_acc": 0.8682967700230018,
"train_speed(iter/s)": 0.068614
},
{
"epoch": 0.6147540983606558,
"grad_norm": 0.20078937709331512,
"learning_rate": 1.8720934946322466e-05,
"loss": 0.3801888465881348,
"memory(GiB)": 133.05,
"step": 525,
"token_acc": 0.8619188686453682,
"train_speed(iter/s)": 0.068624
},
{
"epoch": 0.6206088992974239,
"grad_norm": 0.20143865048885345,
"learning_rate": 1.8689160629874622e-05,
"loss": 0.3495650768280029,
"memory(GiB)": 133.05,
"step": 530,
"token_acc": 0.8823268736367693,
"train_speed(iter/s)": 0.068621
},
{
"epoch": 0.6264637002341921,
"grad_norm": 0.20651988685131073,
"learning_rate": 1.865702412726465e-05,
"loss": 0.36185364723205565,
"memory(GiB)": 133.05,
"step": 535,
"token_acc": 0.879171148410336,
"train_speed(iter/s)": 0.068634
},
{
"epoch": 0.6323185011709602,
"grad_norm": 0.2135830670595169,
"learning_rate": 1.8624526778023142e-05,
"loss": 0.36333141326904295,
"memory(GiB)": 133.05,
"step": 540,
"token_acc": 0.8760890123251218,
"train_speed(iter/s)": 0.068639
},
{
"epoch": 0.6381733021077284,
"grad_norm": 0.21670690178871155,
"learning_rate": 1.85916699367217e-05,
"loss": 0.36627764701843263,
"memory(GiB)": 133.05,
"step": 545,
"token_acc": 0.8693160130902993,
"train_speed(iter/s)": 0.068638
},
{
"epoch": 0.6440281030444965,
"grad_norm": 0.2082773894071579,
"learning_rate": 1.855845497291646e-05,
"loss": 0.3783770799636841,
"memory(GiB)": 133.05,
"step": 550,
"token_acc": 0.8656727592628988,
"train_speed(iter/s)": 0.068642
},
{
"epoch": 0.6498829039812647,
"grad_norm": 0.2064507156610489,
"learning_rate": 1.8524883271091004e-05,
"loss": 0.36701202392578125,
"memory(GiB)": 133.05,
"step": 555,
"token_acc": 0.874370974788701,
"train_speed(iter/s)": 0.068639
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.19167180359363556,
"learning_rate": 1.8490956230598668e-05,
"loss": 0.3856034755706787,
"memory(GiB)": 133.05,
"step": 560,
"token_acc": 0.8677655700574375,
"train_speed(iter/s)": 0.068642
},
{
"epoch": 0.6615925058548009,
"grad_norm": 0.22284165024757385,
"learning_rate": 1.8456675265604183e-05,
"loss": 0.36545207500457766,
"memory(GiB)": 133.05,
"step": 565,
"token_acc": 0.8674683330306996,
"train_speed(iter/s)": 0.068653
},
{
"epoch": 0.667447306791569,
"grad_norm": 0.2335020750761032,
"learning_rate": 1.842204180502476e-05,
"loss": 0.36900959014892576,
"memory(GiB)": 133.05,
"step": 570,
"token_acc": 0.8763767159865549,
"train_speed(iter/s)": 0.068659
},
{
"epoch": 0.6733021077283372,
"grad_norm": 0.2406488060951233,
"learning_rate": 1.8387057292470517e-05,
"loss": 0.3836709499359131,
"memory(GiB)": 133.05,
"step": 575,
"token_acc": 0.8667227047725787,
"train_speed(iter/s)": 0.068662
},
{
"epoch": 0.6791569086651054,
"grad_norm": 0.21748137474060059,
"learning_rate": 1.8351723186184295e-05,
"loss": 0.3724257707595825,
"memory(GiB)": 133.05,
"step": 580,
"token_acc": 0.8577895654245747,
"train_speed(iter/s)": 0.068669
},
{
"epoch": 0.6850117096018735,
"grad_norm": 0.2269269824028015,
"learning_rate": 1.8316040958980896e-05,
"loss": 0.3713605165481567,
"memory(GiB)": 133.05,
"step": 585,
"token_acc": 0.8802838494896842,
"train_speed(iter/s)": 0.068677
},
{
"epoch": 0.6908665105386417,
"grad_norm": 0.24186237156391144,
"learning_rate": 1.828001209818567e-05,
"loss": 0.3882193088531494,
"memory(GiB)": 133.05,
"step": 590,
"token_acc": 0.8647495837870993,
"train_speed(iter/s)": 0.068687
},
{
"epoch": 0.6967213114754098,
"grad_norm": 0.24182303249835968,
"learning_rate": 1.8243638105572547e-05,
"loss": 0.37105526924133303,
"memory(GiB)": 133.05,
"step": 595,
"token_acc": 0.8747415704995677,
"train_speed(iter/s)": 0.068693
},
{
"epoch": 0.702576112412178,
"grad_norm": 0.2169107049703598,
"learning_rate": 1.82069204973014e-05,
"loss": 0.3660942554473877,
"memory(GiB)": 133.05,
"step": 600,
"token_acc": 0.8819129326127438,
"train_speed(iter/s)": 0.068705
},
{
"epoch": 0.7084309133489461,
"grad_norm": 0.22826465964317322,
"learning_rate": 1.816986080385489e-05,
"loss": 0.38544516563415526,
"memory(GiB)": 133.05,
"step": 605,
"token_acc": 0.850805587726625,
"train_speed(iter/s)": 0.068706
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.22054514288902283,
"learning_rate": 1.813246056997465e-05,
"loss": 0.36968698501586916,
"memory(GiB)": 133.05,
"step": 610,
"token_acc": 0.8651287265831155,
"train_speed(iter/s)": 0.068707
},
{
"epoch": 0.7201405152224825,
"grad_norm": 0.2099841833114624,
"learning_rate": 1.809472135459688e-05,
"loss": 0.3711225509643555,
"memory(GiB)": 133.05,
"step": 615,
"token_acc": 0.8681505343933286,
"train_speed(iter/s)": 0.068708
},
{
"epoch": 0.7259953161592506,
"grad_norm": 0.21193836629390717,
"learning_rate": 1.8056644730787412e-05,
"loss": 0.3799697160720825,
"memory(GiB)": 133.05,
"step": 620,
"token_acc": 0.8738008866124044,
"train_speed(iter/s)": 0.068719
},
{
"epoch": 0.7318501170960188,
"grad_norm": 0.21255411207675934,
"learning_rate": 1.8018232285676092e-05,
"loss": 0.3608224391937256,
"memory(GiB)": 133.05,
"step": 625,
"token_acc": 0.8694407077081082,
"train_speed(iter/s)": 0.068731
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.21150043606758118,
"learning_rate": 1.797948562039066e-05,
"loss": 0.3775743246078491,
"memory(GiB)": 133.05,
"step": 630,
"token_acc": 0.8636316861199378,
"train_speed(iter/s)": 0.068723
},
{
"epoch": 0.7435597189695551,
"grad_norm": 0.21777065098285675,
"learning_rate": 1.7940406349989987e-05,
"loss": 0.3736081838607788,
"memory(GiB)": 133.05,
"step": 635,
"token_acc": 0.8663054996457302,
"train_speed(iter/s)": 0.068733
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.20919020473957062,
"learning_rate": 1.7900996103396772e-05,
"loss": 0.36686708927154543,
"memory(GiB)": 133.05,
"step": 640,
"token_acc": 0.8738849498577591,
"train_speed(iter/s)": 0.068733
},
{
"epoch": 0.7552693208430913,
"grad_norm": 0.2190757542848587,
"learning_rate": 1.7861256523329634e-05,
"loss": 0.3648522853851318,
"memory(GiB)": 133.05,
"step": 645,
"token_acc": 0.8633769063180828,
"train_speed(iter/s)": 0.068726
},
{
"epoch": 0.7611241217798594,
"grad_norm": 0.2133089303970337,
"learning_rate": 1.7821189266234647e-05,
"loss": 0.3695883274078369,
"memory(GiB)": 133.05,
"step": 650,
"token_acc": 0.86513161996683,
"train_speed(iter/s)": 0.068724
},
{
"epoch": 0.7669789227166276,
"grad_norm": 0.21529735624790192,
"learning_rate": 1.7780796002216285e-05,
"loss": 0.36347646713256837,
"memory(GiB)": 133.05,
"step": 655,
"token_acc": 0.8618205939317708,
"train_speed(iter/s)": 0.068726
},
{
"epoch": 0.7728337236533958,
"grad_norm": 0.2055824100971222,
"learning_rate": 1.7740078414967817e-05,
"loss": 0.3710654258728027,
"memory(GiB)": 133.05,
"step": 660,
"token_acc": 0.876207411310151,
"train_speed(iter/s)": 0.06873
},
{
"epoch": 0.7786885245901639,
"grad_norm": 0.20337671041488647,
"learning_rate": 1.7699038201701132e-05,
"loss": 0.363714861869812,
"memory(GiB)": 133.05,
"step": 665,
"token_acc": 0.8631361610960301,
"train_speed(iter/s)": 0.068736
},
{
"epoch": 0.7845433255269321,
"grad_norm": 0.2067345827817917,
"learning_rate": 1.7657677073075968e-05,
"loss": 0.3705836296081543,
"memory(GiB)": 133.05,
"step": 670,
"token_acc": 0.8667006816477769,
"train_speed(iter/s)": 0.068738
},
{
"epoch": 0.7903981264637002,
"grad_norm": 0.20614713430404663,
"learning_rate": 1.761599675312864e-05,
"loss": 0.37332298755645754,
"memory(GiB)": 133.05,
"step": 675,
"token_acc": 0.8799103822873227,
"train_speed(iter/s)": 0.068737
},
{
"epoch": 0.7962529274004684,
"grad_norm": 0.21380652487277985,
"learning_rate": 1.7573998979200163e-05,
"loss": 0.36742873191833497,
"memory(GiB)": 133.05,
"step": 680,
"token_acc": 0.8703528431892178,
"train_speed(iter/s)": 0.068746
},
{
"epoch": 0.8021077283372365,
"grad_norm": 0.19453544914722443,
"learning_rate": 1.753168550186383e-05,
"loss": 0.37564864158630373,
"memory(GiB)": 133.05,
"step": 685,
"token_acc": 0.8728679647922801,
"train_speed(iter/s)": 0.068749
},
{
"epoch": 0.8079625292740047,
"grad_norm": 0.19713324308395386,
"learning_rate": 1.7489058084852247e-05,
"loss": 0.37057785987854003,
"memory(GiB)": 133.05,
"step": 690,
"token_acc": 0.8620633488698441,
"train_speed(iter/s)": 0.068747
},
{
"epoch": 0.8138173302107728,
"grad_norm": 0.20321306586265564,
"learning_rate": 1.744611850498383e-05,
"loss": 0.3668221950531006,
"memory(GiB)": 133.05,
"step": 695,
"token_acc": 0.8725824053835161,
"train_speed(iter/s)": 0.068741
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.22502325475215912,
"learning_rate": 1.7402868552088724e-05,
"loss": 0.3616886854171753,
"memory(GiB)": 133.05,
"step": 700,
"token_acc": 0.8672900381533646,
"train_speed(iter/s)": 0.068742
},
{
"epoch": 0.8255269320843092,
"grad_norm": 0.206443652510643,
"learning_rate": 1.73593100289342e-05,
"loss": 0.36960477828979493,
"memory(GiB)": 133.05,
"step": 705,
"token_acc": 0.8645310315863375,
"train_speed(iter/s)": 0.068749
},
{
"epoch": 0.8313817330210773,
"grad_norm": 0.2609001696109772,
"learning_rate": 1.7315444751149533e-05,
"loss": 0.3676512956619263,
"memory(GiB)": 133.05,
"step": 710,
"token_acc": 0.8703732566911265,
"train_speed(iter/s)": 0.068756
},
{
"epoch": 0.8372365339578455,
"grad_norm": 0.20213671028614044,
"learning_rate": 1.727127454715029e-05,
"loss": 0.36738247871398927,
"memory(GiB)": 133.05,
"step": 715,
"token_acc": 0.8776044347530407,
"train_speed(iter/s)": 0.068761
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.2078767567873001,
"learning_rate": 1.722680125806214e-05,
"loss": 0.3677778720855713,
"memory(GiB)": 133.05,
"step": 720,
"token_acc": 0.8627296514081535,
"train_speed(iter/s)": 0.068763
},
{
"epoch": 0.8489461358313818,
"grad_norm": 0.22138644754886627,
"learning_rate": 1.71820267376441e-05,
"loss": 0.37197351455688477,
"memory(GiB)": 133.05,
"step": 725,
"token_acc": 0.8676777818660314,
"train_speed(iter/s)": 0.068766
},
{
"epoch": 0.8548009367681498,
"grad_norm": 0.21397338807582855,
"learning_rate": 1.7136952852211274e-05,
"loss": 0.37579007148742677,
"memory(GiB)": 133.05,
"step": 730,
"token_acc": 0.8572162173097093,
"train_speed(iter/s)": 0.068772
},
{
"epoch": 0.860655737704918,
"grad_norm": 0.20828036963939667,
"learning_rate": 1.7091581480557057e-05,
"loss": 0.3636088132858276,
"memory(GiB)": 133.05,
"step": 735,
"token_acc": 0.8666745722408246,
"train_speed(iter/s)": 0.068774
},
{
"epoch": 0.8665105386416861,
"grad_norm": 0.21285265684127808,
"learning_rate": 1.7045914513874815e-05,
"loss": 0.37646629810333254,
"memory(GiB)": 133.05,
"step": 740,
"token_acc": 0.8666506652036757,
"train_speed(iter/s)": 0.068785
},
{
"epoch": 0.8723653395784543,
"grad_norm": 0.19855837523937225,
"learning_rate": 1.699995385567907e-05,
"loss": 0.37862300872802734,
"memory(GiB)": 133.05,
"step": 745,
"token_acc": 0.8584255151366506,
"train_speed(iter/s)": 0.068799
},
{
"epoch": 0.8782201405152225,
"grad_norm": 0.21356073021888733,
"learning_rate": 1.695370142172614e-05,
"loss": 0.370495080947876,
"memory(GiB)": 133.05,
"step": 750,
"token_acc": 0.8650399529081709,
"train_speed(iter/s)": 0.068798
},
{
"epoch": 0.8840749414519906,
"grad_norm": 0.21858234703540802,
"learning_rate": 1.690715913993429e-05,
"loss": 0.3731105089187622,
"memory(GiB)": 133.05,
"step": 755,
"token_acc": 0.8690419204765525,
"train_speed(iter/s)": 0.068799
},
{
"epoch": 0.8899297423887588,
"grad_norm": 0.21877680718898773,
"learning_rate": 1.6860328950303392e-05,
"loss": 0.3532438039779663,
"memory(GiB)": 133.05,
"step": 760,
"token_acc": 0.8752962281074447,
"train_speed(iter/s)": 0.068803
},
{
"epoch": 0.8957845433255269,
"grad_norm": 0.2116468995809555,
"learning_rate": 1.6813212804834033e-05,
"loss": 0.3690504550933838,
"memory(GiB)": 133.05,
"step": 765,
"token_acc": 0.861989263346257,
"train_speed(iter/s)": 0.068807
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.20343121886253357,
"learning_rate": 1.676581266744615e-05,
"loss": 0.3611701488494873,
"memory(GiB)": 133.05,
"step": 770,
"token_acc": 0.8671105242834544,
"train_speed(iter/s)": 0.06881
},
{
"epoch": 0.9074941451990632,
"grad_norm": 0.19857962429523468,
"learning_rate": 1.6718130513897207e-05,
"loss": 0.3600625038146973,
"memory(GiB)": 133.05,
"step": 775,
"token_acc": 0.8728194751658959,
"train_speed(iter/s)": 0.068813
},
{
"epoch": 0.9133489461358314,
"grad_norm": 0.23387958109378815,
"learning_rate": 1.667016833169979e-05,
"loss": 0.3759610176086426,
"memory(GiB)": 133.05,
"step": 780,
"token_acc": 0.8710922399514741,
"train_speed(iter/s)": 0.068813
},
{
"epoch": 0.9192037470725996,
"grad_norm": 0.2053619623184204,
"learning_rate": 1.6621928120038806e-05,
"loss": 0.36916725635528563,
"memory(GiB)": 133.05,
"step": 785,
"token_acc": 0.8602640020509871,
"train_speed(iter/s)": 0.068813
},
{
"epoch": 0.9250585480093677,
"grad_norm": 0.20847375690937042,
"learning_rate": 1.657341188968811e-05,
"loss": 0.36096744537353515,
"memory(GiB)": 133.05,
"step": 790,
"token_acc": 0.8631381808792282,
"train_speed(iter/s)": 0.068819
},
{
"epoch": 0.9309133489461359,
"grad_norm": 0.20935416221618652,
"learning_rate": 1.6524621662926733e-05,
"loss": 0.3602827310562134,
"memory(GiB)": 133.05,
"step": 795,
"token_acc": 0.8806607875578047,
"train_speed(iter/s)": 0.068825
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.214552640914917,
"learning_rate": 1.6475559473454558e-05,
"loss": 0.369510293006897,
"memory(GiB)": 133.05,
"step": 800,
"token_acc": 0.8770849556632923,
"train_speed(iter/s)": 0.068828
},
{
"epoch": 0.9426229508196722,
"grad_norm": 0.21994450688362122,
"learning_rate": 1.6426227366307563e-05,
"loss": 0.37307014465332033,
"memory(GiB)": 133.05,
"step": 805,
"token_acc": 0.876770090527487,
"train_speed(iter/s)": 0.068823
},
{
"epoch": 0.9484777517564403,
"grad_norm": 0.20645499229431152,
"learning_rate": 1.6376627397772576e-05,
"loss": 0.37114017009735106,
"memory(GiB)": 133.05,
"step": 810,
"token_acc": 0.8619496040676315,
"train_speed(iter/s)": 0.068823
},
{
"epoch": 0.9543325526932084,
"grad_norm": 0.2126459777355194,
"learning_rate": 1.6326761635301572e-05,
"loss": 0.3650930166244507,
"memory(GiB)": 133.05,
"step": 815,
"token_acc": 0.870646124823141,
"train_speed(iter/s)": 0.068826
},
{
"epoch": 0.9601873536299765,
"grad_norm": 0.20105397701263428,
"learning_rate": 1.6276632157425475e-05,
"loss": 0.37223210334777834,
"memory(GiB)": 133.05,
"step": 820,
"token_acc": 0.8648889553764547,
"train_speed(iter/s)": 0.068826
},
{
"epoch": 0.9660421545667447,
"grad_norm": 0.2080501765012741,
"learning_rate": 1.6226241053667536e-05,
"loss": 0.37712783813476564,
"memory(GiB)": 133.05,
"step": 825,
"token_acc": 0.8605132566814988,
"train_speed(iter/s)": 0.06883
},
{
"epoch": 0.9718969555035128,
"grad_norm": 0.2141636610031128,
"learning_rate": 1.617559042445625e-05,
"loss": 0.37673077583312986,
"memory(GiB)": 133.05,
"step": 830,
"token_acc": 0.8719900238096734,
"train_speed(iter/s)": 0.06883
},
{
"epoch": 0.977751756440281,
"grad_norm": 0.21488763391971588,
"learning_rate": 1.6124682381037767e-05,
"loss": 0.3640845537185669,
"memory(GiB)": 133.05,
"step": 835,
"token_acc": 0.8693016352169747,
"train_speed(iter/s)": 0.068834
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.22521890699863434,
"learning_rate": 1.607351904538792e-05,
"loss": 0.3786426782608032,
"memory(GiB)": 133.05,
"step": 840,
"token_acc": 0.86982781737791,
"train_speed(iter/s)": 0.068827
},
{
"epoch": 0.9894613583138173,
"grad_norm": 0.2129945307970047,
"learning_rate": 1.6022102550123775e-05,
"loss": 0.365330171585083,
"memory(GiB)": 133.05,
"step": 845,
"token_acc": 0.864430874708757,
"train_speed(iter/s)": 0.06883
},
{
"epoch": 0.9953161592505855,
"grad_norm": 0.216830313205719,
"learning_rate": 1.597043503841471e-05,
"loss": 0.3653510093688965,
"memory(GiB)": 133.05,
"step": 850,
"token_acc": 0.878798859209881,
"train_speed(iter/s)": 0.068829
},
{
"epoch": 1.0011709601873535,
"grad_norm": 0.2833782732486725,
"learning_rate": 1.5918518663893124e-05,
"loss": 0.35915145874023435,
"memory(GiB)": 133.05,
"step": 855,
"token_acc": 0.873855282676776,
"train_speed(iter/s)": 0.068529
},
{
"epoch": 1.0070257611241218,
"grad_norm": 0.24765369296073914,
"learning_rate": 1.5866355590564637e-05,
"loss": 0.3397256851196289,
"memory(GiB)": 133.05,
"step": 860,
"token_acc": 0.8892689705247213,
"train_speed(iter/s)": 0.068517
},
{
"epoch": 1.0128805620608898,
"grad_norm": 0.2325168401002884,
"learning_rate": 1.5813947992717894e-05,
"loss": 0.327287483215332,
"memory(GiB)": 133.05,
"step": 865,
"token_acc": 0.8796502265193716,
"train_speed(iter/s)": 0.068508
},
{
"epoch": 1.018735362997658,
"grad_norm": 0.2461637407541275,
"learning_rate": 1.5761298054833947e-05,
"loss": 0.3370250701904297,
"memory(GiB)": 133.05,
"step": 870,
"token_acc": 0.8818223536926445,
"train_speed(iter/s)": 0.068498
},
{
"epoch": 1.0245901639344261,
"grad_norm": 0.22223389148712158,
"learning_rate": 1.5708407971495195e-05,
"loss": 0.3431839942932129,
"memory(GiB)": 133.05,
"step": 875,
"token_acc": 0.8771204606261637,
"train_speed(iter/s)": 0.068498
},
{
"epoch": 1.0304449648711944,
"grad_norm": 0.22983962297439575,
"learning_rate": 1.565527994729389e-05,
"loss": 0.333197808265686,
"memory(GiB)": 133.05,
"step": 880,
"token_acc": 0.8869119581976505,
"train_speed(iter/s)": 0.068497
},
{
"epoch": 1.0362997658079625,
"grad_norm": 0.21161960065364838,
"learning_rate": 1.5601916196740283e-05,
"loss": 0.32940354347229006,
"memory(GiB)": 133.05,
"step": 885,
"token_acc": 0.8834938944853924,
"train_speed(iter/s)": 0.068496
},
{
"epoch": 1.0421545667447307,
"grad_norm": 0.22903162240982056,
"learning_rate": 1.5548318944170276e-05,
"loss": 0.3256603956222534,
"memory(GiB)": 133.05,
"step": 890,
"token_acc": 0.8883952211008513,
"train_speed(iter/s)": 0.068494
},
{
"epoch": 1.0480093676814988,
"grad_norm": 0.21301260590553284,
"learning_rate": 1.5494490423652732e-05,
"loss": 0.3253190040588379,
"memory(GiB)": 133.05,
"step": 895,
"token_acc": 0.8813899275623074,
"train_speed(iter/s)": 0.068483
},
{
"epoch": 1.053864168618267,
"grad_norm": 0.2047208845615387,
"learning_rate": 1.544043287889635e-05,
"loss": 0.31666491031646726,
"memory(GiB)": 133.05,
"step": 900,
"token_acc": 0.8909019236833806,
"train_speed(iter/s)": 0.068481
},
{
"epoch": 1.059718969555035,
"grad_norm": 0.23390096426010132,
"learning_rate": 1.538614856315614e-05,
"loss": 0.330989408493042,
"memory(GiB)": 133.05,
"step": 905,
"token_acc": 0.8884555161039297,
"train_speed(iter/s)": 0.068482
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.20488137006759644,
"learning_rate": 1.5331639739139477e-05,
"loss": 0.3256430149078369,
"memory(GiB)": 133.05,
"step": 910,
"token_acc": 0.8721090848001792,
"train_speed(iter/s)": 0.068473
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.21736453473567963,
"learning_rate": 1.5276908678911837e-05,
"loss": 0.3228193521499634,
"memory(GiB)": 133.05,
"step": 915,
"token_acc": 0.8874282476871164,
"train_speed(iter/s)": 0.068469
},
{
"epoch": 1.0772833723653397,
"grad_norm": 0.206723153591156,
"learning_rate": 1.5221957663802043e-05,
"loss": 0.3333425521850586,
"memory(GiB)": 133.17,
"step": 920,
"token_acc": 0.886105330059943,
"train_speed(iter/s)": 0.068454
},
{
"epoch": 1.0831381733021077,
"grad_norm": 0.203144371509552,
"learning_rate": 1.5166788984307204e-05,
"loss": 0.33838639259338377,
"memory(GiB)": 133.17,
"step": 925,
"token_acc": 0.8802329092899476,
"train_speed(iter/s)": 0.068444
},
{
"epoch": 1.088992974238876,
"grad_norm": 0.24915394186973572,
"learning_rate": 1.5111404939997227e-05,
"loss": 0.33564419746398927,
"memory(GiB)": 133.17,
"step": 930,
"token_acc": 0.8793440099130728,
"train_speed(iter/s)": 0.068442
},
{
"epoch": 1.094847775175644,
"grad_norm": 0.2503604292869568,
"learning_rate": 1.5055807839418966e-05,
"loss": 0.3157151460647583,
"memory(GiB)": 133.17,
"step": 935,
"token_acc": 0.8862683405108546,
"train_speed(iter/s)": 0.068436
},
{
"epoch": 1.100702576112412,
"grad_norm": 0.20239044725894928,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.3377982139587402,
"memory(GiB)": 133.17,
"step": 940,
"token_acc": 0.8886980901742478,
"train_speed(iter/s)": 0.068436
},
{
"epoch": 1.1065573770491803,
"grad_norm": 0.20267418026924133,
"learning_rate": 1.494398374795204e-05,
"loss": 0.3253162145614624,
"memory(GiB)": 133.17,
"step": 945,
"token_acc": 0.8780125495417973,
"train_speed(iter/s)": 0.068437
},
{
"epoch": 1.1124121779859484,
"grad_norm": 0.2210346758365631,
"learning_rate": 1.4887761418173947e-05,
"loss": 0.3438437461853027,
"memory(GiB)": 133.17,
"step": 950,
"token_acc": 0.8874266802316089,
"train_speed(iter/s)": 0.068436
},
{
"epoch": 1.1182669789227166,
"grad_norm": 0.206399604678154,
"learning_rate": 1.4831335354154444e-05,
"loss": 0.3289347648620605,
"memory(GiB)": 133.17,
"step": 955,
"token_acc": 0.8831363419858116,
"train_speed(iter/s)": 0.068436
},
{
"epoch": 1.1241217798594847,
"grad_norm": 0.21163643896579742,
"learning_rate": 1.4774707907874392e-05,
"loss": 0.32750353813171384,
"memory(GiB)": 133.17,
"step": 960,
"token_acc": 0.8880904228882937,
"train_speed(iter/s)": 0.068427
},
{
"epoch": 1.129976580796253,
"grad_norm": 0.20707455277442932,
"learning_rate": 1.4717881439708786e-05,
"loss": 0.3284764289855957,
"memory(GiB)": 133.17,
"step": 965,
"token_acc": 0.8722379691636817,
"train_speed(iter/s)": 0.068425
},
{
"epoch": 1.135831381733021,
"grad_norm": 0.2046642154455185,
"learning_rate": 1.4660858318328348e-05,
"loss": 0.3317260265350342,
"memory(GiB)": 133.17,
"step": 970,
"token_acc": 0.8710549063749603,
"train_speed(iter/s)": 0.068419
},
{
"epoch": 1.1416861826697893,
"grad_norm": 0.20032472908496857,
"learning_rate": 1.4603640920600813e-05,
"loss": 0.33744547367095945,
"memory(GiB)": 133.17,
"step": 975,
"token_acc": 0.8676646558084457,
"train_speed(iter/s)": 0.068416
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.20992988348007202,
"learning_rate": 1.4546231631491827e-05,
"loss": 0.3331944704055786,
"memory(GiB)": 133.17,
"step": 980,
"token_acc": 0.8770167266237555,
"train_speed(iter/s)": 0.068406
},
{
"epoch": 1.1533957845433256,
"grad_norm": 0.2045455127954483,
"learning_rate": 1.4488632843965573e-05,
"loss": 0.32609896659851073,
"memory(GiB)": 133.17,
"step": 985,
"token_acc": 0.8671518193224592,
"train_speed(iter/s)": 0.068401
},
{
"epoch": 1.1592505854800936,
"grad_norm": 0.21106521785259247,
"learning_rate": 1.4430846958884995e-05,
"loss": 0.3347620010375977,
"memory(GiB)": 133.17,
"step": 990,
"token_acc": 0.8760981150071534,
"train_speed(iter/s)": 0.068396
},
{
"epoch": 1.165105386416862,
"grad_norm": 0.2021251767873764,
"learning_rate": 1.4372876384911741e-05,
"loss": 0.33538064956665037,
"memory(GiB)": 133.17,
"step": 995,
"token_acc": 0.8768674285536101,
"train_speed(iter/s)": 0.068392
},
{
"epoch": 1.17096018735363,
"grad_norm": 0.22672772407531738,
"learning_rate": 1.4314723538405752e-05,
"loss": 0.3422734260559082,
"memory(GiB)": 133.17,
"step": 1000,
"token_acc": 0.8671538988967151,
"train_speed(iter/s)": 0.068389
},
{
"epoch": 1.1768149882903982,
"grad_norm": 0.2139746993780136,
"learning_rate": 1.4256390843324556e-05,
"loss": 0.3371597766876221,
"memory(GiB)": 133.17,
"step": 1005,
"token_acc": 0.8732182530767119,
"train_speed(iter/s)": 0.068388
},
{
"epoch": 1.1826697892271663,
"grad_norm": 0.21347731351852417,
"learning_rate": 1.4197880731122221e-05,
"loss": 0.3339057922363281,
"memory(GiB)": 133.17,
"step": 1010,
"token_acc": 0.8729292778317514,
"train_speed(iter/s)": 0.06839
},
{
"epoch": 1.1885245901639343,
"grad_norm": 0.21436652541160583,
"learning_rate": 1.4139195640648008e-05,
"loss": 0.3371711730957031,
"memory(GiB)": 133.17,
"step": 1015,
"token_acc": 0.8857815368682034,
"train_speed(iter/s)": 0.068385
},
{
"epoch": 1.1943793911007026,
"grad_norm": 0.21145156025886536,
"learning_rate": 1.4080338018044712e-05,
"loss": 0.3415823459625244,
"memory(GiB)": 133.17,
"step": 1020,
"token_acc": 0.8745781005321704,
"train_speed(iter/s)": 0.068382
},
{
"epoch": 1.2002341920374708,
"grad_norm": 0.2704923748970032,
"learning_rate": 1.4021310316646708e-05,
"loss": 0.33098018169403076,
"memory(GiB)": 133.17,
"step": 1025,
"token_acc": 0.8810291608110821,
"train_speed(iter/s)": 0.06838
},
{
"epoch": 1.2060889929742389,
"grad_norm": 0.20703041553497314,
"learning_rate": 1.3962114996877685e-05,
"loss": 0.3177175045013428,
"memory(GiB)": 133.17,
"step": 1030,
"token_acc": 0.8884392410781509,
"train_speed(iter/s)": 0.068376
},
{
"epoch": 1.211943793911007,
"grad_norm": 0.20425967872142792,
"learning_rate": 1.390275452614808e-05,
"loss": 0.3208155155181885,
"memory(GiB)": 133.17,
"step": 1035,
"token_acc": 0.8798795706976164,
"train_speed(iter/s)": 0.068375
},
{
"epoch": 1.2177985948477752,
"grad_norm": 0.2199791669845581,
"learning_rate": 1.3843231378752252e-05,
"loss": 0.32726430892944336,
"memory(GiB)": 133.17,
"step": 1040,
"token_acc": 0.8785451315143307,
"train_speed(iter/s)": 0.068367
},
{
"epoch": 1.2236533957845432,
"grad_norm": 0.22237712144851685,
"learning_rate": 1.3783548035765327e-05,
"loss": 0.33181195259094237,
"memory(GiB)": 133.17,
"step": 1045,
"token_acc": 0.8834801207851032,
"train_speed(iter/s)": 0.068368
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.20910513401031494,
"learning_rate": 1.3723706984939783e-05,
"loss": 0.3189753532409668,
"memory(GiB)": 133.17,
"step": 1050,
"token_acc": 0.8769508605389209,
"train_speed(iter/s)": 0.068364
},
{
"epoch": 1.2353629976580796,
"grad_norm": 0.20491260290145874,
"learning_rate": 1.366371072060177e-05,
"loss": 0.33074491024017333,
"memory(GiB)": 133.17,
"step": 1055,
"token_acc": 0.8681569771445384,
"train_speed(iter/s)": 0.068361
},
{
"epoch": 1.2412177985948478,
"grad_norm": 0.1918231099843979,
"learning_rate": 1.3603561743547125e-05,
"loss": 0.3256643772125244,
"memory(GiB)": 133.17,
"step": 1060,
"token_acc": 0.8732954670333983,
"train_speed(iter/s)": 0.068363
},
{
"epoch": 1.2470725995316159,
"grad_norm": 0.21773004531860352,
"learning_rate": 1.3543262560937135e-05,
"loss": 0.33045885562896726,
"memory(GiB)": 133.17,
"step": 1065,
"token_acc": 0.8785313558157261,
"train_speed(iter/s)": 0.068363
},
{
"epoch": 1.2529274004683841,
"grad_norm": 0.21782302856445312,
"learning_rate": 1.3482815686194033e-05,
"loss": 0.3164831161499023,
"memory(GiB)": 133.17,
"step": 1070,
"token_acc": 0.8841838807462733,
"train_speed(iter/s)": 0.068363
},
{
"epoch": 1.2587822014051522,
"grad_norm": 0.21324488520622253,
"learning_rate": 1.3422223638896235e-05,
"loss": 0.32593531608581544,
"memory(GiB)": 133.17,
"step": 1075,
"token_acc": 0.8798167525312546,
"train_speed(iter/s)": 0.068363
},
{
"epoch": 1.2646370023419204,
"grad_norm": 0.22865289449691772,
"learning_rate": 1.3361488944673315e-05,
"loss": 0.3352835178375244,
"memory(GiB)": 133.17,
"step": 1080,
"token_acc": 0.8729886330661392,
"train_speed(iter/s)": 0.068362
},
{
"epoch": 1.2704918032786885,
"grad_norm": 0.20328956842422485,
"learning_rate": 1.3300614135100736e-05,
"loss": 0.332173490524292,
"memory(GiB)": 133.17,
"step": 1085,
"token_acc": 0.8806762689525037,
"train_speed(iter/s)": 0.068357
},
{
"epoch": 1.2763466042154565,
"grad_norm": 0.19926570355892181,
"learning_rate": 1.3239601747594319e-05,
"loss": 0.331054162979126,
"memory(GiB)": 133.17,
"step": 1090,
"token_acc": 0.8812650906933006,
"train_speed(iter/s)": 0.068351
},
{
"epoch": 1.2822014051522248,
"grad_norm": 0.19676311314105988,
"learning_rate": 1.3178454325304472e-05,
"loss": 0.33361315727233887,
"memory(GiB)": 133.17,
"step": 1095,
"token_acc": 0.8700881415265362,
"train_speed(iter/s)": 0.068351
},
{
"epoch": 1.288056206088993,
"grad_norm": 0.20788326859474182,
"learning_rate": 1.3117174417010213e-05,
"loss": 0.31841249465942384,
"memory(GiB)": 133.17,
"step": 1100,
"token_acc": 0.8749374970517477,
"train_speed(iter/s)": 0.06835
},
{
"epoch": 1.2939110070257611,
"grad_norm": 0.21633991599082947,
"learning_rate": 1.3055764577012892e-05,
"loss": 0.34844322204589845,
"memory(GiB)": 133.17,
"step": 1105,
"token_acc": 0.8857762459338606,
"train_speed(iter/s)": 0.068351
},
{
"epoch": 1.2997658079625292,
"grad_norm": 0.2159479707479477,
"learning_rate": 1.2994227365029752e-05,
"loss": 0.32929096221923826,
"memory(GiB)": 133.17,
"step": 1110,
"token_acc": 0.8831624401350396,
"train_speed(iter/s)": 0.06835
},
{
"epoch": 1.3056206088992974,
"grad_norm": 0.21510519087314606,
"learning_rate": 1.2932565346087218e-05,
"loss": 0.33609514236450194,
"memory(GiB)": 133.17,
"step": 1115,
"token_acc": 0.8789613142554319,
"train_speed(iter/s)": 0.068346
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.19823956489562988,
"learning_rate": 1.2870781090413991e-05,
"loss": 0.3340220212936401,
"memory(GiB)": 133.17,
"step": 1120,
"token_acc": 0.8802133820301311,
"train_speed(iter/s)": 0.068343
},
{
"epoch": 1.3173302107728337,
"grad_norm": 0.19969677925109863,
"learning_rate": 1.2808877173333896e-05,
"loss": 0.32896521091461184,
"memory(GiB)": 133.17,
"step": 1125,
"token_acc": 0.8884312591176619,
"train_speed(iter/s)": 0.068342
},
{
"epoch": 1.3231850117096018,
"grad_norm": 0.19414611160755157,
"learning_rate": 1.2746856175158556e-05,
"loss": 0.33699817657470704,
"memory(GiB)": 133.17,
"step": 1130,
"token_acc": 0.8808933080116763,
"train_speed(iter/s)": 0.068346
},
{
"epoch": 1.32903981264637,
"grad_norm": 0.20659878849983215,
"learning_rate": 1.2684720681079825e-05,
"loss": 0.33256163597106936,
"memory(GiB)": 133.17,
"step": 1135,
"token_acc": 0.8659905808672699,
"train_speed(iter/s)": 0.068345
},
{
"epoch": 1.334894613583138,
"grad_norm": 0.21766500174999237,
"learning_rate": 1.2622473281062042e-05,
"loss": 0.3360875129699707,
"memory(GiB)": 133.17,
"step": 1140,
"token_acc": 0.8805351128851191,
"train_speed(iter/s)": 0.068346
},
{
"epoch": 1.3407494145199064,
"grad_norm": 0.21836382150650024,
"learning_rate": 1.256011656973406e-05,
"loss": 0.3428370952606201,
"memory(GiB)": 133.17,
"step": 1145,
"token_acc": 0.882268280446507,
"train_speed(iter/s)": 0.068346
},
{
"epoch": 1.3466042154566744,
"grad_norm": 0.21305552124977112,
"learning_rate": 1.2497653146281113e-05,
"loss": 0.3323945999145508,
"memory(GiB)": 133.17,
"step": 1150,
"token_acc": 0.8799263041729795,
"train_speed(iter/s)": 0.06834
},
{
"epoch": 1.3524590163934427,
"grad_norm": 0.2115429788827896,
"learning_rate": 1.2435085614336459e-05,
"loss": 0.33839111328125,
"memory(GiB)": 133.17,
"step": 1155,
"token_acc": 0.8877846609149278,
"train_speed(iter/s)": 0.068336
},
{
"epoch": 1.3583138173302107,
"grad_norm": 0.20214448869228363,
"learning_rate": 1.2372416581872857e-05,
"loss": 0.3267178773880005,
"memory(GiB)": 133.17,
"step": 1160,
"token_acc": 0.8858182364221651,
"train_speed(iter/s)": 0.068334
},
{
"epoch": 1.364168618266979,
"grad_norm": 0.19922491908073425,
"learning_rate": 1.2309648661093878e-05,
"loss": 0.33157687187194823,
"memory(GiB)": 133.17,
"step": 1165,
"token_acc": 0.8862414604099004,
"train_speed(iter/s)": 0.068333
},
{
"epoch": 1.370023419203747,
"grad_norm": 0.20893344283103943,
"learning_rate": 1.2246784468324993e-05,
"loss": 0.3382421016693115,
"memory(GiB)": 133.17,
"step": 1170,
"token_acc": 0.8688796266876001,
"train_speed(iter/s)": 0.068327
},
{
"epoch": 1.3758782201405153,
"grad_norm": 0.219789519906044,
"learning_rate": 1.218382662390454e-05,
"loss": 0.3261989116668701,
"memory(GiB)": 133.17,
"step": 1175,
"token_acc": 0.8682563507122426,
"train_speed(iter/s)": 0.068328
},
{
"epoch": 1.3817330210772834,
"grad_norm": 0.2007785141468048,
"learning_rate": 1.2120777752074492e-05,
"loss": 0.33451414108276367,
"memory(GiB)": 133.17,
"step": 1180,
"token_acc": 0.8779171167786075,
"train_speed(iter/s)": 0.068325
},
{
"epoch": 1.3875878220140514,
"grad_norm": 0.20650921761989594,
"learning_rate": 1.2057640480871084e-05,
"loss": 0.33679168224334716,
"memory(GiB)": 133.17,
"step": 1185,
"token_acc": 0.8786453140578265,
"train_speed(iter/s)": 0.068325
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.20114493370056152,
"learning_rate": 1.1994417442015243e-05,
"loss": 0.33562412261962893,
"memory(GiB)": 133.17,
"step": 1190,
"token_acc": 0.8844727744979327,
"train_speed(iter/s)": 0.068325
},
{
"epoch": 1.399297423887588,
"grad_norm": 0.19498831033706665,
"learning_rate": 1.193111127080292e-05,
"loss": 0.3253043174743652,
"memory(GiB)": 133.17,
"step": 1195,
"token_acc": 0.8870853046866852,
"train_speed(iter/s)": 0.068326
},
{
"epoch": 1.405152224824356,
"grad_norm": 0.1827043890953064,
"learning_rate": 1.186772460599523e-05,
"loss": 0.3244746685028076,
"memory(GiB)": 133.17,
"step": 1200,
"token_acc": 0.8863151296717072,
"train_speed(iter/s)": 0.068323
},
{
"epoch": 1.411007025761124,
"grad_norm": 0.21396119892597198,
"learning_rate": 1.1804260089708464e-05,
"loss": 0.3355713367462158,
"memory(GiB)": 133.17,
"step": 1205,
"token_acc": 0.8714279485774079,
"train_speed(iter/s)": 0.068317
},
{
"epoch": 1.4168618266978923,
"grad_norm": 0.20849740505218506,
"learning_rate": 1.1740720367303958e-05,
"loss": 0.3293231725692749,
"memory(GiB)": 133.17,
"step": 1210,
"token_acc": 0.8799478293040041,
"train_speed(iter/s)": 0.068316
},
{
"epoch": 1.4227166276346606,
"grad_norm": 0.19985808432102203,
"learning_rate": 1.1677108087277835e-05,
"loss": 0.33586926460266114,
"memory(GiB)": 133.17,
"step": 1215,
"token_acc": 0.8803578911815663,
"train_speed(iter/s)": 0.068314
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.2120925784111023,
"learning_rate": 1.1613425901150595e-05,
"loss": 0.335320782661438,
"memory(GiB)": 133.17,
"step": 1220,
"token_acc": 0.8822237863291518,
"train_speed(iter/s)": 0.068311
},
{
"epoch": 1.4344262295081966,
"grad_norm": 0.20144475996494293,
"learning_rate": 1.15496764633566e-05,
"loss": 0.34459710121154785,
"memory(GiB)": 133.17,
"step": 1225,
"token_acc": 0.8714527101578114,
"train_speed(iter/s)": 0.068308
},
{
"epoch": 1.440281030444965,
"grad_norm": 0.1978883147239685,
"learning_rate": 1.1485862431133445e-05,
"loss": 0.334246826171875,
"memory(GiB)": 133.17,
"step": 1230,
"token_acc": 0.8850997230525071,
"train_speed(iter/s)": 0.068306
},
{
"epoch": 1.446135831381733,
"grad_norm": 0.20052959024906158,
"learning_rate": 1.1421986464411169e-05,
"loss": 0.33509197235107424,
"memory(GiB)": 133.17,
"step": 1235,
"token_acc": 0.8704968021392047,
"train_speed(iter/s)": 0.068299
},
{
"epoch": 1.4519906323185012,
"grad_norm": 0.19154983758926392,
"learning_rate": 1.1358051225701404e-05,
"loss": 0.32514162063598634,
"memory(GiB)": 133.17,
"step": 1240,
"token_acc": 0.8735484752584716,
"train_speed(iter/s)": 0.068296
},
{
"epoch": 1.4578454332552693,
"grad_norm": 0.20475593209266663,
"learning_rate": 1.1294059379986384e-05,
"loss": 0.33394522666931153,
"memory(GiB)": 133.17,
"step": 1245,
"token_acc": 0.8737611977698427,
"train_speed(iter/s)": 0.068289
},
{
"epoch": 1.4637002341920375,
"grad_norm": 0.20034635066986084,
"learning_rate": 1.1230013594607874e-05,
"loss": 0.33555524349212645,
"memory(GiB)": 133.17,
"step": 1250,
"token_acc": 0.8783339011605555,
"train_speed(iter/s)": 0.068288
},
{
"epoch": 1.4695550351288056,
"grad_norm": 0.206059530377388,
"learning_rate": 1.1165916539155968e-05,
"loss": 0.33289051055908203,
"memory(GiB)": 133.17,
"step": 1255,
"token_acc": 0.8821623108149916,
"train_speed(iter/s)": 0.068291
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.1955031454563141,
"learning_rate": 1.1101770885357843e-05,
"loss": 0.3284996509552002,
"memory(GiB)": 133.17,
"step": 1260,
"token_acc": 0.8824508468283658,
"train_speed(iter/s)": 0.068292
},
{
"epoch": 1.481264637002342,
"grad_norm": 0.18819548189640045,
"learning_rate": 1.1037579306966365e-05,
"loss": 0.32820711135864256,
"memory(GiB)": 133.17,
"step": 1265,
"token_acc": 0.8902697768320305,
"train_speed(iter/s)": 0.068288
},
{
"epoch": 1.4871194379391102,
"grad_norm": 0.20186524093151093,
"learning_rate": 1.0973344479648652e-05,
"loss": 0.3230982065200806,
"memory(GiB)": 133.17,
"step": 1270,
"token_acc": 0.8823814255348585,
"train_speed(iter/s)": 0.068286
},
{
"epoch": 1.4929742388758782,
"grad_norm": 0.19547297060489655,
"learning_rate": 1.0909069080874556e-05,
"loss": 0.3249845027923584,
"memory(GiB)": 133.17,
"step": 1275,
"token_acc": 0.8751095158692027,
"train_speed(iter/s)": 0.068285
},
{
"epoch": 1.4988290398126463,
"grad_norm": 0.21490275859832764,
"learning_rate": 1.0844755789805042e-05,
"loss": 0.3330803394317627,
"memory(GiB)": 133.17,
"step": 1280,
"token_acc": 0.8725501507719461,
"train_speed(iter/s)": 0.068283
},
{
"epoch": 1.5046838407494145,
"grad_norm": 0.21036967635154724,
"learning_rate": 1.0780407287180526e-05,
"loss": 0.33710570335388185,
"memory(GiB)": 133.17,
"step": 1285,
"token_acc": 0.8735995618184534,
"train_speed(iter/s)": 0.068276
},
{
"epoch": 1.5105386416861828,
"grad_norm": 0.21496160328388214,
"learning_rate": 1.0716026255209124e-05,
"loss": 0.3322149276733398,
"memory(GiB)": 133.17,
"step": 1290,
"token_acc": 0.8727818581461427,
"train_speed(iter/s)": 0.068276
},
{
"epoch": 1.5163934426229508,
"grad_norm": 0.19405636191368103,
"learning_rate": 1.0651615377454872e-05,
"loss": 0.33303227424621584,
"memory(GiB)": 133.17,
"step": 1295,
"token_acc": 0.8809517074473936,
"train_speed(iter/s)": 0.068274
},
{
"epoch": 1.5222482435597189,
"grad_norm": 0.20200887322425842,
"learning_rate": 1.0587177338725834e-05,
"loss": 0.3389185905456543,
"memory(GiB)": 133.17,
"step": 1300,
"token_acc": 0.8810081420102018,
"train_speed(iter/s)": 0.068274
},
{
"epoch": 1.5281030444964872,
"grad_norm": 0.19218453764915466,
"learning_rate": 1.0522714824962228e-05,
"loss": 0.32448182106018064,
"memory(GiB)": 133.17,
"step": 1305,
"token_acc": 0.8922085069580942,
"train_speed(iter/s)": 0.068274
},
{
"epoch": 1.5339578454332554,
"grad_norm": 0.2063508927822113,
"learning_rate": 1.0458230523124443e-05,
"loss": 0.3380331039428711,
"memory(GiB)": 133.17,
"step": 1310,
"token_acc": 0.8834363870742206,
"train_speed(iter/s)": 0.06827
},
{
"epoch": 1.5398126463700235,
"grad_norm": 0.20604784786701202,
"learning_rate": 1.0393727121081057e-05,
"loss": 0.33421056270599364,
"memory(GiB)": 133.17,
"step": 1315,
"token_acc": 0.8805816011032537,
"train_speed(iter/s)": 0.068273
},
{
"epoch": 1.5456674473067915,
"grad_norm": 0.1895345002412796,
"learning_rate": 1.0329207307496785e-05,
"loss": 0.3230136394500732,
"memory(GiB)": 133.17,
"step": 1320,
"token_acc": 0.8821661202321777,
"train_speed(iter/s)": 0.068273
},
{
"epoch": 1.5515222482435598,
"grad_norm": 0.20009098947048187,
"learning_rate": 1.0264673771720429e-05,
"loss": 0.331970739364624,
"memory(GiB)": 133.17,
"step": 1325,
"token_acc": 0.8856471632036539,
"train_speed(iter/s)": 0.06827
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.19756639003753662,
"learning_rate": 1.0200129203672754e-05,
"loss": 0.33203625679016113,
"memory(GiB)": 133.17,
"step": 1330,
"token_acc": 0.8719384623094173,
"train_speed(iter/s)": 0.068266
},
{
"epoch": 1.5632318501170959,
"grad_norm": 0.20041348040103912,
"learning_rate": 1.0135576293734381e-05,
"loss": 0.3236687660217285,
"memory(GiB)": 133.17,
"step": 1335,
"token_acc": 0.8890052192879956,
"train_speed(iter/s)": 0.068267
},
{
"epoch": 1.5690866510538641,
"grad_norm": 0.2091531604528427,
"learning_rate": 1.007101773263365e-05,
"loss": 0.3356754302978516,
"memory(GiB)": 133.17,
"step": 1340,
"token_acc": 0.881420303456906,
"train_speed(iter/s)": 0.068267
},
{
"epoch": 1.5749414519906324,
"grad_norm": 0.18961018323898315,
"learning_rate": 1.0006456211334445e-05,
"loss": 0.32959842681884766,
"memory(GiB)": 133.17,
"step": 1345,
"token_acc": 0.881056978636539,
"train_speed(iter/s)": 0.068269
},
{
"epoch": 1.5807962529274004,
"grad_norm": 0.18674606084823608,
"learning_rate": 9.941894420924044e-06,
"loss": 0.3274309396743774,
"memory(GiB)": 133.17,
"step": 1350,
"token_acc": 0.8911319303466276,
"train_speed(iter/s)": 0.068266
},
{
"epoch": 1.5866510538641685,
"grad_norm": 0.19703362882137299,
"learning_rate": 9.87733505250094e-06,
"loss": 0.33193011283874513,
"memory(GiB)": 133.17,
"step": 1355,
"token_acc": 0.880100249375002,
"train_speed(iter/s)": 0.068262
},
{
"epoch": 1.5925058548009368,
"grad_norm": 0.1925787329673767,
"learning_rate": 9.812780797062678e-06,
"loss": 0.328415060043335,
"memory(GiB)": 133.17,
"step": 1360,
"token_acc": 0.8847896196463753,
"train_speed(iter/s)": 0.068258
},
{
"epoch": 1.598360655737705,
"grad_norm": 0.19211165606975555,
"learning_rate": 9.748234345393672e-06,
"loss": 0.32412943840026853,
"memory(GiB)": 133.17,
"step": 1365,
"token_acc": 0.8819075272921836,
"train_speed(iter/s)": 0.068253
},
{
"epoch": 1.604215456674473,
"grad_norm": 0.19750450551509857,
"learning_rate": 9.68369838795306e-06,
"loss": 0.33218812942504883,
"memory(GiB)": 133.17,
"step": 1370,
"token_acc": 0.8781786390424615,
"train_speed(iter/s)": 0.068246
},
{
"epoch": 1.6100702576112411,
"grad_norm": 0.19090089201927185,
"learning_rate": 9.61917561476255e-06,
"loss": 0.3252577781677246,
"memory(GiB)": 133.17,
"step": 1375,
"token_acc": 0.8718890721275258,
"train_speed(iter/s)": 0.068245
},
{
"epoch": 1.6159250585480094,
"grad_norm": 0.2007261961698532,
"learning_rate": 9.554668715294305e-06,
"loss": 0.3365320205688477,
"memory(GiB)": 133.17,
"step": 1380,
"token_acc": 0.8808937423036773,
"train_speed(iter/s)": 0.068246
},
{
"epoch": 1.6217798594847777,
"grad_norm": 0.20129120349884033,
"learning_rate": 9.490180378358826e-06,
"loss": 0.33901381492614746,
"memory(GiB)": 133.17,
"step": 1385,
"token_acc": 0.8765135837259478,
"train_speed(iter/s)": 0.068245
},
{
"epoch": 1.6276346604215457,
"grad_norm": 0.18519413471221924,
"learning_rate": 9.425713291992878e-06,
"loss": 0.32805542945861815,
"memory(GiB)": 133.17,
"step": 1390,
"token_acc": 0.8837560234916173,
"train_speed(iter/s)": 0.068243
},
{
"epoch": 1.6334894613583137,
"grad_norm": 0.19597233831882477,
"learning_rate": 9.361270143347452e-06,
"loss": 0.3414484977722168,
"memory(GiB)": 133.17,
"step": 1395,
"token_acc": 0.8769078651119291,
"train_speed(iter/s)": 0.068243
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.17986047267913818,
"learning_rate": 9.296853618575753e-06,
"loss": 0.32855379581451416,
"memory(GiB)": 133.17,
"step": 1400,
"token_acc": 0.8869200388717233,
"train_speed(iter/s)": 0.068244
},
{
"epoch": 1.6451990632318503,
"grad_norm": 0.2232111245393753,
"learning_rate": 9.232466402721241e-06,
"loss": 0.33907437324523926,
"memory(GiB)": 133.17,
"step": 1405,
"token_acc": 0.8805843000676505,
"train_speed(iter/s)": 0.068245
},
{
"epoch": 1.651053864168618,
"grad_norm": 0.19428326189517975,
"learning_rate": 9.1681111796057e-06,
"loss": 0.3294277906417847,
"memory(GiB)": 133.17,
"step": 1410,
"token_acc": 0.8820219796725579,
"train_speed(iter/s)": 0.068247
},
{
"epoch": 1.6569086651053864,
"grad_norm": 0.205523282289505,
"learning_rate": 9.103790631717375e-06,
"loss": 0.34450831413269045,
"memory(GiB)": 133.17,
"step": 1415,
"token_acc": 0.8722953184421034,
"train_speed(iter/s)": 0.068245
},
{
"epoch": 1.6627634660421546,
"grad_norm": 0.1955317109823227,
"learning_rate": 9.039507440099164e-06,
"loss": 0.32976531982421875,
"memory(GiB)": 133.17,
"step": 1420,
"token_acc": 0.887285426963314,
"train_speed(iter/s)": 0.068244
},
{
"epoch": 1.6686182669789227,
"grad_norm": 0.1974899172782898,
"learning_rate": 8.975264284236866e-06,
"loss": 0.33209028244018557,
"memory(GiB)": 133.17,
"step": 1425,
"token_acc": 0.8826060927102499,
"train_speed(iter/s)": 0.068246
},
{
"epoch": 1.6744730679156907,
"grad_norm": 0.20223510265350342,
"learning_rate": 8.911063841947476e-06,
"loss": 0.33354964256286623,
"memory(GiB)": 133.17,
"step": 1430,
"token_acc": 0.8795497702238948,
"train_speed(iter/s)": 0.068244
},
{
"epoch": 1.680327868852459,
"grad_norm": 0.19802114367485046,
"learning_rate": 8.846908789267589e-06,
"loss": 0.33350410461425783,
"memory(GiB)": 133.17,
"step": 1435,
"token_acc": 0.8820895522388059,
"train_speed(iter/s)": 0.068246
},
{
"epoch": 1.6861826697892273,
"grad_norm": 0.19948238134384155,
"learning_rate": 8.78280180034184e-06,
"loss": 0.3242588758468628,
"memory(GiB)": 133.17,
"step": 1440,
"token_acc": 0.8763353704232109,
"train_speed(iter/s)": 0.068246
},
{
"epoch": 1.6920374707259953,
"grad_norm": 0.19532591104507446,
"learning_rate": 8.718745547311458e-06,
"loss": 0.3360363721847534,
"memory(GiB)": 133.17,
"step": 1445,
"token_acc": 0.8764055183683731,
"train_speed(iter/s)": 0.068241
},
{
"epoch": 1.6978922716627634,
"grad_norm": 0.20000973343849182,
"learning_rate": 8.654742700202849e-06,
"loss": 0.33543264865875244,
"memory(GiB)": 133.17,
"step": 1450,
"token_acc": 0.8791397393130521,
"train_speed(iter/s)": 0.06824
},
{
"epoch": 1.7037470725995316,
"grad_norm": 0.193691685795784,
"learning_rate": 8.590795926816348e-06,
"loss": 0.32405283451080324,
"memory(GiB)": 133.17,
"step": 1455,
"token_acc": 0.8792053838888559,
"train_speed(iter/s)": 0.068239
},
{
"epoch": 1.7096018735362999,
"grad_norm": 0.18100841343402863,
"learning_rate": 8.526907892614986e-06,
"loss": 0.32940475940704345,
"memory(GiB)": 133.17,
"step": 1460,
"token_acc": 0.8829538372890485,
"train_speed(iter/s)": 0.068234
},
{
"epoch": 1.715456674473068,
"grad_norm": 0.2313033789396286,
"learning_rate": 8.463081260613391e-06,
"loss": 0.3310007810592651,
"memory(GiB)": 133.17,
"step": 1465,
"token_acc": 0.8884524843192141,
"train_speed(iter/s)": 0.068231
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.19678162038326263,
"learning_rate": 8.399318691266806e-06,
"loss": 0.3346008062362671,
"memory(GiB)": 133.17,
"step": 1470,
"token_acc": 0.8785229138209752,
"train_speed(iter/s)": 0.068229
},
{
"epoch": 1.7271662763466042,
"grad_norm": 0.20874732732772827,
"learning_rate": 8.335622842360168e-06,
"loss": 0.3276866674423218,
"memory(GiB)": 133.17,
"step": 1475,
"token_acc": 0.8830160906179125,
"train_speed(iter/s)": 0.068224
},
{
"epoch": 1.7330210772833725,
"grad_norm": 0.20175132155418396,
"learning_rate": 8.271996368897345e-06,
"loss": 0.33496603965759275,
"memory(GiB)": 133.17,
"step": 1480,
"token_acc": 0.8852224356801145,
"train_speed(iter/s)": 0.06822
},
{
"epoch": 1.7388758782201406,
"grad_norm": 0.19031141698360443,
"learning_rate": 8.208441922990454e-06,
"loss": 0.32518749237060546,
"memory(GiB)": 133.17,
"step": 1485,
"token_acc": 0.8788670711802744,
"train_speed(iter/s)": 0.068219
},
{
"epoch": 1.7447306791569086,
"grad_norm": 0.19358490407466888,
"learning_rate": 8.144962153749331e-06,
"loss": 0.32768878936767576,
"memory(GiB)": 133.17,
"step": 1490,
"token_acc": 0.8753486456636903,
"train_speed(iter/s)": 0.068217
},
{
"epoch": 1.7505854800936769,
"grad_norm": 0.21087020635604858,
"learning_rate": 8.081559707171094e-06,
"loss": 0.3388930559158325,
"memory(GiB)": 133.17,
"step": 1495,
"token_acc": 0.8764171874364358,
"train_speed(iter/s)": 0.06822
},
{
"epoch": 1.756440281030445,
"grad_norm": 0.1951858252286911,
"learning_rate": 8.01823722602986e-06,
"loss": 0.3247065544128418,
"memory(GiB)": 133.17,
"step": 1500,
"token_acc": 0.8884904457005652,
"train_speed(iter/s)": 0.068221
},
{
"epoch": 1.762295081967213,
"grad_norm": 0.20260894298553467,
"learning_rate": 7.954997349766576e-06,
"loss": 0.33308422565460205,
"memory(GiB)": 133.17,
"step": 1505,
"token_acc": 0.8817160406212514,
"train_speed(iter/s)": 0.068221
},
{
"epoch": 1.7681498829039812,
"grad_norm": 0.19411516189575195,
"learning_rate": 7.891842714379027e-06,
"loss": 0.3207800626754761,
"memory(GiB)": 133.17,
"step": 1510,
"token_acc": 0.8866104646064812,
"train_speed(iter/s)": 0.068218
},
{
"epoch": 1.7740046838407495,
"grad_norm": 0.2132834941148758,
"learning_rate": 7.828775952311921e-06,
"loss": 0.32387499809265136,
"memory(GiB)": 133.17,
"step": 1515,
"token_acc": 0.8781614519597012,
"train_speed(iter/s)": 0.068214
},
{
"epoch": 1.7798594847775175,
"grad_norm": 0.2175895869731903,
"learning_rate": 7.765799692347201e-06,
"loss": 0.32644095420837405,
"memory(GiB)": 133.17,
"step": 1520,
"token_acc": 0.878244971440831,
"train_speed(iter/s)": 0.068212
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.20511025190353394,
"learning_rate": 7.702916559494444e-06,
"loss": 0.3338191032409668,
"memory(GiB)": 133.17,
"step": 1525,
"token_acc": 0.8815095165856024,
"train_speed(iter/s)": 0.068213
},
{
"epoch": 1.7915690866510539,
"grad_norm": 0.19504858553409576,
"learning_rate": 7.64012917488146e-06,
"loss": 0.31484146118164064,
"memory(GiB)": 133.17,
"step": 1530,
"token_acc": 0.8943310386864273,
"train_speed(iter/s)": 0.068215
},
{
"epoch": 1.7974238875878221,
"grad_norm": 0.2018832564353943,
"learning_rate": 7.577440155645028e-06,
"loss": 0.3253478050231934,
"memory(GiB)": 133.17,
"step": 1535,
"token_acc": 0.883270074462929,
"train_speed(iter/s)": 0.068211
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.18957826495170593,
"learning_rate": 7.514852114821811e-06,
"loss": 0.3356925010681152,
"memory(GiB)": 133.17,
"step": 1540,
"token_acc": 0.8806853758108548,
"train_speed(iter/s)": 0.06821
},
{
"epoch": 1.8091334894613582,
"grad_norm": 0.18248967826366425,
"learning_rate": 7.452367661239433e-06,
"loss": 0.3128045558929443,
"memory(GiB)": 133.17,
"step": 1545,
"token_acc": 0.8822570031516938,
"train_speed(iter/s)": 0.068208
},
{
"epoch": 1.8149882903981265,
"grad_norm": 0.21197733283042908,
"learning_rate": 7.389989399407741e-06,
"loss": 0.3383420467376709,
"memory(GiB)": 133.17,
"step": 1550,
"token_acc": 0.8810136098103397,
"train_speed(iter/s)": 0.068204
},
{
"epoch": 1.8208430913348947,
"grad_norm": 0.1846388280391693,
"learning_rate": 7.3277199294102485e-06,
"loss": 0.3210147857666016,
"memory(GiB)": 133.17,
"step": 1555,
"token_acc": 0.8783838996638541,
"train_speed(iter/s)": 0.068204
},
{
"epoch": 1.8266978922716628,
"grad_norm": 0.21333329379558563,
"learning_rate": 7.265561846795741e-06,
"loss": 0.33364644050598147,
"memory(GiB)": 133.17,
"step": 1560,
"token_acc": 0.8799311976453201,
"train_speed(iter/s)": 0.068197
},
{
"epoch": 1.8325526932084308,
"grad_norm": 0.1916390359401703,
"learning_rate": 7.203517742470101e-06,
"loss": 0.3300149440765381,
"memory(GiB)": 133.17,
"step": 1565,
"token_acc": 0.8891839280314484,
"train_speed(iter/s)": 0.068196
},
{
"epoch": 1.838407494145199,
"grad_norm": 0.1898123174905777,
"learning_rate": 7.141590202588312e-06,
"loss": 0.3347996711730957,
"memory(GiB)": 133.17,
"step": 1570,
"token_acc": 0.8836206356563897,
"train_speed(iter/s)": 0.068196
},
{
"epoch": 1.8442622950819674,
"grad_norm": 0.25897353887557983,
"learning_rate": 7.079781808446648e-06,
"loss": 0.33739614486694336,
"memory(GiB)": 133.17,
"step": 1575,
"token_acc": 0.8688480209111277,
"train_speed(iter/s)": 0.068193
},
{
"epoch": 1.8501170960187352,
"grad_norm": 0.18949347734451294,
"learning_rate": 7.018095136375089e-06,
"loss": 0.3224343299865723,
"memory(GiB)": 133.17,
"step": 1580,
"token_acc": 0.8803287043737061,
"train_speed(iter/s)": 0.068192
},
{
"epoch": 1.8559718969555035,
"grad_norm": 0.19546827673912048,
"learning_rate": 6.956532757629945e-06,
"loss": 0.3295243740081787,
"memory(GiB)": 133.17,
"step": 1585,
"token_acc": 0.8802496310563046,
"train_speed(iter/s)": 0.068189
},
{
"epoch": 1.8618266978922717,
"grad_norm": 0.1977819800376892,
"learning_rate": 6.89509723828665e-06,
"loss": 0.3339688777923584,
"memory(GiB)": 133.17,
"step": 1590,
"token_acc": 0.8794367319992775,
"train_speed(iter/s)": 0.068188
},
{
"epoch": 1.8676814988290398,
"grad_norm": 0.2035733312368393,
"learning_rate": 6.833791139132824e-06,
"loss": 0.3196906089782715,
"memory(GiB)": 133.17,
"step": 1595,
"token_acc": 0.8819461276705585,
"train_speed(iter/s)": 0.068183
},
{
"epoch": 1.8735362997658078,
"grad_norm": 0.18036054074764252,
"learning_rate": 6.772617015561529e-06,
"loss": 0.3284833192825317,
"memory(GiB)": 133.17,
"step": 1600,
"token_acc": 0.8721648839682242,
"train_speed(iter/s)": 0.068185
},
{
"epoch": 1.879391100702576,
"grad_norm": 0.19073913991451263,
"learning_rate": 6.7115774174647475e-06,
"loss": 0.3214848518371582,
"memory(GiB)": 133.17,
"step": 1605,
"token_acc": 0.8888246134782375,
"train_speed(iter/s)": 0.068184
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.22237442433834076,
"learning_rate": 6.6506748891271045e-06,
"loss": 0.3328333854675293,
"memory(GiB)": 133.17,
"step": 1610,
"token_acc": 0.8864136225147821,
"train_speed(iter/s)": 0.068183
},
{
"epoch": 1.8911007025761124,
"grad_norm": 0.18580298125743866,
"learning_rate": 6.5899119691198025e-06,
"loss": 0.3259113073348999,
"memory(GiB)": 133.17,
"step": 1615,
"token_acc": 0.8816001292832858,
"train_speed(iter/s)": 0.068182
},
{
"epoch": 1.8969555035128804,
"grad_norm": 0.19562335312366486,
"learning_rate": 6.529291190194829e-06,
"loss": 0.3301589012145996,
"memory(GiB)": 133.17,
"step": 1620,
"token_acc": 0.8816063260815503,
"train_speed(iter/s)": 0.068183
},
{
"epoch": 1.9028103044496487,
"grad_norm": 0.19002656638622284,
"learning_rate": 6.468815079179364e-06,
"loss": 0.32632834911346437,
"memory(GiB)": 133.17,
"step": 1625,
"token_acc": 0.8859821923514176,
"train_speed(iter/s)": 0.06818
},
{
"epoch": 1.908665105386417,
"grad_norm": 0.19892436265945435,
"learning_rate": 6.408486156870466e-06,
"loss": 0.33937792778015136,
"memory(GiB)": 133.17,
"step": 1630,
"token_acc": 0.862874582417446,
"train_speed(iter/s)": 0.068181
},
{
"epoch": 1.914519906323185,
"grad_norm": 0.19243668019771576,
"learning_rate": 6.348306937929991e-06,
"loss": 0.3362755537033081,
"memory(GiB)": 133.17,
"step": 1635,
"token_acc": 0.8769627409259633,
"train_speed(iter/s)": 0.068182
},
{
"epoch": 1.920374707259953,
"grad_norm": 0.18101197481155396,
"learning_rate": 6.288279930779789e-06,
"loss": 0.31793382167816164,
"memory(GiB)": 133.17,
"step": 1640,
"token_acc": 0.890389030411674,
"train_speed(iter/s)": 0.068181
},
{
"epoch": 1.9262295081967213,
"grad_norm": 0.2016856074333191,
"learning_rate": 6.228407637497131e-06,
"loss": 0.3286017417907715,
"memory(GiB)": 133.17,
"step": 1645,
"token_acc": 0.8691879609602018,
"train_speed(iter/s)": 0.068179
},
{
"epoch": 1.9320843091334896,
"grad_norm": 0.18602800369262695,
"learning_rate": 6.1686925537104306e-06,
"loss": 0.3186060905456543,
"memory(GiB)": 133.17,
"step": 1650,
"token_acc": 0.8740382186265122,
"train_speed(iter/s)": 0.068178
},
{
"epoch": 1.9379391100702577,
"grad_norm": 0.19921670854091644,
"learning_rate": 6.109137168495205e-06,
"loss": 0.325826621055603,
"memory(GiB)": 133.17,
"step": 1655,
"token_acc": 0.8942359105977971,
"train_speed(iter/s)": 0.068178
},
{
"epoch": 1.9437939110070257,
"grad_norm": 0.1804487407207489,
"learning_rate": 6.049743964270336e-06,
"loss": 0.33586409091949465,
"memory(GiB)": 133.17,
"step": 1660,
"token_acc": 0.8788306137094006,
"train_speed(iter/s)": 0.068176
},
{
"epoch": 1.949648711943794,
"grad_norm": 0.20771907269954681,
"learning_rate": 5.990515416694591e-06,
"loss": 0.3336956024169922,
"memory(GiB)": 133.17,
"step": 1665,
"token_acc": 0.8826585274697895,
"train_speed(iter/s)": 0.068173
},
{
"epoch": 1.955503512880562,
"grad_norm": 0.19965799152851105,
"learning_rate": 5.931453994563434e-06,
"loss": 0.3285707473754883,
"memory(GiB)": 133.17,
"step": 1670,
"token_acc": 0.8875544099179484,
"train_speed(iter/s)": 0.068174
},
{
"epoch": 1.96135831381733,
"grad_norm": 0.20612315833568573,
"learning_rate": 5.872562159706116e-06,
"loss": 0.3315183877944946,
"memory(GiB)": 133.17,
"step": 1675,
"token_acc": 0.8774614658697704,
"train_speed(iter/s)": 0.068172
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.18963313102722168,
"learning_rate": 5.8138423668830605e-06,
"loss": 0.324364972114563,
"memory(GiB)": 133.17,
"step": 1680,
"token_acc": 0.8801062072294897,
"train_speed(iter/s)": 0.068174
},
{
"epoch": 1.9730679156908666,
"grad_norm": 0.19694305956363678,
"learning_rate": 5.755297063683551e-06,
"loss": 0.3285407066345215,
"memory(GiB)": 133.17,
"step": 1685,
"token_acc": 0.885107199114613,
"train_speed(iter/s)": 0.068174
},
{
"epoch": 1.9789227166276346,
"grad_norm": 0.18662695586681366,
"learning_rate": 5.696928690423693e-06,
"loss": 0.32373480796813964,
"memory(GiB)": 133.17,
"step": 1690,
"token_acc": 0.8790801928023776,
"train_speed(iter/s)": 0.068176
},
{
"epoch": 1.9847775175644027,
"grad_norm": 0.19431762397289276,
"learning_rate": 5.638739680044718e-06,
"loss": 0.3377500057220459,
"memory(GiB)": 133.17,
"step": 1695,
"token_acc": 0.8722363298833375,
"train_speed(iter/s)": 0.068175
},
{
"epoch": 1.990632318501171,
"grad_norm": 0.2024122029542923,
"learning_rate": 5.580732458011544e-06,
"loss": 0.3272620439529419,
"memory(GiB)": 133.17,
"step": 1700,
"token_acc": 0.8856717266189297,
"train_speed(iter/s)": 0.068173
},
{
"epoch": 1.9964871194379392,
"grad_norm": 0.18394924700260162,
"learning_rate": 5.522909442211708e-06,
"loss": 0.32718348503112793,
"memory(GiB)": 133.17,
"step": 1705,
"token_acc": 0.876887289049153,
"train_speed(iter/s)": 0.06817
},
{
"epoch": 2.002341920374707,
"grad_norm": 0.2651495337486267,
"learning_rate": 5.465273042854551e-06,
"loss": 0.31393914222717284,
"memory(GiB)": 133.17,
"step": 1710,
"token_acc": 0.8893703023658244,
"train_speed(iter/s)": 0.06798
},
{
"epoch": 2.0081967213114753,
"grad_norm": 0.21041427552700043,
"learning_rate": 5.407825662370778e-06,
"loss": 0.299090313911438,
"memory(GiB)": 133.17,
"step": 1715,
"token_acc": 0.8915390401403241,
"train_speed(iter/s)": 0.067979
},
{
"epoch": 2.0140515222482436,
"grad_norm": 0.21380308270454407,
"learning_rate": 5.350569695312313e-06,
"loss": 0.3101144790649414,
"memory(GiB)": 133.17,
"step": 1720,
"token_acc": 0.8875269739992413,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.019906323185012,
"grad_norm": 0.1987718939781189,
"learning_rate": 5.293507528252474e-06,
"loss": 0.3136857509613037,
"memory(GiB)": 133.17,
"step": 1725,
"token_acc": 0.8871349620144686,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.0257611241217797,
"grad_norm": 0.3591626286506653,
"learning_rate": 5.236641539686518e-06,
"loss": 0.30123333930969237,
"memory(GiB)": 133.17,
"step": 1730,
"token_acc": 0.8860105084502068,
"train_speed(iter/s)": 0.067977
},
{
"epoch": 2.031615925058548,
"grad_norm": 0.19819702208042145,
"learning_rate": 5.179974099932472e-06,
"loss": 0.29487655162811277,
"memory(GiB)": 133.17,
"step": 1735,
"token_acc": 0.8855569615495446,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.037470725995316,
"grad_norm": 0.2023162841796875,
"learning_rate": 5.12350757103236e-06,
"loss": 0.29470908641815186,
"memory(GiB)": 133.17,
"step": 1740,
"token_acc": 0.8894021747623796,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.0433255269320845,
"grad_norm": 0.19459553062915802,
"learning_rate": 5.067244306653736e-06,
"loss": 0.30195889472961424,
"memory(GiB)": 133.17,
"step": 1745,
"token_acc": 0.8966922700402876,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.20582208037376404,
"learning_rate": 5.0111866519915575e-06,
"loss": 0.2972427845001221,
"memory(GiB)": 133.17,
"step": 1750,
"token_acc": 0.8860103790300714,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.0550351288056206,
"grad_norm": 0.21163956820964813,
"learning_rate": 4.95533694367047e-06,
"loss": 0.2951073408126831,
"memory(GiB)": 133.17,
"step": 1755,
"token_acc": 0.8911898143660713,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.060889929742389,
"grad_norm": 0.271316796541214,
"learning_rate": 4.899697509647379e-06,
"loss": 0.3005206108093262,
"memory(GiB)": 133.17,
"step": 1760,
"token_acc": 0.8827217211398426,
"train_speed(iter/s)": 0.067977
},
{
"epoch": 2.066744730679157,
"grad_norm": 0.1982126086950302,
"learning_rate": 4.844270669114424e-06,
"loss": 0.30247581005096436,
"memory(GiB)": 133.17,
"step": 1765,
"token_acc": 0.8939350325087765,
"train_speed(iter/s)": 0.067979
},
{
"epoch": 2.072599531615925,
"grad_norm": 0.20624509453773499,
"learning_rate": 4.789058732402319e-06,
"loss": 0.2944344520568848,
"memory(GiB)": 133.17,
"step": 1770,
"token_acc": 0.8877186400937866,
"train_speed(iter/s)": 0.067979
},
{
"epoch": 2.078454332552693,
"grad_norm": 0.18864554166793823,
"learning_rate": 4.734064000884044e-06,
"loss": 0.31334614753723145,
"memory(GiB)": 133.17,
"step": 1775,
"token_acc": 0.8753750599625646,
"train_speed(iter/s)": 0.067981
},
{
"epoch": 2.0843091334894615,
"grad_norm": 0.19976413249969482,
"learning_rate": 4.679288766878908e-06,
"loss": 0.3065293073654175,
"memory(GiB)": 133.17,
"step": 1780,
"token_acc": 0.8893787799945783,
"train_speed(iter/s)": 0.067979
},
{
"epoch": 2.0901639344262297,
"grad_norm": 0.20083464682102203,
"learning_rate": 4.624735313557019e-06,
"loss": 0.30294094085693357,
"memory(GiB)": 133.17,
"step": 1785,
"token_acc": 0.894106624191886,
"train_speed(iter/s)": 0.06798
},
{
"epoch": 2.0960187353629975,
"grad_norm": 0.19687768816947937,
"learning_rate": 4.570405914844105e-06,
"loss": 0.29626712799072263,
"memory(GiB)": 133.17,
"step": 1790,
"token_acc": 0.8918388887847958,
"train_speed(iter/s)": 0.067977
},
{
"epoch": 2.101873536299766,
"grad_norm": 0.21042723953723907,
"learning_rate": 4.516302835326723e-06,
"loss": 0.30143260955810547,
"memory(GiB)": 133.17,
"step": 1795,
"token_acc": 0.8979354142270508,
"train_speed(iter/s)": 0.067977
},
{
"epoch": 2.107728337236534,
"grad_norm": 0.20909157395362854,
"learning_rate": 4.462428330157886e-06,
"loss": 0.29250779151916506,
"memory(GiB)": 133.17,
"step": 1800,
"token_acc": 0.8972882018187891,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.113583138173302,
"grad_norm": 0.18871068954467773,
"learning_rate": 4.4087846449630475e-06,
"loss": 0.296770715713501,
"memory(GiB)": 133.17,
"step": 1805,
"token_acc": 0.8939121347421645,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.11943793911007,
"grad_norm": 0.19644689559936523,
"learning_rate": 4.355374015746493e-06,
"loss": 0.29331092834472655,
"memory(GiB)": 133.17,
"step": 1810,
"token_acc": 0.8898344723236344,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.1252927400468384,
"grad_norm": 0.2067333608865738,
"learning_rate": 4.302198668798159e-06,
"loss": 0.298096752166748,
"memory(GiB)": 133.17,
"step": 1815,
"token_acc": 0.8860096940702505,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.19337214529514313,
"learning_rate": 4.249260820600813e-06,
"loss": 0.28569879531860354,
"memory(GiB)": 133.17,
"step": 1820,
"token_acc": 0.8937030726309285,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.1370023419203745,
"grad_norm": 0.21502645313739777,
"learning_rate": 4.1965626777376766e-06,
"loss": 0.29423298835754397,
"memory(GiB)": 133.17,
"step": 1825,
"token_acc": 0.8971041975679516,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.18807381391525269,
"learning_rate": 4.144106436800453e-06,
"loss": 0.30044715404510497,
"memory(GiB)": 133.17,
"step": 1830,
"token_acc": 0.8974527790728444,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.148711943793911,
"grad_norm": 0.18506018817424774,
"learning_rate": 4.091894284297758e-06,
"loss": 0.2915837526321411,
"memory(GiB)": 133.17,
"step": 1835,
"token_acc": 0.8848257422956048,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.1545667447306793,
"grad_norm": 0.22477097809314728,
"learning_rate": 4.039928396563983e-06,
"loss": 0.3101827621459961,
"memory(GiB)": 133.17,
"step": 1840,
"token_acc": 0.8897657467466561,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.160421545667447,
"grad_norm": 0.20848192274570465,
"learning_rate": 3.9882109396685845e-06,
"loss": 0.28560404777526854,
"memory(GiB)": 133.17,
"step": 1845,
"token_acc": 0.8866163430466006,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.1662763466042154,
"grad_norm": 0.1790919452905655,
"learning_rate": 3.936744069325797e-06,
"loss": 0.28580513000488283,
"memory(GiB)": 133.17,
"step": 1850,
"token_acc": 0.8959953003524735,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.1721311475409837,
"grad_norm": 0.1946616917848587,
"learning_rate": 3.885529930804768e-06,
"loss": 0.28203678131103516,
"memory(GiB)": 133.17,
"step": 1855,
"token_acc": 0.8865687290155894,
"train_speed(iter/s)": 0.06797
},
{
"epoch": 2.177985948477752,
"grad_norm": 0.2024662047624588,
"learning_rate": 3.834570658840152e-06,
"loss": 0.3013646602630615,
"memory(GiB)": 133.17,
"step": 1860,
"token_acc": 0.8853980676749265,
"train_speed(iter/s)": 0.06797
},
{
"epoch": 2.1838407494145198,
"grad_norm": 0.1884947568178177,
"learning_rate": 3.7838683775431106e-06,
"loss": 0.2940408706665039,
"memory(GiB)": 133.17,
"step": 1865,
"token_acc": 0.8914837094453064,
"train_speed(iter/s)": 0.067971
},
{
"epoch": 2.189695550351288,
"grad_norm": 0.19168955087661743,
"learning_rate": 3.733425200312797e-06,
"loss": 0.2958191156387329,
"memory(GiB)": 133.17,
"step": 1870,
"token_acc": 0.8822518250452361,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.1955503512880563,
"grad_norm": 0.194383904337883,
"learning_rate": 3.683243229748249e-06,
"loss": 0.28948154449462893,
"memory(GiB)": 133.17,
"step": 1875,
"token_acc": 0.8876668322153558,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.201405152224824,
"grad_norm": 0.19730046391487122,
"learning_rate": 3.633324557560747e-06,
"loss": 0.29555392265319824,
"memory(GiB)": 133.17,
"step": 1880,
"token_acc": 0.8939645340207796,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.2072599531615924,
"grad_norm": 0.18545053899288177,
"learning_rate": 3.5836712644866277e-06,
"loss": 0.28943870067596433,
"memory(GiB)": 133.17,
"step": 1885,
"token_acc": 0.8883624593035462,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.20143678784370422,
"learning_rate": 3.5342854202005696e-06,
"loss": 0.29045825004577636,
"memory(GiB)": 133.17,
"step": 1890,
"token_acc": 0.8931490778817771,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.218969555035129,
"grad_norm": 0.1933010071516037,
"learning_rate": 3.485169083229293e-06,
"loss": 0.2985133409500122,
"memory(GiB)": 133.17,
"step": 1895,
"token_acc": 0.8983116114671417,
"train_speed(iter/s)": 0.067977
},
{
"epoch": 2.2248243559718968,
"grad_norm": 0.2029975950717926,
"learning_rate": 3.4363243008657842e-06,
"loss": 0.29316296577453616,
"memory(GiB)": 133.17,
"step": 1900,
"token_acc": 0.8817185537873807,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.230679156908665,
"grad_norm": 0.1947357952594757,
"learning_rate": 3.3877531090839478e-06,
"loss": 0.2983538627624512,
"memory(GiB)": 133.17,
"step": 1905,
"token_acc": 0.8836724096943308,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.2365339578454333,
"grad_norm": 0.19401586055755615,
"learning_rate": 3.3394575324537327e-06,
"loss": 0.3019071578979492,
"memory(GiB)": 133.17,
"step": 1910,
"token_acc": 0.8830038763307387,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.2423887587822016,
"grad_norm": 0.18747617304325104,
"learning_rate": 3.2914395840567605e-06,
"loss": 0.2899949312210083,
"memory(GiB)": 133.17,
"step": 1915,
"token_acc": 0.8937491349698655,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.2482435597189694,
"grad_norm": 0.18497265875339508,
"learning_rate": 3.2437012654024057e-06,
"loss": 0.29514849185943604,
"memory(GiB)": 133.17,
"step": 1920,
"token_acc": 0.8952329266162637,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.2540983606557377,
"grad_norm": 0.19390814006328583,
"learning_rate": 3.1962445663443643e-06,
"loss": 0.29795031547546386,
"memory(GiB)": 133.17,
"step": 1925,
"token_acc": 0.8883286157922032,
"train_speed(iter/s)": 0.067969
},
{
"epoch": 2.259953161592506,
"grad_norm": 0.19152696430683136,
"learning_rate": 3.1490714649977196e-06,
"loss": 0.3013578414916992,
"memory(GiB)": 133.17,
"step": 1930,
"token_acc": 0.8966371415703348,
"train_speed(iter/s)": 0.067968
},
{
"epoch": 2.265807962529274,
"grad_norm": 0.19523312151432037,
"learning_rate": 3.102183927656488e-06,
"loss": 0.29044888019561765,
"memory(GiB)": 133.17,
"step": 1935,
"token_acc": 0.8816311924321633,
"train_speed(iter/s)": 0.067969
},
{
"epoch": 2.271662763466042,
"grad_norm": 0.19032931327819824,
"learning_rate": 3.0555839087116547e-06,
"loss": 0.30231542587280275,
"memory(GiB)": 133.17,
"step": 1940,
"token_acc": 0.89288125,
"train_speed(iter/s)": 0.06797
},
{
"epoch": 2.2775175644028103,
"grad_norm": 0.19542452692985535,
"learning_rate": 3.009273350569705e-06,
"loss": 0.3001267433166504,
"memory(GiB)": 133.17,
"step": 1945,
"token_acc": 0.8971306271312823,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.2833723653395785,
"grad_norm": 0.1856907606124878,
"learning_rate": 2.963254183571682e-06,
"loss": 0.29535422325134275,
"memory(GiB)": 133.17,
"step": 1950,
"token_acc": 0.8930906317907196,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.289227166276347,
"grad_norm": 0.1937672644853592,
"learning_rate": 2.9175283259126943e-06,
"loss": 0.2962016582489014,
"memory(GiB)": 133.17,
"step": 1955,
"token_acc": 0.898554810095657,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.18747203052043915,
"learning_rate": 2.872097683561986e-06,
"loss": 0.2947913885116577,
"memory(GiB)": 133.17,
"step": 1960,
"token_acc": 0.9006477145474906,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.300936768149883,
"grad_norm": 0.188889279961586,
"learning_rate": 2.8269641501834834e-06,
"loss": 0.3037715911865234,
"memory(GiB)": 133.17,
"step": 1965,
"token_acc": 0.8837665048634434,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.306791569086651,
"grad_norm": 0.18386943638324738,
"learning_rate": 2.782129607056848e-06,
"loss": 0.29630954265594484,
"memory(GiB)": 133.17,
"step": 1970,
"token_acc": 0.8854790349100962,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.312646370023419,
"grad_norm": 0.1860065758228302,
"learning_rate": 2.7375959229990856e-06,
"loss": 0.2871407508850098,
"memory(GiB)": 133.17,
"step": 1975,
"token_acc": 0.8921277606269294,
"train_speed(iter/s)": 0.067973
},
{
"epoch": 2.3185011709601873,
"grad_norm": 0.18686528503894806,
"learning_rate": 2.6933649542866326e-06,
"loss": 0.29081072807312014,
"memory(GiB)": 133.17,
"step": 1980,
"token_acc": 0.8890196371424658,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.3243559718969555,
"grad_norm": 0.17976053059101105,
"learning_rate": 2.649438544577977e-06,
"loss": 0.2809652090072632,
"memory(GiB)": 133.17,
"step": 1985,
"token_acc": 0.8903470664805608,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.330210772833724,
"grad_norm": 0.18407879769802094,
"learning_rate": 2.6058185248368317e-06,
"loss": 0.2934088706970215,
"memory(GiB)": 133.17,
"step": 1990,
"token_acc": 0.896975139427167,
"train_speed(iter/s)": 0.067974
},
{
"epoch": 2.3360655737704916,
"grad_norm": 0.1934359222650528,
"learning_rate": 2.562506713255789e-06,
"loss": 0.2888351917266846,
"memory(GiB)": 133.17,
"step": 1995,
"token_acc": 0.8959334542575936,
"train_speed(iter/s)": 0.067976
},
{
"epoch": 2.34192037470726,
"grad_norm": 0.19506384432315826,
"learning_rate": 2.519504915180555e-06,
"loss": 0.29209365844726565,
"memory(GiB)": 133.17,
"step": 2000,
"token_acc": 0.8933135020860227,
"train_speed(iter/s)": 0.067975
},
{
"epoch": 2.347775175644028,
"grad_norm": 0.2316175252199173,
"learning_rate": 2.4768149230346917e-06,
"loss": 0.30724682807922366,
"memory(GiB)": 133.17,
"step": 2005,
"token_acc": 0.8923097611088004,
"train_speed(iter/s)": 0.067972
},
{
"epoch": 2.3536299765807964,
"grad_norm": 0.19259780645370483,
"learning_rate": 2.4344385162448924e-06,
"loss": 0.29259405136108396,
"memory(GiB)": 133.17,
"step": 2010,
"token_acc": 0.8962570117930629,
"train_speed(iter/s)": 0.067971
},
{
"epoch": 2.3594847775175642,
"grad_norm": 0.18455654382705688,
"learning_rate": 2.392377461166826e-06,
"loss": 0.2909110069274902,
"memory(GiB)": 133.17,
"step": 2015,
"token_acc": 0.8950562827510747,
"train_speed(iter/s)": 0.067971
},
{
"epoch": 2.3653395784543325,
"grad_norm": 0.19333600997924805,
"learning_rate": 2.350633511011511e-06,
"loss": 0.2987373352050781,
"memory(GiB)": 133.17,
"step": 2020,
"token_acc": 0.890084898990847,
"train_speed(iter/s)": 0.067969
},
{
"epoch": 2.371194379391101,
"grad_norm": 0.18590733408927917,
"learning_rate": 2.309208405772221e-06,
"loss": 0.3060739278793335,
"memory(GiB)": 133.17,
"step": 2025,
"token_acc": 0.894386606817296,
"train_speed(iter/s)": 0.06797
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.19246318936347961,
"learning_rate": 2.2681038721519768e-06,
"loss": 0.3093658208847046,
"memory(GiB)": 133.17,
"step": 2030,
"token_acc": 0.8894458411573517,
"train_speed(iter/s)": 0.067969
},
{
"epoch": 2.382903981264637,
"grad_norm": 0.19371892511844635,
"learning_rate": 2.227321623491563e-06,
"loss": 0.2991630077362061,
"memory(GiB)": 133.17,
"step": 2035,
"token_acc": 0.8873450543890716,
"train_speed(iter/s)": 0.067968
},
{
"epoch": 2.388758782201405,
"grad_norm": 0.17911982536315918,
"learning_rate": 2.186863359698108e-06,
"loss": 0.29452369213104246,
"memory(GiB)": 133.17,
"step": 2040,
"token_acc": 0.9101415057216162,
"train_speed(iter/s)": 0.067967
},
{
"epoch": 2.3946135831381734,
"grad_norm": 0.19897328317165375,
"learning_rate": 2.1467307671742377e-06,
"loss": 0.2978281736373901,
"memory(GiB)": 133.17,
"step": 2045,
"token_acc": 0.8880359089210048,
"train_speed(iter/s)": 0.067968
},
{
"epoch": 2.4004683840749417,
"grad_norm": 0.19614428281784058,
"learning_rate": 2.106925518747779e-06,
"loss": 0.2917934417724609,
"memory(GiB)": 133.17,
"step": 2050,
"token_acc": 0.892301005603362,
"train_speed(iter/s)": 0.067968
},
{
"epoch": 2.4063231850117095,
"grad_norm": 0.18466618657112122,
"learning_rate": 2.06744927360202e-06,
"loss": 0.2950620651245117,
"memory(GiB)": 133.17,
"step": 2055,
"token_acc": 0.8911625268446858,
"train_speed(iter/s)": 0.067964
},
{
"epoch": 2.4121779859484778,
"grad_norm": 0.19196145236492157,
"learning_rate": 2.0283036772065712e-06,
"loss": 0.29646165370941163,
"memory(GiB)": 133.17,
"step": 2060,
"token_acc": 0.8943602932370165,
"train_speed(iter/s)": 0.067964
},
{
"epoch": 2.418032786885246,
"grad_norm": 0.20200960338115692,
"learning_rate": 1.9894903612487683e-06,
"loss": 0.30394654273986815,
"memory(GiB)": 133.17,
"step": 2065,
"token_acc": 0.882068843029542,
"train_speed(iter/s)": 0.067964
},
{
"epoch": 2.423887587822014,
"grad_norm": 0.19281496107578278,
"learning_rate": 1.9510109435656457e-06,
"loss": 0.30283074378967284,
"memory(GiB)": 133.17,
"step": 2070,
"token_acc": 0.8936107928433829,
"train_speed(iter/s)": 0.067965
},
{
"epoch": 2.429742388758782,
"grad_norm": 0.1977166384458542,
"learning_rate": 1.9128670280765283e-06,
"loss": 0.30489649772644045,
"memory(GiB)": 133.17,
"step": 2075,
"token_acc": 0.8869975460007921,
"train_speed(iter/s)": 0.067962
},
{
"epoch": 2.4355971896955504,
"grad_norm": 0.185228630900383,
"learning_rate": 1.8750602047161603e-06,
"loss": 0.29401373863220215,
"memory(GiB)": 133.17,
"step": 2080,
"token_acc": 0.8979298187696017,
"train_speed(iter/s)": 0.067961
},
{
"epoch": 2.4414519906323187,
"grad_norm": 0.19245509803295135,
"learning_rate": 1.8375920493684264e-06,
"loss": 0.3006903171539307,
"memory(GiB)": 133.17,
"step": 2085,
"token_acc": 0.8867607400439009,
"train_speed(iter/s)": 0.067964
},
{
"epoch": 2.4473067915690865,
"grad_norm": 0.19419154524803162,
"learning_rate": 1.8004641238006815e-06,
"loss": 0.29811155796051025,
"memory(GiB)": 133.17,
"step": 2090,
"token_acc": 0.8943640794642241,
"train_speed(iter/s)": 0.067962
},
{
"epoch": 2.4531615925058547,
"grad_norm": 0.1823989599943161,
"learning_rate": 1.7636779755986443e-06,
"loss": 0.3039386749267578,
"memory(GiB)": 133.17,
"step": 2095,
"token_acc": 0.8863327040435527,
"train_speed(iter/s)": 0.067959
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.1855112761259079,
"learning_rate": 1.7272351381018792e-06,
"loss": 0.3009587287902832,
"memory(GiB)": 133.17,
"step": 2100,
"token_acc": 0.8790456780659275,
"train_speed(iter/s)": 0.067959
},
{
"epoch": 2.4648711943793913,
"grad_norm": 0.18876492977142334,
"learning_rate": 1.6911371303399048e-06,
"loss": 0.28830153942108155,
"memory(GiB)": 133.17,
"step": 2105,
"token_acc": 0.8928979046201769,
"train_speed(iter/s)": 0.067956
},
{
"epoch": 2.470725995316159,
"grad_norm": 0.19157980382442474,
"learning_rate": 1.6553854569688632e-06,
"loss": 0.30360212326049807,
"memory(GiB)": 133.17,
"step": 2110,
"token_acc": 0.8886916557875393,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.4765807962529274,
"grad_norm": 0.19697441160678864,
"learning_rate": 1.619981608208796e-06,
"loss": 0.30350236892700194,
"memory(GiB)": 133.17,
"step": 2115,
"token_acc": 0.8755962030416897,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.4824355971896956,
"grad_norm": 0.19516149163246155,
"learning_rate": 1.584927059781548e-06,
"loss": 0.3021031379699707,
"memory(GiB)": 133.17,
"step": 2120,
"token_acc": 0.8850454875188026,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.4882903981264635,
"grad_norm": 0.19685259461402893,
"learning_rate": 1.5502232728492362e-06,
"loss": 0.29403057098388674,
"memory(GiB)": 133.17,
"step": 2125,
"token_acc": 0.8935650598835121,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.4941451990632317,
"grad_norm": 0.1944494992494583,
"learning_rate": 1.5158716939533524e-06,
"loss": 0.303509259223938,
"memory(GiB)": 133.17,
"step": 2130,
"token_acc": 0.8899391835374175,
"train_speed(iter/s)": 0.067955
},
{
"epoch": 2.5,
"grad_norm": 0.18963733315467834,
"learning_rate": 1.4818737549544725e-06,
"loss": 0.3023875951766968,
"memory(GiB)": 133.17,
"step": 2135,
"token_acc": 0.8820025957494603,
"train_speed(iter/s)": 0.067956
},
{
"epoch": 2.5058548009367683,
"grad_norm": 0.2587365210056305,
"learning_rate": 1.448230872972568e-06,
"loss": 0.29965691566467284,
"memory(GiB)": 133.17,
"step": 2140,
"token_acc": 0.895712561145832,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.5117096018735365,
"grad_norm": 0.1916307806968689,
"learning_rate": 1.4149444503279297e-06,
"loss": 0.3064573764801025,
"memory(GiB)": 133.17,
"step": 2145,
"token_acc": 0.8827446402570668,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.5175644028103044,
"grad_norm": 0.18983621895313263,
"learning_rate": 1.382015874482735e-06,
"loss": 0.2994706630706787,
"memory(GiB)": 133.17,
"step": 2150,
"token_acc": 0.889184252992907,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.5234192037470726,
"grad_norm": 0.19152384996414185,
"learning_rate": 1.3494465179831895e-06,
"loss": 0.29698777198791504,
"memory(GiB)": 133.17,
"step": 2155,
"token_acc": 0.8862135400891181,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.529274004683841,
"grad_norm": 0.17604193091392517,
"learning_rate": 1.3172377384023393e-06,
"loss": 0.2926321029663086,
"memory(GiB)": 133.17,
"step": 2160,
"token_acc": 0.891226010077476,
"train_speed(iter/s)": 0.067956
},
{
"epoch": 2.5351288056206087,
"grad_norm": 0.18896515667438507,
"learning_rate": 1.2853908782834722e-06,
"loss": 0.29559669494628904,
"memory(GiB)": 133.17,
"step": 2165,
"token_acc": 0.8984888499945305,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.18624024093151093,
"learning_rate": 1.2539072650841523e-06,
"loss": 0.30248537063598635,
"memory(GiB)": 133.17,
"step": 2170,
"token_acc": 0.8883391871864846,
"train_speed(iter/s)": 0.067955
},
{
"epoch": 2.5468384074941453,
"grad_norm": 0.18639948964118958,
"learning_rate": 1.2227882111209011e-06,
"loss": 0.3061221599578857,
"memory(GiB)": 133.17,
"step": 2175,
"token_acc": 0.8831800956700007,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.552693208430913,
"grad_norm": 0.4918629229068756,
"learning_rate": 1.1920350135144898e-06,
"loss": 0.29971723556518554,
"memory(GiB)": 133.17,
"step": 2180,
"token_acc": 0.894886042214037,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.5585480093676813,
"grad_norm": 0.18684136867523193,
"learning_rate": 1.1616489541358678e-06,
"loss": 0.29734086990356445,
"memory(GiB)": 133.17,
"step": 2185,
"token_acc": 0.8841362452439526,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.5644028103044496,
"grad_norm": 0.18606062233448029,
"learning_rate": 1.1316312995527424e-06,
"loss": 0.3008298873901367,
"memory(GiB)": 133.17,
"step": 2190,
"token_acc": 0.8897950269865535,
"train_speed(iter/s)": 0.067955
},
{
"epoch": 2.570257611241218,
"grad_norm": 0.1816904991865158,
"learning_rate": 1.1019833009767744e-06,
"loss": 0.29885680675506593,
"memory(GiB)": 133.17,
"step": 2195,
"token_acc": 0.8878981843540634,
"train_speed(iter/s)": 0.067954
},
{
"epoch": 2.576112412177986,
"grad_norm": 0.19094757735729218,
"learning_rate": 1.072706194211426e-06,
"loss": 0.30043601989746094,
"memory(GiB)": 133.17,
"step": 2200,
"token_acc": 0.8938432293837546,
"train_speed(iter/s)": 0.067955
},
{
"epoch": 2.581967213114754,
"grad_norm": 0.1899169534444809,
"learning_rate": 1.0438011996004581e-06,
"loss": 0.2995189905166626,
"memory(GiB)": 133.17,
"step": 2205,
"token_acc": 0.8880722202892788,
"train_speed(iter/s)": 0.067955
},
{
"epoch": 2.5878220140515222,
"grad_norm": 0.19649627804756165,
"learning_rate": 1.0152695219770558e-06,
"loss": 0.2872016429901123,
"memory(GiB)": 133.17,
"step": 2210,
"token_acc": 0.8980766878765166,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.5936768149882905,
"grad_norm": 0.19790223240852356,
"learning_rate": 9.871123506136037e-07,
"loss": 0.29386420249938966,
"memory(GiB)": 133.17,
"step": 2215,
"token_acc": 0.9006561928197363,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.5995316159250583,
"grad_norm": 0.18190743029117584,
"learning_rate": 9.593308591721274e-07,
"loss": 0.2908626079559326,
"memory(GiB)": 133.17,
"step": 2220,
"token_acc": 0.8927072444113778,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.6053864168618266,
"grad_norm": 0.1853610724210739,
"learning_rate": 9.319262056553602e-07,
"loss": 0.300918436050415,
"memory(GiB)": 133.17,
"step": 2225,
"token_acc": 0.8953082310083849,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.611241217798595,
"grad_norm": 0.19064903259277344,
"learning_rate": 9.048995323584764e-07,
"loss": 0.3040909767150879,
"memory(GiB)": 133.17,
"step": 2230,
"token_acc": 0.8990558015887316,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.617096018735363,
"grad_norm": 0.18238228559494019,
"learning_rate": 8.78251965821485e-07,
"loss": 0.2880122184753418,
"memory(GiB)": 133.17,
"step": 2235,
"token_acc": 0.8914627457335544,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.18738383054733276,
"learning_rate": 8.519846167822665e-07,
"loss": 0.2943183422088623,
"memory(GiB)": 133.17,
"step": 2240,
"token_acc": 0.9038425869666715,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.628805620608899,
"grad_norm": 0.1841094046831131,
"learning_rate": 8.260985801302734e-07,
"loss": 0.2850812911987305,
"memory(GiB)": 133.17,
"step": 2245,
"token_acc": 0.8894028305143251,
"train_speed(iter/s)": 0.067948
},
{
"epoch": 2.6346604215456675,
"grad_norm": 0.18611599504947662,
"learning_rate": 8.005949348608977e-07,
"loss": 0.2972105979919434,
"memory(GiB)": 133.17,
"step": 2250,
"token_acc": 0.8871099881800386,
"train_speed(iter/s)": 0.067948
},
{
"epoch": 2.6405152224824358,
"grad_norm": 0.1857517957687378,
"learning_rate": 7.754747440304911e-07,
"loss": 0.30115318298339844,
"memory(GiB)": 133.17,
"step": 2255,
"token_acc": 0.8863566925844406,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.6463700234192036,
"grad_norm": 0.19214338064193726,
"learning_rate": 7.507390547120541e-07,
"loss": 0.29389874935150145,
"memory(GiB)": 133.17,
"step": 2260,
"token_acc": 0.8897345572130235,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.652224824355972,
"grad_norm": 0.18817630410194397,
"learning_rate": 7.263888979515954e-07,
"loss": 0.3036650657653809,
"memory(GiB)": 133.17,
"step": 2265,
"token_acc": 0.8849702240287362,
"train_speed(iter/s)": 0.067952
},
{
"epoch": 2.65807962529274,
"grad_norm": 0.18061281740665436,
"learning_rate": 7.024252887251548e-07,
"loss": 0.29589831829071045,
"memory(GiB)": 133.17,
"step": 2270,
"token_acc": 0.8896930575764528,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.663934426229508,
"grad_norm": 0.18534523248672485,
"learning_rate": 6.788492258964896e-07,
"loss": 0.29939701557159426,
"memory(GiB)": 133.17,
"step": 2275,
"token_acc": 0.8869800488330657,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.669789227166276,
"grad_norm": 0.19118830561637878,
"learning_rate": 6.556616921754489e-07,
"loss": 0.29693875312805174,
"memory(GiB)": 133.17,
"step": 2280,
"token_acc": 0.8911168593654094,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.6756440281030445,
"grad_norm": 0.18963268399238586,
"learning_rate": 6.328636540770028e-07,
"loss": 0.3002347707748413,
"memory(GiB)": 133.17,
"step": 2285,
"token_acc": 0.889527246797438,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.6814988290398127,
"grad_norm": 0.18357062339782715,
"learning_rate": 6.10456061880963e-07,
"loss": 0.304398250579834,
"memory(GiB)": 133.17,
"step": 2290,
"token_acc": 0.8937302240569359,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.687353629976581,
"grad_norm": 0.18504291772842407,
"learning_rate": 5.884398495923727e-07,
"loss": 0.29355425834655763,
"memory(GiB)": 133.17,
"step": 2295,
"token_acc": 0.8842415418528523,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.693208430913349,
"grad_norm": 0.1993853747844696,
"learning_rate": 5.668159349025649e-07,
"loss": 0.3113893985748291,
"memory(GiB)": 133.17,
"step": 2300,
"token_acc": 0.8775886656746031,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.699063231850117,
"grad_norm": 0.18858520686626434,
"learning_rate": 5.455852191509214e-07,
"loss": 0.3054765224456787,
"memory(GiB)": 133.17,
"step": 2305,
"token_acc": 0.8816947533601692,
"train_speed(iter/s)": 0.067949
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.19768975675106049,
"learning_rate": 5.247485872873026e-07,
"loss": 0.29274706840515136,
"memory(GiB)": 133.17,
"step": 2310,
"token_acc": 0.8946102350213514,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.710772833723653,
"grad_norm": 0.17342238128185272,
"learning_rate": 5.043069078351526e-07,
"loss": 0.2879345893859863,
"memory(GiB)": 133.17,
"step": 2315,
"token_acc": 0.8956176935229068,
"train_speed(iter/s)": 0.067951
},
{
"epoch": 2.7166276346604215,
"grad_norm": 0.18019071221351624,
"learning_rate": 4.842610328552999e-07,
"loss": 0.29531962871551515,
"memory(GiB)": 133.17,
"step": 2320,
"token_acc": 0.8930268304142333,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.7224824355971897,
"grad_norm": 0.1835058629512787,
"learning_rate": 4.6461179791044806e-07,
"loss": 0.2953210353851318,
"memory(GiB)": 133.21,
"step": 2325,
"token_acc": 0.8869091207514772,
"train_speed(iter/s)": 0.067952
},
{
"epoch": 2.728337236533958,
"grad_norm": 0.1827324628829956,
"learning_rate": 4.453600220303378e-07,
"loss": 0.2804730415344238,
"memory(GiB)": 133.21,
"step": 2330,
"token_acc": 0.8874964005358507,
"train_speed(iter/s)": 0.067953
},
{
"epoch": 2.7341920374707263,
"grad_norm": 0.18949875235557556,
"learning_rate": 4.2650650767761535e-07,
"loss": 0.2842918872833252,
"memory(GiB)": 133.21,
"step": 2335,
"token_acc": 0.8981613154267605,
"train_speed(iter/s)": 0.067952
},
{
"epoch": 2.740046838407494,
"grad_norm": 0.2092583179473877,
"learning_rate": 4.0805204071437953e-07,
"loss": 0.3071431636810303,
"memory(GiB)": 133.21,
"step": 2340,
"token_acc": 0.886714704322126,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.7459016393442623,
"grad_norm": 0.18031486868858337,
"learning_rate": 3.899973903694243e-07,
"loss": 0.30032360553741455,
"memory(GiB)": 133.21,
"step": 2345,
"token_acc": 0.8907299680407984,
"train_speed(iter/s)": 0.06795
},
{
"epoch": 2.7517564402810306,
"grad_norm": 0.18996600806713104,
"learning_rate": 3.72343309206179e-07,
"loss": 0.2920222759246826,
"memory(GiB)": 133.21,
"step": 2350,
"token_acc": 0.8913125942460162,
"train_speed(iter/s)": 0.067948
},
{
"epoch": 2.7576112412177984,
"grad_norm": 0.18651233613491058,
"learning_rate": 3.55090533091339e-07,
"loss": 0.2933474063873291,
"memory(GiB)": 133.21,
"step": 2355,
"token_acc": 0.9025793311463176,
"train_speed(iter/s)": 0.067946
},
{
"epoch": 2.7634660421545667,
"grad_norm": 0.17747479677200317,
"learning_rate": 3.382397811641858e-07,
"loss": 0.2873265266418457,
"memory(GiB)": 133.21,
"step": 2360,
"token_acc": 0.8948029740479362,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.769320843091335,
"grad_norm": 0.17965154349803925,
"learning_rate": 3.217917558066241e-07,
"loss": 0.2922650337219238,
"memory(GiB)": 133.21,
"step": 2365,
"token_acc": 0.886650070990299,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.775175644028103,
"grad_norm": 0.17902247607707977,
"learning_rate": 3.057471426138958e-07,
"loss": 0.3062438488006592,
"memory(GiB)": 133.21,
"step": 2370,
"token_acc": 0.8800339720197485,
"train_speed(iter/s)": 0.067946
},
{
"epoch": 2.781030444964871,
"grad_norm": 0.17590953409671783,
"learning_rate": 2.901066103660033e-07,
"loss": 0.29376084804534913,
"memory(GiB)": 133.21,
"step": 2375,
"token_acc": 0.8933526766467255,
"train_speed(iter/s)": 0.067946
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.19265642762184143,
"learning_rate": 2.7487081099983435e-07,
"loss": 0.3061210155487061,
"memory(GiB)": 133.21,
"step": 2380,
"token_acc": 0.8897680154530525,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.7927400468384076,
"grad_norm": 0.18283043801784515,
"learning_rate": 2.6004037958199167e-07,
"loss": 0.2898393154144287,
"memory(GiB)": 133.21,
"step": 2385,
"token_acc": 0.9019573328471696,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.798594847775176,
"grad_norm": 0.18570415675640106,
"learning_rate": 2.4561593428231165e-07,
"loss": 0.29611454010009763,
"memory(GiB)": 133.21,
"step": 2390,
"token_acc": 0.9104894052586534,
"train_speed(iter/s)": 0.067945
},
{
"epoch": 2.8044496487119437,
"grad_norm": 0.18174812197685242,
"learning_rate": 2.3159807634811182e-07,
"loss": 0.28598248958587646,
"memory(GiB)": 133.21,
"step": 2395,
"token_acc": 0.8965343061596744,
"train_speed(iter/s)": 0.067945
},
{
"epoch": 2.810304449648712,
"grad_norm": 0.18527300655841827,
"learning_rate": 2.1798739007911517e-07,
"loss": 0.3005537986755371,
"memory(GiB)": 133.21,
"step": 2400,
"token_acc": 0.887049760238975,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.8161592505854802,
"grad_norm": 0.17769944667816162,
"learning_rate": 2.0478444280310206e-07,
"loss": 0.2945347785949707,
"memory(GiB)": 133.21,
"step": 2405,
"token_acc": 0.8905158466381549,
"train_speed(iter/s)": 0.067944
},
{
"epoch": 2.822014051522248,
"grad_norm": 0.22059805691242218,
"learning_rate": 1.919897848522656e-07,
"loss": 0.2934718132019043,
"memory(GiB)": 133.21,
"step": 2410,
"token_acc": 0.8902092807074844,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.8278688524590163,
"grad_norm": 0.18694834411144257,
"learning_rate": 1.796039495402646e-07,
"loss": 0.2984294414520264,
"memory(GiB)": 133.21,
"step": 2415,
"token_acc": 0.8962242022599117,
"train_speed(iter/s)": 0.067942
},
{
"epoch": 2.8337236533957846,
"grad_norm": 0.18271034955978394,
"learning_rate": 1.6762745313999795e-07,
"loss": 0.3036228895187378,
"memory(GiB)": 133.21,
"step": 2420,
"token_acc": 0.876532044285046,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.839578454332553,
"grad_norm": 0.17917729914188385,
"learning_rate": 1.5606079486208846e-07,
"loss": 0.29344632625579836,
"memory(GiB)": 133.21,
"step": 2425,
"token_acc": 0.8902957800547429,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.845433255269321,
"grad_norm": 0.1874186247587204,
"learning_rate": 1.449044568340663e-07,
"loss": 0.3013723373413086,
"memory(GiB)": 133.21,
"step": 2430,
"token_acc": 0.8944912877684091,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.851288056206089,
"grad_norm": 0.18850503861904144,
"learning_rate": 1.3415890408027932e-07,
"loss": 0.29042725563049315,
"memory(GiB)": 133.21,
"step": 2435,
"token_acc": 0.8900198911125016,
"train_speed(iter/s)": 0.067939
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.1918351948261261,
"learning_rate": 1.2382458450250657e-07,
"loss": 0.30360941886901854,
"memory(GiB)": 133.21,
"step": 2440,
"token_acc": 0.8830958974326918,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.8629976580796255,
"grad_norm": 0.17968802154064178,
"learning_rate": 1.1390192886129304e-07,
"loss": 0.29129633903503416,
"memory(GiB)": 133.21,
"step": 2445,
"token_acc": 0.9028775730901043,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.19108013808727264,
"learning_rate": 1.0439135075798634e-07,
"loss": 0.30793027877807616,
"memory(GiB)": 133.21,
"step": 2450,
"token_acc": 0.8880401770150538,
"train_speed(iter/s)": 0.067942
},
{
"epoch": 2.8747072599531616,
"grad_norm": 0.18335837125778198,
"learning_rate": 9.529324661750494e-08,
"loss": 0.301357364654541,
"memory(GiB)": 133.21,
"step": 2455,
"token_acc": 0.8833441771706613,
"train_speed(iter/s)": 0.067941
},
{
"epoch": 2.88056206088993,
"grad_norm": 0.18961112201213837,
"learning_rate": 8.6607995671808e-08,
"loss": 0.29690849781036377,
"memory(GiB)": 133.21,
"step": 2460,
"token_acc": 0.8882351107925328,
"train_speed(iter/s)": 0.067941
},
{
"epoch": 2.8864168618266977,
"grad_norm": 0.22844481468200684,
"learning_rate": 7.833595994409248e-08,
"loss": 0.2876168727874756,
"memory(GiB)": 133.21,
"step": 2465,
"token_acc": 0.888989677822959,
"train_speed(iter/s)": 0.06794
},
{
"epoch": 2.892271662763466,
"grad_norm": 0.19000564515590668,
"learning_rate": 7.047748423370193e-08,
"loss": 0.3021047353744507,
"memory(GiB)": 133.21,
"step": 2470,
"token_acc": 0.8849937208945455,
"train_speed(iter/s)": 0.067942
},
{
"epoch": 2.898126463700234,
"grad_norm": 0.18740171194076538,
"learning_rate": 6.303289610175233e-08,
"loss": 0.29048540592193606,
"memory(GiB)": 133.21,
"step": 2475,
"token_acc": 0.8921316614420063,
"train_speed(iter/s)": 0.067942
},
{
"epoch": 2.9039812646370025,
"grad_norm": 0.18983155488967896,
"learning_rate": 5.6002505857480906e-08,
"loss": 0.2961090326309204,
"memory(GiB)": 133.21,
"step": 2480,
"token_acc": 0.8925172239013309,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.9098360655737707,
"grad_norm": 0.1831265538930893,
"learning_rate": 4.938660654530969e-08,
"loss": 0.3080202579498291,
"memory(GiB)": 133.21,
"step": 2485,
"token_acc": 0.8858930624281501,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.9156908665105385,
"grad_norm": 0.19748179614543915,
"learning_rate": 4.318547393263317e-08,
"loss": 0.30983719825744627,
"memory(GiB)": 133.21,
"step": 2490,
"token_acc": 0.8897056301087475,
"train_speed(iter/s)": 0.067942
},
{
"epoch": 2.921545667447307,
"grad_norm": 0.18569178879261017,
"learning_rate": 3.739936649832188e-08,
"loss": 0.29312853813171386,
"memory(GiB)": 133.21,
"step": 2495,
"token_acc": 0.8924126241525105,
"train_speed(iter/s)": 0.067945
},
{
"epoch": 2.927400468384075,
"grad_norm": 0.1950037181377411,
"learning_rate": 3.2028525421946563e-08,
"loss": 0.2936956167221069,
"memory(GiB)": 133.21,
"step": 2500,
"token_acc": 0.9020274516704794,
"train_speed(iter/s)": 0.067945
},
{
"epoch": 2.933255269320843,
"grad_norm": 0.18167735636234283,
"learning_rate": 2.70731745737296e-08,
"loss": 0.2973939418792725,
"memory(GiB)": 133.21,
"step": 2505,
"token_acc": 0.8934657981473672,
"train_speed(iter/s)": 0.067946
},
{
"epoch": 2.939110070257611,
"grad_norm": 0.18503886461257935,
"learning_rate": 2.2533520505211294e-08,
"loss": 0.29192218780517576,
"memory(GiB)": 133.21,
"step": 2510,
"token_acc": 0.8951111388611389,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.9449648711943794,
"grad_norm": 0.17936980724334717,
"learning_rate": 1.8409752440639027e-08,
"loss": 0.28421769142150877,
"memory(GiB)": 133.21,
"step": 2515,
"token_acc": 0.8924425595173032,
"train_speed(iter/s)": 0.067947
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.18841403722763062,
"learning_rate": 1.470204226908134e-08,
"loss": 0.30081515312194823,
"memory(GiB)": 133.21,
"step": 2520,
"token_acc": 0.894779086363537,
"train_speed(iter/s)": 0.067944
},
{
"epoch": 2.9566744730679155,
"grad_norm": 0.19020894169807434,
"learning_rate": 1.1410544537263645e-08,
"loss": 0.3081362247467041,
"memory(GiB)": 133.21,
"step": 2525,
"token_acc": 0.8934638595786859,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.962529274004684,
"grad_norm": 0.24749897420406342,
"learning_rate": 8.535396443124511e-09,
"loss": 0.2878671884536743,
"memory(GiB)": 133.21,
"step": 2530,
"token_acc": 0.8913681995528473,
"train_speed(iter/s)": 0.067944
},
{
"epoch": 2.968384074941452,
"grad_norm": 0.17989581823349,
"learning_rate": 6.076717830098e-09,
"loss": 0.2899226903915405,
"memory(GiB)": 133.21,
"step": 2535,
"token_acc": 0.8996739041991876,
"train_speed(iter/s)": 0.067943
},
{
"epoch": 2.9742388758782203,
"grad_norm": 0.18506699800491333,
"learning_rate": 4.034611182121007e-09,
"loss": 0.2908132553100586,
"memory(GiB)": 133.21,
"step": 2540,
"token_acc": 0.8988520352276212,
"train_speed(iter/s)": 0.067941
},
{
"epoch": 2.980093676814988,
"grad_norm": 0.18510298430919647,
"learning_rate": 2.40916161935445e-09,
"loss": 0.29580187797546387,
"memory(GiB)": 133.21,
"step": 2545,
"token_acc": 0.8895340031302065,
"train_speed(iter/s)": 0.067941
},
{
"epoch": 2.9859484777517564,
"grad_norm": 0.18303260207176208,
"learning_rate": 1.2004368946427758e-09,
"loss": 0.2922369956970215,
"memory(GiB)": 133.21,
"step": 2550,
"token_acc": 0.8920757330143692,
"train_speed(iter/s)": 0.067941
},
{
"epoch": 2.9918032786885247,
"grad_norm": 0.1823214441537857,
"learning_rate": 4.084873906851083e-10,
"loss": 0.29749574661254885,
"memory(GiB)": 133.21,
"step": 2555,
"token_acc": 0.8995572920769461,
"train_speed(iter/s)": 0.067939
},
{
"epoch": 2.9976580796252925,
"grad_norm": 0.17787551879882812,
"learning_rate": 3.334611793692766e-11,
"loss": 0.29738173484802244,
"memory(GiB)": 133.21,
"step": 2560,
"token_acc": 0.903360959533883,
"train_speed(iter/s)": 0.067939
}
],
"logging_steps": 5,
"max_steps": 2562,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2476392970944512.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}