{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 2562,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00117096018735363,
"grad_norm": 2.397789478302002,
"learning_rate": 1.5503875968992249e-07,
"loss": 0.5513913631439209,
"memory(GiB)": 137.67,
"step": 1,
"token_acc": 0.8478124608248715,
"train_speed(iter/s)": 0.014244
},
{
"epoch": 0.00585480093676815,
"grad_norm": 2.213494300842285,
"learning_rate": 7.751937984496125e-07,
"loss": 0.5191692113876343,
"memory(GiB)": 137.67,
"step": 5,
"token_acc": 0.848514893999071,
"train_speed(iter/s)": 0.029361
},
{
"epoch": 0.0117096018735363,
"grad_norm": 2.0672056674957275,
"learning_rate": 1.550387596899225e-06,
"loss": 0.5157936096191407,
"memory(GiB)": 137.67,
"step": 10,
"token_acc": 0.8392344826938901,
"train_speed(iter/s)": 0.034106
},
{
"epoch": 0.01756440281030445,
"grad_norm": 1.588051676750183,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.49305076599121095,
"memory(GiB)": 137.67,
"step": 15,
"token_acc": 0.8437633920693741,
"train_speed(iter/s)": 0.03643
},
{
"epoch": 0.0234192037470726,
"grad_norm": 0.7405409812927246,
"learning_rate": 3.10077519379845e-06,
"loss": 0.43950672149658204,
"memory(GiB)": 137.67,
"step": 20,
"token_acc": 0.848157187048235,
"train_speed(iter/s)": 0.03757
},
{
"epoch": 0.02927400468384075,
"grad_norm": 0.8562428951263428,
"learning_rate": 3.875968992248063e-06,
"loss": 0.4227635383605957,
"memory(GiB)": 137.67,
"step": 25,
"token_acc": 0.8593663993232968,
"train_speed(iter/s)": 0.038283
},
{
"epoch": 0.0351288056206089,
"grad_norm": 0.4966309666633606,
"learning_rate": 4.651162790697675e-06,
"loss": 0.4113954544067383,
"memory(GiB)": 137.67,
"step": 30,
"token_acc": 0.8579081152325363,
"train_speed(iter/s)": 0.038822
},
{
"epoch": 0.040983606557377046,
"grad_norm": 0.4413171410560608,
"learning_rate": 5.4263565891472865e-06,
"loss": 0.40917291641235354,
"memory(GiB)": 137.67,
"step": 35,
"token_acc": 0.8563618960945223,
"train_speed(iter/s)": 0.039192
},
{
"epoch": 0.0468384074941452,
"grad_norm": 0.37367990612983704,
"learning_rate": 6.2015503875969e-06,
"loss": 0.38341727256774905,
"memory(GiB)": 137.67,
"step": 40,
"token_acc": 0.8598059924304837,
"train_speed(iter/s)": 0.039486
},
{
"epoch": 0.05269320843091335,
"grad_norm": 0.2625274062156677,
"learning_rate": 6.976744186046513e-06,
"loss": 0.39299936294555665,
"memory(GiB)": 137.67,
"step": 45,
"token_acc": 0.8545384055298668,
"train_speed(iter/s)": 0.03968
},
{
"epoch": 0.0585480093676815,
"grad_norm": 0.27871787548065186,
"learning_rate": 7.751937984496126e-06,
"loss": 0.38351633548736574,
"memory(GiB)": 137.67,
"step": 50,
"token_acc": 0.8680353205073448,
"train_speed(iter/s)": 0.039861
},
{
"epoch": 0.06440281030444965,
"grad_norm": 0.2245069444179535,
"learning_rate": 8.527131782945736e-06,
"loss": 0.3764484882354736,
"memory(GiB)": 137.67,
"step": 55,
"token_acc": 0.8676952168658857,
"train_speed(iter/s)": 0.040018
},
{
"epoch": 0.0702576112412178,
"grad_norm": 0.22919970750808716,
"learning_rate": 9.30232558139535e-06,
"loss": 0.3956867218017578,
"memory(GiB)": 137.67,
"step": 60,
"token_acc": 0.865152491108186,
"train_speed(iter/s)": 0.040146
},
{
"epoch": 0.07611241217798595,
"grad_norm": 0.21093736588954926,
"learning_rate": 1.0077519379844963e-05,
"loss": 0.37714409828186035,
"memory(GiB)": 137.67,
"step": 65,
"token_acc": 0.8760504070619795,
"train_speed(iter/s)": 0.040253
},
{
"epoch": 0.08196721311475409,
"grad_norm": 0.21410879492759705,
"learning_rate": 1.0852713178294573e-05,
"loss": 0.3757580995559692,
"memory(GiB)": 137.67,
"step": 70,
"token_acc": 0.8649565195567881,
"train_speed(iter/s)": 0.040315
},
{
"epoch": 0.08782201405152225,
"grad_norm": 0.1979837864637375,
"learning_rate": 1.1627906976744187e-05,
"loss": 0.37558441162109374,
"memory(GiB)": 137.67,
"step": 75,
"token_acc": 0.8532517495556191,
"train_speed(iter/s)": 0.040405
},
{
"epoch": 0.0936768149882904,
"grad_norm": 0.207350954413414,
"learning_rate": 1.24031007751938e-05,
"loss": 0.3741091966629028,
"memory(GiB)": 137.67,
"step": 80,
"token_acc": 0.8612590246358096,
"train_speed(iter/s)": 0.040461
},
{
"epoch": 0.09953161592505855,
"grad_norm": 0.19452251493930817,
"learning_rate": 1.3178294573643412e-05,
"loss": 0.3656472682952881,
"memory(GiB)": 137.67,
"step": 85,
"token_acc": 0.8822223551750307,
"train_speed(iter/s)": 0.040557
},
{
"epoch": 0.1053864168618267,
"grad_norm": 0.20653362572193146,
"learning_rate": 1.3953488372093025e-05,
"loss": 0.3706169605255127,
"memory(GiB)": 137.67,
"step": 90,
"token_acc": 0.8654753188641241,
"train_speed(iter/s)": 0.04063
},
{
"epoch": 0.11124121779859485,
"grad_norm": 0.20383736491203308,
"learning_rate": 1.4728682170542636e-05,
"loss": 0.3718616485595703,
"memory(GiB)": 137.67,
"step": 95,
"token_acc": 0.8700523810121971,
"train_speed(iter/s)": 0.040694
},
{
"epoch": 0.117096018735363,
"grad_norm": 0.2144174873828888,
"learning_rate": 1.550387596899225e-05,
"loss": 0.3716637134552002,
"memory(GiB)": 137.67,
"step": 100,
"token_acc": 0.871046915998142,
"train_speed(iter/s)": 0.040754
},
{
"epoch": 0.12295081967213115,
"grad_norm": 0.2225562483072281,
"learning_rate": 1.6279069767441862e-05,
"loss": 0.3682845115661621,
"memory(GiB)": 137.67,
"step": 105,
"token_acc": 0.8729440672893664,
"train_speed(iter/s)": 0.040816
},
{
"epoch": 0.1288056206088993,
"grad_norm": 0.2207648605108261,
"learning_rate": 1.7054263565891473e-05,
"loss": 0.3570878982543945,
"memory(GiB)": 137.67,
"step": 110,
"token_acc": 0.8706495975584588,
"train_speed(iter/s)": 0.04088
},
{
"epoch": 0.13466042154566746,
"grad_norm": 0.2282887101173401,
"learning_rate": 1.7829457364341087e-05,
"loss": 0.3752657175064087,
"memory(GiB)": 137.67,
"step": 115,
"token_acc": 0.8784262063618629,
"train_speed(iter/s)": 0.040925
},
{
"epoch": 0.1405152224824356,
"grad_norm": 0.23532657325267792,
"learning_rate": 1.86046511627907e-05,
"loss": 0.3657325029373169,
"memory(GiB)": 137.67,
"step": 120,
"token_acc": 0.8712829028328604,
"train_speed(iter/s)": 0.040965
},
{
"epoch": 0.14637002341920374,
"grad_norm": 0.2132922112941742,
"learning_rate": 1.937984496124031e-05,
"loss": 0.3799854278564453,
"memory(GiB)": 137.67,
"step": 125,
"token_acc": 0.8649469651038509,
"train_speed(iter/s)": 0.041003
},
{
"epoch": 0.1522248243559719,
"grad_norm": 0.2445414662361145,
"learning_rate": 1.9999991663467044e-05,
"loss": 0.3770766258239746,
"memory(GiB)": 137.67,
"step": 130,
"token_acc": 0.8692484710531911,
"train_speed(iter/s)": 0.041036
},
{
"epoch": 0.15807962529274006,
"grad_norm": 0.2305486649274826,
"learning_rate": 1.9999699886272926e-05,
"loss": 0.3788888931274414,
"memory(GiB)": 137.67,
"step": 135,
"token_acc": 0.8571357490266324,
"train_speed(iter/s)": 0.041054
},
{
"epoch": 0.16393442622950818,
"grad_norm": 0.2297585904598236,
"learning_rate": 1.9998991296330317e-05,
"loss": 0.3768150806427002,
"memory(GiB)": 137.67,
"step": 140,
"token_acc": 0.8707652096887886,
"train_speed(iter/s)": 0.04107
},
{
"epoch": 0.16978922716627634,
"grad_norm": 0.22929546236991882,
"learning_rate": 1.9997865923175027e-05,
"loss": 0.3672610282897949,
"memory(GiB)": 137.67,
"step": 145,
"token_acc": 0.8764070583454463,
"train_speed(iter/s)": 0.041074
},
{
"epoch": 0.1756440281030445,
"grad_norm": 0.2531713843345642,
"learning_rate": 1.999632381371545e-05,
"loss": 0.3735011577606201,
"memory(GiB)": 137.67,
"step": 150,
"token_acc": 0.8610904473031397,
"train_speed(iter/s)": 0.041095
},
{
"epoch": 0.18149882903981265,
"grad_norm": 0.21190133690834045,
"learning_rate": 1.999436503223061e-05,
"loss": 0.37088618278503416,
"memory(GiB)": 137.67,
"step": 155,
"token_acc": 0.869811065319577,
"train_speed(iter/s)": 0.0411
},
{
"epoch": 0.1873536299765808,
"grad_norm": 0.24962091445922852,
"learning_rate": 1.9991989660367463e-05,
"loss": 0.3776357650756836,
"memory(GiB)": 137.67,
"step": 160,
"token_acc": 0.8544295113661168,
"train_speed(iter/s)": 0.041107
},
{
"epoch": 0.19320843091334894,
"grad_norm": 0.20956465601921082,
"learning_rate": 1.998919779713751e-05,
"loss": 0.3805836200714111,
"memory(GiB)": 137.67,
"step": 165,
"token_acc": 0.8613002884067936,
"train_speed(iter/s)": 0.041115
},
{
"epoch": 0.1990632318501171,
"grad_norm": 0.206803560256958,
"learning_rate": 1.998598955891266e-05,
"loss": 0.3702584505081177,
"memory(GiB)": 137.67,
"step": 170,
"token_acc": 0.8749547416575101,
"train_speed(iter/s)": 0.04113
},
{
"epoch": 0.20491803278688525,
"grad_norm": 0.23116904497146606,
"learning_rate": 1.9982365079420382e-05,
"loss": 0.3598947048187256,
"memory(GiB)": 137.67,
"step": 175,
"token_acc": 0.8684363191646153,
"train_speed(iter/s)": 0.041153
},
{
"epoch": 0.2107728337236534,
"grad_norm": 0.22105969488620758,
"learning_rate": 1.9978324509738147e-05,
"loss": 0.36261582374572754,
"memory(GiB)": 137.67,
"step": 180,
"token_acc": 0.8722339081558761,
"train_speed(iter/s)": 0.041173
},
{
"epoch": 0.21662763466042154,
"grad_norm": 0.21819841861724854,
"learning_rate": 1.9973868018287093e-05,
"loss": 0.3629172325134277,
"memory(GiB)": 137.67,
"step": 185,
"token_acc": 0.8667994850156469,
"train_speed(iter/s)": 0.041195
},
{
"epoch": 0.2224824355971897,
"grad_norm": 0.2083064317703247,
"learning_rate": 1.9968995790825048e-05,
"loss": 0.3675278902053833,
"memory(GiB)": 137.67,
"step": 190,
"token_acc": 0.8575012434717731,
"train_speed(iter/s)": 0.0412
},
{
"epoch": 0.22833723653395785,
"grad_norm": 0.21168376505374908,
"learning_rate": 1.9963708030438754e-05,
"loss": 0.3663478374481201,
"memory(GiB)": 137.67,
"step": 195,
"token_acc": 0.8699046566256736,
"train_speed(iter/s)": 0.041213
},
{
"epoch": 0.234192037470726,
"grad_norm": 0.21624095737934113,
"learning_rate": 1.995800495753542e-05,
"loss": 0.36658034324645994,
"memory(GiB)": 137.67,
"step": 200,
"token_acc": 0.8611760598068374,
"train_speed(iter/s)": 0.041221
},
{
"epoch": 0.24004683840749413,
"grad_norm": 0.21765926480293274,
"learning_rate": 1.9951886809833537e-05,
"loss": 0.37610225677490233,
"memory(GiB)": 137.67,
"step": 205,
"token_acc": 0.8608684017275929,
"train_speed(iter/s)": 0.041233
},
{
"epoch": 0.2459016393442623,
"grad_norm": 0.21804192662239075,
"learning_rate": 1.9945353842352943e-05,
"loss": 0.37209372520446776,
"memory(GiB)": 137.67,
"step": 210,
"token_acc": 0.8637638606903014,
"train_speed(iter/s)": 0.041242
},
{
"epoch": 0.25175644028103045,
"grad_norm": 0.21353310346603394,
"learning_rate": 1.9938406327404233e-05,
"loss": 0.36923999786376954,
"memory(GiB)": 137.67,
"step": 215,
"token_acc": 0.8725016214590311,
"train_speed(iter/s)": 0.041259
},
{
"epoch": 0.2576112412177986,
"grad_norm": 0.21438100934028625,
"learning_rate": 1.9931044554577373e-05,
"loss": 0.36598026752471924,
"memory(GiB)": 137.67,
"step": 220,
"token_acc": 0.8663032304289586,
"train_speed(iter/s)": 0.041275
},
{
"epoch": 0.26346604215456676,
"grad_norm": 0.21610133349895477,
"learning_rate": 1.992326883072965e-05,
"loss": 0.36849284172058105,
"memory(GiB)": 137.67,
"step": 225,
"token_acc": 0.8614589650451081,
"train_speed(iter/s)": 0.041281
},
{
"epoch": 0.2693208430913349,
"grad_norm": 0.2203439474105835,
"learning_rate": 1.991507947997287e-05,
"loss": 0.3765848636627197,
"memory(GiB)": 137.67,
"step": 230,
"token_acc": 0.8680725737864995,
"train_speed(iter/s)": 0.041291
},
{
"epoch": 0.275175644028103,
"grad_norm": 0.22208204865455627,
"learning_rate": 1.9906476843659866e-05,
"loss": 0.3718143939971924,
"memory(GiB)": 137.67,
"step": 235,
"token_acc": 0.8758277835099897,
"train_speed(iter/s)": 0.041301
},
{
"epoch": 0.2810304449648712,
"grad_norm": 0.20069433748722076,
"learning_rate": 1.989746128037024e-05,
"loss": 0.3583400249481201,
"memory(GiB)": 137.67,
"step": 240,
"token_acc": 0.8676873362719415,
"train_speed(iter/s)": 0.04131
},
{
"epoch": 0.28688524590163933,
"grad_norm": 0.19968946278095245,
"learning_rate": 1.988803316589545e-05,
"loss": 0.3672914505004883,
"memory(GiB)": 137.67,
"step": 245,
"token_acc": 0.8662484056672067,
"train_speed(iter/s)": 0.041328
},
{
"epoch": 0.2927400468384075,
"grad_norm": 0.21298536658287048,
"learning_rate": 1.987819289322311e-05,
"loss": 0.3696786403656006,
"memory(GiB)": 137.67,
"step": 250,
"token_acc": 0.8654257420775034,
"train_speed(iter/s)": 0.041348
},
{
"epoch": 0.29859484777517564,
"grad_norm": 0.2145387828350067,
"learning_rate": 1.9867940872520646e-05,
"loss": 0.3744542598724365,
"memory(GiB)": 137.67,
"step": 255,
"token_acc": 0.8661229081704401,
"train_speed(iter/s)": 0.041346
},
{
"epoch": 0.3044496487119438,
"grad_norm": 0.2132762223482132,
"learning_rate": 1.9857277531118173e-05,
"loss": 0.36826577186584475,
"memory(GiB)": 137.67,
"step": 260,
"token_acc": 0.8788229158157335,
"train_speed(iter/s)": 0.041353
},
{
"epoch": 0.31030444964871196,
"grad_norm": 0.2133207470178604,
"learning_rate": 1.9846203313490697e-05,
"loss": 0.35997600555419923,
"memory(GiB)": 137.67,
"step": 265,
"token_acc": 0.8834285319525085,
"train_speed(iter/s)": 0.041363
},
{
"epoch": 0.3161592505854801,
"grad_norm": 0.23535007238388062,
"learning_rate": 1.983471868123958e-05,
"loss": 0.3588090896606445,
"memory(GiB)": 137.67,
"step": 270,
"token_acc": 0.8657706943523579,
"train_speed(iter/s)": 0.041379
},
{
"epoch": 0.32201405152224827,
"grad_norm": 0.21440958976745605,
"learning_rate": 1.98228241130733e-05,
"loss": 0.38217363357543943,
"memory(GiB)": 137.67,
"step": 275,
"token_acc": 0.8693404501511701,
"train_speed(iter/s)": 0.041386
},
{
"epoch": 0.32786885245901637,
"grad_norm": 0.21196675300598145,
"learning_rate": 1.98105201047875e-05,
"loss": 0.35698800086975097,
"memory(GiB)": 137.67,
"step": 280,
"token_acc": 0.8743185598247525,
"train_speed(iter/s)": 0.041403
},
{
"epoch": 0.3337236533957845,
"grad_norm": 0.22762241959571838,
"learning_rate": 1.9797807169244326e-05,
"loss": 0.3626487016677856,
"memory(GiB)": 137.67,
"step": 285,
"token_acc": 0.8661923737202862,
"train_speed(iter/s)": 0.041406
},
{
"epoch": 0.3395784543325527,
"grad_norm": 0.21537438035011292,
"learning_rate": 1.9784685836351045e-05,
"loss": 0.37597248554229734,
"memory(GiB)": 137.67,
"step": 290,
"token_acc": 0.8632790864113016,
"train_speed(iter/s)": 0.041408
},
{
"epoch": 0.34543325526932084,
"grad_norm": 0.24162794649600983,
"learning_rate": 1.9771156653037944e-05,
"loss": 0.3674392461776733,
"memory(GiB)": 137.67,
"step": 295,
"token_acc": 0.86579905677273,
"train_speed(iter/s)": 0.041418
},
{
"epoch": 0.351288056206089,
"grad_norm": 0.19127634167671204,
"learning_rate": 1.975722018323556e-05,
"loss": 0.3606871604919434,
"memory(GiB)": 137.67,
"step": 300,
"token_acc": 0.8730913571244476,
"train_speed(iter/s)": 0.041416
},
{
"epoch": 0.35714285714285715,
"grad_norm": 0.21248631179332733,
"learning_rate": 1.974287700785116e-05,
"loss": 0.3568113327026367,
"memory(GiB)": 137.67,
"step": 305,
"token_acc": 0.8697051358380598,
"train_speed(iter/s)": 0.041425
},
{
"epoch": 0.3629976580796253,
"grad_norm": 0.20225107669830322,
"learning_rate": 1.9728127724744516e-05,
"loss": 0.3483549118041992,
"memory(GiB)": 137.67,
"step": 310,
"token_acc": 0.8697423969369493,
"train_speed(iter/s)": 0.041425
},
{
"epoch": 0.36885245901639346,
"grad_norm": 0.2230818122625351,
"learning_rate": 1.9712972948703006e-05,
"loss": 0.36976261138916017,
"memory(GiB)": 137.67,
"step": 315,
"token_acc": 0.8751112598082228,
"train_speed(iter/s)": 0.04143
},
{
"epoch": 0.3747072599531616,
"grad_norm": 0.1945132613182068,
"learning_rate": 1.9697413311415967e-05,
"loss": 0.364810585975647,
"memory(GiB)": 137.67,
"step": 320,
"token_acc": 0.8484778468167483,
"train_speed(iter/s)": 0.041435
},
{
"epoch": 0.3805620608899297,
"grad_norm": 0.19989554584026337,
"learning_rate": 1.9681449461448386e-05,
"loss": 0.3616858959197998,
"memory(GiB)": 137.67,
"step": 325,
"token_acc": 0.8718356506795814,
"train_speed(iter/s)": 0.041435
},
{
"epoch": 0.3864168618266979,
"grad_norm": 0.2084866315126419,
"learning_rate": 1.9665082064213856e-05,
"loss": 0.36598567962646483,
"memory(GiB)": 137.67,
"step": 330,
"token_acc": 0.8664227187552337,
"train_speed(iter/s)": 0.041441
},
{
"epoch": 0.39227166276346603,
"grad_norm": 0.20807960629463196,
"learning_rate": 1.9648311801946823e-05,
"loss": 0.3633120059967041,
"memory(GiB)": 137.67,
"step": 335,
"token_acc": 0.8659399461174416,
"train_speed(iter/s)": 0.041448
},
{
"epoch": 0.3981264637002342,
"grad_norm": 0.21306882798671722,
"learning_rate": 1.9631139373674188e-05,
"loss": 0.36129164695739746,
"memory(GiB)": 137.67,
"step": 340,
"token_acc": 0.8666773452933952,
"train_speed(iter/s)": 0.04145
},
{
"epoch": 0.40398126463700235,
"grad_norm": 0.21947889029979706,
"learning_rate": 1.9613565495186126e-05,
"loss": 0.35186495780944826,
"memory(GiB)": 137.67,
"step": 345,
"token_acc": 0.8666396689403815,
"train_speed(iter/s)": 0.041463
},
{
"epoch": 0.4098360655737705,
"grad_norm": 0.2155865728855133,
"learning_rate": 1.9595590899006288e-05,
"loss": 0.3684532880783081,
"memory(GiB)": 137.67,
"step": 350,
"token_acc": 0.8713802951875973,
"train_speed(iter/s)": 0.041462
},
{
"epoch": 0.41569086651053866,
"grad_norm": 0.2150585651397705,
"learning_rate": 1.957721633436124e-05,
"loss": 0.3669363260269165,
"memory(GiB)": 137.67,
"step": 355,
"token_acc": 0.8683417743625568,
"train_speed(iter/s)": 0.041459
},
{
"epoch": 0.4215456674473068,
"grad_norm": 0.22773627936840057,
"learning_rate": 1.9558442567149244e-05,
"loss": 0.36423306465148925,
"memory(GiB)": 137.67,
"step": 360,
"token_acc": 0.8815313637998826,
"train_speed(iter/s)": 0.041467
},
{
"epoch": 0.4274004683840749,
"grad_norm": 0.19997937977313995,
"learning_rate": 1.953927037990834e-05,
"loss": 0.3707897186279297,
"memory(GiB)": 137.67,
"step": 365,
"token_acc": 0.8580402286389447,
"train_speed(iter/s)": 0.041471
},
{
"epoch": 0.4332552693208431,
"grad_norm": 0.21174229681491852,
"learning_rate": 1.9519700571783718e-05,
"loss": 0.3715445280075073,
"memory(GiB)": 137.67,
"step": 370,
"token_acc": 0.873243385426675,
"train_speed(iter/s)": 0.041468
},
{
"epoch": 0.43911007025761123,
"grad_norm": 0.2164727747440338,
"learning_rate": 1.9499733958494405e-05,
"loss": 0.36826701164245607,
"memory(GiB)": 137.67,
"step": 375,
"token_acc": 0.8624453058192736,
"train_speed(iter/s)": 0.041471
},
{
"epoch": 0.4449648711943794,
"grad_norm": 0.2175064980983734,
"learning_rate": 1.947937137229928e-05,
"loss": 0.3610344648361206,
"memory(GiB)": 137.67,
"step": 380,
"token_acc": 0.8791143721842437,
"train_speed(iter/s)": 0.041474
},
{
"epoch": 0.45081967213114754,
"grad_norm": 0.21257779002189636,
"learning_rate": 1.9458613661962366e-05,
"loss": 0.36273534297943116,
"memory(GiB)": 137.67,
"step": 385,
"token_acc": 0.8811885856547406,
"train_speed(iter/s)": 0.041479
},
{
"epoch": 0.4566744730679157,
"grad_norm": 0.2007063329219818,
"learning_rate": 1.943746169271746e-05,
"loss": 0.36213395595550535,
"memory(GiB)": 137.67,
"step": 390,
"token_acc": 0.8793212957081934,
"train_speed(iter/s)": 0.041474
},
{
"epoch": 0.46252927400468385,
"grad_norm": 0.1982836127281189,
"learning_rate": 1.941591634623206e-05,
"loss": 0.3674773693084717,
"memory(GiB)": 137.67,
"step": 395,
"token_acc": 0.8714787014744528,
"train_speed(iter/s)": 0.04148
},
{
"epoch": 0.468384074941452,
"grad_norm": 0.21029749512672424,
"learning_rate": 1.9393978520570638e-05,
"loss": 0.35383853912353513,
"memory(GiB)": 137.67,
"step": 400,
"token_acc": 0.8725135029354207,
"train_speed(iter/s)": 0.041493
},
{
"epoch": 0.47423887587822017,
"grad_norm": 0.2057942897081375,
"learning_rate": 1.9371649130157166e-05,
"loss": 0.35016608238220215,
"memory(GiB)": 137.67,
"step": 405,
"token_acc": 0.8716170696781026,
"train_speed(iter/s)": 0.041495
},
{
"epoch": 0.48009367681498827,
"grad_norm": 0.21962089836597443,
"learning_rate": 1.9348929105737044e-05,
"loss": 0.3551772117614746,
"memory(GiB)": 137.67,
"step": 410,
"token_acc": 0.8725112535977174,
"train_speed(iter/s)": 0.041495
},
{
"epoch": 0.4859484777517564,
"grad_norm": 0.22210708260536194,
"learning_rate": 1.932581939433827e-05,
"loss": 0.3688118696212769,
"memory(GiB)": 137.67,
"step": 415,
"token_acc": 0.8727626971050538,
"train_speed(iter/s)": 0.041496
},
{
"epoch": 0.4918032786885246,
"grad_norm": 0.21538780629634857,
"learning_rate": 1.9302320959231997e-05,
"loss": 0.3600668430328369,
"memory(GiB)": 137.67,
"step": 420,
"token_acc": 0.87065663645922,
"train_speed(iter/s)": 0.041499
},
{
"epoch": 0.49765807962529274,
"grad_norm": 0.19987384974956512,
"learning_rate": 1.927843477989234e-05,
"loss": 0.3570875644683838,
"memory(GiB)": 137.67,
"step": 425,
"token_acc": 0.8845410461012411,
"train_speed(iter/s)": 0.041501
},
{
"epoch": 0.5035128805620609,
"grad_norm": 0.20627401769161224,
"learning_rate": 1.9254161851955587e-05,
"loss": 0.36909596920013427,
"memory(GiB)": 137.67,
"step": 430,
"token_acc": 0.8750783836660981,
"train_speed(iter/s)": 0.041507
},
{
"epoch": 0.509367681498829,
"grad_norm": 0.22353969514369965,
"learning_rate": 1.9229503187178694e-05,
"loss": 0.36271133422851565,
"memory(GiB)": 137.67,
"step": 435,
"token_acc": 0.8696993866195712,
"train_speed(iter/s)": 0.04151
},
{
"epoch": 0.5152224824355972,
"grad_norm": 0.20142175257205963,
"learning_rate": 1.920445981339708e-05,
"loss": 0.3614756345748901,
"memory(GiB)": 137.67,
"step": 440,
"token_acc": 0.8678934891256075,
"train_speed(iter/s)": 0.041514
},
{
"epoch": 0.5210772833723654,
"grad_norm": 0.2189430445432663,
"learning_rate": 1.9179032774481822e-05,
"loss": 0.3589394330978394,
"memory(GiB)": 137.67,
"step": 445,
"token_acc": 0.8754360673743595,
"train_speed(iter/s)": 0.04152
},
{
"epoch": 0.5269320843091335,
"grad_norm": 0.20788422226905823,
"learning_rate": 1.9153223130296125e-05,
"loss": 0.3571774005889893,
"memory(GiB)": 137.67,
"step": 450,
"token_acc": 0.8775248547087467,
"train_speed(iter/s)": 0.041526
},
{
"epoch": 0.5327868852459017,
"grad_norm": 0.19941285252571106,
"learning_rate": 1.9127031956651153e-05,
"loss": 0.36058688163757324,
"memory(GiB)": 137.67,
"step": 455,
"token_acc": 0.8748390868215994,
"train_speed(iter/s)": 0.041528
},
{
"epoch": 0.5386416861826698,
"grad_norm": 0.20794501900672913,
"learning_rate": 1.9100460345261175e-05,
"loss": 0.37292046546936036,
"memory(GiB)": 137.67,
"step": 460,
"token_acc": 0.8686192757401499,
"train_speed(iter/s)": 0.04152
},
{
"epoch": 0.544496487119438,
"grad_norm": 0.21598728001117706,
"learning_rate": 1.9073509403698062e-05,
"loss": 0.3684291124343872,
"memory(GiB)": 137.67,
"step": 465,
"token_acc": 0.8756676919995869,
"train_speed(iter/s)": 0.041523
},
{
"epoch": 0.550351288056206,
"grad_norm": 0.21292956173419952,
"learning_rate": 1.9046180255345142e-05,
"loss": 0.3640902042388916,
"memory(GiB)": 137.67,
"step": 470,
"token_acc": 0.8750558298801518,
"train_speed(iter/s)": 0.041525
},
{
"epoch": 0.5562060889929742,
"grad_norm": 0.21117296814918518,
"learning_rate": 1.9018474039350342e-05,
"loss": 0.3569709062576294,
"memory(GiB)": 137.67,
"step": 475,
"token_acc": 0.8744779663053135,
"train_speed(iter/s)": 0.041525
},
{
"epoch": 0.5620608899297423,
"grad_norm": 0.20366835594177246,
"learning_rate": 1.899039191057872e-05,
"loss": 0.35825061798095703,
"memory(GiB)": 137.67,
"step": 480,
"token_acc": 0.8689726123486041,
"train_speed(iter/s)": 0.041527
},
{
"epoch": 0.5679156908665105,
"grad_norm": 0.1856691688299179,
"learning_rate": 1.8961935039564338e-05,
"loss": 0.35746235847473146,
"memory(GiB)": 137.67,
"step": 485,
"token_acc": 0.8688354549740689,
"train_speed(iter/s)": 0.041532
},
{
"epoch": 0.5737704918032787,
"grad_norm": 0.23608598113059998,
"learning_rate": 1.8933104612461454e-05,
"loss": 0.35999622344970705,
"memory(GiB)": 137.67,
"step": 490,
"token_acc": 0.8696445021552469,
"train_speed(iter/s)": 0.041533
},
{
"epoch": 0.5796252927400468,
"grad_norm": 0.2125530242919922,
"learning_rate": 1.8903901830995093e-05,
"loss": 0.3631314754486084,
"memory(GiB)": 137.67,
"step": 495,
"token_acc": 0.8666599882919743,
"train_speed(iter/s)": 0.041531
},
{
"epoch": 0.585480093676815,
"grad_norm": 0.20335227251052856,
"learning_rate": 1.8874327912410945e-05,
"loss": 0.37455101013183595,
"memory(GiB)": 137.67,
"step": 500,
"token_acc": 0.8691201544556442,
"train_speed(iter/s)": 0.041538
},
{
"epoch": 0.5913348946135831,
"grad_norm": 0.2046995759010315,
"learning_rate": 1.884438408942463e-05,
"loss": 0.361937952041626,
"memory(GiB)": 137.67,
"step": 505,
"token_acc": 0.8581575277197544,
"train_speed(iter/s)": 0.041539
},
{
"epoch": 0.5971896955503513,
"grad_norm": 0.17991533875465393,
"learning_rate": 1.881407161017033e-05,
"loss": 0.35659379959106446,
"memory(GiB)": 137.67,
"step": 510,
"token_acc": 0.8789336760280843,
"train_speed(iter/s)": 0.041545
},
{
"epoch": 0.6030444964871194,
"grad_norm": 0.24344618618488312,
"learning_rate": 1.8783391738148738e-05,
"loss": 0.35185072422027586,
"memory(GiB)": 137.67,
"step": 515,
"token_acc": 0.8730951113338136,
"train_speed(iter/s)": 0.04155
},
{
"epoch": 0.6088992974238876,
"grad_norm": 0.21754887700080872,
"learning_rate": 1.875234575217441e-05,
"loss": 0.3508215665817261,
"memory(GiB)": 137.67,
"step": 520,
"token_acc": 0.872153412139793,
"train_speed(iter/s)": 0.041554
},
{
"epoch": 0.6147540983606558,
"grad_norm": 0.18687933683395386,
"learning_rate": 1.8720934946322466e-05,
"loss": 0.3653162240982056,
"memory(GiB)": 137.67,
"step": 525,
"token_acc": 0.8658395285187296,
"train_speed(iter/s)": 0.041556
},
{
"epoch": 0.6206088992974239,
"grad_norm": 0.1791500300168991,
"learning_rate": 1.8689160629874622e-05,
"loss": 0.3357256889343262,
"memory(GiB)": 137.67,
"step": 530,
"token_acc": 0.8864503516899346,
"train_speed(iter/s)": 0.041553
},
{
"epoch": 0.6264637002341921,
"grad_norm": 0.18553608655929565,
"learning_rate": 1.865702412726465e-05,
"loss": 0.34752044677734373,
"memory(GiB)": 137.67,
"step": 535,
"token_acc": 0.882398003852215,
"train_speed(iter/s)": 0.041558
},
{
"epoch": 0.6323185011709602,
"grad_norm": 0.19252535700798035,
"learning_rate": 1.8624526778023142e-05,
"loss": 0.3493391513824463,
"memory(GiB)": 137.67,
"step": 540,
"token_acc": 0.8799156751797872,
"train_speed(iter/s)": 0.04156
},
{
"epoch": 0.6381733021077284,
"grad_norm": 0.1979398876428604,
"learning_rate": 1.85916699367217e-05,
"loss": 0.35185253620147705,
"memory(GiB)": 137.67,
"step": 545,
"token_acc": 0.8728044652187243,
"train_speed(iter/s)": 0.041561
},
{
"epoch": 0.6440281030444965,
"grad_norm": 0.19005604088306427,
"learning_rate": 1.855845497291646e-05,
"loss": 0.3633576393127441,
"memory(GiB)": 137.67,
"step": 550,
"token_acc": 0.8699871784073149,
"train_speed(iter/s)": 0.041564
},
{
"epoch": 0.6498829039812647,
"grad_norm": 0.1815745234489441,
"learning_rate": 1.8524883271091004e-05,
"loss": 0.35262117385864256,
"memory(GiB)": 137.67,
"step": 555,
"token_acc": 0.8783439310264622,
"train_speed(iter/s)": 0.041562
},
{
"epoch": 0.6557377049180327,
"grad_norm": 0.17770066857337952,
"learning_rate": 1.8490956230598668e-05,
"loss": 0.3713988780975342,
"memory(GiB)": 137.67,
"step": 560,
"token_acc": 0.8711786567892583,
"train_speed(iter/s)": 0.041563
},
{
"epoch": 0.6615925058548009,
"grad_norm": 0.19120706617832184,
"learning_rate": 1.8456675265604183e-05,
"loss": 0.35135421752929685,
"memory(GiB)": 137.67,
"step": 565,
"token_acc": 0.8704644071404868,
"train_speed(iter/s)": 0.041568
},
{
"epoch": 0.667447306791569,
"grad_norm": 0.22995422780513763,
"learning_rate": 1.842204180502476e-05,
"loss": 0.3541764974594116,
"memory(GiB)": 137.67,
"step": 570,
"token_acc": 0.8800552885370527,
"train_speed(iter/s)": 0.04157
},
{
"epoch": 0.6733021077283372,
"grad_norm": 0.23910608887672424,
"learning_rate": 1.8387057292470517e-05,
"loss": 0.3688697576522827,
"memory(GiB)": 137.67,
"step": 575,
"token_acc": 0.8699386694063074,
"train_speed(iter/s)": 0.041571
},
{
"epoch": 0.6791569086651054,
"grad_norm": 0.18881316483020782,
"learning_rate": 1.8351723186184295e-05,
"loss": 0.358310866355896,
"memory(GiB)": 137.67,
"step": 580,
"token_acc": 0.861880756666604,
"train_speed(iter/s)": 0.041574
},
{
"epoch": 0.6850117096018735,
"grad_norm": 0.19772037863731384,
"learning_rate": 1.8316040958980896e-05,
"loss": 0.3566863536834717,
"memory(GiB)": 137.67,
"step": 585,
"token_acc": 0.8841636264650852,
"train_speed(iter/s)": 0.041578
},
{
"epoch": 0.6908665105386417,
"grad_norm": 0.20680150389671326,
"learning_rate": 1.828001209818567e-05,
"loss": 0.37308592796325685,
"memory(GiB)": 137.67,
"step": 590,
"token_acc": 0.8693373139559628,
"train_speed(iter/s)": 0.041581
},
{
"epoch": 0.6967213114754098,
"grad_norm": 0.21996839344501495,
"learning_rate": 1.8243638105572547e-05,
"loss": 0.3568426132202148,
"memory(GiB)": 137.67,
"step": 595,
"token_acc": 0.8781027202445839,
"train_speed(iter/s)": 0.041584
},
{
"epoch": 0.702576112412178,
"grad_norm": 0.19068636000156403,
"learning_rate": 1.82069204973014e-05,
"loss": 0.3520241975784302,
"memory(GiB)": 137.67,
"step": 600,
"token_acc": 0.8848490938723728,
"train_speed(iter/s)": 0.041592
},
{
"epoch": 0.7084309133489461,
"grad_norm": 0.19711260497570038,
"learning_rate": 1.816986080385489e-05,
"loss": 0.3704382419586182,
"memory(GiB)": 137.67,
"step": 605,
"token_acc": 0.8542210685487001,
"train_speed(iter/s)": 0.041592
},
{
"epoch": 0.7142857142857143,
"grad_norm": 0.2009887397289276,
"learning_rate": 1.813246056997465e-05,
"loss": 0.35552153587341306,
"memory(GiB)": 137.67,
"step": 610,
"token_acc": 0.8681636421482087,
"train_speed(iter/s)": 0.041595
},
{
"epoch": 0.7201405152224825,
"grad_norm": 0.2012893706560135,
"learning_rate": 1.809472135459688e-05,
"loss": 0.3568307399749756,
"memory(GiB)": 137.67,
"step": 615,
"token_acc": 0.8715069766273564,
"train_speed(iter/s)": 0.041596
},
{
"epoch": 0.7259953161592506,
"grad_norm": 0.19377882778644562,
"learning_rate": 1.8056644730787412e-05,
"loss": 0.3658033847808838,
"memory(GiB)": 137.67,
"step": 620,
"token_acc": 0.8766388014057431,
"train_speed(iter/s)": 0.041603
},
{
"epoch": 0.7318501170960188,
"grad_norm": 0.21672694385051727,
"learning_rate": 1.8018232285676092e-05,
"loss": 0.34650683403015137,
"memory(GiB)": 137.67,
"step": 625,
"token_acc": 0.8730951833381114,
"train_speed(iter/s)": 0.041609
},
{
"epoch": 0.7377049180327869,
"grad_norm": 0.20295600593090057,
"learning_rate": 1.797948562039066e-05,
"loss": 0.36364593505859377,
"memory(GiB)": 137.67,
"step": 630,
"token_acc": 0.8673425158178014,
"train_speed(iter/s)": 0.041604
},
{
"epoch": 0.7435597189695551,
"grad_norm": 0.20888152718544006,
"learning_rate": 1.7940406349989987e-05,
"loss": 0.3600362777709961,
"memory(GiB)": 137.67,
"step": 635,
"token_acc": 0.8697917646394914,
"train_speed(iter/s)": 0.04161
},
{
"epoch": 0.7494145199063232,
"grad_norm": 0.18725119531154633,
"learning_rate": 1.7900996103396772e-05,
"loss": 0.3525946617126465,
"memory(GiB)": 137.67,
"step": 640,
"token_acc": 0.8778969516256544,
"train_speed(iter/s)": 0.04161
},
{
"epoch": 0.7552693208430913,
"grad_norm": 0.2023143470287323,
"learning_rate": 1.7861256523329634e-05,
"loss": 0.35059380531311035,
"memory(GiB)": 137.67,
"step": 645,
"token_acc": 0.867270463741052,
"train_speed(iter/s)": 0.041608
},
{
"epoch": 0.7611241217798594,
"grad_norm": 0.18495850265026093,
"learning_rate": 1.7821189266234647e-05,
"loss": 0.35591151714324953,
"memory(GiB)": 137.67,
"step": 650,
"token_acc": 0.8691064057960171,
"train_speed(iter/s)": 0.041607
},
{
"epoch": 0.7669789227166276,
"grad_norm": 0.19239366054534912,
"learning_rate": 1.7780796002216285e-05,
"loss": 0.3489703893661499,
"memory(GiB)": 137.67,
"step": 655,
"token_acc": 0.8661729229440642,
"train_speed(iter/s)": 0.041609
},
{
"epoch": 0.7728337236533958,
"grad_norm": 0.19033724069595337,
"learning_rate": 1.7740078414967817e-05,
"loss": 0.35645670890808107,
"memory(GiB)": 137.67,
"step": 660,
"token_acc": 0.8801652115008279,
"train_speed(iter/s)": 0.041611
},
{
"epoch": 0.7786885245901639,
"grad_norm": 0.1858055591583252,
"learning_rate": 1.7699038201701132e-05,
"loss": 0.3495974063873291,
"memory(GiB)": 137.67,
"step": 665,
"token_acc": 0.86732774248516,
"train_speed(iter/s)": 0.041614
},
{
"epoch": 0.7845433255269321,
"grad_norm": 0.19249401986598969,
"learning_rate": 1.7657677073075968e-05,
"loss": 0.35628108978271483,
"memory(GiB)": 137.67,
"step": 670,
"token_acc": 0.8711122587710429,
"train_speed(iter/s)": 0.041616
},
{
"epoch": 0.7903981264637002,
"grad_norm": 0.1897304505109787,
"learning_rate": 1.761599675312864e-05,
"loss": 0.3588160514831543,
"memory(GiB)": 137.67,
"step": 675,
"token_acc": 0.8833087010138474,
"train_speed(iter/s)": 0.041616
},
{
"epoch": 0.7962529274004684,
"grad_norm": 0.19034340977668762,
"learning_rate": 1.7573998979200163e-05,
"loss": 0.3528533935546875,
"memory(GiB)": 137.67,
"step": 680,
"token_acc": 0.873974659902577,
"train_speed(iter/s)": 0.04162
},
{
"epoch": 0.8021077283372365,
"grad_norm": 0.17828524112701416,
"learning_rate": 1.753168550186383e-05,
"loss": 0.36130833625793457,
"memory(GiB)": 137.67,
"step": 685,
"token_acc": 0.8767166579575643,
"train_speed(iter/s)": 0.041622
},
{
"epoch": 0.8079625292740047,
"grad_norm": 0.18225735425949097,
"learning_rate": 1.7489058084852247e-05,
"loss": 0.3559986114501953,
"memory(GiB)": 137.67,
"step": 690,
"token_acc": 0.8664611837818874,
"train_speed(iter/s)": 0.041619
},
{
"epoch": 0.8138173302107728,
"grad_norm": 0.17824020981788635,
"learning_rate": 1.744611850498383e-05,
"loss": 0.3519934415817261,
"memory(GiB)": 137.67,
"step": 695,
"token_acc": 0.8767726421318924,
"train_speed(iter/s)": 0.04162
},
{
"epoch": 0.819672131147541,
"grad_norm": 0.19619260728359222,
"learning_rate": 1.7402868552088724e-05,
"loss": 0.34758720397949217,
"memory(GiB)": 137.67,
"step": 700,
"token_acc": 0.8710738168196693,
"train_speed(iter/s)": 0.041621
},
{
"epoch": 0.8255269320843092,
"grad_norm": 0.20193175971508026,
"learning_rate": 1.73593100289342e-05,
"loss": 0.3554750919342041,
"memory(GiB)": 137.67,
"step": 705,
"token_acc": 0.8680475894967122,
"train_speed(iter/s)": 0.041625
},
{
"epoch": 0.8313817330210773,
"grad_norm": 0.17672231793403625,
"learning_rate": 1.7315444751149533e-05,
"loss": 0.3531287670135498,
"memory(GiB)": 137.67,
"step": 710,
"token_acc": 0.8739113086739942,
"train_speed(iter/s)": 0.041629
},
{
"epoch": 0.8372365339578455,
"grad_norm": 0.18640753626823425,
"learning_rate": 1.727127454715029e-05,
"loss": 0.3531001329421997,
"memory(GiB)": 137.67,
"step": 715,
"token_acc": 0.8807271048387348,
"train_speed(iter/s)": 0.041632
},
{
"epoch": 0.8430913348946136,
"grad_norm": 0.18654407560825348,
"learning_rate": 1.722680125806214e-05,
"loss": 0.3535622119903564,
"memory(GiB)": 137.67,
"step": 720,
"token_acc": 0.8664340845361018,
"train_speed(iter/s)": 0.041633
},
{
"epoch": 0.8489461358313818,
"grad_norm": 0.19616912305355072,
"learning_rate": 1.71820267376441e-05,
"loss": 0.357543420791626,
"memory(GiB)": 137.67,
"step": 725,
"token_acc": 0.8723300758960031,
"train_speed(iter/s)": 0.041635
},
{
"epoch": 0.8548009367681498,
"grad_norm": 0.1865251064300537,
"learning_rate": 1.7136952852211274e-05,
"loss": 0.36123013496398926,
"memory(GiB)": 137.67,
"step": 730,
"token_acc": 0.8610691821941981,
"train_speed(iter/s)": 0.041638
},
{
"epoch": 0.860655737704918,
"grad_norm": 0.1886809915304184,
"learning_rate": 1.7091581480557057e-05,
"loss": 0.34960460662841797,
"memory(GiB)": 137.67,
"step": 735,
"token_acc": 0.8703787498166635,
"train_speed(iter/s)": 0.041639
},
{
"epoch": 0.8665105386416861,
"grad_norm": 0.19691921770572662,
"learning_rate": 1.7045914513874815e-05,
"loss": 0.3618565320968628,
"memory(GiB)": 137.67,
"step": 740,
"token_acc": 0.8702042368549021,
"train_speed(iter/s)": 0.041645
},
{
"epoch": 0.8723653395784543,
"grad_norm": 0.18920762836933136,
"learning_rate": 1.699995385567907e-05,
"loss": 0.3643482685089111,
"memory(GiB)": 137.67,
"step": 745,
"token_acc": 0.8619865320910651,
"train_speed(iter/s)": 0.041651
},
{
"epoch": 0.8782201405152225,
"grad_norm": 0.19481435418128967,
"learning_rate": 1.695370142172614e-05,
"loss": 0.3560521602630615,
"memory(GiB)": 137.67,
"step": 750,
"token_acc": 0.8686031511447322,
"train_speed(iter/s)": 0.041651
},
{
"epoch": 0.8840749414519906,
"grad_norm": 0.19207534193992615,
"learning_rate": 1.690715913993429e-05,
"loss": 0.3591322422027588,
"memory(GiB)": 137.67,
"step": 755,
"token_acc": 0.8719703155846309,
"train_speed(iter/s)": 0.041652
},
{
"epoch": 0.8899297423887588,
"grad_norm": 0.20057600736618042,
"learning_rate": 1.6860328950303392e-05,
"loss": 0.3394715070724487,
"memory(GiB)": 137.67,
"step": 760,
"token_acc": 0.8781381296322522,
"train_speed(iter/s)": 0.041655
},
{
"epoch": 0.8957845433255269,
"grad_norm": 0.19081991910934448,
"learning_rate": 1.6813212804834033e-05,
"loss": 0.3552083015441895,
"memory(GiB)": 137.67,
"step": 765,
"token_acc": 0.8649747738343772,
"train_speed(iter/s)": 0.041656
},
{
"epoch": 0.9016393442622951,
"grad_norm": 0.17996545135974884,
"learning_rate": 1.676581266744615e-05,
"loss": 0.3466797828674316,
"memory(GiB)": 137.67,
"step": 770,
"token_acc": 0.8719778029670782,
"train_speed(iter/s)": 0.041659
},
{
"epoch": 0.9074941451990632,
"grad_norm": 0.18470925092697144,
"learning_rate": 1.6718130513897207e-05,
"loss": 0.34652736186981203,
"memory(GiB)": 137.67,
"step": 775,
"token_acc": 0.8761688115825458,
"train_speed(iter/s)": 0.041661
},
{
"epoch": 0.9133489461358314,
"grad_norm": 0.1838730424642563,
"learning_rate": 1.667016833169979e-05,
"loss": 0.3616307258605957,
"memory(GiB)": 137.67,
"step": 780,
"token_acc": 0.8749988214255409,
"train_speed(iter/s)": 0.041664
},
{
"epoch": 0.9192037470725996,
"grad_norm": 0.1882750242948532,
"learning_rate": 1.6621928120038806e-05,
"loss": 0.35453338623046876,
"memory(GiB)": 137.67,
"step": 785,
"token_acc": 0.8650788191817312,
"train_speed(iter/s)": 0.041666
},
{
"epoch": 0.9250585480093677,
"grad_norm": 0.18011753261089325,
"learning_rate": 1.657341188968811e-05,
"loss": 0.3467398166656494,
"memory(GiB)": 137.67,
"step": 790,
"token_acc": 0.8665571597898215,
"train_speed(iter/s)": 0.041668
},
{
"epoch": 0.9309133489461359,
"grad_norm": 0.1889754831790924,
"learning_rate": 1.6524621662926733e-05,
"loss": 0.34622554779052733,
"memory(GiB)": 137.67,
"step": 795,
"token_acc": 0.8836526658483215,
"train_speed(iter/s)": 0.041671
},
{
"epoch": 0.936768149882904,
"grad_norm": 0.17811700701713562,
"learning_rate": 1.6475559473454558e-05,
"loss": 0.35440659523010254,
"memory(GiB)": 137.67,
"step": 800,
"token_acc": 0.8802437890929187,
"train_speed(iter/s)": 0.041672
},
{
"epoch": 0.9426229508196722,
"grad_norm": 0.19011390209197998,
"learning_rate": 1.6426227366307563e-05,
"loss": 0.3580695629119873,
"memory(GiB)": 137.67,
"step": 805,
"token_acc": 0.8808476204925909,
"train_speed(iter/s)": 0.04167
},
{
"epoch": 0.9484777517564403,
"grad_norm": 0.18688787519931793,
"learning_rate": 1.6376627397772576e-05,
"loss": 0.35615901947021483,
"memory(GiB)": 137.67,
"step": 810,
"token_acc": 0.8656951211518713,
"train_speed(iter/s)": 0.04167
},
{
"epoch": 0.9543325526932084,
"grad_norm": 0.19855861365795135,
"learning_rate": 1.6326761635301572e-05,
"loss": 0.3505072116851807,
"memory(GiB)": 137.67,
"step": 815,
"token_acc": 0.8734695802546769,
"train_speed(iter/s)": 0.041672
},
{
"epoch": 0.9601873536299765,
"grad_norm": 0.18500158190727234,
"learning_rate": 1.6276632157425475e-05,
"loss": 0.35810859203338624,
"memory(GiB)": 137.67,
"step": 820,
"token_acc": 0.8688002942074786,
"train_speed(iter/s)": 0.041672
},
{
"epoch": 0.9660421545667447,
"grad_norm": 0.2135351300239563,
"learning_rate": 1.6226241053667536e-05,
"loss": 0.3624737739562988,
"memory(GiB)": 137.67,
"step": 825,
"token_acc": 0.8650754688071645,
"train_speed(iter/s)": 0.041674
},
{
"epoch": 0.9718969555035128,
"grad_norm": 0.188192680478096,
"learning_rate": 1.617559042445625e-05,
"loss": 0.3624725818634033,
"memory(GiB)": 137.67,
"step": 830,
"token_acc": 0.8755614748176581,
"train_speed(iter/s)": 0.041674
},
{
"epoch": 0.977751756440281,
"grad_norm": 0.34307366609573364,
"learning_rate": 1.6124682381037767e-05,
"loss": 0.34985201358795165,
"memory(GiB)": 137.67,
"step": 835,
"token_acc": 0.8732973013596538,
"train_speed(iter/s)": 0.041675
},
{
"epoch": 0.9836065573770492,
"grad_norm": 0.19902247190475464,
"learning_rate": 1.607351904538792e-05,
"loss": 0.3641986846923828,
"memory(GiB)": 137.67,
"step": 840,
"token_acc": 0.8725000467718097,
"train_speed(iter/s)": 0.041673
},
{
"epoch": 0.9894613583138173,
"grad_norm": 0.18375855684280396,
"learning_rate": 1.6022102550123775e-05,
"loss": 0.3507267951965332,
"memory(GiB)": 137.67,
"step": 845,
"token_acc": 0.868225976538805,
"train_speed(iter/s)": 0.041674
},
{
"epoch": 0.9953161592505855,
"grad_norm": 0.19543269276618958,
"learning_rate": 1.597043503841471e-05,
"loss": 0.3511422395706177,
"memory(GiB)": 137.67,
"step": 850,
"token_acc": 0.8818226402481499,
"train_speed(iter/s)": 0.041674
},
{
"epoch": 1.0011709601873535,
"grad_norm": 0.2594313323497772,
"learning_rate": 1.5918518663893124e-05,
"loss": 0.3436767339706421,
"memory(GiB)": 137.67,
"step": 855,
"token_acc": 0.8783253667380914,
"train_speed(iter/s)": 0.041472
},
{
"epoch": 1.0070257611241218,
"grad_norm": 0.21433798968791962,
"learning_rate": 1.5866355590564637e-05,
"loss": 0.31752333641052244,
"memory(GiB)": 137.67,
"step": 860,
"token_acc": 0.8950932956103179,
"train_speed(iter/s)": 0.041464
},
{
"epoch": 1.0128805620608898,
"grad_norm": 0.20641100406646729,
"learning_rate": 1.5813947992717894e-05,
"loss": 0.3059502601623535,
"memory(GiB)": 137.67,
"step": 865,
"token_acc": 0.8851299275012688,
"train_speed(iter/s)": 0.041456
},
{
"epoch": 1.018735362997658,
"grad_norm": 0.2776026427745819,
"learning_rate": 1.5761298054833947e-05,
"loss": 0.31491961479187014,
"memory(GiB)": 137.67,
"step": 870,
"token_acc": 0.8871431849329935,
"train_speed(iter/s)": 0.041446
},
{
"epoch": 1.0245901639344261,
"grad_norm": 0.2104882299900055,
"learning_rate": 1.5708407971495195e-05,
"loss": 0.3215550422668457,
"memory(GiB)": 137.67,
"step": 875,
"token_acc": 0.8840142068123856,
"train_speed(iter/s)": 0.041441
},
{
"epoch": 1.0304449648711944,
"grad_norm": 0.2141922563314438,
"learning_rate": 1.565527994729389e-05,
"loss": 0.31157307624816893,
"memory(GiB)": 137.67,
"step": 880,
"token_acc": 0.8925077955478237,
"train_speed(iter/s)": 0.041435
},
{
"epoch": 1.0362997658079625,
"grad_norm": 0.19829437136650085,
"learning_rate": 1.5601916196740283e-05,
"loss": 0.30809755325317384,
"memory(GiB)": 137.67,
"step": 885,
"token_acc": 0.890301896874165,
"train_speed(iter/s)": 0.04143
},
{
"epoch": 1.0421545667447307,
"grad_norm": 0.1938631683588028,
"learning_rate": 1.5548318944170276e-05,
"loss": 0.30415992736816405,
"memory(GiB)": 137.67,
"step": 890,
"token_acc": 0.8950597362393585,
"train_speed(iter/s)": 0.041423
},
{
"epoch": 1.0480093676814988,
"grad_norm": 0.18822869658470154,
"learning_rate": 1.5494490423652732e-05,
"loss": 0.30409889221191405,
"memory(GiB)": 137.67,
"step": 895,
"token_acc": 0.8878764647902749,
"train_speed(iter/s)": 0.041414
},
{
"epoch": 1.053864168618267,
"grad_norm": 0.18639546632766724,
"learning_rate": 1.544043287889635e-05,
"loss": 0.29631519317626953,
"memory(GiB)": 137.67,
"step": 900,
"token_acc": 0.8972942289498581,
"train_speed(iter/s)": 0.041408
},
{
"epoch": 1.059718969555035,
"grad_norm": 0.19313958287239075,
"learning_rate": 1.538614856315614e-05,
"loss": 0.3089482307434082,
"memory(GiB)": 137.67,
"step": 905,
"token_acc": 0.8947345206627453,
"train_speed(iter/s)": 0.041403
},
{
"epoch": 1.0655737704918034,
"grad_norm": 0.1918047070503235,
"learning_rate": 1.5331639739139477e-05,
"loss": 0.30376482009887695,
"memory(GiB)": 137.67,
"step": 910,
"token_acc": 0.878863108904361,
"train_speed(iter/s)": 0.041394
},
{
"epoch": 1.0714285714285714,
"grad_norm": 0.17692717909812927,
"learning_rate": 1.5276908678911837e-05,
"loss": 0.3011662006378174,
"memory(GiB)": 137.67,
"step": 915,
"token_acc": 0.8932026746024828,
"train_speed(iter/s)": 0.041388
},
{
"epoch": 1.0772833723653397,
"grad_norm": 0.1763262152671814,
"learning_rate": 1.5221957663802043e-05,
"loss": 0.31141071319580077,
"memory(GiB)": 137.67,
"step": 920,
"token_acc": 0.8920435427389305,
"train_speed(iter/s)": 0.041376
},
{
"epoch": 1.0831381733021077,
"grad_norm": 0.1730634868144989,
"learning_rate": 1.5166788984307204e-05,
"loss": 0.3161822557449341,
"memory(GiB)": 137.67,
"step": 925,
"token_acc": 0.8866250173014735,
"train_speed(iter/s)": 0.041367
},
{
"epoch": 1.088992974238876,
"grad_norm": 0.20834501087665558,
"learning_rate": 1.5111404939997227e-05,
"loss": 0.3130020618438721,
"memory(GiB)": 137.67,
"step": 930,
"token_acc": 0.8872231505297611,
"train_speed(iter/s)": 0.04136
},
{
"epoch": 1.094847775175644,
"grad_norm": 0.20543096959590912,
"learning_rate": 1.5055807839418966e-05,
"loss": 0.29431891441345215,
"memory(GiB)": 137.67,
"step": 935,
"token_acc": 0.8923718607539866,
"train_speed(iter/s)": 0.041352
},
{
"epoch": 1.100702576112412,
"grad_norm": 0.1818283647298813,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.31560554504394533,
"memory(GiB)": 137.67,
"step": 940,
"token_acc": 0.8944428660187143,
"train_speed(iter/s)": 0.041347
},
{
"epoch": 1.1065573770491803,
"grad_norm": 0.18734754621982574,
"learning_rate": 1.494398374795204e-05,
"loss": 0.30426225662231443,
"memory(GiB)": 137.67,
"step": 945,
"token_acc": 0.8848180693302514,
"train_speed(iter/s)": 0.041343
},
{
"epoch": 1.1124121779859484,
"grad_norm": 0.19308467209339142,
"learning_rate": 1.4887761418173947e-05,
"loss": 0.32167963981628417,
"memory(GiB)": 137.67,
"step": 950,
"token_acc": 0.8939139882185966,
"train_speed(iter/s)": 0.041337
},
{
"epoch": 1.1182669789227166,
"grad_norm": 0.2532450258731842,
"learning_rate": 1.4831335354154444e-05,
"loss": 0.30830209255218505,
"memory(GiB)": 137.67,
"step": 955,
"token_acc": 0.887962551140468,
"train_speed(iter/s)": 0.041333
},
{
"epoch": 1.1241217798594847,
"grad_norm": 0.18927785754203796,
"learning_rate": 1.4774707907874392e-05,
"loss": 0.30596270561218264,
"memory(GiB)": 137.67,
"step": 960,
"token_acc": 0.8945483075403462,
"train_speed(iter/s)": 0.041324
},
{
"epoch": 1.129976580796253,
"grad_norm": 0.18746164441108704,
"learning_rate": 1.4717881439708786e-05,
"loss": 0.3073431491851807,
"memory(GiB)": 137.67,
"step": 965,
"token_acc": 0.8779535897835228,
"train_speed(iter/s)": 0.041318
},
{
"epoch": 1.135831381733021,
"grad_norm": 0.19065742194652557,
"learning_rate": 1.4660858318328348e-05,
"loss": 0.30925755500793456,
"memory(GiB)": 137.67,
"step": 970,
"token_acc": 0.8771556147038887,
"train_speed(iter/s)": 0.041311
},
{
"epoch": 1.1416861826697893,
"grad_norm": 0.19082236289978027,
"learning_rate": 1.4603640920600813e-05,
"loss": 0.31507372856140137,
"memory(GiB)": 137.67,
"step": 975,
"token_acc": 0.8741312286488396,
"train_speed(iter/s)": 0.041305
},
{
"epoch": 1.1475409836065573,
"grad_norm": 0.18480531871318817,
"learning_rate": 1.4546231631491827e-05,
"loss": 0.3110131025314331,
"memory(GiB)": 137.67,
"step": 980,
"token_acc": 0.8829417142215302,
"train_speed(iter/s)": 0.041296
},
{
"epoch": 1.1533957845433256,
"grad_norm": 0.17675240337848663,
"learning_rate": 1.4488632843965573e-05,
"loss": 0.3039939641952515,
"memory(GiB)": 137.67,
"step": 985,
"token_acc": 0.8738143036386449,
"train_speed(iter/s)": 0.041289
},
{
"epoch": 1.1592505854800936,
"grad_norm": 0.19089390337467194,
"learning_rate": 1.4430846958884995e-05,
"loss": 0.31295793056488036,
"memory(GiB)": 137.67,
"step": 990,
"token_acc": 0.8817706633869632,
"train_speed(iter/s)": 0.041282
},
{
"epoch": 1.165105386416862,
"grad_norm": 0.18563120067119598,
"learning_rate": 1.4372876384911741e-05,
"loss": 0.313909912109375,
"memory(GiB)": 137.67,
"step": 995,
"token_acc": 0.8830196916072904,
"train_speed(iter/s)": 0.041276
},
{
"epoch": 1.17096018735363,
"grad_norm": 0.21534429490566254,
"learning_rate": 1.4314723538405752e-05,
"loss": 0.3197300910949707,
"memory(GiB)": 137.67,
"step": 1000,
"token_acc": 0.8747241787695568,
"train_speed(iter/s)": 0.041271
},
{
"epoch": 1.1768149882903982,
"grad_norm": 0.19970309734344482,
"learning_rate": 1.4256390843324556e-05,
"loss": 0.3151378154754639,
"memory(GiB)": 137.67,
"step": 1005,
"token_acc": 0.8791438877655459,
"train_speed(iter/s)": 0.041267
},
{
"epoch": 1.1826697892271663,
"grad_norm": 0.1895560324192047,
"learning_rate": 1.4197880731122221e-05,
"loss": 0.312138032913208,
"memory(GiB)": 137.67,
"step": 1010,
"token_acc": 0.8795711581097576,
"train_speed(iter/s)": 0.041265
},
{
"epoch": 1.1885245901639343,
"grad_norm": 0.19073544442653656,
"learning_rate": 1.4139195640648008e-05,
"loss": 0.315081787109375,
"memory(GiB)": 137.67,
"step": 1015,
"token_acc": 0.8921242173646963,
"train_speed(iter/s)": 0.041259
},
{
"epoch": 1.1943793911007026,
"grad_norm": 0.17704617977142334,
"learning_rate": 1.4080338018044712e-05,
"loss": 0.319437837600708,
"memory(GiB)": 137.67,
"step": 1020,
"token_acc": 0.8815218951006631,
"train_speed(iter/s)": 0.041255
},
{
"epoch": 1.2002341920374708,
"grad_norm": 0.19636361300945282,
"learning_rate": 1.4021310316646708e-05,
"loss": 0.3087984561920166,
"memory(GiB)": 137.67,
"step": 1025,
"token_acc": 0.8875915980726762,
"train_speed(iter/s)": 0.041249
},
{
"epoch": 1.2060889929742389,
"grad_norm": 0.185128852725029,
"learning_rate": 1.3962114996877685e-05,
"loss": 0.29653804302215575,
"memory(GiB)": 137.67,
"step": 1030,
"token_acc": 0.894042061938463,
"train_speed(iter/s)": 0.041243
},
{
"epoch": 1.211943793911007,
"grad_norm": 0.18740731477737427,
"learning_rate": 1.390275452614808e-05,
"loss": 0.2996367454528809,
"memory(GiB)": 137.67,
"step": 1035,
"token_acc": 0.8867371770872332,
"train_speed(iter/s)": 0.041239
},
{
"epoch": 1.2177985948477752,
"grad_norm": 0.19739095866680145,
"learning_rate": 1.3843231378752252e-05,
"loss": 0.3056778907775879,
"memory(GiB)": 137.67,
"step": 1040,
"token_acc": 0.8844194070047138,
"train_speed(iter/s)": 0.041232
},
{
"epoch": 1.2236533957845432,
"grad_norm": 0.18625736236572266,
"learning_rate": 1.3783548035765327e-05,
"loss": 0.3101504802703857,
"memory(GiB)": 137.67,
"step": 1045,
"token_acc": 0.8895319577252139,
"train_speed(iter/s)": 0.041228
},
{
"epoch": 1.2295081967213115,
"grad_norm": 0.19391782581806183,
"learning_rate": 1.3723706984939783e-05,
"loss": 0.2983381271362305,
"memory(GiB)": 137.67,
"step": 1050,
"token_acc": 0.8835933444611258,
"train_speed(iter/s)": 0.041224
},
{
"epoch": 1.2353629976580796,
"grad_norm": 0.18108582496643066,
"learning_rate": 1.366371072060177e-05,
"loss": 0.3086691379547119,
"memory(GiB)": 137.67,
"step": 1055,
"token_acc": 0.8736720857877966,
"train_speed(iter/s)": 0.041218
},
{
"epoch": 1.2412177985948478,
"grad_norm": 0.18043167889118195,
"learning_rate": 1.3603561743547125e-05,
"loss": 0.30459914207458494,
"memory(GiB)": 137.67,
"step": 1060,
"token_acc": 0.8805453249562779,
"train_speed(iter/s)": 0.041215
},
{
"epoch": 1.2470725995316159,
"grad_norm": 0.2246876060962677,
"learning_rate": 1.3543262560937135e-05,
"loss": 0.3085703134536743,
"memory(GiB)": 137.67,
"step": 1065,
"token_acc": 0.8846350880261892,
"train_speed(iter/s)": 0.041212
},
{
"epoch": 1.2529274004683841,
"grad_norm": 0.19236041605472565,
"learning_rate": 1.3482815686194033e-05,
"loss": 0.2960092306137085,
"memory(GiB)": 137.67,
"step": 1070,
"token_acc": 0.8907122097565549,
"train_speed(iter/s)": 0.041208
},
{
"epoch": 1.2587822014051522,
"grad_norm": 0.1928793489933014,
"learning_rate": 1.3422223638896235e-05,
"loss": 0.3040574073791504,
"memory(GiB)": 137.67,
"step": 1075,
"token_acc": 0.886298144007927,
"train_speed(iter/s)": 0.041204
},
{
"epoch": 1.2646370023419204,
"grad_norm": 0.20902785658836365,
"learning_rate": 1.3361488944673315e-05,
"loss": 0.31267333030700684,
"memory(GiB)": 137.67,
"step": 1080,
"token_acc": 0.8800496737817911,
"train_speed(iter/s)": 0.041199
},
{
"epoch": 1.2704918032786885,
"grad_norm": 0.18985559046268463,
"learning_rate": 1.3300614135100736e-05,
"loss": 0.3105930805206299,
"memory(GiB)": 137.67,
"step": 1085,
"token_acc": 0.8869882389382489,
"train_speed(iter/s)": 0.041194
},
{
"epoch": 1.2763466042154565,
"grad_norm": 0.17671886086463928,
"learning_rate": 1.3239601747594319e-05,
"loss": 0.310105037689209,
"memory(GiB)": 137.67,
"step": 1090,
"token_acc": 0.8870674524554854,
"train_speed(iter/s)": 0.041187
},
{
"epoch": 1.2822014051522248,
"grad_norm": 0.17825712263584137,
"learning_rate": 1.3178454325304472e-05,
"loss": 0.31207849979400637,
"memory(GiB)": 137.67,
"step": 1095,
"token_acc": 0.876942551728449,
"train_speed(iter/s)": 0.041183
},
{
"epoch": 1.288056206088993,
"grad_norm": 0.1821722686290741,
"learning_rate": 1.3117174417010213e-05,
"loss": 0.2980069637298584,
"memory(GiB)": 137.67,
"step": 1100,
"token_acc": 0.8805069421513594,
"train_speed(iter/s)": 0.041179
},
{
"epoch": 1.2939110070257611,
"grad_norm": 0.18626025319099426,
"learning_rate": 1.3055764577012892e-05,
"loss": 0.3255163669586182,
"memory(GiB)": 137.67,
"step": 1105,
"token_acc": 0.8920352101893313,
"train_speed(iter/s)": 0.041176
},
{
"epoch": 1.2997658079625292,
"grad_norm": 0.18716710805892944,
"learning_rate": 1.2994227365029752e-05,
"loss": 0.30793008804321287,
"memory(GiB)": 137.67,
"step": 1110,
"token_acc": 0.8887493130250451,
"train_speed(iter/s)": 0.041173
},
{
"epoch": 1.3056206088992974,
"grad_norm": 0.19421324133872986,
"learning_rate": 1.2932565346087218e-05,
"loss": 0.3134599208831787,
"memory(GiB)": 137.67,
"step": 1115,
"token_acc": 0.8847875557218118,
"train_speed(iter/s)": 0.041168
},
{
"epoch": 1.3114754098360657,
"grad_norm": 0.18218953907489777,
"learning_rate": 1.2870781090413991e-05,
"loss": 0.3120888710021973,
"memory(GiB)": 137.67,
"step": 1120,
"token_acc": 0.8869988305263882,
"train_speed(iter/s)": 0.041162
},
{
"epoch": 1.3173302107728337,
"grad_norm": 0.19175498187541962,
"learning_rate": 1.2808877173333896e-05,
"loss": 0.30698199272155763,
"memory(GiB)": 137.67,
"step": 1125,
"token_acc": 0.8941062176165803,
"train_speed(iter/s)": 0.041159
},
{
"epoch": 1.3231850117096018,
"grad_norm": 0.18965595960617065,
"learning_rate": 1.2746856175158556e-05,
"loss": 0.31497323513031006,
"memory(GiB)": 137.67,
"step": 1130,
"token_acc": 0.8871100459606847,
"train_speed(iter/s)": 0.041157
},
{
"epoch": 1.32903981264637,
"grad_norm": 0.18627162277698517,
"learning_rate": 1.2684720681079825e-05,
"loss": 0.31060152053833007,
"memory(GiB)": 137.67,
"step": 1135,
"token_acc": 0.871316468541155,
"train_speed(iter/s)": 0.041153
},
{
"epoch": 1.334894613583138,
"grad_norm": 0.18565431237220764,
"learning_rate": 1.2622473281062042e-05,
"loss": 0.31475396156311036,
"memory(GiB)": 137.67,
"step": 1140,
"token_acc": 0.8868342272670575,
"train_speed(iter/s)": 0.04115
},
{
"epoch": 1.3407494145199064,
"grad_norm": 0.20739679038524628,
"learning_rate": 1.256011656973406e-05,
"loss": 0.32018194198608396,
"memory(GiB)": 137.67,
"step": 1145,
"token_acc": 0.8872068230277186,
"train_speed(iter/s)": 0.041147
},
{
"epoch": 1.3466042154566744,
"grad_norm": 0.1901317983865738,
"learning_rate": 1.2497653146281113e-05,
"loss": 0.3108601331710815,
"memory(GiB)": 137.67,
"step": 1150,
"token_acc": 0.8855189570357069,
"train_speed(iter/s)": 0.041141
},
{
"epoch": 1.3524590163934427,
"grad_norm": 0.16836309432983398,
"learning_rate": 1.2435085614336459e-05,
"loss": 0.315748405456543,
"memory(GiB)": 137.67,
"step": 1155,
"token_acc": 0.8928414676966292,
"train_speed(iter/s)": 0.041138
},
{
"epoch": 1.3583138173302107,
"grad_norm": 0.18492159247398376,
"learning_rate": 1.2372416581872857e-05,
"loss": 0.3051302909851074,
"memory(GiB)": 137.67,
"step": 1160,
"token_acc": 0.8906577988281189,
"train_speed(iter/s)": 0.041133
},
{
"epoch": 1.364168618266979,
"grad_norm": 0.17753958702087402,
"learning_rate": 1.2309648661093878e-05,
"loss": 0.3092564582824707,
"memory(GiB)": 137.67,
"step": 1165,
"token_acc": 0.8921087343363074,
"train_speed(iter/s)": 0.041129
},
{
"epoch": 1.370023419203747,
"grad_norm": 0.18764352798461914,
"learning_rate": 1.2246784468324993e-05,
"loss": 0.3163435935974121,
"memory(GiB)": 137.67,
"step": 1170,
"token_acc": 0.8760536792329402,
"train_speed(iter/s)": 0.041124
},
{
"epoch": 1.3758782201405153,
"grad_norm": 0.19416891038417816,
"learning_rate": 1.218382662390454e-05,
"loss": 0.3042860507965088,
"memory(GiB)": 137.67,
"step": 1175,
"token_acc": 0.875018486527648,
"train_speed(iter/s)": 0.041121
},
{
"epoch": 1.3817330210772834,
"grad_norm": 0.18030278384685516,
"learning_rate": 1.2120777752074492e-05,
"loss": 0.3132922172546387,
"memory(GiB)": 137.67,
"step": 1180,
"token_acc": 0.8838601600050099,
"train_speed(iter/s)": 0.041116
},
{
"epoch": 1.3875878220140514,
"grad_norm": 0.2763387858867645,
"learning_rate": 1.2057640480871084e-05,
"loss": 0.3143471240997314,
"memory(GiB)": 137.67,
"step": 1185,
"token_acc": 0.8852224576271186,
"train_speed(iter/s)": 0.041114
},
{
"epoch": 1.3934426229508197,
"grad_norm": 0.17999497056007385,
"learning_rate": 1.1994417442015243e-05,
"loss": 0.31265532970428467,
"memory(GiB)": 137.67,
"step": 1190,
"token_acc": 0.8907372436335803,
"train_speed(iter/s)": 0.041112
},
{
"epoch": 1.399297423887588,
"grad_norm": 0.18372628092765808,
"learning_rate": 1.193111127080292e-05,
"loss": 0.30383052825927737,
"memory(GiB)": 137.67,
"step": 1195,
"token_acc": 0.8938835107946411,
"train_speed(iter/s)": 0.041109
},
{
"epoch": 1.405152224824356,
"grad_norm": 0.1798890382051468,
"learning_rate": 1.186772460599523e-05,
"loss": 0.30336918830871584,
"memory(GiB)": 137.67,
"step": 1200,
"token_acc": 0.891896889446055,
"train_speed(iter/s)": 0.041105
},
{
"epoch": 1.411007025761124,
"grad_norm": 0.1862761676311493,
"learning_rate": 1.1804260089708464e-05,
"loss": 0.3127150535583496,
"memory(GiB)": 137.67,
"step": 1205,
"token_acc": 0.8781827694454133,
"train_speed(iter/s)": 0.041099
},
{
"epoch": 1.4168618266978923,
"grad_norm": 0.1872834414243698,
"learning_rate": 1.1740720367303958e-05,
"loss": 0.3076412916183472,
"memory(GiB)": 137.67,
"step": 1210,
"token_acc": 0.8865224656924374,
"train_speed(iter/s)": 0.041096
},
{
"epoch": 1.4227166276346606,
"grad_norm": 0.1868448704481125,
"learning_rate": 1.1677108087277835e-05,
"loss": 0.3139200210571289,
"memory(GiB)": 137.67,
"step": 1215,
"token_acc": 0.8866469436643504,
"train_speed(iter/s)": 0.041092
},
{
"epoch": 1.4285714285714286,
"grad_norm": 0.1959424465894699,
"learning_rate": 1.1613425901150595e-05,
"loss": 0.3134448051452637,
"memory(GiB)": 137.67,
"step": 1220,
"token_acc": 0.8883061552452257,
"train_speed(iter/s)": 0.041088
},
{
"epoch": 1.4344262295081966,
"grad_norm": 0.1766284704208374,
"learning_rate": 1.15496764633566e-05,
"loss": 0.3212412357330322,
"memory(GiB)": 137.67,
"step": 1225,
"token_acc": 0.8780539320458743,
"train_speed(iter/s)": 0.041084
},
{
"epoch": 1.440281030444965,
"grad_norm": 0.17711302638053894,
"learning_rate": 1.1485862431133445e-05,
"loss": 0.3123058795928955,
"memory(GiB)": 137.67,
"step": 1230,
"token_acc": 0.8900835233492141,
"train_speed(iter/s)": 0.041082
},
{
"epoch": 1.446135831381733,
"grad_norm": 0.1747256964445114,
"learning_rate": 1.1421986464411169e-05,
"loss": 0.31295697689056395,
"memory(GiB)": 137.67,
"step": 1235,
"token_acc": 0.8767080016888458,
"train_speed(iter/s)": 0.041075
},
{
"epoch": 1.4519906323185012,
"grad_norm": 0.18440908193588257,
"learning_rate": 1.1358051225701404e-05,
"loss": 0.30406386852264405,
"memory(GiB)": 137.67,
"step": 1240,
"token_acc": 0.8795020947920581,
"train_speed(iter/s)": 0.041071
},
{
"epoch": 1.4578454332552693,
"grad_norm": 0.17828240990638733,
"learning_rate": 1.1294059379986384e-05,
"loss": 0.3121625900268555,
"memory(GiB)": 137.67,
"step": 1245,
"token_acc": 0.880069535801541,
"train_speed(iter/s)": 0.041066
},
{
"epoch": 1.4637002341920375,
"grad_norm": 0.19148212671279907,
"learning_rate": 1.1230013594607874e-05,
"loss": 0.31345176696777344,
"memory(GiB)": 137.67,
"step": 1250,
"token_acc": 0.8839757074137398,
"train_speed(iter/s)": 0.041062
},
{
"epoch": 1.4695550351288056,
"grad_norm": 0.1828489750623703,
"learning_rate": 1.1165916539155968e-05,
"loss": 0.3104730129241943,
"memory(GiB)": 137.67,
"step": 1255,
"token_acc": 0.8880499764055864,
"train_speed(iter/s)": 0.04106
},
{
"epoch": 1.4754098360655736,
"grad_norm": 0.17934924364089966,
"learning_rate": 1.1101770885357843e-05,
"loss": 0.3066437244415283,
"memory(GiB)": 137.67,
"step": 1260,
"token_acc": 0.8892594538641362,
"train_speed(iter/s)": 0.041058
},
{
"epoch": 1.481264637002342,
"grad_norm": 0.16536173224449158,
"learning_rate": 1.1037579306966365e-05,
"loss": 0.3071906566619873,
"memory(GiB)": 137.67,
"step": 1265,
"token_acc": 0.8958809106175363,
"train_speed(iter/s)": 0.041054
},
{
"epoch": 1.4871194379391102,
"grad_norm": 0.18694446980953217,
"learning_rate": 1.0973344479648652e-05,
"loss": 0.3013455867767334,
"memory(GiB)": 137.67,
"step": 1270,
"token_acc": 0.8899813852868301,
"train_speed(iter/s)": 0.04105
},
{
"epoch": 1.4929742388758782,
"grad_norm": 0.17580904066562653,
"learning_rate": 1.0909069080874556e-05,
"loss": 0.30318174362182615,
"memory(GiB)": 137.67,
"step": 1275,
"token_acc": 0.8817699648607147,
"train_speed(iter/s)": 0.041047
},
{
"epoch": 1.4988290398126463,
"grad_norm": 0.18754124641418457,
"learning_rate": 1.0844755789805042e-05,
"loss": 0.31064305305480955,
"memory(GiB)": 137.67,
"step": 1280,
"token_acc": 0.8804021416788542,
"train_speed(iter/s)": 0.041044
},
{
"epoch": 1.5046838407494145,
"grad_norm": 0.19590285420417786,
"learning_rate": 1.0780407287180526e-05,
"loss": 0.3148102045059204,
"memory(GiB)": 137.67,
"step": 1285,
"token_acc": 0.8805457351989244,
"train_speed(iter/s)": 0.041039
},
{
"epoch": 1.5105386416861828,
"grad_norm": 0.19473980367183685,
"learning_rate": 1.0716026255209124e-05,
"loss": 0.3106101036071777,
"memory(GiB)": 137.67,
"step": 1290,
"token_acc": 0.879328668153049,
"train_speed(iter/s)": 0.041037
},
{
"epoch": 1.5163934426229508,
"grad_norm": 0.18378229439258575,
"learning_rate": 1.0651615377454872e-05,
"loss": 0.3110929250717163,
"memory(GiB)": 137.67,
"step": 1295,
"token_acc": 0.8856033818930429,
"train_speed(iter/s)": 0.041033
},
{
"epoch": 1.5222482435597189,
"grad_norm": 0.18482638895511627,
"learning_rate": 1.0587177338725834e-05,
"loss": 0.3163102626800537,
"memory(GiB)": 137.67,
"step": 1300,
"token_acc": 0.8870778115329991,
"train_speed(iter/s)": 0.04103
},
{
"epoch": 1.5281030444964872,
"grad_norm": 0.17333081364631653,
"learning_rate": 1.0522714824962228e-05,
"loss": 0.30377721786499023,
"memory(GiB)": 137.67,
"step": 1305,
"token_acc": 0.8980077050082553,
"train_speed(iter/s)": 0.041028
},
{
"epoch": 1.5339578454332554,
"grad_norm": 0.1912304162979126,
"learning_rate": 1.0458230523124443e-05,
"loss": 0.3162518501281738,
"memory(GiB)": 137.67,
"step": 1310,
"token_acc": 0.8886457770855507,
"train_speed(iter/s)": 0.041024
},
{
"epoch": 1.5398126463700235,
"grad_norm": 0.1846192628145218,
"learning_rate": 1.0393727121081057e-05,
"loss": 0.3126535892486572,
"memory(GiB)": 137.67,
"step": 1315,
"token_acc": 0.8860128586991429,
"train_speed(iter/s)": 0.041023
},
{
"epoch": 1.5456674473067915,
"grad_norm": 0.17747725546360016,
"learning_rate": 1.0329207307496785e-05,
"loss": 0.30208649635314944,
"memory(GiB)": 137.67,
"step": 1320,
"token_acc": 0.8879456759093934,
"train_speed(iter/s)": 0.04102
},
{
"epoch": 1.5515222482435598,
"grad_norm": 0.18443572521209717,
"learning_rate": 1.0264673771720429e-05,
"loss": 0.3092689037322998,
"memory(GiB)": 137.67,
"step": 1325,
"token_acc": 0.892488839320581,
"train_speed(iter/s)": 0.041016
},
{
"epoch": 1.5573770491803278,
"grad_norm": 0.18431353569030762,
"learning_rate": 1.0200129203672754e-05,
"loss": 0.3100308656692505,
"memory(GiB)": 137.67,
"step": 1330,
"token_acc": 0.8782463261547713,
"train_speed(iter/s)": 0.041012
},
{
"epoch": 1.5632318501170959,
"grad_norm": 0.1662471741437912,
"learning_rate": 1.0135576293734381e-05,
"loss": 0.30292906761169436,
"memory(GiB)": 137.67,
"step": 1335,
"token_acc": 0.8942868271402976,
"train_speed(iter/s)": 0.04101
},
{
"epoch": 1.5690866510538641,
"grad_norm": 0.1806328445672989,
"learning_rate": 1.007101773263365e-05,
"loss": 0.31366329193115233,
"memory(GiB)": 137.67,
"step": 1340,
"token_acc": 0.8866166119192868,
"train_speed(iter/s)": 0.041006
},
{
"epoch": 1.5749414519906324,
"grad_norm": 0.16915848851203918,
"learning_rate": 1.0006456211334445e-05,
"loss": 0.30766754150390624,
"memory(GiB)": 137.67,
"step": 1345,
"token_acc": 0.8863719744503918,
"train_speed(iter/s)": 0.041006
},
{
"epoch": 1.5807962529274004,
"grad_norm": 0.16690009832382202,
"learning_rate": 9.941894420924044e-06,
"loss": 0.3059431314468384,
"memory(GiB)": 137.67,
"step": 1350,
"token_acc": 0.8971780549005762,
"train_speed(iter/s)": 0.041001
},
{
"epoch": 1.5866510538641685,
"grad_norm": 0.17337647080421448,
"learning_rate": 9.87733505250094e-06,
"loss": 0.3098172664642334,
"memory(GiB)": 137.67,
"step": 1355,
"token_acc": 0.8863237006126697,
"train_speed(iter/s)": 0.040998
},
{
"epoch": 1.5925058548009368,
"grad_norm": 0.17512920498847961,
"learning_rate": 9.812780797062678e-06,
"loss": 0.30655522346496583,
"memory(GiB)": 137.67,
"step": 1360,
"token_acc": 0.8899597184053006,
"train_speed(iter/s)": 0.040993
},
{
"epoch": 1.598360655737705,
"grad_norm": 0.1765688955783844,
"learning_rate": 9.748234345393672e-06,
"loss": 0.3023026466369629,
"memory(GiB)": 137.67,
"step": 1365,
"token_acc": 0.8879338667133921,
"train_speed(iter/s)": 0.040989
},
{
"epoch": 1.604215456674473,
"grad_norm": 0.18416614830493927,
"learning_rate": 9.68369838795306e-06,
"loss": 0.30958683490753175,
"memory(GiB)": 137.67,
"step": 1370,
"token_acc": 0.8849809108691687,
"train_speed(iter/s)": 0.040984
},
{
"epoch": 1.6100702576112411,
"grad_norm": 0.17386697232723236,
"learning_rate": 9.61917561476255e-06,
"loss": 0.30420713424682616,
"memory(GiB)": 137.67,
"step": 1375,
"token_acc": 0.8786233528080887,
"train_speed(iter/s)": 0.040981
},
{
"epoch": 1.6159250585480094,
"grad_norm": 0.18169918656349182,
"learning_rate": 9.554668715294305e-06,
"loss": 0.31483819484710696,
"memory(GiB)": 137.67,
"step": 1380,
"token_acc": 0.8864194675551166,
"train_speed(iter/s)": 0.040979
},
{
"epoch": 1.6217798594847777,
"grad_norm": 0.1892368197441101,
"learning_rate": 9.490180378358826e-06,
"loss": 0.3172303676605225,
"memory(GiB)": 137.67,
"step": 1385,
"token_acc": 0.8828729942067092,
"train_speed(iter/s)": 0.040977
},
{
"epoch": 1.6276346604215457,
"grad_norm": 0.1751379817724228,
"learning_rate": 9.425713291992878e-06,
"loss": 0.30653929710388184,
"memory(GiB)": 137.67,
"step": 1390,
"token_acc": 0.8895787320550146,
"train_speed(iter/s)": 0.040974
},
{
"epoch": 1.6334894613583137,
"grad_norm": 0.18914154171943665,
"learning_rate": 9.361270143347452e-06,
"loss": 0.31959149837493894,
"memory(GiB)": 137.67,
"step": 1395,
"token_acc": 0.8822264278089348,
"train_speed(iter/s)": 0.040972
},
{
"epoch": 1.639344262295082,
"grad_norm": 0.16736507415771484,
"learning_rate": 9.296853618575753e-06,
"loss": 0.30730547904968264,
"memory(GiB)": 137.67,
"step": 1400,
"token_acc": 0.8928722715040367,
"train_speed(iter/s)": 0.04097
},
{
"epoch": 1.6451990632318503,
"grad_norm": 0.1708020716905594,
"learning_rate": 9.232466402721241e-06,
"loss": 0.31717801094055176,
"memory(GiB)": 137.67,
"step": 1405,
"token_acc": 0.886989175916414,
"train_speed(iter/s)": 0.040969
},
{
"epoch": 1.651053864168618,
"grad_norm": 0.17622792720794678,
"learning_rate": 9.1681111796057e-06,
"loss": 0.3083082675933838,
"memory(GiB)": 137.67,
"step": 1410,
"token_acc": 0.8884494066990437,
"train_speed(iter/s)": 0.040968
},
{
"epoch": 1.6569086651053864,
"grad_norm": 0.1885053962469101,
"learning_rate": 9.103790631717375e-06,
"loss": 0.32230064868927,
"memory(GiB)": 137.67,
"step": 1415,
"token_acc": 0.878518037454961,
"train_speed(iter/s)": 0.040965
},
{
"epoch": 1.6627634660421546,
"grad_norm": 0.17244482040405273,
"learning_rate": 9.039507440099164e-06,
"loss": 0.30806798934936525,
"memory(GiB)": 137.67,
"step": 1420,
"token_acc": 0.8929606011942812,
"train_speed(iter/s)": 0.040962
},
{
"epoch": 1.6686182669789227,
"grad_norm": 0.18172700703144073,
"learning_rate": 8.975264284236866e-06,
"loss": 0.30987024307250977,
"memory(GiB)": 137.67,
"step": 1425,
"token_acc": 0.8885019605876434,
"train_speed(iter/s)": 0.040961
},
{
"epoch": 1.6744730679156907,
"grad_norm": 0.18555694818496704,
"learning_rate": 8.911063841947476e-06,
"loss": 0.31224822998046875,
"memory(GiB)": 137.67,
"step": 1430,
"token_acc": 0.8862099925232826,
"train_speed(iter/s)": 0.040958
},
{
"epoch": 1.680327868852459,
"grad_norm": 0.18322236835956573,
"learning_rate": 8.846908789267589e-06,
"loss": 0.31196701526641846,
"memory(GiB)": 137.67,
"step": 1435,
"token_acc": 0.8887980814742356,
"train_speed(iter/s)": 0.040958
},
{
"epoch": 1.6861826697892273,
"grad_norm": 0.17747406661510468,
"learning_rate": 8.78280180034184e-06,
"loss": 0.3032996654510498,
"memory(GiB)": 137.67,
"step": 1440,
"token_acc": 0.8822490977332802,
"train_speed(iter/s)": 0.040955
},
{
"epoch": 1.6920374707259953,
"grad_norm": 0.18120799958705902,
"learning_rate": 8.718745547311458e-06,
"loss": 0.3137194633483887,
"memory(GiB)": 137.67,
"step": 1445,
"token_acc": 0.8828540900663084,
"train_speed(iter/s)": 0.040951
},
{
"epoch": 1.6978922716627634,
"grad_norm": 0.17743031680583954,
"learning_rate": 8.654742700202849e-06,
"loss": 0.31336297988891604,
"memory(GiB)": 137.67,
"step": 1450,
"token_acc": 0.8851623130427727,
"train_speed(iter/s)": 0.040949
},
{
"epoch": 1.7037470725995316,
"grad_norm": 0.1702745109796524,
"learning_rate": 8.590795926816348e-06,
"loss": 0.3027879953384399,
"memory(GiB)": 137.67,
"step": 1455,
"token_acc": 0.8840805588371897,
"train_speed(iter/s)": 0.040947
},
{
"epoch": 1.7096018735362999,
"grad_norm": 0.17240740358829498,
"learning_rate": 8.526907892614986e-06,
"loss": 0.3072841167449951,
"memory(GiB)": 137.67,
"step": 1460,
"token_acc": 0.88948632592922,
"train_speed(iter/s)": 0.040943
},
{
"epoch": 1.715456674473068,
"grad_norm": 0.17982088029384613,
"learning_rate": 8.463081260613391e-06,
"loss": 0.30924406051635744,
"memory(GiB)": 137.67,
"step": 1465,
"token_acc": 0.8940978807037782,
"train_speed(iter/s)": 0.04094
},
{
"epoch": 1.721311475409836,
"grad_norm": 0.19751447439193726,
"learning_rate": 8.399318691266806e-06,
"loss": 0.3119847774505615,
"memory(GiB)": 137.67,
"step": 1470,
"token_acc": 0.8852366571009662,
"train_speed(iter/s)": 0.040936
},
{
"epoch": 1.7271662763466042,
"grad_norm": 0.18603962659835815,
"learning_rate": 8.335622842360168e-06,
"loss": 0.3066195011138916,
"memory(GiB)": 137.67,
"step": 1475,
"token_acc": 0.8890113777789009,
"train_speed(iter/s)": 0.040933
},
{
"epoch": 1.7330210772833725,
"grad_norm": 0.2541693449020386,
"learning_rate": 8.271996368897345e-06,
"loss": 0.3128560781478882,
"memory(GiB)": 137.67,
"step": 1480,
"token_acc": 0.8902386961489684,
"train_speed(iter/s)": 0.040929
},
{
"epoch": 1.7388758782201406,
"grad_norm": 0.16992934048175812,
"learning_rate": 8.208441922990454e-06,
"loss": 0.3037855863571167,
"memory(GiB)": 137.67,
"step": 1485,
"token_acc": 0.8849534643226473,
"train_speed(iter/s)": 0.040926
},
{
"epoch": 1.7447306791569086,
"grad_norm": 0.17065441608428955,
"learning_rate": 8.144962153749331e-06,
"loss": 0.30540289878845217,
"memory(GiB)": 137.67,
"step": 1490,
"token_acc": 0.8819315749736371,
"train_speed(iter/s)": 0.040924
},
{
"epoch": 1.7505854800936769,
"grad_norm": 0.1787635236978531,
"learning_rate": 8.081559707171094e-06,
"loss": 0.31698925495147706,
"memory(GiB)": 137.67,
"step": 1495,
"token_acc": 0.8824724072862914,
"train_speed(iter/s)": 0.040923
},
{
"epoch": 1.756440281030445,
"grad_norm": 0.1751013845205307,
"learning_rate": 8.01823722602986e-06,
"loss": 0.30347585678100586,
"memory(GiB)": 137.67,
"step": 1500,
"token_acc": 0.893298859486769,
"train_speed(iter/s)": 0.040922
},
{
"epoch": 1.762295081967213,
"grad_norm": 0.17399156093597412,
"learning_rate": 7.954997349766576e-06,
"loss": 0.3116060972213745,
"memory(GiB)": 137.67,
"step": 1505,
"token_acc": 0.8889070320988275,
"train_speed(iter/s)": 0.040921
},
{
"epoch": 1.7681498829039812,
"grad_norm": 0.18837633728981018,
"learning_rate": 7.891842714379027e-06,
"loss": 0.29880785942077637,
"memory(GiB)": 137.67,
"step": 1510,
"token_acc": 0.893647204719971,
"train_speed(iter/s)": 0.040918
},
{
"epoch": 1.7740046838407495,
"grad_norm": 0.1845746487379074,
"learning_rate": 7.828775952311921e-06,
"loss": 0.30261945724487305,
"memory(GiB)": 137.67,
"step": 1515,
"token_acc": 0.8851783808483535,
"train_speed(iter/s)": 0.040914
},
{
"epoch": 1.7798594847775175,
"grad_norm": 0.16885152459144592,
"learning_rate": 7.765799692347201e-06,
"loss": 0.3042313575744629,
"memory(GiB)": 137.67,
"step": 1520,
"token_acc": 0.8835214994418757,
"train_speed(iter/s)": 0.040911
},
{
"epoch": 1.7857142857142856,
"grad_norm": 0.1790182739496231,
"learning_rate": 7.702916559494444e-06,
"loss": 0.31259956359863283,
"memory(GiB)": 137.67,
"step": 1525,
"token_acc": 0.8878653758934018,
"train_speed(iter/s)": 0.040909
},
{
"epoch": 1.7915690866510539,
"grad_norm": 0.17695166170597076,
"learning_rate": 7.64012917488146e-06,
"loss": 0.29359025955200196,
"memory(GiB)": 137.67,
"step": 1530,
"token_acc": 0.9000399023492115,
"train_speed(iter/s)": 0.040908
},
{
"epoch": 1.7974238875878221,
"grad_norm": 0.18347503244876862,
"learning_rate": 7.577440155645028e-06,
"loss": 0.30249216556549074,
"memory(GiB)": 137.67,
"step": 1535,
"token_acc": 0.8902694639046774,
"train_speed(iter/s)": 0.040904
},
{
"epoch": 1.8032786885245902,
"grad_norm": 0.1697729527950287,
"learning_rate": 7.514852114821811e-06,
"loss": 0.31291751861572265,
"memory(GiB)": 137.67,
"step": 1540,
"token_acc": 0.8868685350765146,
"train_speed(iter/s)": 0.040902
},
{
"epoch": 1.8091334894613582,
"grad_norm": 0.16477090120315552,
"learning_rate": 7.452367661239433e-06,
"loss": 0.29220216274261473,
"memory(GiB)": 137.67,
"step": 1545,
"token_acc": 0.8877543630965312,
"train_speed(iter/s)": 0.040899
},
{
"epoch": 1.8149882903981265,
"grad_norm": 0.19079044461250305,
"learning_rate": 7.389989399407741e-06,
"loss": 0.3156083106994629,
"memory(GiB)": 137.67,
"step": 1550,
"token_acc": 0.8873283112245697,
"train_speed(iter/s)": 0.040896
},
{
"epoch": 1.8208430913348947,
"grad_norm": 0.1723940074443817,
"learning_rate": 7.3277199294102485e-06,
"loss": 0.30045547485351565,
"memory(GiB)": 137.67,
"step": 1555,
"token_acc": 0.8850201501823112,
"train_speed(iter/s)": 0.040894
},
{
"epoch": 1.8266978922716628,
"grad_norm": 0.18594853579998016,
"learning_rate": 7.265561846795741e-06,
"loss": 0.3101131677627563,
"memory(GiB)": 137.67,
"step": 1560,
"token_acc": 0.8868083283139077,
"train_speed(iter/s)": 0.040889
},
{
"epoch": 1.8325526932084308,
"grad_norm": 0.1757504642009735,
"learning_rate": 7.203517742470101e-06,
"loss": 0.30873966217041016,
"memory(GiB)": 137.67,
"step": 1565,
"token_acc": 0.8949954641669187,
"train_speed(iter/s)": 0.040886
},
{
"epoch": 1.838407494145199,
"grad_norm": 0.2077726572751999,
"learning_rate": 7.141590202588312e-06,
"loss": 0.3127377986907959,
"memory(GiB)": 137.67,
"step": 1570,
"token_acc": 0.888584743745537,
"train_speed(iter/s)": 0.040885
},
{
"epoch": 1.8442622950819674,
"grad_norm": 0.17814461886882782,
"learning_rate": 7.079781808446648e-06,
"loss": 0.31596999168395995,
"memory(GiB)": 137.67,
"step": 1575,
"token_acc": 0.8755756783669405,
"train_speed(iter/s)": 0.040882
},
{
"epoch": 1.8501170960187352,
"grad_norm": 0.16512958705425262,
"learning_rate": 7.018095136375089e-06,
"loss": 0.3012762308120728,
"memory(GiB)": 137.67,
"step": 1580,
"token_acc": 0.8862999993707803,
"train_speed(iter/s)": 0.04088
},
{
"epoch": 1.8559718969555035,
"grad_norm": 0.18698780238628387,
"learning_rate": 6.956532757629945e-06,
"loss": 0.3080646514892578,
"memory(GiB)": 137.67,
"step": 1585,
"token_acc": 0.8861714900322669,
"train_speed(iter/s)": 0.040877
},
{
"epoch": 1.8618266978922717,
"grad_norm": 0.17041386663913727,
"learning_rate": 6.89509723828665e-06,
"loss": 0.3119032382965088,
"memory(GiB)": 137.67,
"step": 1590,
"token_acc": 0.8861256952099799,
"train_speed(iter/s)": 0.040875
},
{
"epoch": 1.8676814988290398,
"grad_norm": 0.18812042474746704,
"learning_rate": 6.833791139132824e-06,
"loss": 0.2984042167663574,
"memory(GiB)": 137.67,
"step": 1595,
"token_acc": 0.8881694299555838,
"train_speed(iter/s)": 0.040871
},
{
"epoch": 1.8735362997658078,
"grad_norm": 0.16663610935211182,
"learning_rate": 6.772617015561529e-06,
"loss": 0.3069270610809326,
"memory(GiB)": 137.67,
"step": 1600,
"token_acc": 0.8785419403265153,
"train_speed(iter/s)": 0.040869
},
{
"epoch": 1.879391100702576,
"grad_norm": 0.16731353104114532,
"learning_rate": 6.7115774174647475e-06,
"loss": 0.29993810653686526,
"memory(GiB)": 137.67,
"step": 1605,
"token_acc": 0.8944355407195264,
"train_speed(iter/s)": 0.040868
},
{
"epoch": 1.8852459016393444,
"grad_norm": 0.18671032786369324,
"learning_rate": 6.6506748891271045e-06,
"loss": 0.3104290723800659,
"memory(GiB)": 137.67,
"step": 1610,
"token_acc": 0.893398089707724,
"train_speed(iter/s)": 0.040866
},
{
"epoch": 1.8911007025761124,
"grad_norm": 0.17069920897483826,
"learning_rate": 6.5899119691198025e-06,
"loss": 0.30440511703491213,
"memory(GiB)": 137.67,
"step": 1615,
"token_acc": 0.8883004841907675,
"train_speed(iter/s)": 0.040865
},
{
"epoch": 1.8969555035128804,
"grad_norm": 0.1704709678888321,
"learning_rate": 6.529291190194829e-06,
"loss": 0.3084626436233521,
"memory(GiB)": 137.67,
"step": 1620,
"token_acc": 0.887373335138147,
"train_speed(iter/s)": 0.040864
},
{
"epoch": 1.9028103044496487,
"grad_norm": 0.1708633005619049,
"learning_rate": 6.468815079179364e-06,
"loss": 0.30423784255981445,
"memory(GiB)": 137.67,
"step": 1625,
"token_acc": 0.8923868074324853,
"train_speed(iter/s)": 0.040862
},
{
"epoch": 1.908665105386417,
"grad_norm": 0.17672830820083618,
"learning_rate": 6.408486156870466e-06,
"loss": 0.31655054092407225,
"memory(GiB)": 137.67,
"step": 1630,
"token_acc": 0.8692423282788768,
"train_speed(iter/s)": 0.04086
},
{
"epoch": 1.914519906323185,
"grad_norm": 0.1735108494758606,
"learning_rate": 6.348306937929991e-06,
"loss": 0.31425652503967283,
"memory(GiB)": 137.67,
"step": 1635,
"token_acc": 0.882395514622517,
"train_speed(iter/s)": 0.04086
},
{
"epoch": 1.920374707259953,
"grad_norm": 0.15910685062408447,
"learning_rate": 6.288279930779789e-06,
"loss": 0.29740355014801023,
"memory(GiB)": 137.67,
"step": 1640,
"token_acc": 0.8963298424379659,
"train_speed(iter/s)": 0.040858
},
{
"epoch": 1.9262295081967213,
"grad_norm": 0.17650458216667175,
"learning_rate": 6.228407637497131e-06,
"loss": 0.30800676345825195,
"memory(GiB)": 137.67,
"step": 1645,
"token_acc": 0.8754677877967858,
"train_speed(iter/s)": 0.040855
},
{
"epoch": 1.9320843091334896,
"grad_norm": 0.16745297610759735,
"learning_rate": 6.1686925537104306e-06,
"loss": 0.2977410316467285,
"memory(GiB)": 137.67,
"step": 1650,
"token_acc": 0.8798736234089867,
"train_speed(iter/s)": 0.040852
},
{
"epoch": 1.9379391100702577,
"grad_norm": 0.1728445142507553,
"learning_rate": 6.109137168495205e-06,
"loss": 0.304546856880188,
"memory(GiB)": 137.67,
"step": 1655,
"token_acc": 0.9005831398969597,
"train_speed(iter/s)": 0.040851
},
{
"epoch": 1.9437939110070257,
"grad_norm": 0.1682547777891159,
"learning_rate": 6.049743964270336e-06,
"loss": 0.3136142730712891,
"memory(GiB)": 137.67,
"step": 1660,
"token_acc": 0.8856946741131322,
"train_speed(iter/s)": 0.040848
},
{
"epoch": 1.949648711943794,
"grad_norm": 0.18915309011936188,
"learning_rate": 5.990515416694591e-06,
"loss": 0.3113490104675293,
"memory(GiB)": 137.67,
"step": 1665,
"token_acc": 0.8886227731406503,
"train_speed(iter/s)": 0.040845
},
{
"epoch": 1.955503512880562,
"grad_norm": 0.18081413209438324,
"learning_rate": 5.931453994563434e-06,
"loss": 0.30602524280548093,
"memory(GiB)": 137.67,
"step": 1670,
"token_acc": 0.8937767328555647,
"train_speed(iter/s)": 0.040844
},
{
"epoch": 1.96135831381733,
"grad_norm": 0.2595233917236328,
"learning_rate": 5.872562159706116e-06,
"loss": 0.309699273109436,
"memory(GiB)": 137.67,
"step": 1675,
"token_acc": 0.883843976093111,
"train_speed(iter/s)": 0.040842
},
{
"epoch": 1.9672131147540983,
"grad_norm": 0.17678314447402954,
"learning_rate": 5.8138423668830605e-06,
"loss": 0.30298714637756347,
"memory(GiB)": 137.67,
"step": 1680,
"token_acc": 0.8865513684995878,
"train_speed(iter/s)": 0.040842
},
{
"epoch": 1.9730679156908666,
"grad_norm": 0.1795545518398285,
"learning_rate": 5.755297063683551e-06,
"loss": 0.30653939247131345,
"memory(GiB)": 137.67,
"step": 1685,
"token_acc": 0.8907540567138181,
"train_speed(iter/s)": 0.040841
},
{
"epoch": 1.9789227166276346,
"grad_norm": 0.17241141200065613,
"learning_rate": 5.696928690423693e-06,
"loss": 0.30241034030914304,
"memory(GiB)": 137.67,
"step": 1690,
"token_acc": 0.8856109987263056,
"train_speed(iter/s)": 0.040841
},
{
"epoch": 1.9847775175644027,
"grad_norm": 0.1767030656337738,
"learning_rate": 5.638739680044718e-06,
"loss": 0.3159188270568848,
"memory(GiB)": 137.67,
"step": 1695,
"token_acc": 0.8789045280418222,
"train_speed(iter/s)": 0.040839
},
{
"epoch": 1.990632318501171,
"grad_norm": 0.1798180490732193,
"learning_rate": 5.580732458011544e-06,
"loss": 0.3054344654083252,
"memory(GiB)": 137.67,
"step": 1700,
"token_acc": 0.8914613695909465,
"train_speed(iter/s)": 0.040837
},
{
"epoch": 1.9964871194379392,
"grad_norm": 0.1673898547887802,
"learning_rate": 5.522909442211708e-06,
"loss": 0.3050167798995972,
"memory(GiB)": 137.67,
"step": 1705,
"token_acc": 0.8836358249226172,
"train_speed(iter/s)": 0.040834
},
{
"epoch": 2.002341920374707,
"grad_norm": 0.24459093809127808,
"learning_rate": 5.465273042854551e-06,
"loss": 0.2896696090698242,
"memory(GiB)": 137.67,
"step": 1710,
"token_acc": 0.8956877534575909,
"train_speed(iter/s)": 0.040723
},
{
"epoch": 2.0081967213114753,
"grad_norm": 0.19826985895633698,
"learning_rate": 5.407825662370778e-06,
"loss": 0.2708754301071167,
"memory(GiB)": 137.67,
"step": 1715,
"token_acc": 0.8993573677984775,
"train_speed(iter/s)": 0.040721
},
{
"epoch": 2.0140515222482436,
"grad_norm": 0.20230858027935028,
"learning_rate": 5.350569695312313e-06,
"loss": 0.27931761741638184,
"memory(GiB)": 137.67,
"step": 1720,
"token_acc": 0.8964727026237073,
"train_speed(iter/s)": 0.040718
},
{
"epoch": 2.019906323185012,
"grad_norm": 0.17940187454223633,
"learning_rate": 5.293507528252474e-06,
"loss": 0.2833970308303833,
"memory(GiB)": 137.67,
"step": 1725,
"token_acc": 0.8971622665586578,
"train_speed(iter/s)": 0.040716
},
{
"epoch": 2.0257611241217797,
"grad_norm": 0.2274295687675476,
"learning_rate": 5.236641539686518e-06,
"loss": 0.2709039211273193,
"memory(GiB)": 137.67,
"step": 1730,
"token_acc": 0.8940215607642851,
"train_speed(iter/s)": 0.040716
},
{
"epoch": 2.031615925058548,
"grad_norm": 0.17937658727169037,
"learning_rate": 5.179974099932472e-06,
"loss": 0.2649374961853027,
"memory(GiB)": 137.67,
"step": 1735,
"token_acc": 0.8949033413934375,
"train_speed(iter/s)": 0.040713
},
{
"epoch": 2.037470725995316,
"grad_norm": 0.1847214251756668,
"learning_rate": 5.12350757103236e-06,
"loss": 0.26505355834960936,
"memory(GiB)": 137.67,
"step": 1740,
"token_acc": 0.8981974914281606,
"train_speed(iter/s)": 0.040712
},
{
"epoch": 2.0433255269320845,
"grad_norm": 0.1737840622663498,
"learning_rate": 5.067244306653736e-06,
"loss": 0.27186686992645265,
"memory(GiB)": 137.67,
"step": 1745,
"token_acc": 0.9053836113307479,
"train_speed(iter/s)": 0.040711
},
{
"epoch": 2.0491803278688523,
"grad_norm": 0.1807735711336136,
"learning_rate": 5.0111866519915575e-06,
"loss": 0.2668013334274292,
"memory(GiB)": 137.67,
"step": 1750,
"token_acc": 0.8954151927308955,
"train_speed(iter/s)": 0.040709
},
{
"epoch": 2.0550351288056206,
"grad_norm": 0.17946134507656097,
"learning_rate": 4.95533694367047e-06,
"loss": 0.26618137359619143,
"memory(GiB)": 137.67,
"step": 1755,
"token_acc": 0.8999696707241193,
"train_speed(iter/s)": 0.040708
},
{
"epoch": 2.060889929742389,
"grad_norm": 0.17995508015155792,
"learning_rate": 4.899697509647379e-06,
"loss": 0.27054500579833984,
"memory(GiB)": 137.67,
"step": 1760,
"token_acc": 0.8920381030958765,
"train_speed(iter/s)": 0.040707
},
{
"epoch": 2.066744730679157,
"grad_norm": 0.22271017730236053,
"learning_rate": 4.844270669114424e-06,
"loss": 0.2727907657623291,
"memory(GiB)": 137.67,
"step": 1765,
"token_acc": 0.9031526316777533,
"train_speed(iter/s)": 0.040706
},
{
"epoch": 2.072599531615925,
"grad_norm": 0.18377523124217987,
"learning_rate": 4.789058732402319e-06,
"loss": 0.26617846488952634,
"memory(GiB)": 137.67,
"step": 1770,
"token_acc": 0.8968159437280188,
"train_speed(iter/s)": 0.040704
},
{
"epoch": 2.078454332552693,
"grad_norm": 0.18358266353607178,
"learning_rate": 4.734064000884044e-06,
"loss": 0.2815399646759033,
"memory(GiB)": 137.67,
"step": 1775,
"token_acc": 0.8860162596527972,
"train_speed(iter/s)": 0.040703
},
{
"epoch": 2.0843091334894615,
"grad_norm": 0.17939767241477966,
"learning_rate": 4.679288766878908e-06,
"loss": 0.2770793914794922,
"memory(GiB)": 137.67,
"step": 1780,
"token_acc": 0.8990350010749907,
"train_speed(iter/s)": 0.0407
},
{
"epoch": 2.0901639344262297,
"grad_norm": 0.18252268433570862,
"learning_rate": 4.624735313557019e-06,
"loss": 0.27314205169677735,
"memory(GiB)": 137.67,
"step": 1785,
"token_acc": 0.9036665729722977,
"train_speed(iter/s)": 0.040699
},
{
"epoch": 2.0960187353629975,
"grad_norm": 0.17692163586616516,
"learning_rate": 4.570405914844105e-06,
"loss": 0.26518521308898924,
"memory(GiB)": 137.67,
"step": 1790,
"token_acc": 0.9007013796506218,
"train_speed(iter/s)": 0.040696
},
{
"epoch": 2.101873536299766,
"grad_norm": 0.1812998205423355,
"learning_rate": 4.516302835326723e-06,
"loss": 0.27246594429016113,
"memory(GiB)": 137.67,
"step": 1795,
"token_acc": 0.9057411329497284,
"train_speed(iter/s)": 0.040694
},
{
"epoch": 2.107728337236534,
"grad_norm": 0.17790301144123077,
"learning_rate": 4.462428330157886e-06,
"loss": 0.2635958671569824,
"memory(GiB)": 137.67,
"step": 1800,
"token_acc": 0.9060071718018364,
"train_speed(iter/s)": 0.040692
},
{
"epoch": 2.113583138173302,
"grad_norm": 0.1772291511297226,
"learning_rate": 4.4087846449630475e-06,
"loss": 0.2673187732696533,
"memory(GiB)": 137.67,
"step": 1805,
"token_acc": 0.902466497498459,
"train_speed(iter/s)": 0.040691
},
{
"epoch": 2.11943793911007,
"grad_norm": 0.1833985149860382,
"learning_rate": 4.355374015746493e-06,
"loss": 0.26436376571655273,
"memory(GiB)": 137.67,
"step": 1810,
"token_acc": 0.8990824248093747,
"train_speed(iter/s)": 0.040688
},
{
"epoch": 2.1252927400468384,
"grad_norm": 0.1888750046491623,
"learning_rate": 4.302198668798159e-06,
"loss": 0.2690884113311768,
"memory(GiB)": 137.67,
"step": 1815,
"token_acc": 0.8948256326325066,
"train_speed(iter/s)": 0.040688
},
{
"epoch": 2.1311475409836067,
"grad_norm": 0.1726667881011963,
"learning_rate": 4.249260820600813e-06,
"loss": 0.2568142175674438,
"memory(GiB)": 137.67,
"step": 1820,
"token_acc": 0.9027062619756462,
"train_speed(iter/s)": 0.040686
},
{
"epoch": 2.1370023419203745,
"grad_norm": 0.18242421746253967,
"learning_rate": 4.1965626777376766e-06,
"loss": 0.26575822830200196,
"memory(GiB)": 137.67,
"step": 1825,
"token_acc": 0.9058191422116245,
"train_speed(iter/s)": 0.040685
},
{
"epoch": 2.142857142857143,
"grad_norm": 0.17865152657032013,
"learning_rate": 4.144106436800453e-06,
"loss": 0.2705830097198486,
"memory(GiB)": 137.67,
"step": 1830,
"token_acc": 0.9064275903781455,
"train_speed(iter/s)": 0.040686
},
{
"epoch": 2.148711943793911,
"grad_norm": 0.1739743947982788,
"learning_rate": 4.091894284297758e-06,
"loss": 0.262749981880188,
"memory(GiB)": 137.67,
"step": 1835,
"token_acc": 0.8932282627390278,
"train_speed(iter/s)": 0.040684
},
{
"epoch": 2.1545667447306793,
"grad_norm": 0.18693114817142487,
"learning_rate": 4.039928396563983e-06,
"loss": 0.27836999893188474,
"memory(GiB)": 137.67,
"step": 1840,
"token_acc": 0.8999278596166879,
"train_speed(iter/s)": 0.040683
},
{
"epoch": 2.160421545667447,
"grad_norm": 0.18225987255573273,
"learning_rate": 3.9882109396685845e-06,
"loss": 0.25630941390991213,
"memory(GiB)": 137.67,
"step": 1845,
"token_acc": 0.8964322481719588,
"train_speed(iter/s)": 0.04068
},
{
"epoch": 2.1662763466042154,
"grad_norm": 0.1680818498134613,
"learning_rate": 3.936744069325797e-06,
"loss": 0.25788373947143556,
"memory(GiB)": 137.67,
"step": 1850,
"token_acc": 0.9047133964952628,
"train_speed(iter/s)": 0.040677
},
{
"epoch": 2.1721311475409837,
"grad_norm": 0.17563344538211823,
"learning_rate": 3.885529930804768e-06,
"loss": 0.2534646987915039,
"memory(GiB)": 137.67,
"step": 1855,
"token_acc": 0.895904841548197,
"train_speed(iter/s)": 0.040675
},
{
"epoch": 2.177985948477752,
"grad_norm": 0.2031351625919342,
"learning_rate": 3.834570658840152e-06,
"loss": 0.2712204933166504,
"memory(GiB)": 137.67,
"step": 1860,
"token_acc": 0.8943131411791787,
"train_speed(iter/s)": 0.040674
},
{
"epoch": 2.1838407494145198,
"grad_norm": 0.1767955720424652,
"learning_rate": 3.7838683775431106e-06,
"loss": 0.26442804336547854,
"memory(GiB)": 137.67,
"step": 1865,
"token_acc": 0.9006802168952266,
"train_speed(iter/s)": 0.040673
},
{
"epoch": 2.189695550351288,
"grad_norm": 0.17129677534103394,
"learning_rate": 3.733425200312797e-06,
"loss": 0.2669063091278076,
"memory(GiB)": 137.67,
"step": 1870,
"token_acc": 0.8917139826542709,
"train_speed(iter/s)": 0.040672
},
{
"epoch": 2.1955503512880563,
"grad_norm": 0.17820899188518524,
"learning_rate": 3.683243229748249e-06,
"loss": 0.2608784198760986,
"memory(GiB)": 137.67,
"step": 1875,
"token_acc": 0.8967133346325762,
"train_speed(iter/s)": 0.04067
},
{
"epoch": 2.201405152224824,
"grad_norm": 0.18119502067565918,
"learning_rate": 3.633324557560747e-06,
"loss": 0.265275239944458,
"memory(GiB)": 137.67,
"step": 1880,
"token_acc": 0.9029575814389501,
"train_speed(iter/s)": 0.040669
},
{
"epoch": 2.2072599531615924,
"grad_norm": 0.17707428336143494,
"learning_rate": 3.5836712644866277e-06,
"loss": 0.2611743450164795,
"memory(GiB)": 137.67,
"step": 1885,
"token_acc": 0.8965409189329774,
"train_speed(iter/s)": 0.040668
},
{
"epoch": 2.2131147540983607,
"grad_norm": 0.1768161803483963,
"learning_rate": 3.5342854202005696e-06,
"loss": 0.26110024452209474,
"memory(GiB)": 137.67,
"step": 1890,
"token_acc": 0.9035024093649873,
"train_speed(iter/s)": 0.040667
},
{
"epoch": 2.218969555035129,
"grad_norm": 0.17210449278354645,
"learning_rate": 3.485169083229293e-06,
"loss": 0.26915616989135743,
"memory(GiB)": 137.67,
"step": 1895,
"token_acc": 0.9061759392893929,
"train_speed(iter/s)": 0.040667
},
{
"epoch": 2.2248243559718968,
"grad_norm": 0.16969619691371918,
"learning_rate": 3.4363243008657842e-06,
"loss": 0.2634119987487793,
"memory(GiB)": 137.67,
"step": 1900,
"token_acc": 0.8916742749773309,
"train_speed(iter/s)": 0.040664
},
{
"epoch": 2.230679156908665,
"grad_norm": 0.17764930427074432,
"learning_rate": 3.3877531090839478e-06,
"loss": 0.2685534000396729,
"memory(GiB)": 137.67,
"step": 1905,
"token_acc": 0.8940042290704804,
"train_speed(iter/s)": 0.040663
},
{
"epoch": 2.2365339578454333,
"grad_norm": 0.17651669681072235,
"learning_rate": 3.3394575324537327e-06,
"loss": 0.27190165519714354,
"memory(GiB)": 137.67,
"step": 1910,
"token_acc": 0.8928626982497402,
"train_speed(iter/s)": 0.04066
},
{
"epoch": 2.2423887587822016,
"grad_norm": 0.16508856415748596,
"learning_rate": 3.2914395840567605e-06,
"loss": 0.2606737852096558,
"memory(GiB)": 137.67,
"step": 1915,
"token_acc": 0.9028335241642236,
"train_speed(iter/s)": 0.040658
},
{
"epoch": 2.2482435597189694,
"grad_norm": 0.16644766926765442,
"learning_rate": 3.2437012654024057e-06,
"loss": 0.2660099983215332,
"memory(GiB)": 137.67,
"step": 1920,
"token_acc": 0.9046304613618784,
"train_speed(iter/s)": 0.040656
},
{
"epoch": 2.2540983606557377,
"grad_norm": 0.16391952335834503,
"learning_rate": 3.1962445663443643e-06,
"loss": 0.2678091287612915,
"memory(GiB)": 137.67,
"step": 1925,
"token_acc": 0.8979980130091664,
"train_speed(iter/s)": 0.040653
},
{
"epoch": 2.259953161592506,
"grad_norm": 0.1803101897239685,
"learning_rate": 3.1490714649977196e-06,
"loss": 0.27110137939453127,
"memory(GiB)": 137.67,
"step": 1930,
"token_acc": 0.905863734174048,
"train_speed(iter/s)": 0.04065
},
{
"epoch": 2.265807962529274,
"grad_norm": 0.17323030531406403,
"learning_rate": 3.102183927656488e-06,
"loss": 0.26174540519714357,
"memory(GiB)": 137.67,
"step": 1935,
"token_acc": 0.8902694797112273,
"train_speed(iter/s)": 0.040649
},
{
"epoch": 2.271662763466042,
"grad_norm": 0.18379603326320648,
"learning_rate": 3.0555839087116547e-06,
"loss": 0.27245678901672366,
"memory(GiB)": 137.67,
"step": 1940,
"token_acc": 0.90194375,
"train_speed(iter/s)": 0.040648
},
{
"epoch": 2.2775175644028103,
"grad_norm": 0.1765807718038559,
"learning_rate": 3.009273350569705e-06,
"loss": 0.2700004816055298,
"memory(GiB)": 137.67,
"step": 1945,
"token_acc": 0.9060629034421867,
"train_speed(iter/s)": 0.040648
},
{
"epoch": 2.2833723653395785,
"grad_norm": 0.17609137296676636,
"learning_rate": 2.963254183571682e-06,
"loss": 0.2663255214691162,
"memory(GiB)": 137.67,
"step": 1950,
"token_acc": 0.9028553183442811,
"train_speed(iter/s)": 0.040646
},
{
"epoch": 2.289227166276347,
"grad_norm": 0.1761084645986557,
"learning_rate": 2.9175283259126943e-06,
"loss": 0.2662710428237915,
"memory(GiB)": 137.67,
"step": 1955,
"token_acc": 0.9068832885430957,
"train_speed(iter/s)": 0.040645
},
{
"epoch": 2.2950819672131146,
"grad_norm": 0.16875940561294556,
"learning_rate": 2.872097683561986e-06,
"loss": 0.2650928497314453,
"memory(GiB)": 137.67,
"step": 1960,
"token_acc": 0.9107070141504632,
"train_speed(iter/s)": 0.040644
},
{
"epoch": 2.300936768149883,
"grad_norm": 0.18349847197532654,
"learning_rate": 2.8269641501834834e-06,
"loss": 0.2731610298156738,
"memory(GiB)": 137.67,
"step": 1965,
"token_acc": 0.8929668563025367,
"train_speed(iter/s)": 0.040644
},
{
"epoch": 2.306791569086651,
"grad_norm": 0.17049305140972137,
"learning_rate": 2.782129607056848e-06,
"loss": 0.2668560028076172,
"memory(GiB)": 137.67,
"step": 1970,
"token_acc": 0.8946301039908395,
"train_speed(iter/s)": 0.040643
},
{
"epoch": 2.312646370023419,
"grad_norm": 0.17511935532093048,
"learning_rate": 2.7375959229990856e-06,
"loss": 0.25858211517333984,
"memory(GiB)": 137.67,
"step": 1975,
"token_acc": 0.9011111249984377,
"train_speed(iter/s)": 0.040641
},
{
"epoch": 2.3185011709601873,
"grad_norm": 0.16913901269435883,
"learning_rate": 2.6933649542866326e-06,
"loss": 0.2623398780822754,
"memory(GiB)": 137.67,
"step": 1980,
"token_acc": 0.8980817363368075,
"train_speed(iter/s)": 0.04064
},
{
"epoch": 2.3243559718969555,
"grad_norm": 0.16392305493354797,
"learning_rate": 2.649438544577977e-06,
"loss": 0.25210521221160886,
"memory(GiB)": 137.67,
"step": 1985,
"token_acc": 0.9006790772077851,
"train_speed(iter/s)": 0.040639
},
{
"epoch": 2.330210772833724,
"grad_norm": 0.16555212438106537,
"learning_rate": 2.6058185248368317e-06,
"loss": 0.26413559913635254,
"memory(GiB)": 137.67,
"step": 1990,
"token_acc": 0.9057566877776727,
"train_speed(iter/s)": 0.040637
},
{
"epoch": 2.3360655737704916,
"grad_norm": 0.17122185230255127,
"learning_rate": 2.562506713255789e-06,
"loss": 0.2596926689147949,
"memory(GiB)": 137.67,
"step": 1995,
"token_acc": 0.9047409789878514,
"train_speed(iter/s)": 0.040636
},
{
"epoch": 2.34192037470726,
"grad_norm": 0.17818881571292877,
"learning_rate": 2.519504915180555e-06,
"loss": 0.2623495101928711,
"memory(GiB)": 137.67,
"step": 2000,
"token_acc": 0.9031698814490531,
"train_speed(iter/s)": 0.040635
},
{
"epoch": 2.347775175644028,
"grad_norm": 0.17120912671089172,
"learning_rate": 2.4768149230346917e-06,
"loss": 0.2763922929763794,
"memory(GiB)": 137.67,
"step": 2005,
"token_acc": 0.90147262555157,
"train_speed(iter/s)": 0.040633
},
{
"epoch": 2.3536299765807964,
"grad_norm": 0.1725643426179886,
"learning_rate": 2.4344385162448924e-06,
"loss": 0.26347975730895995,
"memory(GiB)": 137.67,
"step": 2010,
"token_acc": 0.9056239470479484,
"train_speed(iter/s)": 0.040632
},
{
"epoch": 2.3594847775175642,
"grad_norm": 0.17098568379878998,
"learning_rate": 2.392377461166826e-06,
"loss": 0.26201567649841306,
"memory(GiB)": 137.67,
"step": 2015,
"token_acc": 0.9030459083951856,
"train_speed(iter/s)": 0.040631
},
{
"epoch": 2.3653395784543325,
"grad_norm": 0.17561163008213043,
"learning_rate": 2.350633511011511e-06,
"loss": 0.26811957359313965,
"memory(GiB)": 137.67,
"step": 2020,
"token_acc": 0.8995977151723318,
"train_speed(iter/s)": 0.040628
},
{
"epoch": 2.371194379391101,
"grad_norm": 0.1689569056034088,
"learning_rate": 2.309208405772221e-06,
"loss": 0.2759255409240723,
"memory(GiB)": 137.67,
"step": 2025,
"token_acc": 0.9044138910892334,
"train_speed(iter/s)": 0.040628
},
{
"epoch": 2.3770491803278686,
"grad_norm": 0.26568159461021423,
"learning_rate": 2.2681038721519768e-06,
"loss": 0.2785911560058594,
"memory(GiB)": 137.67,
"step": 2030,
"token_acc": 0.8982950398323113,
"train_speed(iter/s)": 0.040625
},
{
"epoch": 2.382903981264637,
"grad_norm": 0.18388140201568604,
"learning_rate": 2.227321623491563e-06,
"loss": 0.26940011978149414,
"memory(GiB)": 137.67,
"step": 2035,
"token_acc": 0.8968315203642803,
"train_speed(iter/s)": 0.040624
},
{
"epoch": 2.388758782201405,
"grad_norm": 0.16938382387161255,
"learning_rate": 2.186863359698108e-06,
"loss": 0.26633501052856445,
"memory(GiB)": 137.67,
"step": 2040,
"token_acc": 0.9180211235459854,
"train_speed(iter/s)": 0.040622
},
{
"epoch": 2.3946135831381734,
"grad_norm": 0.17878937721252441,
"learning_rate": 2.1467307671742377e-06,
"loss": 0.2687513828277588,
"memory(GiB)": 137.67,
"step": 2045,
"token_acc": 0.8974434682640148,
"train_speed(iter/s)": 0.040621
},
{
"epoch": 2.4004683840749417,
"grad_norm": 0.1779458373785019,
"learning_rate": 2.106925518747779e-06,
"loss": 0.26202917098999023,
"memory(GiB)": 137.67,
"step": 2050,
"token_acc": 0.9011938413047829,
"train_speed(iter/s)": 0.04062
},
{
"epoch": 2.4063231850117095,
"grad_norm": 0.17342902719974518,
"learning_rate": 2.06744927360202e-06,
"loss": 0.26468615531921386,
"memory(GiB)": 137.67,
"step": 2055,
"token_acc": 0.8999491938022672,
"train_speed(iter/s)": 0.040617
},
{
"epoch": 2.4121779859484778,
"grad_norm": 0.17159196734428406,
"learning_rate": 2.0283036772065712e-06,
"loss": 0.26631085872650145,
"memory(GiB)": 137.67,
"step": 2060,
"token_acc": 0.904679059271446,
"train_speed(iter/s)": 0.040615
},
{
"epoch": 2.418032786885246,
"grad_norm": 0.19288575649261475,
"learning_rate": 1.9894903612487683e-06,
"loss": 0.2730381488800049,
"memory(GiB)": 137.67,
"step": 2065,
"token_acc": 0.8923981017844846,
"train_speed(iter/s)": 0.040614
},
{
"epoch": 2.423887587822014,
"grad_norm": 0.17374974489212036,
"learning_rate": 1.9510109435656457e-06,
"loss": 0.27329106330871583,
"memory(GiB)": 137.67,
"step": 2070,
"token_acc": 0.9024526900268184,
"train_speed(iter/s)": 0.040613
},
{
"epoch": 2.429742388758782,
"grad_norm": 0.1817113608121872,
"learning_rate": 1.9128670280765283e-06,
"loss": 0.27490620613098143,
"memory(GiB)": 137.67,
"step": 2075,
"token_acc": 0.8959030374086766,
"train_speed(iter/s)": 0.040611
},
{
"epoch": 2.4355971896955504,
"grad_norm": 0.17148195207118988,
"learning_rate": 1.8750602047161603e-06,
"loss": 0.26430578231811525,
"memory(GiB)": 137.67,
"step": 2080,
"token_acc": 0.9074351491670378,
"train_speed(iter/s)": 0.040609
},
{
"epoch": 2.4414519906323187,
"grad_norm": 0.1715674251317978,
"learning_rate": 1.8375920493684264e-06,
"loss": 0.2722649574279785,
"memory(GiB)": 137.67,
"step": 2085,
"token_acc": 0.8960112888052681,
"train_speed(iter/s)": 0.040609
},
{
"epoch": 2.4473067915690865,
"grad_norm": 0.1820991337299347,
"learning_rate": 1.8004641238006815e-06,
"loss": 0.2675884485244751,
"memory(GiB)": 137.67,
"step": 2090,
"token_acc": 0.9040590405904059,
"train_speed(iter/s)": 0.040607
},
{
"epoch": 2.4531615925058547,
"grad_norm": 0.1691906452178955,
"learning_rate": 1.7636779755986443e-06,
"loss": 0.2732096195220947,
"memory(GiB)": 137.67,
"step": 2095,
"token_acc": 0.8958253626778894,
"train_speed(iter/s)": 0.040605
},
{
"epoch": 2.459016393442623,
"grad_norm": 0.17061816155910492,
"learning_rate": 1.7272351381018792e-06,
"loss": 0.2712996482849121,
"memory(GiB)": 137.67,
"step": 2100,
"token_acc": 0.8880485387880261,
"train_speed(iter/s)": 0.040603
},
{
"epoch": 2.4648711943793913,
"grad_norm": 0.17594653367996216,
"learning_rate": 1.6911371303399048e-06,
"loss": 0.2586531162261963,
"memory(GiB)": 137.67,
"step": 2105,
"token_acc": 0.9022650028060307,
"train_speed(iter/s)": 0.0406
},
{
"epoch": 2.470725995316159,
"grad_norm": 0.18380020558834076,
"learning_rate": 1.6553854569688632e-06,
"loss": 0.2727813720703125,
"memory(GiB)": 137.67,
"step": 2110,
"token_acc": 0.8974262645615947,
"train_speed(iter/s)": 0.040598
},
{
"epoch": 2.4765807962529274,
"grad_norm": 0.16742826998233795,
"learning_rate": 1.619981608208796e-06,
"loss": 0.2734941244125366,
"memory(GiB)": 137.67,
"step": 2115,
"token_acc": 0.8847918638392509,
"train_speed(iter/s)": 0.040597
},
{
"epoch": 2.4824355971896956,
"grad_norm": 0.17516812682151794,
"learning_rate": 1.584927059781548e-06,
"loss": 0.2728161334991455,
"memory(GiB)": 137.67,
"step": 2120,
"token_acc": 0.8936656628114019,
"train_speed(iter/s)": 0.040595
},
{
"epoch": 2.4882903981264635,
"grad_norm": 0.17867887020111084,
"learning_rate": 1.5502232728492362e-06,
"loss": 0.264336085319519,
"memory(GiB)": 137.67,
"step": 2125,
"token_acc": 0.9031589138208336,
"train_speed(iter/s)": 0.040594
},
{
"epoch": 2.4941451990632317,
"grad_norm": 0.17173421382904053,
"learning_rate": 1.5158716939533524e-06,
"loss": 0.27242002487182615,
"memory(GiB)": 137.67,
"step": 2130,
"token_acc": 0.8990930988723483,
"train_speed(iter/s)": 0.040593
},
{
"epoch": 2.5,
"grad_norm": 0.1708640456199646,
"learning_rate": 1.4818737549544725e-06,
"loss": 0.27319111824035647,
"memory(GiB)": 137.67,
"step": 2135,
"token_acc": 0.8916305799253722,
"train_speed(iter/s)": 0.040593
},
{
"epoch": 2.5058548009367683,
"grad_norm": 0.17307148873806,
"learning_rate": 1.448230872972568e-06,
"loss": 0.2695432424545288,
"memory(GiB)": 137.67,
"step": 2140,
"token_acc": 0.905385863209386,
"train_speed(iter/s)": 0.04059
},
{
"epoch": 2.5117096018735365,
"grad_norm": 0.17106083035469055,
"learning_rate": 1.4149444503279297e-06,
"loss": 0.27602252960205076,
"memory(GiB)": 137.67,
"step": 2145,
"token_acc": 0.8923752322136868,
"train_speed(iter/s)": 0.040589
},
{
"epoch": 2.5175644028103044,
"grad_norm": 0.17844541370868683,
"learning_rate": 1.382015874482735e-06,
"loss": 0.2688480615615845,
"memory(GiB)": 137.67,
"step": 2150,
"token_acc": 0.8990480241183902,
"train_speed(iter/s)": 0.040588
},
{
"epoch": 2.5234192037470726,
"grad_norm": 0.17703387141227722,
"learning_rate": 1.3494465179831895e-06,
"loss": 0.26667649745941163,
"memory(GiB)": 137.67,
"step": 2155,
"token_acc": 0.8957748501946923,
"train_speed(iter/s)": 0.040587
},
{
"epoch": 2.529274004683841,
"grad_norm": 0.1624777913093567,
"learning_rate": 1.3172377384023393e-06,
"loss": 0.26247563362121584,
"memory(GiB)": 137.67,
"step": 2160,
"token_acc": 0.9005889918174871,
"train_speed(iter/s)": 0.040586
},
{
"epoch": 2.5351288056206087,
"grad_norm": 0.17209553718566895,
"learning_rate": 1.2853908782834722e-06,
"loss": 0.2671672821044922,
"memory(GiB)": 137.67,
"step": 2165,
"token_acc": 0.9070119235517494,
"train_speed(iter/s)": 0.040583
},
{
"epoch": 2.540983606557377,
"grad_norm": 0.17611093819141388,
"learning_rate": 1.2539072650841523e-06,
"loss": 0.2725430250167847,
"memory(GiB)": 137.67,
"step": 2170,
"token_acc": 0.8966264886593998,
"train_speed(iter/s)": 0.040583
},
{
"epoch": 2.5468384074941453,
"grad_norm": 0.1783149093389511,
"learning_rate": 1.2227882111209011e-06,
"loss": 0.27568228244781495,
"memory(GiB)": 137.67,
"step": 2175,
"token_acc": 0.8947381499658321,
"train_speed(iter/s)": 0.040582
},
{
"epoch": 2.552693208430913,
"grad_norm": 0.17337878048419952,
"learning_rate": 1.1920350135144898e-06,
"loss": 0.269814133644104,
"memory(GiB)": 137.67,
"step": 2180,
"token_acc": 0.9041164343092762,
"train_speed(iter/s)": 0.040581
},
{
"epoch": 2.5585480093676813,
"grad_norm": 0.16845281422138214,
"learning_rate": 1.1616489541358678e-06,
"loss": 0.26679143905639646,
"memory(GiB)": 137.67,
"step": 2185,
"token_acc": 0.8939169722162003,
"train_speed(iter/s)": 0.04058
},
{
"epoch": 2.5644028103044496,
"grad_norm": 0.17022623121738434,
"learning_rate": 1.1316312995527424e-06,
"loss": 0.2700947761535645,
"memory(GiB)": 137.67,
"step": 2190,
"token_acc": 0.8979253112033195,
"train_speed(iter/s)": 0.040579
},
{
"epoch": 2.570257611241218,
"grad_norm": 0.16687875986099243,
"learning_rate": 1.1019833009767744e-06,
"loss": 0.268681001663208,
"memory(GiB)": 137.67,
"step": 2195,
"token_acc": 0.8966215038230679,
"train_speed(iter/s)": 0.040578
},
{
"epoch": 2.576112412177986,
"grad_norm": 0.17770424485206604,
"learning_rate": 1.072706194211426e-06,
"loss": 0.27028694152832033,
"memory(GiB)": 137.67,
"step": 2200,
"token_acc": 0.9029025046417339,
"train_speed(iter/s)": 0.040577
},
{
"epoch": 2.581967213114754,
"grad_norm": 0.17755696177482605,
"learning_rate": 1.0438011996004581e-06,
"loss": 0.269865894317627,
"memory(GiB)": 137.67,
"step": 2205,
"token_acc": 0.8967394005666286,
"train_speed(iter/s)": 0.040575
},
{
"epoch": 2.5878220140515222,
"grad_norm": 0.17752693593502045,
"learning_rate": 1.0152695219770558e-06,
"loss": 0.257364559173584,
"memory(GiB)": 137.67,
"step": 2210,
"token_acc": 0.9068669110660224,
"train_speed(iter/s)": 0.040573
},
{
"epoch": 2.5936768149882905,
"grad_norm": 0.16752499341964722,
"learning_rate": 9.871123506136037e-07,
"loss": 0.2638521194458008,
"memory(GiB)": 137.67,
"step": 2215,
"token_acc": 0.9083980061833554,
"train_speed(iter/s)": 0.040572
},
{
"epoch": 2.5995316159250583,
"grad_norm": 0.16032443940639496,
"learning_rate": 9.593308591721274e-07,
"loss": 0.2622210025787354,
"memory(GiB)": 137.67,
"step": 2220,
"token_acc": 0.900316748757648,
"train_speed(iter/s)": 0.040571
},
{
"epoch": 2.6053864168618266,
"grad_norm": 0.17415659129619598,
"learning_rate": 9.319262056553602e-07,
"loss": 0.2700244903564453,
"memory(GiB)": 137.67,
"step": 2225,
"token_acc": 0.9051188644286028,
"train_speed(iter/s)": 0.040569
},
{
"epoch": 2.611241217798595,
"grad_norm": 0.1722276359796524,
"learning_rate": 9.048995323584764e-07,
"loss": 0.2738530397415161,
"memory(GiB)": 137.67,
"step": 2230,
"token_acc": 0.9079698943901274,
"train_speed(iter/s)": 0.040568
},
{
"epoch": 2.617096018735363,
"grad_norm": 0.17455357313156128,
"learning_rate": 8.78251965821485e-07,
"loss": 0.25915350914001467,
"memory(GiB)": 137.67,
"step": 2235,
"token_acc": 0.9004381754945836,
"train_speed(iter/s)": 0.040566
},
{
"epoch": 2.6229508196721314,
"grad_norm": 0.17298012971878052,
"learning_rate": 8.519846167822665e-07,
"loss": 0.2638465404510498,
"memory(GiB)": 137.67,
"step": 2240,
"token_acc": 0.9118884831119326,
"train_speed(iter/s)": 0.040565
},
{
"epoch": 2.628805620608899,
"grad_norm": 0.1699805110692978,
"learning_rate": 8.260985801302734e-07,
"loss": 0.25593223571777346,
"memory(GiB)": 137.67,
"step": 2245,
"token_acc": 0.8991087959330969,
"train_speed(iter/s)": 0.040565
},
{
"epoch": 2.6346604215456675,
"grad_norm": 0.1722072809934616,
"learning_rate": 8.005949348608977e-07,
"loss": 0.2674243927001953,
"memory(GiB)": 137.67,
"step": 2250,
"token_acc": 0.8965253065997911,
"train_speed(iter/s)": 0.040563
},
{
"epoch": 2.6405152224824358,
"grad_norm": 0.1668199747800827,
"learning_rate": 7.754747440304911e-07,
"loss": 0.27177164554595945,
"memory(GiB)": 137.67,
"step": 2255,
"token_acc": 0.8954008941320247,
"train_speed(iter/s)": 0.040563
},
{
"epoch": 2.6463700234192036,
"grad_norm": 0.16813580691814423,
"learning_rate": 7.507390547120541e-07,
"loss": 0.2651193857192993,
"memory(GiB)": 137.67,
"step": 2260,
"token_acc": 0.8984925665335315,
"train_speed(iter/s)": 0.040562
},
{
"epoch": 2.652224824355972,
"grad_norm": 0.17678076028823853,
"learning_rate": 7.263888979515954e-07,
"loss": 0.27275819778442384,
"memory(GiB)": 137.67,
"step": 2265,
"token_acc": 0.8936288874184706,
"train_speed(iter/s)": 0.040562
},
{
"epoch": 2.65807962529274,
"grad_norm": 0.16264022886753082,
"learning_rate": 7.024252887251548e-07,
"loss": 0.2669191360473633,
"memory(GiB)": 137.67,
"step": 2270,
"token_acc": 0.8972385552618926,
"train_speed(iter/s)": 0.04056
},
{
"epoch": 2.663934426229508,
"grad_norm": 0.1690565049648285,
"learning_rate": 6.788492258964896e-07,
"loss": 0.2695984125137329,
"memory(GiB)": 137.67,
"step": 2275,
"token_acc": 0.8963350061434133,
"train_speed(iter/s)": 0.040559
},
{
"epoch": 2.669789227166276,
"grad_norm": 0.1730775386095047,
"learning_rate": 6.556616921754489e-07,
"loss": 0.26709651947021484,
"memory(GiB)": 137.67,
"step": 2280,
"token_acc": 0.9004803898235022,
"train_speed(iter/s)": 0.040558
},
{
"epoch": 2.6756440281030445,
"grad_norm": 0.1701081544160843,
"learning_rate": 6.328636540770028e-07,
"loss": 0.26933286190032957,
"memory(GiB)": 137.67,
"step": 2285,
"token_acc": 0.898853457766213,
"train_speed(iter/s)": 0.040557
},
{
"epoch": 2.6814988290398127,
"grad_norm": 0.19118832051753998,
"learning_rate": 6.10456061880963e-07,
"loss": 0.2741654396057129,
"memory(GiB)": 137.67,
"step": 2290,
"token_acc": 0.9025216185680262,
"train_speed(iter/s)": 0.040556
},
{
"epoch": 2.687353629976581,
"grad_norm": 0.17062994837760925,
"learning_rate": 5.884398495923727e-07,
"loss": 0.2640299558639526,
"memory(GiB)": 137.67,
"step": 2295,
"token_acc": 0.8934425971755339,
"train_speed(iter/s)": 0.040556
},
{
"epoch": 2.693208430913349,
"grad_norm": 0.18749327957630157,
"learning_rate": 5.668159349025649e-07,
"loss": 0.2795866966247559,
"memory(GiB)": 137.67,
"step": 2300,
"token_acc": 0.8874596974206349,
"train_speed(iter/s)": 0.040554
},
{
"epoch": 2.699063231850117,
"grad_norm": 0.1760568916797638,
"learning_rate": 5.455852191509214e-07,
"loss": 0.27616961002349855,
"memory(GiB)": 137.67,
"step": 2305,
"token_acc": 0.8910418230197176,
"train_speed(iter/s)": 0.040553
},
{
"epoch": 2.7049180327868854,
"grad_norm": 0.1760990172624588,
"learning_rate": 5.247485872873026e-07,
"loss": 0.26389687061309813,
"memory(GiB)": 137.67,
"step": 2310,
"token_acc": 0.9032378371322547,
"train_speed(iter/s)": 0.040552
},
{
"epoch": 2.710772833723653,
"grad_norm": 0.16184002161026,
"learning_rate": 5.043069078351526e-07,
"loss": 0.2583066463470459,
"memory(GiB)": 137.67,
"step": 2315,
"token_acc": 0.9048499210110584,
"train_speed(iter/s)": 0.040551
},
{
"epoch": 2.7166276346604215,
"grad_norm": 0.16953077912330627,
"learning_rate": 4.842610328552999e-07,
"loss": 0.26470949649810793,
"memory(GiB)": 137.67,
"step": 2320,
"token_acc": 0.9023021945368386,
"train_speed(iter/s)": 0.04055
},
{
"epoch": 2.7224824355971897,
"grad_norm": 0.16833004355430603,
"learning_rate": 4.6461179791044806e-07,
"loss": 0.26623120307922366,
"memory(GiB)": 137.67,
"step": 2325,
"token_acc": 0.895680773698298,
"train_speed(iter/s)": 0.04055
},
{
"epoch": 2.728337236533958,
"grad_norm": 0.1694810837507248,
"learning_rate": 4.453600220303378e-07,
"loss": 0.25267777442932127,
"memory(GiB)": 137.67,
"step": 2330,
"token_acc": 0.8968080577917444,
"train_speed(iter/s)": 0.04055
},
{
"epoch": 2.7341920374707263,
"grad_norm": 0.18032941222190857,
"learning_rate": 4.2650650767761535e-07,
"loss": 0.25408167839050294,
"memory(GiB)": 137.67,
"step": 2335,
"token_acc": 0.9085095809749435,
"train_speed(iter/s)": 0.040549
},
{
"epoch": 2.740046838407494,
"grad_norm": 0.18011276423931122,
"learning_rate": 4.0805204071437953e-07,
"loss": 0.27644264698028564,
"memory(GiB)": 137.67,
"step": 2340,
"token_acc": 0.8965790537297598,
"train_speed(iter/s)": 0.040547
},
{
"epoch": 2.7459016393442623,
"grad_norm": 0.16562311351299286,
"learning_rate": 3.899973903694243e-07,
"loss": 0.26986749172210694,
"memory(GiB)": 137.67,
"step": 2345,
"token_acc": 0.9012060017454879,
"train_speed(iter/s)": 0.040546
},
{
"epoch": 2.7517564402810306,
"grad_norm": 0.17436754703521729,
"learning_rate": 3.72343309206179e-07,
"loss": 0.26195201873779295,
"memory(GiB)": 137.67,
"step": 2350,
"token_acc": 0.9009433222876742,
"train_speed(iter/s)": 0.040545
},
{
"epoch": 2.7576112412177984,
"grad_norm": 0.1674078106880188,
"learning_rate": 3.55090533091339e-07,
"loss": 0.26260790824890134,
"memory(GiB)": 137.67,
"step": 2355,
"token_acc": 0.9115999937809979,
"train_speed(iter/s)": 0.040543
},
{
"epoch": 2.7634660421545667,
"grad_norm": 0.1657068282365799,
"learning_rate": 3.382397811641858e-07,
"loss": 0.25954129695892336,
"memory(GiB)": 137.67,
"step": 2360,
"token_acc": 0.9021908567865544,
"train_speed(iter/s)": 0.040543
},
{
"epoch": 2.769320843091335,
"grad_norm": 0.167274609208107,
"learning_rate": 3.217917558066241e-07,
"loss": 0.262769889831543,
"memory(GiB)": 137.67,
"step": 2365,
"token_acc": 0.8952377080453587,
"train_speed(iter/s)": 0.040542
},
{
"epoch": 2.775175644028103,
"grad_norm": 0.16418085992336273,
"learning_rate": 3.057471426138958e-07,
"loss": 0.2759857654571533,
"memory(GiB)": 137.67,
"step": 2370,
"token_acc": 0.8904371253200432,
"train_speed(iter/s)": 0.04054
},
{
"epoch": 2.781030444964871,
"grad_norm": 0.16312485933303833,
"learning_rate": 2.901066103660033e-07,
"loss": 0.26541569232940676,
"memory(GiB)": 137.67,
"step": 2375,
"token_acc": 0.9018337335217314,
"train_speed(iter/s)": 0.04054
},
{
"epoch": 2.7868852459016393,
"grad_norm": 0.17677490413188934,
"learning_rate": 2.7487081099983435e-07,
"loss": 0.27631726264953616,
"memory(GiB)": 137.67,
"step": 2380,
"token_acc": 0.9002755878263168,
"train_speed(iter/s)": 0.040539
},
{
"epoch": 2.7927400468384076,
"grad_norm": 0.1672162115573883,
"learning_rate": 2.6004037958199167e-07,
"loss": 0.26006388664245605,
"memory(GiB)": 137.67,
"step": 2385,
"token_acc": 0.910639127168484,
"train_speed(iter/s)": 0.040538
},
{
"epoch": 2.798594847775176,
"grad_norm": 0.1678304672241211,
"learning_rate": 2.4561593428231165e-07,
"loss": 0.26682395935058595,
"memory(GiB)": 137.67,
"step": 2390,
"token_acc": 0.91889434727678,
"train_speed(iter/s)": 0.040535
},
{
"epoch": 2.8044496487119437,
"grad_norm": 0.16077911853790283,
"learning_rate": 2.3159807634811182e-07,
"loss": 0.2570212364196777,
"memory(GiB)": 137.67,
"step": 2395,
"token_acc": 0.9051587858378934,
"train_speed(iter/s)": 0.040535
},
{
"epoch": 2.810304449648712,
"grad_norm": 0.16872599720954895,
"learning_rate": 2.1798739007911517e-07,
"loss": 0.27098655700683594,
"memory(GiB)": 137.67,
"step": 2400,
"token_acc": 0.8959861646097005,
"train_speed(iter/s)": 0.040533
},
{
"epoch": 2.8161592505854802,
"grad_norm": 0.16125863790512085,
"learning_rate": 2.0478444280310206e-07,
"loss": 0.26554141044616697,
"memory(GiB)": 137.67,
"step": 2405,
"token_acc": 0.8993798050995196,
"train_speed(iter/s)": 0.040533
},
{
"epoch": 2.822014051522248,
"grad_norm": 0.19162511825561523,
"learning_rate": 1.919897848522656e-07,
"loss": 0.26296229362487794,
"memory(GiB)": 137.67,
"step": 2410,
"token_acc": 0.8993982865613145,
"train_speed(iter/s)": 0.040532
},
{
"epoch": 2.8278688524590163,
"grad_norm": 0.20407338440418243,
"learning_rate": 1.796039495402646e-07,
"loss": 0.26827549934387207,
"memory(GiB)": 137.67,
"step": 2415,
"token_acc": 0.9050311652650377,
"train_speed(iter/s)": 0.04053
},
{
"epoch": 2.8337236533957846,
"grad_norm": 0.17013327777385712,
"learning_rate": 1.6762745313999795e-07,
"loss": 0.2727066516876221,
"memory(GiB)": 137.67,
"step": 2420,
"token_acc": 0.8865242476220178,
"train_speed(iter/s)": 0.040529
},
{
"epoch": 2.839578454332553,
"grad_norm": 0.1698453575372696,
"learning_rate": 1.5606079486208846e-07,
"loss": 0.2641671895980835,
"memory(GiB)": 137.67,
"step": 2425,
"token_acc": 0.9000177898735047,
"train_speed(iter/s)": 0.040529
},
{
"epoch": 2.845433255269321,
"grad_norm": 0.17142532765865326,
"learning_rate": 1.449044568340663e-07,
"loss": 0.2717731952667236,
"memory(GiB)": 137.67,
"step": 2430,
"token_acc": 0.9031580860350494,
"train_speed(iter/s)": 0.040528
},
{
"epoch": 2.851288056206089,
"grad_norm": 0.1803494244813919,
"learning_rate": 1.3415890408027932e-07,
"loss": 0.26016151905059814,
"memory(GiB)": 137.67,
"step": 2435,
"token_acc": 0.9004292620366133,
"train_speed(iter/s)": 0.040526
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.17327673733234406,
"learning_rate": 1.2382458450250657e-07,
"loss": 0.2739871025085449,
"memory(GiB)": 137.67,
"step": 2440,
"token_acc": 0.8937226907040563,
"train_speed(iter/s)": 0.040526
},
{
"epoch": 2.8629976580796255,
"grad_norm": 0.1648455113172531,
"learning_rate": 1.1390192886129304e-07,
"loss": 0.26163692474365235,
"memory(GiB)": 137.67,
"step": 2445,
"token_acc": 0.9109708459314515,
"train_speed(iter/s)": 0.040525
},
{
"epoch": 2.8688524590163933,
"grad_norm": 0.17209313809871674,
"learning_rate": 1.0439135075798634e-07,
"loss": 0.2778321266174316,
"memory(GiB)": 137.67,
"step": 2450,
"token_acc": 0.8971170667512587,
"train_speed(iter/s)": 0.040525
},
{
"epoch": 2.8747072599531616,
"grad_norm": 0.16632598638534546,
"learning_rate": 9.529324661750494e-08,
"loss": 0.2714024305343628,
"memory(GiB)": 137.67,
"step": 2455,
"token_acc": 0.8926179928835372,
"train_speed(iter/s)": 0.040524
},
{
"epoch": 2.88056206088993,
"grad_norm": 0.17401184141635895,
"learning_rate": 8.6607995671808e-08,
"loss": 0.2663599967956543,
"memory(GiB)": 137.67,
"step": 2460,
"token_acc": 0.8979368591641474,
"train_speed(iter/s)": 0.040523
},
{
"epoch": 2.8864168618266977,
"grad_norm": 0.17087528109550476,
"learning_rate": 7.833595994409248e-08,
"loss": 0.2583767414093018,
"memory(GiB)": 137.67,
"step": 2465,
"token_acc": 0.8988238974038161,
"train_speed(iter/s)": 0.040522
},
{
"epoch": 2.892271662763466,
"grad_norm": 0.17502275109291077,
"learning_rate": 7.047748423370193e-08,
"loss": 0.27132668495178225,
"memory(GiB)": 137.67,
"step": 2470,
"token_acc": 0.8950027089407572,
"train_speed(iter/s)": 0.040522
},
{
"epoch": 2.898126463700234,
"grad_norm": 0.16457100212574005,
"learning_rate": 6.303289610175233e-08,
"loss": 0.262396240234375,
"memory(GiB)": 137.67,
"step": 2475,
"token_acc": 0.9005705329153605,
"train_speed(iter/s)": 0.040522
},
{
"epoch": 2.9039812646370025,
"grad_norm": 0.17186148464679718,
"learning_rate": 5.6002505857480906e-08,
"loss": 0.2651688098907471,
"memory(GiB)": 137.67,
"step": 2480,
"token_acc": 0.903142540689707,
"train_speed(iter/s)": 0.040521
},
{
"epoch": 2.9098360655737707,
"grad_norm": 0.16921843588352203,
"learning_rate": 4.938660654530969e-08,
"loss": 0.27781147956848146,
"memory(GiB)": 137.67,
"step": 2485,
"token_acc": 0.8947337181986305,
"train_speed(iter/s)": 0.040521
},
{
"epoch": 2.9156908665105385,
"grad_norm": 0.17168040573596954,
"learning_rate": 4.318547393263317e-08,
"loss": 0.27856767177581787,
"memory(GiB)": 137.67,
"step": 2490,
"token_acc": 0.8994483098446597,
"train_speed(iter/s)": 0.04052
},
{
"epoch": 2.921545667447307,
"grad_norm": 0.17257463932037354,
"learning_rate": 3.739936649832188e-08,
"loss": 0.26465725898742676,
"memory(GiB)": 137.67,
"step": 2495,
"token_acc": 0.9003965374896801,
"train_speed(iter/s)": 0.04052
},
{
"epoch": 2.927400468384075,
"grad_norm": 0.17007899284362793,
"learning_rate": 3.2028525421946563e-08,
"loss": 0.26408021450042723,
"memory(GiB)": 137.67,
"step": 2500,
"token_acc": 0.9105243972950552,
"train_speed(iter/s)": 0.04052
},
{
"epoch": 2.933255269320843,
"grad_norm": 0.16546528041362762,
"learning_rate": 2.70731745737296e-08,
"loss": 0.26817855834960935,
"memory(GiB)": 137.67,
"step": 2505,
"token_acc": 0.9032225815017886,
"train_speed(iter/s)": 0.040519
},
{
"epoch": 2.939110070257611,
"grad_norm": 0.1731211543083191,
"learning_rate": 2.2533520505211294e-08,
"loss": 0.26341302394866944,
"memory(GiB)": 137.67,
"step": 2510,
"token_acc": 0.9048233016983017,
"train_speed(iter/s)": 0.040519
},
{
"epoch": 2.9449648711943794,
"grad_norm": 0.16093143820762634,
"learning_rate": 1.8409752440639027e-08,
"loss": 0.25573346614837644,
"memory(GiB)": 137.67,
"step": 2515,
"token_acc": 0.9019553343056392,
"train_speed(iter/s)": 0.040518
},
{
"epoch": 2.9508196721311473,
"grad_norm": 0.16452209651470184,
"learning_rate": 1.470204226908134e-08,
"loss": 0.2707658767700195,
"memory(GiB)": 137.67,
"step": 2520,
"token_acc": 0.904132819893002,
"train_speed(iter/s)": 0.040517
},
{
"epoch": 2.9566744730679155,
"grad_norm": 0.1768556386232376,
"learning_rate": 1.1410544537263645e-08,
"loss": 0.27701735496520996,
"memory(GiB)": 137.67,
"step": 2525,
"token_acc": 0.903024352910179,
"train_speed(iter/s)": 0.040515
},
{
"epoch": 2.962529274004684,
"grad_norm": 0.16568534076213837,
"learning_rate": 8.535396443124511e-09,
"loss": 0.25813367366790774,
"memory(GiB)": 137.67,
"step": 2530,
"token_acc": 0.9017673177727538,
"train_speed(iter/s)": 0.040514
},
{
"epoch": 2.968384074941452,
"grad_norm": 0.16622532904148102,
"learning_rate": 6.076717830098e-09,
"loss": 0.260286283493042,
"memory(GiB)": 137.67,
"step": 2535,
"token_acc": 0.9083364106929379,
"train_speed(iter/s)": 0.040513
},
{
"epoch": 2.9742388758782203,
"grad_norm": 0.17745059728622437,
"learning_rate": 4.034611182121007e-09,
"loss": 0.26159353256225587,
"memory(GiB)": 137.67,
"step": 2540,
"token_acc": 0.9072020079994492,
"train_speed(iter/s)": 0.040512
},
{
"epoch": 2.980093676814988,
"grad_norm": 0.16991080343723297,
"learning_rate": 2.40916161935445e-09,
"loss": 0.26626038551330566,
"memory(GiB)": 137.67,
"step": 2545,
"token_acc": 0.8986437875498561,
"train_speed(iter/s)": 0.040511
},
{
"epoch": 2.9859484777517564,
"grad_norm": 0.16490155458450317,
"learning_rate": 1.2004368946427758e-09,
"loss": 0.2636513948440552,
"memory(GiB)": 137.67,
"step": 2550,
"token_acc": 0.9014935708777286,
"train_speed(iter/s)": 0.040511
},
{
"epoch": 2.9918032786885247,
"grad_norm": 0.1677451878786087,
"learning_rate": 4.084873906851083e-10,
"loss": 0.26745948791503904,
"memory(GiB)": 137.67,
"step": 2555,
"token_acc": 0.9085500921651726,
"train_speed(iter/s)": 0.04051
},
{
"epoch": 2.9976580796252925,
"grad_norm": 0.1645430028438568,
"learning_rate": 3.334611793692766e-11,
"loss": 0.26831555366516113,
"memory(GiB)": 137.67,
"step": 2560,
"token_acc": 0.9117214925099609,
"train_speed(iter/s)": 0.040508
}
],
"logging_steps": 5,
"max_steps": 2562,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3414295945805824.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}