{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00117096018735363, "grad_norm": 2.397789478302002, "learning_rate": 1.5503875968992249e-07, "loss": 0.5513913631439209, "memory(GiB)": 137.67, "step": 1, "token_acc": 0.8478124608248715, "train_speed(iter/s)": 0.014244 }, { "epoch": 0.00585480093676815, "grad_norm": 2.213494300842285, "learning_rate": 7.751937984496125e-07, "loss": 0.5191692113876343, "memory(GiB)": 137.67, "step": 5, "token_acc": 0.848514893999071, "train_speed(iter/s)": 0.029361 }, { "epoch": 0.0117096018735363, "grad_norm": 2.0672056674957275, "learning_rate": 1.550387596899225e-06, "loss": 0.5157936096191407, "memory(GiB)": 137.67, "step": 10, "token_acc": 0.8392344826938901, "train_speed(iter/s)": 0.034106 }, { "epoch": 0.01756440281030445, "grad_norm": 1.588051676750183, "learning_rate": 2.3255813953488376e-06, "loss": 0.49305076599121095, "memory(GiB)": 137.67, "step": 15, "token_acc": 0.8437633920693741, "train_speed(iter/s)": 0.03643 }, { "epoch": 0.0234192037470726, "grad_norm": 0.7405409812927246, "learning_rate": 3.10077519379845e-06, "loss": 0.43950672149658204, "memory(GiB)": 137.67, "step": 20, "token_acc": 0.848157187048235, "train_speed(iter/s)": 0.03757 }, { "epoch": 0.02927400468384075, "grad_norm": 0.8562428951263428, "learning_rate": 3.875968992248063e-06, "loss": 0.4227635383605957, "memory(GiB)": 137.67, "step": 25, "token_acc": 0.8593663993232968, "train_speed(iter/s)": 0.038283 }, { "epoch": 0.0351288056206089, "grad_norm": 0.4966309666633606, "learning_rate": 4.651162790697675e-06, "loss": 0.4113954544067383, "memory(GiB)": 137.67, "step": 30, "token_acc": 0.8579081152325363, "train_speed(iter/s)": 0.038822 }, { "epoch": 0.040983606557377046, "grad_norm": 0.4413171410560608, "learning_rate": 5.4263565891472865e-06, "loss": 0.40917291641235354, "memory(GiB)": 137.67, "step": 35, "token_acc": 0.8563618960945223, "train_speed(iter/s)": 0.039192 }, { "epoch": 0.0468384074941452, "grad_norm": 0.37367990612983704, "learning_rate": 6.2015503875969e-06, "loss": 0.38341727256774905, "memory(GiB)": 137.67, "step": 40, "token_acc": 0.8598059924304837, "train_speed(iter/s)": 0.039486 }, { "epoch": 0.05269320843091335, "grad_norm": 0.2625274062156677, "learning_rate": 6.976744186046513e-06, "loss": 0.39299936294555665, "memory(GiB)": 137.67, "step": 45, "token_acc": 0.8545384055298668, "train_speed(iter/s)": 0.03968 }, { "epoch": 0.0585480093676815, "grad_norm": 0.27871787548065186, "learning_rate": 7.751937984496126e-06, "loss": 0.38351633548736574, "memory(GiB)": 137.67, "step": 50, "token_acc": 0.8680353205073448, "train_speed(iter/s)": 0.039861 }, { "epoch": 0.06440281030444965, "grad_norm": 0.2245069444179535, "learning_rate": 8.527131782945736e-06, "loss": 0.3764484882354736, "memory(GiB)": 137.67, "step": 55, "token_acc": 0.8676952168658857, "train_speed(iter/s)": 0.040018 }, { "epoch": 0.0702576112412178, "grad_norm": 0.22919970750808716, "learning_rate": 9.30232558139535e-06, "loss": 0.3956867218017578, "memory(GiB)": 137.67, "step": 60, "token_acc": 0.865152491108186, "train_speed(iter/s)": 0.040146 }, { "epoch": 0.07611241217798595, "grad_norm": 0.21093736588954926, "learning_rate": 1.0077519379844963e-05, "loss": 0.37714409828186035, "memory(GiB)": 137.67, "step": 65, "token_acc": 0.8760504070619795, "train_speed(iter/s)": 0.040253 }, { "epoch": 0.08196721311475409, "grad_norm": 0.21410879492759705, "learning_rate": 1.0852713178294573e-05, "loss": 0.3757580995559692, "memory(GiB)": 137.67, "step": 70, "token_acc": 0.8649565195567881, "train_speed(iter/s)": 0.040315 }, { "epoch": 0.08782201405152225, "grad_norm": 0.1979837864637375, "learning_rate": 1.1627906976744187e-05, "loss": 0.37558441162109374, "memory(GiB)": 137.67, "step": 75, "token_acc": 0.8532517495556191, "train_speed(iter/s)": 0.040405 }, { "epoch": 0.0936768149882904, "grad_norm": 0.207350954413414, "learning_rate": 1.24031007751938e-05, "loss": 0.3741091966629028, "memory(GiB)": 137.67, "step": 80, "token_acc": 0.8612590246358096, "train_speed(iter/s)": 0.040461 }, { "epoch": 0.09953161592505855, "grad_norm": 0.19452251493930817, "learning_rate": 1.3178294573643412e-05, "loss": 0.3656472682952881, "memory(GiB)": 137.67, "step": 85, "token_acc": 0.8822223551750307, "train_speed(iter/s)": 0.040557 }, { "epoch": 0.1053864168618267, "grad_norm": 0.20653362572193146, "learning_rate": 1.3953488372093025e-05, "loss": 0.3706169605255127, "memory(GiB)": 137.67, "step": 90, "token_acc": 0.8654753188641241, "train_speed(iter/s)": 0.04063 }, { "epoch": 0.11124121779859485, "grad_norm": 0.20383736491203308, "learning_rate": 1.4728682170542636e-05, "loss": 0.3718616485595703, "memory(GiB)": 137.67, "step": 95, "token_acc": 0.8700523810121971, "train_speed(iter/s)": 0.040694 }, { "epoch": 0.117096018735363, "grad_norm": 0.2144174873828888, "learning_rate": 1.550387596899225e-05, "loss": 0.3716637134552002, "memory(GiB)": 137.67, "step": 100, "token_acc": 0.871046915998142, "train_speed(iter/s)": 0.040754 }, { "epoch": 0.12295081967213115, "grad_norm": 0.2225562483072281, "learning_rate": 1.6279069767441862e-05, "loss": 0.3682845115661621, "memory(GiB)": 137.67, "step": 105, "token_acc": 0.8729440672893664, "train_speed(iter/s)": 0.040816 }, { "epoch": 0.1288056206088993, "grad_norm": 0.2207648605108261, "learning_rate": 1.7054263565891473e-05, "loss": 0.3570878982543945, "memory(GiB)": 137.67, "step": 110, "token_acc": 0.8706495975584588, "train_speed(iter/s)": 0.04088 }, { "epoch": 0.13466042154566746, "grad_norm": 0.2282887101173401, "learning_rate": 1.7829457364341087e-05, "loss": 0.3752657175064087, "memory(GiB)": 137.67, "step": 115, "token_acc": 0.8784262063618629, "train_speed(iter/s)": 0.040925 }, { "epoch": 0.1405152224824356, "grad_norm": 0.23532657325267792, "learning_rate": 1.86046511627907e-05, "loss": 0.3657325029373169, "memory(GiB)": 137.67, "step": 120, "token_acc": 0.8712829028328604, "train_speed(iter/s)": 0.040965 }, { "epoch": 0.14637002341920374, "grad_norm": 0.2132922112941742, "learning_rate": 1.937984496124031e-05, "loss": 0.3799854278564453, "memory(GiB)": 137.67, "step": 125, "token_acc": 0.8649469651038509, "train_speed(iter/s)": 0.041003 }, { "epoch": 0.1522248243559719, "grad_norm": 0.2445414662361145, "learning_rate": 1.9999991663467044e-05, "loss": 0.3770766258239746, "memory(GiB)": 137.67, "step": 130, "token_acc": 0.8692484710531911, "train_speed(iter/s)": 0.041036 }, { "epoch": 0.15807962529274006, "grad_norm": 0.2305486649274826, "learning_rate": 1.9999699886272926e-05, "loss": 0.3788888931274414, "memory(GiB)": 137.67, "step": 135, "token_acc": 0.8571357490266324, "train_speed(iter/s)": 0.041054 }, { "epoch": 0.16393442622950818, "grad_norm": 0.2297585904598236, "learning_rate": 1.9998991296330317e-05, "loss": 0.3768150806427002, "memory(GiB)": 137.67, "step": 140, "token_acc": 0.8707652096887886, "train_speed(iter/s)": 0.04107 }, { "epoch": 0.16978922716627634, "grad_norm": 0.22929546236991882, "learning_rate": 1.9997865923175027e-05, "loss": 0.3672610282897949, "memory(GiB)": 137.67, "step": 145, "token_acc": 0.8764070583454463, "train_speed(iter/s)": 0.041074 }, { "epoch": 0.1756440281030445, "grad_norm": 0.2531713843345642, "learning_rate": 1.999632381371545e-05, "loss": 0.3735011577606201, "memory(GiB)": 137.67, "step": 150, "token_acc": 0.8610904473031397, "train_speed(iter/s)": 0.041095 }, { "epoch": 0.18149882903981265, "grad_norm": 0.21190133690834045, "learning_rate": 1.999436503223061e-05, "loss": 0.37088618278503416, "memory(GiB)": 137.67, "step": 155, "token_acc": 0.869811065319577, "train_speed(iter/s)": 0.0411 }, { "epoch": 0.1873536299765808, "grad_norm": 0.24962091445922852, "learning_rate": 1.9991989660367463e-05, "loss": 0.3776357650756836, "memory(GiB)": 137.67, "step": 160, "token_acc": 0.8544295113661168, "train_speed(iter/s)": 0.041107 }, { "epoch": 0.19320843091334894, "grad_norm": 0.20956465601921082, "learning_rate": 1.998919779713751e-05, "loss": 0.3805836200714111, "memory(GiB)": 137.67, "step": 165, "token_acc": 0.8613002884067936, "train_speed(iter/s)": 0.041115 }, { "epoch": 0.1990632318501171, "grad_norm": 0.206803560256958, "learning_rate": 1.998598955891266e-05, "loss": 0.3702584505081177, "memory(GiB)": 137.67, "step": 170, "token_acc": 0.8749547416575101, "train_speed(iter/s)": 0.04113 }, { "epoch": 0.20491803278688525, "grad_norm": 0.23116904497146606, "learning_rate": 1.9982365079420382e-05, "loss": 0.3598947048187256, "memory(GiB)": 137.67, "step": 175, "token_acc": 0.8684363191646153, "train_speed(iter/s)": 0.041153 }, { "epoch": 0.2107728337236534, "grad_norm": 0.22105969488620758, "learning_rate": 1.9978324509738147e-05, "loss": 0.36261582374572754, "memory(GiB)": 137.67, "step": 180, "token_acc": 0.8722339081558761, "train_speed(iter/s)": 0.041173 }, { "epoch": 0.21662763466042154, "grad_norm": 0.21819841861724854, "learning_rate": 1.9973868018287093e-05, "loss": 0.3629172325134277, "memory(GiB)": 137.67, "step": 185, "token_acc": 0.8667994850156469, "train_speed(iter/s)": 0.041195 }, { "epoch": 0.2224824355971897, "grad_norm": 0.2083064317703247, "learning_rate": 1.9968995790825048e-05, "loss": 0.3675278902053833, "memory(GiB)": 137.67, "step": 190, "token_acc": 0.8575012434717731, "train_speed(iter/s)": 0.0412 }, { "epoch": 0.22833723653395785, "grad_norm": 0.21168376505374908, "learning_rate": 1.9963708030438754e-05, "loss": 0.3663478374481201, "memory(GiB)": 137.67, "step": 195, "token_acc": 0.8699046566256736, "train_speed(iter/s)": 0.041213 }, { "epoch": 0.234192037470726, "grad_norm": 0.21624095737934113, "learning_rate": 1.995800495753542e-05, "loss": 0.36658034324645994, "memory(GiB)": 137.67, "step": 200, "token_acc": 0.8611760598068374, "train_speed(iter/s)": 0.041221 }, { "epoch": 0.24004683840749413, "grad_norm": 0.21765926480293274, "learning_rate": 1.9951886809833537e-05, "loss": 0.37610225677490233, "memory(GiB)": 137.67, "step": 205, "token_acc": 0.8608684017275929, "train_speed(iter/s)": 0.041233 }, { "epoch": 0.2459016393442623, "grad_norm": 0.21804192662239075, "learning_rate": 1.9945353842352943e-05, "loss": 0.37209372520446776, "memory(GiB)": 137.67, "step": 210, "token_acc": 0.8637638606903014, "train_speed(iter/s)": 0.041242 }, { "epoch": 0.25175644028103045, "grad_norm": 0.21353310346603394, "learning_rate": 1.9938406327404233e-05, "loss": 0.36923999786376954, "memory(GiB)": 137.67, "step": 215, "token_acc": 0.8725016214590311, "train_speed(iter/s)": 0.041259 }, { "epoch": 0.2576112412177986, "grad_norm": 0.21438100934028625, "learning_rate": 1.9931044554577373e-05, "loss": 0.36598026752471924, "memory(GiB)": 137.67, "step": 220, "token_acc": 0.8663032304289586, "train_speed(iter/s)": 0.041275 }, { "epoch": 0.26346604215456676, "grad_norm": 0.21610133349895477, "learning_rate": 1.992326883072965e-05, "loss": 0.36849284172058105, "memory(GiB)": 137.67, "step": 225, "token_acc": 0.8614589650451081, "train_speed(iter/s)": 0.041281 }, { "epoch": 0.2693208430913349, "grad_norm": 0.2203439474105835, "learning_rate": 1.991507947997287e-05, "loss": 0.3765848636627197, "memory(GiB)": 137.67, "step": 230, "token_acc": 0.8680725737864995, "train_speed(iter/s)": 0.041291 }, { "epoch": 0.275175644028103, "grad_norm": 0.22208204865455627, "learning_rate": 1.9906476843659866e-05, "loss": 0.3718143939971924, "memory(GiB)": 137.67, "step": 235, "token_acc": 0.8758277835099897, "train_speed(iter/s)": 0.041301 }, { "epoch": 0.2810304449648712, "grad_norm": 0.20069433748722076, "learning_rate": 1.989746128037024e-05, "loss": 0.3583400249481201, "memory(GiB)": 137.67, "step": 240, "token_acc": 0.8676873362719415, "train_speed(iter/s)": 0.04131 }, { "epoch": 0.28688524590163933, "grad_norm": 0.19968946278095245, "learning_rate": 1.988803316589545e-05, "loss": 0.3672914505004883, "memory(GiB)": 137.67, "step": 245, "token_acc": 0.8662484056672067, "train_speed(iter/s)": 0.041328 }, { "epoch": 0.2927400468384075, "grad_norm": 0.21298536658287048, "learning_rate": 1.987819289322311e-05, "loss": 0.3696786403656006, "memory(GiB)": 137.67, "step": 250, "token_acc": 0.8654257420775034, "train_speed(iter/s)": 0.041348 }, { "epoch": 0.29859484777517564, "grad_norm": 0.2145387828350067, "learning_rate": 1.9867940872520646e-05, "loss": 0.3744542598724365, "memory(GiB)": 137.67, "step": 255, "token_acc": 0.8661229081704401, "train_speed(iter/s)": 0.041346 }, { "epoch": 0.3044496487119438, "grad_norm": 0.2132762223482132, "learning_rate": 1.9857277531118173e-05, "loss": 0.36826577186584475, "memory(GiB)": 137.67, "step": 260, "token_acc": 0.8788229158157335, "train_speed(iter/s)": 0.041353 }, { "epoch": 0.31030444964871196, "grad_norm": 0.2133207470178604, "learning_rate": 1.9846203313490697e-05, "loss": 0.35997600555419923, "memory(GiB)": 137.67, "step": 265, "token_acc": 0.8834285319525085, "train_speed(iter/s)": 0.041363 }, { "epoch": 0.3161592505854801, "grad_norm": 0.23535007238388062, "learning_rate": 1.983471868123958e-05, "loss": 0.3588090896606445, "memory(GiB)": 137.67, "step": 270, "token_acc": 0.8657706943523579, "train_speed(iter/s)": 0.041379 }, { "epoch": 0.32201405152224827, "grad_norm": 0.21440958976745605, "learning_rate": 1.98228241130733e-05, "loss": 0.38217363357543943, "memory(GiB)": 137.67, "step": 275, "token_acc": 0.8693404501511701, "train_speed(iter/s)": 0.041386 }, { "epoch": 0.32786885245901637, "grad_norm": 0.21196675300598145, "learning_rate": 1.98105201047875e-05, "loss": 0.35698800086975097, "memory(GiB)": 137.67, "step": 280, "token_acc": 0.8743185598247525, "train_speed(iter/s)": 0.041403 }, { "epoch": 0.3337236533957845, "grad_norm": 0.22762241959571838, "learning_rate": 1.9797807169244326e-05, "loss": 0.3626487016677856, "memory(GiB)": 137.67, "step": 285, "token_acc": 0.8661923737202862, "train_speed(iter/s)": 0.041406 }, { "epoch": 0.3395784543325527, "grad_norm": 0.21537438035011292, "learning_rate": 1.9784685836351045e-05, "loss": 0.37597248554229734, "memory(GiB)": 137.67, "step": 290, "token_acc": 0.8632790864113016, "train_speed(iter/s)": 0.041408 }, { "epoch": 0.34543325526932084, "grad_norm": 0.24162794649600983, "learning_rate": 1.9771156653037944e-05, "loss": 0.3674392461776733, "memory(GiB)": 137.67, "step": 295, "token_acc": 0.86579905677273, "train_speed(iter/s)": 0.041418 }, { "epoch": 0.351288056206089, "grad_norm": 0.19127634167671204, "learning_rate": 1.975722018323556e-05, "loss": 0.3606871604919434, "memory(GiB)": 137.67, "step": 300, "token_acc": 0.8730913571244476, "train_speed(iter/s)": 0.041416 }, { "epoch": 0.35714285714285715, "grad_norm": 0.21248631179332733, "learning_rate": 1.974287700785116e-05, "loss": 0.3568113327026367, "memory(GiB)": 137.67, "step": 305, "token_acc": 0.8697051358380598, "train_speed(iter/s)": 0.041425 }, { "epoch": 0.3629976580796253, "grad_norm": 0.20225107669830322, "learning_rate": 1.9728127724744516e-05, "loss": 0.3483549118041992, "memory(GiB)": 137.67, "step": 310, "token_acc": 0.8697423969369493, "train_speed(iter/s)": 0.041425 }, { "epoch": 0.36885245901639346, "grad_norm": 0.2230818122625351, "learning_rate": 1.9712972948703006e-05, "loss": 0.36976261138916017, "memory(GiB)": 137.67, "step": 315, "token_acc": 0.8751112598082228, "train_speed(iter/s)": 0.04143 }, { "epoch": 0.3747072599531616, "grad_norm": 0.1945132613182068, "learning_rate": 1.9697413311415967e-05, "loss": 0.364810585975647, "memory(GiB)": 137.67, "step": 320, "token_acc": 0.8484778468167483, "train_speed(iter/s)": 0.041435 }, { "epoch": 0.3805620608899297, "grad_norm": 0.19989554584026337, "learning_rate": 1.9681449461448386e-05, "loss": 0.3616858959197998, "memory(GiB)": 137.67, "step": 325, "token_acc": 0.8718356506795814, "train_speed(iter/s)": 0.041435 }, { "epoch": 0.3864168618266979, "grad_norm": 0.2084866315126419, "learning_rate": 1.9665082064213856e-05, "loss": 0.36598567962646483, "memory(GiB)": 137.67, "step": 330, "token_acc": 0.8664227187552337, "train_speed(iter/s)": 0.041441 }, { "epoch": 0.39227166276346603, "grad_norm": 0.20807960629463196, "learning_rate": 1.9648311801946823e-05, "loss": 0.3633120059967041, "memory(GiB)": 137.67, "step": 335, "token_acc": 0.8659399461174416, "train_speed(iter/s)": 0.041448 }, { "epoch": 0.3981264637002342, "grad_norm": 0.21306882798671722, "learning_rate": 1.9631139373674188e-05, "loss": 0.36129164695739746, "memory(GiB)": 137.67, "step": 340, "token_acc": 0.8666773452933952, "train_speed(iter/s)": 0.04145 }, { "epoch": 0.40398126463700235, "grad_norm": 0.21947889029979706, "learning_rate": 1.9613565495186126e-05, "loss": 0.35186495780944826, "memory(GiB)": 137.67, "step": 345, "token_acc": 0.8666396689403815, "train_speed(iter/s)": 0.041463 }, { "epoch": 0.4098360655737705, "grad_norm": 0.2155865728855133, "learning_rate": 1.9595590899006288e-05, "loss": 0.3684532880783081, "memory(GiB)": 137.67, "step": 350, "token_acc": 0.8713802951875973, "train_speed(iter/s)": 0.041462 }, { "epoch": 0.41569086651053866, "grad_norm": 0.2150585651397705, "learning_rate": 1.957721633436124e-05, "loss": 0.3669363260269165, "memory(GiB)": 137.67, "step": 355, "token_acc": 0.8683417743625568, "train_speed(iter/s)": 0.041459 }, { "epoch": 0.4215456674473068, "grad_norm": 0.22773627936840057, "learning_rate": 1.9558442567149244e-05, "loss": 0.36423306465148925, "memory(GiB)": 137.67, "step": 360, "token_acc": 0.8815313637998826, "train_speed(iter/s)": 0.041467 }, { "epoch": 0.4274004683840749, "grad_norm": 0.19997937977313995, "learning_rate": 1.953927037990834e-05, "loss": 0.3707897186279297, "memory(GiB)": 137.67, "step": 365, "token_acc": 0.8580402286389447, "train_speed(iter/s)": 0.041471 }, { "epoch": 0.4332552693208431, "grad_norm": 0.21174229681491852, "learning_rate": 1.9519700571783718e-05, "loss": 0.3715445280075073, "memory(GiB)": 137.67, "step": 370, "token_acc": 0.873243385426675, "train_speed(iter/s)": 0.041468 }, { "epoch": 0.43911007025761123, "grad_norm": 0.2164727747440338, "learning_rate": 1.9499733958494405e-05, "loss": 0.36826701164245607, "memory(GiB)": 137.67, "step": 375, "token_acc": 0.8624453058192736, "train_speed(iter/s)": 0.041471 }, { "epoch": 0.4449648711943794, "grad_norm": 0.2175064980983734, "learning_rate": 1.947937137229928e-05, "loss": 0.3610344648361206, "memory(GiB)": 137.67, "step": 380, "token_acc": 0.8791143721842437, "train_speed(iter/s)": 0.041474 }, { "epoch": 0.45081967213114754, "grad_norm": 0.21257779002189636, "learning_rate": 1.9458613661962366e-05, "loss": 0.36273534297943116, "memory(GiB)": 137.67, "step": 385, "token_acc": 0.8811885856547406, "train_speed(iter/s)": 0.041479 }, { "epoch": 0.4566744730679157, "grad_norm": 0.2007063329219818, "learning_rate": 1.943746169271746e-05, "loss": 0.36213395595550535, "memory(GiB)": 137.67, "step": 390, "token_acc": 0.8793212957081934, "train_speed(iter/s)": 0.041474 }, { "epoch": 0.46252927400468385, "grad_norm": 0.1982836127281189, "learning_rate": 1.941591634623206e-05, "loss": 0.3674773693084717, "memory(GiB)": 137.67, "step": 395, "token_acc": 0.8714787014744528, "train_speed(iter/s)": 0.04148 }, { "epoch": 0.468384074941452, "grad_norm": 0.21029749512672424, "learning_rate": 1.9393978520570638e-05, "loss": 0.35383853912353513, "memory(GiB)": 137.67, "step": 400, "token_acc": 0.8725135029354207, "train_speed(iter/s)": 0.041493 }, { "epoch": 0.47423887587822017, "grad_norm": 0.2057942897081375, "learning_rate": 1.9371649130157166e-05, "loss": 0.35016608238220215, "memory(GiB)": 137.67, "step": 405, "token_acc": 0.8716170696781026, "train_speed(iter/s)": 0.041495 }, { "epoch": 0.48009367681498827, "grad_norm": 0.21962089836597443, "learning_rate": 1.9348929105737044e-05, "loss": 0.3551772117614746, "memory(GiB)": 137.67, "step": 410, "token_acc": 0.8725112535977174, "train_speed(iter/s)": 0.041495 }, { "epoch": 0.4859484777517564, "grad_norm": 0.22210708260536194, "learning_rate": 1.932581939433827e-05, "loss": 0.3688118696212769, "memory(GiB)": 137.67, "step": 415, "token_acc": 0.8727626971050538, "train_speed(iter/s)": 0.041496 }, { "epoch": 0.4918032786885246, "grad_norm": 0.21538780629634857, "learning_rate": 1.9302320959231997e-05, "loss": 0.3600668430328369, "memory(GiB)": 137.67, "step": 420, "token_acc": 0.87065663645922, "train_speed(iter/s)": 0.041499 }, { "epoch": 0.49765807962529274, "grad_norm": 0.19987384974956512, "learning_rate": 1.927843477989234e-05, "loss": 0.3570875644683838, "memory(GiB)": 137.67, "step": 425, "token_acc": 0.8845410461012411, "train_speed(iter/s)": 0.041501 }, { "epoch": 0.5035128805620609, "grad_norm": 0.20627401769161224, "learning_rate": 1.9254161851955587e-05, "loss": 0.36909596920013427, "memory(GiB)": 137.67, "step": 430, "token_acc": 0.8750783836660981, "train_speed(iter/s)": 0.041507 }, { "epoch": 0.509367681498829, "grad_norm": 0.22353969514369965, "learning_rate": 1.9229503187178694e-05, "loss": 0.36271133422851565, "memory(GiB)": 137.67, "step": 435, "token_acc": 0.8696993866195712, "train_speed(iter/s)": 0.04151 }, { "epoch": 0.5152224824355972, "grad_norm": 0.20142175257205963, "learning_rate": 1.920445981339708e-05, "loss": 0.3614756345748901, "memory(GiB)": 137.67, "step": 440, "token_acc": 0.8678934891256075, "train_speed(iter/s)": 0.041514 }, { "epoch": 0.5210772833723654, "grad_norm": 0.2189430445432663, "learning_rate": 1.9179032774481822e-05, "loss": 0.3589394330978394, "memory(GiB)": 137.67, "step": 445, "token_acc": 0.8754360673743595, "train_speed(iter/s)": 0.04152 }, { "epoch": 0.5269320843091335, "grad_norm": 0.20788422226905823, "learning_rate": 1.9153223130296125e-05, "loss": 0.3571774005889893, "memory(GiB)": 137.67, "step": 450, "token_acc": 0.8775248547087467, "train_speed(iter/s)": 0.041526 }, { "epoch": 0.5327868852459017, "grad_norm": 0.19941285252571106, "learning_rate": 1.9127031956651153e-05, "loss": 0.36058688163757324, "memory(GiB)": 137.67, "step": 455, "token_acc": 0.8748390868215994, "train_speed(iter/s)": 0.041528 }, { "epoch": 0.5386416861826698, "grad_norm": 0.20794501900672913, "learning_rate": 1.9100460345261175e-05, "loss": 0.37292046546936036, "memory(GiB)": 137.67, "step": 460, "token_acc": 0.8686192757401499, "train_speed(iter/s)": 0.04152 }, { "epoch": 0.544496487119438, "grad_norm": 0.21598728001117706, "learning_rate": 1.9073509403698062e-05, "loss": 0.3684291124343872, "memory(GiB)": 137.67, "step": 465, "token_acc": 0.8756676919995869, "train_speed(iter/s)": 0.041523 }, { "epoch": 0.550351288056206, "grad_norm": 0.21292956173419952, "learning_rate": 1.9046180255345142e-05, "loss": 0.3640902042388916, "memory(GiB)": 137.67, "step": 470, "token_acc": 0.8750558298801518, "train_speed(iter/s)": 0.041525 }, { "epoch": 0.5562060889929742, "grad_norm": 0.21117296814918518, "learning_rate": 1.9018474039350342e-05, "loss": 0.3569709062576294, "memory(GiB)": 137.67, "step": 475, "token_acc": 0.8744779663053135, "train_speed(iter/s)": 0.041525 }, { "epoch": 0.5620608899297423, "grad_norm": 0.20366835594177246, "learning_rate": 1.899039191057872e-05, "loss": 0.35825061798095703, "memory(GiB)": 137.67, "step": 480, "token_acc": 0.8689726123486041, "train_speed(iter/s)": 0.041527 }, { "epoch": 0.5679156908665105, "grad_norm": 0.1856691688299179, "learning_rate": 1.8961935039564338e-05, "loss": 0.35746235847473146, "memory(GiB)": 137.67, "step": 485, "token_acc": 0.8688354549740689, "train_speed(iter/s)": 0.041532 }, { "epoch": 0.5737704918032787, "grad_norm": 0.23608598113059998, "learning_rate": 1.8933104612461454e-05, "loss": 0.35999622344970705, "memory(GiB)": 137.67, "step": 490, "token_acc": 0.8696445021552469, "train_speed(iter/s)": 0.041533 }, { "epoch": 0.5796252927400468, "grad_norm": 0.2125530242919922, "learning_rate": 1.8903901830995093e-05, "loss": 0.3631314754486084, "memory(GiB)": 137.67, "step": 495, "token_acc": 0.8666599882919743, "train_speed(iter/s)": 0.041531 }, { "epoch": 0.585480093676815, "grad_norm": 0.20335227251052856, "learning_rate": 1.8874327912410945e-05, "loss": 0.37455101013183595, "memory(GiB)": 137.67, "step": 500, "token_acc": 0.8691201544556442, "train_speed(iter/s)": 0.041538 }, { "epoch": 0.5913348946135831, "grad_norm": 0.2046995759010315, "learning_rate": 1.884438408942463e-05, "loss": 0.361937952041626, "memory(GiB)": 137.67, "step": 505, "token_acc": 0.8581575277197544, "train_speed(iter/s)": 0.041539 }, { "epoch": 0.5971896955503513, "grad_norm": 0.17991533875465393, "learning_rate": 1.881407161017033e-05, "loss": 0.35659379959106446, "memory(GiB)": 137.67, "step": 510, "token_acc": 0.8789336760280843, "train_speed(iter/s)": 0.041545 }, { "epoch": 0.6030444964871194, "grad_norm": 0.24344618618488312, "learning_rate": 1.8783391738148738e-05, "loss": 0.35185072422027586, "memory(GiB)": 137.67, "step": 515, "token_acc": 0.8730951113338136, "train_speed(iter/s)": 0.04155 }, { "epoch": 0.6088992974238876, "grad_norm": 0.21754887700080872, "learning_rate": 1.875234575217441e-05, "loss": 0.3508215665817261, "memory(GiB)": 137.67, "step": 520, "token_acc": 0.872153412139793, "train_speed(iter/s)": 0.041554 }, { "epoch": 0.6147540983606558, "grad_norm": 0.18687933683395386, "learning_rate": 1.8720934946322466e-05, "loss": 0.3653162240982056, "memory(GiB)": 137.67, "step": 525, "token_acc": 0.8658395285187296, "train_speed(iter/s)": 0.041556 }, { "epoch": 0.6206088992974239, "grad_norm": 0.1791500300168991, "learning_rate": 1.8689160629874622e-05, "loss": 0.3357256889343262, "memory(GiB)": 137.67, "step": 530, "token_acc": 0.8864503516899346, "train_speed(iter/s)": 0.041553 }, { "epoch": 0.6264637002341921, "grad_norm": 0.18553608655929565, "learning_rate": 1.865702412726465e-05, "loss": 0.34752044677734373, "memory(GiB)": 137.67, "step": 535, "token_acc": 0.882398003852215, "train_speed(iter/s)": 0.041558 }, { "epoch": 0.6323185011709602, "grad_norm": 0.19252535700798035, "learning_rate": 1.8624526778023142e-05, "loss": 0.3493391513824463, "memory(GiB)": 137.67, "step": 540, "token_acc": 0.8799156751797872, "train_speed(iter/s)": 0.04156 }, { "epoch": 0.6381733021077284, "grad_norm": 0.1979398876428604, "learning_rate": 1.85916699367217e-05, "loss": 0.35185253620147705, "memory(GiB)": 137.67, "step": 545, "token_acc": 0.8728044652187243, "train_speed(iter/s)": 0.041561 }, { "epoch": 0.6440281030444965, "grad_norm": 0.19005604088306427, "learning_rate": 1.855845497291646e-05, "loss": 0.3633576393127441, "memory(GiB)": 137.67, "step": 550, "token_acc": 0.8699871784073149, "train_speed(iter/s)": 0.041564 }, { "epoch": 0.6498829039812647, "grad_norm": 0.1815745234489441, "learning_rate": 1.8524883271091004e-05, "loss": 0.35262117385864256, "memory(GiB)": 137.67, "step": 555, "token_acc": 0.8783439310264622, "train_speed(iter/s)": 0.041562 }, { "epoch": 0.6557377049180327, "grad_norm": 0.17770066857337952, "learning_rate": 1.8490956230598668e-05, "loss": 0.3713988780975342, "memory(GiB)": 137.67, "step": 560, "token_acc": 0.8711786567892583, "train_speed(iter/s)": 0.041563 }, { "epoch": 0.6615925058548009, "grad_norm": 0.19120706617832184, "learning_rate": 1.8456675265604183e-05, "loss": 0.35135421752929685, "memory(GiB)": 137.67, "step": 565, "token_acc": 0.8704644071404868, "train_speed(iter/s)": 0.041568 }, { "epoch": 0.667447306791569, "grad_norm": 0.22995422780513763, "learning_rate": 1.842204180502476e-05, "loss": 0.3541764974594116, "memory(GiB)": 137.67, "step": 570, "token_acc": 0.8800552885370527, "train_speed(iter/s)": 0.04157 }, { "epoch": 0.6733021077283372, "grad_norm": 0.23910608887672424, "learning_rate": 1.8387057292470517e-05, "loss": 0.3688697576522827, "memory(GiB)": 137.67, "step": 575, "token_acc": 0.8699386694063074, "train_speed(iter/s)": 0.041571 }, { "epoch": 0.6791569086651054, "grad_norm": 0.18881316483020782, "learning_rate": 1.8351723186184295e-05, "loss": 0.358310866355896, "memory(GiB)": 137.67, "step": 580, "token_acc": 0.861880756666604, "train_speed(iter/s)": 0.041574 }, { "epoch": 0.6850117096018735, "grad_norm": 0.19772037863731384, "learning_rate": 1.8316040958980896e-05, "loss": 0.3566863536834717, "memory(GiB)": 137.67, "step": 585, "token_acc": 0.8841636264650852, "train_speed(iter/s)": 0.041578 }, { "epoch": 0.6908665105386417, "grad_norm": 0.20680150389671326, "learning_rate": 1.828001209818567e-05, "loss": 0.37308592796325685, "memory(GiB)": 137.67, "step": 590, "token_acc": 0.8693373139559628, "train_speed(iter/s)": 0.041581 }, { "epoch": 0.6967213114754098, "grad_norm": 0.21996839344501495, "learning_rate": 1.8243638105572547e-05, "loss": 0.3568426132202148, "memory(GiB)": 137.67, "step": 595, "token_acc": 0.8781027202445839, "train_speed(iter/s)": 0.041584 }, { "epoch": 0.702576112412178, "grad_norm": 0.19068636000156403, "learning_rate": 1.82069204973014e-05, "loss": 0.3520241975784302, "memory(GiB)": 137.67, "step": 600, "token_acc": 0.8848490938723728, "train_speed(iter/s)": 0.041592 }, { "epoch": 0.7084309133489461, "grad_norm": 0.19711260497570038, "learning_rate": 1.816986080385489e-05, "loss": 0.3704382419586182, "memory(GiB)": 137.67, "step": 605, "token_acc": 0.8542210685487001, "train_speed(iter/s)": 0.041592 }, { "epoch": 0.7142857142857143, "grad_norm": 0.2009887397289276, "learning_rate": 1.813246056997465e-05, "loss": 0.35552153587341306, "memory(GiB)": 137.67, "step": 610, "token_acc": 0.8681636421482087, "train_speed(iter/s)": 0.041595 }, { "epoch": 0.7201405152224825, "grad_norm": 0.2012893706560135, "learning_rate": 1.809472135459688e-05, "loss": 0.3568307399749756, "memory(GiB)": 137.67, "step": 615, "token_acc": 0.8715069766273564, "train_speed(iter/s)": 0.041596 }, { "epoch": 0.7259953161592506, "grad_norm": 0.19377882778644562, "learning_rate": 1.8056644730787412e-05, "loss": 0.3658033847808838, "memory(GiB)": 137.67, "step": 620, "token_acc": 0.8766388014057431, "train_speed(iter/s)": 0.041603 }, { "epoch": 0.7318501170960188, "grad_norm": 0.21672694385051727, "learning_rate": 1.8018232285676092e-05, "loss": 0.34650683403015137, "memory(GiB)": 137.67, "step": 625, "token_acc": 0.8730951833381114, "train_speed(iter/s)": 0.041609 }, { "epoch": 0.7377049180327869, "grad_norm": 0.20295600593090057, "learning_rate": 1.797948562039066e-05, "loss": 0.36364593505859377, "memory(GiB)": 137.67, "step": 630, "token_acc": 0.8673425158178014, "train_speed(iter/s)": 0.041604 }, { "epoch": 0.7435597189695551, "grad_norm": 0.20888152718544006, "learning_rate": 1.7940406349989987e-05, "loss": 0.3600362777709961, "memory(GiB)": 137.67, "step": 635, "token_acc": 0.8697917646394914, "train_speed(iter/s)": 0.04161 }, { "epoch": 0.7494145199063232, "grad_norm": 0.18725119531154633, "learning_rate": 1.7900996103396772e-05, "loss": 0.3525946617126465, "memory(GiB)": 137.67, "step": 640, "token_acc": 0.8778969516256544, "train_speed(iter/s)": 0.04161 }, { "epoch": 0.7552693208430913, "grad_norm": 0.2023143470287323, "learning_rate": 1.7861256523329634e-05, "loss": 0.35059380531311035, "memory(GiB)": 137.67, "step": 645, "token_acc": 0.867270463741052, "train_speed(iter/s)": 0.041608 }, { "epoch": 0.7611241217798594, "grad_norm": 0.18495850265026093, "learning_rate": 1.7821189266234647e-05, "loss": 0.35591151714324953, "memory(GiB)": 137.67, "step": 650, "token_acc": 0.8691064057960171, "train_speed(iter/s)": 0.041607 }, { "epoch": 0.7669789227166276, "grad_norm": 0.19239366054534912, "learning_rate": 1.7780796002216285e-05, "loss": 0.3489703893661499, "memory(GiB)": 137.67, "step": 655, "token_acc": 0.8661729229440642, "train_speed(iter/s)": 0.041609 }, { "epoch": 0.7728337236533958, "grad_norm": 0.19033724069595337, "learning_rate": 1.7740078414967817e-05, "loss": 0.35645670890808107, "memory(GiB)": 137.67, "step": 660, "token_acc": 0.8801652115008279, "train_speed(iter/s)": 0.041611 }, { "epoch": 0.7786885245901639, "grad_norm": 0.1858055591583252, "learning_rate": 1.7699038201701132e-05, "loss": 0.3495974063873291, "memory(GiB)": 137.67, "step": 665, "token_acc": 0.86732774248516, "train_speed(iter/s)": 0.041614 }, { "epoch": 0.7845433255269321, "grad_norm": 0.19249401986598969, "learning_rate": 1.7657677073075968e-05, "loss": 0.35628108978271483, "memory(GiB)": 137.67, "step": 670, "token_acc": 0.8711122587710429, "train_speed(iter/s)": 0.041616 }, { "epoch": 0.7903981264637002, "grad_norm": 0.1897304505109787, "learning_rate": 1.761599675312864e-05, "loss": 0.3588160514831543, "memory(GiB)": 137.67, "step": 675, "token_acc": 0.8833087010138474, "train_speed(iter/s)": 0.041616 }, { "epoch": 0.7962529274004684, "grad_norm": 0.19034340977668762, "learning_rate": 1.7573998979200163e-05, "loss": 0.3528533935546875, "memory(GiB)": 137.67, "step": 680, "token_acc": 0.873974659902577, "train_speed(iter/s)": 0.04162 }, { "epoch": 0.8021077283372365, "grad_norm": 0.17828524112701416, "learning_rate": 1.753168550186383e-05, "loss": 0.36130833625793457, "memory(GiB)": 137.67, "step": 685, "token_acc": 0.8767166579575643, "train_speed(iter/s)": 0.041622 }, { "epoch": 0.8079625292740047, "grad_norm": 0.18225735425949097, "learning_rate": 1.7489058084852247e-05, "loss": 0.3559986114501953, "memory(GiB)": 137.67, "step": 690, "token_acc": 0.8664611837818874, "train_speed(iter/s)": 0.041619 }, { "epoch": 0.8138173302107728, "grad_norm": 0.17824020981788635, "learning_rate": 1.744611850498383e-05, "loss": 0.3519934415817261, "memory(GiB)": 137.67, "step": 695, "token_acc": 0.8767726421318924, "train_speed(iter/s)": 0.04162 }, { "epoch": 0.819672131147541, "grad_norm": 0.19619260728359222, "learning_rate": 1.7402868552088724e-05, "loss": 0.34758720397949217, "memory(GiB)": 137.67, "step": 700, "token_acc": 0.8710738168196693, "train_speed(iter/s)": 0.041621 }, { "epoch": 0.8255269320843092, "grad_norm": 0.20193175971508026, "learning_rate": 1.73593100289342e-05, "loss": 0.3554750919342041, "memory(GiB)": 137.67, "step": 705, "token_acc": 0.8680475894967122, "train_speed(iter/s)": 0.041625 }, { "epoch": 0.8313817330210773, "grad_norm": 0.17672231793403625, "learning_rate": 1.7315444751149533e-05, "loss": 0.3531287670135498, "memory(GiB)": 137.67, "step": 710, "token_acc": 0.8739113086739942, "train_speed(iter/s)": 0.041629 }, { "epoch": 0.8372365339578455, "grad_norm": 0.18640753626823425, "learning_rate": 1.727127454715029e-05, "loss": 0.3531001329421997, "memory(GiB)": 137.67, "step": 715, "token_acc": 0.8807271048387348, "train_speed(iter/s)": 0.041632 }, { "epoch": 0.8430913348946136, "grad_norm": 0.18654407560825348, "learning_rate": 1.722680125806214e-05, "loss": 0.3535622119903564, "memory(GiB)": 137.67, "step": 720, "token_acc": 0.8664340845361018, "train_speed(iter/s)": 0.041633 }, { "epoch": 0.8489461358313818, "grad_norm": 0.19616912305355072, "learning_rate": 1.71820267376441e-05, "loss": 0.357543420791626, "memory(GiB)": 137.67, "step": 725, "token_acc": 0.8723300758960031, "train_speed(iter/s)": 0.041635 }, { "epoch": 0.8548009367681498, "grad_norm": 0.1865251064300537, "learning_rate": 1.7136952852211274e-05, "loss": 0.36123013496398926, "memory(GiB)": 137.67, "step": 730, "token_acc": 0.8610691821941981, "train_speed(iter/s)": 0.041638 }, { "epoch": 0.860655737704918, "grad_norm": 0.1886809915304184, "learning_rate": 1.7091581480557057e-05, "loss": 0.34960460662841797, "memory(GiB)": 137.67, "step": 735, "token_acc": 0.8703787498166635, "train_speed(iter/s)": 0.041639 }, { "epoch": 0.8665105386416861, "grad_norm": 0.19691921770572662, "learning_rate": 1.7045914513874815e-05, "loss": 0.3618565320968628, "memory(GiB)": 137.67, "step": 740, "token_acc": 0.8702042368549021, "train_speed(iter/s)": 0.041645 }, { "epoch": 0.8723653395784543, "grad_norm": 0.18920762836933136, "learning_rate": 1.699995385567907e-05, "loss": 0.3643482685089111, "memory(GiB)": 137.67, "step": 745, "token_acc": 0.8619865320910651, "train_speed(iter/s)": 0.041651 }, { "epoch": 0.8782201405152225, "grad_norm": 0.19481435418128967, "learning_rate": 1.695370142172614e-05, "loss": 0.3560521602630615, "memory(GiB)": 137.67, "step": 750, "token_acc": 0.8686031511447322, "train_speed(iter/s)": 0.041651 }, { "epoch": 0.8840749414519906, "grad_norm": 0.19207534193992615, "learning_rate": 1.690715913993429e-05, "loss": 0.3591322422027588, "memory(GiB)": 137.67, "step": 755, "token_acc": 0.8719703155846309, "train_speed(iter/s)": 0.041652 }, { "epoch": 0.8899297423887588, "grad_norm": 0.20057600736618042, "learning_rate": 1.6860328950303392e-05, "loss": 0.3394715070724487, "memory(GiB)": 137.67, "step": 760, "token_acc": 0.8781381296322522, "train_speed(iter/s)": 0.041655 }, { "epoch": 0.8957845433255269, "grad_norm": 0.19081991910934448, "learning_rate": 1.6813212804834033e-05, "loss": 0.3552083015441895, "memory(GiB)": 137.67, "step": 765, "token_acc": 0.8649747738343772, "train_speed(iter/s)": 0.041656 }, { "epoch": 0.9016393442622951, "grad_norm": 0.17996545135974884, "learning_rate": 1.676581266744615e-05, "loss": 0.3466797828674316, "memory(GiB)": 137.67, "step": 770, "token_acc": 0.8719778029670782, "train_speed(iter/s)": 0.041659 }, { "epoch": 0.9074941451990632, "grad_norm": 0.18470925092697144, "learning_rate": 1.6718130513897207e-05, "loss": 0.34652736186981203, "memory(GiB)": 137.67, "step": 775, "token_acc": 0.8761688115825458, "train_speed(iter/s)": 0.041661 }, { "epoch": 0.9133489461358314, "grad_norm": 0.1838730424642563, "learning_rate": 1.667016833169979e-05, "loss": 0.3616307258605957, "memory(GiB)": 137.67, "step": 780, "token_acc": 0.8749988214255409, "train_speed(iter/s)": 0.041664 }, { "epoch": 0.9192037470725996, "grad_norm": 0.1882750242948532, "learning_rate": 1.6621928120038806e-05, "loss": 0.35453338623046876, "memory(GiB)": 137.67, "step": 785, "token_acc": 0.8650788191817312, "train_speed(iter/s)": 0.041666 }, { "epoch": 0.9250585480093677, "grad_norm": 0.18011753261089325, "learning_rate": 1.657341188968811e-05, "loss": 0.3467398166656494, "memory(GiB)": 137.67, "step": 790, "token_acc": 0.8665571597898215, "train_speed(iter/s)": 0.041668 }, { "epoch": 0.9309133489461359, "grad_norm": 0.1889754831790924, "learning_rate": 1.6524621662926733e-05, "loss": 0.34622554779052733, "memory(GiB)": 137.67, "step": 795, "token_acc": 0.8836526658483215, "train_speed(iter/s)": 0.041671 }, { "epoch": 0.936768149882904, "grad_norm": 0.17811700701713562, "learning_rate": 1.6475559473454558e-05, "loss": 0.35440659523010254, "memory(GiB)": 137.67, "step": 800, "token_acc": 0.8802437890929187, "train_speed(iter/s)": 0.041672 }, { "epoch": 0.9426229508196722, "grad_norm": 0.19011390209197998, "learning_rate": 1.6426227366307563e-05, "loss": 0.3580695629119873, "memory(GiB)": 137.67, "step": 805, "token_acc": 0.8808476204925909, "train_speed(iter/s)": 0.04167 }, { "epoch": 0.9484777517564403, "grad_norm": 0.18688787519931793, "learning_rate": 1.6376627397772576e-05, "loss": 0.35615901947021483, "memory(GiB)": 137.67, "step": 810, "token_acc": 0.8656951211518713, "train_speed(iter/s)": 0.04167 }, { "epoch": 0.9543325526932084, "grad_norm": 0.19855861365795135, "learning_rate": 1.6326761635301572e-05, "loss": 0.3505072116851807, "memory(GiB)": 137.67, "step": 815, "token_acc": 0.8734695802546769, "train_speed(iter/s)": 0.041672 }, { "epoch": 0.9601873536299765, "grad_norm": 0.18500158190727234, "learning_rate": 1.6276632157425475e-05, "loss": 0.35810859203338624, "memory(GiB)": 137.67, "step": 820, "token_acc": 0.8688002942074786, "train_speed(iter/s)": 0.041672 }, { "epoch": 0.9660421545667447, "grad_norm": 0.2135351300239563, "learning_rate": 1.6226241053667536e-05, "loss": 0.3624737739562988, "memory(GiB)": 137.67, "step": 825, "token_acc": 0.8650754688071645, "train_speed(iter/s)": 0.041674 }, { "epoch": 0.9718969555035128, "grad_norm": 0.188192680478096, "learning_rate": 1.617559042445625e-05, "loss": 0.3624725818634033, "memory(GiB)": 137.67, "step": 830, "token_acc": 0.8755614748176581, "train_speed(iter/s)": 0.041674 }, { "epoch": 0.977751756440281, "grad_norm": 0.34307366609573364, "learning_rate": 1.6124682381037767e-05, "loss": 0.34985201358795165, "memory(GiB)": 137.67, "step": 835, "token_acc": 0.8732973013596538, "train_speed(iter/s)": 0.041675 }, { "epoch": 0.9836065573770492, "grad_norm": 0.19902247190475464, "learning_rate": 1.607351904538792e-05, "loss": 0.3641986846923828, "memory(GiB)": 137.67, "step": 840, "token_acc": 0.8725000467718097, "train_speed(iter/s)": 0.041673 }, { "epoch": 0.9894613583138173, "grad_norm": 0.18375855684280396, "learning_rate": 1.6022102550123775e-05, "loss": 0.3507267951965332, "memory(GiB)": 137.67, "step": 845, "token_acc": 0.868225976538805, "train_speed(iter/s)": 0.041674 }, { "epoch": 0.9953161592505855, "grad_norm": 0.19543269276618958, "learning_rate": 1.597043503841471e-05, "loss": 0.3511422395706177, "memory(GiB)": 137.67, "step": 850, "token_acc": 0.8818226402481499, "train_speed(iter/s)": 0.041674 }, { "epoch": 1.0011709601873535, "grad_norm": 0.2594313323497772, "learning_rate": 1.5918518663893124e-05, "loss": 0.3436767339706421, "memory(GiB)": 137.67, "step": 855, "token_acc": 0.8783253667380914, "train_speed(iter/s)": 0.041472 }, { "epoch": 1.0070257611241218, "grad_norm": 0.21433798968791962, "learning_rate": 1.5866355590564637e-05, "loss": 0.31752333641052244, "memory(GiB)": 137.67, "step": 860, "token_acc": 0.8950932956103179, "train_speed(iter/s)": 0.041464 }, { "epoch": 1.0128805620608898, "grad_norm": 0.20641100406646729, "learning_rate": 1.5813947992717894e-05, "loss": 0.3059502601623535, "memory(GiB)": 137.67, "step": 865, "token_acc": 0.8851299275012688, "train_speed(iter/s)": 0.041456 }, { "epoch": 1.018735362997658, "grad_norm": 0.2776026427745819, "learning_rate": 1.5761298054833947e-05, "loss": 0.31491961479187014, "memory(GiB)": 137.67, "step": 870, "token_acc": 0.8871431849329935, "train_speed(iter/s)": 0.041446 }, { "epoch": 1.0245901639344261, "grad_norm": 0.2104882299900055, "learning_rate": 1.5708407971495195e-05, "loss": 0.3215550422668457, "memory(GiB)": 137.67, "step": 875, "token_acc": 0.8840142068123856, "train_speed(iter/s)": 0.041441 }, { "epoch": 1.0304449648711944, "grad_norm": 0.2141922563314438, "learning_rate": 1.565527994729389e-05, "loss": 0.31157307624816893, "memory(GiB)": 137.67, "step": 880, "token_acc": 0.8925077955478237, "train_speed(iter/s)": 0.041435 }, { "epoch": 1.0362997658079625, "grad_norm": 0.19829437136650085, "learning_rate": 1.5601916196740283e-05, "loss": 0.30809755325317384, "memory(GiB)": 137.67, "step": 885, "token_acc": 0.890301896874165, "train_speed(iter/s)": 0.04143 }, { "epoch": 1.0421545667447307, "grad_norm": 0.1938631683588028, "learning_rate": 1.5548318944170276e-05, "loss": 0.30415992736816405, "memory(GiB)": 137.67, "step": 890, "token_acc": 0.8950597362393585, "train_speed(iter/s)": 0.041423 }, { "epoch": 1.0480093676814988, "grad_norm": 0.18822869658470154, "learning_rate": 1.5494490423652732e-05, "loss": 0.30409889221191405, "memory(GiB)": 137.67, "step": 895, "token_acc": 0.8878764647902749, "train_speed(iter/s)": 0.041414 }, { "epoch": 1.053864168618267, "grad_norm": 0.18639546632766724, "learning_rate": 1.544043287889635e-05, "loss": 0.29631519317626953, "memory(GiB)": 137.67, "step": 900, "token_acc": 0.8972942289498581, "train_speed(iter/s)": 0.041408 }, { "epoch": 1.059718969555035, "grad_norm": 0.19313958287239075, "learning_rate": 1.538614856315614e-05, "loss": 0.3089482307434082, "memory(GiB)": 137.67, "step": 905, "token_acc": 0.8947345206627453, "train_speed(iter/s)": 0.041403 }, { "epoch": 1.0655737704918034, "grad_norm": 0.1918047070503235, "learning_rate": 1.5331639739139477e-05, "loss": 0.30376482009887695, "memory(GiB)": 137.67, "step": 910, "token_acc": 0.878863108904361, "train_speed(iter/s)": 0.041394 }, { "epoch": 1.0714285714285714, "grad_norm": 0.17692717909812927, "learning_rate": 1.5276908678911837e-05, "loss": 0.3011662006378174, "memory(GiB)": 137.67, "step": 915, "token_acc": 0.8932026746024828, "train_speed(iter/s)": 0.041388 }, { "epoch": 1.0772833723653397, "grad_norm": 0.1763262152671814, "learning_rate": 1.5221957663802043e-05, "loss": 0.31141071319580077, "memory(GiB)": 137.67, "step": 920, "token_acc": 0.8920435427389305, "train_speed(iter/s)": 0.041376 }, { "epoch": 1.0831381733021077, "grad_norm": 0.1730634868144989, "learning_rate": 1.5166788984307204e-05, "loss": 0.3161822557449341, "memory(GiB)": 137.67, "step": 925, "token_acc": 0.8866250173014735, "train_speed(iter/s)": 0.041367 }, { "epoch": 1.088992974238876, "grad_norm": 0.20834501087665558, "learning_rate": 1.5111404939997227e-05, "loss": 0.3130020618438721, "memory(GiB)": 137.67, "step": 930, "token_acc": 0.8872231505297611, "train_speed(iter/s)": 0.04136 }, { "epoch": 1.094847775175644, "grad_norm": 0.20543096959590912, "learning_rate": 1.5055807839418966e-05, "loss": 0.29431891441345215, "memory(GiB)": 137.67, "step": 935, "token_acc": 0.8923718607539866, "train_speed(iter/s)": 0.041352 }, { "epoch": 1.100702576112412, "grad_norm": 0.1818283647298813, "learning_rate": 1.5000000000000002e-05, "loss": 0.31560554504394533, "memory(GiB)": 137.67, "step": 940, "token_acc": 0.8944428660187143, "train_speed(iter/s)": 0.041347 }, { "epoch": 1.1065573770491803, "grad_norm": 0.18734754621982574, "learning_rate": 1.494398374795204e-05, "loss": 0.30426225662231443, "memory(GiB)": 137.67, "step": 945, "token_acc": 0.8848180693302514, "train_speed(iter/s)": 0.041343 }, { "epoch": 1.1124121779859484, "grad_norm": 0.19308467209339142, "learning_rate": 1.4887761418173947e-05, "loss": 0.32167963981628417, "memory(GiB)": 137.67, "step": 950, "token_acc": 0.8939139882185966, "train_speed(iter/s)": 0.041337 }, { "epoch": 1.1182669789227166, "grad_norm": 0.2532450258731842, "learning_rate": 1.4831335354154444e-05, "loss": 0.30830209255218505, "memory(GiB)": 137.67, "step": 955, "token_acc": 0.887962551140468, "train_speed(iter/s)": 0.041333 }, { "epoch": 1.1241217798594847, "grad_norm": 0.18927785754203796, "learning_rate": 1.4774707907874392e-05, "loss": 0.30596270561218264, "memory(GiB)": 137.67, "step": 960, "token_acc": 0.8945483075403462, "train_speed(iter/s)": 0.041324 }, { "epoch": 1.129976580796253, "grad_norm": 0.18746164441108704, "learning_rate": 1.4717881439708786e-05, "loss": 0.3073431491851807, "memory(GiB)": 137.67, "step": 965, "token_acc": 0.8779535897835228, "train_speed(iter/s)": 0.041318 }, { "epoch": 1.135831381733021, "grad_norm": 0.19065742194652557, "learning_rate": 1.4660858318328348e-05, "loss": 0.30925755500793456, "memory(GiB)": 137.67, "step": 970, "token_acc": 0.8771556147038887, "train_speed(iter/s)": 0.041311 }, { "epoch": 1.1416861826697893, "grad_norm": 0.19082236289978027, "learning_rate": 1.4603640920600813e-05, "loss": 0.31507372856140137, "memory(GiB)": 137.67, "step": 975, "token_acc": 0.8741312286488396, "train_speed(iter/s)": 0.041305 }, { "epoch": 1.1475409836065573, "grad_norm": 0.18480531871318817, "learning_rate": 1.4546231631491827e-05, "loss": 0.3110131025314331, "memory(GiB)": 137.67, "step": 980, "token_acc": 0.8829417142215302, "train_speed(iter/s)": 0.041296 }, { "epoch": 1.1533957845433256, "grad_norm": 0.17675240337848663, "learning_rate": 1.4488632843965573e-05, "loss": 0.3039939641952515, "memory(GiB)": 137.67, "step": 985, "token_acc": 0.8738143036386449, "train_speed(iter/s)": 0.041289 }, { "epoch": 1.1592505854800936, "grad_norm": 0.19089390337467194, "learning_rate": 1.4430846958884995e-05, "loss": 0.31295793056488036, "memory(GiB)": 137.67, "step": 990, "token_acc": 0.8817706633869632, "train_speed(iter/s)": 0.041282 }, { "epoch": 1.165105386416862, "grad_norm": 0.18563120067119598, "learning_rate": 1.4372876384911741e-05, "loss": 0.313909912109375, "memory(GiB)": 137.67, "step": 995, "token_acc": 0.8830196916072904, "train_speed(iter/s)": 0.041276 }, { "epoch": 1.17096018735363, "grad_norm": 0.21534429490566254, "learning_rate": 1.4314723538405752e-05, "loss": 0.3197300910949707, "memory(GiB)": 137.67, "step": 1000, "token_acc": 0.8747241787695568, "train_speed(iter/s)": 0.041271 }, { "epoch": 1.1768149882903982, "grad_norm": 0.19970309734344482, "learning_rate": 1.4256390843324556e-05, "loss": 0.3151378154754639, "memory(GiB)": 137.67, "step": 1005, "token_acc": 0.8791438877655459, "train_speed(iter/s)": 0.041267 }, { "epoch": 1.1826697892271663, "grad_norm": 0.1895560324192047, "learning_rate": 1.4197880731122221e-05, "loss": 0.312138032913208, "memory(GiB)": 137.67, "step": 1010, "token_acc": 0.8795711581097576, "train_speed(iter/s)": 0.041265 }, { "epoch": 1.1885245901639343, "grad_norm": 0.19073544442653656, "learning_rate": 1.4139195640648008e-05, "loss": 0.315081787109375, "memory(GiB)": 137.67, "step": 1015, "token_acc": 0.8921242173646963, "train_speed(iter/s)": 0.041259 }, { "epoch": 1.1943793911007026, "grad_norm": 0.17704617977142334, "learning_rate": 1.4080338018044712e-05, "loss": 0.319437837600708, "memory(GiB)": 137.67, "step": 1020, "token_acc": 0.8815218951006631, "train_speed(iter/s)": 0.041255 }, { "epoch": 1.2002341920374708, "grad_norm": 0.19636361300945282, "learning_rate": 1.4021310316646708e-05, "loss": 0.3087984561920166, "memory(GiB)": 137.67, "step": 1025, "token_acc": 0.8875915980726762, "train_speed(iter/s)": 0.041249 }, { "epoch": 1.2060889929742389, "grad_norm": 0.185128852725029, "learning_rate": 1.3962114996877685e-05, "loss": 0.29653804302215575, "memory(GiB)": 137.67, "step": 1030, "token_acc": 0.894042061938463, "train_speed(iter/s)": 0.041243 }, { "epoch": 1.211943793911007, "grad_norm": 0.18740731477737427, "learning_rate": 1.390275452614808e-05, "loss": 0.2996367454528809, "memory(GiB)": 137.67, "step": 1035, "token_acc": 0.8867371770872332, "train_speed(iter/s)": 0.041239 }, { "epoch": 1.2177985948477752, "grad_norm": 0.19739095866680145, "learning_rate": 1.3843231378752252e-05, "loss": 0.3056778907775879, "memory(GiB)": 137.67, "step": 1040, "token_acc": 0.8844194070047138, "train_speed(iter/s)": 0.041232 }, { "epoch": 1.2236533957845432, "grad_norm": 0.18625736236572266, "learning_rate": 1.3783548035765327e-05, "loss": 0.3101504802703857, "memory(GiB)": 137.67, "step": 1045, "token_acc": 0.8895319577252139, "train_speed(iter/s)": 0.041228 }, { "epoch": 1.2295081967213115, "grad_norm": 0.19391782581806183, "learning_rate": 1.3723706984939783e-05, "loss": 0.2983381271362305, "memory(GiB)": 137.67, "step": 1050, "token_acc": 0.8835933444611258, "train_speed(iter/s)": 0.041224 }, { "epoch": 1.2353629976580796, "grad_norm": 0.18108582496643066, "learning_rate": 1.366371072060177e-05, "loss": 0.3086691379547119, "memory(GiB)": 137.67, "step": 1055, "token_acc": 0.8736720857877966, "train_speed(iter/s)": 0.041218 }, { "epoch": 1.2412177985948478, "grad_norm": 0.18043167889118195, "learning_rate": 1.3603561743547125e-05, "loss": 0.30459914207458494, "memory(GiB)": 137.67, "step": 1060, "token_acc": 0.8805453249562779, "train_speed(iter/s)": 0.041215 }, { "epoch": 1.2470725995316159, "grad_norm": 0.2246876060962677, "learning_rate": 1.3543262560937135e-05, "loss": 0.3085703134536743, "memory(GiB)": 137.67, "step": 1065, "token_acc": 0.8846350880261892, "train_speed(iter/s)": 0.041212 }, { "epoch": 1.2529274004683841, "grad_norm": 0.19236041605472565, "learning_rate": 1.3482815686194033e-05, "loss": 0.2960092306137085, "memory(GiB)": 137.67, "step": 1070, "token_acc": 0.8907122097565549, "train_speed(iter/s)": 0.041208 }, { "epoch": 1.2587822014051522, "grad_norm": 0.1928793489933014, "learning_rate": 1.3422223638896235e-05, "loss": 0.3040574073791504, "memory(GiB)": 137.67, "step": 1075, "token_acc": 0.886298144007927, "train_speed(iter/s)": 0.041204 }, { "epoch": 1.2646370023419204, "grad_norm": 0.20902785658836365, "learning_rate": 1.3361488944673315e-05, "loss": 0.31267333030700684, "memory(GiB)": 137.67, "step": 1080, "token_acc": 0.8800496737817911, "train_speed(iter/s)": 0.041199 }, { "epoch": 1.2704918032786885, "grad_norm": 0.18985559046268463, "learning_rate": 1.3300614135100736e-05, "loss": 0.3105930805206299, "memory(GiB)": 137.67, "step": 1085, "token_acc": 0.8869882389382489, "train_speed(iter/s)": 0.041194 }, { "epoch": 1.2763466042154565, "grad_norm": 0.17671886086463928, "learning_rate": 1.3239601747594319e-05, "loss": 0.310105037689209, "memory(GiB)": 137.67, "step": 1090, "token_acc": 0.8870674524554854, "train_speed(iter/s)": 0.041187 }, { "epoch": 1.2822014051522248, "grad_norm": 0.17825712263584137, "learning_rate": 1.3178454325304472e-05, "loss": 0.31207849979400637, "memory(GiB)": 137.67, "step": 1095, "token_acc": 0.876942551728449, "train_speed(iter/s)": 0.041183 }, { "epoch": 1.288056206088993, "grad_norm": 0.1821722686290741, "learning_rate": 1.3117174417010213e-05, "loss": 0.2980069637298584, "memory(GiB)": 137.67, "step": 1100, "token_acc": 0.8805069421513594, "train_speed(iter/s)": 0.041179 }, { "epoch": 1.2939110070257611, "grad_norm": 0.18626025319099426, "learning_rate": 1.3055764577012892e-05, "loss": 0.3255163669586182, "memory(GiB)": 137.67, "step": 1105, "token_acc": 0.8920352101893313, "train_speed(iter/s)": 0.041176 }, { "epoch": 1.2997658079625292, "grad_norm": 0.18716710805892944, "learning_rate": 1.2994227365029752e-05, "loss": 0.30793008804321287, "memory(GiB)": 137.67, "step": 1110, "token_acc": 0.8887493130250451, "train_speed(iter/s)": 0.041173 }, { "epoch": 1.3056206088992974, "grad_norm": 0.19421324133872986, "learning_rate": 1.2932565346087218e-05, "loss": 0.3134599208831787, "memory(GiB)": 137.67, "step": 1115, "token_acc": 0.8847875557218118, "train_speed(iter/s)": 0.041168 }, { "epoch": 1.3114754098360657, "grad_norm": 0.18218953907489777, "learning_rate": 1.2870781090413991e-05, "loss": 0.3120888710021973, "memory(GiB)": 137.67, "step": 1120, "token_acc": 0.8869988305263882, "train_speed(iter/s)": 0.041162 }, { "epoch": 1.3173302107728337, "grad_norm": 0.19175498187541962, "learning_rate": 1.2808877173333896e-05, "loss": 0.30698199272155763, "memory(GiB)": 137.67, "step": 1125, "token_acc": 0.8941062176165803, "train_speed(iter/s)": 0.041159 }, { "epoch": 1.3231850117096018, "grad_norm": 0.18965595960617065, "learning_rate": 1.2746856175158556e-05, "loss": 0.31497323513031006, "memory(GiB)": 137.67, "step": 1130, "token_acc": 0.8871100459606847, "train_speed(iter/s)": 0.041157 }, { "epoch": 1.32903981264637, "grad_norm": 0.18627162277698517, "learning_rate": 1.2684720681079825e-05, "loss": 0.31060152053833007, "memory(GiB)": 137.67, "step": 1135, "token_acc": 0.871316468541155, "train_speed(iter/s)": 0.041153 }, { "epoch": 1.334894613583138, "grad_norm": 0.18565431237220764, "learning_rate": 1.2622473281062042e-05, "loss": 0.31475396156311036, "memory(GiB)": 137.67, "step": 1140, "token_acc": 0.8868342272670575, "train_speed(iter/s)": 0.04115 }, { "epoch": 1.3407494145199064, "grad_norm": 0.20739679038524628, "learning_rate": 1.256011656973406e-05, "loss": 0.32018194198608396, "memory(GiB)": 137.67, "step": 1145, "token_acc": 0.8872068230277186, "train_speed(iter/s)": 0.041147 }, { "epoch": 1.3466042154566744, "grad_norm": 0.1901317983865738, "learning_rate": 1.2497653146281113e-05, "loss": 0.3108601331710815, "memory(GiB)": 137.67, "step": 1150, "token_acc": 0.8855189570357069, "train_speed(iter/s)": 0.041141 }, { "epoch": 1.3524590163934427, "grad_norm": 0.16836309432983398, "learning_rate": 1.2435085614336459e-05, "loss": 0.315748405456543, "memory(GiB)": 137.67, "step": 1155, "token_acc": 0.8928414676966292, "train_speed(iter/s)": 0.041138 }, { "epoch": 1.3583138173302107, "grad_norm": 0.18492159247398376, "learning_rate": 1.2372416581872857e-05, "loss": 0.3051302909851074, "memory(GiB)": 137.67, "step": 1160, "token_acc": 0.8906577988281189, "train_speed(iter/s)": 0.041133 }, { "epoch": 1.364168618266979, "grad_norm": 0.17753958702087402, "learning_rate": 1.2309648661093878e-05, "loss": 0.3092564582824707, "memory(GiB)": 137.67, "step": 1165, "token_acc": 0.8921087343363074, "train_speed(iter/s)": 0.041129 }, { "epoch": 1.370023419203747, "grad_norm": 0.18764352798461914, "learning_rate": 1.2246784468324993e-05, "loss": 0.3163435935974121, "memory(GiB)": 137.67, "step": 1170, "token_acc": 0.8760536792329402, "train_speed(iter/s)": 0.041124 }, { "epoch": 1.3758782201405153, "grad_norm": 0.19416891038417816, "learning_rate": 1.218382662390454e-05, "loss": 0.3042860507965088, "memory(GiB)": 137.67, "step": 1175, "token_acc": 0.875018486527648, "train_speed(iter/s)": 0.041121 }, { "epoch": 1.3817330210772834, "grad_norm": 0.18030278384685516, "learning_rate": 1.2120777752074492e-05, "loss": 0.3132922172546387, "memory(GiB)": 137.67, "step": 1180, "token_acc": 0.8838601600050099, "train_speed(iter/s)": 0.041116 }, { "epoch": 1.3875878220140514, "grad_norm": 0.2763387858867645, "learning_rate": 1.2057640480871084e-05, "loss": 0.3143471240997314, "memory(GiB)": 137.67, "step": 1185, "token_acc": 0.8852224576271186, "train_speed(iter/s)": 0.041114 }, { "epoch": 1.3934426229508197, "grad_norm": 0.17999497056007385, "learning_rate": 1.1994417442015243e-05, "loss": 0.31265532970428467, "memory(GiB)": 137.67, "step": 1190, "token_acc": 0.8907372436335803, "train_speed(iter/s)": 0.041112 }, { "epoch": 1.399297423887588, "grad_norm": 0.18372628092765808, "learning_rate": 1.193111127080292e-05, "loss": 0.30383052825927737, "memory(GiB)": 137.67, "step": 1195, "token_acc": 0.8938835107946411, "train_speed(iter/s)": 0.041109 }, { "epoch": 1.405152224824356, "grad_norm": 0.1798890382051468, "learning_rate": 1.186772460599523e-05, "loss": 0.30336918830871584, "memory(GiB)": 137.67, "step": 1200, "token_acc": 0.891896889446055, "train_speed(iter/s)": 0.041105 }, { "epoch": 1.411007025761124, "grad_norm": 0.1862761676311493, "learning_rate": 1.1804260089708464e-05, "loss": 0.3127150535583496, "memory(GiB)": 137.67, "step": 1205, "token_acc": 0.8781827694454133, "train_speed(iter/s)": 0.041099 }, { "epoch": 1.4168618266978923, "grad_norm": 0.1872834414243698, "learning_rate": 1.1740720367303958e-05, "loss": 0.3076412916183472, "memory(GiB)": 137.67, "step": 1210, "token_acc": 0.8865224656924374, "train_speed(iter/s)": 0.041096 }, { "epoch": 1.4227166276346606, "grad_norm": 0.1868448704481125, "learning_rate": 1.1677108087277835e-05, "loss": 0.3139200210571289, "memory(GiB)": 137.67, "step": 1215, "token_acc": 0.8866469436643504, "train_speed(iter/s)": 0.041092 }, { "epoch": 1.4285714285714286, "grad_norm": 0.1959424465894699, "learning_rate": 1.1613425901150595e-05, "loss": 0.3134448051452637, "memory(GiB)": 137.67, "step": 1220, "token_acc": 0.8883061552452257, "train_speed(iter/s)": 0.041088 }, { "epoch": 1.4344262295081966, "grad_norm": 0.1766284704208374, "learning_rate": 1.15496764633566e-05, "loss": 0.3212412357330322, "memory(GiB)": 137.67, "step": 1225, "token_acc": 0.8780539320458743, "train_speed(iter/s)": 0.041084 }, { "epoch": 1.440281030444965, "grad_norm": 0.17711302638053894, "learning_rate": 1.1485862431133445e-05, "loss": 0.3123058795928955, "memory(GiB)": 137.67, "step": 1230, "token_acc": 0.8900835233492141, "train_speed(iter/s)": 0.041082 }, { "epoch": 1.446135831381733, "grad_norm": 0.1747256964445114, "learning_rate": 1.1421986464411169e-05, "loss": 0.31295697689056395, "memory(GiB)": 137.67, "step": 1235, "token_acc": 0.8767080016888458, "train_speed(iter/s)": 0.041075 }, { "epoch": 1.4519906323185012, "grad_norm": 0.18440908193588257, "learning_rate": 1.1358051225701404e-05, "loss": 0.30406386852264405, "memory(GiB)": 137.67, "step": 1240, "token_acc": 0.8795020947920581, "train_speed(iter/s)": 0.041071 }, { "epoch": 1.4578454332552693, "grad_norm": 0.17828240990638733, "learning_rate": 1.1294059379986384e-05, "loss": 0.3121625900268555, "memory(GiB)": 137.67, "step": 1245, "token_acc": 0.880069535801541, "train_speed(iter/s)": 0.041066 }, { "epoch": 1.4637002341920375, "grad_norm": 0.19148212671279907, "learning_rate": 1.1230013594607874e-05, "loss": 0.31345176696777344, "memory(GiB)": 137.67, "step": 1250, "token_acc": 0.8839757074137398, "train_speed(iter/s)": 0.041062 }, { "epoch": 1.4695550351288056, "grad_norm": 0.1828489750623703, "learning_rate": 1.1165916539155968e-05, "loss": 0.3104730129241943, "memory(GiB)": 137.67, "step": 1255, "token_acc": 0.8880499764055864, "train_speed(iter/s)": 0.04106 }, { "epoch": 1.4754098360655736, "grad_norm": 0.17934924364089966, "learning_rate": 1.1101770885357843e-05, "loss": 0.3066437244415283, "memory(GiB)": 137.67, "step": 1260, "token_acc": 0.8892594538641362, "train_speed(iter/s)": 0.041058 }, { "epoch": 1.481264637002342, "grad_norm": 0.16536173224449158, "learning_rate": 1.1037579306966365e-05, "loss": 0.3071906566619873, "memory(GiB)": 137.67, "step": 1265, "token_acc": 0.8958809106175363, "train_speed(iter/s)": 0.041054 }, { "epoch": 1.4871194379391102, "grad_norm": 0.18694446980953217, "learning_rate": 1.0973344479648652e-05, "loss": 0.3013455867767334, "memory(GiB)": 137.67, "step": 1270, "token_acc": 0.8899813852868301, "train_speed(iter/s)": 0.04105 }, { "epoch": 1.4929742388758782, "grad_norm": 0.17580904066562653, "learning_rate": 1.0909069080874556e-05, "loss": 0.30318174362182615, "memory(GiB)": 137.67, "step": 1275, "token_acc": 0.8817699648607147, "train_speed(iter/s)": 0.041047 }, { "epoch": 1.4988290398126463, "grad_norm": 0.18754124641418457, "learning_rate": 1.0844755789805042e-05, "loss": 0.31064305305480955, "memory(GiB)": 137.67, "step": 1280, "token_acc": 0.8804021416788542, "train_speed(iter/s)": 0.041044 }, { "epoch": 1.5046838407494145, "grad_norm": 0.19590285420417786, "learning_rate": 1.0780407287180526e-05, "loss": 0.3148102045059204, "memory(GiB)": 137.67, "step": 1285, "token_acc": 0.8805457351989244, "train_speed(iter/s)": 0.041039 }, { "epoch": 1.5105386416861828, "grad_norm": 0.19473980367183685, "learning_rate": 1.0716026255209124e-05, "loss": 0.3106101036071777, "memory(GiB)": 137.67, "step": 1290, "token_acc": 0.879328668153049, "train_speed(iter/s)": 0.041037 }, { "epoch": 1.5163934426229508, "grad_norm": 0.18378229439258575, "learning_rate": 1.0651615377454872e-05, "loss": 0.3110929250717163, "memory(GiB)": 137.67, "step": 1295, "token_acc": 0.8856033818930429, "train_speed(iter/s)": 0.041033 }, { "epoch": 1.5222482435597189, "grad_norm": 0.18482638895511627, "learning_rate": 1.0587177338725834e-05, "loss": 0.3163102626800537, "memory(GiB)": 137.67, "step": 1300, "token_acc": 0.8870778115329991, "train_speed(iter/s)": 0.04103 }, { "epoch": 1.5281030444964872, "grad_norm": 0.17333081364631653, "learning_rate": 1.0522714824962228e-05, "loss": 0.30377721786499023, "memory(GiB)": 137.67, "step": 1305, "token_acc": 0.8980077050082553, "train_speed(iter/s)": 0.041028 }, { "epoch": 1.5339578454332554, "grad_norm": 0.1912304162979126, "learning_rate": 1.0458230523124443e-05, "loss": 0.3162518501281738, "memory(GiB)": 137.67, "step": 1310, "token_acc": 0.8886457770855507, "train_speed(iter/s)": 0.041024 }, { "epoch": 1.5398126463700235, "grad_norm": 0.1846192628145218, "learning_rate": 1.0393727121081057e-05, "loss": 0.3126535892486572, "memory(GiB)": 137.67, "step": 1315, "token_acc": 0.8860128586991429, "train_speed(iter/s)": 0.041023 }, { "epoch": 1.5456674473067915, "grad_norm": 0.17747725546360016, "learning_rate": 1.0329207307496785e-05, "loss": 0.30208649635314944, "memory(GiB)": 137.67, "step": 1320, "token_acc": 0.8879456759093934, "train_speed(iter/s)": 0.04102 }, { "epoch": 1.5515222482435598, "grad_norm": 0.18443572521209717, "learning_rate": 1.0264673771720429e-05, "loss": 0.3092689037322998, "memory(GiB)": 137.67, "step": 1325, "token_acc": 0.892488839320581, "train_speed(iter/s)": 0.041016 }, { "epoch": 1.5573770491803278, "grad_norm": 0.18431353569030762, "learning_rate": 1.0200129203672754e-05, "loss": 0.3100308656692505, "memory(GiB)": 137.67, "step": 1330, "token_acc": 0.8782463261547713, "train_speed(iter/s)": 0.041012 }, { "epoch": 1.5632318501170959, "grad_norm": 0.1662471741437912, "learning_rate": 1.0135576293734381e-05, "loss": 0.30292906761169436, "memory(GiB)": 137.67, "step": 1335, "token_acc": 0.8942868271402976, "train_speed(iter/s)": 0.04101 }, { "epoch": 1.5690866510538641, "grad_norm": 0.1806328445672989, "learning_rate": 1.007101773263365e-05, "loss": 0.31366329193115233, "memory(GiB)": 137.67, "step": 1340, "token_acc": 0.8866166119192868, "train_speed(iter/s)": 0.041006 }, { "epoch": 1.5749414519906324, "grad_norm": 0.16915848851203918, "learning_rate": 1.0006456211334445e-05, "loss": 0.30766754150390624, "memory(GiB)": 137.67, "step": 1345, "token_acc": 0.8863719744503918, "train_speed(iter/s)": 0.041006 }, { "epoch": 1.5807962529274004, "grad_norm": 0.16690009832382202, "learning_rate": 9.941894420924044e-06, "loss": 0.3059431314468384, "memory(GiB)": 137.67, "step": 1350, "token_acc": 0.8971780549005762, "train_speed(iter/s)": 0.041001 }, { "epoch": 1.5866510538641685, "grad_norm": 0.17337647080421448, "learning_rate": 9.87733505250094e-06, "loss": 0.3098172664642334, "memory(GiB)": 137.67, "step": 1355, "token_acc": 0.8863237006126697, "train_speed(iter/s)": 0.040998 }, { "epoch": 1.5925058548009368, "grad_norm": 0.17512920498847961, "learning_rate": 9.812780797062678e-06, "loss": 0.30655522346496583, "memory(GiB)": 137.67, "step": 1360, "token_acc": 0.8899597184053006, "train_speed(iter/s)": 0.040993 }, { "epoch": 1.598360655737705, "grad_norm": 0.1765688955783844, "learning_rate": 9.748234345393672e-06, "loss": 0.3023026466369629, "memory(GiB)": 137.67, "step": 1365, "token_acc": 0.8879338667133921, "train_speed(iter/s)": 0.040989 }, { "epoch": 1.604215456674473, "grad_norm": 0.18416614830493927, "learning_rate": 9.68369838795306e-06, "loss": 0.30958683490753175, "memory(GiB)": 137.67, "step": 1370, "token_acc": 0.8849809108691687, "train_speed(iter/s)": 0.040984 }, { "epoch": 1.6100702576112411, "grad_norm": 0.17386697232723236, "learning_rate": 9.61917561476255e-06, "loss": 0.30420713424682616, "memory(GiB)": 137.67, "step": 1375, "token_acc": 0.8786233528080887, "train_speed(iter/s)": 0.040981 }, { "epoch": 1.6159250585480094, "grad_norm": 0.18169918656349182, "learning_rate": 9.554668715294305e-06, "loss": 0.31483819484710696, "memory(GiB)": 137.67, "step": 1380, "token_acc": 0.8864194675551166, "train_speed(iter/s)": 0.040979 }, { "epoch": 1.6217798594847777, "grad_norm": 0.1892368197441101, "learning_rate": 9.490180378358826e-06, "loss": 0.3172303676605225, "memory(GiB)": 137.67, "step": 1385, "token_acc": 0.8828729942067092, "train_speed(iter/s)": 0.040977 }, { "epoch": 1.6276346604215457, "grad_norm": 0.1751379817724228, "learning_rate": 9.425713291992878e-06, "loss": 0.30653929710388184, "memory(GiB)": 137.67, "step": 1390, "token_acc": 0.8895787320550146, "train_speed(iter/s)": 0.040974 }, { "epoch": 1.6334894613583137, "grad_norm": 0.18914154171943665, "learning_rate": 9.361270143347452e-06, "loss": 0.31959149837493894, "memory(GiB)": 137.67, "step": 1395, "token_acc": 0.8822264278089348, "train_speed(iter/s)": 0.040972 }, { "epoch": 1.639344262295082, "grad_norm": 0.16736507415771484, "learning_rate": 9.296853618575753e-06, "loss": 0.30730547904968264, "memory(GiB)": 137.67, "step": 1400, "token_acc": 0.8928722715040367, "train_speed(iter/s)": 0.04097 }, { "epoch": 1.6451990632318503, "grad_norm": 0.1708020716905594, "learning_rate": 9.232466402721241e-06, "loss": 0.31717801094055176, "memory(GiB)": 137.67, "step": 1405, "token_acc": 0.886989175916414, "train_speed(iter/s)": 0.040969 }, { "epoch": 1.651053864168618, "grad_norm": 0.17622792720794678, "learning_rate": 9.1681111796057e-06, "loss": 0.3083082675933838, "memory(GiB)": 137.67, "step": 1410, "token_acc": 0.8884494066990437, "train_speed(iter/s)": 0.040968 }, { "epoch": 1.6569086651053864, "grad_norm": 0.1885053962469101, "learning_rate": 9.103790631717375e-06, "loss": 0.32230064868927, "memory(GiB)": 137.67, "step": 1415, "token_acc": 0.878518037454961, "train_speed(iter/s)": 0.040965 }, { "epoch": 1.6627634660421546, "grad_norm": 0.17244482040405273, "learning_rate": 9.039507440099164e-06, "loss": 0.30806798934936525, "memory(GiB)": 137.67, "step": 1420, "token_acc": 0.8929606011942812, "train_speed(iter/s)": 0.040962 }, { "epoch": 1.6686182669789227, "grad_norm": 0.18172700703144073, "learning_rate": 8.975264284236866e-06, "loss": 0.30987024307250977, "memory(GiB)": 137.67, "step": 1425, "token_acc": 0.8885019605876434, "train_speed(iter/s)": 0.040961 }, { "epoch": 1.6744730679156907, "grad_norm": 0.18555694818496704, "learning_rate": 8.911063841947476e-06, "loss": 0.31224822998046875, "memory(GiB)": 137.67, "step": 1430, "token_acc": 0.8862099925232826, "train_speed(iter/s)": 0.040958 }, { "epoch": 1.680327868852459, "grad_norm": 0.18322236835956573, "learning_rate": 8.846908789267589e-06, "loss": 0.31196701526641846, "memory(GiB)": 137.67, "step": 1435, "token_acc": 0.8887980814742356, "train_speed(iter/s)": 0.040958 }, { "epoch": 1.6861826697892273, "grad_norm": 0.17747406661510468, "learning_rate": 8.78280180034184e-06, "loss": 0.3032996654510498, "memory(GiB)": 137.67, "step": 1440, "token_acc": 0.8822490977332802, "train_speed(iter/s)": 0.040955 }, { "epoch": 1.6920374707259953, "grad_norm": 0.18120799958705902, "learning_rate": 8.718745547311458e-06, "loss": 0.3137194633483887, "memory(GiB)": 137.67, "step": 1445, "token_acc": 0.8828540900663084, "train_speed(iter/s)": 0.040951 }, { "epoch": 1.6978922716627634, "grad_norm": 0.17743031680583954, "learning_rate": 8.654742700202849e-06, "loss": 0.31336297988891604, "memory(GiB)": 137.67, "step": 1450, "token_acc": 0.8851623130427727, "train_speed(iter/s)": 0.040949 }, { "epoch": 1.7037470725995316, "grad_norm": 0.1702745109796524, "learning_rate": 8.590795926816348e-06, "loss": 0.3027879953384399, "memory(GiB)": 137.67, "step": 1455, "token_acc": 0.8840805588371897, "train_speed(iter/s)": 0.040947 }, { "epoch": 1.7096018735362999, "grad_norm": 0.17240740358829498, "learning_rate": 8.526907892614986e-06, "loss": 0.3072841167449951, "memory(GiB)": 137.67, "step": 1460, "token_acc": 0.88948632592922, "train_speed(iter/s)": 0.040943 }, { "epoch": 1.715456674473068, "grad_norm": 0.17982088029384613, "learning_rate": 8.463081260613391e-06, "loss": 0.30924406051635744, "memory(GiB)": 137.67, "step": 1465, "token_acc": 0.8940978807037782, "train_speed(iter/s)": 0.04094 }, { "epoch": 1.721311475409836, "grad_norm": 0.19751447439193726, "learning_rate": 8.399318691266806e-06, "loss": 0.3119847774505615, "memory(GiB)": 137.67, "step": 1470, "token_acc": 0.8852366571009662, "train_speed(iter/s)": 0.040936 }, { "epoch": 1.7271662763466042, "grad_norm": 0.18603962659835815, "learning_rate": 8.335622842360168e-06, "loss": 0.3066195011138916, "memory(GiB)": 137.67, "step": 1475, "token_acc": 0.8890113777789009, "train_speed(iter/s)": 0.040933 }, { "epoch": 1.7330210772833725, "grad_norm": 0.2541693449020386, "learning_rate": 8.271996368897345e-06, "loss": 0.3128560781478882, "memory(GiB)": 137.67, "step": 1480, "token_acc": 0.8902386961489684, "train_speed(iter/s)": 0.040929 }, { "epoch": 1.7388758782201406, "grad_norm": 0.16992934048175812, "learning_rate": 8.208441922990454e-06, "loss": 0.3037855863571167, "memory(GiB)": 137.67, "step": 1485, "token_acc": 0.8849534643226473, "train_speed(iter/s)": 0.040926 }, { "epoch": 1.7447306791569086, "grad_norm": 0.17065441608428955, "learning_rate": 8.144962153749331e-06, "loss": 0.30540289878845217, "memory(GiB)": 137.67, "step": 1490, "token_acc": 0.8819315749736371, "train_speed(iter/s)": 0.040924 }, { "epoch": 1.7505854800936769, "grad_norm": 0.1787635236978531, "learning_rate": 8.081559707171094e-06, "loss": 0.31698925495147706, "memory(GiB)": 137.67, "step": 1495, "token_acc": 0.8824724072862914, "train_speed(iter/s)": 0.040923 }, { "epoch": 1.756440281030445, "grad_norm": 0.1751013845205307, "learning_rate": 8.01823722602986e-06, "loss": 0.30347585678100586, "memory(GiB)": 137.67, "step": 1500, "token_acc": 0.893298859486769, "train_speed(iter/s)": 0.040922 }, { "epoch": 1.762295081967213, "grad_norm": 0.17399156093597412, "learning_rate": 7.954997349766576e-06, "loss": 0.3116060972213745, "memory(GiB)": 137.67, "step": 1505, "token_acc": 0.8889070320988275, "train_speed(iter/s)": 0.040921 }, { "epoch": 1.7681498829039812, "grad_norm": 0.18837633728981018, "learning_rate": 7.891842714379027e-06, "loss": 0.29880785942077637, "memory(GiB)": 137.67, "step": 1510, "token_acc": 0.893647204719971, "train_speed(iter/s)": 0.040918 }, { "epoch": 1.7740046838407495, "grad_norm": 0.1845746487379074, "learning_rate": 7.828775952311921e-06, "loss": 0.30261945724487305, "memory(GiB)": 137.67, "step": 1515, "token_acc": 0.8851783808483535, "train_speed(iter/s)": 0.040914 }, { "epoch": 1.7798594847775175, "grad_norm": 0.16885152459144592, "learning_rate": 7.765799692347201e-06, "loss": 0.3042313575744629, "memory(GiB)": 137.67, "step": 1520, "token_acc": 0.8835214994418757, "train_speed(iter/s)": 0.040911 }, { "epoch": 1.7857142857142856, "grad_norm": 0.1790182739496231, "learning_rate": 7.702916559494444e-06, "loss": 0.31259956359863283, "memory(GiB)": 137.67, "step": 1525, "token_acc": 0.8878653758934018, "train_speed(iter/s)": 0.040909 }, { "epoch": 1.7915690866510539, "grad_norm": 0.17695166170597076, "learning_rate": 7.64012917488146e-06, "loss": 0.29359025955200196, "memory(GiB)": 137.67, "step": 1530, "token_acc": 0.9000399023492115, "train_speed(iter/s)": 0.040908 }, { "epoch": 1.7974238875878221, "grad_norm": 0.18347503244876862, "learning_rate": 7.577440155645028e-06, "loss": 0.30249216556549074, "memory(GiB)": 137.67, "step": 1535, "token_acc": 0.8902694639046774, "train_speed(iter/s)": 0.040904 }, { "epoch": 1.8032786885245902, "grad_norm": 0.1697729527950287, "learning_rate": 7.514852114821811e-06, "loss": 0.31291751861572265, "memory(GiB)": 137.67, "step": 1540, "token_acc": 0.8868685350765146, "train_speed(iter/s)": 0.040902 }, { "epoch": 1.8091334894613582, "grad_norm": 0.16477090120315552, "learning_rate": 7.452367661239433e-06, "loss": 0.29220216274261473, "memory(GiB)": 137.67, "step": 1545, "token_acc": 0.8877543630965312, "train_speed(iter/s)": 0.040899 }, { "epoch": 1.8149882903981265, "grad_norm": 0.19079044461250305, "learning_rate": 7.389989399407741e-06, "loss": 0.3156083106994629, "memory(GiB)": 137.67, "step": 1550, "token_acc": 0.8873283112245697, "train_speed(iter/s)": 0.040896 }, { "epoch": 1.8208430913348947, "grad_norm": 0.1723940074443817, "learning_rate": 7.3277199294102485e-06, "loss": 0.30045547485351565, "memory(GiB)": 137.67, "step": 1555, "token_acc": 0.8850201501823112, "train_speed(iter/s)": 0.040894 }, { "epoch": 1.8266978922716628, "grad_norm": 0.18594853579998016, "learning_rate": 7.265561846795741e-06, "loss": 0.3101131677627563, "memory(GiB)": 137.67, "step": 1560, "token_acc": 0.8868083283139077, "train_speed(iter/s)": 0.040889 }, { "epoch": 1.8325526932084308, "grad_norm": 0.1757504642009735, "learning_rate": 7.203517742470101e-06, "loss": 0.30873966217041016, "memory(GiB)": 137.67, "step": 1565, "token_acc": 0.8949954641669187, "train_speed(iter/s)": 0.040886 }, { "epoch": 1.838407494145199, "grad_norm": 0.2077726572751999, "learning_rate": 7.141590202588312e-06, "loss": 0.3127377986907959, "memory(GiB)": 137.67, "step": 1570, "token_acc": 0.888584743745537, "train_speed(iter/s)": 0.040885 }, { "epoch": 1.8442622950819674, "grad_norm": 0.17814461886882782, "learning_rate": 7.079781808446648e-06, "loss": 0.31596999168395995, "memory(GiB)": 137.67, "step": 1575, "token_acc": 0.8755756783669405, "train_speed(iter/s)": 0.040882 }, { "epoch": 1.8501170960187352, "grad_norm": 0.16512958705425262, "learning_rate": 7.018095136375089e-06, "loss": 0.3012762308120728, "memory(GiB)": 137.67, "step": 1580, "token_acc": 0.8862999993707803, "train_speed(iter/s)": 0.04088 }, { "epoch": 1.8559718969555035, "grad_norm": 0.18698780238628387, "learning_rate": 6.956532757629945e-06, "loss": 0.3080646514892578, "memory(GiB)": 137.67, "step": 1585, "token_acc": 0.8861714900322669, "train_speed(iter/s)": 0.040877 }, { "epoch": 1.8618266978922717, "grad_norm": 0.17041386663913727, "learning_rate": 6.89509723828665e-06, "loss": 0.3119032382965088, "memory(GiB)": 137.67, "step": 1590, "token_acc": 0.8861256952099799, "train_speed(iter/s)": 0.040875 }, { "epoch": 1.8676814988290398, "grad_norm": 0.18812042474746704, "learning_rate": 6.833791139132824e-06, "loss": 0.2984042167663574, "memory(GiB)": 137.67, "step": 1595, "token_acc": 0.8881694299555838, "train_speed(iter/s)": 0.040871 }, { "epoch": 1.8735362997658078, "grad_norm": 0.16663610935211182, "learning_rate": 6.772617015561529e-06, "loss": 0.3069270610809326, "memory(GiB)": 137.67, "step": 1600, "token_acc": 0.8785419403265153, "train_speed(iter/s)": 0.040869 }, { "epoch": 1.879391100702576, "grad_norm": 0.16731353104114532, "learning_rate": 6.7115774174647475e-06, "loss": 0.29993810653686526, "memory(GiB)": 137.67, "step": 1605, "token_acc": 0.8944355407195264, "train_speed(iter/s)": 0.040868 }, { "epoch": 1.8852459016393444, "grad_norm": 0.18671032786369324, "learning_rate": 6.6506748891271045e-06, "loss": 0.3104290723800659, "memory(GiB)": 137.67, "step": 1610, "token_acc": 0.893398089707724, "train_speed(iter/s)": 0.040866 }, { "epoch": 1.8911007025761124, "grad_norm": 0.17069920897483826, "learning_rate": 6.5899119691198025e-06, "loss": 0.30440511703491213, "memory(GiB)": 137.67, "step": 1615, "token_acc": 0.8883004841907675, "train_speed(iter/s)": 0.040865 }, { "epoch": 1.8969555035128804, "grad_norm": 0.1704709678888321, "learning_rate": 6.529291190194829e-06, "loss": 0.3084626436233521, "memory(GiB)": 137.67, "step": 1620, "token_acc": 0.887373335138147, "train_speed(iter/s)": 0.040864 }, { "epoch": 1.9028103044496487, "grad_norm": 0.1708633005619049, "learning_rate": 6.468815079179364e-06, "loss": 0.30423784255981445, "memory(GiB)": 137.67, "step": 1625, "token_acc": 0.8923868074324853, "train_speed(iter/s)": 0.040862 }, { "epoch": 1.908665105386417, "grad_norm": 0.17672830820083618, "learning_rate": 6.408486156870466e-06, "loss": 0.31655054092407225, "memory(GiB)": 137.67, "step": 1630, "token_acc": 0.8692423282788768, "train_speed(iter/s)": 0.04086 }, { "epoch": 1.914519906323185, "grad_norm": 0.1735108494758606, "learning_rate": 6.348306937929991e-06, "loss": 0.31425652503967283, "memory(GiB)": 137.67, "step": 1635, "token_acc": 0.882395514622517, "train_speed(iter/s)": 0.04086 }, { "epoch": 1.920374707259953, "grad_norm": 0.15910685062408447, "learning_rate": 6.288279930779789e-06, "loss": 0.29740355014801023, "memory(GiB)": 137.67, "step": 1640, "token_acc": 0.8963298424379659, "train_speed(iter/s)": 0.040858 }, { "epoch": 1.9262295081967213, "grad_norm": 0.17650458216667175, "learning_rate": 6.228407637497131e-06, "loss": 0.30800676345825195, "memory(GiB)": 137.67, "step": 1645, "token_acc": 0.8754677877967858, "train_speed(iter/s)": 0.040855 }, { "epoch": 1.9320843091334896, "grad_norm": 0.16745297610759735, "learning_rate": 6.1686925537104306e-06, "loss": 0.2977410316467285, "memory(GiB)": 137.67, "step": 1650, "token_acc": 0.8798736234089867, "train_speed(iter/s)": 0.040852 }, { "epoch": 1.9379391100702577, "grad_norm": 0.1728445142507553, "learning_rate": 6.109137168495205e-06, "loss": 0.304546856880188, "memory(GiB)": 137.67, "step": 1655, "token_acc": 0.9005831398969597, "train_speed(iter/s)": 0.040851 }, { "epoch": 1.9437939110070257, "grad_norm": 0.1682547777891159, "learning_rate": 6.049743964270336e-06, "loss": 0.3136142730712891, "memory(GiB)": 137.67, "step": 1660, "token_acc": 0.8856946741131322, "train_speed(iter/s)": 0.040848 }, { "epoch": 1.949648711943794, "grad_norm": 0.18915309011936188, "learning_rate": 5.990515416694591e-06, "loss": 0.3113490104675293, "memory(GiB)": 137.67, "step": 1665, "token_acc": 0.8886227731406503, "train_speed(iter/s)": 0.040845 }, { "epoch": 1.955503512880562, "grad_norm": 0.18081413209438324, "learning_rate": 5.931453994563434e-06, "loss": 0.30602524280548093, "memory(GiB)": 137.67, "step": 1670, "token_acc": 0.8937767328555647, "train_speed(iter/s)": 0.040844 }, { "epoch": 1.96135831381733, "grad_norm": 0.2595233917236328, "learning_rate": 5.872562159706116e-06, "loss": 0.309699273109436, "memory(GiB)": 137.67, "step": 1675, "token_acc": 0.883843976093111, "train_speed(iter/s)": 0.040842 }, { "epoch": 1.9672131147540983, "grad_norm": 0.17678314447402954, "learning_rate": 5.8138423668830605e-06, "loss": 0.30298714637756347, "memory(GiB)": 137.67, "step": 1680, "token_acc": 0.8865513684995878, "train_speed(iter/s)": 0.040842 }, { "epoch": 1.9730679156908666, "grad_norm": 0.1795545518398285, "learning_rate": 5.755297063683551e-06, "loss": 0.30653939247131345, "memory(GiB)": 137.67, "step": 1685, "token_acc": 0.8907540567138181, "train_speed(iter/s)": 0.040841 }, { "epoch": 1.9789227166276346, "grad_norm": 0.17241141200065613, "learning_rate": 5.696928690423693e-06, "loss": 0.30241034030914304, "memory(GiB)": 137.67, "step": 1690, "token_acc": 0.8856109987263056, "train_speed(iter/s)": 0.040841 }, { "epoch": 1.9847775175644027, "grad_norm": 0.1767030656337738, "learning_rate": 5.638739680044718e-06, "loss": 0.3159188270568848, "memory(GiB)": 137.67, "step": 1695, "token_acc": 0.8789045280418222, "train_speed(iter/s)": 0.040839 }, { "epoch": 1.990632318501171, "grad_norm": 0.1798180490732193, "learning_rate": 5.580732458011544e-06, "loss": 0.3054344654083252, "memory(GiB)": 137.67, "step": 1700, "token_acc": 0.8914613695909465, "train_speed(iter/s)": 0.040837 }, { "epoch": 1.9964871194379392, "grad_norm": 0.1673898547887802, "learning_rate": 5.522909442211708e-06, "loss": 0.3050167798995972, "memory(GiB)": 137.67, "step": 1705, "token_acc": 0.8836358249226172, "train_speed(iter/s)": 0.040834 }, { "epoch": 2.002341920374707, "grad_norm": 0.24459093809127808, "learning_rate": 5.465273042854551e-06, "loss": 0.2896696090698242, "memory(GiB)": 137.67, "step": 1710, "token_acc": 0.8956877534575909, "train_speed(iter/s)": 0.040723 }, { "epoch": 2.0081967213114753, "grad_norm": 0.19826985895633698, "learning_rate": 5.407825662370778e-06, "loss": 0.2708754301071167, "memory(GiB)": 137.67, "step": 1715, "token_acc": 0.8993573677984775, "train_speed(iter/s)": 0.040721 }, { "epoch": 2.0140515222482436, "grad_norm": 0.20230858027935028, "learning_rate": 5.350569695312313e-06, "loss": 0.27931761741638184, "memory(GiB)": 137.67, "step": 1720, "token_acc": 0.8964727026237073, "train_speed(iter/s)": 0.040718 }, { "epoch": 2.019906323185012, "grad_norm": 0.17940187454223633, "learning_rate": 5.293507528252474e-06, "loss": 0.2833970308303833, "memory(GiB)": 137.67, "step": 1725, "token_acc": 0.8971622665586578, "train_speed(iter/s)": 0.040716 }, { "epoch": 2.0257611241217797, "grad_norm": 0.2274295687675476, "learning_rate": 5.236641539686518e-06, "loss": 0.2709039211273193, "memory(GiB)": 137.67, "step": 1730, "token_acc": 0.8940215607642851, "train_speed(iter/s)": 0.040716 }, { "epoch": 2.031615925058548, "grad_norm": 0.17937658727169037, "learning_rate": 5.179974099932472e-06, "loss": 0.2649374961853027, "memory(GiB)": 137.67, "step": 1735, "token_acc": 0.8949033413934375, "train_speed(iter/s)": 0.040713 }, { "epoch": 2.037470725995316, "grad_norm": 0.1847214251756668, "learning_rate": 5.12350757103236e-06, "loss": 0.26505355834960936, "memory(GiB)": 137.67, "step": 1740, "token_acc": 0.8981974914281606, "train_speed(iter/s)": 0.040712 }, { "epoch": 2.0433255269320845, "grad_norm": 0.1737840622663498, "learning_rate": 5.067244306653736e-06, "loss": 0.27186686992645265, "memory(GiB)": 137.67, "step": 1745, "token_acc": 0.9053836113307479, "train_speed(iter/s)": 0.040711 }, { "epoch": 2.0491803278688523, "grad_norm": 0.1807735711336136, "learning_rate": 5.0111866519915575e-06, "loss": 0.2668013334274292, "memory(GiB)": 137.67, "step": 1750, "token_acc": 0.8954151927308955, "train_speed(iter/s)": 0.040709 }, { "epoch": 2.0550351288056206, "grad_norm": 0.17946134507656097, "learning_rate": 4.95533694367047e-06, "loss": 0.26618137359619143, "memory(GiB)": 137.67, "step": 1755, "token_acc": 0.8999696707241193, "train_speed(iter/s)": 0.040708 }, { "epoch": 2.060889929742389, "grad_norm": 0.17995508015155792, "learning_rate": 4.899697509647379e-06, "loss": 0.27054500579833984, "memory(GiB)": 137.67, "step": 1760, "token_acc": 0.8920381030958765, "train_speed(iter/s)": 0.040707 }, { "epoch": 2.066744730679157, "grad_norm": 0.22271017730236053, "learning_rate": 4.844270669114424e-06, "loss": 0.2727907657623291, "memory(GiB)": 137.67, "step": 1765, "token_acc": 0.9031526316777533, "train_speed(iter/s)": 0.040706 }, { "epoch": 2.072599531615925, "grad_norm": 0.18377523124217987, "learning_rate": 4.789058732402319e-06, "loss": 0.26617846488952634, "memory(GiB)": 137.67, "step": 1770, "token_acc": 0.8968159437280188, "train_speed(iter/s)": 0.040704 }, { "epoch": 2.078454332552693, "grad_norm": 0.18358266353607178, "learning_rate": 4.734064000884044e-06, "loss": 0.2815399646759033, "memory(GiB)": 137.67, "step": 1775, "token_acc": 0.8860162596527972, "train_speed(iter/s)": 0.040703 }, { "epoch": 2.0843091334894615, "grad_norm": 0.17939767241477966, "learning_rate": 4.679288766878908e-06, "loss": 0.2770793914794922, "memory(GiB)": 137.67, "step": 1780, "token_acc": 0.8990350010749907, "train_speed(iter/s)": 0.0407 }, { "epoch": 2.0901639344262297, "grad_norm": 0.18252268433570862, "learning_rate": 4.624735313557019e-06, "loss": 0.27314205169677735, "memory(GiB)": 137.67, "step": 1785, "token_acc": 0.9036665729722977, "train_speed(iter/s)": 0.040699 }, { "epoch": 2.0960187353629975, "grad_norm": 0.17692163586616516, "learning_rate": 4.570405914844105e-06, "loss": 0.26518521308898924, "memory(GiB)": 137.67, "step": 1790, "token_acc": 0.9007013796506218, "train_speed(iter/s)": 0.040696 }, { "epoch": 2.101873536299766, "grad_norm": 0.1812998205423355, "learning_rate": 4.516302835326723e-06, "loss": 0.27246594429016113, "memory(GiB)": 137.67, "step": 1795, "token_acc": 0.9057411329497284, "train_speed(iter/s)": 0.040694 }, { "epoch": 2.107728337236534, "grad_norm": 0.17790301144123077, "learning_rate": 4.462428330157886e-06, "loss": 0.2635958671569824, "memory(GiB)": 137.67, "step": 1800, "token_acc": 0.9060071718018364, "train_speed(iter/s)": 0.040692 }, { "epoch": 2.113583138173302, "grad_norm": 0.1772291511297226, "learning_rate": 4.4087846449630475e-06, "loss": 0.2673187732696533, "memory(GiB)": 137.67, "step": 1805, "token_acc": 0.902466497498459, "train_speed(iter/s)": 0.040691 }, { "epoch": 2.11943793911007, "grad_norm": 0.1833985149860382, "learning_rate": 4.355374015746493e-06, "loss": 0.26436376571655273, "memory(GiB)": 137.67, "step": 1810, "token_acc": 0.8990824248093747, "train_speed(iter/s)": 0.040688 }, { "epoch": 2.1252927400468384, "grad_norm": 0.1888750046491623, "learning_rate": 4.302198668798159e-06, "loss": 0.2690884113311768, "memory(GiB)": 137.67, "step": 1815, "token_acc": 0.8948256326325066, "train_speed(iter/s)": 0.040688 }, { "epoch": 2.1311475409836067, "grad_norm": 0.1726667881011963, "learning_rate": 4.249260820600813e-06, "loss": 0.2568142175674438, "memory(GiB)": 137.67, "step": 1820, "token_acc": 0.9027062619756462, "train_speed(iter/s)": 0.040686 }, { "epoch": 2.1370023419203745, "grad_norm": 0.18242421746253967, "learning_rate": 4.1965626777376766e-06, "loss": 0.26575822830200196, "memory(GiB)": 137.67, "step": 1825, "token_acc": 0.9058191422116245, "train_speed(iter/s)": 0.040685 }, { "epoch": 2.142857142857143, "grad_norm": 0.17865152657032013, "learning_rate": 4.144106436800453e-06, "loss": 0.2705830097198486, "memory(GiB)": 137.67, "step": 1830, "token_acc": 0.9064275903781455, "train_speed(iter/s)": 0.040686 }, { "epoch": 2.148711943793911, "grad_norm": 0.1739743947982788, "learning_rate": 4.091894284297758e-06, "loss": 0.262749981880188, "memory(GiB)": 137.67, "step": 1835, "token_acc": 0.8932282627390278, "train_speed(iter/s)": 0.040684 }, { "epoch": 2.1545667447306793, "grad_norm": 0.18693114817142487, "learning_rate": 4.039928396563983e-06, "loss": 0.27836999893188474, "memory(GiB)": 137.67, "step": 1840, "token_acc": 0.8999278596166879, "train_speed(iter/s)": 0.040683 }, { "epoch": 2.160421545667447, "grad_norm": 0.18225987255573273, "learning_rate": 3.9882109396685845e-06, "loss": 0.25630941390991213, "memory(GiB)": 137.67, "step": 1845, "token_acc": 0.8964322481719588, "train_speed(iter/s)": 0.04068 }, { "epoch": 2.1662763466042154, "grad_norm": 0.1680818498134613, "learning_rate": 3.936744069325797e-06, "loss": 0.25788373947143556, "memory(GiB)": 137.67, "step": 1850, "token_acc": 0.9047133964952628, "train_speed(iter/s)": 0.040677 }, { "epoch": 2.1721311475409837, "grad_norm": 0.17563344538211823, "learning_rate": 3.885529930804768e-06, "loss": 0.2534646987915039, "memory(GiB)": 137.67, "step": 1855, "token_acc": 0.895904841548197, "train_speed(iter/s)": 0.040675 }, { "epoch": 2.177985948477752, "grad_norm": 0.2031351625919342, "learning_rate": 3.834570658840152e-06, "loss": 0.2712204933166504, "memory(GiB)": 137.67, "step": 1860, "token_acc": 0.8943131411791787, "train_speed(iter/s)": 0.040674 }, { "epoch": 2.1838407494145198, "grad_norm": 0.1767955720424652, "learning_rate": 3.7838683775431106e-06, "loss": 0.26442804336547854, "memory(GiB)": 137.67, "step": 1865, "token_acc": 0.9006802168952266, "train_speed(iter/s)": 0.040673 }, { "epoch": 2.189695550351288, "grad_norm": 0.17129677534103394, "learning_rate": 3.733425200312797e-06, "loss": 0.2669063091278076, "memory(GiB)": 137.67, "step": 1870, "token_acc": 0.8917139826542709, "train_speed(iter/s)": 0.040672 }, { "epoch": 2.1955503512880563, "grad_norm": 0.17820899188518524, "learning_rate": 3.683243229748249e-06, "loss": 0.2608784198760986, "memory(GiB)": 137.67, "step": 1875, "token_acc": 0.8967133346325762, "train_speed(iter/s)": 0.04067 }, { "epoch": 2.201405152224824, "grad_norm": 0.18119502067565918, "learning_rate": 3.633324557560747e-06, "loss": 0.265275239944458, "memory(GiB)": 137.67, "step": 1880, "token_acc": 0.9029575814389501, "train_speed(iter/s)": 0.040669 }, { "epoch": 2.2072599531615924, "grad_norm": 0.17707428336143494, "learning_rate": 3.5836712644866277e-06, "loss": 0.2611743450164795, "memory(GiB)": 137.67, "step": 1885, "token_acc": 0.8965409189329774, "train_speed(iter/s)": 0.040668 }, { "epoch": 2.2131147540983607, "grad_norm": 0.1768161803483963, "learning_rate": 3.5342854202005696e-06, "loss": 0.26110024452209474, "memory(GiB)": 137.67, "step": 1890, "token_acc": 0.9035024093649873, "train_speed(iter/s)": 0.040667 }, { "epoch": 2.218969555035129, "grad_norm": 0.17210449278354645, "learning_rate": 3.485169083229293e-06, "loss": 0.26915616989135743, "memory(GiB)": 137.67, "step": 1895, "token_acc": 0.9061759392893929, "train_speed(iter/s)": 0.040667 }, { "epoch": 2.2248243559718968, "grad_norm": 0.16969619691371918, "learning_rate": 3.4363243008657842e-06, "loss": 0.2634119987487793, "memory(GiB)": 137.67, "step": 1900, "token_acc": 0.8916742749773309, "train_speed(iter/s)": 0.040664 }, { "epoch": 2.230679156908665, "grad_norm": 0.17764930427074432, "learning_rate": 3.3877531090839478e-06, "loss": 0.2685534000396729, "memory(GiB)": 137.67, "step": 1905, "token_acc": 0.8940042290704804, "train_speed(iter/s)": 0.040663 }, { "epoch": 2.2365339578454333, "grad_norm": 0.17651669681072235, "learning_rate": 3.3394575324537327e-06, "loss": 0.27190165519714354, "memory(GiB)": 137.67, "step": 1910, "token_acc": 0.8928626982497402, "train_speed(iter/s)": 0.04066 }, { "epoch": 2.2423887587822016, "grad_norm": 0.16508856415748596, "learning_rate": 3.2914395840567605e-06, "loss": 0.2606737852096558, "memory(GiB)": 137.67, "step": 1915, "token_acc": 0.9028335241642236, "train_speed(iter/s)": 0.040658 }, { "epoch": 2.2482435597189694, "grad_norm": 0.16644766926765442, "learning_rate": 3.2437012654024057e-06, "loss": 0.2660099983215332, "memory(GiB)": 137.67, "step": 1920, "token_acc": 0.9046304613618784, "train_speed(iter/s)": 0.040656 }, { "epoch": 2.2540983606557377, "grad_norm": 0.16391952335834503, "learning_rate": 3.1962445663443643e-06, "loss": 0.2678091287612915, "memory(GiB)": 137.67, "step": 1925, "token_acc": 0.8979980130091664, "train_speed(iter/s)": 0.040653 }, { "epoch": 2.259953161592506, "grad_norm": 0.1803101897239685, "learning_rate": 3.1490714649977196e-06, "loss": 0.27110137939453127, "memory(GiB)": 137.67, "step": 1930, "token_acc": 0.905863734174048, "train_speed(iter/s)": 0.04065 }, { "epoch": 2.265807962529274, "grad_norm": 0.17323030531406403, "learning_rate": 3.102183927656488e-06, "loss": 0.26174540519714357, "memory(GiB)": 137.67, "step": 1935, "token_acc": 0.8902694797112273, "train_speed(iter/s)": 0.040649 }, { "epoch": 2.271662763466042, "grad_norm": 0.18379603326320648, "learning_rate": 3.0555839087116547e-06, "loss": 0.27245678901672366, "memory(GiB)": 137.67, "step": 1940, "token_acc": 0.90194375, "train_speed(iter/s)": 0.040648 }, { "epoch": 2.2775175644028103, "grad_norm": 0.1765807718038559, "learning_rate": 3.009273350569705e-06, "loss": 0.2700004816055298, "memory(GiB)": 137.67, "step": 1945, "token_acc": 0.9060629034421867, "train_speed(iter/s)": 0.040648 }, { "epoch": 2.2833723653395785, "grad_norm": 0.17609137296676636, "learning_rate": 2.963254183571682e-06, "loss": 0.2663255214691162, "memory(GiB)": 137.67, "step": 1950, "token_acc": 0.9028553183442811, "train_speed(iter/s)": 0.040646 }, { "epoch": 2.289227166276347, "grad_norm": 0.1761084645986557, "learning_rate": 2.9175283259126943e-06, "loss": 0.2662710428237915, "memory(GiB)": 137.67, "step": 1955, "token_acc": 0.9068832885430957, "train_speed(iter/s)": 0.040645 }, { "epoch": 2.2950819672131146, "grad_norm": 0.16875940561294556, "learning_rate": 2.872097683561986e-06, "loss": 0.2650928497314453, "memory(GiB)": 137.67, "step": 1960, "token_acc": 0.9107070141504632, "train_speed(iter/s)": 0.040644 }, { "epoch": 2.300936768149883, "grad_norm": 0.18349847197532654, "learning_rate": 2.8269641501834834e-06, "loss": 0.2731610298156738, "memory(GiB)": 137.67, "step": 1965, "token_acc": 0.8929668563025367, "train_speed(iter/s)": 0.040644 }, { "epoch": 2.306791569086651, "grad_norm": 0.17049305140972137, "learning_rate": 2.782129607056848e-06, "loss": 0.2668560028076172, "memory(GiB)": 137.67, "step": 1970, "token_acc": 0.8946301039908395, "train_speed(iter/s)": 0.040643 }, { "epoch": 2.312646370023419, "grad_norm": 0.17511935532093048, "learning_rate": 2.7375959229990856e-06, "loss": 0.25858211517333984, "memory(GiB)": 137.67, "step": 1975, "token_acc": 0.9011111249984377, "train_speed(iter/s)": 0.040641 }, { "epoch": 2.3185011709601873, "grad_norm": 0.16913901269435883, "learning_rate": 2.6933649542866326e-06, "loss": 0.2623398780822754, "memory(GiB)": 137.67, "step": 1980, "token_acc": 0.8980817363368075, "train_speed(iter/s)": 0.04064 }, { "epoch": 2.3243559718969555, "grad_norm": 0.16392305493354797, "learning_rate": 2.649438544577977e-06, "loss": 0.25210521221160886, "memory(GiB)": 137.67, "step": 1985, "token_acc": 0.9006790772077851, "train_speed(iter/s)": 0.040639 }, { "epoch": 2.330210772833724, "grad_norm": 0.16555212438106537, "learning_rate": 2.6058185248368317e-06, "loss": 0.26413559913635254, "memory(GiB)": 137.67, "step": 1990, "token_acc": 0.9057566877776727, "train_speed(iter/s)": 0.040637 }, { "epoch": 2.3360655737704916, "grad_norm": 0.17122185230255127, "learning_rate": 2.562506713255789e-06, "loss": 0.2596926689147949, "memory(GiB)": 137.67, "step": 1995, "token_acc": 0.9047409789878514, "train_speed(iter/s)": 0.040636 }, { "epoch": 2.34192037470726, "grad_norm": 0.17818881571292877, "learning_rate": 2.519504915180555e-06, "loss": 0.2623495101928711, "memory(GiB)": 137.67, "step": 2000, "token_acc": 0.9031698814490531, "train_speed(iter/s)": 0.040635 }, { "epoch": 2.347775175644028, "grad_norm": 0.17120912671089172, "learning_rate": 2.4768149230346917e-06, "loss": 0.2763922929763794, "memory(GiB)": 137.67, "step": 2005, "token_acc": 0.90147262555157, "train_speed(iter/s)": 0.040633 }, { "epoch": 2.3536299765807964, "grad_norm": 0.1725643426179886, "learning_rate": 2.4344385162448924e-06, "loss": 0.26347975730895995, "memory(GiB)": 137.67, "step": 2010, "token_acc": 0.9056239470479484, "train_speed(iter/s)": 0.040632 }, { "epoch": 2.3594847775175642, "grad_norm": 0.17098568379878998, "learning_rate": 2.392377461166826e-06, "loss": 0.26201567649841306, "memory(GiB)": 137.67, "step": 2015, "token_acc": 0.9030459083951856, "train_speed(iter/s)": 0.040631 }, { "epoch": 2.3653395784543325, "grad_norm": 0.17561163008213043, "learning_rate": 2.350633511011511e-06, "loss": 0.26811957359313965, "memory(GiB)": 137.67, "step": 2020, "token_acc": 0.8995977151723318, "train_speed(iter/s)": 0.040628 }, { "epoch": 2.371194379391101, "grad_norm": 0.1689569056034088, "learning_rate": 2.309208405772221e-06, "loss": 0.2759255409240723, "memory(GiB)": 137.67, "step": 2025, "token_acc": 0.9044138910892334, "train_speed(iter/s)": 0.040628 }, { "epoch": 2.3770491803278686, "grad_norm": 0.26568159461021423, "learning_rate": 2.2681038721519768e-06, "loss": 0.2785911560058594, "memory(GiB)": 137.67, "step": 2030, "token_acc": 0.8982950398323113, "train_speed(iter/s)": 0.040625 }, { "epoch": 2.382903981264637, "grad_norm": 0.18388140201568604, "learning_rate": 2.227321623491563e-06, "loss": 0.26940011978149414, "memory(GiB)": 137.67, "step": 2035, "token_acc": 0.8968315203642803, "train_speed(iter/s)": 0.040624 }, { "epoch": 2.388758782201405, "grad_norm": 0.16938382387161255, "learning_rate": 2.186863359698108e-06, "loss": 0.26633501052856445, "memory(GiB)": 137.67, "step": 2040, "token_acc": 0.9180211235459854, "train_speed(iter/s)": 0.040622 }, { "epoch": 2.3946135831381734, "grad_norm": 0.17878937721252441, "learning_rate": 2.1467307671742377e-06, "loss": 0.2687513828277588, "memory(GiB)": 137.67, "step": 2045, "token_acc": 0.8974434682640148, "train_speed(iter/s)": 0.040621 }, { "epoch": 2.4004683840749417, "grad_norm": 0.1779458373785019, "learning_rate": 2.106925518747779e-06, "loss": 0.26202917098999023, "memory(GiB)": 137.67, "step": 2050, "token_acc": 0.9011938413047829, "train_speed(iter/s)": 0.04062 }, { "epoch": 2.4063231850117095, "grad_norm": 0.17342902719974518, "learning_rate": 2.06744927360202e-06, "loss": 0.26468615531921386, "memory(GiB)": 137.67, "step": 2055, "token_acc": 0.8999491938022672, "train_speed(iter/s)": 0.040617 }, { "epoch": 2.4121779859484778, "grad_norm": 0.17159196734428406, "learning_rate": 2.0283036772065712e-06, "loss": 0.26631085872650145, "memory(GiB)": 137.67, "step": 2060, "token_acc": 0.904679059271446, "train_speed(iter/s)": 0.040615 }, { "epoch": 2.418032786885246, "grad_norm": 0.19288575649261475, "learning_rate": 1.9894903612487683e-06, "loss": 0.2730381488800049, "memory(GiB)": 137.67, "step": 2065, "token_acc": 0.8923981017844846, "train_speed(iter/s)": 0.040614 }, { "epoch": 2.423887587822014, "grad_norm": 0.17374974489212036, "learning_rate": 1.9510109435656457e-06, "loss": 0.27329106330871583, "memory(GiB)": 137.67, "step": 2070, "token_acc": 0.9024526900268184, "train_speed(iter/s)": 0.040613 }, { "epoch": 2.429742388758782, "grad_norm": 0.1817113608121872, "learning_rate": 1.9128670280765283e-06, "loss": 0.27490620613098143, "memory(GiB)": 137.67, "step": 2075, "token_acc": 0.8959030374086766, "train_speed(iter/s)": 0.040611 }, { "epoch": 2.4355971896955504, "grad_norm": 0.17148195207118988, "learning_rate": 1.8750602047161603e-06, "loss": 0.26430578231811525, "memory(GiB)": 137.67, "step": 2080, "token_acc": 0.9074351491670378, "train_speed(iter/s)": 0.040609 }, { "epoch": 2.4414519906323187, "grad_norm": 0.1715674251317978, "learning_rate": 1.8375920493684264e-06, "loss": 0.2722649574279785, "memory(GiB)": 137.67, "step": 2085, "token_acc": 0.8960112888052681, "train_speed(iter/s)": 0.040609 }, { "epoch": 2.4473067915690865, "grad_norm": 0.1820991337299347, "learning_rate": 1.8004641238006815e-06, "loss": 0.2675884485244751, "memory(GiB)": 137.67, "step": 2090, "token_acc": 0.9040590405904059, "train_speed(iter/s)": 0.040607 }, { "epoch": 2.4531615925058547, "grad_norm": 0.1691906452178955, "learning_rate": 1.7636779755986443e-06, "loss": 0.2732096195220947, "memory(GiB)": 137.67, "step": 2095, "token_acc": 0.8958253626778894, "train_speed(iter/s)": 0.040605 }, { "epoch": 2.459016393442623, "grad_norm": 0.17061816155910492, "learning_rate": 1.7272351381018792e-06, "loss": 0.2712996482849121, "memory(GiB)": 137.67, "step": 2100, "token_acc": 0.8880485387880261, "train_speed(iter/s)": 0.040603 }, { "epoch": 2.4648711943793913, "grad_norm": 0.17594653367996216, "learning_rate": 1.6911371303399048e-06, "loss": 0.2586531162261963, "memory(GiB)": 137.67, "step": 2105, "token_acc": 0.9022650028060307, "train_speed(iter/s)": 0.0406 }, { "epoch": 2.470725995316159, "grad_norm": 0.18380020558834076, "learning_rate": 1.6553854569688632e-06, "loss": 0.2727813720703125, "memory(GiB)": 137.67, "step": 2110, "token_acc": 0.8974262645615947, "train_speed(iter/s)": 0.040598 }, { "epoch": 2.4765807962529274, "grad_norm": 0.16742826998233795, "learning_rate": 1.619981608208796e-06, "loss": 0.2734941244125366, "memory(GiB)": 137.67, "step": 2115, "token_acc": 0.8847918638392509, "train_speed(iter/s)": 0.040597 }, { "epoch": 2.4824355971896956, "grad_norm": 0.17516812682151794, "learning_rate": 1.584927059781548e-06, "loss": 0.2728161334991455, "memory(GiB)": 137.67, "step": 2120, "token_acc": 0.8936656628114019, "train_speed(iter/s)": 0.040595 }, { "epoch": 2.4882903981264635, "grad_norm": 0.17867887020111084, "learning_rate": 1.5502232728492362e-06, "loss": 0.264336085319519, "memory(GiB)": 137.67, "step": 2125, "token_acc": 0.9031589138208336, "train_speed(iter/s)": 0.040594 }, { "epoch": 2.4941451990632317, "grad_norm": 0.17173421382904053, "learning_rate": 1.5158716939533524e-06, "loss": 0.27242002487182615, "memory(GiB)": 137.67, "step": 2130, "token_acc": 0.8990930988723483, "train_speed(iter/s)": 0.040593 }, { "epoch": 2.5, "grad_norm": 0.1708640456199646, "learning_rate": 1.4818737549544725e-06, "loss": 0.27319111824035647, "memory(GiB)": 137.67, "step": 2135, "token_acc": 0.8916305799253722, "train_speed(iter/s)": 0.040593 }, { "epoch": 2.5058548009367683, "grad_norm": 0.17307148873806, "learning_rate": 1.448230872972568e-06, "loss": 0.2695432424545288, "memory(GiB)": 137.67, "step": 2140, "token_acc": 0.905385863209386, "train_speed(iter/s)": 0.04059 }, { "epoch": 2.5117096018735365, "grad_norm": 0.17106083035469055, "learning_rate": 1.4149444503279297e-06, "loss": 0.27602252960205076, "memory(GiB)": 137.67, "step": 2145, "token_acc": 0.8923752322136868, "train_speed(iter/s)": 0.040589 }, { "epoch": 2.5175644028103044, "grad_norm": 0.17844541370868683, "learning_rate": 1.382015874482735e-06, "loss": 0.2688480615615845, "memory(GiB)": 137.67, "step": 2150, "token_acc": 0.8990480241183902, "train_speed(iter/s)": 0.040588 }, { "epoch": 2.5234192037470726, "grad_norm": 0.17703387141227722, "learning_rate": 1.3494465179831895e-06, "loss": 0.26667649745941163, "memory(GiB)": 137.67, "step": 2155, "token_acc": 0.8957748501946923, "train_speed(iter/s)": 0.040587 }, { "epoch": 2.529274004683841, "grad_norm": 0.1624777913093567, "learning_rate": 1.3172377384023393e-06, "loss": 0.26247563362121584, "memory(GiB)": 137.67, "step": 2160, "token_acc": 0.9005889918174871, "train_speed(iter/s)": 0.040586 }, { "epoch": 2.5351288056206087, "grad_norm": 0.17209553718566895, "learning_rate": 1.2853908782834722e-06, "loss": 0.2671672821044922, "memory(GiB)": 137.67, "step": 2165, "token_acc": 0.9070119235517494, "train_speed(iter/s)": 0.040583 }, { "epoch": 2.540983606557377, "grad_norm": 0.17611093819141388, "learning_rate": 1.2539072650841523e-06, "loss": 0.2725430250167847, "memory(GiB)": 137.67, "step": 2170, "token_acc": 0.8966264886593998, "train_speed(iter/s)": 0.040583 }, { "epoch": 2.5468384074941453, "grad_norm": 0.1783149093389511, "learning_rate": 1.2227882111209011e-06, "loss": 0.27568228244781495, "memory(GiB)": 137.67, "step": 2175, "token_acc": 0.8947381499658321, "train_speed(iter/s)": 0.040582 }, { "epoch": 2.552693208430913, "grad_norm": 0.17337878048419952, "learning_rate": 1.1920350135144898e-06, "loss": 0.269814133644104, "memory(GiB)": 137.67, "step": 2180, "token_acc": 0.9041164343092762, "train_speed(iter/s)": 0.040581 }, { "epoch": 2.5585480093676813, "grad_norm": 0.16845281422138214, "learning_rate": 1.1616489541358678e-06, "loss": 0.26679143905639646, "memory(GiB)": 137.67, "step": 2185, "token_acc": 0.8939169722162003, "train_speed(iter/s)": 0.04058 }, { "epoch": 2.5644028103044496, "grad_norm": 0.17022623121738434, "learning_rate": 1.1316312995527424e-06, "loss": 0.2700947761535645, "memory(GiB)": 137.67, "step": 2190, "token_acc": 0.8979253112033195, "train_speed(iter/s)": 0.040579 }, { "epoch": 2.570257611241218, "grad_norm": 0.16687875986099243, "learning_rate": 1.1019833009767744e-06, "loss": 0.268681001663208, "memory(GiB)": 137.67, "step": 2195, "token_acc": 0.8966215038230679, "train_speed(iter/s)": 0.040578 }, { "epoch": 2.576112412177986, "grad_norm": 0.17770424485206604, "learning_rate": 1.072706194211426e-06, "loss": 0.27028694152832033, "memory(GiB)": 137.67, "step": 2200, "token_acc": 0.9029025046417339, "train_speed(iter/s)": 0.040577 }, { "epoch": 2.581967213114754, "grad_norm": 0.17755696177482605, "learning_rate": 1.0438011996004581e-06, "loss": 0.269865894317627, "memory(GiB)": 137.67, "step": 2205, "token_acc": 0.8967394005666286, "train_speed(iter/s)": 0.040575 }, { "epoch": 2.5878220140515222, "grad_norm": 0.17752693593502045, "learning_rate": 1.0152695219770558e-06, "loss": 0.257364559173584, "memory(GiB)": 137.67, "step": 2210, "token_acc": 0.9068669110660224, "train_speed(iter/s)": 0.040573 }, { "epoch": 2.5936768149882905, "grad_norm": 0.16752499341964722, "learning_rate": 9.871123506136037e-07, "loss": 0.2638521194458008, "memory(GiB)": 137.67, "step": 2215, "token_acc": 0.9083980061833554, "train_speed(iter/s)": 0.040572 }, { "epoch": 2.5995316159250583, "grad_norm": 0.16032443940639496, "learning_rate": 9.593308591721274e-07, "loss": 0.2622210025787354, "memory(GiB)": 137.67, "step": 2220, "token_acc": 0.900316748757648, "train_speed(iter/s)": 0.040571 }, { "epoch": 2.6053864168618266, "grad_norm": 0.17415659129619598, "learning_rate": 9.319262056553602e-07, "loss": 0.2700244903564453, "memory(GiB)": 137.67, "step": 2225, "token_acc": 0.9051188644286028, "train_speed(iter/s)": 0.040569 }, { "epoch": 2.611241217798595, "grad_norm": 0.1722276359796524, "learning_rate": 9.048995323584764e-07, "loss": 0.2738530397415161, "memory(GiB)": 137.67, "step": 2230, "token_acc": 0.9079698943901274, "train_speed(iter/s)": 0.040568 }, { "epoch": 2.617096018735363, "grad_norm": 0.17455357313156128, "learning_rate": 8.78251965821485e-07, "loss": 0.25915350914001467, "memory(GiB)": 137.67, "step": 2235, "token_acc": 0.9004381754945836, "train_speed(iter/s)": 0.040566 }, { "epoch": 2.6229508196721314, "grad_norm": 0.17298012971878052, "learning_rate": 8.519846167822665e-07, "loss": 0.2638465404510498, "memory(GiB)": 137.67, "step": 2240, "token_acc": 0.9118884831119326, "train_speed(iter/s)": 0.040565 }, { "epoch": 2.628805620608899, "grad_norm": 0.1699805110692978, "learning_rate": 8.260985801302734e-07, "loss": 0.25593223571777346, "memory(GiB)": 137.67, "step": 2245, "token_acc": 0.8991087959330969, "train_speed(iter/s)": 0.040565 }, { "epoch": 2.6346604215456675, "grad_norm": 0.1722072809934616, "learning_rate": 8.005949348608977e-07, "loss": 0.2674243927001953, "memory(GiB)": 137.67, "step": 2250, "token_acc": 0.8965253065997911, "train_speed(iter/s)": 0.040563 }, { "epoch": 2.6405152224824358, "grad_norm": 0.1668199747800827, "learning_rate": 7.754747440304911e-07, "loss": 0.27177164554595945, "memory(GiB)": 137.67, "step": 2255, "token_acc": 0.8954008941320247, "train_speed(iter/s)": 0.040563 }, { "epoch": 2.6463700234192036, "grad_norm": 0.16813580691814423, "learning_rate": 7.507390547120541e-07, "loss": 0.2651193857192993, "memory(GiB)": 137.67, "step": 2260, "token_acc": 0.8984925665335315, "train_speed(iter/s)": 0.040562 }, { "epoch": 2.652224824355972, "grad_norm": 0.17678076028823853, "learning_rate": 7.263888979515954e-07, "loss": 0.27275819778442384, "memory(GiB)": 137.67, "step": 2265, "token_acc": 0.8936288874184706, "train_speed(iter/s)": 0.040562 }, { "epoch": 2.65807962529274, "grad_norm": 0.16264022886753082, "learning_rate": 7.024252887251548e-07, "loss": 0.2669191360473633, "memory(GiB)": 137.67, "step": 2270, "token_acc": 0.8972385552618926, "train_speed(iter/s)": 0.04056 }, { "epoch": 2.663934426229508, "grad_norm": 0.1690565049648285, "learning_rate": 6.788492258964896e-07, "loss": 0.2695984125137329, "memory(GiB)": 137.67, "step": 2275, "token_acc": 0.8963350061434133, "train_speed(iter/s)": 0.040559 }, { "epoch": 2.669789227166276, "grad_norm": 0.1730775386095047, "learning_rate": 6.556616921754489e-07, "loss": 0.26709651947021484, "memory(GiB)": 137.67, "step": 2280, "token_acc": 0.9004803898235022, "train_speed(iter/s)": 0.040558 }, { "epoch": 2.6756440281030445, "grad_norm": 0.1701081544160843, "learning_rate": 6.328636540770028e-07, "loss": 0.26933286190032957, "memory(GiB)": 137.67, "step": 2285, "token_acc": 0.898853457766213, "train_speed(iter/s)": 0.040557 }, { "epoch": 2.6814988290398127, "grad_norm": 0.19118832051753998, "learning_rate": 6.10456061880963e-07, "loss": 0.2741654396057129, "memory(GiB)": 137.67, "step": 2290, "token_acc": 0.9025216185680262, "train_speed(iter/s)": 0.040556 }, { "epoch": 2.687353629976581, "grad_norm": 0.17062994837760925, "learning_rate": 5.884398495923727e-07, "loss": 0.2640299558639526, "memory(GiB)": 137.67, "step": 2295, "token_acc": 0.8934425971755339, "train_speed(iter/s)": 0.040556 }, { "epoch": 2.693208430913349, "grad_norm": 0.18749327957630157, "learning_rate": 5.668159349025649e-07, "loss": 0.2795866966247559, "memory(GiB)": 137.67, "step": 2300, "token_acc": 0.8874596974206349, "train_speed(iter/s)": 0.040554 }, { "epoch": 2.699063231850117, "grad_norm": 0.1760568916797638, "learning_rate": 5.455852191509214e-07, "loss": 0.27616961002349855, "memory(GiB)": 137.67, "step": 2305, "token_acc": 0.8910418230197176, "train_speed(iter/s)": 0.040553 }, { "epoch": 2.7049180327868854, "grad_norm": 0.1760990172624588, "learning_rate": 5.247485872873026e-07, "loss": 0.26389687061309813, "memory(GiB)": 137.67, "step": 2310, "token_acc": 0.9032378371322547, "train_speed(iter/s)": 0.040552 }, { "epoch": 2.710772833723653, "grad_norm": 0.16184002161026, "learning_rate": 5.043069078351526e-07, "loss": 0.2583066463470459, "memory(GiB)": 137.67, "step": 2315, "token_acc": 0.9048499210110584, "train_speed(iter/s)": 0.040551 }, { "epoch": 2.7166276346604215, "grad_norm": 0.16953077912330627, "learning_rate": 4.842610328552999e-07, "loss": 0.26470949649810793, "memory(GiB)": 137.67, "step": 2320, "token_acc": 0.9023021945368386, "train_speed(iter/s)": 0.04055 }, { "epoch": 2.7224824355971897, "grad_norm": 0.16833004355430603, "learning_rate": 4.6461179791044806e-07, "loss": 0.26623120307922366, "memory(GiB)": 137.67, "step": 2325, "token_acc": 0.895680773698298, "train_speed(iter/s)": 0.04055 }, { "epoch": 2.728337236533958, "grad_norm": 0.1694810837507248, "learning_rate": 4.453600220303378e-07, "loss": 0.25267777442932127, "memory(GiB)": 137.67, "step": 2330, "token_acc": 0.8968080577917444, "train_speed(iter/s)": 0.04055 }, { "epoch": 2.7341920374707263, "grad_norm": 0.18032941222190857, "learning_rate": 4.2650650767761535e-07, "loss": 0.25408167839050294, "memory(GiB)": 137.67, "step": 2335, "token_acc": 0.9085095809749435, "train_speed(iter/s)": 0.040549 }, { "epoch": 2.740046838407494, "grad_norm": 0.18011276423931122, "learning_rate": 4.0805204071437953e-07, "loss": 0.27644264698028564, "memory(GiB)": 137.67, "step": 2340, "token_acc": 0.8965790537297598, "train_speed(iter/s)": 0.040547 }, { "epoch": 2.7459016393442623, "grad_norm": 0.16562311351299286, "learning_rate": 3.899973903694243e-07, "loss": 0.26986749172210694, "memory(GiB)": 137.67, "step": 2345, "token_acc": 0.9012060017454879, "train_speed(iter/s)": 0.040546 }, { "epoch": 2.7517564402810306, "grad_norm": 0.17436754703521729, "learning_rate": 3.72343309206179e-07, "loss": 0.26195201873779295, "memory(GiB)": 137.67, "step": 2350, "token_acc": 0.9009433222876742, "train_speed(iter/s)": 0.040545 }, { "epoch": 2.7576112412177984, "grad_norm": 0.1674078106880188, "learning_rate": 3.55090533091339e-07, "loss": 0.26260790824890134, "memory(GiB)": 137.67, "step": 2355, "token_acc": 0.9115999937809979, "train_speed(iter/s)": 0.040543 }, { "epoch": 2.7634660421545667, "grad_norm": 0.1657068282365799, "learning_rate": 3.382397811641858e-07, "loss": 0.25954129695892336, "memory(GiB)": 137.67, "step": 2360, "token_acc": 0.9021908567865544, "train_speed(iter/s)": 0.040543 }, { "epoch": 2.769320843091335, "grad_norm": 0.167274609208107, "learning_rate": 3.217917558066241e-07, "loss": 0.262769889831543, "memory(GiB)": 137.67, "step": 2365, "token_acc": 0.8952377080453587, "train_speed(iter/s)": 0.040542 }, { "epoch": 2.775175644028103, "grad_norm": 0.16418085992336273, "learning_rate": 3.057471426138958e-07, "loss": 0.2759857654571533, "memory(GiB)": 137.67, "step": 2370, "token_acc": 0.8904371253200432, "train_speed(iter/s)": 0.04054 }, { "epoch": 2.781030444964871, "grad_norm": 0.16312485933303833, "learning_rate": 2.901066103660033e-07, "loss": 0.26541569232940676, "memory(GiB)": 137.67, "step": 2375, "token_acc": 0.9018337335217314, "train_speed(iter/s)": 0.04054 }, { "epoch": 2.7868852459016393, "grad_norm": 0.17677490413188934, "learning_rate": 2.7487081099983435e-07, "loss": 0.27631726264953616, "memory(GiB)": 137.67, "step": 2380, "token_acc": 0.9002755878263168, "train_speed(iter/s)": 0.040539 }, { "epoch": 2.7927400468384076, "grad_norm": 0.1672162115573883, "learning_rate": 2.6004037958199167e-07, "loss": 0.26006388664245605, "memory(GiB)": 137.67, "step": 2385, "token_acc": 0.910639127168484, "train_speed(iter/s)": 0.040538 }, { "epoch": 2.798594847775176, "grad_norm": 0.1678304672241211, "learning_rate": 2.4561593428231165e-07, "loss": 0.26682395935058595, "memory(GiB)": 137.67, "step": 2390, "token_acc": 0.91889434727678, "train_speed(iter/s)": 0.040535 }, { "epoch": 2.8044496487119437, "grad_norm": 0.16077911853790283, "learning_rate": 2.3159807634811182e-07, "loss": 0.2570212364196777, "memory(GiB)": 137.67, "step": 2395, "token_acc": 0.9051587858378934, "train_speed(iter/s)": 0.040535 }, { "epoch": 2.810304449648712, "grad_norm": 0.16872599720954895, "learning_rate": 2.1798739007911517e-07, "loss": 0.27098655700683594, "memory(GiB)": 137.67, "step": 2400, "token_acc": 0.8959861646097005, "train_speed(iter/s)": 0.040533 }, { "epoch": 2.8161592505854802, "grad_norm": 0.16125863790512085, "learning_rate": 2.0478444280310206e-07, "loss": 0.26554141044616697, "memory(GiB)": 137.67, "step": 2405, "token_acc": 0.8993798050995196, "train_speed(iter/s)": 0.040533 }, { "epoch": 2.822014051522248, "grad_norm": 0.19162511825561523, "learning_rate": 1.919897848522656e-07, "loss": 0.26296229362487794, "memory(GiB)": 137.67, "step": 2410, "token_acc": 0.8993982865613145, "train_speed(iter/s)": 0.040532 }, { "epoch": 2.8278688524590163, "grad_norm": 0.20407338440418243, "learning_rate": 1.796039495402646e-07, "loss": 0.26827549934387207, "memory(GiB)": 137.67, "step": 2415, "token_acc": 0.9050311652650377, "train_speed(iter/s)": 0.04053 }, { "epoch": 2.8337236533957846, "grad_norm": 0.17013327777385712, "learning_rate": 1.6762745313999795e-07, "loss": 0.2727066516876221, "memory(GiB)": 137.67, "step": 2420, "token_acc": 0.8865242476220178, "train_speed(iter/s)": 0.040529 }, { "epoch": 2.839578454332553, "grad_norm": 0.1698453575372696, "learning_rate": 1.5606079486208846e-07, "loss": 0.2641671895980835, "memory(GiB)": 137.67, "step": 2425, "token_acc": 0.9000177898735047, "train_speed(iter/s)": 0.040529 }, { "epoch": 2.845433255269321, "grad_norm": 0.17142532765865326, "learning_rate": 1.449044568340663e-07, "loss": 0.2717731952667236, "memory(GiB)": 137.67, "step": 2430, "token_acc": 0.9031580860350494, "train_speed(iter/s)": 0.040528 }, { "epoch": 2.851288056206089, "grad_norm": 0.1803494244813919, "learning_rate": 1.3415890408027932e-07, "loss": 0.26016151905059814, "memory(GiB)": 137.67, "step": 2435, "token_acc": 0.9004292620366133, "train_speed(iter/s)": 0.040526 }, { "epoch": 2.857142857142857, "grad_norm": 0.17327673733234406, "learning_rate": 1.2382458450250657e-07, "loss": 0.2739871025085449, "memory(GiB)": 137.67, "step": 2440, "token_acc": 0.8937226907040563, "train_speed(iter/s)": 0.040526 }, { "epoch": 2.8629976580796255, "grad_norm": 0.1648455113172531, "learning_rate": 1.1390192886129304e-07, "loss": 0.26163692474365235, "memory(GiB)": 137.67, "step": 2445, "token_acc": 0.9109708459314515, "train_speed(iter/s)": 0.040525 }, { "epoch": 2.8688524590163933, "grad_norm": 0.17209313809871674, "learning_rate": 1.0439135075798634e-07, "loss": 0.2778321266174316, "memory(GiB)": 137.67, "step": 2450, "token_acc": 0.8971170667512587, "train_speed(iter/s)": 0.040525 }, { "epoch": 2.8747072599531616, "grad_norm": 0.16632598638534546, "learning_rate": 9.529324661750494e-08, "loss": 0.2714024305343628, "memory(GiB)": 137.67, "step": 2455, "token_acc": 0.8926179928835372, "train_speed(iter/s)": 0.040524 }, { "epoch": 2.88056206088993, "grad_norm": 0.17401184141635895, "learning_rate": 8.6607995671808e-08, "loss": 0.2663599967956543, "memory(GiB)": 137.67, "step": 2460, "token_acc": 0.8979368591641474, "train_speed(iter/s)": 0.040523 }, { "epoch": 2.8864168618266977, "grad_norm": 0.17087528109550476, "learning_rate": 7.833595994409248e-08, "loss": 0.2583767414093018, "memory(GiB)": 137.67, "step": 2465, "token_acc": 0.8988238974038161, "train_speed(iter/s)": 0.040522 }, { "epoch": 2.892271662763466, "grad_norm": 0.17502275109291077, "learning_rate": 7.047748423370193e-08, "loss": 0.27132668495178225, "memory(GiB)": 137.67, "step": 2470, "token_acc": 0.8950027089407572, "train_speed(iter/s)": 0.040522 }, { "epoch": 2.898126463700234, "grad_norm": 0.16457100212574005, "learning_rate": 6.303289610175233e-08, "loss": 0.262396240234375, "memory(GiB)": 137.67, "step": 2475, "token_acc": 0.9005705329153605, "train_speed(iter/s)": 0.040522 }, { "epoch": 2.9039812646370025, "grad_norm": 0.17186148464679718, "learning_rate": 5.6002505857480906e-08, "loss": 0.2651688098907471, "memory(GiB)": 137.67, "step": 2480, "token_acc": 0.903142540689707, "train_speed(iter/s)": 0.040521 }, { "epoch": 2.9098360655737707, "grad_norm": 0.16921843588352203, "learning_rate": 4.938660654530969e-08, "loss": 0.27781147956848146, "memory(GiB)": 137.67, "step": 2485, "token_acc": 0.8947337181986305, "train_speed(iter/s)": 0.040521 }, { "epoch": 2.9156908665105385, "grad_norm": 0.17168040573596954, "learning_rate": 4.318547393263317e-08, "loss": 0.27856767177581787, "memory(GiB)": 137.67, "step": 2490, "token_acc": 0.8994483098446597, "train_speed(iter/s)": 0.04052 }, { "epoch": 2.921545667447307, "grad_norm": 0.17257463932037354, "learning_rate": 3.739936649832188e-08, "loss": 0.26465725898742676, "memory(GiB)": 137.67, "step": 2495, "token_acc": 0.9003965374896801, "train_speed(iter/s)": 0.04052 }, { "epoch": 2.927400468384075, "grad_norm": 0.17007899284362793, "learning_rate": 3.2028525421946563e-08, "loss": 0.26408021450042723, "memory(GiB)": 137.67, "step": 2500, "token_acc": 0.9105243972950552, "train_speed(iter/s)": 0.04052 }, { "epoch": 2.933255269320843, "grad_norm": 0.16546528041362762, "learning_rate": 2.70731745737296e-08, "loss": 0.26817855834960935, "memory(GiB)": 137.67, "step": 2505, "token_acc": 0.9032225815017886, "train_speed(iter/s)": 0.040519 }, { "epoch": 2.939110070257611, "grad_norm": 0.1731211543083191, "learning_rate": 2.2533520505211294e-08, "loss": 0.26341302394866944, "memory(GiB)": 137.67, "step": 2510, "token_acc": 0.9048233016983017, "train_speed(iter/s)": 0.040519 }, { "epoch": 2.9449648711943794, "grad_norm": 0.16093143820762634, "learning_rate": 1.8409752440639027e-08, "loss": 0.25573346614837644, "memory(GiB)": 137.67, "step": 2515, "token_acc": 0.9019553343056392, "train_speed(iter/s)": 0.040518 }, { "epoch": 2.9508196721311473, "grad_norm": 0.16452209651470184, "learning_rate": 1.470204226908134e-08, "loss": 0.2707658767700195, "memory(GiB)": 137.67, "step": 2520, "token_acc": 0.904132819893002, "train_speed(iter/s)": 0.040517 }, { "epoch": 2.9566744730679155, "grad_norm": 0.1768556386232376, "learning_rate": 1.1410544537263645e-08, "loss": 0.27701735496520996, "memory(GiB)": 137.67, "step": 2525, "token_acc": 0.903024352910179, "train_speed(iter/s)": 0.040515 }, { "epoch": 2.962529274004684, "grad_norm": 0.16568534076213837, "learning_rate": 8.535396443124511e-09, "loss": 0.25813367366790774, "memory(GiB)": 137.67, "step": 2530, "token_acc": 0.9017673177727538, "train_speed(iter/s)": 0.040514 }, { "epoch": 2.968384074941452, "grad_norm": 0.16622532904148102, "learning_rate": 6.076717830098e-09, "loss": 0.260286283493042, "memory(GiB)": 137.67, "step": 2535, "token_acc": 0.9083364106929379, "train_speed(iter/s)": 0.040513 }, { "epoch": 2.9742388758782203, "grad_norm": 0.17745059728622437, "learning_rate": 4.034611182121007e-09, "loss": 0.26159353256225587, "memory(GiB)": 137.67, "step": 2540, "token_acc": 0.9072020079994492, "train_speed(iter/s)": 0.040512 }, { "epoch": 2.980093676814988, "grad_norm": 0.16991080343723297, "learning_rate": 2.40916161935445e-09, "loss": 0.26626038551330566, "memory(GiB)": 137.67, "step": 2545, "token_acc": 0.8986437875498561, "train_speed(iter/s)": 0.040511 }, { "epoch": 2.9859484777517564, "grad_norm": 0.16490155458450317, "learning_rate": 1.2004368946427758e-09, "loss": 0.2636513948440552, "memory(GiB)": 137.67, "step": 2550, "token_acc": 0.9014935708777286, "train_speed(iter/s)": 0.040511 }, { "epoch": 2.9918032786885247, "grad_norm": 0.1677451878786087, "learning_rate": 4.084873906851083e-10, "loss": 0.26745948791503904, "memory(GiB)": 137.67, "step": 2555, "token_acc": 0.9085500921651726, "train_speed(iter/s)": 0.04051 }, { "epoch": 2.9976580796252925, "grad_norm": 0.1645430028438568, "learning_rate": 3.334611793692766e-11, "loss": 0.26831555366516113, "memory(GiB)": 137.67, "step": 2560, "token_acc": 0.9117214925099609, "train_speed(iter/s)": 0.040508 } ], "logging_steps": 5, "max_steps": 2562, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3414295945805824.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }