{ "best_global_step": 10596, "best_metric": 0.23324698, "best_model_checkpoint": "/data/oss_bucket_0/xwt/output/citywalker/4d815960480fc88bcc76f00e7fcc7bace26a4251-1-ep/v0-20251014-003550/checkpoint-10596", "epoch": 1.0, "eval_steps": 1111111111, "global_step": 10596, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.437523593808984e-05, "grad_norm": 84.30111694335938, "learning_rate": 3.773584905660378e-07, "loss": 3.5155749320983887, "memory(GiB)": 34.14, "step": 1, "token_acc": 0.610738255033557, "train_speed(iter/s)": 0.00942 }, { "epoch": 0.00047187617969044924, "grad_norm": 106.11502075195312, "learning_rate": 1.8867924528301887e-06, "loss": 3.807619571685791, "memory(GiB)": 63.23, "step": 5, "token_acc": 0.6100302637267618, "train_speed(iter/s)": 0.03435 }, { "epoch": 0.0009437523593808985, "grad_norm": 116.6819839477539, "learning_rate": 3.7735849056603773e-06, "loss": 3.0834783554077148, "memory(GiB)": 73.6, "step": 10, "token_acc": 0.6099345398536773, "train_speed(iter/s)": 0.05377 }, { "epoch": 0.0014156285390713476, "grad_norm": 48.40726089477539, "learning_rate": 5.660377358490566e-06, "loss": 2.1324913024902346, "memory(GiB)": 73.6, "step": 15, "token_acc": 0.6634066829665851, "train_speed(iter/s)": 0.066562 }, { "epoch": 0.001887504718761797, "grad_norm": 42.45460891723633, "learning_rate": 7.547169811320755e-06, "loss": 1.2360769271850587, "memory(GiB)": 73.6, "step": 20, "token_acc": 0.8050813815005955, "train_speed(iter/s)": 0.076262 }, { "epoch": 0.002359380898452246, "grad_norm": 70.99395751953125, "learning_rate": 9.433962264150944e-06, "loss": 1.01270751953125, "memory(GiB)": 91.64, "step": 25, "token_acc": 0.8459764814387826, "train_speed(iter/s)": 0.083391 }, { "epoch": 0.0028312570781426952, "grad_norm": 19.72198486328125, "learning_rate": 1.1320754716981132e-05, "loss": 0.9193931579589844, "memory(GiB)": 91.64, "step": 30, "token_acc": 0.8607944732297064, "train_speed(iter/s)": 0.089177 }, { "epoch": 0.003303133257833145, "grad_norm": 20.32855987548828, "learning_rate": 1.320754716981132e-05, "loss": 0.7122109413146973, "memory(GiB)": 91.64, "step": 35, "token_acc": 0.8821861304459913, "train_speed(iter/s)": 0.093733 }, { "epoch": 0.003775009437523594, "grad_norm": 21.04654312133789, "learning_rate": 1.509433962264151e-05, "loss": 0.6282638549804688, "memory(GiB)": 91.64, "step": 40, "token_acc": 0.8689788053949904, "train_speed(iter/s)": 0.097197 }, { "epoch": 0.004246885617214043, "grad_norm": 9.78100872039795, "learning_rate": 1.69811320754717e-05, "loss": 0.4371612548828125, "memory(GiB)": 91.64, "step": 45, "token_acc": 0.9013560223344855, "train_speed(iter/s)": 0.100185 }, { "epoch": 0.004718761796904492, "grad_norm": 9.462187767028809, "learning_rate": 1.8867924528301888e-05, "loss": 0.3639880657196045, "memory(GiB)": 91.64, "step": 50, "token_acc": 0.9021005251312828, "train_speed(iter/s)": 0.102924 }, { "epoch": 0.005190637976594942, "grad_norm": 2.678065061569214, "learning_rate": 2.0754716981132076e-05, "loss": 0.33274307250976565, "memory(GiB)": 91.64, "step": 55, "token_acc": 0.8931506849315068, "train_speed(iter/s)": 0.105295 }, { "epoch": 0.0056625141562853904, "grad_norm": 2.3717150688171387, "learning_rate": 2.2641509433962265e-05, "loss": 0.35691156387329104, "memory(GiB)": 91.64, "step": 60, "token_acc": 0.8906422018348624, "train_speed(iter/s)": 0.107475 }, { "epoch": 0.00613439033597584, "grad_norm": 22.78520965576172, "learning_rate": 2.4528301886792453e-05, "loss": 0.4278131008148193, "memory(GiB)": 91.64, "step": 65, "token_acc": 0.871261378413524, "train_speed(iter/s)": 0.109376 }, { "epoch": 0.00660626651566629, "grad_norm": 2.964755058288574, "learning_rate": 2.641509433962264e-05, "loss": 0.39853944778442385, "memory(GiB)": 91.64, "step": 70, "token_acc": 0.877030162412993, "train_speed(iter/s)": 0.111001 }, { "epoch": 0.007078142695356738, "grad_norm": 2.3992581367492676, "learning_rate": 2.830188679245283e-05, "loss": 0.3416656494140625, "memory(GiB)": 91.64, "step": 75, "token_acc": 0.8882584024443474, "train_speed(iter/s)": 0.112467 }, { "epoch": 0.007550018875047188, "grad_norm": 2.020433187484741, "learning_rate": 3.018867924528302e-05, "loss": 0.35348923206329347, "memory(GiB)": 91.64, "step": 80, "token_acc": 0.888859132297804, "train_speed(iter/s)": 0.113855 }, { "epoch": 0.008021895054737637, "grad_norm": 3.7929301261901855, "learning_rate": 3.207547169811321e-05, "loss": 0.34899582862854006, "memory(GiB)": 91.64, "step": 85, "token_acc": 0.8849840255591054, "train_speed(iter/s)": 0.114958 }, { "epoch": 0.008493771234428085, "grad_norm": 1.895822525024414, "learning_rate": 3.39622641509434e-05, "loss": 0.38731689453125, "memory(GiB)": 91.64, "step": 90, "token_acc": 0.8807424593967518, "train_speed(iter/s)": 0.116143 }, { "epoch": 0.008965647414118535, "grad_norm": 2.664459705352783, "learning_rate": 3.5849056603773584e-05, "loss": 0.36236350536346434, "memory(GiB)": 91.64, "step": 95, "token_acc": 0.8755798090040928, "train_speed(iter/s)": 0.117219 }, { "epoch": 0.009437523593808984, "grad_norm": 6.293849468231201, "learning_rate": 3.7735849056603776e-05, "loss": 0.4948585033416748, "memory(GiB)": 91.64, "step": 100, "token_acc": 0.8815642458100559, "train_speed(iter/s)": 0.118142 }, { "epoch": 0.009909399773499434, "grad_norm": 1.9665182828903198, "learning_rate": 3.962264150943397e-05, "loss": 0.41594877243041994, "memory(GiB)": 91.64, "step": 105, "token_acc": 0.8591625883632409, "train_speed(iter/s)": 0.118993 }, { "epoch": 0.010381275953189883, "grad_norm": 4.004781723022461, "learning_rate": 4.150943396226415e-05, "loss": 0.35399832725524905, "memory(GiB)": 91.64, "step": 110, "token_acc": 0.8827626233313988, "train_speed(iter/s)": 0.119709 }, { "epoch": 0.010853152132880333, "grad_norm": 1.3648808002471924, "learning_rate": 4.3396226415094345e-05, "loss": 0.36987504959106443, "memory(GiB)": 91.64, "step": 115, "token_acc": 0.8744955609362389, "train_speed(iter/s)": 0.120437 }, { "epoch": 0.011325028312570781, "grad_norm": 1.6580424308776855, "learning_rate": 4.528301886792453e-05, "loss": 0.3571889877319336, "memory(GiB)": 91.64, "step": 120, "token_acc": 0.8779239766081871, "train_speed(iter/s)": 0.121126 }, { "epoch": 0.01179690449226123, "grad_norm": 3.6433212757110596, "learning_rate": 4.716981132075472e-05, "loss": 0.3812561511993408, "memory(GiB)": 91.64, "step": 125, "token_acc": 0.8725761772853186, "train_speed(iter/s)": 0.121769 }, { "epoch": 0.01226878067195168, "grad_norm": 4.834733009338379, "learning_rate": 4.9056603773584906e-05, "loss": 0.37772574424743655, "memory(GiB)": 91.64, "step": 130, "token_acc": 0.8891786179921773, "train_speed(iter/s)": 0.122353 }, { "epoch": 0.01274065685164213, "grad_norm": 1.4346174001693726, "learning_rate": 5.09433962264151e-05, "loss": 0.3767831802368164, "memory(GiB)": 91.64, "step": 135, "token_acc": 0.8672922252010724, "train_speed(iter/s)": 0.122876 }, { "epoch": 0.01321253303133258, "grad_norm": 1.908555269241333, "learning_rate": 5.283018867924528e-05, "loss": 0.3630094051361084, "memory(GiB)": 91.64, "step": 140, "token_acc": 0.8912050256996003, "train_speed(iter/s)": 0.123352 }, { "epoch": 0.013684409211023027, "grad_norm": 1.579922080039978, "learning_rate": 5.4716981132075475e-05, "loss": 0.38754441738128664, "memory(GiB)": 91.64, "step": 145, "token_acc": 0.8529411764705882, "train_speed(iter/s)": 0.12383 }, { "epoch": 0.014156285390713477, "grad_norm": 2.1948578357696533, "learning_rate": 5.660377358490566e-05, "loss": 0.38694057464599607, "memory(GiB)": 91.64, "step": 150, "token_acc": 0.8721523291397484, "train_speed(iter/s)": 0.124241 }, { "epoch": 0.014628161570403926, "grad_norm": 2.0328688621520996, "learning_rate": 5.849056603773585e-05, "loss": 0.3814719200134277, "memory(GiB)": 91.64, "step": 155, "token_acc": 0.8973154362416107, "train_speed(iter/s)": 0.1247 }, { "epoch": 0.015100037750094376, "grad_norm": 1.0338304042816162, "learning_rate": 6.037735849056604e-05, "loss": 0.46609911918640134, "memory(GiB)": 91.64, "step": 160, "token_acc": 0.8663366336633663, "train_speed(iter/s)": 0.12509 }, { "epoch": 0.015571913929784825, "grad_norm": 2.470008134841919, "learning_rate": 6.226415094339622e-05, "loss": 0.37815041542053224, "memory(GiB)": 91.64, "step": 165, "token_acc": 0.8509316770186336, "train_speed(iter/s)": 0.125425 }, { "epoch": 0.016043790109475275, "grad_norm": 5.2904839515686035, "learning_rate": 6.415094339622641e-05, "loss": 0.40090017318725585, "memory(GiB)": 91.64, "step": 170, "token_acc": 0.8588604286461056, "train_speed(iter/s)": 0.125806 }, { "epoch": 0.016515666289165724, "grad_norm": 2.9606592655181885, "learning_rate": 6.60377358490566e-05, "loss": 0.39040379524230956, "memory(GiB)": 91.64, "step": 175, "token_acc": 0.8624469589816125, "train_speed(iter/s)": 0.126167 }, { "epoch": 0.01698754246885617, "grad_norm": 2.9512269496917725, "learning_rate": 6.79245283018868e-05, "loss": 0.40850982666015623, "memory(GiB)": 91.64, "step": 180, "token_acc": 0.8886337543053962, "train_speed(iter/s)": 0.126417 }, { "epoch": 0.01745941864854662, "grad_norm": 3.298267126083374, "learning_rate": 6.981132075471698e-05, "loss": 0.37546262741088865, "memory(GiB)": 91.64, "step": 185, "token_acc": 0.8762331838565023, "train_speed(iter/s)": 0.126711 }, { "epoch": 0.01793129482823707, "grad_norm": 11.315258026123047, "learning_rate": 7.169811320754717e-05, "loss": 0.3741611480712891, "memory(GiB)": 91.64, "step": 190, "token_acc": 0.8901128425577647, "train_speed(iter/s)": 0.127037 }, { "epoch": 0.01840317100792752, "grad_norm": 1.4289817810058594, "learning_rate": 7.358490566037736e-05, "loss": 0.4472553253173828, "memory(GiB)": 91.64, "step": 195, "token_acc": 0.8647993536224078, "train_speed(iter/s)": 0.127361 }, { "epoch": 0.01887504718761797, "grad_norm": 21.130821228027344, "learning_rate": 7.547169811320755e-05, "loss": 0.43447041511535645, "memory(GiB)": 91.64, "step": 200, "token_acc": 0.8622900763358778, "train_speed(iter/s)": 0.127616 }, { "epoch": 0.01934692336730842, "grad_norm": 2.252457857131958, "learning_rate": 7.735849056603774e-05, "loss": 0.4470320701599121, "memory(GiB)": 91.64, "step": 205, "token_acc": 0.8723309608540926, "train_speed(iter/s)": 0.127869 }, { "epoch": 0.019818799546998868, "grad_norm": 2.8665921688079834, "learning_rate": 7.924528301886794e-05, "loss": 0.3971832275390625, "memory(GiB)": 91.64, "step": 210, "token_acc": 0.8759057971014492, "train_speed(iter/s)": 0.128077 }, { "epoch": 0.020290675726689317, "grad_norm": 3.2112598419189453, "learning_rate": 8.113207547169813e-05, "loss": 0.3845433235168457, "memory(GiB)": 91.64, "step": 215, "token_acc": 0.8769537745261058, "train_speed(iter/s)": 0.12831 }, { "epoch": 0.020762551906379767, "grad_norm": 1.5424221754074097, "learning_rate": 8.30188679245283e-05, "loss": 0.4125929832458496, "memory(GiB)": 91.64, "step": 220, "token_acc": 0.8703056768558952, "train_speed(iter/s)": 0.128563 }, { "epoch": 0.021234428086070217, "grad_norm": 1.9207730293273926, "learning_rate": 8.49056603773585e-05, "loss": 0.3798954010009766, "memory(GiB)": 91.64, "step": 225, "token_acc": 0.8849710982658959, "train_speed(iter/s)": 0.128725 }, { "epoch": 0.021706304265760666, "grad_norm": 3.5698459148406982, "learning_rate": 8.679245283018869e-05, "loss": 0.4055330753326416, "memory(GiB)": 91.64, "step": 230, "token_acc": 0.8681796233703525, "train_speed(iter/s)": 0.128949 }, { "epoch": 0.022178180445451112, "grad_norm": 4.7955732345581055, "learning_rate": 8.867924528301888e-05, "loss": 0.3989933967590332, "memory(GiB)": 91.64, "step": 235, "token_acc": 0.8699075659020883, "train_speed(iter/s)": 0.129146 }, { "epoch": 0.022650056625141562, "grad_norm": 1.4037288427352905, "learning_rate": 9.056603773584906e-05, "loss": 0.40497756004333496, "memory(GiB)": 91.64, "step": 240, "token_acc": 0.8680724520140578, "train_speed(iter/s)": 0.129334 }, { "epoch": 0.02312193280483201, "grad_norm": 3.678190231323242, "learning_rate": 9.245283018867925e-05, "loss": 0.41619534492492677, "memory(GiB)": 91.64, "step": 245, "token_acc": 0.8728121353558926, "train_speed(iter/s)": 0.129504 }, { "epoch": 0.02359380898452246, "grad_norm": 2.952890634536743, "learning_rate": 9.433962264150944e-05, "loss": 0.43350043296813967, "memory(GiB)": 91.64, "step": 250, "token_acc": 0.8761645962732919, "train_speed(iter/s)": 0.129679 }, { "epoch": 0.02406568516421291, "grad_norm": 1.438887357711792, "learning_rate": 9.622641509433963e-05, "loss": 0.44051084518432615, "memory(GiB)": 91.64, "step": 255, "token_acc": 0.8877816291161178, "train_speed(iter/s)": 0.129866 }, { "epoch": 0.02453756134390336, "grad_norm": 5.033264636993408, "learning_rate": 9.811320754716981e-05, "loss": 0.520989179611206, "memory(GiB)": 91.64, "step": 260, "token_acc": 0.8630212648439658, "train_speed(iter/s)": 0.129983 }, { "epoch": 0.02500943752359381, "grad_norm": 1.3177125453948975, "learning_rate": 0.0001, "loss": 0.4122615337371826, "memory(GiB)": 91.64, "step": 265, "token_acc": 0.8586235489220564, "train_speed(iter/s)": 0.130162 }, { "epoch": 0.02548131370328426, "grad_norm": 1.7470556497573853, "learning_rate": 0.0001018867924528302, "loss": 0.3909600734710693, "memory(GiB)": 91.64, "step": 270, "token_acc": 0.862627197039778, "train_speed(iter/s)": 0.130349 }, { "epoch": 0.02595318988297471, "grad_norm": 1.922580361366272, "learning_rate": 0.00010377358490566037, "loss": 0.40008974075317383, "memory(GiB)": 91.64, "step": 275, "token_acc": 0.8686131386861314, "train_speed(iter/s)": 0.130529 }, { "epoch": 0.02642506606266516, "grad_norm": 1.2326915264129639, "learning_rate": 0.00010566037735849057, "loss": 0.406398344039917, "memory(GiB)": 91.64, "step": 280, "token_acc": 0.8661087866108786, "train_speed(iter/s)": 0.130651 }, { "epoch": 0.026896942242355604, "grad_norm": 1.0071676969528198, "learning_rate": 0.00010754716981132076, "loss": 0.42357187271118163, "memory(GiB)": 91.64, "step": 285, "token_acc": 0.868785399622404, "train_speed(iter/s)": 0.130776 }, { "epoch": 0.027368818422046054, "grad_norm": 1.469139814376831, "learning_rate": 0.00010943396226415095, "loss": 0.40146756172180176, "memory(GiB)": 91.64, "step": 290, "token_acc": 0.8522532800912721, "train_speed(iter/s)": 0.130922 }, { "epoch": 0.027840694601736504, "grad_norm": 1.8155490159988403, "learning_rate": 0.00011132075471698113, "loss": 0.40570869445800783, "memory(GiB)": 91.64, "step": 295, "token_acc": 0.8601811736904293, "train_speed(iter/s)": 0.131047 }, { "epoch": 0.028312570781426953, "grad_norm": 0.9587322473526001, "learning_rate": 0.00011320754716981132, "loss": 0.4192817687988281, "memory(GiB)": 91.64, "step": 300, "token_acc": 0.8656987295825771, "train_speed(iter/s)": 0.131187 }, { "epoch": 0.028784446961117403, "grad_norm": 1.1372607946395874, "learning_rate": 0.00011509433962264151, "loss": 0.40810341835021974, "memory(GiB)": 91.64, "step": 305, "token_acc": 0.8578063594140765, "train_speed(iter/s)": 0.131329 }, { "epoch": 0.029256323140807852, "grad_norm": 0.9617637991905212, "learning_rate": 0.0001169811320754717, "loss": 0.41143293380737306, "memory(GiB)": 91.64, "step": 310, "token_acc": 0.8715596330275229, "train_speed(iter/s)": 0.131434 }, { "epoch": 0.029728199320498302, "grad_norm": 3.1000680923461914, "learning_rate": 0.00011886792452830188, "loss": 0.42690815925598147, "memory(GiB)": 91.64, "step": 315, "token_acc": 0.8557730723132241, "train_speed(iter/s)": 0.131554 }, { "epoch": 0.03020007550018875, "grad_norm": 1.8936156034469604, "learning_rate": 0.00012075471698113207, "loss": 0.4507251739501953, "memory(GiB)": 91.64, "step": 320, "token_acc": 0.8282674772036475, "train_speed(iter/s)": 0.131656 }, { "epoch": 0.0306719516798792, "grad_norm": 2.192671298980713, "learning_rate": 0.00012264150943396227, "loss": 0.42845516204833983, "memory(GiB)": 91.64, "step": 325, "token_acc": 0.8745208280092001, "train_speed(iter/s)": 0.131757 }, { "epoch": 0.03114382785956965, "grad_norm": 1.4483352899551392, "learning_rate": 0.00012452830188679244, "loss": 0.41372270584106446, "memory(GiB)": 91.64, "step": 330, "token_acc": 0.8579447322970639, "train_speed(iter/s)": 0.131828 }, { "epoch": 0.0316157040392601, "grad_norm": 0.6700084805488586, "learning_rate": 0.00012641509433962265, "loss": 0.3887781620025635, "memory(GiB)": 91.64, "step": 335, "token_acc": 0.881233595800525, "train_speed(iter/s)": 0.131926 }, { "epoch": 0.03208758021895055, "grad_norm": 0.9793855547904968, "learning_rate": 0.00012830188679245283, "loss": 0.3957367897033691, "memory(GiB)": 91.64, "step": 340, "token_acc": 0.8718430951101558, "train_speed(iter/s)": 0.132006 }, { "epoch": 0.032559456398640996, "grad_norm": 1.0212165117263794, "learning_rate": 0.000130188679245283, "loss": 0.401882266998291, "memory(GiB)": 91.64, "step": 345, "token_acc": 0.8426698450536353, "train_speed(iter/s)": 0.132057 }, { "epoch": 0.03303133257833145, "grad_norm": 1.0362790822982788, "learning_rate": 0.0001320754716981132, "loss": 0.40561609268188475, "memory(GiB)": 91.64, "step": 350, "token_acc": 0.8569958847736625, "train_speed(iter/s)": 0.132164 }, { "epoch": 0.033503208758021895, "grad_norm": 0.9072962999343872, "learning_rate": 0.0001339622641509434, "loss": 0.409895133972168, "memory(GiB)": 91.64, "step": 355, "token_acc": 0.8754699248120301, "train_speed(iter/s)": 0.132257 }, { "epoch": 0.03397508493771234, "grad_norm": 1.0432530641555786, "learning_rate": 0.0001358490566037736, "loss": 0.4178511619567871, "memory(GiB)": 91.64, "step": 360, "token_acc": 0.8715596330275229, "train_speed(iter/s)": 0.132366 }, { "epoch": 0.034446961117402794, "grad_norm": 1.1534943580627441, "learning_rate": 0.00013773584905660377, "loss": 0.41347360610961914, "memory(GiB)": 91.64, "step": 365, "token_acc": 0.8692509855453351, "train_speed(iter/s)": 0.132474 }, { "epoch": 0.03491883729709324, "grad_norm": 1.7700964212417603, "learning_rate": 0.00013962264150943395, "loss": 0.417569637298584, "memory(GiB)": 91.64, "step": 370, "token_acc": 0.8729519977004886, "train_speed(iter/s)": 0.132563 }, { "epoch": 0.03539071347678369, "grad_norm": 1.860262393951416, "learning_rate": 0.00014150943396226416, "loss": 0.4185757637023926, "memory(GiB)": 91.64, "step": 375, "token_acc": 0.8906384196965359, "train_speed(iter/s)": 0.132637 }, { "epoch": 0.03586258965647414, "grad_norm": 1.596077561378479, "learning_rate": 0.00014339622641509434, "loss": 0.411014461517334, "memory(GiB)": 91.64, "step": 380, "token_acc": 0.8643669149353195, "train_speed(iter/s)": 0.132728 }, { "epoch": 0.03633446583616459, "grad_norm": 0.9201560020446777, "learning_rate": 0.00014528301886792451, "loss": 0.42378597259521483, "memory(GiB)": 91.64, "step": 385, "token_acc": 0.8777292576419214, "train_speed(iter/s)": 0.132789 }, { "epoch": 0.03680634201585504, "grad_norm": 0.9091364741325378, "learning_rate": 0.00014716981132075472, "loss": 0.41143045425415037, "memory(GiB)": 91.64, "step": 390, "token_acc": 0.8801906058543226, "train_speed(iter/s)": 0.132888 }, { "epoch": 0.03727821819554549, "grad_norm": 1.1616473197937012, "learning_rate": 0.0001490566037735849, "loss": 0.4186095237731934, "memory(GiB)": 91.64, "step": 395, "token_acc": 0.8518700787401575, "train_speed(iter/s)": 0.132959 }, { "epoch": 0.03775009437523594, "grad_norm": 0.7037743330001831, "learning_rate": 0.0001509433962264151, "loss": 0.4113043785095215, "memory(GiB)": 91.64, "step": 400, "token_acc": 0.8516409912926992, "train_speed(iter/s)": 0.133015 }, { "epoch": 0.03822197055492639, "grad_norm": 0.7240108847618103, "learning_rate": 0.0001528301886792453, "loss": 0.4177090167999268, "memory(GiB)": 91.64, "step": 405, "token_acc": 0.8457389428263214, "train_speed(iter/s)": 0.133099 }, { "epoch": 0.03869384673461684, "grad_norm": 1.0540796518325806, "learning_rate": 0.0001547169811320755, "loss": 0.42041950225830077, "memory(GiB)": 91.64, "step": 410, "token_acc": 0.8516787080322992, "train_speed(iter/s)": 0.133188 }, { "epoch": 0.03916572291430728, "grad_norm": 0.980476975440979, "learning_rate": 0.00015660377358490567, "loss": 0.4235046863555908, "memory(GiB)": 91.64, "step": 415, "token_acc": 0.8591037545417844, "train_speed(iter/s)": 0.133243 }, { "epoch": 0.039637599093997736, "grad_norm": 0.9957146644592285, "learning_rate": 0.00015849056603773587, "loss": 0.426760196685791, "memory(GiB)": 91.64, "step": 420, "token_acc": 0.837030191004313, "train_speed(iter/s)": 0.13331 }, { "epoch": 0.04010947527368818, "grad_norm": 1.0860295295715332, "learning_rate": 0.00016037735849056605, "loss": 0.42039642333984373, "memory(GiB)": 91.64, "step": 425, "token_acc": 0.8553571428571428, "train_speed(iter/s)": 0.13337 }, { "epoch": 0.040581351453378635, "grad_norm": 0.6787883043289185, "learning_rate": 0.00016226415094339625, "loss": 0.4263429641723633, "memory(GiB)": 91.64, "step": 430, "token_acc": 0.8620312072269367, "train_speed(iter/s)": 0.133447 }, { "epoch": 0.04105322763306908, "grad_norm": 0.7591102123260498, "learning_rate": 0.00016415094339622643, "loss": 0.41852893829345705, "memory(GiB)": 91.64, "step": 435, "token_acc": 0.8613795401532822, "train_speed(iter/s)": 0.133505 }, { "epoch": 0.041525103812759534, "grad_norm": 0.6089534759521484, "learning_rate": 0.0001660377358490566, "loss": 0.42598543167114256, "memory(GiB)": 91.64, "step": 440, "token_acc": 0.8578378378378378, "train_speed(iter/s)": 0.133578 }, { "epoch": 0.04199697999244998, "grad_norm": 1.8172188997268677, "learning_rate": 0.00016792452830188682, "loss": 0.44203596115112304, "memory(GiB)": 91.64, "step": 445, "token_acc": 0.8689731321310269, "train_speed(iter/s)": 0.133639 }, { "epoch": 0.04246885617214043, "grad_norm": 1.0775083303451538, "learning_rate": 0.000169811320754717, "loss": 0.43241491317749026, "memory(GiB)": 91.64, "step": 450, "token_acc": 0.8645731108930323, "train_speed(iter/s)": 0.133695 }, { "epoch": 0.04294073235183088, "grad_norm": 1.2232269048690796, "learning_rate": 0.00017169811320754717, "loss": 0.42380399703979493, "memory(GiB)": 91.64, "step": 455, "token_acc": 0.8739164696611506, "train_speed(iter/s)": 0.133754 }, { "epoch": 0.04341260853152133, "grad_norm": 0.6929795145988464, "learning_rate": 0.00017358490566037738, "loss": 0.425693416595459, "memory(GiB)": 91.64, "step": 460, "token_acc": 0.8682867557715674, "train_speed(iter/s)": 0.13382 }, { "epoch": 0.04388448471121178, "grad_norm": 0.7897017002105713, "learning_rate": 0.00017547169811320756, "loss": 0.41992640495300293, "memory(GiB)": 91.64, "step": 465, "token_acc": 0.8354898336414048, "train_speed(iter/s)": 0.133876 }, { "epoch": 0.044356360890902224, "grad_norm": 0.5486307740211487, "learning_rate": 0.00017735849056603776, "loss": 0.42015676498413085, "memory(GiB)": 91.64, "step": 470, "token_acc": 0.8480052753049786, "train_speed(iter/s)": 0.133918 }, { "epoch": 0.04482823707059268, "grad_norm": 1.3324936628341675, "learning_rate": 0.00017924528301886794, "loss": 0.417246150970459, "memory(GiB)": 91.64, "step": 475, "token_acc": 0.8778747026169706, "train_speed(iter/s)": 0.133961 }, { "epoch": 0.045300113250283124, "grad_norm": 0.9353549480438232, "learning_rate": 0.00018113207547169812, "loss": 0.4723100185394287, "memory(GiB)": 91.64, "step": 480, "token_acc": 0.8507697141061892, "train_speed(iter/s)": 0.134019 }, { "epoch": 0.04577198942997358, "grad_norm": 1.1144438982009888, "learning_rate": 0.00018301886792452832, "loss": 0.45328826904296876, "memory(GiB)": 91.64, "step": 485, "token_acc": 0.8691880638445524, "train_speed(iter/s)": 0.134049 }, { "epoch": 0.04624386560966402, "grad_norm": 1.5882467031478882, "learning_rate": 0.0001849056603773585, "loss": 0.42916183471679686, "memory(GiB)": 91.64, "step": 490, "token_acc": 0.8745184369840396, "train_speed(iter/s)": 0.134102 }, { "epoch": 0.046715741789354476, "grad_norm": 1.1314642429351807, "learning_rate": 0.00018679245283018868, "loss": 0.41251192092895506, "memory(GiB)": 91.64, "step": 495, "token_acc": 0.8797017960013555, "train_speed(iter/s)": 0.13416 }, { "epoch": 0.04718761796904492, "grad_norm": 0.6526018381118774, "learning_rate": 0.00018867924528301889, "loss": 0.42115144729614257, "memory(GiB)": 91.64, "step": 500, "token_acc": 0.8817605371130175, "train_speed(iter/s)": 0.134197 }, { "epoch": 0.047659494148735375, "grad_norm": 0.7289260029792786, "learning_rate": 0.00019056603773584906, "loss": 0.4194014549255371, "memory(GiB)": 91.64, "step": 505, "token_acc": 0.8838260869565218, "train_speed(iter/s)": 0.134245 }, { "epoch": 0.04813137032842582, "grad_norm": 1.5535837411880493, "learning_rate": 0.00019245283018867927, "loss": 0.42951335906982424, "memory(GiB)": 91.64, "step": 510, "token_acc": 0.8675577156743621, "train_speed(iter/s)": 0.134314 }, { "epoch": 0.04860324650811627, "grad_norm": 0.6319753527641296, "learning_rate": 0.00019433962264150945, "loss": 0.42447052001953123, "memory(GiB)": 91.64, "step": 515, "token_acc": 0.8672985781990521, "train_speed(iter/s)": 0.134352 }, { "epoch": 0.04907512268780672, "grad_norm": 0.7045039534568787, "learning_rate": 0.00019622641509433963, "loss": 0.43744239807128904, "memory(GiB)": 91.64, "step": 520, "token_acc": 0.8579258010118044, "train_speed(iter/s)": 0.134408 }, { "epoch": 0.049546998867497166, "grad_norm": 0.7517572045326233, "learning_rate": 0.00019811320754716983, "loss": 0.4918349266052246, "memory(GiB)": 91.64, "step": 525, "token_acc": 0.8434403487911217, "train_speed(iter/s)": 0.134457 }, { "epoch": 0.05001887504718762, "grad_norm": 1.5722893476486206, "learning_rate": 0.0002, "loss": 0.44678421020507814, "memory(GiB)": 91.64, "step": 530, "token_acc": 0.8664154103852596, "train_speed(iter/s)": 0.134489 }, { "epoch": 0.050490751226878065, "grad_norm": 1.0157049894332886, "learning_rate": 0.00019999987824247315, "loss": 0.42899537086486816, "memory(GiB)": 91.64, "step": 535, "token_acc": 0.8645294725956567, "train_speed(iter/s)": 0.134533 }, { "epoch": 0.05096262740656852, "grad_norm": 0.9441537261009216, "learning_rate": 0.00019999951297018905, "loss": 0.41001176834106445, "memory(GiB)": 91.64, "step": 540, "token_acc": 0.8450834879406308, "train_speed(iter/s)": 0.134584 }, { "epoch": 0.051434503586258964, "grad_norm": 0.6020461320877075, "learning_rate": 0.0001999989041840372, "loss": 0.41864852905273436, "memory(GiB)": 91.64, "step": 545, "token_acc": 0.8733031674208145, "train_speed(iter/s)": 0.134636 }, { "epoch": 0.05190637976594942, "grad_norm": 0.7992700338363647, "learning_rate": 0.0001999980518855001, "loss": 0.4274590969085693, "memory(GiB)": 91.64, "step": 550, "token_acc": 0.8683141503046716, "train_speed(iter/s)": 0.134693 }, { "epoch": 0.052378255945639864, "grad_norm": 0.6813804507255554, "learning_rate": 0.00019999695607665326, "loss": 0.4231581687927246, "memory(GiB)": 91.64, "step": 555, "token_acc": 0.8547925608011445, "train_speed(iter/s)": 0.134736 }, { "epoch": 0.05285013212533032, "grad_norm": 1.6035652160644531, "learning_rate": 0.00019999561676016506, "loss": 0.4291111946105957, "memory(GiB)": 91.64, "step": 560, "token_acc": 0.8530872959545777, "train_speed(iter/s)": 0.134778 }, { "epoch": 0.05332200830502076, "grad_norm": 1.459104299545288, "learning_rate": 0.00019999403393929695, "loss": 0.438527774810791, "memory(GiB)": 91.64, "step": 565, "token_acc": 0.8590631364562118, "train_speed(iter/s)": 0.134828 }, { "epoch": 0.05379388448471121, "grad_norm": 1.4410645961761475, "learning_rate": 0.0001999922076179034, "loss": 0.4256152153015137, "memory(GiB)": 91.64, "step": 570, "token_acc": 0.8751229105211407, "train_speed(iter/s)": 0.134867 }, { "epoch": 0.05426576066440166, "grad_norm": 1.2330466508865356, "learning_rate": 0.00019999013780043175, "loss": 0.4717870712280273, "memory(GiB)": 91.64, "step": 575, "token_acc": 0.8500267809319765, "train_speed(iter/s)": 0.134918 }, { "epoch": 0.05473763684409211, "grad_norm": 0.652735710144043, "learning_rate": 0.0001999878244919223, "loss": 0.4285730361938477, "memory(GiB)": 91.64, "step": 580, "token_acc": 0.8572246065808298, "train_speed(iter/s)": 0.134957 }, { "epoch": 0.05520951302378256, "grad_norm": 1.1914304494857788, "learning_rate": 0.0001999852676980083, "loss": 0.4361170768737793, "memory(GiB)": 91.64, "step": 585, "token_acc": 0.8608202443280978, "train_speed(iter/s)": 0.134985 }, { "epoch": 0.05568138920347301, "grad_norm": 1.619393229484558, "learning_rate": 0.00019998246742491596, "loss": 0.4270325660705566, "memory(GiB)": 91.64, "step": 590, "token_acc": 0.8668300653594772, "train_speed(iter/s)": 0.13503 }, { "epoch": 0.05615326538316346, "grad_norm": 0.7896348237991333, "learning_rate": 0.00019997942367946437, "loss": 0.43086681365966795, "memory(GiB)": 91.64, "step": 595, "token_acc": 0.8616211374832065, "train_speed(iter/s)": 0.135073 }, { "epoch": 0.056625141562853906, "grad_norm": 0.5002231597900391, "learning_rate": 0.00019997613646906544, "loss": 0.4146383285522461, "memory(GiB)": 91.64, "step": 600, "token_acc": 0.8878950506857484, "train_speed(iter/s)": 0.135104 }, { "epoch": 0.05709701774254436, "grad_norm": 0.8216480612754822, "learning_rate": 0.00019997260580172408, "loss": 0.4246044158935547, "memory(GiB)": 91.64, "step": 605, "token_acc": 0.8645484949832776, "train_speed(iter/s)": 0.135117 }, { "epoch": 0.057568893922234805, "grad_norm": 0.4890948534011841, "learning_rate": 0.000199968831686038, "loss": 0.42087130546569823, "memory(GiB)": 91.64, "step": 610, "token_acc": 0.8492407809110629, "train_speed(iter/s)": 0.135152 }, { "epoch": 0.05804077010192525, "grad_norm": 0.6820136308670044, "learning_rate": 0.00019996481413119772, "loss": 0.41388683319091796, "memory(GiB)": 91.64, "step": 615, "token_acc": 0.8731758165392633, "train_speed(iter/s)": 0.135191 }, { "epoch": 0.058512646281615704, "grad_norm": 1.1169610023498535, "learning_rate": 0.00019996055314698658, "loss": 0.4246358394622803, "memory(GiB)": 91.64, "step": 620, "token_acc": 0.8521199586349535, "train_speed(iter/s)": 0.135201 }, { "epoch": 0.05898452246130615, "grad_norm": 0.9080762267112732, "learning_rate": 0.0001999560487437808, "loss": 0.41969666481018064, "memory(GiB)": 91.64, "step": 625, "token_acc": 0.842851667305481, "train_speed(iter/s)": 0.135239 }, { "epoch": 0.059456398640996604, "grad_norm": 0.5982246398925781, "learning_rate": 0.0001999513009325491, "loss": 0.4150944709777832, "memory(GiB)": 91.64, "step": 630, "token_acc": 0.8643006263048016, "train_speed(iter/s)": 0.135284 }, { "epoch": 0.05992827482068705, "grad_norm": 0.8043394088745117, "learning_rate": 0.00019994630972485332, "loss": 0.42877888679504395, "memory(GiB)": 91.64, "step": 635, "token_acc": 0.8495774647887324, "train_speed(iter/s)": 0.135316 }, { "epoch": 0.0604001510003775, "grad_norm": 0.7642725706100464, "learning_rate": 0.00019994107513284767, "loss": 0.41966772079467773, "memory(GiB)": 91.64, "step": 640, "token_acc": 0.8726851851851852, "train_speed(iter/s)": 0.135354 }, { "epoch": 0.06087202718006795, "grad_norm": 0.5839939713478088, "learning_rate": 0.00019993559716927924, "loss": 0.41250057220458985, "memory(GiB)": 91.64, "step": 645, "token_acc": 0.8587731811697575, "train_speed(iter/s)": 0.135396 }, { "epoch": 0.0613439033597584, "grad_norm": 0.7859387397766113, "learning_rate": 0.00019992987584748764, "loss": 0.4213667392730713, "memory(GiB)": 91.64, "step": 650, "token_acc": 0.8893985728848114, "train_speed(iter/s)": 0.135438 }, { "epoch": 0.06181577953944885, "grad_norm": 0.5287458300590515, "learning_rate": 0.00019992391118140517, "loss": 0.41655263900756834, "memory(GiB)": 91.64, "step": 655, "token_acc": 0.8686520376175548, "train_speed(iter/s)": 0.135466 }, { "epoch": 0.0622876557191393, "grad_norm": 0.9565641283988953, "learning_rate": 0.00019991770318555672, "loss": 0.4196015357971191, "memory(GiB)": 91.64, "step": 660, "token_acc": 0.8530906011854361, "train_speed(iter/s)": 0.135483 }, { "epoch": 0.06275953189882974, "grad_norm": 0.6690832376480103, "learning_rate": 0.00019991125187505965, "loss": 0.41780900955200195, "memory(GiB)": 91.64, "step": 665, "token_acc": 0.8785834738617201, "train_speed(iter/s)": 0.135505 }, { "epoch": 0.0632314080785202, "grad_norm": 0.4967804253101349, "learning_rate": 0.0001999045572656239, "loss": 0.41567211151123046, "memory(GiB)": 91.64, "step": 670, "token_acc": 0.8707653701380176, "train_speed(iter/s)": 0.135538 }, { "epoch": 0.06370328425821065, "grad_norm": 0.5052830576896667, "learning_rate": 0.00019989761937355186, "loss": 0.4151804447174072, "memory(GiB)": 91.64, "step": 675, "token_acc": 0.8579277864992151, "train_speed(iter/s)": 0.135555 }, { "epoch": 0.0641751604379011, "grad_norm": 0.9486806392669678, "learning_rate": 0.0001998904382157383, "loss": 0.4172024726867676, "memory(GiB)": 91.64, "step": 680, "token_acc": 0.8693062368605466, "train_speed(iter/s)": 0.135583 }, { "epoch": 0.06464703661759154, "grad_norm": 0.8576740622520447, "learning_rate": 0.00019988301380967046, "loss": 0.40970048904418943, "memory(GiB)": 91.64, "step": 685, "token_acc": 0.8697332817935833, "train_speed(iter/s)": 0.135611 }, { "epoch": 0.06511891279728199, "grad_norm": 1.150675654411316, "learning_rate": 0.0001998753461734279, "loss": 0.42277183532714846, "memory(GiB)": 91.64, "step": 690, "token_acc": 0.8751705320600273, "train_speed(iter/s)": 0.135635 }, { "epoch": 0.06559078897697244, "grad_norm": 1.0340559482574463, "learning_rate": 0.0001998674353256824, "loss": 0.41498851776123047, "memory(GiB)": 91.64, "step": 695, "token_acc": 0.8739859383450513, "train_speed(iter/s)": 0.135653 }, { "epoch": 0.0660626651566629, "grad_norm": 1.0053695440292358, "learning_rate": 0.00019985928128569814, "loss": 0.41504592895507814, "memory(GiB)": 91.64, "step": 700, "token_acc": 0.8800152846771112, "train_speed(iter/s)": 0.135671 }, { "epoch": 0.06653454133635334, "grad_norm": 0.6694364547729492, "learning_rate": 0.00019985088407333137, "loss": 0.41004395484924316, "memory(GiB)": 91.64, "step": 705, "token_acc": 0.8829944002357796, "train_speed(iter/s)": 0.1357 }, { "epoch": 0.06700641751604379, "grad_norm": 0.6840929388999939, "learning_rate": 0.0001998422437090306, "loss": 0.4107684135437012, "memory(GiB)": 91.64, "step": 710, "token_acc": 0.8496487119437939, "train_speed(iter/s)": 0.135697 }, { "epoch": 0.06747829369573424, "grad_norm": 0.8968580365180969, "learning_rate": 0.00019983336021383642, "loss": 0.40598034858703613, "memory(GiB)": 91.64, "step": 715, "token_acc": 0.8637623762376238, "train_speed(iter/s)": 0.135716 }, { "epoch": 0.06795016987542468, "grad_norm": 1.14011549949646, "learning_rate": 0.0001998242336093815, "loss": 0.4146572113037109, "memory(GiB)": 91.64, "step": 720, "token_acc": 0.8786955196586407, "train_speed(iter/s)": 0.135739 }, { "epoch": 0.06842204605511513, "grad_norm": 1.8799290657043457, "learning_rate": 0.00019981486391789044, "loss": 0.4101266860961914, "memory(GiB)": 91.64, "step": 725, "token_acc": 0.8825644098262433, "train_speed(iter/s)": 0.13576 }, { "epoch": 0.06889392223480559, "grad_norm": 1.097159743309021, "learning_rate": 0.00019980525116217987, "loss": 0.41389617919921873, "memory(GiB)": 91.64, "step": 730, "token_acc": 0.8438617401668653, "train_speed(iter/s)": 0.135793 }, { "epoch": 0.06936579841449604, "grad_norm": 0.39971891045570374, "learning_rate": 0.00019979539536565835, "loss": 0.4141077518463135, "memory(GiB)": 91.64, "step": 735, "token_acc": 0.8802670004171882, "train_speed(iter/s)": 0.135817 }, { "epoch": 0.06983767459418648, "grad_norm": 3.0273921489715576, "learning_rate": 0.00019978529655232614, "loss": 0.43627090454101564, "memory(GiB)": 91.64, "step": 740, "token_acc": 0.8638635695383711, "train_speed(iter/s)": 0.135832 }, { "epoch": 0.07030955077387693, "grad_norm": 1.658718228340149, "learning_rate": 0.00019977495474677543, "loss": 0.4568758010864258, "memory(GiB)": 91.64, "step": 745, "token_acc": 0.8712955122777307, "train_speed(iter/s)": 0.135857 }, { "epoch": 0.07078142695356739, "grad_norm": 0.8811335563659668, "learning_rate": 0.00019976436997419004, "loss": 0.405039119720459, "memory(GiB)": 91.64, "step": 750, "token_acc": 0.8927808628791115, "train_speed(iter/s)": 0.135875 }, { "epoch": 0.07125330313325784, "grad_norm": 0.4463726878166199, "learning_rate": 0.00019975354226034554, "loss": 0.3967690706253052, "memory(GiB)": 91.64, "step": 755, "token_acc": 0.8465842167255595, "train_speed(iter/s)": 0.135889 }, { "epoch": 0.07172517931294828, "grad_norm": 1.1370724439620972, "learning_rate": 0.00019974247163160897, "loss": 0.4012022018432617, "memory(GiB)": 91.64, "step": 760, "token_acc": 0.8686626746506986, "train_speed(iter/s)": 0.135918 }, { "epoch": 0.07219705549263873, "grad_norm": 0.6569880843162537, "learning_rate": 0.00019973115811493903, "loss": 0.39702539443969725, "memory(GiB)": 91.64, "step": 765, "token_acc": 0.8665835411471322, "train_speed(iter/s)": 0.135947 }, { "epoch": 0.07266893167232918, "grad_norm": 0.48011693358421326, "learning_rate": 0.00019971960173788581, "loss": 0.4035386085510254, "memory(GiB)": 91.64, "step": 770, "token_acc": 0.8578104138851802, "train_speed(iter/s)": 0.135961 }, { "epoch": 0.07314080785201962, "grad_norm": 0.4577121436595917, "learning_rate": 0.00019970780252859087, "loss": 0.39451889991760253, "memory(GiB)": 91.64, "step": 775, "token_acc": 0.8687527162103433, "train_speed(iter/s)": 0.135979 }, { "epoch": 0.07361268403171008, "grad_norm": 0.7687115669250488, "learning_rate": 0.000199695760515787, "loss": 0.40027799606323244, "memory(GiB)": 91.64, "step": 780, "token_acc": 0.8462029355456286, "train_speed(iter/s)": 0.135999 }, { "epoch": 0.07408456021140053, "grad_norm": 0.5802925825119019, "learning_rate": 0.00019968347572879835, "loss": 0.3972629070281982, "memory(GiB)": 91.64, "step": 785, "token_acc": 0.8566473988439306, "train_speed(iter/s)": 0.136023 }, { "epoch": 0.07455643639109098, "grad_norm": 0.4147838056087494, "learning_rate": 0.0001996709481975402, "loss": 0.40576868057250975, "memory(GiB)": 91.64, "step": 790, "token_acc": 0.8650012010569301, "train_speed(iter/s)": 0.136051 }, { "epoch": 0.07502831257078142, "grad_norm": 0.7321649789810181, "learning_rate": 0.00019965817795251903, "loss": 0.3949580669403076, "memory(GiB)": 91.64, "step": 795, "token_acc": 0.8702346041055719, "train_speed(iter/s)": 0.136065 }, { "epoch": 0.07550018875047187, "grad_norm": 0.42227479815483093, "learning_rate": 0.00019964516502483224, "loss": 0.40514750480651857, "memory(GiB)": 91.64, "step": 800, "token_acc": 0.8411989795918368, "train_speed(iter/s)": 0.136081 }, { "epoch": 0.07597206493016233, "grad_norm": 1.0944310426712036, "learning_rate": 0.0001996319094461683, "loss": 0.3979844808578491, "memory(GiB)": 91.64, "step": 805, "token_acc": 0.8827761320355481, "train_speed(iter/s)": 0.136098 }, { "epoch": 0.07644394110985278, "grad_norm": 1.040534496307373, "learning_rate": 0.00019961841124880656, "loss": 0.39571661949157716, "memory(GiB)": 91.64, "step": 810, "token_acc": 0.8898480662983426, "train_speed(iter/s)": 0.13611 }, { "epoch": 0.07691581728954322, "grad_norm": 0.605493426322937, "learning_rate": 0.00019960467046561712, "loss": 0.4014116287231445, "memory(GiB)": 91.64, "step": 815, "token_acc": 0.8638888888888889, "train_speed(iter/s)": 0.136124 }, { "epoch": 0.07738769346923367, "grad_norm": 0.7245599627494812, "learning_rate": 0.0001995906871300609, "loss": 0.39299564361572265, "memory(GiB)": 91.64, "step": 820, "token_acc": 0.8532883642495784, "train_speed(iter/s)": 0.136151 }, { "epoch": 0.07785956964892413, "grad_norm": 0.6854239702224731, "learning_rate": 0.00019957646127618937, "loss": 0.403260326385498, "memory(GiB)": 91.64, "step": 825, "token_acc": 0.8646728971962617, "train_speed(iter/s)": 0.136172 }, { "epoch": 0.07833144582861457, "grad_norm": 0.603828489780426, "learning_rate": 0.00019956199293864467, "loss": 0.41519527435302733, "memory(GiB)": 91.64, "step": 830, "token_acc": 0.8636763412489006, "train_speed(iter/s)": 0.13618 }, { "epoch": 0.07880332200830502, "grad_norm": 1.0101585388183594, "learning_rate": 0.00019954728215265937, "loss": 0.3980675935745239, "memory(GiB)": 91.64, "step": 835, "token_acc": 0.8703146374829002, "train_speed(iter/s)": 0.136184 }, { "epoch": 0.07927519818799547, "grad_norm": 0.4053173065185547, "learning_rate": 0.00019953232895405644, "loss": 0.3960963010787964, "memory(GiB)": 91.64, "step": 840, "token_acc": 0.8587474472430224, "train_speed(iter/s)": 0.136185 }, { "epoch": 0.07974707436768592, "grad_norm": 0.5084025859832764, "learning_rate": 0.0001995171333792492, "loss": 0.4032759666442871, "memory(GiB)": 91.64, "step": 845, "token_acc": 0.8585987261146497, "train_speed(iter/s)": 0.136191 }, { "epoch": 0.08021895054737636, "grad_norm": 0.4399580955505371, "learning_rate": 0.0001995016954652411, "loss": 0.39526617527008057, "memory(GiB)": 91.64, "step": 850, "token_acc": 0.8773885350318471, "train_speed(iter/s)": 0.136206 }, { "epoch": 0.08069082672706682, "grad_norm": 0.9100202322006226, "learning_rate": 0.00019948601524962588, "loss": 0.4031196117401123, "memory(GiB)": 91.64, "step": 855, "token_acc": 0.8691389599317988, "train_speed(iter/s)": 0.13623 }, { "epoch": 0.08116270290675727, "grad_norm": 0.5204232335090637, "learning_rate": 0.00019947009277058712, "loss": 0.38583035469055177, "memory(GiB)": 91.64, "step": 860, "token_acc": 0.8842271293375394, "train_speed(iter/s)": 0.136259 }, { "epoch": 0.08163457908644772, "grad_norm": 0.770131528377533, "learning_rate": 0.00019945392806689855, "loss": 0.3988801956176758, "memory(GiB)": 91.64, "step": 865, "token_acc": 0.8591459528362014, "train_speed(iter/s)": 0.136277 }, { "epoch": 0.08210645526613816, "grad_norm": 1.0950678586959839, "learning_rate": 0.00019943752117792358, "loss": 0.40749850273132326, "memory(GiB)": 91.64, "step": 870, "token_acc": 0.8809041835357625, "train_speed(iter/s)": 0.136294 }, { "epoch": 0.08257833144582861, "grad_norm": 0.7339961528778076, "learning_rate": 0.00019942087214361548, "loss": 0.40170702934265134, "memory(GiB)": 91.64, "step": 875, "token_acc": 0.8847161572052402, "train_speed(iter/s)": 0.136308 }, { "epoch": 0.08305020762551907, "grad_norm": 0.5357945561408997, "learning_rate": 0.0001994039810045172, "loss": 0.3935965061187744, "memory(GiB)": 91.64, "step": 880, "token_acc": 0.8578924355050285, "train_speed(iter/s)": 0.136328 }, { "epoch": 0.08352208380520951, "grad_norm": 0.9235381484031677, "learning_rate": 0.0001993868478017611, "loss": 0.39939312934875487, "memory(GiB)": 91.64, "step": 885, "token_acc": 0.8347355769230769, "train_speed(iter/s)": 0.136345 }, { "epoch": 0.08399395998489996, "grad_norm": 0.5018760561943054, "learning_rate": 0.00019936947257706921, "loss": 0.398266339302063, "memory(GiB)": 91.64, "step": 890, "token_acc": 0.8601160013647219, "train_speed(iter/s)": 0.136359 }, { "epoch": 0.08446583616459041, "grad_norm": 0.48619768023490906, "learning_rate": 0.00019935185537275278, "loss": 0.39243621826171876, "memory(GiB)": 91.64, "step": 895, "token_acc": 0.8709032773780975, "train_speed(iter/s)": 0.136385 }, { "epoch": 0.08493771234428087, "grad_norm": 0.6260401606559753, "learning_rate": 0.00019933399623171236, "loss": 0.39727630615234377, "memory(GiB)": 91.64, "step": 900, "token_acc": 0.8716502115655853, "train_speed(iter/s)": 0.136395 }, { "epoch": 0.0854095885239713, "grad_norm": 0.4643126428127289, "learning_rate": 0.00019931589519743765, "loss": 0.39346966743469236, "memory(GiB)": 91.64, "step": 905, "token_acc": 0.8780300115429012, "train_speed(iter/s)": 0.136405 }, { "epoch": 0.08588146470366176, "grad_norm": 0.5169233679771423, "learning_rate": 0.00019929755231400735, "loss": 0.3928957939147949, "memory(GiB)": 91.64, "step": 910, "token_acc": 0.8716216216216216, "train_speed(iter/s)": 0.136426 }, { "epoch": 0.08635334088335221, "grad_norm": 0.39350777864456177, "learning_rate": 0.00019927896762608922, "loss": 0.3948735952377319, "memory(GiB)": 91.64, "step": 915, "token_acc": 0.8647242455775234, "train_speed(iter/s)": 0.136437 }, { "epoch": 0.08682521706304266, "grad_norm": 0.40696701407432556, "learning_rate": 0.0001992601411789397, "loss": 0.38743617534637453, "memory(GiB)": 91.64, "step": 920, "token_acc": 0.8676893576222435, "train_speed(iter/s)": 0.136452 }, { "epoch": 0.0872970932427331, "grad_norm": 0.4113665521144867, "learning_rate": 0.00019924107301840408, "loss": 0.3946674823760986, "memory(GiB)": 91.64, "step": 925, "token_acc": 0.8602645198389879, "train_speed(iter/s)": 0.136484 }, { "epoch": 0.08776896942242356, "grad_norm": 0.3710317015647888, "learning_rate": 0.00019922176319091617, "loss": 0.3960568904876709, "memory(GiB)": 91.64, "step": 930, "token_acc": 0.8746690203000883, "train_speed(iter/s)": 0.136487 }, { "epoch": 0.08824084560211401, "grad_norm": 0.4853060245513916, "learning_rate": 0.0001992022117434983, "loss": 0.39677085876464846, "memory(GiB)": 91.64, "step": 935, "token_acc": 0.8561736770691994, "train_speed(iter/s)": 0.136506 }, { "epoch": 0.08871272178180445, "grad_norm": 0.49833860993385315, "learning_rate": 0.0001991824187237612, "loss": 0.39057235717773436, "memory(GiB)": 91.64, "step": 940, "token_acc": 0.852015732546706, "train_speed(iter/s)": 0.136515 }, { "epoch": 0.0891845979614949, "grad_norm": 0.8697565793991089, "learning_rate": 0.00019916238417990386, "loss": 0.39734203815460206, "memory(GiB)": 91.64, "step": 945, "token_acc": 0.8645515558267236, "train_speed(iter/s)": 0.136517 }, { "epoch": 0.08965647414118535, "grad_norm": 0.3841949701309204, "learning_rate": 0.0001991421081607134, "loss": 0.39315037727355956, "memory(GiB)": 91.64, "step": 950, "token_acc": 0.8595624558927312, "train_speed(iter/s)": 0.136535 }, { "epoch": 0.09012835032087581, "grad_norm": 0.6174625158309937, "learning_rate": 0.00019912159071556497, "loss": 0.3937983512878418, "memory(GiB)": 91.64, "step": 955, "token_acc": 0.8652637332604537, "train_speed(iter/s)": 0.136548 }, { "epoch": 0.09060022650056625, "grad_norm": 0.5916112065315247, "learning_rate": 0.0001991008318944217, "loss": 0.3945147037506104, "memory(GiB)": 91.64, "step": 960, "token_acc": 0.8693599160545645, "train_speed(iter/s)": 0.136565 }, { "epoch": 0.0910721026802567, "grad_norm": 0.6667875051498413, "learning_rate": 0.00019907983174783433, "loss": 0.38831090927124023, "memory(GiB)": 91.64, "step": 965, "token_acc": 0.8872151409810738, "train_speed(iter/s)": 0.13657 }, { "epoch": 0.09154397885994715, "grad_norm": 0.4363083839416504, "learning_rate": 0.00019905859032694147, "loss": 0.3933609962463379, "memory(GiB)": 91.64, "step": 970, "token_acc": 0.8592730661696178, "train_speed(iter/s)": 0.136591 }, { "epoch": 0.09201585503963759, "grad_norm": 0.5822863578796387, "learning_rate": 0.00019903710768346918, "loss": 0.38769237995147704, "memory(GiB)": 91.64, "step": 975, "token_acc": 0.8806196840826246, "train_speed(iter/s)": 0.136602 }, { "epoch": 0.09248773121932805, "grad_norm": 0.342798113822937, "learning_rate": 0.00019901538386973085, "loss": 0.39563870429992676, "memory(GiB)": 91.64, "step": 980, "token_acc": 0.8935498421290031, "train_speed(iter/s)": 0.13661 }, { "epoch": 0.0929596073990185, "grad_norm": 0.38594508171081543, "learning_rate": 0.0001989934189386273, "loss": 0.39223246574401854, "memory(GiB)": 91.64, "step": 985, "token_acc": 0.8759615384615385, "train_speed(iter/s)": 0.136622 }, { "epoch": 0.09343148357870895, "grad_norm": 0.4435591399669647, "learning_rate": 0.00019897121294364643, "loss": 0.39308857917785645, "memory(GiB)": 91.64, "step": 990, "token_acc": 0.8816035968527538, "train_speed(iter/s)": 0.136635 }, { "epoch": 0.09390335975839939, "grad_norm": 0.5884724855422974, "learning_rate": 0.0001989487659388632, "loss": 0.3950310707092285, "memory(GiB)": 91.64, "step": 995, "token_acc": 0.8725490196078431, "train_speed(iter/s)": 0.136656 }, { "epoch": 0.09437523593808984, "grad_norm": 0.785302460193634, "learning_rate": 0.00019892607797893943, "loss": 0.3945254564285278, "memory(GiB)": 91.64, "step": 1000, "token_acc": 0.8808167141500475, "train_speed(iter/s)": 0.136669 }, { "epoch": 0.0948471121177803, "grad_norm": 0.4744601547718048, "learning_rate": 0.00019890314911912368, "loss": 0.3849745750427246, "memory(GiB)": 91.64, "step": 1005, "token_acc": 0.8613861386138614, "train_speed(iter/s)": 0.13669 }, { "epoch": 0.09531898829747075, "grad_norm": 0.5876829624176025, "learning_rate": 0.00019887997941525124, "loss": 0.3877379894256592, "memory(GiB)": 91.64, "step": 1010, "token_acc": 0.8721071863580999, "train_speed(iter/s)": 0.1367 }, { "epoch": 0.09579086447716119, "grad_norm": 0.8457826375961304, "learning_rate": 0.00019885656892374378, "loss": 0.3987894535064697, "memory(GiB)": 91.64, "step": 1015, "token_acc": 0.8600823045267489, "train_speed(iter/s)": 0.136711 }, { "epoch": 0.09626274065685164, "grad_norm": 1.2091189622879028, "learning_rate": 0.00019883291770160942, "loss": 0.38491311073303225, "memory(GiB)": 91.64, "step": 1020, "token_acc": 0.8802992518703242, "train_speed(iter/s)": 0.136726 }, { "epoch": 0.0967346168365421, "grad_norm": 0.43290698528289795, "learning_rate": 0.0001988090258064424, "loss": 0.3924283027648926, "memory(GiB)": 91.64, "step": 1025, "token_acc": 0.8670157068062827, "train_speed(iter/s)": 0.136735 }, { "epoch": 0.09720649301623253, "grad_norm": 0.4230733811855316, "learning_rate": 0.00019878489329642308, "loss": 0.38915348052978516, "memory(GiB)": 91.64, "step": 1030, "token_acc": 0.8749527767283718, "train_speed(iter/s)": 0.136743 }, { "epoch": 0.09767836919592299, "grad_norm": 0.5611448884010315, "learning_rate": 0.00019876052023031778, "loss": 0.3783283233642578, "memory(GiB)": 91.64, "step": 1035, "token_acc": 0.8774747852073216, "train_speed(iter/s)": 0.13675 }, { "epoch": 0.09815024537561344, "grad_norm": 0.8954288363456726, "learning_rate": 0.00019873590666747855, "loss": 0.38250117301940917, "memory(GiB)": 91.64, "step": 1040, "token_acc": 0.8612850082372323, "train_speed(iter/s)": 0.136771 }, { "epoch": 0.0986221215553039, "grad_norm": 0.601012110710144, "learning_rate": 0.00019871105266784317, "loss": 0.38401503562927247, "memory(GiB)": 91.64, "step": 1045, "token_acc": 0.8982758620689655, "train_speed(iter/s)": 0.13678 }, { "epoch": 0.09909399773499433, "grad_norm": 0.5251648426055908, "learning_rate": 0.00019868595829193486, "loss": 0.38367199897766113, "memory(GiB)": 91.64, "step": 1050, "token_acc": 0.8643162393162394, "train_speed(iter/s)": 0.136804 }, { "epoch": 0.09956587391468479, "grad_norm": 0.553338348865509, "learning_rate": 0.00019866062360086216, "loss": 0.383012580871582, "memory(GiB)": 91.64, "step": 1055, "token_acc": 0.8829745596868884, "train_speed(iter/s)": 0.13682 }, { "epoch": 0.10003775009437524, "grad_norm": 0.48636507987976074, "learning_rate": 0.00019863504865631892, "loss": 0.39072608947753906, "memory(GiB)": 91.64, "step": 1060, "token_acc": 0.8856960408684547, "train_speed(iter/s)": 0.136826 }, { "epoch": 0.10050962627406569, "grad_norm": 0.560430645942688, "learning_rate": 0.00019860923352058393, "loss": 0.3910938262939453, "memory(GiB)": 91.64, "step": 1065, "token_acc": 0.8764075067024129, "train_speed(iter/s)": 0.136831 }, { "epoch": 0.10098150245375613, "grad_norm": 0.5854150652885437, "learning_rate": 0.00019858317825652096, "loss": 0.38489365577697754, "memory(GiB)": 91.64, "step": 1070, "token_acc": 0.8923697270471465, "train_speed(iter/s)": 0.136832 }, { "epoch": 0.10145337863344658, "grad_norm": 0.47770190238952637, "learning_rate": 0.00019855688292757848, "loss": 0.39474029541015626, "memory(GiB)": 91.64, "step": 1075, "token_acc": 0.8581871345029239, "train_speed(iter/s)": 0.136842 }, { "epoch": 0.10192525481313704, "grad_norm": 0.6249523162841797, "learning_rate": 0.00019853034759778957, "loss": 0.37869269847869874, "memory(GiB)": 91.64, "step": 1080, "token_acc": 0.8848101265822785, "train_speed(iter/s)": 0.136845 }, { "epoch": 0.10239713099282748, "grad_norm": 0.5587074756622314, "learning_rate": 0.00019850357233177176, "loss": 0.37889723777770995, "memory(GiB)": 91.64, "step": 1085, "token_acc": 0.8695814648729447, "train_speed(iter/s)": 0.136852 }, { "epoch": 0.10286900717251793, "grad_norm": 0.456506609916687, "learning_rate": 0.00019847655719472688, "loss": 0.37969346046447755, "memory(GiB)": 91.64, "step": 1090, "token_acc": 0.8711864406779661, "train_speed(iter/s)": 0.136854 }, { "epoch": 0.10334088335220838, "grad_norm": 0.48825180530548096, "learning_rate": 0.00019844930225244083, "loss": 0.3927449226379395, "memory(GiB)": 91.64, "step": 1095, "token_acc": 0.8670253651037664, "train_speed(iter/s)": 0.136853 }, { "epoch": 0.10381275953189883, "grad_norm": 0.33617979288101196, "learning_rate": 0.0001984218075712835, "loss": 0.3827697277069092, "memory(GiB)": 91.64, "step": 1100, "token_acc": 0.8889289578074288, "train_speed(iter/s)": 0.136858 }, { "epoch": 0.10428463571158927, "grad_norm": 0.5517711639404297, "learning_rate": 0.00019839407321820858, "loss": 0.3828376293182373, "memory(GiB)": 91.64, "step": 1105, "token_acc": 0.8939336131247615, "train_speed(iter/s)": 0.136873 }, { "epoch": 0.10475651189127973, "grad_norm": 1.1416659355163574, "learning_rate": 0.0001983660992607534, "loss": 0.39458913803100587, "memory(GiB)": 91.64, "step": 1110, "token_acc": 0.8489618218352311, "train_speed(iter/s)": 0.136884 }, { "epoch": 0.10522838807097018, "grad_norm": 0.3741232752799988, "learning_rate": 0.00019833788576703875, "loss": 0.3905258893966675, "memory(GiB)": 91.64, "step": 1115, "token_acc": 0.8970338983050847, "train_speed(iter/s)": 0.136893 }, { "epoch": 0.10570026425066063, "grad_norm": 0.4323633015155792, "learning_rate": 0.00019830943280576874, "loss": 0.38057544231414797, "memory(GiB)": 91.64, "step": 1120, "token_acc": 0.8544902093180283, "train_speed(iter/s)": 0.136906 }, { "epoch": 0.10617214043035107, "grad_norm": 0.5611393451690674, "learning_rate": 0.0001982807404462306, "loss": 0.3737280607223511, "memory(GiB)": 91.64, "step": 1125, "token_acc": 0.8825597749648383, "train_speed(iter/s)": 0.136914 }, { "epoch": 0.10664401661004153, "grad_norm": 0.39299675822257996, "learning_rate": 0.00019825180875829456, "loss": 0.37208285331726076, "memory(GiB)": 91.64, "step": 1130, "token_acc": 0.8761904761904762, "train_speed(iter/s)": 0.136918 }, { "epoch": 0.10711589278973198, "grad_norm": 0.6676709055900574, "learning_rate": 0.0001982226378124136, "loss": 0.37977089881896975, "memory(GiB)": 91.64, "step": 1135, "token_acc": 0.883206106870229, "train_speed(iter/s)": 0.136928 }, { "epoch": 0.10758776896942242, "grad_norm": 0.4939993619918823, "learning_rate": 0.00019819322767962344, "loss": 0.3877007007598877, "memory(GiB)": 91.64, "step": 1140, "token_acc": 0.8383121732636296, "train_speed(iter/s)": 0.136939 }, { "epoch": 0.10805964514911287, "grad_norm": 0.3608226776123047, "learning_rate": 0.00019816357843154212, "loss": 0.37889995574951174, "memory(GiB)": 91.64, "step": 1145, "token_acc": 0.8914505283381364, "train_speed(iter/s)": 0.136944 }, { "epoch": 0.10853152132880332, "grad_norm": 0.45509615540504456, "learning_rate": 0.00019813369014037003, "loss": 0.38171145915985105, "memory(GiB)": 91.64, "step": 1150, "token_acc": 0.8768656716417911, "train_speed(iter/s)": 0.136949 }, { "epoch": 0.10900339750849378, "grad_norm": 0.5381651520729065, "learning_rate": 0.00019810356287888967, "loss": 0.38211042881011964, "memory(GiB)": 91.64, "step": 1155, "token_acc": 0.8691718858733473, "train_speed(iter/s)": 0.136952 }, { "epoch": 0.10947527368818422, "grad_norm": 0.5979215502738953, "learning_rate": 0.00019807319672046546, "loss": 0.37435040473937986, "memory(GiB)": 91.64, "step": 1160, "token_acc": 0.8914702953866578, "train_speed(iter/s)": 0.136955 }, { "epoch": 0.10994714986787467, "grad_norm": 0.46846890449523926, "learning_rate": 0.00019804259173904356, "loss": 0.3730193614959717, "memory(GiB)": 91.64, "step": 1165, "token_acc": 0.872072072072072, "train_speed(iter/s)": 0.136965 }, { "epoch": 0.11041902604756512, "grad_norm": 0.6530776619911194, "learning_rate": 0.0001980117480091517, "loss": 0.3740133762359619, "memory(GiB)": 91.64, "step": 1170, "token_acc": 0.8564527260179434, "train_speed(iter/s)": 0.136979 }, { "epoch": 0.11089090222725557, "grad_norm": 1.131178379058838, "learning_rate": 0.000197980665605899, "loss": 0.3640794277191162, "memory(GiB)": 91.64, "step": 1175, "token_acc": 0.8569848875783266, "train_speed(iter/s)": 0.136983 }, { "epoch": 0.11136277840694601, "grad_norm": 0.4460451602935791, "learning_rate": 0.00019794934460497582, "loss": 0.3784611225128174, "memory(GiB)": 91.64, "step": 1180, "token_acc": 0.8822531387852053, "train_speed(iter/s)": 0.136986 }, { "epoch": 0.11183465458663647, "grad_norm": 0.6955581903457642, "learning_rate": 0.00019791778508265352, "loss": 0.3845529556274414, "memory(GiB)": 91.64, "step": 1185, "token_acc": 0.8754593711719069, "train_speed(iter/s)": 0.137001 }, { "epoch": 0.11230653076632692, "grad_norm": 0.39989179372787476, "learning_rate": 0.0001978859871157842, "loss": 0.3827672958374023, "memory(GiB)": 91.64, "step": 1190, "token_acc": 0.8808139534883721, "train_speed(iter/s)": 0.137003 }, { "epoch": 0.11277840694601736, "grad_norm": 0.3942891061306, "learning_rate": 0.0001978539507818008, "loss": 0.3772392511367798, "memory(GiB)": 91.64, "step": 1195, "token_acc": 0.8851182197496523, "train_speed(iter/s)": 0.137013 }, { "epoch": 0.11325028312570781, "grad_norm": 0.422818660736084, "learning_rate": 0.00019782167615871657, "loss": 0.37031795978546145, "memory(GiB)": 91.64, "step": 1200, "token_acc": 0.8915779283639884, "train_speed(iter/s)": 0.137021 }, { "epoch": 0.11372215930539827, "grad_norm": 0.8060660362243652, "learning_rate": 0.00019778916332512507, "loss": 0.3858052730560303, "memory(GiB)": 91.64, "step": 1205, "token_acc": 0.894151417294582, "train_speed(iter/s)": 0.137033 }, { "epoch": 0.11419403548508872, "grad_norm": 0.44119134545326233, "learning_rate": 0.00019775641236019996, "loss": 0.37988340854644775, "memory(GiB)": 91.64, "step": 1210, "token_acc": 0.883441258094357, "train_speed(iter/s)": 0.137044 }, { "epoch": 0.11466591166477916, "grad_norm": 0.654055655002594, "learning_rate": 0.00019772342334369478, "loss": 0.3706467866897583, "memory(GiB)": 91.64, "step": 1215, "token_acc": 0.8836206896551724, "train_speed(iter/s)": 0.137053 }, { "epoch": 0.11513778784446961, "grad_norm": 0.45024436712265015, "learning_rate": 0.00019769019635594272, "loss": 0.3729224443435669, "memory(GiB)": 91.64, "step": 1220, "token_acc": 0.8666930379746836, "train_speed(iter/s)": 0.137064 }, { "epoch": 0.11560966402416006, "grad_norm": 0.3812342882156372, "learning_rate": 0.00019765673147785652, "loss": 0.38361666202545164, "memory(GiB)": 91.64, "step": 1225, "token_acc": 0.8751983077736647, "train_speed(iter/s)": 0.13708 }, { "epoch": 0.1160815402038505, "grad_norm": 0.39537158608436584, "learning_rate": 0.0001976230287909282, "loss": 0.3797783136367798, "memory(GiB)": 91.64, "step": 1230, "token_acc": 0.8721294363256785, "train_speed(iter/s)": 0.137091 }, { "epoch": 0.11655341638354096, "grad_norm": 0.5020424127578735, "learning_rate": 0.00019758908837722884, "loss": 0.37000179290771484, "memory(GiB)": 91.64, "step": 1235, "token_acc": 0.8931271477663231, "train_speed(iter/s)": 0.137094 }, { "epoch": 0.11702529256323141, "grad_norm": 0.4574194550514221, "learning_rate": 0.00019755491031940854, "loss": 0.36977810859680177, "memory(GiB)": 91.64, "step": 1240, "token_acc": 0.8930348258706468, "train_speed(iter/s)": 0.137102 }, { "epoch": 0.11749716874292186, "grad_norm": 0.6839103698730469, "learning_rate": 0.0001975204947006959, "loss": 0.3748194932937622, "memory(GiB)": 91.64, "step": 1245, "token_acc": 0.8971393791844188, "train_speed(iter/s)": 0.13711 }, { "epoch": 0.1179690449226123, "grad_norm": 0.43332648277282715, "learning_rate": 0.0001974858416048982, "loss": 0.36992840766906737, "memory(GiB)": 91.64, "step": 1250, "token_acc": 0.8771353482260184, "train_speed(iter/s)": 0.13711 }, { "epoch": 0.11844092110230275, "grad_norm": 0.7626131176948547, "learning_rate": 0.00019745095111640094, "loss": 0.3885170936584473, "memory(GiB)": 91.64, "step": 1255, "token_acc": 0.8769186214885607, "train_speed(iter/s)": 0.137124 }, { "epoch": 0.11891279728199321, "grad_norm": 1.1946239471435547, "learning_rate": 0.00019741582332016773, "loss": 0.3764191150665283, "memory(GiB)": 91.64, "step": 1260, "token_acc": 0.8837127845884414, "train_speed(iter/s)": 0.137128 }, { "epoch": 0.11938467346168366, "grad_norm": 0.37812870740890503, "learning_rate": 0.00019738045830173997, "loss": 0.3727047204971313, "memory(GiB)": 91.64, "step": 1265, "token_acc": 0.868295994568907, "train_speed(iter/s)": 0.137132 }, { "epoch": 0.1198565496413741, "grad_norm": 0.9704969525337219, "learning_rate": 0.0001973448561472369, "loss": 0.3743635892868042, "memory(GiB)": 91.64, "step": 1270, "token_acc": 0.8887052341597796, "train_speed(iter/s)": 0.137142 }, { "epoch": 0.12032842582106455, "grad_norm": 0.5081287622451782, "learning_rate": 0.00019730901694335503, "loss": 0.4186855316162109, "memory(GiB)": 91.64, "step": 1275, "token_acc": 0.9019308943089431, "train_speed(iter/s)": 0.137157 }, { "epoch": 0.120800302000755, "grad_norm": 0.50217205286026, "learning_rate": 0.0001972729407773683, "loss": 0.37697782516479494, "memory(GiB)": 91.64, "step": 1280, "token_acc": 0.874384236453202, "train_speed(iter/s)": 0.137164 }, { "epoch": 0.12127217818044544, "grad_norm": 0.7912167310714722, "learning_rate": 0.0001972366277371276, "loss": 0.3852388381958008, "memory(GiB)": 91.64, "step": 1285, "token_acc": 0.8706407137064072, "train_speed(iter/s)": 0.137174 }, { "epoch": 0.1217440543601359, "grad_norm": 0.42602792382240295, "learning_rate": 0.00019720007791106057, "loss": 0.3708258390426636, "memory(GiB)": 91.64, "step": 1290, "token_acc": 0.8843969261610425, "train_speed(iter/s)": 0.13718 }, { "epoch": 0.12221593053982635, "grad_norm": 0.3974127471446991, "learning_rate": 0.00019716329138817158, "loss": 0.3643842935562134, "memory(GiB)": 91.64, "step": 1295, "token_acc": 0.8741738688357905, "train_speed(iter/s)": 0.137184 }, { "epoch": 0.1226878067195168, "grad_norm": 0.4702013432979584, "learning_rate": 0.0001971262682580414, "loss": 0.3616140604019165, "memory(GiB)": 91.64, "step": 1300, "token_acc": 0.886223191566703, "train_speed(iter/s)": 0.13719 }, { "epoch": 0.12315968289920724, "grad_norm": 0.43112415075302124, "learning_rate": 0.00019708900861082685, "loss": 0.3715237855911255, "memory(GiB)": 91.64, "step": 1305, "token_acc": 0.886991461577097, "train_speed(iter/s)": 0.137195 }, { "epoch": 0.1236315590788977, "grad_norm": 0.5010024905204773, "learning_rate": 0.00019705151253726082, "loss": 0.37075207233428953, "memory(GiB)": 91.64, "step": 1310, "token_acc": 0.8941935483870967, "train_speed(iter/s)": 0.137196 }, { "epoch": 0.12410343525858815, "grad_norm": 0.6552342772483826, "learning_rate": 0.0001970137801286519, "loss": 0.3803473234176636, "memory(GiB)": 91.64, "step": 1315, "token_acc": 0.8743248109470652, "train_speed(iter/s)": 0.137199 }, { "epoch": 0.1245753114382786, "grad_norm": 0.45841383934020996, "learning_rate": 0.00019697581147688417, "loss": 0.3710304260253906, "memory(GiB)": 91.64, "step": 1320, "token_acc": 0.867526746381372, "train_speed(iter/s)": 0.137207 }, { "epoch": 0.12504718761796904, "grad_norm": 0.4373835027217865, "learning_rate": 0.00019693760667441703, "loss": 0.35978107452392577, "memory(GiB)": 91.64, "step": 1325, "token_acc": 0.8925126320624713, "train_speed(iter/s)": 0.137217 }, { "epoch": 0.12551906379765948, "grad_norm": 0.3811391592025757, "learning_rate": 0.00019689916581428488, "loss": 0.3601937770843506, "memory(GiB)": 91.64, "step": 1330, "token_acc": 0.8777089783281734, "train_speed(iter/s)": 0.137222 }, { "epoch": 0.12599093997734995, "grad_norm": 0.40880128741264343, "learning_rate": 0.00019686048899009704, "loss": 0.3690077066421509, "memory(GiB)": 91.64, "step": 1335, "token_acc": 0.8848944591029023, "train_speed(iter/s)": 0.137221 }, { "epoch": 0.1264628161570404, "grad_norm": 0.6039553880691528, "learning_rate": 0.0001968215762960374, "loss": 0.37348055839538574, "memory(GiB)": 91.64, "step": 1340, "token_acc": 0.870300204022151, "train_speed(iter/s)": 0.137226 }, { "epoch": 0.12693469233673085, "grad_norm": 1.1906999349594116, "learning_rate": 0.00019678242782686421, "loss": 0.36064743995666504, "memory(GiB)": 91.64, "step": 1345, "token_acc": 0.8814722395508422, "train_speed(iter/s)": 0.137241 }, { "epoch": 0.1274065685164213, "grad_norm": 0.6271125674247742, "learning_rate": 0.00019674304367790993, "loss": 0.3587361812591553, "memory(GiB)": 91.64, "step": 1350, "token_acc": 0.8639104220499569, "train_speed(iter/s)": 0.137247 }, { "epoch": 0.12787844469611173, "grad_norm": 0.34620094299316406, "learning_rate": 0.0001967034239450808, "loss": 0.3652570486068726, "memory(GiB)": 91.64, "step": 1355, "token_acc": 0.879513492968453, "train_speed(iter/s)": 0.137257 }, { "epoch": 0.1283503208758022, "grad_norm": 0.5999593734741211, "learning_rate": 0.00019666356872485695, "loss": 0.36589975357055665, "memory(GiB)": 91.64, "step": 1360, "token_acc": 0.8735487919673675, "train_speed(iter/s)": 0.137273 }, { "epoch": 0.12882219705549264, "grad_norm": 0.4182213842868805, "learning_rate": 0.00019662347811429172, "loss": 0.3619654178619385, "memory(GiB)": 91.64, "step": 1365, "token_acc": 0.8859732824427481, "train_speed(iter/s)": 0.137286 }, { "epoch": 0.12929407323518308, "grad_norm": 0.7008164525032043, "learning_rate": 0.0001965831522110119, "loss": 0.36653695106506345, "memory(GiB)": 91.64, "step": 1370, "token_acc": 0.8621755253399258, "train_speed(iter/s)": 0.137294 }, { "epoch": 0.12976594941487354, "grad_norm": 0.5119627118110657, "learning_rate": 0.00019654259111321704, "loss": 0.3641792297363281, "memory(GiB)": 91.64, "step": 1375, "token_acc": 0.8765696784073507, "train_speed(iter/s)": 0.137302 }, { "epoch": 0.13023782559456398, "grad_norm": 0.6845554709434509, "learning_rate": 0.00019650179491967955, "loss": 0.36969287395477296, "memory(GiB)": 91.64, "step": 1380, "token_acc": 0.8856742883807747, "train_speed(iter/s)": 0.1373 }, { "epoch": 0.13070970177425442, "grad_norm": 0.7057746052742004, "learning_rate": 0.00019646076372974432, "loss": 0.364498496055603, "memory(GiB)": 91.64, "step": 1385, "token_acc": 0.8957236842105263, "train_speed(iter/s)": 0.137308 }, { "epoch": 0.1311815779539449, "grad_norm": 0.5785896182060242, "learning_rate": 0.0001964194976433285, "loss": 0.3685713768005371, "memory(GiB)": 91.64, "step": 1390, "token_acc": 0.8923533778767632, "train_speed(iter/s)": 0.137314 }, { "epoch": 0.13165345413363533, "grad_norm": 0.43366122245788574, "learning_rate": 0.00019637799676092114, "loss": 0.3690282106399536, "memory(GiB)": 91.64, "step": 1395, "token_acc": 0.8743633276740238, "train_speed(iter/s)": 0.137317 }, { "epoch": 0.1321253303133258, "grad_norm": 0.3319828510284424, "learning_rate": 0.0001963362611835832, "loss": 0.3694582462310791, "memory(GiB)": 91.64, "step": 1400, "token_acc": 0.8864421416234888, "train_speed(iter/s)": 0.137319 }, { "epoch": 0.13259720649301623, "grad_norm": 0.42545682191848755, "learning_rate": 0.00019629429101294707, "loss": 0.3619790315628052, "memory(GiB)": 91.64, "step": 1405, "token_acc": 0.8672329012069736, "train_speed(iter/s)": 0.137334 }, { "epoch": 0.13306908267270667, "grad_norm": 0.6269343495368958, "learning_rate": 0.00019625208635121646, "loss": 0.3626497983932495, "memory(GiB)": 91.64, "step": 1410, "token_acc": 0.8965665236051502, "train_speed(iter/s)": 0.137348 }, { "epoch": 0.13354095885239714, "grad_norm": 0.5690509676933289, "learning_rate": 0.00019620964730116601, "loss": 0.35851593017578126, "memory(GiB)": 91.64, "step": 1415, "token_acc": 0.8846794338051623, "train_speed(iter/s)": 0.137357 }, { "epoch": 0.13401283503208758, "grad_norm": 0.592526912689209, "learning_rate": 0.00019616697396614128, "loss": 0.3695559501647949, "memory(GiB)": 91.64, "step": 1420, "token_acc": 0.8799270072992701, "train_speed(iter/s)": 0.137358 }, { "epoch": 0.13448471121177802, "grad_norm": 0.5720744132995605, "learning_rate": 0.0001961240664500582, "loss": 0.36823060512542727, "memory(GiB)": 91.64, "step": 1425, "token_acc": 0.8664506839452844, "train_speed(iter/s)": 0.137366 }, { "epoch": 0.13495658739146849, "grad_norm": 0.854423463344574, "learning_rate": 0.00019608092485740307, "loss": 0.3638261318206787, "memory(GiB)": 91.64, "step": 1430, "token_acc": 0.8540372670807453, "train_speed(iter/s)": 0.13737 }, { "epoch": 0.13542846357115892, "grad_norm": 0.40354272723197937, "learning_rate": 0.00019603754929323214, "loss": 0.36725308895111086, "memory(GiB)": 91.64, "step": 1435, "token_acc": 0.8717277486910995, "train_speed(iter/s)": 0.13738 }, { "epoch": 0.13590033975084936, "grad_norm": 0.5269042253494263, "learning_rate": 0.00019599393986317147, "loss": 0.3656820297241211, "memory(GiB)": 91.64, "step": 1440, "token_acc": 0.8932565232124704, "train_speed(iter/s)": 0.137388 }, { "epoch": 0.13637221593053983, "grad_norm": 0.47906067967414856, "learning_rate": 0.00019595009667341655, "loss": 0.3657586097717285, "memory(GiB)": 91.64, "step": 1445, "token_acc": 0.8870967741935484, "train_speed(iter/s)": 0.137395 }, { "epoch": 0.13684409211023027, "grad_norm": 0.821492075920105, "learning_rate": 0.00019590601983073214, "loss": 0.3700244665145874, "memory(GiB)": 91.64, "step": 1450, "token_acc": 0.8615896041100031, "train_speed(iter/s)": 0.137399 }, { "epoch": 0.13731596828992074, "grad_norm": 1.1818538904190063, "learning_rate": 0.00019586170944245202, "loss": 0.36403641700744627, "memory(GiB)": 91.64, "step": 1455, "token_acc": 0.8834304746044963, "train_speed(iter/s)": 0.137405 }, { "epoch": 0.13778784446961118, "grad_norm": 0.3865146040916443, "learning_rate": 0.00019581716561647866, "loss": 0.3664146661758423, "memory(GiB)": 91.64, "step": 1460, "token_acc": 0.8741547708489857, "train_speed(iter/s)": 0.137411 }, { "epoch": 0.13825972064930162, "grad_norm": 0.8478065729141235, "learning_rate": 0.00019577238846128295, "loss": 0.37954490184783934, "memory(GiB)": 91.64, "step": 1465, "token_acc": 0.8746763335059554, "train_speed(iter/s)": 0.137423 }, { "epoch": 0.13873159682899208, "grad_norm": 0.49488478899002075, "learning_rate": 0.000195727378085904, "loss": 0.3570873737335205, "memory(GiB)": 91.64, "step": 1470, "token_acc": 0.8735310576385003, "train_speed(iter/s)": 0.137431 }, { "epoch": 0.13920347300868252, "grad_norm": 0.6807460784912109, "learning_rate": 0.0001956821345999489, "loss": 0.3615212917327881, "memory(GiB)": 91.64, "step": 1475, "token_acc": 0.8776844070961718, "train_speed(iter/s)": 0.137442 }, { "epoch": 0.13967534918837296, "grad_norm": 0.9388037323951721, "learning_rate": 0.0001956366581135923, "loss": 0.3591162204742432, "memory(GiB)": 91.64, "step": 1480, "token_acc": 0.8671586715867159, "train_speed(iter/s)": 0.137452 }, { "epoch": 0.14014722536806343, "grad_norm": 0.8437421321868896, "learning_rate": 0.0001955909487375763, "loss": 0.353442907333374, "memory(GiB)": 91.64, "step": 1485, "token_acc": 0.8862820205889395, "train_speed(iter/s)": 0.137459 }, { "epoch": 0.14061910154775387, "grad_norm": 0.6493034362792969, "learning_rate": 0.00019554500658321015, "loss": 0.3546589851379395, "memory(GiB)": 91.64, "step": 1490, "token_acc": 0.879837067209776, "train_speed(iter/s)": 0.137462 }, { "epoch": 0.1410909777274443, "grad_norm": 0.38441404700279236, "learning_rate": 0.00019549883176236987, "loss": 0.36052756309509276, "memory(GiB)": 91.64, "step": 1495, "token_acc": 0.8771869639794168, "train_speed(iter/s)": 0.137469 }, { "epoch": 0.14156285390713477, "grad_norm": 0.3363651633262634, "learning_rate": 0.00019545242438749808, "loss": 0.36542329788208006, "memory(GiB)": 91.64, "step": 1500, "token_acc": 0.871866295264624, "train_speed(iter/s)": 0.137472 }, { "epoch": 0.1420347300868252, "grad_norm": 0.371928334236145, "learning_rate": 0.0001954057845716038, "loss": 0.3581106662750244, "memory(GiB)": 91.64, "step": 1505, "token_acc": 0.8747779751332149, "train_speed(iter/s)": 0.137476 }, { "epoch": 0.14250660626651568, "grad_norm": 0.44092419743537903, "learning_rate": 0.00019535891242826193, "loss": 0.3675301313400269, "memory(GiB)": 91.64, "step": 1510, "token_acc": 0.8932142857142857, "train_speed(iter/s)": 0.137482 }, { "epoch": 0.14297848244620612, "grad_norm": 0.5470117926597595, "learning_rate": 0.00019531180807161322, "loss": 0.3621679306030273, "memory(GiB)": 91.64, "step": 1515, "token_acc": 0.8902282636573481, "train_speed(iter/s)": 0.137486 }, { "epoch": 0.14345035862589656, "grad_norm": 0.5375911593437195, "learning_rate": 0.0001952644716163639, "loss": 0.36718566417694093, "memory(GiB)": 91.64, "step": 1520, "token_acc": 0.8784158415841584, "train_speed(iter/s)": 0.137495 }, { "epoch": 0.14392223480558702, "grad_norm": 0.7067252993583679, "learning_rate": 0.00019521690317778528, "loss": 0.3592665672302246, "memory(GiB)": 91.64, "step": 1525, "token_acc": 0.8725556304787593, "train_speed(iter/s)": 0.137504 }, { "epoch": 0.14439411098527746, "grad_norm": 0.6723697185516357, "learning_rate": 0.0001951691028717138, "loss": 0.3603172779083252, "memory(GiB)": 91.64, "step": 1530, "token_acc": 0.8805710306406686, "train_speed(iter/s)": 0.13751 }, { "epoch": 0.1448659871649679, "grad_norm": 0.5974426865577698, "learning_rate": 0.0001951210708145503, "loss": 0.3683944225311279, "memory(GiB)": 91.64, "step": 1535, "token_acc": 0.8440899202320522, "train_speed(iter/s)": 0.137524 }, { "epoch": 0.14533786334465837, "grad_norm": 0.32669320702552795, "learning_rate": 0.00019507280712326006, "loss": 0.3616074562072754, "memory(GiB)": 91.64, "step": 1540, "token_acc": 0.9022123893805309, "train_speed(iter/s)": 0.137534 }, { "epoch": 0.1458097395243488, "grad_norm": 0.6083944439888, "learning_rate": 0.00019502431191537249, "loss": 0.3704042673110962, "memory(GiB)": 91.64, "step": 1545, "token_acc": 0.8691735213474344, "train_speed(iter/s)": 0.137545 }, { "epoch": 0.14628161570403925, "grad_norm": 0.4255918860435486, "learning_rate": 0.0001949755853089807, "loss": 0.3596953392028809, "memory(GiB)": 91.64, "step": 1550, "token_acc": 0.866793893129771, "train_speed(iter/s)": 0.137553 }, { "epoch": 0.14675349188372971, "grad_norm": 0.5203797221183777, "learning_rate": 0.00019492662742274134, "loss": 0.3596514701843262, "memory(GiB)": 91.64, "step": 1555, "token_acc": 0.8798060270176654, "train_speed(iter/s)": 0.137558 }, { "epoch": 0.14722536806342015, "grad_norm": 0.556632936000824, "learning_rate": 0.0001948774383758742, "loss": 0.362532377243042, "memory(GiB)": 91.64, "step": 1560, "token_acc": 0.8814923189465984, "train_speed(iter/s)": 0.137567 }, { "epoch": 0.14769724424311062, "grad_norm": 0.6500077843666077, "learning_rate": 0.00019482801828816197, "loss": 0.36057684421539304, "memory(GiB)": 91.64, "step": 1565, "token_acc": 0.8865593410707601, "train_speed(iter/s)": 0.137567 }, { "epoch": 0.14816912042280106, "grad_norm": 0.43490278720855713, "learning_rate": 0.0001947783672799501, "loss": 0.37216577529907224, "memory(GiB)": 91.64, "step": 1570, "token_acc": 0.8793157076205288, "train_speed(iter/s)": 0.137577 }, { "epoch": 0.1486409966024915, "grad_norm": 0.31350913643836975, "learning_rate": 0.0001947284854721462, "loss": 0.3571339130401611, "memory(GiB)": 91.64, "step": 1575, "token_acc": 0.8714285714285714, "train_speed(iter/s)": 0.137584 }, { "epoch": 0.14911287278218197, "grad_norm": 0.49556922912597656, "learning_rate": 0.00019467837298622003, "loss": 0.36325485706329347, "memory(GiB)": 91.64, "step": 1580, "token_acc": 0.8804733727810651, "train_speed(iter/s)": 0.137588 }, { "epoch": 0.1495847489618724, "grad_norm": 0.3206709623336792, "learning_rate": 0.00019462802994420298, "loss": 0.3566303730010986, "memory(GiB)": 91.64, "step": 1585, "token_acc": 0.8815612382234186, "train_speed(iter/s)": 0.137592 }, { "epoch": 0.15005662514156284, "grad_norm": 0.4770624041557312, "learning_rate": 0.000194577456468688, "loss": 0.3560822010040283, "memory(GiB)": 91.64, "step": 1590, "token_acc": 0.878257328990228, "train_speed(iter/s)": 0.137597 }, { "epoch": 0.1505285013212533, "grad_norm": 0.36648330092430115, "learning_rate": 0.00019452665268282905, "loss": 0.35216608047485354, "memory(GiB)": 91.64, "step": 1595, "token_acc": 0.8765673981191222, "train_speed(iter/s)": 0.137598 }, { "epoch": 0.15100037750094375, "grad_norm": 0.34637942910194397, "learning_rate": 0.00019447561871034107, "loss": 0.35890846252441405, "memory(GiB)": 91.64, "step": 1600, "token_acc": 0.8925100057175529, "train_speed(iter/s)": 0.137604 }, { "epoch": 0.1514722536806342, "grad_norm": 0.5771172642707825, "learning_rate": 0.00019442435467549937, "loss": 0.3579749345779419, "memory(GiB)": 91.64, "step": 1605, "token_acc": 0.8734729493891797, "train_speed(iter/s)": 0.137605 }, { "epoch": 0.15194412986032466, "grad_norm": 0.382007360458374, "learning_rate": 0.0001943728607031397, "loss": 0.35507354736328123, "memory(GiB)": 91.64, "step": 1610, "token_acc": 0.8889200561009818, "train_speed(iter/s)": 0.13761 }, { "epoch": 0.1524160060400151, "grad_norm": 0.3741127848625183, "learning_rate": 0.00019432113691865755, "loss": 0.3627027988433838, "memory(GiB)": 91.64, "step": 1615, "token_acc": 0.8872416891284816, "train_speed(iter/s)": 0.137613 }, { "epoch": 0.15288788221970556, "grad_norm": 0.9472887516021729, "learning_rate": 0.00019426918344800815, "loss": 0.36210317611694337, "memory(GiB)": 91.64, "step": 1620, "token_acc": 0.8653122648607976, "train_speed(iter/s)": 0.137618 }, { "epoch": 0.153359758399396, "grad_norm": 0.4501311480998993, "learning_rate": 0.00019421700041770602, "loss": 0.3565349578857422, "memory(GiB)": 91.64, "step": 1625, "token_acc": 0.8737430167597765, "train_speed(iter/s)": 0.137627 }, { "epoch": 0.15383163457908644, "grad_norm": 0.7638270854949951, "learning_rate": 0.0001941645879548247, "loss": 0.3586350679397583, "memory(GiB)": 91.64, "step": 1630, "token_acc": 0.8927091963545982, "train_speed(iter/s)": 0.137633 }, { "epoch": 0.1543035107587769, "grad_norm": 0.32415205240249634, "learning_rate": 0.00019411194618699644, "loss": 0.3630037307739258, "memory(GiB)": 91.64, "step": 1635, "token_acc": 0.8924870466321243, "train_speed(iter/s)": 0.137637 }, { "epoch": 0.15477538693846735, "grad_norm": 0.46527740359306335, "learning_rate": 0.00019405907524241184, "loss": 0.36022109985351564, "memory(GiB)": 91.64, "step": 1640, "token_acc": 0.892293114339861, "train_speed(iter/s)": 0.137643 }, { "epoch": 0.15524726311815779, "grad_norm": 0.47606101632118225, "learning_rate": 0.00019400597524981965, "loss": 0.3650399684906006, "memory(GiB)": 91.64, "step": 1645, "token_acc": 0.8674858850880106, "train_speed(iter/s)": 0.137657 }, { "epoch": 0.15571913929784825, "grad_norm": 0.45343896746635437, "learning_rate": 0.0001939526463385263, "loss": 0.353916597366333, "memory(GiB)": 91.64, "step": 1650, "token_acc": 0.8914905768132496, "train_speed(iter/s)": 0.13767 }, { "epoch": 0.1561910154775387, "grad_norm": 0.7184441685676575, "learning_rate": 0.00019389908863839573, "loss": 0.3685162544250488, "memory(GiB)": 91.64, "step": 1655, "token_acc": 0.8822937625754527, "train_speed(iter/s)": 0.13768 }, { "epoch": 0.15666289165722913, "grad_norm": 0.5629715919494629, "learning_rate": 0.00019384530227984902, "loss": 0.3554409027099609, "memory(GiB)": 91.64, "step": 1660, "token_acc": 0.8831908831908832, "train_speed(iter/s)": 0.137687 }, { "epoch": 0.1571347678369196, "grad_norm": 0.39644697308540344, "learning_rate": 0.00019379128739386404, "loss": 0.351816725730896, "memory(GiB)": 91.64, "step": 1665, "token_acc": 0.8597662771285476, "train_speed(iter/s)": 0.137691 }, { "epoch": 0.15760664401661004, "grad_norm": 0.6162962317466736, "learning_rate": 0.00019373704411197517, "loss": 0.3518479585647583, "memory(GiB)": 91.64, "step": 1670, "token_acc": 0.8764940239043825, "train_speed(iter/s)": 0.137697 }, { "epoch": 0.1580785201963005, "grad_norm": 0.30298227071762085, "learning_rate": 0.000193682572566273, "loss": 0.35238142013549806, "memory(GiB)": 91.64, "step": 1675, "token_acc": 0.8691389599317988, "train_speed(iter/s)": 0.137704 }, { "epoch": 0.15855039637599094, "grad_norm": 0.5133036375045776, "learning_rate": 0.00019362787288940383, "loss": 0.36153383255004884, "memory(GiB)": 91.64, "step": 1680, "token_acc": 0.894973436861463, "train_speed(iter/s)": 0.137709 }, { "epoch": 0.15902227255568138, "grad_norm": 0.2759738266468048, "learning_rate": 0.0001935729452145697, "loss": 0.3581871509552002, "memory(GiB)": 91.64, "step": 1685, "token_acc": 0.8885003885003885, "train_speed(iter/s)": 0.137718 }, { "epoch": 0.15949414873537185, "grad_norm": 0.38993126153945923, "learning_rate": 0.0001935177896755278, "loss": 0.35147097110748293, "memory(GiB)": 91.64, "step": 1690, "token_acc": 0.887071240105541, "train_speed(iter/s)": 0.137727 }, { "epoch": 0.1599660249150623, "grad_norm": 0.6348713040351868, "learning_rate": 0.00019346240640659012, "loss": 0.3537603378295898, "memory(GiB)": 91.64, "step": 1695, "token_acc": 0.8848167539267016, "train_speed(iter/s)": 0.137723 }, { "epoch": 0.16043790109475273, "grad_norm": 0.4917113184928894, "learning_rate": 0.00019340679554262323, "loss": 0.35138711929321287, "memory(GiB)": 91.64, "step": 1700, "token_acc": 0.8826676176890157, "train_speed(iter/s)": 0.137738 }, { "epoch": 0.1609097772744432, "grad_norm": 0.5605958104133606, "learning_rate": 0.000193350957219048, "loss": 0.35388185977935793, "memory(GiB)": 91.64, "step": 1705, "token_acc": 0.8839228295819935, "train_speed(iter/s)": 0.137745 }, { "epoch": 0.16138165345413363, "grad_norm": 0.4609486162662506, "learning_rate": 0.0001932948915718391, "loss": 0.3516982555389404, "memory(GiB)": 91.64, "step": 1710, "token_acc": 0.8835051546391752, "train_speed(iter/s)": 0.137747 }, { "epoch": 0.16185352963382407, "grad_norm": 0.4357410967350006, "learning_rate": 0.00019323859873752493, "loss": 0.3436026096343994, "memory(GiB)": 91.64, "step": 1715, "token_acc": 0.8933823529411765, "train_speed(iter/s)": 0.137753 }, { "epoch": 0.16232540581351454, "grad_norm": 1.353634238243103, "learning_rate": 0.0001931820788531869, "loss": 0.3458723068237305, "memory(GiB)": 91.64, "step": 1720, "token_acc": 0.8818590704647676, "train_speed(iter/s)": 0.13776 }, { "epoch": 0.16279728199320498, "grad_norm": 1.2291808128356934, "learning_rate": 0.0001931253320564595, "loss": 0.35339980125427245, "memory(GiB)": 91.64, "step": 1725, "token_acc": 0.8876443822191109, "train_speed(iter/s)": 0.137759 }, { "epoch": 0.16326915817289545, "grad_norm": 0.3997421860694885, "learning_rate": 0.0001930683584855297, "loss": 0.35820183753967283, "memory(GiB)": 91.64, "step": 1730, "token_acc": 0.8761955366631243, "train_speed(iter/s)": 0.137767 }, { "epoch": 0.16374103435258588, "grad_norm": 0.3965195119380951, "learning_rate": 0.00019301115827913672, "loss": 0.3507367134094238, "memory(GiB)": 91.64, "step": 1735, "token_acc": 0.887131252672082, "train_speed(iter/s)": 0.137769 }, { "epoch": 0.16421291053227632, "grad_norm": 0.6176219582557678, "learning_rate": 0.0001929537315765717, "loss": 0.3522993326187134, "memory(GiB)": 91.64, "step": 1740, "token_acc": 0.9011488111140796, "train_speed(iter/s)": 0.13777 }, { "epoch": 0.1646847867119668, "grad_norm": 0.5894827842712402, "learning_rate": 0.00019289607851767727, "loss": 0.3515509843826294, "memory(GiB)": 91.64, "step": 1745, "token_acc": 0.9020016680567139, "train_speed(iter/s)": 0.13777 }, { "epoch": 0.16515666289165723, "grad_norm": 0.5238110423088074, "learning_rate": 0.00019283819924284732, "loss": 0.3614342212677002, "memory(GiB)": 91.64, "step": 1750, "token_acc": 0.868234415826801, "train_speed(iter/s)": 0.13777 }, { "epoch": 0.16562853907134767, "grad_norm": 0.6640599370002747, "learning_rate": 0.0001927800938930266, "loss": 0.3606734752655029, "memory(GiB)": 91.64, "step": 1755, "token_acc": 0.8917274939172749, "train_speed(iter/s)": 0.13777 }, { "epoch": 0.16610041525103814, "grad_norm": 0.3279666602611542, "learning_rate": 0.00019272176260971038, "loss": 0.3470527410507202, "memory(GiB)": 91.64, "step": 1760, "token_acc": 0.8820662768031189, "train_speed(iter/s)": 0.137768 }, { "epoch": 0.16657229143072858, "grad_norm": 0.557361900806427, "learning_rate": 0.00019266320553494413, "loss": 0.36066641807556155, "memory(GiB)": 91.64, "step": 1765, "token_acc": 0.8856569709127382, "train_speed(iter/s)": 0.137766 }, { "epoch": 0.16704416761041901, "grad_norm": 0.5631670951843262, "learning_rate": 0.00019260442281132314, "loss": 0.347049617767334, "memory(GiB)": 91.64, "step": 1770, "token_acc": 0.8933909000332115, "train_speed(iter/s)": 0.137773 }, { "epoch": 0.16751604379010948, "grad_norm": 0.3367120921611786, "learning_rate": 0.00019254541458199218, "loss": 0.3531044483184814, "memory(GiB)": 91.64, "step": 1775, "token_acc": 0.866046511627907, "train_speed(iter/s)": 0.137778 }, { "epoch": 0.16798791996979992, "grad_norm": 0.30985838174819946, "learning_rate": 0.00019248618099064517, "loss": 0.3543743133544922, "memory(GiB)": 91.64, "step": 1780, "token_acc": 0.8813512004466778, "train_speed(iter/s)": 0.137783 }, { "epoch": 0.1684597961494904, "grad_norm": 0.29574140906333923, "learning_rate": 0.00019242672218152483, "loss": 0.3577969312667847, "memory(GiB)": 91.64, "step": 1785, "token_acc": 0.8939130434782608, "train_speed(iter/s)": 0.137789 }, { "epoch": 0.16893167232918083, "grad_norm": 0.5875484943389893, "learning_rate": 0.00019236703829942232, "loss": 0.350512170791626, "memory(GiB)": 91.64, "step": 1790, "token_acc": 0.8707829408020369, "train_speed(iter/s)": 0.137795 }, { "epoch": 0.16940354850887127, "grad_norm": 1.0582468509674072, "learning_rate": 0.0001923071294896768, "loss": 0.3436901330947876, "memory(GiB)": 91.64, "step": 1795, "token_acc": 0.8852157943067034, "train_speed(iter/s)": 0.137799 }, { "epoch": 0.16987542468856173, "grad_norm": 0.3438156843185425, "learning_rate": 0.00019224699589817537, "loss": 0.34713518619537354, "memory(GiB)": 91.64, "step": 1800, "token_acc": 0.8709556057185854, "train_speed(iter/s)": 0.137799 }, { "epoch": 0.17034730086825217, "grad_norm": 0.38124313950538635, "learning_rate": 0.00019218663767135233, "loss": 0.3550222396850586, "memory(GiB)": 91.64, "step": 1805, "token_acc": 0.8765864332603939, "train_speed(iter/s)": 0.137801 }, { "epoch": 0.1708191770479426, "grad_norm": 0.4326140582561493, "learning_rate": 0.00019212605495618897, "loss": 0.3468668460845947, "memory(GiB)": 91.64, "step": 1810, "token_acc": 0.8850841555426582, "train_speed(iter/s)": 0.137805 }, { "epoch": 0.17129105322763308, "grad_norm": 0.43024584650993347, "learning_rate": 0.0001920652479002134, "loss": 0.34256269931793215, "memory(GiB)": 91.64, "step": 1815, "token_acc": 0.870197904540163, "train_speed(iter/s)": 0.137809 }, { "epoch": 0.17176292940732352, "grad_norm": 0.3962891101837158, "learning_rate": 0.00019200421665149998, "loss": 0.35373740196228026, "memory(GiB)": 91.64, "step": 1820, "token_acc": 0.8898639754278193, "train_speed(iter/s)": 0.137813 }, { "epoch": 0.17223480558701396, "grad_norm": 0.3830450177192688, "learning_rate": 0.00019194296135866893, "loss": 0.3469409704208374, "memory(GiB)": 91.64, "step": 1825, "token_acc": 0.8796900489396411, "train_speed(iter/s)": 0.137819 }, { "epoch": 0.17270668176670442, "grad_norm": 0.5176311731338501, "learning_rate": 0.0001918814821708861, "loss": 0.3452960968017578, "memory(GiB)": 91.64, "step": 1830, "token_acc": 0.8795876288659794, "train_speed(iter/s)": 0.137818 }, { "epoch": 0.17317855794639486, "grad_norm": 0.3469870984554291, "learning_rate": 0.00019181977923786258, "loss": 0.35071775913238523, "memory(GiB)": 91.64, "step": 1835, "token_acc": 0.8695652173913043, "train_speed(iter/s)": 0.137825 }, { "epoch": 0.17365043412608533, "grad_norm": 0.3066750764846802, "learning_rate": 0.00019175785270985433, "loss": 0.3505476713180542, "memory(GiB)": 91.64, "step": 1840, "token_acc": 0.8618200567156483, "train_speed(iter/s)": 0.137819 }, { "epoch": 0.17412231030577577, "grad_norm": 0.6881127953529358, "learning_rate": 0.00019169570273766176, "loss": 0.3488409996032715, "memory(GiB)": 91.64, "step": 1845, "token_acc": 0.8822618125484121, "train_speed(iter/s)": 0.137815 }, { "epoch": 0.1745941864854662, "grad_norm": 0.5931531190872192, "learning_rate": 0.0001916333294726294, "loss": 0.3492976188659668, "memory(GiB)": 91.64, "step": 1850, "token_acc": 0.89794921875, "train_speed(iter/s)": 0.137818 }, { "epoch": 0.17506606266515667, "grad_norm": 0.6179839968681335, "learning_rate": 0.00019157073306664554, "loss": 0.3676631212234497, "memory(GiB)": 91.64, "step": 1855, "token_acc": 0.8740814963259853, "train_speed(iter/s)": 0.13782 }, { "epoch": 0.1755379388448471, "grad_norm": 0.5834555625915527, "learning_rate": 0.00019150791367214182, "loss": 0.3476147174835205, "memory(GiB)": 91.64, "step": 1860, "token_acc": 0.8826677645121449, "train_speed(iter/s)": 0.137819 }, { "epoch": 0.17600981502453755, "grad_norm": 0.35329657793045044, "learning_rate": 0.000191444871442093, "loss": 0.35233583450317385, "memory(GiB)": 91.64, "step": 1865, "token_acc": 0.8933369156367544, "train_speed(iter/s)": 0.13782 }, { "epoch": 0.17648169120422802, "grad_norm": 0.35915401577949524, "learning_rate": 0.00019138160653001633, "loss": 0.34854936599731445, "memory(GiB)": 91.64, "step": 1870, "token_acc": 0.8787451533309835, "train_speed(iter/s)": 0.137821 }, { "epoch": 0.17695356738391846, "grad_norm": 0.8864762783050537, "learning_rate": 0.00019131811908997142, "loss": 0.34482736587524415, "memory(GiB)": 91.64, "step": 1875, "token_acc": 0.8787006578947368, "train_speed(iter/s)": 0.137827 }, { "epoch": 0.1774254435636089, "grad_norm": 0.4602474868297577, "learning_rate": 0.00019125440927655974, "loss": 0.3450265645980835, "memory(GiB)": 91.64, "step": 1880, "token_acc": 0.8929677134011499, "train_speed(iter/s)": 0.137831 }, { "epoch": 0.17789731974329936, "grad_norm": 0.35482195019721985, "learning_rate": 0.00019119047724492426, "loss": 0.33423638343811035, "memory(GiB)": 91.64, "step": 1885, "token_acc": 0.8888008452192288, "train_speed(iter/s)": 0.137832 }, { "epoch": 0.1783691959229898, "grad_norm": 0.9185347557067871, "learning_rate": 0.00019112632315074915, "loss": 0.35080528259277344, "memory(GiB)": 91.64, "step": 1890, "token_acc": 0.8741188318227593, "train_speed(iter/s)": 0.137838 }, { "epoch": 0.17884107210268024, "grad_norm": 0.3791806399822235, "learning_rate": 0.00019106194715025926, "loss": 0.3456167459487915, "memory(GiB)": 91.64, "step": 1895, "token_acc": 0.8879935535858179, "train_speed(iter/s)": 0.137842 }, { "epoch": 0.1793129482823707, "grad_norm": 0.5742256045341492, "learning_rate": 0.00019099734940021982, "loss": 0.3473632335662842, "memory(GiB)": 91.64, "step": 1900, "token_acc": 0.8950050968399592, "train_speed(iter/s)": 0.13785 }, { "epoch": 0.17978482446206115, "grad_norm": 0.49934887886047363, "learning_rate": 0.00019093253005793607, "loss": 0.3430349111557007, "memory(GiB)": 91.64, "step": 1905, "token_acc": 0.8664504716981132, "train_speed(iter/s)": 0.137855 }, { "epoch": 0.18025670064175162, "grad_norm": 0.5481332540512085, "learning_rate": 0.00019086748928125294, "loss": 0.3437246322631836, "memory(GiB)": 91.64, "step": 1910, "token_acc": 0.8839852892009361, "train_speed(iter/s)": 0.137858 }, { "epoch": 0.18072857682144206, "grad_norm": 0.28932511806488037, "learning_rate": 0.00019080222722855442, "loss": 0.3524455547332764, "memory(GiB)": 91.64, "step": 1915, "token_acc": 0.8890915724188252, "train_speed(iter/s)": 0.137863 }, { "epoch": 0.1812004530011325, "grad_norm": 0.8510509133338928, "learning_rate": 0.00019073674405876347, "loss": 0.3491034030914307, "memory(GiB)": 91.64, "step": 1920, "token_acc": 0.8880655226209049, "train_speed(iter/s)": 0.137866 }, { "epoch": 0.18167232918082296, "grad_norm": 0.316115140914917, "learning_rate": 0.00019067103993134152, "loss": 0.3515012264251709, "memory(GiB)": 91.64, "step": 1925, "token_acc": 0.8913354303688876, "train_speed(iter/s)": 0.137862 }, { "epoch": 0.1821442053605134, "grad_norm": 0.3926486670970917, "learning_rate": 0.00019060511500628794, "loss": 0.35095829963684083, "memory(GiB)": 91.64, "step": 1930, "token_acc": 0.8682614555256065, "train_speed(iter/s)": 0.137868 }, { "epoch": 0.18261608154020384, "grad_norm": 0.35603636503219604, "learning_rate": 0.00019053896944413984, "loss": 0.34353179931640626, "memory(GiB)": 91.64, "step": 1935, "token_acc": 0.8846398980242193, "train_speed(iter/s)": 0.13787 }, { "epoch": 0.1830879577198943, "grad_norm": 0.43732625246047974, "learning_rate": 0.0001904726034059717, "loss": 0.3492013454437256, "memory(GiB)": 91.64, "step": 1940, "token_acc": 0.8862512363996043, "train_speed(iter/s)": 0.137869 }, { "epoch": 0.18355983389958475, "grad_norm": 0.4010016918182373, "learning_rate": 0.00019040601705339473, "loss": 0.34664440155029297, "memory(GiB)": 91.64, "step": 1945, "token_acc": 0.9094759131815775, "train_speed(iter/s)": 0.137868 }, { "epoch": 0.18403171007927518, "grad_norm": 0.6890151500701904, "learning_rate": 0.00019033921054855676, "loss": 0.34554276466369627, "memory(GiB)": 91.64, "step": 1950, "token_acc": 0.8837018837018837, "train_speed(iter/s)": 0.137871 }, { "epoch": 0.18450358625896565, "grad_norm": 0.36494889855384827, "learning_rate": 0.00019027218405414168, "loss": 0.34601216316223143, "memory(GiB)": 91.64, "step": 1955, "token_acc": 0.8904270986745213, "train_speed(iter/s)": 0.137876 }, { "epoch": 0.1849754624386561, "grad_norm": 0.3291149437427521, "learning_rate": 0.0001902049377333691, "loss": 0.34858098030090334, "memory(GiB)": 91.64, "step": 1960, "token_acc": 0.8801042571676803, "train_speed(iter/s)": 0.137876 }, { "epoch": 0.18544733861834656, "grad_norm": 0.3290327489376068, "learning_rate": 0.0001901374717499939, "loss": 0.343774151802063, "memory(GiB)": 91.64, "step": 1965, "token_acc": 0.8665939658306071, "train_speed(iter/s)": 0.137879 }, { "epoch": 0.185919214798037, "grad_norm": 0.7721136212348938, "learning_rate": 0.000190069786268306, "loss": 0.3448851823806763, "memory(GiB)": 91.64, "step": 1970, "token_acc": 0.907328730748805, "train_speed(iter/s)": 0.137886 }, { "epoch": 0.18639109097772744, "grad_norm": 0.42297878861427307, "learning_rate": 0.00019000188145312964, "loss": 0.34144585132598876, "memory(GiB)": 91.64, "step": 1975, "token_acc": 0.8630643967431533, "train_speed(iter/s)": 0.137895 }, { "epoch": 0.1868629671574179, "grad_norm": 0.35999032855033875, "learning_rate": 0.00018993375746982331, "loss": 0.3448899030685425, "memory(GiB)": 91.64, "step": 1980, "token_acc": 0.8727481793790725, "train_speed(iter/s)": 0.137894 }, { "epoch": 0.18733484333710834, "grad_norm": 0.5476382374763489, "learning_rate": 0.00018986541448427915, "loss": 0.34505505561828614, "memory(GiB)": 91.64, "step": 1985, "token_acc": 0.8727154046997389, "train_speed(iter/s)": 0.137901 }, { "epoch": 0.18780671951679878, "grad_norm": 0.459293931722641, "learning_rate": 0.00018979685266292263, "loss": 0.34939863681793215, "memory(GiB)": 91.64, "step": 1990, "token_acc": 0.859118086696562, "train_speed(iter/s)": 0.137908 }, { "epoch": 0.18827859569648925, "grad_norm": 0.3321842551231384, "learning_rate": 0.00018972807217271207, "loss": 0.35269508361816404, "memory(GiB)": 91.64, "step": 1995, "token_acc": 0.8570105003088326, "train_speed(iter/s)": 0.137911 }, { "epoch": 0.1887504718761797, "grad_norm": 0.301276296377182, "learning_rate": 0.00018965907318113838, "loss": 0.34697985649108887, "memory(GiB)": 91.64, "step": 2000, "token_acc": 0.8835132117603275, "train_speed(iter/s)": 0.13791 }, { "epoch": 0.18922234805587013, "grad_norm": 0.2910401523113251, "learning_rate": 0.00018958985585622445, "loss": 0.3419046878814697, "memory(GiB)": 91.64, "step": 2005, "token_acc": 0.8818718764198091, "train_speed(iter/s)": 0.137915 }, { "epoch": 0.1896942242355606, "grad_norm": 0.7058345079421997, "learning_rate": 0.00018952042036652486, "loss": 0.34183340072631835, "memory(GiB)": 91.64, "step": 2010, "token_acc": 0.8762214983713354, "train_speed(iter/s)": 0.137919 }, { "epoch": 0.19016610041525103, "grad_norm": 0.30306434631347656, "learning_rate": 0.00018945076688112552, "loss": 0.3396963834762573, "memory(GiB)": 91.64, "step": 2015, "token_acc": 0.8915574963609898, "train_speed(iter/s)": 0.137921 }, { "epoch": 0.1906379765949415, "grad_norm": 0.48909690976142883, "learning_rate": 0.0001893808955696432, "loss": 0.3448690176010132, "memory(GiB)": 91.64, "step": 2020, "token_acc": 0.8720196353436185, "train_speed(iter/s)": 0.137919 }, { "epoch": 0.19110985277463194, "grad_norm": 0.6389379501342773, "learning_rate": 0.00018931080660222497, "loss": 0.3356884002685547, "memory(GiB)": 91.64, "step": 2025, "token_acc": 0.8754716981132076, "train_speed(iter/s)": 0.137926 }, { "epoch": 0.19158172895432238, "grad_norm": 0.39773041009902954, "learning_rate": 0.00018924050014954805, "loss": 0.34584047794342043, "memory(GiB)": 91.64, "step": 2030, "token_acc": 0.9035667107001321, "train_speed(iter/s)": 0.137923 }, { "epoch": 0.19205360513401284, "grad_norm": 0.30064910650253296, "learning_rate": 0.00018916997638281923, "loss": 0.34599852561950684, "memory(GiB)": 91.64, "step": 2035, "token_acc": 0.8658097686375321, "train_speed(iter/s)": 0.137924 }, { "epoch": 0.19252548131370328, "grad_norm": 0.3374609351158142, "learning_rate": 0.00018909923547377454, "loss": 0.35094680786132815, "memory(GiB)": 91.64, "step": 2040, "token_acc": 0.8824691358024691, "train_speed(iter/s)": 0.137931 }, { "epoch": 0.19299735749339372, "grad_norm": 0.30539119243621826, "learning_rate": 0.00018902827759467868, "loss": 0.3404444932937622, "memory(GiB)": 91.64, "step": 2045, "token_acc": 0.8939899833055092, "train_speed(iter/s)": 0.137935 }, { "epoch": 0.1934692336730842, "grad_norm": 0.2880310118198395, "learning_rate": 0.00018895710291832484, "loss": 0.3389493703842163, "memory(GiB)": 91.64, "step": 2050, "token_acc": 0.872093023255814, "train_speed(iter/s)": 0.137938 }, { "epoch": 0.19394110985277463, "grad_norm": 0.4194709360599518, "learning_rate": 0.00018888571161803402, "loss": 0.3463752269744873, "memory(GiB)": 91.64, "step": 2055, "token_acc": 0.883762732174955, "train_speed(iter/s)": 0.137944 }, { "epoch": 0.19441298603246507, "grad_norm": 1.0667372941970825, "learning_rate": 0.00018881410386765478, "loss": 0.3408158540725708, "memory(GiB)": 91.64, "step": 2060, "token_acc": 0.8831054256726952, "train_speed(iter/s)": 0.137947 }, { "epoch": 0.19488486221215554, "grad_norm": 0.5454498529434204, "learning_rate": 0.00018874227984156278, "loss": 0.35147881507873535, "memory(GiB)": 91.64, "step": 2065, "token_acc": 0.8823058446757406, "train_speed(iter/s)": 0.137952 }, { "epoch": 0.19535673839184597, "grad_norm": 0.3480747640132904, "learning_rate": 0.00018867023971466036, "loss": 0.34201037883758545, "memory(GiB)": 91.64, "step": 2070, "token_acc": 0.8867823765020026, "train_speed(iter/s)": 0.137954 }, { "epoch": 0.19582861457153644, "grad_norm": 0.4492743909358978, "learning_rate": 0.00018859798366237604, "loss": 0.33846278190612794, "memory(GiB)": 91.64, "step": 2075, "token_acc": 0.8773491592482691, "train_speed(iter/s)": 0.137958 }, { "epoch": 0.19630049075122688, "grad_norm": 0.5267034769058228, "learning_rate": 0.0001885255118606642, "loss": 0.3435330390930176, "memory(GiB)": 91.64, "step": 2080, "token_acc": 0.8721130644605308, "train_speed(iter/s)": 0.137965 }, { "epoch": 0.19677236693091732, "grad_norm": 0.28133124113082886, "learning_rate": 0.0001884528244860046, "loss": 0.3497642755508423, "memory(GiB)": 91.64, "step": 2085, "token_acc": 0.8880718954248366, "train_speed(iter/s)": 0.137966 }, { "epoch": 0.1972442431106078, "grad_norm": 0.2900506556034088, "learning_rate": 0.0001883799217154019, "loss": 0.3394758224487305, "memory(GiB)": 91.64, "step": 2090, "token_acc": 0.8921568627450981, "train_speed(iter/s)": 0.137962 }, { "epoch": 0.19771611929029823, "grad_norm": 0.8465926647186279, "learning_rate": 0.00018830680372638537, "loss": 0.3450798034667969, "memory(GiB)": 91.64, "step": 2095, "token_acc": 0.8832747041893189, "train_speed(iter/s)": 0.137967 }, { "epoch": 0.19818799546998866, "grad_norm": 0.5081422328948975, "learning_rate": 0.00018823347069700828, "loss": 0.33031363487243653, "memory(GiB)": 91.64, "step": 2100, "token_acc": 0.8912106135986733, "train_speed(iter/s)": 0.137971 }, { "epoch": 0.19865987164967913, "grad_norm": 0.4367571175098419, "learning_rate": 0.00018815992280584763, "loss": 0.3436918258666992, "memory(GiB)": 91.64, "step": 2105, "token_acc": 0.8999052731291443, "train_speed(iter/s)": 0.137972 }, { "epoch": 0.19913174782936957, "grad_norm": 0.3542596399784088, "learning_rate": 0.00018808616023200357, "loss": 0.3388987064361572, "memory(GiB)": 91.64, "step": 2110, "token_acc": 0.8867167919799499, "train_speed(iter/s)": 0.137975 }, { "epoch": 0.19960362400906, "grad_norm": 0.5012041926383972, "learning_rate": 0.00018801218315509912, "loss": 0.3462409019470215, "memory(GiB)": 91.64, "step": 2115, "token_acc": 0.8946236559139785, "train_speed(iter/s)": 0.137983 }, { "epoch": 0.20007550018875048, "grad_norm": 0.6917479634284973, "learning_rate": 0.00018793799175527954, "loss": 0.34777753353118895, "memory(GiB)": 91.64, "step": 2120, "token_acc": 0.886994775914215, "train_speed(iter/s)": 0.137984 }, { "epoch": 0.20054737636844092, "grad_norm": 0.299472451210022, "learning_rate": 0.00018786358621321211, "loss": 0.3416252136230469, "memory(GiB)": 91.64, "step": 2125, "token_acc": 0.879359095193214, "train_speed(iter/s)": 0.137994 }, { "epoch": 0.20101925254813138, "grad_norm": 0.542940080165863, "learning_rate": 0.0001877889667100855, "loss": 0.3419647216796875, "memory(GiB)": 91.64, "step": 2130, "token_acc": 0.8806986382474837, "train_speed(iter/s)": 0.137995 }, { "epoch": 0.20149112872782182, "grad_norm": 0.34102553129196167, "learning_rate": 0.00018771413342760944, "loss": 0.3418309688568115, "memory(GiB)": 91.64, "step": 2135, "token_acc": 0.8886278195488722, "train_speed(iter/s)": 0.137999 }, { "epoch": 0.20196300490751226, "grad_norm": 0.4004146158695221, "learning_rate": 0.00018763908654801422, "loss": 0.33717515468597414, "memory(GiB)": 91.64, "step": 2140, "token_acc": 0.894689870593485, "train_speed(iter/s)": 0.138001 }, { "epoch": 0.20243488108720273, "grad_norm": 0.5726518034934998, "learning_rate": 0.0001875638262540503, "loss": 0.3408660888671875, "memory(GiB)": 91.64, "step": 2145, "token_acc": 0.8878548161935784, "train_speed(iter/s)": 0.138006 }, { "epoch": 0.20290675726689317, "grad_norm": 0.4096904397010803, "learning_rate": 0.00018748835272898781, "loss": 0.3478860378265381, "memory(GiB)": 91.64, "step": 2150, "token_acc": 0.8895281933256617, "train_speed(iter/s)": 0.13801 }, { "epoch": 0.2033786334465836, "grad_norm": 0.2845887243747711, "learning_rate": 0.0001874126661566162, "loss": 0.3438119888305664, "memory(GiB)": 91.64, "step": 2155, "token_acc": 0.8817876021143681, "train_speed(iter/s)": 0.138015 }, { "epoch": 0.20385050962627407, "grad_norm": 0.49546805024147034, "learning_rate": 0.00018733676672124362, "loss": 0.3439002990722656, "memory(GiB)": 91.64, "step": 2160, "token_acc": 0.8995042812077513, "train_speed(iter/s)": 0.138018 }, { "epoch": 0.2043223858059645, "grad_norm": 0.6575082540512085, "learning_rate": 0.00018726065460769663, "loss": 0.3439802885055542, "memory(GiB)": 91.64, "step": 2165, "token_acc": 0.8819188191881919, "train_speed(iter/s)": 0.138019 }, { "epoch": 0.20479426198565495, "grad_norm": 0.678426206111908, "learning_rate": 0.00018718433000131966, "loss": 0.33720600605010986, "memory(GiB)": 91.64, "step": 2170, "token_acc": 0.8794220229197808, "train_speed(iter/s)": 0.138026 }, { "epoch": 0.20526613816534542, "grad_norm": 0.5582923293113708, "learning_rate": 0.00018710779308797468, "loss": 0.3416036605834961, "memory(GiB)": 91.64, "step": 2175, "token_acc": 0.8883531157270029, "train_speed(iter/s)": 0.138031 }, { "epoch": 0.20573801434503586, "grad_norm": 0.39158037304878235, "learning_rate": 0.00018703104405404055, "loss": 0.335191011428833, "memory(GiB)": 91.64, "step": 2180, "token_acc": 0.9014705882352941, "train_speed(iter/s)": 0.138035 }, { "epoch": 0.20620989052472632, "grad_norm": 0.45636487007141113, "learning_rate": 0.00018695408308641272, "loss": 0.34262187480926515, "memory(GiB)": 91.64, "step": 2185, "token_acc": 0.883441258094357, "train_speed(iter/s)": 0.138032 }, { "epoch": 0.20668176670441676, "grad_norm": 0.8102078437805176, "learning_rate": 0.00018687691037250277, "loss": 0.33262019157409667, "memory(GiB)": 91.64, "step": 2190, "token_acc": 0.8790444511641972, "train_speed(iter/s)": 0.138034 }, { "epoch": 0.2071536428841072, "grad_norm": 0.31004661321640015, "learning_rate": 0.0001867995261002378, "loss": 0.34704036712646485, "memory(GiB)": 91.64, "step": 2195, "token_acc": 0.895397489539749, "train_speed(iter/s)": 0.138038 }, { "epoch": 0.20762551906379767, "grad_norm": 0.8132814168930054, "learning_rate": 0.00018672193045806023, "loss": 0.34265220165252686, "memory(GiB)": 91.64, "step": 2200, "token_acc": 0.8636037329504667, "train_speed(iter/s)": 0.138039 }, { "epoch": 0.2080973952434881, "grad_norm": 0.7821739315986633, "learning_rate": 0.00018664412363492708, "loss": 0.33681278228759765, "memory(GiB)": 91.64, "step": 2205, "token_acc": 0.8768713718301253, "train_speed(iter/s)": 0.13804 }, { "epoch": 0.20856927142317855, "grad_norm": 0.6573207974433899, "learning_rate": 0.00018656610582030975, "loss": 0.3311905860900879, "memory(GiB)": 91.64, "step": 2210, "token_acc": 0.8870541611624835, "train_speed(iter/s)": 0.138041 }, { "epoch": 0.20904114760286902, "grad_norm": 0.5945848226547241, "learning_rate": 0.0001864878772041933, "loss": 0.34422693252563474, "memory(GiB)": 91.64, "step": 2215, "token_acc": 0.8831013916500994, "train_speed(iter/s)": 0.138044 }, { "epoch": 0.20951302378255945, "grad_norm": 0.5761186480522156, "learning_rate": 0.00018640943797707622, "loss": 0.3473784923553467, "memory(GiB)": 91.64, "step": 2220, "token_acc": 0.8858757062146893, "train_speed(iter/s)": 0.138045 }, { "epoch": 0.2099848999622499, "grad_norm": 0.3242824077606201, "learning_rate": 0.00018633078832996978, "loss": 0.33851065635681155, "memory(GiB)": 91.64, "step": 2225, "token_acc": 0.8659020732245258, "train_speed(iter/s)": 0.13805 }, { "epoch": 0.21045677614194036, "grad_norm": 0.5747153759002686, "learning_rate": 0.0001862519284543978, "loss": 0.33396263122558595, "memory(GiB)": 91.64, "step": 2230, "token_acc": 0.8875242404654169, "train_speed(iter/s)": 0.138054 }, { "epoch": 0.2109286523216308, "grad_norm": 0.742813229560852, "learning_rate": 0.00018617285854239586, "loss": 0.3415235996246338, "memory(GiB)": 91.64, "step": 2235, "token_acc": 0.8529603122966819, "train_speed(iter/s)": 0.138056 }, { "epoch": 0.21140052850132127, "grad_norm": 0.5336731672286987, "learning_rate": 0.00018609357878651115, "loss": 0.33643336296081544, "memory(GiB)": 91.64, "step": 2240, "token_acc": 0.9009042954031651, "train_speed(iter/s)": 0.138057 }, { "epoch": 0.2118724046810117, "grad_norm": 0.3660552203655243, "learning_rate": 0.00018601408937980182, "loss": 0.3392070770263672, "memory(GiB)": 91.64, "step": 2245, "token_acc": 0.8835978835978836, "train_speed(iter/s)": 0.138053 }, { "epoch": 0.21234428086070214, "grad_norm": 0.5543057322502136, "learning_rate": 0.00018593439051583653, "loss": 0.3458813428878784, "memory(GiB)": 91.64, "step": 2250, "token_acc": 0.8912521440823328, "train_speed(iter/s)": 0.138054 }, { "epoch": 0.2128161570403926, "grad_norm": 0.2876913845539093, "learning_rate": 0.00018585448238869393, "loss": 0.33101418018341067, "memory(GiB)": 91.64, "step": 2255, "token_acc": 0.8919026725169525, "train_speed(iter/s)": 0.138055 }, { "epoch": 0.21328803322008305, "grad_norm": 0.3627128303050995, "learning_rate": 0.00018577436519296247, "loss": 0.3388057231903076, "memory(GiB)": 91.64, "step": 2260, "token_acc": 0.8724961479198767, "train_speed(iter/s)": 0.138057 }, { "epoch": 0.2137599093997735, "grad_norm": 0.5142070055007935, "learning_rate": 0.00018569403912373951, "loss": 0.3349132061004639, "memory(GiB)": 91.64, "step": 2265, "token_acc": 0.8903488898957861, "train_speed(iter/s)": 0.138059 }, { "epoch": 0.21423178557946396, "grad_norm": 0.29270824790000916, "learning_rate": 0.00018561350437663115, "loss": 0.34359285831451414, "memory(GiB)": 91.64, "step": 2270, "token_acc": 0.8882921589688507, "train_speed(iter/s)": 0.138061 }, { "epoch": 0.2147036617591544, "grad_norm": 1.000394344329834, "learning_rate": 0.00018553276114775157, "loss": 0.34334120750427244, "memory(GiB)": 91.64, "step": 2275, "token_acc": 0.9124653739612189, "train_speed(iter/s)": 0.138058 }, { "epoch": 0.21517553793884484, "grad_norm": 0.6676534414291382, "learning_rate": 0.00018545180963372272, "loss": 0.33390681743621825, "memory(GiB)": 91.64, "step": 2280, "token_acc": 0.8856531049250536, "train_speed(iter/s)": 0.138056 }, { "epoch": 0.2156474141185353, "grad_norm": 0.3300536870956421, "learning_rate": 0.00018537065003167377, "loss": 0.3315050840377808, "memory(GiB)": 91.64, "step": 2285, "token_acc": 0.8751619870410368, "train_speed(iter/s)": 0.138061 }, { "epoch": 0.21611929029822574, "grad_norm": 0.4177890121936798, "learning_rate": 0.0001852892825392405, "loss": 0.3441403865814209, "memory(GiB)": 91.64, "step": 2290, "token_acc": 0.8829141370338248, "train_speed(iter/s)": 0.138065 }, { "epoch": 0.2165911664779162, "grad_norm": 0.3767399787902832, "learning_rate": 0.00018520770735456504, "loss": 0.3340781211853027, "memory(GiB)": 91.64, "step": 2295, "token_acc": 0.8791012838801712, "train_speed(iter/s)": 0.138068 }, { "epoch": 0.21706304265760665, "grad_norm": 0.5728538632392883, "learning_rate": 0.0001851259246762952, "loss": 0.33160576820373533, "memory(GiB)": 91.64, "step": 2300, "token_acc": 0.8773809523809524, "train_speed(iter/s)": 0.13807 }, { "epoch": 0.2175349188372971, "grad_norm": 0.38368016481399536, "learning_rate": 0.00018504393470358417, "loss": 0.33443965911865237, "memory(GiB)": 91.64, "step": 2305, "token_acc": 0.894983866236433, "train_speed(iter/s)": 0.138071 }, { "epoch": 0.21800679501698755, "grad_norm": 0.8141961097717285, "learning_rate": 0.00018496173763608986, "loss": 0.3331787109375, "memory(GiB)": 91.64, "step": 2310, "token_acc": 0.8896752706078268, "train_speed(iter/s)": 0.138073 }, { "epoch": 0.218478671196678, "grad_norm": 0.4129309356212616, "learning_rate": 0.00018487933367397448, "loss": 0.33819682598114015, "memory(GiB)": 91.64, "step": 2315, "token_acc": 0.8782961460446247, "train_speed(iter/s)": 0.138076 }, { "epoch": 0.21895054737636843, "grad_norm": 0.45642325282096863, "learning_rate": 0.0001847967230179041, "loss": 0.34285683631896974, "memory(GiB)": 91.64, "step": 2320, "token_acc": 0.8848589522164652, "train_speed(iter/s)": 0.138083 }, { "epoch": 0.2194224235560589, "grad_norm": 0.40785184502601624, "learning_rate": 0.0001847139058690481, "loss": 0.3394650459289551, "memory(GiB)": 91.64, "step": 2325, "token_acc": 0.8692338547934216, "train_speed(iter/s)": 0.138082 }, { "epoch": 0.21989429973574934, "grad_norm": 0.2887260913848877, "learning_rate": 0.0001846308824290787, "loss": 0.3331992864608765, "memory(GiB)": 91.64, "step": 2330, "token_acc": 0.8966675277706019, "train_speed(iter/s)": 0.13808 }, { "epoch": 0.22036617591543978, "grad_norm": 0.7396420836448669, "learning_rate": 0.0001845476529001705, "loss": 0.33391461372375486, "memory(GiB)": 91.64, "step": 2335, "token_acc": 0.8978511367175335, "train_speed(iter/s)": 0.138083 }, { "epoch": 0.22083805209513024, "grad_norm": 0.2597915232181549, "learning_rate": 0.00018446421748499986, "loss": 0.3247242450714111, "memory(GiB)": 91.64, "step": 2340, "token_acc": 0.8870259481037924, "train_speed(iter/s)": 0.138086 }, { "epoch": 0.22130992827482068, "grad_norm": 0.5094233155250549, "learning_rate": 0.0001843805763867447, "loss": 0.3364422798156738, "memory(GiB)": 91.64, "step": 2345, "token_acc": 0.878769782024485, "train_speed(iter/s)": 0.13809 }, { "epoch": 0.22178180445451115, "grad_norm": 0.35699471831321716, "learning_rate": 0.00018429672980908355, "loss": 0.3271955490112305, "memory(GiB)": 91.64, "step": 2350, "token_acc": 0.8881599500156201, "train_speed(iter/s)": 0.138088 }, { "epoch": 0.2222536806342016, "grad_norm": 0.31184324622154236, "learning_rate": 0.00018421267795619555, "loss": 0.335361909866333, "memory(GiB)": 91.64, "step": 2355, "token_acc": 0.880623346074684, "train_speed(iter/s)": 0.138091 }, { "epoch": 0.22272555681389203, "grad_norm": 0.599033772945404, "learning_rate": 0.00018412842103275956, "loss": 0.328480863571167, "memory(GiB)": 91.64, "step": 2360, "token_acc": 0.8927937522186723, "train_speed(iter/s)": 0.138092 }, { "epoch": 0.2231974329935825, "grad_norm": 0.5339512825012207, "learning_rate": 0.00018404395924395388, "loss": 0.33516457080841067, "memory(GiB)": 91.64, "step": 2365, "token_acc": 0.9051851851851852, "train_speed(iter/s)": 0.138093 }, { "epoch": 0.22366930917327293, "grad_norm": 0.5576615929603577, "learning_rate": 0.0001839592927954557, "loss": 0.333830738067627, "memory(GiB)": 91.64, "step": 2370, "token_acc": 0.874083519285942, "train_speed(iter/s)": 0.138096 }, { "epoch": 0.22414118535296337, "grad_norm": 0.32573401927948, "learning_rate": 0.00018387442189344056, "loss": 0.3350250244140625, "memory(GiB)": 91.64, "step": 2375, "token_acc": 0.8787170063481456, "train_speed(iter/s)": 0.138099 }, { "epoch": 0.22461306153265384, "grad_norm": 0.7854357361793518, "learning_rate": 0.00018378934674458187, "loss": 0.33852076530456543, "memory(GiB)": 91.64, "step": 2380, "token_acc": 0.904169079328315, "train_speed(iter/s)": 0.138101 }, { "epoch": 0.22508493771234428, "grad_norm": 0.4120338559150696, "learning_rate": 0.00018370406755605046, "loss": 0.33258886337280275, "memory(GiB)": 91.64, "step": 2385, "token_acc": 0.8675496688741722, "train_speed(iter/s)": 0.138098 }, { "epoch": 0.22555681389203472, "grad_norm": 0.6627610325813293, "learning_rate": 0.00018361858453551393, "loss": 0.329588794708252, "memory(GiB)": 91.64, "step": 2390, "token_acc": 0.8796651552145099, "train_speed(iter/s)": 0.138103 }, { "epoch": 0.22602869007172519, "grad_norm": 1.0246161222457886, "learning_rate": 0.00018353289789113636, "loss": 0.3348996639251709, "memory(GiB)": 91.64, "step": 2395, "token_acc": 0.895040369088812, "train_speed(iter/s)": 0.138106 }, { "epoch": 0.22650056625141562, "grad_norm": 0.34904682636260986, "learning_rate": 0.0001834470078315776, "loss": 0.33976428508758544, "memory(GiB)": 91.64, "step": 2400, "token_acc": 0.8983688833124216, "train_speed(iter/s)": 0.138109 }, { "epoch": 0.22697244243110606, "grad_norm": 0.3885119557380676, "learning_rate": 0.00018336091456599288, "loss": 0.3324185609817505, "memory(GiB)": 91.64, "step": 2405, "token_acc": 0.8879310344827587, "train_speed(iter/s)": 0.138113 }, { "epoch": 0.22744431861079653, "grad_norm": 0.8067770600318909, "learning_rate": 0.00018327461830403228, "loss": 0.3337501049041748, "memory(GiB)": 91.64, "step": 2410, "token_acc": 0.8831425187524674, "train_speed(iter/s)": 0.138115 }, { "epoch": 0.22791619479048697, "grad_norm": 1.321143388748169, "learning_rate": 0.00018318811925584013, "loss": 0.34863355159759524, "memory(GiB)": 91.64, "step": 2415, "token_acc": 0.8935230618253189, "train_speed(iter/s)": 0.138113 }, { "epoch": 0.22838807097017744, "grad_norm": 0.2674782872200012, "learning_rate": 0.00018310141763205472, "loss": 0.3419090747833252, "memory(GiB)": 91.64, "step": 2420, "token_acc": 0.8874555160142349, "train_speed(iter/s)": 0.13811 }, { "epoch": 0.22885994714986788, "grad_norm": 0.6279721260070801, "learning_rate": 0.0001830145136438075, "loss": 0.33653717041015624, "memory(GiB)": 91.64, "step": 2425, "token_acc": 0.8700137551581844, "train_speed(iter/s)": 0.138108 }, { "epoch": 0.22933182332955832, "grad_norm": 0.4765380322933197, "learning_rate": 0.00018292740750272277, "loss": 0.33394408226013184, "memory(GiB)": 91.64, "step": 2430, "token_acc": 0.8937568455640745, "train_speed(iter/s)": 0.138109 }, { "epoch": 0.22980369950924878, "grad_norm": 0.4631507098674774, "learning_rate": 0.0001828400994209171, "loss": 0.33988327980041505, "memory(GiB)": 91.64, "step": 2435, "token_acc": 0.8815399802566634, "train_speed(iter/s)": 0.138114 }, { "epoch": 0.23027557568893922, "grad_norm": 0.6181381344795227, "learning_rate": 0.0001827525896109988, "loss": 0.3365536451339722, "memory(GiB)": 91.64, "step": 2440, "token_acc": 0.8993469074145217, "train_speed(iter/s)": 0.138118 }, { "epoch": 0.23074745186862966, "grad_norm": 0.8235430121421814, "learning_rate": 0.0001826648782860675, "loss": 0.33196301460266114, "memory(GiB)": 91.64, "step": 2445, "token_acc": 0.8881469115191987, "train_speed(iter/s)": 0.13812 }, { "epoch": 0.23121932804832013, "grad_norm": 0.3884918689727783, "learning_rate": 0.00018257696565971337, "loss": 0.34457688331604003, "memory(GiB)": 91.64, "step": 2450, "token_acc": 0.8957507082152975, "train_speed(iter/s)": 0.138123 }, { "epoch": 0.23169120422801057, "grad_norm": 1.2805403470993042, "learning_rate": 0.00018248885194601698, "loss": 0.3333995580673218, "memory(GiB)": 91.64, "step": 2455, "token_acc": 0.8563068920676203, "train_speed(iter/s)": 0.138118 }, { "epoch": 0.232163080407701, "grad_norm": 0.6470995545387268, "learning_rate": 0.0001824005373595484, "loss": 0.3331939697265625, "memory(GiB)": 91.64, "step": 2460, "token_acc": 0.8931245745405038, "train_speed(iter/s)": 0.138119 }, { "epoch": 0.23263495658739147, "grad_norm": 0.41226083040237427, "learning_rate": 0.00018231202211536703, "loss": 0.33048851490020753, "memory(GiB)": 91.64, "step": 2465, "token_acc": 0.9143766271333527, "train_speed(iter/s)": 0.138121 }, { "epoch": 0.2331068327670819, "grad_norm": 0.3055603504180908, "learning_rate": 0.0001822233064290208, "loss": 0.33061680793762205, "memory(GiB)": 91.64, "step": 2470, "token_acc": 0.8926151761517616, "train_speed(iter/s)": 0.138122 }, { "epoch": 0.23357870894677238, "grad_norm": 0.44368642568588257, "learning_rate": 0.0001821343905165457, "loss": 0.33341374397277834, "memory(GiB)": 91.64, "step": 2475, "token_acc": 0.904647983595352, "train_speed(iter/s)": 0.138122 }, { "epoch": 0.23405058512646282, "grad_norm": 0.6508908867835999, "learning_rate": 0.00018204527459446542, "loss": 0.3335702419281006, "memory(GiB)": 91.64, "step": 2480, "token_acc": 0.8927335640138409, "train_speed(iter/s)": 0.138129 }, { "epoch": 0.23452246130615326, "grad_norm": 0.6149075031280518, "learning_rate": 0.00018195595887979062, "loss": 0.32854518890380857, "memory(GiB)": 91.64, "step": 2485, "token_acc": 0.8804051421893261, "train_speed(iter/s)": 0.138129 }, { "epoch": 0.23499433748584372, "grad_norm": 0.6348846554756165, "learning_rate": 0.0001818664435900185, "loss": 0.328797721862793, "memory(GiB)": 91.64, "step": 2490, "token_acc": 0.8710801393728222, "train_speed(iter/s)": 0.138132 }, { "epoch": 0.23546621366553416, "grad_norm": 0.37644708156585693, "learning_rate": 0.00018177672894313234, "loss": 0.3341526508331299, "memory(GiB)": 91.64, "step": 2495, "token_acc": 0.8936955063715627, "train_speed(iter/s)": 0.138135 }, { "epoch": 0.2359380898452246, "grad_norm": 0.4598802924156189, "learning_rate": 0.00018168681515760068, "loss": 0.3312446355819702, "memory(GiB)": 91.64, "step": 2500, "token_acc": 0.897131552917903, "train_speed(iter/s)": 0.138137 }, { "epoch": 0.23640996602491507, "grad_norm": 0.48607075214385986, "learning_rate": 0.00018159670245237726, "loss": 0.3310050964355469, "memory(GiB)": 91.64, "step": 2505, "token_acc": 0.884657634566093, "train_speed(iter/s)": 0.138142 }, { "epoch": 0.2368818422046055, "grad_norm": 0.38543701171875, "learning_rate": 0.0001815063910469, "loss": 0.3297031164169312, "memory(GiB)": 91.64, "step": 2510, "token_acc": 0.9097881665449233, "train_speed(iter/s)": 0.138138 }, { "epoch": 0.23735371838429595, "grad_norm": 0.5178254842758179, "learning_rate": 0.00018141588116109077, "loss": 0.32350549697875974, "memory(GiB)": 91.64, "step": 2515, "token_acc": 0.890393567498942, "train_speed(iter/s)": 0.13814 }, { "epoch": 0.23782559456398641, "grad_norm": 0.33711934089660645, "learning_rate": 0.0001813251730153548, "loss": 0.3261995553970337, "memory(GiB)": 91.64, "step": 2520, "token_acc": 0.8786370597243491, "train_speed(iter/s)": 0.138146 }, { "epoch": 0.23829747074367685, "grad_norm": 0.6296564340591431, "learning_rate": 0.00018123426683058007, "loss": 0.3304123401641846, "memory(GiB)": 91.64, "step": 2525, "token_acc": 0.8883516483516484, "train_speed(iter/s)": 0.13815 }, { "epoch": 0.23876934692336732, "grad_norm": 0.6025164127349854, "learning_rate": 0.0001811431628281368, "loss": 0.323737645149231, "memory(GiB)": 91.64, "step": 2530, "token_acc": 0.8825503355704698, "train_speed(iter/s)": 0.138151 }, { "epoch": 0.23924122310305776, "grad_norm": 0.4461984932422638, "learning_rate": 0.000181051861229877, "loss": 0.3296360015869141, "memory(GiB)": 91.64, "step": 2535, "token_acc": 0.8823049741777657, "train_speed(iter/s)": 0.138152 }, { "epoch": 0.2397130992827482, "grad_norm": 0.3749321401119232, "learning_rate": 0.00018096036225813373, "loss": 0.3249626636505127, "memory(GiB)": 91.64, "step": 2540, "token_acc": 0.8823712948517941, "train_speed(iter/s)": 0.138157 }, { "epoch": 0.24018497546243867, "grad_norm": 0.4789837598800659, "learning_rate": 0.00018086866613572085, "loss": 0.32210140228271483, "memory(GiB)": 91.64, "step": 2545, "token_acc": 0.8996778647031753, "train_speed(iter/s)": 0.138159 }, { "epoch": 0.2406568516421291, "grad_norm": 0.30677059292793274, "learning_rate": 0.00018077677308593216, "loss": 0.3380124568939209, "memory(GiB)": 91.64, "step": 2550, "token_acc": 0.8955823293172691, "train_speed(iter/s)": 0.138164 }, { "epoch": 0.24112872782181954, "grad_norm": 0.26206114888191223, "learning_rate": 0.00018068468333254107, "loss": 0.33399908542633056, "memory(GiB)": 91.64, "step": 2555, "token_acc": 0.8851699279093718, "train_speed(iter/s)": 0.138167 }, { "epoch": 0.24160060400151, "grad_norm": 0.3598458170890808, "learning_rate": 0.00018059239709980002, "loss": 0.33064751625061034, "memory(GiB)": 91.64, "step": 2560, "token_acc": 0.8942148760330578, "train_speed(iter/s)": 0.138168 }, { "epoch": 0.24207248018120045, "grad_norm": 0.32919731736183167, "learning_rate": 0.00018049991461243988, "loss": 0.33213248252868655, "memory(GiB)": 91.64, "step": 2565, "token_acc": 0.8634217217580821, "train_speed(iter/s)": 0.138164 }, { "epoch": 0.2425443563608909, "grad_norm": 0.37082603573799133, "learning_rate": 0.00018040723609566943, "loss": 0.33206088542938234, "memory(GiB)": 91.64, "step": 2570, "token_acc": 0.8974438902743143, "train_speed(iter/s)": 0.138169 }, { "epoch": 0.24301623254058136, "grad_norm": 0.875857412815094, "learning_rate": 0.00018031436177517478, "loss": 0.3363888502120972, "memory(GiB)": 91.64, "step": 2575, "token_acc": 0.8790530108080288, "train_speed(iter/s)": 0.138168 }, { "epoch": 0.2434881087202718, "grad_norm": 0.5113961696624756, "learning_rate": 0.0001802212918771189, "loss": 0.3324090003967285, "memory(GiB)": 91.64, "step": 2580, "token_acc": 0.886832363828662, "train_speed(iter/s)": 0.13817 }, { "epoch": 0.24395998489996226, "grad_norm": 0.4555577337741852, "learning_rate": 0.000180128026628141, "loss": 0.3298499345779419, "memory(GiB)": 91.64, "step": 2585, "token_acc": 0.8806023664395841, "train_speed(iter/s)": 0.138172 }, { "epoch": 0.2444318610796527, "grad_norm": 0.7389398217201233, "learning_rate": 0.00018003456625535603, "loss": 0.32846970558166505, "memory(GiB)": 91.64, "step": 2590, "token_acc": 0.8857536132140399, "train_speed(iter/s)": 0.138176 }, { "epoch": 0.24490373725934314, "grad_norm": 0.36795300245285034, "learning_rate": 0.000179940910986354, "loss": 0.32522149085998536, "memory(GiB)": 91.64, "step": 2595, "token_acc": 0.8899871078642029, "train_speed(iter/s)": 0.138179 }, { "epoch": 0.2453756134390336, "grad_norm": 0.5684471726417542, "learning_rate": 0.00017984706104919965, "loss": 0.3347635746002197, "memory(GiB)": 91.64, "step": 2600, "token_acc": 0.8730314960629921, "train_speed(iter/s)": 0.138185 }, { "epoch": 0.24584748961872405, "grad_norm": 0.4991260766983032, "learning_rate": 0.00017975301667243166, "loss": 0.32751965522766113, "memory(GiB)": 91.64, "step": 2605, "token_acc": 0.8909626719056974, "train_speed(iter/s)": 0.13819 }, { "epoch": 0.24631936579841449, "grad_norm": 0.5270681381225586, "learning_rate": 0.00017965877808506228, "loss": 0.32968854904174805, "memory(GiB)": 91.64, "step": 2610, "token_acc": 0.8979523329976502, "train_speed(iter/s)": 0.138192 }, { "epoch": 0.24679124197810495, "grad_norm": 0.713347315788269, "learning_rate": 0.00017956434551657667, "loss": 0.32762675285339354, "memory(GiB)": 91.64, "step": 2615, "token_acc": 0.8938199917046868, "train_speed(iter/s)": 0.138197 }, { "epoch": 0.2472631181577954, "grad_norm": 0.32395628094673157, "learning_rate": 0.00017946971919693229, "loss": 0.3311192989349365, "memory(GiB)": 91.64, "step": 2620, "token_acc": 0.8778821520068317, "train_speed(iter/s)": 0.138197 }, { "epoch": 0.24773499433748583, "grad_norm": 0.4176148474216461, "learning_rate": 0.0001793748993565585, "loss": 0.3304103136062622, "memory(GiB)": 91.64, "step": 2625, "token_acc": 0.8812527185732927, "train_speed(iter/s)": 0.138198 }, { "epoch": 0.2482068705171763, "grad_norm": 0.5663833618164062, "learning_rate": 0.0001792798862263559, "loss": 0.32198286056518555, "memory(GiB)": 91.64, "step": 2630, "token_acc": 0.8690159574468085, "train_speed(iter/s)": 0.138197 }, { "epoch": 0.24867874669686674, "grad_norm": 0.3792598247528076, "learning_rate": 0.0001791846800376958, "loss": 0.31867480278015137, "memory(GiB)": 91.64, "step": 2635, "token_acc": 0.8866906474820144, "train_speed(iter/s)": 0.1382 }, { "epoch": 0.2491506228765572, "grad_norm": 0.32338786125183105, "learning_rate": 0.00017908928102241953, "loss": 0.3264533519744873, "memory(GiB)": 91.64, "step": 2640, "token_acc": 0.8988023952095808, "train_speed(iter/s)": 0.138202 }, { "epoch": 0.24962249905624764, "grad_norm": 1.4472591876983643, "learning_rate": 0.00017899368941283808, "loss": 0.3345726490020752, "memory(GiB)": 91.64, "step": 2645, "token_acc": 0.8777016341591988, "train_speed(iter/s)": 0.138203 }, { "epoch": 0.2500943752359381, "grad_norm": 0.6152178645133972, "learning_rate": 0.00017889790544173143, "loss": 0.3289012432098389, "memory(GiB)": 91.64, "step": 2650, "token_acc": 0.8993955094991365, "train_speed(iter/s)": 0.138207 }, { "epoch": 0.25056625141562855, "grad_norm": 0.3531012237071991, "learning_rate": 0.00017880192934234792, "loss": 0.3324916362762451, "memory(GiB)": 91.64, "step": 2655, "token_acc": 0.9044006069802731, "train_speed(iter/s)": 0.13821 }, { "epoch": 0.25103812759531896, "grad_norm": 0.6609277725219727, "learning_rate": 0.00017870576134840381, "loss": 0.327985954284668, "memory(GiB)": 91.64, "step": 2660, "token_acc": 0.8779822767552828, "train_speed(iter/s)": 0.13821 }, { "epoch": 0.2515100037750094, "grad_norm": 0.297342449426651, "learning_rate": 0.00017860940169408274, "loss": 0.3250416278839111, "memory(GiB)": 91.64, "step": 2665, "token_acc": 0.8967880085653105, "train_speed(iter/s)": 0.138215 }, { "epoch": 0.2519818799546999, "grad_norm": 0.4424315094947815, "learning_rate": 0.00017851285061403483, "loss": 0.3348080158233643, "memory(GiB)": 91.64, "step": 2670, "token_acc": 0.875974025974026, "train_speed(iter/s)": 0.138217 }, { "epoch": 0.25245375613439036, "grad_norm": 0.3924655020236969, "learning_rate": 0.0001784161083433766, "loss": 0.32940819263458254, "memory(GiB)": 91.64, "step": 2675, "token_acc": 0.8746097814776275, "train_speed(iter/s)": 0.138219 }, { "epoch": 0.2529256323140808, "grad_norm": 0.4169272482395172, "learning_rate": 0.00017831917511769, "loss": 0.32938084602355955, "memory(GiB)": 91.64, "step": 2680, "token_acc": 0.8867924528301887, "train_speed(iter/s)": 0.138222 }, { "epoch": 0.25339750849377124, "grad_norm": 0.5509438514709473, "learning_rate": 0.000178222051173022, "loss": 0.32615439891815184, "memory(GiB)": 91.64, "step": 2685, "token_acc": 0.8933631618195377, "train_speed(iter/s)": 0.138225 }, { "epoch": 0.2538693846734617, "grad_norm": 0.27454128861427307, "learning_rate": 0.00017812473674588407, "loss": 0.3298367977142334, "memory(GiB)": 91.64, "step": 2690, "token_acc": 0.8921161825726142, "train_speed(iter/s)": 0.138226 }, { "epoch": 0.2543412608531521, "grad_norm": 0.2513178884983063, "learning_rate": 0.0001780272320732515, "loss": 0.3351443767547607, "memory(GiB)": 91.64, "step": 2695, "token_acc": 0.8785100286532951, "train_speed(iter/s)": 0.138227 }, { "epoch": 0.2548131370328426, "grad_norm": 0.3216034471988678, "learning_rate": 0.00017792953739256278, "loss": 0.3156256198883057, "memory(GiB)": 91.64, "step": 2700, "token_acc": 0.8910359634997316, "train_speed(iter/s)": 0.13823 }, { "epoch": 0.25528501321253305, "grad_norm": 0.7129026651382446, "learning_rate": 0.0001778316529417192, "loss": 0.3225963354110718, "memory(GiB)": 91.64, "step": 2705, "token_acc": 0.8915942028985507, "train_speed(iter/s)": 0.138232 }, { "epoch": 0.25575688939222346, "grad_norm": 1.063512921333313, "learning_rate": 0.0001777335789590842, "loss": 0.3296067237854004, "memory(GiB)": 91.64, "step": 2710, "token_acc": 0.8891454965357968, "train_speed(iter/s)": 0.138231 }, { "epoch": 0.25622876557191393, "grad_norm": 0.6636164784431458, "learning_rate": 0.0001776353156834826, "loss": 0.32224364280700685, "memory(GiB)": 91.64, "step": 2715, "token_acc": 0.8841752721183366, "train_speed(iter/s)": 0.138229 }, { "epoch": 0.2567006417516044, "grad_norm": 0.5418056845664978, "learning_rate": 0.00017753686335420028, "loss": 0.32490553855896, "memory(GiB)": 91.64, "step": 2720, "token_acc": 0.8908418131359852, "train_speed(iter/s)": 0.13823 }, { "epoch": 0.2571725179312948, "grad_norm": 0.984113872051239, "learning_rate": 0.0001774382222109835, "loss": 0.3314798831939697, "memory(GiB)": 91.64, "step": 2725, "token_acc": 0.8758210822646231, "train_speed(iter/s)": 0.138232 }, { "epoch": 0.2576443941109853, "grad_norm": 0.35621482133865356, "learning_rate": 0.00017733939249403835, "loss": 0.3266749382019043, "memory(GiB)": 91.64, "step": 2730, "token_acc": 0.8859649122807017, "train_speed(iter/s)": 0.138236 }, { "epoch": 0.25811627029067574, "grad_norm": 0.31070077419281006, "learning_rate": 0.00017724037444402993, "loss": 0.3295797109603882, "memory(GiB)": 91.64, "step": 2735, "token_acc": 0.8840749414519906, "train_speed(iter/s)": 0.138237 }, { "epoch": 0.25858814647036615, "grad_norm": 0.36651989817619324, "learning_rate": 0.00017714116830208228, "loss": 0.3238197326660156, "memory(GiB)": 91.64, "step": 2740, "token_acc": 0.8655913978494624, "train_speed(iter/s)": 0.13824 }, { "epoch": 0.2590600226500566, "grad_norm": 0.3921261131763458, "learning_rate": 0.00017704177430977712, "loss": 0.3325662612915039, "memory(GiB)": 91.64, "step": 2745, "token_acc": 0.8959709379128138, "train_speed(iter/s)": 0.138242 }, { "epoch": 0.2595318988297471, "grad_norm": 0.2784421741962433, "learning_rate": 0.00017694219270915387, "loss": 0.318132209777832, "memory(GiB)": 91.64, "step": 2750, "token_acc": 0.8974579922447221, "train_speed(iter/s)": 0.138246 }, { "epoch": 0.2600037750094375, "grad_norm": 0.536296010017395, "learning_rate": 0.0001768424237427087, "loss": 0.327205753326416, "memory(GiB)": 91.64, "step": 2755, "token_acc": 0.9128829826504246, "train_speed(iter/s)": 0.138246 }, { "epoch": 0.26047565118912797, "grad_norm": 0.29977917671203613, "learning_rate": 0.00017674246765339406, "loss": 0.3198941707611084, "memory(GiB)": 91.64, "step": 2760, "token_acc": 0.8957388939256573, "train_speed(iter/s)": 0.13825 }, { "epoch": 0.26094752736881843, "grad_norm": 0.26089954376220703, "learning_rate": 0.00017664232468461808, "loss": 0.3271759510040283, "memory(GiB)": 91.64, "step": 2765, "token_acc": 0.8782735208535403, "train_speed(iter/s)": 0.13825 }, { "epoch": 0.26141940354850884, "grad_norm": 0.6381005644798279, "learning_rate": 0.00017654199508024396, "loss": 0.31565151214599607, "memory(GiB)": 91.64, "step": 2770, "token_acc": 0.8883669521967394, "train_speed(iter/s)": 0.138249 }, { "epoch": 0.2618912797281993, "grad_norm": 0.35093823075294495, "learning_rate": 0.0001764414790845894, "loss": 0.32278971672058104, "memory(GiB)": 91.64, "step": 2775, "token_acc": 0.8895833333333333, "train_speed(iter/s)": 0.138248 }, { "epoch": 0.2623631559078898, "grad_norm": 0.5740118026733398, "learning_rate": 0.00017634077694242599, "loss": 0.32536165714263915, "memory(GiB)": 91.64, "step": 2780, "token_acc": 0.8868584758942457, "train_speed(iter/s)": 0.138251 }, { "epoch": 0.26283503208758024, "grad_norm": 0.30373477935791016, "learning_rate": 0.00017623988889897856, "loss": 0.31938059329986573, "memory(GiB)": 91.64, "step": 2785, "token_acc": 0.8855127509495387, "train_speed(iter/s)": 0.138254 }, { "epoch": 0.26330690826727066, "grad_norm": 0.400481641292572, "learning_rate": 0.00017613881519992474, "loss": 0.3272983551025391, "memory(GiB)": 91.64, "step": 2790, "token_acc": 0.8951557093425605, "train_speed(iter/s)": 0.138251 }, { "epoch": 0.2637787844469611, "grad_norm": 0.26797062158584595, "learning_rate": 0.00017603755609139413, "loss": 0.32006087303161623, "memory(GiB)": 91.64, "step": 2795, "token_acc": 0.8942536552193131, "train_speed(iter/s)": 0.138254 }, { "epoch": 0.2642506606266516, "grad_norm": 0.3913300633430481, "learning_rate": 0.00017593611181996802, "loss": 0.316060733795166, "memory(GiB)": 91.64, "step": 2800, "token_acc": 0.8809629959875167, "train_speed(iter/s)": 0.138253 }, { "epoch": 0.264722536806342, "grad_norm": 0.6603167057037354, "learning_rate": 0.00017583448263267835, "loss": 0.32715344429016113, "memory(GiB)": 91.64, "step": 2805, "token_acc": 0.8789531079607416, "train_speed(iter/s)": 0.138254 }, { "epoch": 0.26519441298603247, "grad_norm": 0.5087538957595825, "learning_rate": 0.00017573266877700755, "loss": 0.32497029304504393, "memory(GiB)": 91.64, "step": 2810, "token_acc": 0.8834763354553004, "train_speed(iter/s)": 0.138253 }, { "epoch": 0.26566628916572294, "grad_norm": 0.5204217433929443, "learning_rate": 0.00017563067050088772, "loss": 0.33654916286468506, "memory(GiB)": 91.64, "step": 2815, "token_acc": 0.871804654711942, "train_speed(iter/s)": 0.13826 }, { "epoch": 0.26613816534541335, "grad_norm": 0.7432185411453247, "learning_rate": 0.0001755284880527, "loss": 0.3227522611618042, "memory(GiB)": 91.64, "step": 2820, "token_acc": 0.8913313977602655, "train_speed(iter/s)": 0.138262 }, { "epoch": 0.2666100415251038, "grad_norm": 0.7067577242851257, "learning_rate": 0.00017542612168127395, "loss": 0.3221942186355591, "memory(GiB)": 91.64, "step": 2825, "token_acc": 0.9026233881725211, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.2670819177047943, "grad_norm": 0.29249584674835205, "learning_rate": 0.0001753235716358872, "loss": 0.32654409408569335, "memory(GiB)": 91.64, "step": 2830, "token_acc": 0.9053850025163563, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.2675537938844847, "grad_norm": 0.2350318282842636, "learning_rate": 0.00017522083816626452, "loss": 0.3282480239868164, "memory(GiB)": 91.64, "step": 2835, "token_acc": 0.8849056603773585, "train_speed(iter/s)": 0.138263 }, { "epoch": 0.26802567006417516, "grad_norm": 0.3909781873226166, "learning_rate": 0.00017511792152257735, "loss": 0.32073369026184084, "memory(GiB)": 91.64, "step": 2840, "token_acc": 0.8862385321100917, "train_speed(iter/s)": 0.138259 }, { "epoch": 0.2684975462438656, "grad_norm": 0.42215001583099365, "learning_rate": 0.0001750148219554432, "loss": 0.3243825435638428, "memory(GiB)": 91.64, "step": 2845, "token_acc": 0.8947040498442368, "train_speed(iter/s)": 0.138263 }, { "epoch": 0.26896942242355604, "grad_norm": 0.5986440181732178, "learning_rate": 0.00017491153971592506, "loss": 0.3247550010681152, "memory(GiB)": 91.64, "step": 2850, "token_acc": 0.8752844788347747, "train_speed(iter/s)": 0.138259 }, { "epoch": 0.2694412986032465, "grad_norm": 0.3086279630661011, "learning_rate": 0.00017480807505553076, "loss": 0.3214439868927002, "memory(GiB)": 91.64, "step": 2855, "token_acc": 0.8910550458715596, "train_speed(iter/s)": 0.138259 }, { "epoch": 0.26991317478293697, "grad_norm": 0.275035560131073, "learning_rate": 0.00017470442822621228, "loss": 0.32530651092529295, "memory(GiB)": 91.64, "step": 2860, "token_acc": 0.8911111111111111, "train_speed(iter/s)": 0.138259 }, { "epoch": 0.2703850509626274, "grad_norm": 0.6449964046478271, "learning_rate": 0.00017460059948036527, "loss": 0.32490763664245603, "memory(GiB)": 91.64, "step": 2865, "token_acc": 0.882903981264637, "train_speed(iter/s)": 0.138266 }, { "epoch": 0.27085692714231785, "grad_norm": 0.8364949226379395, "learning_rate": 0.00017449658907082833, "loss": 0.32590255737304685, "memory(GiB)": 91.64, "step": 2870, "token_acc": 0.8773987206823027, "train_speed(iter/s)": 0.138269 }, { "epoch": 0.2713288033220083, "grad_norm": 1.0155036449432373, "learning_rate": 0.0001743923972508825, "loss": 0.32789173126220705, "memory(GiB)": 91.64, "step": 2875, "token_acc": 0.889273356401384, "train_speed(iter/s)": 0.138268 }, { "epoch": 0.2718006795016987, "grad_norm": 0.6500275731086731, "learning_rate": 0.00017428802427425053, "loss": 0.3223546504974365, "memory(GiB)": 91.64, "step": 2880, "token_acc": 0.8844172569220863, "train_speed(iter/s)": 0.138272 }, { "epoch": 0.2722725556813892, "grad_norm": 0.5770022869110107, "learning_rate": 0.00017418347039509634, "loss": 0.3223582744598389, "memory(GiB)": 91.64, "step": 2885, "token_acc": 0.8766788766788767, "train_speed(iter/s)": 0.138279 }, { "epoch": 0.27274443186107966, "grad_norm": 0.4336351454257965, "learning_rate": 0.00017407873586802435, "loss": 0.324751091003418, "memory(GiB)": 91.64, "step": 2890, "token_acc": 0.8822988505747127, "train_speed(iter/s)": 0.138273 }, { "epoch": 0.27321630804077013, "grad_norm": 0.49264365434646606, "learning_rate": 0.00017397382094807892, "loss": 0.315723180770874, "memory(GiB)": 91.64, "step": 2895, "token_acc": 0.8913617502829121, "train_speed(iter/s)": 0.138277 }, { "epoch": 0.27368818422046054, "grad_norm": 0.47523409128189087, "learning_rate": 0.00017386872589074366, "loss": 0.32211527824401853, "memory(GiB)": 91.64, "step": 2900, "token_acc": 0.8896848137535817, "train_speed(iter/s)": 0.13828 }, { "epoch": 0.274160060400151, "grad_norm": 0.5986055135726929, "learning_rate": 0.00017376345095194084, "loss": 0.3235922336578369, "memory(GiB)": 91.64, "step": 2905, "token_acc": 0.882665832290363, "train_speed(iter/s)": 0.138282 }, { "epoch": 0.2746319365798415, "grad_norm": 0.25783222913742065, "learning_rate": 0.0001736579963880308, "loss": 0.31179332733154297, "memory(GiB)": 91.64, "step": 2910, "token_acc": 0.8972863302054274, "train_speed(iter/s)": 0.138281 }, { "epoch": 0.2751038127595319, "grad_norm": 0.7452314496040344, "learning_rate": 0.0001735523624558113, "loss": 0.3252346277236938, "memory(GiB)": 91.64, "step": 2915, "token_acc": 0.884393063583815, "train_speed(iter/s)": 0.138284 }, { "epoch": 0.27557568893922235, "grad_norm": 0.4005107879638672, "learning_rate": 0.00017344654941251682, "loss": 0.3287190437316895, "memory(GiB)": 91.64, "step": 2920, "token_acc": 0.8755208333333333, "train_speed(iter/s)": 0.138286 }, { "epoch": 0.2760475651189128, "grad_norm": 0.7920652031898499, "learning_rate": 0.00017334055751581812, "loss": 0.323880934715271, "memory(GiB)": 91.64, "step": 2925, "token_acc": 0.8801882755669662, "train_speed(iter/s)": 0.138289 }, { "epoch": 0.27651944129860323, "grad_norm": 0.34472015500068665, "learning_rate": 0.0001732343870238213, "loss": 0.3178147554397583, "memory(GiB)": 91.64, "step": 2930, "token_acc": 0.8979652020053082, "train_speed(iter/s)": 0.138289 }, { "epoch": 0.2769913174782937, "grad_norm": 0.38036808371543884, "learning_rate": 0.00017312803819506762, "loss": 0.3199058771133423, "memory(GiB)": 91.64, "step": 2935, "token_acc": 0.9014567266495287, "train_speed(iter/s)": 0.138292 }, { "epoch": 0.27746319365798416, "grad_norm": 0.36448225378990173, "learning_rate": 0.00017302151128853244, "loss": 0.31929521560668944, "memory(GiB)": 91.64, "step": 2940, "token_acc": 0.8818827708703375, "train_speed(iter/s)": 0.138299 }, { "epoch": 0.2779350698376746, "grad_norm": 0.3512881100177765, "learning_rate": 0.00017291480656362479, "loss": 0.31873791217803954, "memory(GiB)": 91.64, "step": 2945, "token_acc": 0.9066164154103853, "train_speed(iter/s)": 0.1383 }, { "epoch": 0.27840694601736504, "grad_norm": 0.4474186301231384, "learning_rate": 0.00017280792428018678, "loss": 0.3215645313262939, "memory(GiB)": 91.64, "step": 2950, "token_acc": 0.8794084186575654, "train_speed(iter/s)": 0.138302 }, { "epoch": 0.2788788221970555, "grad_norm": 0.5548481345176697, "learning_rate": 0.0001727008646984928, "loss": 0.3308894634246826, "memory(GiB)": 91.64, "step": 2955, "token_acc": 0.9011608623548922, "train_speed(iter/s)": 0.1383 }, { "epoch": 0.2793506983767459, "grad_norm": 0.5263611078262329, "learning_rate": 0.00017259362807924914, "loss": 0.3184781074523926, "memory(GiB)": 91.64, "step": 2960, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.138303 }, { "epoch": 0.2798225745564364, "grad_norm": 0.49656400084495544, "learning_rate": 0.000172486214683593, "loss": 0.3242170333862305, "memory(GiB)": 91.64, "step": 2965, "token_acc": 0.8780850431162652, "train_speed(iter/s)": 0.138303 }, { "epoch": 0.28029445073612685, "grad_norm": 0.33543533086776733, "learning_rate": 0.00017237862477309225, "loss": 0.3202751636505127, "memory(GiB)": 91.64, "step": 2970, "token_acc": 0.900549115314216, "train_speed(iter/s)": 0.138304 }, { "epoch": 0.28076632691581727, "grad_norm": 0.2617323100566864, "learning_rate": 0.00017227085860974453, "loss": 0.31397452354431155, "memory(GiB)": 91.64, "step": 2975, "token_acc": 0.8893352812271731, "train_speed(iter/s)": 0.138308 }, { "epoch": 0.28123820309550773, "grad_norm": 0.33117854595184326, "learning_rate": 0.0001721629164559766, "loss": 0.313932466506958, "memory(GiB)": 91.64, "step": 2980, "token_acc": 0.8594104308390023, "train_speed(iter/s)": 0.138311 }, { "epoch": 0.2817100792751982, "grad_norm": 0.27345171570777893, "learning_rate": 0.00017205479857464387, "loss": 0.3203251361846924, "memory(GiB)": 91.64, "step": 2985, "token_acc": 0.8858647936786654, "train_speed(iter/s)": 0.138315 }, { "epoch": 0.2821819554548886, "grad_norm": 0.30517563223838806, "learning_rate": 0.0001719465052290297, "loss": 0.3147796630859375, "memory(GiB)": 91.64, "step": 2990, "token_acc": 0.8925373134328358, "train_speed(iter/s)": 0.138317 }, { "epoch": 0.2826538316345791, "grad_norm": 0.6084278225898743, "learning_rate": 0.00017183803668284467, "loss": 0.32270588874816897, "memory(GiB)": 91.64, "step": 2995, "token_acc": 0.8847248576850095, "train_speed(iter/s)": 0.138318 }, { "epoch": 0.28312570781426954, "grad_norm": 0.9644866585731506, "learning_rate": 0.000171729393200226, "loss": 0.315791392326355, "memory(GiB)": 91.64, "step": 3000, "token_acc": 0.8951439621830684, "train_speed(iter/s)": 0.13832 }, { "epoch": 0.28359758399396, "grad_norm": 0.4065193235874176, "learning_rate": 0.00017162057504573695, "loss": 0.31624255180358884, "memory(GiB)": 91.64, "step": 3005, "token_acc": 0.8852857721929469, "train_speed(iter/s)": 0.138323 }, { "epoch": 0.2840694601736504, "grad_norm": 0.6837791800498962, "learning_rate": 0.00017151158248436608, "loss": 0.32163176536560056, "memory(GiB)": 91.64, "step": 3010, "token_acc": 0.8927978758712247, "train_speed(iter/s)": 0.138325 }, { "epoch": 0.2845413363533409, "grad_norm": 0.3094852566719055, "learning_rate": 0.0001714024157815267, "loss": 0.3260573625564575, "memory(GiB)": 91.64, "step": 3015, "token_acc": 0.9311740890688259, "train_speed(iter/s)": 0.138327 }, { "epoch": 0.28501321253303136, "grad_norm": 0.45613107085227966, "learning_rate": 0.00017129307520305615, "loss": 0.3153514385223389, "memory(GiB)": 91.64, "step": 3020, "token_acc": 0.8873579056148811, "train_speed(iter/s)": 0.138327 }, { "epoch": 0.28548508871272177, "grad_norm": 0.3602701425552368, "learning_rate": 0.00017118356101521523, "loss": 0.3154455184936523, "memory(GiB)": 91.64, "step": 3025, "token_acc": 0.8888233559422, "train_speed(iter/s)": 0.138328 }, { "epoch": 0.28595696489241224, "grad_norm": 0.35357511043548584, "learning_rate": 0.00017107387348468746, "loss": 0.3162027597427368, "memory(GiB)": 91.64, "step": 3030, "token_acc": 0.8909919383105503, "train_speed(iter/s)": 0.138331 }, { "epoch": 0.2864288410721027, "grad_norm": 0.6155738234519958, "learning_rate": 0.0001709640128785785, "loss": 0.3166818141937256, "memory(GiB)": 91.64, "step": 3035, "token_acc": 0.8842105263157894, "train_speed(iter/s)": 0.13833 }, { "epoch": 0.2869007172517931, "grad_norm": 0.47968873381614685, "learning_rate": 0.00017085397946441542, "loss": 0.3206371784210205, "memory(GiB)": 91.64, "step": 3040, "token_acc": 0.8922330097087379, "train_speed(iter/s)": 0.138328 }, { "epoch": 0.2873725934314836, "grad_norm": 0.4527382254600525, "learning_rate": 0.00017074377351014618, "loss": 0.3101963996887207, "memory(GiB)": 91.64, "step": 3045, "token_acc": 0.9003378378378378, "train_speed(iter/s)": 0.138329 }, { "epoch": 0.28784446961117405, "grad_norm": 0.3803810775279999, "learning_rate": 0.0001706333952841389, "loss": 0.31847975254058836, "memory(GiB)": 91.64, "step": 3050, "token_acc": 0.9010079193664506, "train_speed(iter/s)": 0.13833 }, { "epoch": 0.28831634579086446, "grad_norm": 0.4184848666191101, "learning_rate": 0.0001705228450551811, "loss": 0.32082357406616213, "memory(GiB)": 91.64, "step": 3055, "token_acc": 0.90530058177117, "train_speed(iter/s)": 0.13833 }, { "epoch": 0.2887882219705549, "grad_norm": 0.6643010377883911, "learning_rate": 0.00017041212309247926, "loss": 0.3204943180084229, "memory(GiB)": 91.64, "step": 3060, "token_acc": 0.9075657894736842, "train_speed(iter/s)": 0.138332 }, { "epoch": 0.2892600981502454, "grad_norm": 1.0790079832077026, "learning_rate": 0.00017030122966565808, "loss": 0.3205895900726318, "memory(GiB)": 91.64, "step": 3065, "token_acc": 0.884206529992407, "train_speed(iter/s)": 0.138335 }, { "epoch": 0.2897319743299358, "grad_norm": 0.6009517908096313, "learning_rate": 0.00017019016504475967, "loss": 0.32358787059783933, "memory(GiB)": 91.64, "step": 3070, "token_acc": 0.8528336380255942, "train_speed(iter/s)": 0.138335 }, { "epoch": 0.29020385050962627, "grad_norm": 0.23621824383735657, "learning_rate": 0.00017007892950024315, "loss": 0.32037298679351806, "memory(GiB)": 91.64, "step": 3075, "token_acc": 0.8835883588358836, "train_speed(iter/s)": 0.138333 }, { "epoch": 0.29067572668931674, "grad_norm": 0.26536548137664795, "learning_rate": 0.00016996752330298383, "loss": 0.31683764457702634, "memory(GiB)": 91.64, "step": 3080, "token_acc": 0.8833831695856471, "train_speed(iter/s)": 0.138335 }, { "epoch": 0.29114760286900715, "grad_norm": 0.6876998543739319, "learning_rate": 0.0001698559467242725, "loss": 0.33230607509613036, "memory(GiB)": 91.64, "step": 3085, "token_acc": 0.8910573842120594, "train_speed(iter/s)": 0.138336 }, { "epoch": 0.2916194790486976, "grad_norm": 0.7065960168838501, "learning_rate": 0.000169744200035815, "loss": 0.3083855628967285, "memory(GiB)": 91.64, "step": 3090, "token_acc": 0.9105636179547756, "train_speed(iter/s)": 0.138336 }, { "epoch": 0.2920913552283881, "grad_norm": 0.3593423068523407, "learning_rate": 0.0001696322835097313, "loss": 0.30929980278015134, "memory(GiB)": 91.64, "step": 3095, "token_acc": 0.8987049028677151, "train_speed(iter/s)": 0.138338 }, { "epoch": 0.2925632314080785, "grad_norm": 0.3623899519443512, "learning_rate": 0.00016952019741855502, "loss": 0.321060037612915, "memory(GiB)": 91.64, "step": 3100, "token_acc": 0.8926689027311931, "train_speed(iter/s)": 0.13834 }, { "epoch": 0.29303510758776896, "grad_norm": 0.2958219051361084, "learning_rate": 0.0001694079420352326, "loss": 0.3155770778656006, "memory(GiB)": 91.64, "step": 3105, "token_acc": 0.8705526116578349, "train_speed(iter/s)": 0.138342 }, { "epoch": 0.29350698376745943, "grad_norm": 0.26606494188308716, "learning_rate": 0.00016929551763312283, "loss": 0.3177908420562744, "memory(GiB)": 91.64, "step": 3110, "token_acc": 0.9123818307585889, "train_speed(iter/s)": 0.138341 }, { "epoch": 0.2939788599471499, "grad_norm": 0.2279764711856842, "learning_rate": 0.00016918292448599612, "loss": 0.3189659118652344, "memory(GiB)": 91.64, "step": 3115, "token_acc": 0.902122641509434, "train_speed(iter/s)": 0.138342 }, { "epoch": 0.2944507361268403, "grad_norm": 0.3087345063686371, "learning_rate": 0.00016907016286803363, "loss": 0.3192793369293213, "memory(GiB)": 91.64, "step": 3120, "token_acc": 0.8875233769703447, "train_speed(iter/s)": 0.13834 }, { "epoch": 0.2949226123065308, "grad_norm": 0.36169660091400146, "learning_rate": 0.00016895723305382693, "loss": 0.3232297897338867, "memory(GiB)": 91.64, "step": 3125, "token_acc": 0.9030837004405287, "train_speed(iter/s)": 0.138341 }, { "epoch": 0.29539448848622124, "grad_norm": 0.4390930235385895, "learning_rate": 0.0001688441353183771, "loss": 0.31847243309020995, "memory(GiB)": 91.64, "step": 3130, "token_acc": 0.8952380952380953, "train_speed(iter/s)": 0.138342 }, { "epoch": 0.29586636466591165, "grad_norm": 0.3723819851875305, "learning_rate": 0.0001687308699370942, "loss": 0.3224406480789185, "memory(GiB)": 91.64, "step": 3135, "token_acc": 0.9039268955297605, "train_speed(iter/s)": 0.138343 }, { "epoch": 0.2963382408456021, "grad_norm": 0.36210358142852783, "learning_rate": 0.00016861743718579638, "loss": 0.3119763612747192, "memory(GiB)": 91.64, "step": 3140, "token_acc": 0.8790560471976401, "train_speed(iter/s)": 0.138345 }, { "epoch": 0.2968101170252926, "grad_norm": 0.3111349046230316, "learning_rate": 0.00016850383734070957, "loss": 0.3166660785675049, "memory(GiB)": 91.64, "step": 3145, "token_acc": 0.8888489208633094, "train_speed(iter/s)": 0.138349 }, { "epoch": 0.297281993204983, "grad_norm": 0.41482797265052795, "learning_rate": 0.00016839007067846645, "loss": 0.31719019412994387, "memory(GiB)": 91.64, "step": 3150, "token_acc": 0.8931875525651808, "train_speed(iter/s)": 0.138351 }, { "epoch": 0.29775386938467346, "grad_norm": 0.8108663558959961, "learning_rate": 0.00016827613747610597, "loss": 0.31843905448913573, "memory(GiB)": 91.64, "step": 3155, "token_acc": 0.8946395563770795, "train_speed(iter/s)": 0.138352 }, { "epoch": 0.29822574556436393, "grad_norm": 0.3246685564517975, "learning_rate": 0.0001681620380110726, "loss": 0.3225980758666992, "memory(GiB)": 91.64, "step": 3160, "token_acc": 0.904508541024923, "train_speed(iter/s)": 0.138354 }, { "epoch": 0.29869762174405434, "grad_norm": 0.26903632283210754, "learning_rate": 0.00016804777256121576, "loss": 0.31510913372039795, "memory(GiB)": 91.64, "step": 3165, "token_acc": 0.8867300537412154, "train_speed(iter/s)": 0.138354 }, { "epoch": 0.2991694979237448, "grad_norm": 0.5990667343139648, "learning_rate": 0.000167933341404789, "loss": 0.3187410831451416, "memory(GiB)": 91.64, "step": 3170, "token_acc": 0.8661604176554343, "train_speed(iter/s)": 0.138357 }, { "epoch": 0.2996413741034353, "grad_norm": 0.39676856994628906, "learning_rate": 0.00016781874482044943, "loss": 0.3103852510452271, "memory(GiB)": 91.64, "step": 3175, "token_acc": 0.9174556213017752, "train_speed(iter/s)": 0.138358 }, { "epoch": 0.3001132502831257, "grad_norm": 0.30802735686302185, "learning_rate": 0.00016770398308725698, "loss": 0.3097995281219482, "memory(GiB)": 91.64, "step": 3180, "token_acc": 0.8779197884530631, "train_speed(iter/s)": 0.138361 }, { "epoch": 0.30058512646281615, "grad_norm": 0.3544689416885376, "learning_rate": 0.00016758905648467373, "loss": 0.31096925735473635, "memory(GiB)": 91.64, "step": 3185, "token_acc": 0.8993464052287582, "train_speed(iter/s)": 0.138365 }, { "epoch": 0.3010570026425066, "grad_norm": 0.3460041582584381, "learning_rate": 0.00016747396529256326, "loss": 0.3142538070678711, "memory(GiB)": 91.64, "step": 3190, "token_acc": 0.8960473078120137, "train_speed(iter/s)": 0.138364 }, { "epoch": 0.30152887882219703, "grad_norm": 0.2710016369819641, "learning_rate": 0.00016735870979118995, "loss": 0.32133386135101316, "memory(GiB)": 91.64, "step": 3195, "token_acc": 0.8818143986683312, "train_speed(iter/s)": 0.138366 }, { "epoch": 0.3020007550018875, "grad_norm": 0.2878091335296631, "learning_rate": 0.0001672432902612183, "loss": 0.3245548248291016, "memory(GiB)": 91.64, "step": 3200, "token_acc": 0.8998724489795918, "train_speed(iter/s)": 0.138364 }, { "epoch": 0.30247263118157797, "grad_norm": 0.38925909996032715, "learning_rate": 0.0001671277069837122, "loss": 0.3146018981933594, "memory(GiB)": 91.64, "step": 3205, "token_acc": 0.8907684871918802, "train_speed(iter/s)": 0.138366 }, { "epoch": 0.3029445073612684, "grad_norm": 0.31076857447624207, "learning_rate": 0.0001670119602401344, "loss": 0.3132633209228516, "memory(GiB)": 91.64, "step": 3210, "token_acc": 0.8922895821071218, "train_speed(iter/s)": 0.13837 }, { "epoch": 0.30341638354095884, "grad_norm": 0.40293845534324646, "learning_rate": 0.00016689605031234566, "loss": 0.31130781173706057, "memory(GiB)": 91.64, "step": 3215, "token_acc": 0.9012658227848102, "train_speed(iter/s)": 0.138374 }, { "epoch": 0.3038882597206493, "grad_norm": 0.45072442293167114, "learning_rate": 0.000166779977482604, "loss": 0.31590077877044676, "memory(GiB)": 91.64, "step": 3220, "token_acc": 0.9161572052401746, "train_speed(iter/s)": 0.138376 }, { "epoch": 0.3043601359003397, "grad_norm": 0.2620558440685272, "learning_rate": 0.00016666374203356431, "loss": 0.32024450302124025, "memory(GiB)": 91.64, "step": 3225, "token_acc": 0.8846487424111015, "train_speed(iter/s)": 0.138379 }, { "epoch": 0.3048320120800302, "grad_norm": 0.6706470251083374, "learning_rate": 0.00016654734424827742, "loss": 0.3171579122543335, "memory(GiB)": 91.64, "step": 3230, "token_acc": 0.9007765314926661, "train_speed(iter/s)": 0.138382 }, { "epoch": 0.30530388825972066, "grad_norm": 0.500370442867279, "learning_rate": 0.00016643078441018938, "loss": 0.31276688575744627, "memory(GiB)": 91.64, "step": 3235, "token_acc": 0.8980530973451327, "train_speed(iter/s)": 0.138383 }, { "epoch": 0.3057757644394111, "grad_norm": 0.6752060055732727, "learning_rate": 0.000166314062803141, "loss": 0.31355462074279783, "memory(GiB)": 91.64, "step": 3240, "token_acc": 0.9082494969818914, "train_speed(iter/s)": 0.138384 }, { "epoch": 0.30624764061910154, "grad_norm": 0.7671592831611633, "learning_rate": 0.00016619717971136697, "loss": 0.3132540941238403, "memory(GiB)": 91.64, "step": 3245, "token_acc": 0.8937125748502994, "train_speed(iter/s)": 0.138384 }, { "epoch": 0.306719516798792, "grad_norm": 0.46400538086891174, "learning_rate": 0.00016608013541949518, "loss": 0.31029553413391114, "memory(GiB)": 91.64, "step": 3250, "token_acc": 0.8741035856573706, "train_speed(iter/s)": 0.138384 }, { "epoch": 0.30719139297848247, "grad_norm": 0.2673157751560211, "learning_rate": 0.00016596293021254612, "loss": 0.3108152151107788, "memory(GiB)": 91.64, "step": 3255, "token_acc": 0.8941227312013829, "train_speed(iter/s)": 0.138388 }, { "epoch": 0.3076632691581729, "grad_norm": 0.32742491364479065, "learning_rate": 0.00016584556437593213, "loss": 0.3182346343994141, "memory(GiB)": 91.64, "step": 3260, "token_acc": 0.9013292433537833, "train_speed(iter/s)": 0.138392 }, { "epoch": 0.30813514533786335, "grad_norm": 0.47153425216674805, "learning_rate": 0.00016572803819545664, "loss": 0.31491539478302, "memory(GiB)": 91.64, "step": 3265, "token_acc": 0.895458440445587, "train_speed(iter/s)": 0.13839 }, { "epoch": 0.3086070215175538, "grad_norm": 0.5636700391769409, "learning_rate": 0.00016561035195731364, "loss": 0.31096749305725097, "memory(GiB)": 91.64, "step": 3270, "token_acc": 0.8844444444444445, "train_speed(iter/s)": 0.138391 }, { "epoch": 0.3090788976972442, "grad_norm": 0.24482609331607819, "learning_rate": 0.00016549250594808683, "loss": 0.31171326637268065, "memory(GiB)": 91.64, "step": 3275, "token_acc": 0.899330811754437, "train_speed(iter/s)": 0.138394 }, { "epoch": 0.3095507738769347, "grad_norm": 0.32658451795578003, "learning_rate": 0.00016537450045474894, "loss": 0.3069904804229736, "memory(GiB)": 91.64, "step": 3280, "token_acc": 0.9060240963855422, "train_speed(iter/s)": 0.138394 }, { "epoch": 0.31002265005662516, "grad_norm": 0.3008076846599579, "learning_rate": 0.00016525633576466116, "loss": 0.3172896862030029, "memory(GiB)": 91.64, "step": 3285, "token_acc": 0.8990936555891239, "train_speed(iter/s)": 0.138396 }, { "epoch": 0.31049452623631557, "grad_norm": 0.3459542393684387, "learning_rate": 0.00016513801216557226, "loss": 0.3143473148345947, "memory(GiB)": 91.64, "step": 3290, "token_acc": 0.903512157468159, "train_speed(iter/s)": 0.138398 }, { "epoch": 0.31096640241600604, "grad_norm": 0.2709652781486511, "learning_rate": 0.00016501952994561804, "loss": 0.3146618366241455, "memory(GiB)": 91.64, "step": 3295, "token_acc": 0.8988359201773836, "train_speed(iter/s)": 0.138401 }, { "epoch": 0.3114382785956965, "grad_norm": 0.31961753964424133, "learning_rate": 0.00016490088939332054, "loss": 0.3087768077850342, "memory(GiB)": 91.64, "step": 3300, "token_acc": 0.8871085214857976, "train_speed(iter/s)": 0.138401 }, { "epoch": 0.3119101547753869, "grad_norm": 0.32156920433044434, "learning_rate": 0.0001647820907975874, "loss": 0.3088067531585693, "memory(GiB)": 91.64, "step": 3305, "token_acc": 0.8805100182149362, "train_speed(iter/s)": 0.138401 }, { "epoch": 0.3123820309550774, "grad_norm": 0.37048351764678955, "learning_rate": 0.000164663134447711, "loss": 0.3027732133865356, "memory(GiB)": 91.64, "step": 3310, "token_acc": 0.9139474444123592, "train_speed(iter/s)": 0.138404 }, { "epoch": 0.31285390713476785, "grad_norm": 1.188414216041565, "learning_rate": 0.00016454402063336804, "loss": 0.30920934677124023, "memory(GiB)": 91.64, "step": 3315, "token_acc": 0.8970201577563541, "train_speed(iter/s)": 0.138405 }, { "epoch": 0.31332578331445826, "grad_norm": 0.27507731318473816, "learning_rate": 0.00016442474964461853, "loss": 0.31197390556335447, "memory(GiB)": 91.64, "step": 3320, "token_acc": 0.8936553713049747, "train_speed(iter/s)": 0.138404 }, { "epoch": 0.31379765949414873, "grad_norm": 0.21555930376052856, "learning_rate": 0.0001643053217719053, "loss": 0.31116595268249514, "memory(GiB)": 91.64, "step": 3325, "token_acc": 0.898416166029492, "train_speed(iter/s)": 0.138404 }, { "epoch": 0.3142695356738392, "grad_norm": 0.44676095247268677, "learning_rate": 0.00016418573730605322, "loss": 0.31005539894104006, "memory(GiB)": 91.64, "step": 3330, "token_acc": 0.8707945597709377, "train_speed(iter/s)": 0.138407 }, { "epoch": 0.3147414118535296, "grad_norm": 0.33203697204589844, "learning_rate": 0.00016406599653826843, "loss": 0.3174571990966797, "memory(GiB)": 91.64, "step": 3335, "token_acc": 0.8960258780036968, "train_speed(iter/s)": 0.138408 }, { "epoch": 0.3152132880332201, "grad_norm": 0.4387269914150238, "learning_rate": 0.00016394609976013778, "loss": 0.3156848430633545, "memory(GiB)": 91.64, "step": 3340, "token_acc": 0.8951187335092349, "train_speed(iter/s)": 0.138406 }, { "epoch": 0.31568516421291054, "grad_norm": 0.2519628703594208, "learning_rate": 0.00016382604726362793, "loss": 0.3058452606201172, "memory(GiB)": 91.64, "step": 3345, "token_acc": 0.8932913102206214, "train_speed(iter/s)": 0.138403 }, { "epoch": 0.316157040392601, "grad_norm": 0.4981694221496582, "learning_rate": 0.00016370583934108477, "loss": 0.31234307289123536, "memory(GiB)": 91.64, "step": 3350, "token_acc": 0.9024309024309024, "train_speed(iter/s)": 0.138404 }, { "epoch": 0.3166289165722914, "grad_norm": 0.24707676470279694, "learning_rate": 0.00016358547628523272, "loss": 0.3052816867828369, "memory(GiB)": 91.64, "step": 3355, "token_acc": 0.9038121686442817, "train_speed(iter/s)": 0.138405 }, { "epoch": 0.3171007927519819, "grad_norm": 0.3029780685901642, "learning_rate": 0.00016346495838917395, "loss": 0.30600829124450685, "memory(GiB)": 91.64, "step": 3360, "token_acc": 0.898513251454428, "train_speed(iter/s)": 0.138405 }, { "epoch": 0.31757266893167235, "grad_norm": 0.6097707152366638, "learning_rate": 0.0001633442859463876, "loss": 0.30912506580352783, "memory(GiB)": 91.64, "step": 3365, "token_acc": 0.9050100200400801, "train_speed(iter/s)": 0.138405 }, { "epoch": 0.31804454511136276, "grad_norm": 0.39238670468330383, "learning_rate": 0.00016322345925072934, "loss": 0.3076608180999756, "memory(GiB)": 91.64, "step": 3370, "token_acc": 0.8900841908325537, "train_speed(iter/s)": 0.138407 }, { "epoch": 0.31851642129105323, "grad_norm": 0.764627993106842, "learning_rate": 0.00016310247859643032, "loss": 0.3063749551773071, "memory(GiB)": 91.64, "step": 3375, "token_acc": 0.8825581395348837, "train_speed(iter/s)": 0.138408 }, { "epoch": 0.3189882974707437, "grad_norm": 0.3981649875640869, "learning_rate": 0.00016298134427809662, "loss": 0.31308808326721194, "memory(GiB)": 91.64, "step": 3380, "token_acc": 0.8966457594764112, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.3194601736504341, "grad_norm": 0.44580742716789246, "learning_rate": 0.00016286005659070857, "loss": 0.31591091156005857, "memory(GiB)": 91.64, "step": 3385, "token_acc": 0.8994082840236687, "train_speed(iter/s)": 0.138411 }, { "epoch": 0.3199320498301246, "grad_norm": 0.4073268175125122, "learning_rate": 0.00016273861582961994, "loss": 0.3123687982559204, "memory(GiB)": 91.64, "step": 3390, "token_acc": 0.888494528246081, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.32040392600981504, "grad_norm": 0.3851412534713745, "learning_rate": 0.00016261702229055725, "loss": 0.31351797580718993, "memory(GiB)": 91.64, "step": 3395, "token_acc": 0.887447539107211, "train_speed(iter/s)": 0.138414 }, { "epoch": 0.32087580218950545, "grad_norm": 0.5207167267799377, "learning_rate": 0.00016249527626961907, "loss": 0.3095412731170654, "memory(GiB)": 91.64, "step": 3400, "token_acc": 0.8894736842105263, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.3213476783691959, "grad_norm": 0.5070729851722717, "learning_rate": 0.00016237337806327532, "loss": 0.3108300447463989, "memory(GiB)": 91.64, "step": 3405, "token_acc": 0.902782433791485, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.3218195545488864, "grad_norm": 0.46263793110847473, "learning_rate": 0.0001622513279683665, "loss": 0.3083954811096191, "memory(GiB)": 91.64, "step": 3410, "token_acc": 0.8961522548613984, "train_speed(iter/s)": 0.138414 }, { "epoch": 0.3222914307285768, "grad_norm": 0.5839296579360962, "learning_rate": 0.0001621291262821029, "loss": 0.3138519287109375, "memory(GiB)": 91.64, "step": 3415, "token_acc": 0.8892988929889298, "train_speed(iter/s)": 0.138412 }, { "epoch": 0.32276330690826727, "grad_norm": 0.2847861647605896, "learning_rate": 0.00016200677330206403, "loss": 0.30753250122070314, "memory(GiB)": 91.64, "step": 3420, "token_acc": 0.9061032863849765, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.32323518308795773, "grad_norm": 0.339787095785141, "learning_rate": 0.00016188426932619784, "loss": 0.31532652378082277, "memory(GiB)": 91.64, "step": 3425, "token_acc": 0.892657793044225, "train_speed(iter/s)": 0.138412 }, { "epoch": 0.32370705926764815, "grad_norm": 0.48385143280029297, "learning_rate": 0.00016176161465281997, "loss": 0.31803653240203855, "memory(GiB)": 91.64, "step": 3430, "token_acc": 0.8876627051499717, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.3241789354473386, "grad_norm": 0.8372563719749451, "learning_rate": 0.000161638809580613, "loss": 0.30901002883911133, "memory(GiB)": 91.64, "step": 3435, "token_acc": 0.8927664974619289, "train_speed(iter/s)": 0.138414 }, { "epoch": 0.3246508116270291, "grad_norm": 0.29702985286712646, "learning_rate": 0.00016151585440862573, "loss": 0.3036106824874878, "memory(GiB)": 91.64, "step": 3440, "token_acc": 0.9132579054250073, "train_speed(iter/s)": 0.138415 }, { "epoch": 0.3251226878067195, "grad_norm": 0.42178598046302795, "learning_rate": 0.0001613927494362726, "loss": 0.3027750015258789, "memory(GiB)": 91.64, "step": 3445, "token_acc": 0.8794378698224852, "train_speed(iter/s)": 0.138414 }, { "epoch": 0.32559456398640996, "grad_norm": 0.6362924575805664, "learning_rate": 0.00016126949496333263, "loss": 0.3112166881561279, "memory(GiB)": 91.64, "step": 3450, "token_acc": 0.9103899249212885, "train_speed(iter/s)": 0.138416 }, { "epoch": 0.3260664401661004, "grad_norm": 0.5326300263404846, "learning_rate": 0.00016114609128994908, "loss": 0.31546103954315186, "memory(GiB)": 91.64, "step": 3455, "token_acc": 0.9140065146579804, "train_speed(iter/s)": 0.138418 }, { "epoch": 0.3265383163457909, "grad_norm": 0.34611740708351135, "learning_rate": 0.00016102253871662852, "loss": 0.31009516716003416, "memory(GiB)": 91.64, "step": 3460, "token_acc": 0.9055606617647058, "train_speed(iter/s)": 0.138418 }, { "epoch": 0.3270101925254813, "grad_norm": 0.27188488841056824, "learning_rate": 0.00016089883754423997, "loss": 0.31755566596984863, "memory(GiB)": 91.64, "step": 3465, "token_acc": 0.8814242526032919, "train_speed(iter/s)": 0.13842 }, { "epoch": 0.32748206870517177, "grad_norm": 0.24603348970413208, "learning_rate": 0.00016077498807401448, "loss": 0.3162256717681885, "memory(GiB)": 91.64, "step": 3470, "token_acc": 0.8958736299161831, "train_speed(iter/s)": 0.138421 }, { "epoch": 0.32795394488486224, "grad_norm": 0.3293556272983551, "learning_rate": 0.0001606509906075441, "loss": 0.3149267196655273, "memory(GiB)": 91.64, "step": 3475, "token_acc": 0.904655075715087, "train_speed(iter/s)": 0.138423 }, { "epoch": 0.32842582106455265, "grad_norm": 0.2843257784843445, "learning_rate": 0.00016052684544678138, "loss": 0.3054977893829346, "memory(GiB)": 91.64, "step": 3480, "token_acc": 0.8907051282051283, "train_speed(iter/s)": 0.138427 }, { "epoch": 0.3288976972442431, "grad_norm": 0.47237107157707214, "learning_rate": 0.00016040255289403844, "loss": 0.30878467559814454, "memory(GiB)": 91.64, "step": 3485, "token_acc": 0.9008951406649617, "train_speed(iter/s)": 0.138428 }, { "epoch": 0.3293695734239336, "grad_norm": 0.4679430425167084, "learning_rate": 0.00016027811325198637, "loss": 0.31091535091400146, "memory(GiB)": 91.64, "step": 3490, "token_acc": 0.9059921062073915, "train_speed(iter/s)": 0.138427 }, { "epoch": 0.329841449603624, "grad_norm": 0.9541602730751038, "learning_rate": 0.0001601535268236544, "loss": 0.31114275455474855, "memory(GiB)": 91.64, "step": 3495, "token_acc": 0.8911082474226805, "train_speed(iter/s)": 0.138429 }, { "epoch": 0.33031332578331446, "grad_norm": 1.2509143352508545, "learning_rate": 0.00016002879391242928, "loss": 0.3070805311203003, "memory(GiB)": 91.64, "step": 3500, "token_acc": 0.8869294605809128, "train_speed(iter/s)": 0.13843 }, { "epoch": 0.3307852019630049, "grad_norm": 0.5359411835670471, "learning_rate": 0.00015990391482205443, "loss": 0.30284867286682127, "memory(GiB)": 91.64, "step": 3505, "token_acc": 0.8969344608879493, "train_speed(iter/s)": 0.138433 }, { "epoch": 0.33125707814269534, "grad_norm": 0.2854415476322174, "learning_rate": 0.00015977888985662918, "loss": 0.318118691444397, "memory(GiB)": 91.64, "step": 3510, "token_acc": 0.8925649235147635, "train_speed(iter/s)": 0.138433 }, { "epoch": 0.3317289543223858, "grad_norm": 0.3062131404876709, "learning_rate": 0.0001596537193206082, "loss": 0.30570647716522215, "memory(GiB)": 91.64, "step": 3515, "token_acc": 0.8835709436524101, "train_speed(iter/s)": 0.138437 }, { "epoch": 0.33220083050207627, "grad_norm": 0.35242435336112976, "learning_rate": 0.00015952840351880058, "loss": 0.30632739067077636, "memory(GiB)": 91.64, "step": 3520, "token_acc": 0.8913419913419913, "train_speed(iter/s)": 0.138435 }, { "epoch": 0.3326727066817667, "grad_norm": 0.3083515167236328, "learning_rate": 0.00015940294275636912, "loss": 0.3100026845932007, "memory(GiB)": 91.64, "step": 3525, "token_acc": 0.8972868217054264, "train_speed(iter/s)": 0.138437 }, { "epoch": 0.33314458286145715, "grad_norm": 0.38937556743621826, "learning_rate": 0.00015927733733882968, "loss": 0.3004534006118774, "memory(GiB)": 91.64, "step": 3530, "token_acc": 0.8906178489702518, "train_speed(iter/s)": 0.138442 }, { "epoch": 0.3336164590411476, "grad_norm": 0.4373650550842285, "learning_rate": 0.0001591515875720504, "loss": 0.3044088363647461, "memory(GiB)": 91.64, "step": 3535, "token_acc": 0.8975325565455792, "train_speed(iter/s)": 0.138446 }, { "epoch": 0.33408833522083803, "grad_norm": 0.3125515282154083, "learning_rate": 0.00015902569376225083, "loss": 0.30311577320098876, "memory(GiB)": 91.64, "step": 3540, "token_acc": 0.9021792966815255, "train_speed(iter/s)": 0.138448 }, { "epoch": 0.3345602114005285, "grad_norm": 0.6560575366020203, "learning_rate": 0.00015889965621600138, "loss": 0.309435510635376, "memory(GiB)": 91.64, "step": 3545, "token_acc": 0.9111111111111111, "train_speed(iter/s)": 0.13845 }, { "epoch": 0.33503208758021896, "grad_norm": 0.2778153121471405, "learning_rate": 0.00015877347524022247, "loss": 0.30498385429382324, "memory(GiB)": 91.64, "step": 3550, "token_acc": 0.8926521239954076, "train_speed(iter/s)": 0.138452 }, { "epoch": 0.3355039637599094, "grad_norm": 0.513312041759491, "learning_rate": 0.00015864715114218372, "loss": 0.30378971099853513, "memory(GiB)": 91.64, "step": 3555, "token_acc": 0.9048288795124239, "train_speed(iter/s)": 0.138454 }, { "epoch": 0.33597583993959984, "grad_norm": 0.9006767272949219, "learning_rate": 0.00015852068422950337, "loss": 0.30725460052490233, "memory(GiB)": 91.64, "step": 3560, "token_acc": 0.8941216913028532, "train_speed(iter/s)": 0.138452 }, { "epoch": 0.3364477161192903, "grad_norm": 0.65769362449646, "learning_rate": 0.00015839407481014738, "loss": 0.3100194692611694, "memory(GiB)": 91.64, "step": 3565, "token_acc": 0.892912571132954, "train_speed(iter/s)": 0.138455 }, { "epoch": 0.3369195922989808, "grad_norm": 0.7264442443847656, "learning_rate": 0.0001582673231924287, "loss": 0.30505690574645994, "memory(GiB)": 91.64, "step": 3570, "token_acc": 0.8920722135007849, "train_speed(iter/s)": 0.138456 }, { "epoch": 0.3373914684786712, "grad_norm": 0.3359721004962921, "learning_rate": 0.0001581404296850067, "loss": 0.3063464403152466, "memory(GiB)": 91.64, "step": 3575, "token_acc": 0.9028258362168397, "train_speed(iter/s)": 0.138459 }, { "epoch": 0.33786334465836165, "grad_norm": 0.49838733673095703, "learning_rate": 0.0001580133945968861, "loss": 0.3045190334320068, "memory(GiB)": 91.64, "step": 3580, "token_acc": 0.8754628071356446, "train_speed(iter/s)": 0.138459 }, { "epoch": 0.3383352208380521, "grad_norm": 0.31267309188842773, "learning_rate": 0.00015788621823741646, "loss": 0.304337739944458, "memory(GiB)": 91.64, "step": 3585, "token_acc": 0.889163322012967, "train_speed(iter/s)": 0.138462 }, { "epoch": 0.33880709701774253, "grad_norm": 0.41228607296943665, "learning_rate": 0.0001577589009162914, "loss": 0.3088655948638916, "memory(GiB)": 91.64, "step": 3590, "token_acc": 0.8918482647296206, "train_speed(iter/s)": 0.138463 }, { "epoch": 0.339278973197433, "grad_norm": 0.5077301263809204, "learning_rate": 0.0001576314429435477, "loss": 0.3072696924209595, "memory(GiB)": 91.64, "step": 3595, "token_acc": 0.8791666666666667, "train_speed(iter/s)": 0.138468 }, { "epoch": 0.33975084937712347, "grad_norm": 0.368539035320282, "learning_rate": 0.00015750384462956477, "loss": 0.3104322671890259, "memory(GiB)": 91.64, "step": 3600, "token_acc": 0.8836734693877552, "train_speed(iter/s)": 0.138469 }, { "epoch": 0.3402227255568139, "grad_norm": 0.5741416215896606, "learning_rate": 0.00015737610628506368, "loss": 0.3075693607330322, "memory(GiB)": 91.64, "step": 3605, "token_acc": 0.8923333333333333, "train_speed(iter/s)": 0.138469 }, { "epoch": 0.34069460173650434, "grad_norm": 0.5280699133872986, "learning_rate": 0.00015724822822110656, "loss": 0.3020766735076904, "memory(GiB)": 91.64, "step": 3610, "token_acc": 0.91324028668427, "train_speed(iter/s)": 0.138473 }, { "epoch": 0.3411664779161948, "grad_norm": 0.65427565574646, "learning_rate": 0.00015712021074909573, "loss": 0.3077415466308594, "memory(GiB)": 91.64, "step": 3615, "token_acc": 0.889433962264151, "train_speed(iter/s)": 0.138475 }, { "epoch": 0.3416383540958852, "grad_norm": 0.25699713826179504, "learning_rate": 0.00015699205418077302, "loss": 0.30338454246520996, "memory(GiB)": 91.64, "step": 3620, "token_acc": 0.8946078431372549, "train_speed(iter/s)": 0.138476 }, { "epoch": 0.3421102302755757, "grad_norm": 0.382522314786911, "learning_rate": 0.00015686375882821885, "loss": 0.30158405303955077, "memory(GiB)": 91.64, "step": 3625, "token_acc": 0.9050859598853869, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.34258210645526616, "grad_norm": 0.5537758469581604, "learning_rate": 0.00015673532500385192, "loss": 0.31232216358184817, "memory(GiB)": 91.64, "step": 3630, "token_acc": 0.8968973747016706, "train_speed(iter/s)": 0.138476 }, { "epoch": 0.34305398263495657, "grad_norm": 0.4347175061702728, "learning_rate": 0.0001566067530204278, "loss": 0.30861697196960447, "memory(GiB)": 91.64, "step": 3635, "token_acc": 0.9140995260663507, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.34352585881464703, "grad_norm": 1.3980445861816406, "learning_rate": 0.00015647804319103862, "loss": 0.3118103265762329, "memory(GiB)": 91.64, "step": 3640, "token_acc": 0.891973445986723, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.3439977349943375, "grad_norm": 0.7892172336578369, "learning_rate": 0.00015634919582911225, "loss": 0.2955352783203125, "memory(GiB)": 91.64, "step": 3645, "token_acc": 0.8974358974358975, "train_speed(iter/s)": 0.138472 }, { "epoch": 0.3444696111740279, "grad_norm": 0.25731360912323, "learning_rate": 0.0001562202112484114, "loss": 0.30662920475006106, "memory(GiB)": 91.64, "step": 3650, "token_acc": 0.9029054799558661, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.3449414873537184, "grad_norm": 0.25891074538230896, "learning_rate": 0.00015609108976303283, "loss": 0.30108160972595216, "memory(GiB)": 91.64, "step": 3655, "token_acc": 0.9025487256371814, "train_speed(iter/s)": 0.138475 }, { "epoch": 0.34541336353340885, "grad_norm": 0.9629558324813843, "learning_rate": 0.00015596183168740694, "loss": 0.30233757495880126, "memory(GiB)": 91.64, "step": 3660, "token_acc": 0.8685909608811242, "train_speed(iter/s)": 0.138477 }, { "epoch": 0.34588523971309926, "grad_norm": 0.5921810865402222, "learning_rate": 0.00015583243733629655, "loss": 0.3014842510223389, "memory(GiB)": 91.64, "step": 3665, "token_acc": 0.9083843617522374, "train_speed(iter/s)": 0.138475 }, { "epoch": 0.3463571158927897, "grad_norm": 0.5052902698516846, "learning_rate": 0.00015570290702479638, "loss": 0.29955530166625977, "memory(GiB)": 91.64, "step": 3670, "token_acc": 0.8916758544652701, "train_speed(iter/s)": 0.138475 }, { "epoch": 0.3468289920724802, "grad_norm": 0.5260510444641113, "learning_rate": 0.00015557324106833223, "loss": 0.30066709518432616, "memory(GiB)": 91.64, "step": 3675, "token_acc": 0.9076858813700919, "train_speed(iter/s)": 0.138477 }, { "epoch": 0.34730086825217066, "grad_norm": 0.33802786469459534, "learning_rate": 0.00015544343978266025, "loss": 0.2984708547592163, "memory(GiB)": 91.64, "step": 3680, "token_acc": 0.901496693351897, "train_speed(iter/s)": 0.138479 }, { "epoch": 0.34777274443186107, "grad_norm": 0.32484322786331177, "learning_rate": 0.00015531350348386606, "loss": 0.3036123037338257, "memory(GiB)": 91.64, "step": 3685, "token_acc": 0.9054111631870473, "train_speed(iter/s)": 0.138482 }, { "epoch": 0.34824462061155154, "grad_norm": 0.3622114956378937, "learning_rate": 0.00015518343248836417, "loss": 0.29522085189819336, "memory(GiB)": 91.64, "step": 3690, "token_acc": 0.9036262659261679, "train_speed(iter/s)": 0.13848 }, { "epoch": 0.348716496791242, "grad_norm": 0.4862135946750641, "learning_rate": 0.000155053227112897, "loss": 0.2977942705154419, "memory(GiB)": 91.64, "step": 3695, "token_acc": 0.9008559201141226, "train_speed(iter/s)": 0.138482 }, { "epoch": 0.3491883729709324, "grad_norm": 0.5323036313056946, "learning_rate": 0.00015492288767453424, "loss": 0.31482396125793455, "memory(GiB)": 91.64, "step": 3700, "token_acc": 0.9135959339263025, "train_speed(iter/s)": 0.138484 }, { "epoch": 0.3496602491506229, "grad_norm": 0.45398956537246704, "learning_rate": 0.00015479241449067207, "loss": 0.3046382427215576, "memory(GiB)": 91.64, "step": 3705, "token_acc": 0.8907425580634609, "train_speed(iter/s)": 0.138487 }, { "epoch": 0.35013212533031335, "grad_norm": 0.24290789663791656, "learning_rate": 0.00015466180787903228, "loss": 0.30463085174560545, "memory(GiB)": 91.64, "step": 3710, "token_acc": 0.8997484728710026, "train_speed(iter/s)": 0.138489 }, { "epoch": 0.35060400151000376, "grad_norm": 0.6027255058288574, "learning_rate": 0.00015453106815766169, "loss": 0.3037456512451172, "memory(GiB)": 91.64, "step": 3715, "token_acc": 0.8926788685524126, "train_speed(iter/s)": 0.13849 }, { "epoch": 0.3510758776896942, "grad_norm": 0.3494771718978882, "learning_rate": 0.00015440019564493112, "loss": 0.30643885135650634, "memory(GiB)": 91.64, "step": 3720, "token_acc": 0.8992887624466572, "train_speed(iter/s)": 0.138489 }, { "epoch": 0.3515477538693847, "grad_norm": 0.45546987652778625, "learning_rate": 0.00015426919065953496, "loss": 0.3024001598358154, "memory(GiB)": 91.64, "step": 3725, "token_acc": 0.9199255121042831, "train_speed(iter/s)": 0.138493 }, { "epoch": 0.3520196300490751, "grad_norm": 0.2523688077926636, "learning_rate": 0.00015413805352048997, "loss": 0.30108323097229006, "memory(GiB)": 91.64, "step": 3730, "token_acc": 0.9001024590163934, "train_speed(iter/s)": 0.138496 }, { "epoch": 0.35249150622876557, "grad_norm": 0.4431523382663727, "learning_rate": 0.00015400678454713487, "loss": 0.3021445989608765, "memory(GiB)": 91.64, "step": 3735, "token_acc": 0.9001941747572816, "train_speed(iter/s)": 0.138497 }, { "epoch": 0.35296338240845604, "grad_norm": 0.6595969200134277, "learning_rate": 0.00015387538405912937, "loss": 0.3013103485107422, "memory(GiB)": 91.64, "step": 3740, "token_acc": 0.9106609099966788, "train_speed(iter/s)": 0.138499 }, { "epoch": 0.35343525858814645, "grad_norm": 0.2660990059375763, "learning_rate": 0.00015374385237645343, "loss": 0.29910807609558104, "memory(GiB)": 91.64, "step": 3745, "token_acc": 0.8930566640063847, "train_speed(iter/s)": 0.138502 }, { "epoch": 0.3539071347678369, "grad_norm": 0.30778956413269043, "learning_rate": 0.00015361218981940647, "loss": 0.30529217720031737, "memory(GiB)": 91.64, "step": 3750, "token_acc": 0.8790162633875446, "train_speed(iter/s)": 0.138502 }, { "epoch": 0.3543790109475274, "grad_norm": 0.3072812557220459, "learning_rate": 0.0001534803967086067, "loss": 0.2946580410003662, "memory(GiB)": 91.64, "step": 3755, "token_acc": 0.8934729064039408, "train_speed(iter/s)": 0.138503 }, { "epoch": 0.3548508871272178, "grad_norm": 0.3878733515739441, "learning_rate": 0.00015334847336499015, "loss": 0.30007166862487794, "memory(GiB)": 91.64, "step": 3760, "token_acc": 0.9072721498888536, "train_speed(iter/s)": 0.138504 }, { "epoch": 0.35532276330690826, "grad_norm": 0.2825472950935364, "learning_rate": 0.00015321642010981, "loss": 0.2999789953231812, "memory(GiB)": 91.64, "step": 3765, "token_acc": 0.9049180327868852, "train_speed(iter/s)": 0.138508 }, { "epoch": 0.35579463948659873, "grad_norm": 0.3111068904399872, "learning_rate": 0.0001530842372646358, "loss": 0.29370770454406736, "memory(GiB)": 91.64, "step": 3770, "token_acc": 0.9016277423920736, "train_speed(iter/s)": 0.138503 }, { "epoch": 0.35626651566628914, "grad_norm": 0.42809635400772095, "learning_rate": 0.00015295192515135274, "loss": 0.29727604389190676, "memory(GiB)": 91.64, "step": 3775, "token_acc": 0.9122373300370828, "train_speed(iter/s)": 0.138504 }, { "epoch": 0.3567383918459796, "grad_norm": 0.8906384110450745, "learning_rate": 0.0001528194840921607, "loss": 0.3030253887176514, "memory(GiB)": 91.64, "step": 3780, "token_acc": 0.8868296529968455, "train_speed(iter/s)": 0.138503 }, { "epoch": 0.3572102680256701, "grad_norm": 0.7155807018280029, "learning_rate": 0.00015268691440957355, "loss": 0.29479668140411375, "memory(GiB)": 91.64, "step": 3785, "token_acc": 0.9038887132469174, "train_speed(iter/s)": 0.138504 }, { "epoch": 0.3576821442053605, "grad_norm": 0.4430237114429474, "learning_rate": 0.0001525542164264185, "loss": 0.30679366588592527, "memory(GiB)": 91.64, "step": 3790, "token_acc": 0.9016441573693482, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.35815402038505095, "grad_norm": 0.8023039102554321, "learning_rate": 0.0001524213904658351, "loss": 0.3044567108154297, "memory(GiB)": 91.64, "step": 3795, "token_acc": 0.8872512896094326, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.3586258965647414, "grad_norm": 0.6200763583183289, "learning_rate": 0.00015228843685127452, "loss": 0.30850720405578613, "memory(GiB)": 91.64, "step": 3800, "token_acc": 0.8852781880846874, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.3590977727444319, "grad_norm": 0.7332703471183777, "learning_rate": 0.00015215535590649886, "loss": 0.30007758140563967, "memory(GiB)": 91.64, "step": 3805, "token_acc": 0.9086795557383792, "train_speed(iter/s)": 0.138506 }, { "epoch": 0.3595696489241223, "grad_norm": 0.5468603372573853, "learning_rate": 0.00015202214795558022, "loss": 0.2976674556732178, "memory(GiB)": 91.64, "step": 3810, "token_acc": 0.8955123911587408, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.36004152510381277, "grad_norm": 0.4006412625312805, "learning_rate": 0.00015188881332290003, "loss": 0.2963697910308838, "memory(GiB)": 91.64, "step": 3815, "token_acc": 0.9008033531260915, "train_speed(iter/s)": 0.138505 }, { "epoch": 0.36051340128350323, "grad_norm": 0.2284918576478958, "learning_rate": 0.00015175535233314823, "loss": 0.29793496131896974, "memory(GiB)": 91.64, "step": 3820, "token_acc": 0.8894009216589862, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.36098527746319364, "grad_norm": 0.8408511877059937, "learning_rate": 0.00015162176531132235, "loss": 0.31119661331176757, "memory(GiB)": 91.64, "step": 3825, "token_acc": 0.8848153926157046, "train_speed(iter/s)": 0.138506 }, { "epoch": 0.3614571536428841, "grad_norm": 0.23569151759147644, "learning_rate": 0.00015148805258272696, "loss": 0.3092831611633301, "memory(GiB)": 91.64, "step": 3830, "token_acc": 0.8938014737754659, "train_speed(iter/s)": 0.138509 }, { "epoch": 0.3619290298225746, "grad_norm": 0.3385840356349945, "learning_rate": 0.0001513542144729726, "loss": 0.29678735733032224, "memory(GiB)": 91.64, "step": 3835, "token_acc": 0.8935003915426781, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.362400906002265, "grad_norm": 0.35799628496170044, "learning_rate": 0.00015122025130797536, "loss": 0.29270572662353517, "memory(GiB)": 91.64, "step": 3840, "token_acc": 0.9016266460108443, "train_speed(iter/s)": 0.138508 }, { "epoch": 0.36287278218195546, "grad_norm": 0.34791991114616394, "learning_rate": 0.00015108616341395558, "loss": 0.2929104804992676, "memory(GiB)": 91.64, "step": 3845, "token_acc": 0.9117132867132867, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.3633446583616459, "grad_norm": 0.2428075075149536, "learning_rate": 0.0001509519511174375, "loss": 0.29956960678100586, "memory(GiB)": 91.64, "step": 3850, "token_acc": 0.9037393557941503, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.36381653454133633, "grad_norm": 0.38686317205429077, "learning_rate": 0.00015081761474524828, "loss": 0.29664180278778074, "memory(GiB)": 91.64, "step": 3855, "token_acc": 0.9018382352941177, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.3642884107210268, "grad_norm": 0.6946428418159485, "learning_rate": 0.00015068315462451722, "loss": 0.297865891456604, "memory(GiB)": 91.64, "step": 3860, "token_acc": 0.9015151515151515, "train_speed(iter/s)": 0.138509 }, { "epoch": 0.36476028690071727, "grad_norm": 0.24777008593082428, "learning_rate": 0.00015054857108267496, "loss": 0.2987982749938965, "memory(GiB)": 91.64, "step": 3865, "token_acc": 0.8901398839986353, "train_speed(iter/s)": 0.138511 }, { "epoch": 0.3652321630804077, "grad_norm": 0.5574231147766113, "learning_rate": 0.00015041386444745268, "loss": 0.3010772705078125, "memory(GiB)": 91.64, "step": 3870, "token_acc": 0.8945620589456206, "train_speed(iter/s)": 0.138511 }, { "epoch": 0.36570403926009815, "grad_norm": 0.648216962814331, "learning_rate": 0.00015027903504688127, "loss": 0.3043731927871704, "memory(GiB)": 91.64, "step": 3875, "token_acc": 0.9052585832246849, "train_speed(iter/s)": 0.138513 }, { "epoch": 0.3661759154397886, "grad_norm": 1.228064775466919, "learning_rate": 0.00015014408320929062, "loss": 0.30399317741394044, "memory(GiB)": 91.64, "step": 3880, "token_acc": 0.9066859066859067, "train_speed(iter/s)": 0.138512 }, { "epoch": 0.366647791619479, "grad_norm": 0.36719319224357605, "learning_rate": 0.00015000900926330886, "loss": 0.3041665077209473, "memory(GiB)": 91.64, "step": 3885, "token_acc": 0.89, "train_speed(iter/s)": 0.138513 }, { "epoch": 0.3671196677991695, "grad_norm": 0.5185071229934692, "learning_rate": 0.0001498738135378613, "loss": 0.2928229570388794, "memory(GiB)": 91.64, "step": 3890, "token_acc": 0.9091915836101883, "train_speed(iter/s)": 0.138514 }, { "epoch": 0.36759154397885996, "grad_norm": 0.3124889135360718, "learning_rate": 0.00014973849636216993, "loss": 0.3014270067214966, "memory(GiB)": 91.64, "step": 3895, "token_acc": 0.908899420747762, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.36806342015855037, "grad_norm": 0.2866859436035156, "learning_rate": 0.0001496030580657524, "loss": 0.2979059934616089, "memory(GiB)": 91.64, "step": 3900, "token_acc": 0.9056129572745043, "train_speed(iter/s)": 0.138516 }, { "epoch": 0.36853529633824084, "grad_norm": 0.725142776966095, "learning_rate": 0.00014946749897842135, "loss": 0.30335102081298826, "memory(GiB)": 91.64, "step": 3905, "token_acc": 0.9037496309418365, "train_speed(iter/s)": 0.138514 }, { "epoch": 0.3690071725179313, "grad_norm": 0.7367982268333435, "learning_rate": 0.0001493318194302836, "loss": 0.3035327911376953, "memory(GiB)": 91.64, "step": 3910, "token_acc": 0.9125295508274232, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.36947904869762177, "grad_norm": 0.31670787930488586, "learning_rate": 0.00014919601975173924, "loss": 0.29859652519226076, "memory(GiB)": 91.64, "step": 3915, "token_acc": 0.9066422594142259, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.3699509248773122, "grad_norm": 0.6017026305198669, "learning_rate": 0.00014906010027348096, "loss": 0.3074479579925537, "memory(GiB)": 91.64, "step": 3920, "token_acc": 0.8810875410815656, "train_speed(iter/s)": 0.138514 }, { "epoch": 0.37042280105700265, "grad_norm": 0.5601099133491516, "learning_rate": 0.00014892406132649316, "loss": 0.2999934434890747, "memory(GiB)": 91.64, "step": 3925, "token_acc": 0.8974358974358975, "train_speed(iter/s)": 0.138517 }, { "epoch": 0.3708946772366931, "grad_norm": 0.5873445868492126, "learning_rate": 0.00014878790324205108, "loss": 0.2997703552246094, "memory(GiB)": 91.64, "step": 3930, "token_acc": 0.913626209977662, "train_speed(iter/s)": 0.138516 }, { "epoch": 0.3713665534163835, "grad_norm": 0.27552977204322815, "learning_rate": 0.00014865162635172024, "loss": 0.3029902935028076, "memory(GiB)": 91.64, "step": 3935, "token_acc": 0.906828119744366, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.371838429596074, "grad_norm": 0.35505324602127075, "learning_rate": 0.00014851523098735535, "loss": 0.30098762512207033, "memory(GiB)": 91.64, "step": 3940, "token_acc": 0.8870905587668594, "train_speed(iter/s)": 0.138517 }, { "epoch": 0.37231030577576446, "grad_norm": 0.9836472868919373, "learning_rate": 0.00014837871748109963, "loss": 0.3018435001373291, "memory(GiB)": 91.64, "step": 3945, "token_acc": 0.8946090335114133, "train_speed(iter/s)": 0.138517 }, { "epoch": 0.37278218195545487, "grad_norm": 0.26430022716522217, "learning_rate": 0.00014824208616538405, "loss": 0.2975289344787598, "memory(GiB)": 91.64, "step": 3950, "token_acc": 0.9091860769432086, "train_speed(iter/s)": 0.138518 }, { "epoch": 0.37325405813514534, "grad_norm": 0.6483102440834045, "learning_rate": 0.00014810533737292646, "loss": 0.29604153633117675, "memory(GiB)": 91.64, "step": 3955, "token_acc": 0.882892606583918, "train_speed(iter/s)": 0.138519 }, { "epoch": 0.3737259343148358, "grad_norm": 0.4567039906978607, "learning_rate": 0.0001479684714367307, "loss": 0.2948923587799072, "memory(GiB)": 91.64, "step": 3960, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.138519 }, { "epoch": 0.3741978104945262, "grad_norm": 0.232547327876091, "learning_rate": 0.00014783148869008592, "loss": 0.2958275079727173, "memory(GiB)": 91.64, "step": 3965, "token_acc": 0.8747252747252747, "train_speed(iter/s)": 0.13852 }, { "epoch": 0.3746696866742167, "grad_norm": 0.34699806571006775, "learning_rate": 0.0001476943894665658, "loss": 0.2959395408630371, "memory(GiB)": 91.64, "step": 3970, "token_acc": 0.8965679360601787, "train_speed(iter/s)": 0.138519 }, { "epoch": 0.37514156285390715, "grad_norm": 0.41914603114128113, "learning_rate": 0.00014755717410002748, "loss": 0.3048529624938965, "memory(GiB)": 91.64, "step": 3975, "token_acc": 0.9008640291041382, "train_speed(iter/s)": 0.138521 }, { "epoch": 0.37561343903359756, "grad_norm": 0.30545568466186523, "learning_rate": 0.00014741984292461117, "loss": 0.300109338760376, "memory(GiB)": 91.64, "step": 3980, "token_acc": 0.9146689497716894, "train_speed(iter/s)": 0.138523 }, { "epoch": 0.37608531521328803, "grad_norm": 0.7996503114700317, "learning_rate": 0.00014728239627473884, "loss": 0.3002124309539795, "memory(GiB)": 91.64, "step": 3985, "token_acc": 0.9075589792970631, "train_speed(iter/s)": 0.138524 }, { "epoch": 0.3765571913929785, "grad_norm": 0.601633608341217, "learning_rate": 0.00014714483448511384, "loss": 0.30334570407867434, "memory(GiB)": 91.64, "step": 3990, "token_acc": 0.906820723071189, "train_speed(iter/s)": 0.138525 }, { "epoch": 0.3770290675726689, "grad_norm": 0.23861028254032135, "learning_rate": 0.00014700715789071978, "loss": 0.3003624200820923, "memory(GiB)": 91.64, "step": 3995, "token_acc": 0.904647983595352, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.3775009437523594, "grad_norm": 0.46522191166877747, "learning_rate": 0.00014686936682681994, "loss": 0.2874744415283203, "memory(GiB)": 91.64, "step": 4000, "token_acc": 0.8961826614341777, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.37797281993204984, "grad_norm": 0.4399687945842743, "learning_rate": 0.0001467314616289563, "loss": 0.2904350280761719, "memory(GiB)": 91.64, "step": 4005, "token_acc": 0.8935816428333888, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.37844469611174025, "grad_norm": 0.5468761920928955, "learning_rate": 0.00014659344263294875, "loss": 0.2920623779296875, "memory(GiB)": 91.64, "step": 4010, "token_acc": 0.8952150211992732, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.3789165722914307, "grad_norm": 0.31320616602897644, "learning_rate": 0.00014645531017489432, "loss": 0.313277530670166, "memory(GiB)": 91.64, "step": 4015, "token_acc": 0.885201793721973, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.3793884484711212, "grad_norm": 0.230007141828537, "learning_rate": 0.00014631706459116637, "loss": 0.30183398723602295, "memory(GiB)": 91.64, "step": 4020, "token_acc": 0.8790294627383015, "train_speed(iter/s)": 0.13853 }, { "epoch": 0.37986032465081165, "grad_norm": 0.24103610217571259, "learning_rate": 0.00014617870621841375, "loss": 0.2933482646942139, "memory(GiB)": 91.64, "step": 4025, "token_acc": 0.8915499322187076, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.38033220083050207, "grad_norm": 0.6546054482460022, "learning_rate": 0.0001460402353935598, "loss": 0.2997136116027832, "memory(GiB)": 91.64, "step": 4030, "token_acc": 0.8830313014827018, "train_speed(iter/s)": 0.13853 }, { "epoch": 0.38080407701019253, "grad_norm": 0.5257068276405334, "learning_rate": 0.0001459016524538019, "loss": 0.3064009428024292, "memory(GiB)": 91.64, "step": 4035, "token_acc": 0.8834586466165414, "train_speed(iter/s)": 0.13853 }, { "epoch": 0.381275953189883, "grad_norm": 0.7437881231307983, "learning_rate": 0.0001457629577366104, "loss": 0.30060720443725586, "memory(GiB)": 91.64, "step": 4040, "token_acc": 0.9072522392372147, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.3817478293695734, "grad_norm": 0.6455546617507935, "learning_rate": 0.0001456241515797278, "loss": 0.298064136505127, "memory(GiB)": 91.64, "step": 4045, "token_acc": 0.9005726184279021, "train_speed(iter/s)": 0.138531 }, { "epoch": 0.3822197055492639, "grad_norm": 0.5941305756568909, "learning_rate": 0.00014548523432116785, "loss": 0.30365958213806155, "memory(GiB)": 91.64, "step": 4050, "token_acc": 0.904631217838765, "train_speed(iter/s)": 0.138532 }, { "epoch": 0.38269158172895434, "grad_norm": 0.7511333227157593, "learning_rate": 0.0001453462062992152, "loss": 0.2938352108001709, "memory(GiB)": 91.64, "step": 4055, "token_acc": 0.9002293577981652, "train_speed(iter/s)": 0.138532 }, { "epoch": 0.38316345790864476, "grad_norm": 0.2584506571292877, "learning_rate": 0.0001452070678524239, "loss": 0.2902865171432495, "memory(GiB)": 91.64, "step": 4060, "token_acc": 0.8862512363996043, "train_speed(iter/s)": 0.138534 }, { "epoch": 0.3836353340883352, "grad_norm": 0.38080471754074097, "learning_rate": 0.000145067819319617, "loss": 0.30122551918029783, "memory(GiB)": 91.64, "step": 4065, "token_acc": 0.9068033550792172, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.3841072102680257, "grad_norm": 0.3039911687374115, "learning_rate": 0.0001449284610398857, "loss": 0.29466953277587893, "memory(GiB)": 91.64, "step": 4070, "token_acc": 0.9058471454880295, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.3845790864477161, "grad_norm": 0.34186533093452454, "learning_rate": 0.00014478899335258836, "loss": 0.29807510375976565, "memory(GiB)": 91.64, "step": 4075, "token_acc": 0.885883347421809, "train_speed(iter/s)": 0.138539 }, { "epoch": 0.38505096262740657, "grad_norm": 0.6868099570274353, "learning_rate": 0.00014464941659734977, "loss": 0.29289746284484863, "memory(GiB)": 91.64, "step": 4080, "token_acc": 0.8864503816793893, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.38552283880709703, "grad_norm": 0.4959952235221863, "learning_rate": 0.00014450973111406037, "loss": 0.30514447689056395, "memory(GiB)": 91.64, "step": 4085, "token_acc": 0.8926809210526315, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.38599471498678745, "grad_norm": 0.5920107364654541, "learning_rate": 0.00014436993724287534, "loss": 0.30403847694396974, "memory(GiB)": 91.64, "step": 4090, "token_acc": 0.893070044709389, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.3864665911664779, "grad_norm": 0.2796635925769806, "learning_rate": 0.00014423003532421376, "loss": 0.2975569248199463, "memory(GiB)": 91.64, "step": 4095, "token_acc": 0.9021779254337394, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.3869384673461684, "grad_norm": 0.2953856289386749, "learning_rate": 0.00014409002569875794, "loss": 0.29867355823516845, "memory(GiB)": 91.64, "step": 4100, "token_acc": 0.910139030179722, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.3874103435258588, "grad_norm": 0.42904919385910034, "learning_rate": 0.00014394990870745234, "loss": 0.2972731590270996, "memory(GiB)": 91.64, "step": 4105, "token_acc": 0.8958496476115897, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.38788221970554926, "grad_norm": 0.3960139751434326, "learning_rate": 0.0001438096846915029, "loss": 0.2955931663513184, "memory(GiB)": 91.64, "step": 4110, "token_acc": 0.8847637415621986, "train_speed(iter/s)": 0.138539 }, { "epoch": 0.3883540958852397, "grad_norm": 0.46903130412101746, "learning_rate": 0.00014366935399237626, "loss": 0.2966940402984619, "memory(GiB)": 91.64, "step": 4115, "token_acc": 0.914792603698151, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.38882597206493014, "grad_norm": 0.35994887351989746, "learning_rate": 0.00014352891695179878, "loss": 0.29332523345947265, "memory(GiB)": 91.64, "step": 4120, "token_acc": 0.8934058898847631, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.3892978482446206, "grad_norm": 0.4265725314617157, "learning_rate": 0.00014338837391175582, "loss": 0.29866001605987547, "memory(GiB)": 91.64, "step": 4125, "token_acc": 0.8938466025080198, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.38976972442431107, "grad_norm": 0.41101741790771484, "learning_rate": 0.0001432477252144908, "loss": 0.2899683952331543, "memory(GiB)": 91.64, "step": 4130, "token_acc": 0.9041523571651576, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.39024160060400154, "grad_norm": 0.4724404215812683, "learning_rate": 0.00014310697120250448, "loss": 0.30640535354614257, "memory(GiB)": 91.64, "step": 4135, "token_acc": 0.8994540491355778, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.39071347678369195, "grad_norm": 0.21818110346794128, "learning_rate": 0.0001429661122185541, "loss": 0.2890320301055908, "memory(GiB)": 91.64, "step": 4140, "token_acc": 0.9128375177640928, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.3911853529633824, "grad_norm": 0.2800583243370056, "learning_rate": 0.00014282514860565246, "loss": 0.3026628017425537, "memory(GiB)": 91.64, "step": 4145, "token_acc": 0.8859737638748738, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.3916572291430729, "grad_norm": 0.3000319302082062, "learning_rate": 0.00014268408070706713, "loss": 0.2907330274581909, "memory(GiB)": 91.64, "step": 4150, "token_acc": 0.8899253731343284, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.3921291053227633, "grad_norm": 0.21889916062355042, "learning_rate": 0.00014254290886631977, "loss": 0.2907184362411499, "memory(GiB)": 91.64, "step": 4155, "token_acc": 0.9149686520376176, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.39260098150245376, "grad_norm": 0.6156848669052124, "learning_rate": 0.00014240163342718506, "loss": 0.29220128059387207, "memory(GiB)": 91.64, "step": 4160, "token_acc": 0.8869187019069923, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.3930728576821442, "grad_norm": 0.30616524815559387, "learning_rate": 0.00014226025473368988, "loss": 0.30183541774749756, "memory(GiB)": 91.64, "step": 4165, "token_acc": 0.9006644518272425, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.39354473386183464, "grad_norm": 0.3282259404659271, "learning_rate": 0.0001421187731301127, "loss": 0.3005667209625244, "memory(GiB)": 91.64, "step": 4170, "token_acc": 0.8975103734439834, "train_speed(iter/s)": 0.138543 }, { "epoch": 0.3940166100415251, "grad_norm": 0.28773191571235657, "learning_rate": 0.0001419771889609825, "loss": 0.3000694751739502, "memory(GiB)": 91.64, "step": 4175, "token_acc": 0.8820326678765881, "train_speed(iter/s)": 0.138543 }, { "epoch": 0.3944884862212156, "grad_norm": 0.3656611144542694, "learning_rate": 0.00014183550257107803, "loss": 0.2959104299545288, "memory(GiB)": 91.64, "step": 4180, "token_acc": 0.8863755917937928, "train_speed(iter/s)": 0.138543 }, { "epoch": 0.394960362400906, "grad_norm": 0.45343348383903503, "learning_rate": 0.00014169371430542698, "loss": 0.2970226526260376, "memory(GiB)": 91.64, "step": 4185, "token_acc": 0.8920325203252033, "train_speed(iter/s)": 0.138545 }, { "epoch": 0.39543223858059645, "grad_norm": 0.2663649916648865, "learning_rate": 0.00014155182450930516, "loss": 0.2905903339385986, "memory(GiB)": 91.64, "step": 4190, "token_acc": 0.9216404247528378, "train_speed(iter/s)": 0.138544 }, { "epoch": 0.3959041147602869, "grad_norm": 0.3120744526386261, "learning_rate": 0.00014140983352823558, "loss": 0.2967799186706543, "memory(GiB)": 91.64, "step": 4195, "token_acc": 0.9013350700097688, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.39637599093997733, "grad_norm": 0.2620052397251129, "learning_rate": 0.0001412677417079876, "loss": 0.2900829792022705, "memory(GiB)": 91.64, "step": 4200, "token_acc": 0.8982475975127191, "train_speed(iter/s)": 0.138548 }, { "epoch": 0.3968478671196678, "grad_norm": 0.2274962216615677, "learning_rate": 0.00014112554939457625, "loss": 0.29586215019226075, "memory(GiB)": 91.64, "step": 4205, "token_acc": 0.9071954210956664, "train_speed(iter/s)": 0.138548 }, { "epoch": 0.39731974329935826, "grad_norm": 0.3711623549461365, "learning_rate": 0.00014098325693426118, "loss": 0.2935636520385742, "memory(GiB)": 91.64, "step": 4210, "token_acc": 0.8952965235173824, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.3977916194790487, "grad_norm": 0.9405967593193054, "learning_rate": 0.00014084086467354597, "loss": 0.2912130355834961, "memory(GiB)": 91.64, "step": 4215, "token_acc": 0.9064059900166389, "train_speed(iter/s)": 0.13855 }, { "epoch": 0.39826349565873914, "grad_norm": 0.33666670322418213, "learning_rate": 0.00014069837295917721, "loss": 0.29177026748657225, "memory(GiB)": 91.64, "step": 4220, "token_acc": 0.9011857707509882, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.3987353718384296, "grad_norm": 0.22122500836849213, "learning_rate": 0.00014055578213814366, "loss": 0.2964980125427246, "memory(GiB)": 91.64, "step": 4225, "token_acc": 0.8981173864894795, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.39920724801812, "grad_norm": 0.37318891286849976, "learning_rate": 0.00014041309255767548, "loss": 0.29087071418762206, "memory(GiB)": 91.64, "step": 4230, "token_acc": 0.9126625211984172, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.3996791241978105, "grad_norm": 0.24224835634231567, "learning_rate": 0.00014027030456524318, "loss": 0.2874420166015625, "memory(GiB)": 91.64, "step": 4235, "token_acc": 0.895663514835344, "train_speed(iter/s)": 0.138555 }, { "epoch": 0.40015100037750095, "grad_norm": 0.44189468026161194, "learning_rate": 0.00014012741850855714, "loss": 0.2921741962432861, "memory(GiB)": 91.64, "step": 4240, "token_acc": 0.9087829033098775, "train_speed(iter/s)": 0.138555 }, { "epoch": 0.40062287655719137, "grad_norm": 0.28330740332603455, "learning_rate": 0.00013998443473556632, "loss": 0.2978023052215576, "memory(GiB)": 91.64, "step": 4245, "token_acc": 0.8874931731294374, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.40109475273688183, "grad_norm": 0.2942737936973572, "learning_rate": 0.00013984135359445778, "loss": 0.289691686630249, "memory(GiB)": 91.64, "step": 4250, "token_acc": 0.8960396039603961, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.4015666289165723, "grad_norm": 0.9360629320144653, "learning_rate": 0.00013969817543365562, "loss": 0.2971461772918701, "memory(GiB)": 91.64, "step": 4255, "token_acc": 0.9020383328262853, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.40203850509626277, "grad_norm": 0.4696950316429138, "learning_rate": 0.00013955490060182024, "loss": 0.2846244812011719, "memory(GiB)": 91.64, "step": 4260, "token_acc": 0.8936451897616946, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.4025103812759532, "grad_norm": 0.24814550578594208, "learning_rate": 0.0001394115294478474, "loss": 0.2938679695129395, "memory(GiB)": 91.64, "step": 4265, "token_acc": 0.9173832923832924, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.40298225745564364, "grad_norm": 0.40624096989631653, "learning_rate": 0.00013926806232086744, "loss": 0.29901700019836425, "memory(GiB)": 91.64, "step": 4270, "token_acc": 0.9090644973852411, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.4034541336353341, "grad_norm": 0.4126349687576294, "learning_rate": 0.00013912449957024443, "loss": 0.30084829330444335, "memory(GiB)": 91.64, "step": 4275, "token_acc": 0.8965719308526223, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.4039260098150245, "grad_norm": 0.3654210865497589, "learning_rate": 0.00013898084154557528, "loss": 0.29240951538085935, "memory(GiB)": 91.64, "step": 4280, "token_acc": 0.8849469496021221, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.404397885994715, "grad_norm": 0.4598270356655121, "learning_rate": 0.00013883708859668885, "loss": 0.29373817443847655, "memory(GiB)": 91.64, "step": 4285, "token_acc": 0.9065555957986237, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.40486976217440546, "grad_norm": 0.33815762400627136, "learning_rate": 0.0001386932410736453, "loss": 0.29302225112915037, "memory(GiB)": 91.64, "step": 4290, "token_acc": 0.8969513731418494, "train_speed(iter/s)": 0.138561 }, { "epoch": 0.40534163835409587, "grad_norm": 0.5209315419197083, "learning_rate": 0.00013854929932673494, "loss": 0.2958113431930542, "memory(GiB)": 91.64, "step": 4295, "token_acc": 0.8755449861276259, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.40581351453378633, "grad_norm": 0.2827267050743103, "learning_rate": 0.0001384052637064776, "loss": 0.28914942741394045, "memory(GiB)": 91.64, "step": 4300, "token_acc": 0.9065727699530517, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.4062853907134768, "grad_norm": 0.6436609625816345, "learning_rate": 0.00013826113456362176, "loss": 0.2951486825942993, "memory(GiB)": 91.64, "step": 4305, "token_acc": 0.8828323993886907, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.4067572668931672, "grad_norm": 0.8339882493019104, "learning_rate": 0.00013811691224914347, "loss": 0.28337812423706055, "memory(GiB)": 91.64, "step": 4310, "token_acc": 0.8929663608562691, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.4072291430728577, "grad_norm": 0.3736589848995209, "learning_rate": 0.0001379725971142459, "loss": 0.28997302055358887, "memory(GiB)": 91.64, "step": 4315, "token_acc": 0.8899193548387097, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.40770101925254815, "grad_norm": 0.3151237964630127, "learning_rate": 0.000137828189510358, "loss": 0.2918028116226196, "memory(GiB)": 91.64, "step": 4320, "token_acc": 0.8964912280701754, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.40817289543223856, "grad_norm": 0.3310835361480713, "learning_rate": 0.0001376836897891341, "loss": 0.2858395576477051, "memory(GiB)": 91.64, "step": 4325, "token_acc": 0.8929663608562691, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.408644771611929, "grad_norm": 0.4066063165664673, "learning_rate": 0.0001375390983024528, "loss": 0.293898868560791, "memory(GiB)": 91.64, "step": 4330, "token_acc": 0.9106681432262828, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.4091166477916195, "grad_norm": 0.6316620111465454, "learning_rate": 0.00013739441540241607, "loss": 0.29479825496673584, "memory(GiB)": 91.64, "step": 4335, "token_acc": 0.9061522419186653, "train_speed(iter/s)": 0.138568 }, { "epoch": 0.4095885239713099, "grad_norm": 0.7944101095199585, "learning_rate": 0.00013724964144134856, "loss": 0.2885154962539673, "memory(GiB)": 91.64, "step": 4340, "token_acc": 0.9099597585513078, "train_speed(iter/s)": 0.138567 }, { "epoch": 0.41006040015100037, "grad_norm": 0.3424244523048401, "learning_rate": 0.00013710477677179674, "loss": 0.2906686544418335, "memory(GiB)": 91.64, "step": 4345, "token_acc": 0.9176111595466434, "train_speed(iter/s)": 0.138568 }, { "epoch": 0.41053227633069084, "grad_norm": 0.4654749631881714, "learning_rate": 0.00013695982174652779, "loss": 0.2884217262268066, "memory(GiB)": 91.64, "step": 4350, "token_acc": 0.9127239320165366, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.41100415251038125, "grad_norm": 0.22995208203792572, "learning_rate": 0.00013681477671852903, "loss": 0.2872136354446411, "memory(GiB)": 91.64, "step": 4355, "token_acc": 0.9024103468547913, "train_speed(iter/s)": 0.13857 }, { "epoch": 0.4114760286900717, "grad_norm": 0.4856892228126526, "learning_rate": 0.00013666964204100702, "loss": 0.2831977605819702, "memory(GiB)": 91.64, "step": 4360, "token_acc": 0.894754539340955, "train_speed(iter/s)": 0.13857 }, { "epoch": 0.4119479048697622, "grad_norm": 0.406127393245697, "learning_rate": 0.00013652441806738644, "loss": 0.2858266830444336, "memory(GiB)": 91.64, "step": 4365, "token_acc": 0.9002932551319648, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.41241978104945265, "grad_norm": 0.3872334957122803, "learning_rate": 0.0001363791051513096, "loss": 0.2899020195007324, "memory(GiB)": 91.64, "step": 4370, "token_acc": 0.8910081743869209, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.41289165722914306, "grad_norm": 0.3237299621105194, "learning_rate": 0.0001362337036466353, "loss": 0.28980767726898193, "memory(GiB)": 91.64, "step": 4375, "token_acc": 0.8978611959842864, "train_speed(iter/s)": 0.138573 }, { "epoch": 0.41336353340883353, "grad_norm": 0.5206886529922485, "learning_rate": 0.00013608821390743812, "loss": 0.28344340324401857, "memory(GiB)": 91.64, "step": 4380, "token_acc": 0.88828089375285, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.413835409588524, "grad_norm": 0.39191269874572754, "learning_rate": 0.0001359426362880074, "loss": 0.29194035530090334, "memory(GiB)": 91.64, "step": 4385, "token_acc": 0.9174977334542158, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.4143072857682144, "grad_norm": 0.41927480697631836, "learning_rate": 0.00013579697114284665, "loss": 0.2890349864959717, "memory(GiB)": 91.64, "step": 4390, "token_acc": 0.8929208804283165, "train_speed(iter/s)": 0.138574 }, { "epoch": 0.4147791619479049, "grad_norm": 0.28609418869018555, "learning_rate": 0.0001356512188266724, "loss": 0.2916177034378052, "memory(GiB)": 91.64, "step": 4395, "token_acc": 0.8874501992031872, "train_speed(iter/s)": 0.138574 }, { "epoch": 0.41525103812759534, "grad_norm": 0.20930610597133636, "learning_rate": 0.00013550537969441343, "loss": 0.2926508903503418, "memory(GiB)": 91.64, "step": 4400, "token_acc": 0.9018830525272548, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.41572291430728575, "grad_norm": 0.4391603171825409, "learning_rate": 0.00013535945410121002, "loss": 0.29059476852416993, "memory(GiB)": 91.64, "step": 4405, "token_acc": 0.9014660756904194, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.4161947904869762, "grad_norm": 0.3281775116920471, "learning_rate": 0.000135213442402413, "loss": 0.2879380226135254, "memory(GiB)": 91.64, "step": 4410, "token_acc": 0.8988439306358381, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.4166666666666667, "grad_norm": 0.33387717604637146, "learning_rate": 0.00013506734495358276, "loss": 0.29560070037841796, "memory(GiB)": 91.64, "step": 4415, "token_acc": 0.9064398541919806, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.4171385428463571, "grad_norm": 0.4845304489135742, "learning_rate": 0.0001349211621104886, "loss": 0.2906494140625, "memory(GiB)": 91.64, "step": 4420, "token_acc": 0.8972520908004779, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.41761041902604756, "grad_norm": 0.33037856221199036, "learning_rate": 0.0001347748942291078, "loss": 0.28696584701538086, "memory(GiB)": 91.64, "step": 4425, "token_acc": 0.9130630630630631, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.41808229520573803, "grad_norm": 0.2779523432254791, "learning_rate": 0.00013462854166562463, "loss": 0.29009506702423093, "memory(GiB)": 91.64, "step": 4430, "token_acc": 0.9063520871143376, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.41855417138542844, "grad_norm": 0.4156259000301361, "learning_rate": 0.00013448210477642956, "loss": 0.2897838354110718, "memory(GiB)": 91.64, "step": 4435, "token_acc": 0.8880031885213232, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4190260475651189, "grad_norm": 0.24767661094665527, "learning_rate": 0.00013433558391811858, "loss": 0.29002995491027833, "memory(GiB)": 91.64, "step": 4440, "token_acc": 0.8957399103139013, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.4194979237448094, "grad_norm": 0.30752333998680115, "learning_rate": 0.00013418897944749195, "loss": 0.29051032066345217, "memory(GiB)": 91.64, "step": 4445, "token_acc": 0.8970005659309565, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.4199697999244998, "grad_norm": 0.45796823501586914, "learning_rate": 0.00013404229172155364, "loss": 0.2903867244720459, "memory(GiB)": 91.64, "step": 4450, "token_acc": 0.8912252325111201, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.42044167610419025, "grad_norm": 0.39090868830680847, "learning_rate": 0.00013389552109751036, "loss": 0.2896425724029541, "memory(GiB)": 91.64, "step": 4455, "token_acc": 0.904796511627907, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.4209135522838807, "grad_norm": 0.5254302620887756, "learning_rate": 0.00013374866793277066, "loss": 0.2874057054519653, "memory(GiB)": 91.64, "step": 4460, "token_acc": 0.8978494623655914, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.42138542846357113, "grad_norm": 0.5833097100257874, "learning_rate": 0.00013360173258494416, "loss": 0.28268094062805177, "memory(GiB)": 91.64, "step": 4465, "token_acc": 0.9087018544935807, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.4218573046432616, "grad_norm": 0.2850300371646881, "learning_rate": 0.00013345471541184042, "loss": 0.28884856700897216, "memory(GiB)": 91.64, "step": 4470, "token_acc": 0.9155321782178217, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.42232918082295207, "grad_norm": 1.1341931819915771, "learning_rate": 0.00013330761677146852, "loss": 0.2828017473220825, "memory(GiB)": 91.64, "step": 4475, "token_acc": 0.9020037570444583, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.42280105700264253, "grad_norm": 1.6812070608139038, "learning_rate": 0.00013316043702203575, "loss": 0.29272284507751467, "memory(GiB)": 91.64, "step": 4480, "token_acc": 0.897029702970297, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.42327293318233294, "grad_norm": 0.2931070029735565, "learning_rate": 0.00013301317652194693, "loss": 0.28561973571777344, "memory(GiB)": 91.64, "step": 4485, "token_acc": 0.898884239888424, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.4237448093620234, "grad_norm": 0.40677410364151, "learning_rate": 0.00013286583562980355, "loss": 0.28868684768676756, "memory(GiB)": 91.64, "step": 4490, "token_acc": 0.8961675579322638, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.4242166855417139, "grad_norm": 0.5539590716362, "learning_rate": 0.00013271841470440288, "loss": 0.2832665920257568, "memory(GiB)": 91.64, "step": 4495, "token_acc": 0.8930443900734009, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.4246885617214043, "grad_norm": 0.2117510586977005, "learning_rate": 0.0001325709141047371, "loss": 0.28936703205108644, "memory(GiB)": 91.64, "step": 4500, "token_acc": 0.8862788963460104, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.42516043790109476, "grad_norm": 0.4565034508705139, "learning_rate": 0.00013242333418999228, "loss": 0.28919024467468263, "memory(GiB)": 91.64, "step": 4505, "token_acc": 0.9185295578738202, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4256323140807852, "grad_norm": 0.8646222949028015, "learning_rate": 0.00013227567531954784, "loss": 0.2934823513031006, "memory(GiB)": 91.64, "step": 4510, "token_acc": 0.9073763621123219, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.42610419026047563, "grad_norm": 0.48075807094573975, "learning_rate": 0.00013212793785297527, "loss": 0.29014410972595217, "memory(GiB)": 91.64, "step": 4515, "token_acc": 0.8953418027828192, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4265760664401661, "grad_norm": 0.5212975740432739, "learning_rate": 0.00013198012215003758, "loss": 0.28605401515960693, "memory(GiB)": 91.64, "step": 4520, "token_acc": 0.8984655566438132, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.42704794261985657, "grad_norm": 0.38360151648521423, "learning_rate": 0.00013183222857068828, "loss": 0.2868018388748169, "memory(GiB)": 91.64, "step": 4525, "token_acc": 0.8966820663586729, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.427519818799547, "grad_norm": 0.3686319887638092, "learning_rate": 0.00013168425747507042, "loss": 0.281552791595459, "memory(GiB)": 91.64, "step": 4530, "token_acc": 0.8858676207513417, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.42799169497923745, "grad_norm": 1.0960360765457153, "learning_rate": 0.00013153620922351598, "loss": 0.28768799304962156, "memory(GiB)": 91.64, "step": 4535, "token_acc": 0.9006509078451524, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.4284635711589279, "grad_norm": 0.9694100022315979, "learning_rate": 0.00013138808417654472, "loss": 0.28849072456359864, "memory(GiB)": 91.64, "step": 4540, "token_acc": 0.8936080740117746, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.4289354473386183, "grad_norm": 0.32766884565353394, "learning_rate": 0.00013123988269486336, "loss": 0.28848419189453123, "memory(GiB)": 91.64, "step": 4545, "token_acc": 0.9198760513501549, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.4294073235183088, "grad_norm": 0.9172677397727966, "learning_rate": 0.00013109160513936492, "loss": 0.2858105659484863, "memory(GiB)": 91.64, "step": 4550, "token_acc": 0.8982770046388336, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.42987919969799926, "grad_norm": 0.4844764769077301, "learning_rate": 0.0001309432518711275, "loss": 0.29333882331848143, "memory(GiB)": 91.64, "step": 4555, "token_acc": 0.8893459481694775, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.43035107587768967, "grad_norm": 0.5546042323112488, "learning_rate": 0.00013079482325141365, "loss": 0.29157898426055906, "memory(GiB)": 91.64, "step": 4560, "token_acc": 0.903052805280528, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.43082295205738014, "grad_norm": 0.224000945687294, "learning_rate": 0.0001306463196416694, "loss": 0.2820766448974609, "memory(GiB)": 91.64, "step": 4565, "token_acc": 0.9094988780852655, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.4312948282370706, "grad_norm": 0.2916657328605652, "learning_rate": 0.00013049774140352346, "loss": 0.28515851497650146, "memory(GiB)": 91.64, "step": 4570, "token_acc": 0.9044019564250778, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.431766704416761, "grad_norm": 0.31271326541900635, "learning_rate": 0.00013034908889878613, "loss": 0.290648365020752, "memory(GiB)": 91.64, "step": 4575, "token_acc": 0.9117174959871589, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4322385805964515, "grad_norm": 0.48812609910964966, "learning_rate": 0.00013020036248944863, "loss": 0.2845763206481934, "memory(GiB)": 91.64, "step": 4580, "token_acc": 0.911256242796773, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.43271045677614195, "grad_norm": 0.490957647562027, "learning_rate": 0.0001300515625376822, "loss": 0.28296747207641604, "memory(GiB)": 91.64, "step": 4585, "token_acc": 0.9053865475858219, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4331823329558324, "grad_norm": 0.6826327443122864, "learning_rate": 0.00012990268940583715, "loss": 0.28066396713256836, "memory(GiB)": 91.64, "step": 4590, "token_acc": 0.912207625760974, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.43365420913552283, "grad_norm": 0.3873187005519867, "learning_rate": 0.0001297537434564419, "loss": 0.2871824026107788, "memory(GiB)": 91.64, "step": 4595, "token_acc": 0.9020979020979021, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4341260853152133, "grad_norm": 0.24573656916618347, "learning_rate": 0.00012960472505220227, "loss": 0.2789003849029541, "memory(GiB)": 91.64, "step": 4600, "token_acc": 0.9040368271954674, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.43459796149490376, "grad_norm": 0.5334983468055725, "learning_rate": 0.00012945563455600052, "loss": 0.287949800491333, "memory(GiB)": 91.64, "step": 4605, "token_acc": 0.886, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4350698376745942, "grad_norm": 0.22217601537704468, "learning_rate": 0.00012930647233089451, "loss": 0.28804755210876465, "memory(GiB)": 91.64, "step": 4610, "token_acc": 0.8871165644171779, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.43554171385428464, "grad_norm": 0.8007298707962036, "learning_rate": 0.0001291572387401166, "loss": 0.2913137197494507, "memory(GiB)": 91.64, "step": 4615, "token_acc": 0.8878835562549174, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4360135900339751, "grad_norm": 0.763749361038208, "learning_rate": 0.0001290079341470731, "loss": 0.27843732833862306, "memory(GiB)": 91.64, "step": 4620, "token_acc": 0.9164843180160467, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4364854662136655, "grad_norm": 0.36388909816741943, "learning_rate": 0.0001288585589153432, "loss": 0.28447413444519043, "memory(GiB)": 91.64, "step": 4625, "token_acc": 0.8996680191811139, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.436957342393356, "grad_norm": 0.6230308413505554, "learning_rate": 0.00012870911340867806, "loss": 0.28026676177978516, "memory(GiB)": 91.64, "step": 4630, "token_acc": 0.895593220338983, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.43742921857304645, "grad_norm": 0.40676748752593994, "learning_rate": 0.00012855959799099997, "loss": 0.2786916971206665, "memory(GiB)": 91.64, "step": 4635, "token_acc": 0.8946716232961586, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.43790109475273686, "grad_norm": 0.3990239202976227, "learning_rate": 0.0001284100130264015, "loss": 0.2920806646347046, "memory(GiB)": 91.64, "step": 4640, "token_acc": 0.9059865092748736, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.43837297093242733, "grad_norm": 0.2727039158344269, "learning_rate": 0.0001282603588791445, "loss": 0.27799243927001954, "memory(GiB)": 91.64, "step": 4645, "token_acc": 0.9078553954879043, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.4388448471121178, "grad_norm": 0.6722660064697266, "learning_rate": 0.00012811063591365942, "loss": 0.28724350929260256, "memory(GiB)": 91.64, "step": 4650, "token_acc": 0.9086161879895561, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.4393167232918082, "grad_norm": 0.5787277817726135, "learning_rate": 0.0001279608444945442, "loss": 0.2847604274749756, "memory(GiB)": 91.64, "step": 4655, "token_acc": 0.9099365750528541, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.4397885994714987, "grad_norm": 0.7865330576896667, "learning_rate": 0.00012781098498656343, "loss": 0.2823522090911865, "memory(GiB)": 91.64, "step": 4660, "token_acc": 0.8874614594039054, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.44026047565118914, "grad_norm": 0.2751135528087616, "learning_rate": 0.00012766105775464769, "loss": 0.28550019264221194, "memory(GiB)": 91.64, "step": 4665, "token_acc": 0.9065579340808698, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.44073235183087955, "grad_norm": 0.3212601840496063, "learning_rate": 0.00012751106316389227, "loss": 0.29028480052947997, "memory(GiB)": 91.64, "step": 4670, "token_acc": 0.8936355710549259, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.44120422801057, "grad_norm": 0.574639081954956, "learning_rate": 0.0001273610015795566, "loss": 0.28751068115234374, "memory(GiB)": 91.64, "step": 4675, "token_acc": 0.8857025809094633, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.4416761041902605, "grad_norm": 0.5098394155502319, "learning_rate": 0.00012721087336706326, "loss": 0.2824862480163574, "memory(GiB)": 91.64, "step": 4680, "token_acc": 0.9175931981687377, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.4421479803699509, "grad_norm": 0.6681429147720337, "learning_rate": 0.00012706067889199697, "loss": 0.2874873876571655, "memory(GiB)": 91.64, "step": 4685, "token_acc": 0.9023062139654068, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.44261985654964137, "grad_norm": 0.28191789984703064, "learning_rate": 0.00012691041852010398, "loss": 0.28571357727050783, "memory(GiB)": 91.64, "step": 4690, "token_acc": 0.9098571763053149, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.44309173272933183, "grad_norm": 0.5222494602203369, "learning_rate": 0.00012676009261729086, "loss": 0.2767521858215332, "memory(GiB)": 91.64, "step": 4695, "token_acc": 0.9009402283411686, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.4435636089090223, "grad_norm": 0.22164411842823029, "learning_rate": 0.00012660970154962383, "loss": 0.27899010181427003, "memory(GiB)": 91.64, "step": 4700, "token_acc": 0.8977528089887641, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.4440354850887127, "grad_norm": 0.2828960716724396, "learning_rate": 0.00012645924568332773, "loss": 0.28249554634094237, "memory(GiB)": 91.64, "step": 4705, "token_acc": 0.9085754783841248, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.4445073612684032, "grad_norm": 0.4036547541618347, "learning_rate": 0.00012630872538478536, "loss": 0.28681211471557616, "memory(GiB)": 91.64, "step": 4710, "token_acc": 0.8995236032914682, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.44497923744809365, "grad_norm": 0.39314863085746765, "learning_rate": 0.00012615814102053617, "loss": 0.28807053565979, "memory(GiB)": 91.64, "step": 4715, "token_acc": 0.9016070842899311, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.44545111362778406, "grad_norm": 0.5945762991905212, "learning_rate": 0.00012600749295727583, "loss": 0.2825813293457031, "memory(GiB)": 91.64, "step": 4720, "token_acc": 0.8978658536585366, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.4459229898074745, "grad_norm": 0.3599962294101715, "learning_rate": 0.00012585678156185507, "loss": 0.2848550319671631, "memory(GiB)": 91.64, "step": 4725, "token_acc": 0.9100689655172414, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.446394865987165, "grad_norm": 0.41364148259162903, "learning_rate": 0.0001257060072012788, "loss": 0.27649877071380613, "memory(GiB)": 91.64, "step": 4730, "token_acc": 0.9065370070232307, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.4468667421668554, "grad_norm": 0.3855108916759491, "learning_rate": 0.00012555517024270525, "loss": 0.2867574214935303, "memory(GiB)": 91.64, "step": 4735, "token_acc": 0.9041860465116279, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.44733861834654587, "grad_norm": 0.4569513499736786, "learning_rate": 0.00012540427105344517, "loss": 0.29142746925354, "memory(GiB)": 91.64, "step": 4740, "token_acc": 0.8947759346372864, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.44781049452623634, "grad_norm": 0.36718836426734924, "learning_rate": 0.00012525331000096078, "loss": 0.27325663566589353, "memory(GiB)": 91.64, "step": 4745, "token_acc": 0.9051918735891648, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.44828237070592675, "grad_norm": 0.45342808961868286, "learning_rate": 0.000125102287452865, "loss": 0.2808860778808594, "memory(GiB)": 91.64, "step": 4750, "token_acc": 0.901932712956335, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.4487542468856172, "grad_norm": 0.39637887477874756, "learning_rate": 0.00012495120377692038, "loss": 0.27970137596130373, "memory(GiB)": 91.64, "step": 4755, "token_acc": 0.8976339932399807, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.4492261230653077, "grad_norm": 0.5339449048042297, "learning_rate": 0.0001248000593410385, "loss": 0.27558255195617676, "memory(GiB)": 91.64, "step": 4760, "token_acc": 0.9057873485868102, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.4496979992449981, "grad_norm": 0.3682711720466614, "learning_rate": 0.0001246488545132788, "loss": 0.28574070930480955, "memory(GiB)": 91.64, "step": 4765, "token_acc": 0.9038128249566725, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.45016987542468856, "grad_norm": 0.25929683446884155, "learning_rate": 0.0001244975896618478, "loss": 0.29129462242126464, "memory(GiB)": 91.64, "step": 4770, "token_acc": 0.9121370067014147, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.450641751604379, "grad_norm": 0.37731558084487915, "learning_rate": 0.0001243462651550982, "loss": 0.2848989725112915, "memory(GiB)": 91.64, "step": 4775, "token_acc": 0.906934306569343, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.45111362778406944, "grad_norm": 0.29577717185020447, "learning_rate": 0.00012419488136152784, "loss": 0.28867268562316895, "memory(GiB)": 91.64, "step": 4780, "token_acc": 0.9120754716981132, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.4515855039637599, "grad_norm": 0.2625930607318878, "learning_rate": 0.00012404343864977918, "loss": 0.2829215288162231, "memory(GiB)": 91.64, "step": 4785, "token_acc": 0.8994281870164816, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.45205738014345037, "grad_norm": 0.4252516031265259, "learning_rate": 0.00012389193738863795, "loss": 0.27890982627868655, "memory(GiB)": 91.64, "step": 4790, "token_acc": 0.9049789621318373, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.4525292563231408, "grad_norm": 0.24441945552825928, "learning_rate": 0.0001237403779470326, "loss": 0.27833251953125, "memory(GiB)": 91.64, "step": 4795, "token_acc": 0.907057462398766, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.45300113250283125, "grad_norm": 0.894290566444397, "learning_rate": 0.00012358876069403312, "loss": 0.2981924057006836, "memory(GiB)": 91.64, "step": 4800, "token_acc": 0.9085677749360613, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.4534730086825217, "grad_norm": 0.49250420928001404, "learning_rate": 0.0001234370859988503, "loss": 0.2824810743331909, "memory(GiB)": 91.64, "step": 4805, "token_acc": 0.9156916724019271, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.45394488486221213, "grad_norm": 0.34194937348365784, "learning_rate": 0.00012328535423083498, "loss": 0.2780169486999512, "memory(GiB)": 91.64, "step": 4810, "token_acc": 0.9040823099900431, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.4544167610419026, "grad_norm": 0.2699131667613983, "learning_rate": 0.0001231335657594768, "loss": 0.27982387542724607, "memory(GiB)": 91.64, "step": 4815, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.45488863722159306, "grad_norm": 0.3031172454357147, "learning_rate": 0.0001229817209544035, "loss": 0.28308122158050536, "memory(GiB)": 91.64, "step": 4820, "token_acc": 0.9010152284263959, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.45536051340128353, "grad_norm": 0.2868700921535492, "learning_rate": 0.00012282982018538006, "loss": 0.28293166160583494, "memory(GiB)": 91.64, "step": 4825, "token_acc": 0.8836694540088539, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.45583238958097394, "grad_norm": 0.3862973749637604, "learning_rate": 0.00012267786382230778, "loss": 0.28501114845275877, "memory(GiB)": 91.64, "step": 4830, "token_acc": 0.9142234068330506, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.4563042657606644, "grad_norm": 0.3525838255882263, "learning_rate": 0.00012252585223522318, "loss": 0.2820533037185669, "memory(GiB)": 91.64, "step": 4835, "token_acc": 0.903065964694952, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4567761419403549, "grad_norm": 0.28131887316703796, "learning_rate": 0.00012237378579429742, "loss": 0.2824002742767334, "memory(GiB)": 91.64, "step": 4840, "token_acc": 0.9106837606837607, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4572480181200453, "grad_norm": 0.6042118072509766, "learning_rate": 0.00012222166486983518, "loss": 0.28409135341644287, "memory(GiB)": 91.64, "step": 4845, "token_acc": 0.9043731778425655, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.45771989429973575, "grad_norm": 0.3936363756656647, "learning_rate": 0.00012206948983227375, "loss": 0.28306241035461427, "memory(GiB)": 91.64, "step": 4850, "token_acc": 0.8994461014060503, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.4581917704794262, "grad_norm": 0.5257007479667664, "learning_rate": 0.00012191726105218233, "loss": 0.289243221282959, "memory(GiB)": 91.64, "step": 4855, "token_acc": 0.8839373163565132, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.45866364665911663, "grad_norm": 0.21987028419971466, "learning_rate": 0.00012176497890026088, "loss": 0.2828011989593506, "memory(GiB)": 91.64, "step": 4860, "token_acc": 0.9015120555782591, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.4591355228388071, "grad_norm": 0.5803897976875305, "learning_rate": 0.00012161264374733936, "loss": 0.27418339252471924, "memory(GiB)": 91.64, "step": 4865, "token_acc": 0.8991545893719807, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.45960739901849756, "grad_norm": 0.45357078313827515, "learning_rate": 0.0001214602559643768, "loss": 0.28390045166015626, "memory(GiB)": 91.64, "step": 4870, "token_acc": 0.9008333333333334, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.460079275198188, "grad_norm": 0.454944372177124, "learning_rate": 0.00012130781592246041, "loss": 0.278179407119751, "memory(GiB)": 91.64, "step": 4875, "token_acc": 0.8991797676008202, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.46055115137787844, "grad_norm": 0.2925609052181244, "learning_rate": 0.00012115532399280463, "loss": 0.27666945457458497, "memory(GiB)": 91.64, "step": 4880, "token_acc": 0.8829588014981273, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.4610230275575689, "grad_norm": 0.41853830218315125, "learning_rate": 0.00012100278054675025, "loss": 0.2862995624542236, "memory(GiB)": 91.64, "step": 4885, "token_acc": 0.9114873035066505, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.4614949037372593, "grad_norm": 0.9547513127326965, "learning_rate": 0.00012085018595576353, "loss": 0.2757422924041748, "memory(GiB)": 91.64, "step": 4890, "token_acc": 0.9112970711297071, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.4619667799169498, "grad_norm": 0.3806958496570587, "learning_rate": 0.00012069754059143528, "loss": 0.2767773628234863, "memory(GiB)": 91.64, "step": 4895, "token_acc": 0.8986072423398329, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.46243865609664025, "grad_norm": 0.23360499739646912, "learning_rate": 0.00012054484482547996, "loss": 0.28561379909515383, "memory(GiB)": 91.64, "step": 4900, "token_acc": 0.9079869219990658, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.46291053227633067, "grad_norm": 0.42152461409568787, "learning_rate": 0.0001203920990297347, "loss": 0.2833158254623413, "memory(GiB)": 91.64, "step": 4905, "token_acc": 0.9091880341880342, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.46338240845602113, "grad_norm": 1.3780053853988647, "learning_rate": 0.00012023930357615854, "loss": 0.2830458641052246, "memory(GiB)": 91.64, "step": 4910, "token_acc": 0.9006878761822872, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.4638542846357116, "grad_norm": 0.7108725905418396, "learning_rate": 0.0001200864588368314, "loss": 0.27931234836578367, "memory(GiB)": 91.64, "step": 4915, "token_acc": 0.9094964945825367, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.464326160815402, "grad_norm": 0.2742263674736023, "learning_rate": 0.00011993356518395322, "loss": 0.27413043975830076, "memory(GiB)": 91.64, "step": 4920, "token_acc": 0.9130901287553648, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.4647980369950925, "grad_norm": 0.3077310621738434, "learning_rate": 0.0001197806229898431, "loss": 0.2840799570083618, "memory(GiB)": 91.64, "step": 4925, "token_acc": 0.9001019367991845, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.46526991317478295, "grad_norm": 0.2271108776330948, "learning_rate": 0.00011962763262693826, "loss": 0.280320930480957, "memory(GiB)": 91.64, "step": 4930, "token_acc": 0.8968949044585988, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.4657417893544734, "grad_norm": 0.3191837966442108, "learning_rate": 0.0001194745944677933, "loss": 0.28061847686767577, "memory(GiB)": 91.64, "step": 4935, "token_acc": 0.8997225525168451, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4662136655341638, "grad_norm": 0.9220530390739441, "learning_rate": 0.00011932150888507911, "loss": 0.2785890340805054, "memory(GiB)": 91.64, "step": 4940, "token_acc": 0.914657481821661, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4666855417138543, "grad_norm": 0.2771691083908081, "learning_rate": 0.00011916837625158221, "loss": 0.2860894680023193, "memory(GiB)": 91.64, "step": 4945, "token_acc": 0.901593252108716, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.46715741789354476, "grad_norm": 0.32764509320259094, "learning_rate": 0.00011901519694020358, "loss": 0.281624960899353, "memory(GiB)": 91.64, "step": 4950, "token_acc": 0.8995609220636663, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.46762929407323517, "grad_norm": 0.25029462575912476, "learning_rate": 0.00011886197132395791, "loss": 0.28546116352081297, "memory(GiB)": 91.64, "step": 4955, "token_acc": 0.9098073555166375, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.46810117025292564, "grad_norm": 0.39748215675354004, "learning_rate": 0.00011870869977597263, "loss": 0.27789499759674074, "memory(GiB)": 91.64, "step": 4960, "token_acc": 0.8938492063492064, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.4685730464326161, "grad_norm": 0.23799273371696472, "learning_rate": 0.00011855538266948702, "loss": 0.27652902603149415, "memory(GiB)": 91.64, "step": 4965, "token_acc": 0.9004487964096287, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.4690449226123065, "grad_norm": 0.5671650171279907, "learning_rate": 0.00011840202037785138, "loss": 0.27930173873901365, "memory(GiB)": 91.64, "step": 4970, "token_acc": 0.9092416079569001, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.469516798791997, "grad_norm": 0.27737873792648315, "learning_rate": 0.00011824861327452587, "loss": 0.28255581855773926, "memory(GiB)": 91.64, "step": 4975, "token_acc": 0.9156298600311042, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.46998867497168745, "grad_norm": 0.3608155846595764, "learning_rate": 0.00011809516173307997, "loss": 0.2820130348205566, "memory(GiB)": 91.64, "step": 4980, "token_acc": 0.8924418604651163, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.47046055115137786, "grad_norm": 0.25761914253234863, "learning_rate": 0.00011794166612719126, "loss": 0.27811760902404786, "memory(GiB)": 91.64, "step": 4985, "token_acc": 0.8917102315160568, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.4709324273310683, "grad_norm": 0.31159085035324097, "learning_rate": 0.00011778812683064464, "loss": 0.27784423828125, "memory(GiB)": 91.64, "step": 4990, "token_acc": 0.9108947959565705, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.4714043035107588, "grad_norm": 0.2755048871040344, "learning_rate": 0.00011763454421733138, "loss": 0.27713913917541505, "memory(GiB)": 91.64, "step": 4995, "token_acc": 0.9096820809248555, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.4718761796904492, "grad_norm": 0.5099897384643555, "learning_rate": 0.00011748091866124833, "loss": 0.28005452156066896, "memory(GiB)": 91.64, "step": 5000, "token_acc": 0.8963607594936709, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.47234805587013967, "grad_norm": 0.34181883931159973, "learning_rate": 0.0001173272505364968, "loss": 0.28417515754699707, "memory(GiB)": 91.64, "step": 5005, "token_acc": 0.9090192989365892, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.47281993204983014, "grad_norm": 0.3527189791202545, "learning_rate": 0.0001171735402172818, "loss": 0.27751028537750244, "memory(GiB)": 91.64, "step": 5010, "token_acc": 0.8898207056101793, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.47329180822952055, "grad_norm": 0.25944942235946655, "learning_rate": 0.00011701978807791114, "loss": 0.27676069736480713, "memory(GiB)": 91.64, "step": 5015, "token_acc": 0.896022549326652, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.473763684409211, "grad_norm": 0.30124396085739136, "learning_rate": 0.00011686599449279436, "loss": 0.28163583278656007, "memory(GiB)": 91.64, "step": 5020, "token_acc": 0.894380118610562, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.4742355605889015, "grad_norm": 0.3558872640132904, "learning_rate": 0.00011671215983644203, "loss": 0.27531468868255615, "memory(GiB)": 91.64, "step": 5025, "token_acc": 0.9087152516904583, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4747074367685919, "grad_norm": 0.33486005663871765, "learning_rate": 0.00011655828448346473, "loss": 0.2796565294265747, "memory(GiB)": 91.64, "step": 5030, "token_acc": 0.9095816464237517, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.47517931294828236, "grad_norm": 0.33892741799354553, "learning_rate": 0.00011640436880857208, "loss": 0.2787603855133057, "memory(GiB)": 91.64, "step": 5035, "token_acc": 0.8977215189873418, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.47565118912797283, "grad_norm": 0.840950608253479, "learning_rate": 0.00011625041318657186, "loss": 0.277506947517395, "memory(GiB)": 91.64, "step": 5040, "token_acc": 0.9116925592804579, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4761230653076633, "grad_norm": 0.30797073245048523, "learning_rate": 0.00011609641799236928, "loss": 0.27592084407806394, "memory(GiB)": 91.64, "step": 5045, "token_acc": 0.9042925278219396, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.4765949414873537, "grad_norm": 0.4821353852748871, "learning_rate": 0.00011594238360096577, "loss": 0.2765143871307373, "memory(GiB)": 91.64, "step": 5050, "token_acc": 0.8992502343017807, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.4770668176670442, "grad_norm": 0.7837891578674316, "learning_rate": 0.00011578831038745826, "loss": 0.2858426570892334, "memory(GiB)": 91.64, "step": 5055, "token_acc": 0.9004637887977167, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.47753869384673464, "grad_norm": 0.5565283894538879, "learning_rate": 0.0001156341987270382, "loss": 0.27644643783569334, "memory(GiB)": 91.64, "step": 5060, "token_acc": 0.8971206729213846, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.47801057002642505, "grad_norm": 0.5778424143791199, "learning_rate": 0.00011548004899499076, "loss": 0.27022864818573, "memory(GiB)": 91.64, "step": 5065, "token_acc": 0.8980617372577172, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.4784824462061155, "grad_norm": 0.21273185312747955, "learning_rate": 0.00011532586156669368, "loss": 0.2737504720687866, "memory(GiB)": 91.64, "step": 5070, "token_acc": 0.8996960486322189, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.478954322385806, "grad_norm": 0.20052437484264374, "learning_rate": 0.00011517163681761653, "loss": 0.28348593711853026, "memory(GiB)": 91.64, "step": 5075, "token_acc": 0.9174463401210787, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.4794261985654964, "grad_norm": 0.35244038701057434, "learning_rate": 0.00011501737512331987, "loss": 0.27732019424438475, "memory(GiB)": 91.64, "step": 5080, "token_acc": 0.8944756864042342, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.47989807474518686, "grad_norm": 0.65986168384552, "learning_rate": 0.0001148630768594541, "loss": 0.2808716058731079, "memory(GiB)": 91.64, "step": 5085, "token_acc": 0.8806818181818182, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.48036995092487733, "grad_norm": 0.3498198390007019, "learning_rate": 0.00011470874240175873, "loss": 0.2739971876144409, "memory(GiB)": 91.64, "step": 5090, "token_acc": 0.9123263888888888, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.48084182710456774, "grad_norm": 0.5786554217338562, "learning_rate": 0.0001145543721260614, "loss": 0.2853843212127686, "memory(GiB)": 91.64, "step": 5095, "token_acc": 0.9068446464072707, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.4813137032842582, "grad_norm": 0.6657483577728271, "learning_rate": 0.00011439996640827694, "loss": 0.28014469146728516, "memory(GiB)": 91.64, "step": 5100, "token_acc": 0.8972332015810277, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.4817855794639487, "grad_norm": 0.5863536596298218, "learning_rate": 0.00011424552562440658, "loss": 0.27579662799835203, "memory(GiB)": 91.64, "step": 5105, "token_acc": 0.9050228310502283, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.4822574556436391, "grad_norm": 0.21976174414157867, "learning_rate": 0.00011409105015053683, "loss": 0.2734682083129883, "memory(GiB)": 91.64, "step": 5110, "token_acc": 0.8935144609991236, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.48272933182332955, "grad_norm": 0.3334028720855713, "learning_rate": 0.00011393654036283875, "loss": 0.27859272956848147, "memory(GiB)": 91.64, "step": 5115, "token_acc": 0.8913043478260869, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.48320120800302, "grad_norm": 0.4304635226726532, "learning_rate": 0.0001137819966375669, "loss": 0.27210822105407717, "memory(GiB)": 91.64, "step": 5120, "token_acc": 0.9054319371727748, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.48367308418271043, "grad_norm": 0.6442833542823792, "learning_rate": 0.00011362741935105849, "loss": 0.2701150894165039, "memory(GiB)": 91.64, "step": 5125, "token_acc": 0.890727035263387, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.4841449603624009, "grad_norm": 0.38582414388656616, "learning_rate": 0.00011347280887973259, "loss": 0.2726860046386719, "memory(GiB)": 91.64, "step": 5130, "token_acc": 0.9164477141355754, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.48461683654209137, "grad_norm": 0.2989153563976288, "learning_rate": 0.0001133181656000889, "loss": 0.2714090347290039, "memory(GiB)": 91.64, "step": 5135, "token_acc": 0.8948035487959443, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.4850887127217818, "grad_norm": 0.2917657196521759, "learning_rate": 0.00011316348988870705, "loss": 0.2744471073150635, "memory(GiB)": 91.64, "step": 5140, "token_acc": 0.9163398692810457, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.48556058890147225, "grad_norm": 0.3109903037548065, "learning_rate": 0.00011300878212224577, "loss": 0.28097503185272216, "memory(GiB)": 91.64, "step": 5145, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.4860324650811627, "grad_norm": 0.33942723274230957, "learning_rate": 0.00011285404267744171, "loss": 0.27203121185302737, "memory(GiB)": 91.64, "step": 5150, "token_acc": 0.9211481359287363, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.4865043412608532, "grad_norm": 0.4545697271823883, "learning_rate": 0.00011269927193110869, "loss": 0.2756700038909912, "memory(GiB)": 91.64, "step": 5155, "token_acc": 0.9127371273712737, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.4869762174405436, "grad_norm": 0.2911210358142853, "learning_rate": 0.00011254447026013682, "loss": 0.27875099182128904, "memory(GiB)": 91.64, "step": 5160, "token_acc": 0.8810038944180009, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.48744809362023406, "grad_norm": 0.25021690130233765, "learning_rate": 0.00011238963804149148, "loss": 0.26958017349243163, "memory(GiB)": 91.64, "step": 5165, "token_acc": 0.8997645475950219, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.4879199697999245, "grad_norm": 0.38456693291664124, "learning_rate": 0.00011223477565221236, "loss": 0.27497286796569825, "memory(GiB)": 91.64, "step": 5170, "token_acc": 0.890403015366773, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.48839184597961494, "grad_norm": 0.8317140936851501, "learning_rate": 0.00011207988346941273, "loss": 0.27993662357330323, "memory(GiB)": 91.64, "step": 5175, "token_acc": 0.9226804123711341, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.4888637221593054, "grad_norm": 0.31843581795692444, "learning_rate": 0.00011192496187027843, "loss": 0.2792136430740356, "memory(GiB)": 91.64, "step": 5180, "token_acc": 0.9134140870345446, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.48933559833899587, "grad_norm": 0.3232461214065552, "learning_rate": 0.00011177001123206681, "loss": 0.26682229042053224, "memory(GiB)": 91.64, "step": 5185, "token_acc": 0.9106923392052437, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.4898074745186863, "grad_norm": 0.388791561126709, "learning_rate": 0.00011161503193210599, "loss": 0.27460460662841796, "memory(GiB)": 91.64, "step": 5190, "token_acc": 0.90976, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.49027935069837675, "grad_norm": 0.6369457244873047, "learning_rate": 0.00011146002434779394, "loss": 0.27512354850769044, "memory(GiB)": 91.64, "step": 5195, "token_acc": 0.8958634654324559, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.4907512268780672, "grad_norm": 0.3902687430381775, "learning_rate": 0.00011130498885659744, "loss": 0.2719013214111328, "memory(GiB)": 91.64, "step": 5200, "token_acc": 0.9022752704214845, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.4912231030577576, "grad_norm": 0.44649550318717957, "learning_rate": 0.00011114992583605126, "loss": 0.2780723571777344, "memory(GiB)": 91.64, "step": 5205, "token_acc": 0.9057301293900185, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4916949792374481, "grad_norm": 0.3157523572444916, "learning_rate": 0.00011099483566375717, "loss": 0.27315502166748046, "memory(GiB)": 91.64, "step": 5210, "token_acc": 0.9008341056533827, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.49216685541713856, "grad_norm": 0.37949439883232117, "learning_rate": 0.00011083971871738311, "loss": 0.2804953813552856, "memory(GiB)": 91.64, "step": 5215, "token_acc": 0.8958660387231816, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.49263873159682897, "grad_norm": 0.5504710078239441, "learning_rate": 0.0001106845753746622, "loss": 0.2684622764587402, "memory(GiB)": 91.64, "step": 5220, "token_acc": 0.9050916496945011, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.49311060777651944, "grad_norm": 0.6442100405693054, "learning_rate": 0.00011052940601339181, "loss": 0.27536282539367674, "memory(GiB)": 91.64, "step": 5225, "token_acc": 0.9010629599345871, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4935824839562099, "grad_norm": 0.42355185747146606, "learning_rate": 0.0001103742110114327, "loss": 0.28316774368286135, "memory(GiB)": 91.64, "step": 5230, "token_acc": 0.9025316455696203, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.4940543601359003, "grad_norm": 0.29734212160110474, "learning_rate": 0.00011021899074670811, "loss": 0.26995747089385985, "memory(GiB)": 91.64, "step": 5235, "token_acc": 0.9120280264694434, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.4945262363155908, "grad_norm": 0.30176839232444763, "learning_rate": 0.00011006374559720268, "loss": 0.27476816177368163, "memory(GiB)": 91.64, "step": 5240, "token_acc": 0.9170903402424716, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.49499811249528125, "grad_norm": 0.6972224116325378, "learning_rate": 0.00010990847594096176, "loss": 0.26695716381073, "memory(GiB)": 91.64, "step": 5245, "token_acc": 0.9249381358262304, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.49546998867497166, "grad_norm": 0.6553791761398315, "learning_rate": 0.00010975318215609035, "loss": 0.27399606704711915, "memory(GiB)": 91.64, "step": 5250, "token_acc": 0.9224839400428265, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.49594186485466213, "grad_norm": 0.5006653070449829, "learning_rate": 0.00010959786462075214, "loss": 0.27437796592712405, "memory(GiB)": 91.64, "step": 5255, "token_acc": 0.8942583732057416, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4964137410343526, "grad_norm": 0.3082018196582794, "learning_rate": 0.00010944252371316874, "loss": 0.2674814224243164, "memory(GiB)": 91.64, "step": 5260, "token_acc": 0.9183520599250936, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.49688561721404306, "grad_norm": 0.22847510874271393, "learning_rate": 0.00010928715981161868, "loss": 0.2763264894485474, "memory(GiB)": 91.64, "step": 5265, "token_acc": 0.8880643166357453, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4973574933937335, "grad_norm": 0.4131425619125366, "learning_rate": 0.0001091317732944364, "loss": 0.2672610282897949, "memory(GiB)": 91.64, "step": 5270, "token_acc": 0.9162442674390538, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.49782936957342394, "grad_norm": 0.3631506562232971, "learning_rate": 0.00010897636454001145, "loss": 0.276334285736084, "memory(GiB)": 91.64, "step": 5275, "token_acc": 0.9140866873065016, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.4983012457531144, "grad_norm": 0.49582716822624207, "learning_rate": 0.00010882093392678761, "loss": 0.270448637008667, "memory(GiB)": 91.64, "step": 5280, "token_acc": 0.8968347010550997, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.4987731219328048, "grad_norm": 0.35381758213043213, "learning_rate": 0.00010866548183326176, "loss": 0.27590155601501465, "memory(GiB)": 91.64, "step": 5285, "token_acc": 0.9073126692747517, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.4992449981124953, "grad_norm": 0.2577316462993622, "learning_rate": 0.00010851000863798313, "loss": 0.27100181579589844, "memory(GiB)": 91.64, "step": 5290, "token_acc": 0.9157062891010866, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.49971687429218575, "grad_norm": 0.6241110563278198, "learning_rate": 0.00010835451471955245, "loss": 0.27226576805114744, "memory(GiB)": 91.64, "step": 5295, "token_acc": 0.9051878354203936, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.5001887504718762, "grad_norm": 0.43986672163009644, "learning_rate": 0.00010819900045662073, "loss": 0.27079594135284424, "memory(GiB)": 91.64, "step": 5300, "token_acc": 0.9174471037114117, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.5006606266515666, "grad_norm": 0.2602379322052002, "learning_rate": 0.00010804346622788866, "loss": 0.27014808654785155, "memory(GiB)": 91.64, "step": 5305, "token_acc": 0.9015012815818382, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.5011325028312571, "grad_norm": 0.45675429701805115, "learning_rate": 0.00010788791241210547, "loss": 0.27579355239868164, "memory(GiB)": 91.64, "step": 5310, "token_acc": 0.8968147151188874, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.5016043790109476, "grad_norm": 0.29645124077796936, "learning_rate": 0.00010773233938806812, "loss": 0.26997838020324705, "memory(GiB)": 91.64, "step": 5315, "token_acc": 0.9151072569602922, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.5020762551906379, "grad_norm": 0.4871494174003601, "learning_rate": 0.00010757674753462039, "loss": 0.26801414489746095, "memory(GiB)": 91.64, "step": 5320, "token_acc": 0.9187116564417178, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.5025481313703284, "grad_norm": 0.2918551564216614, "learning_rate": 0.00010742113723065181, "loss": 0.27881925106048583, "memory(GiB)": 91.64, "step": 5325, "token_acc": 0.9141835518474374, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.5030200075500189, "grad_norm": 0.5010596513748169, "learning_rate": 0.00010726550885509689, "loss": 0.2730778455734253, "memory(GiB)": 91.64, "step": 5330, "token_acc": 0.9, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.5034918837297093, "grad_norm": 0.26482099294662476, "learning_rate": 0.00010710986278693424, "loss": 0.27079339027404786, "memory(GiB)": 91.64, "step": 5335, "token_acc": 0.9089954497724886, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.5039637599093998, "grad_norm": 0.2955012321472168, "learning_rate": 0.00010695419940518536, "loss": 0.2673619747161865, "memory(GiB)": 91.64, "step": 5340, "token_acc": 0.9062937062937063, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.5044356360890903, "grad_norm": 0.2763923704624176, "learning_rate": 0.00010679851908891405, "loss": 0.279115629196167, "memory(GiB)": 91.64, "step": 5345, "token_acc": 0.8722680913064594, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.5049075122687807, "grad_norm": 0.2069929838180542, "learning_rate": 0.00010664282221722538, "loss": 0.2716416835784912, "memory(GiB)": 91.64, "step": 5350, "token_acc": 0.9151846785225718, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.5053793884484711, "grad_norm": 0.5250474810600281, "learning_rate": 0.00010648710916926458, "loss": 0.2689443826675415, "memory(GiB)": 91.64, "step": 5355, "token_acc": 0.9039106145251397, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.5058512646281615, "grad_norm": 0.5674706697463989, "learning_rate": 0.00010633138032421638, "loss": 0.27478585243225095, "memory(GiB)": 91.64, "step": 5360, "token_acc": 0.9083916083916084, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.506323140807852, "grad_norm": 0.3561114966869354, "learning_rate": 0.00010617563606130403, "loss": 0.27205119132995603, "memory(GiB)": 91.64, "step": 5365, "token_acc": 0.8959741404642962, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.5067950169875425, "grad_norm": 0.43625959753990173, "learning_rate": 0.0001060198767597882, "loss": 0.27315943241119384, "memory(GiB)": 91.64, "step": 5370, "token_acc": 0.9110037944118662, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.507266893167233, "grad_norm": 0.27587762475013733, "learning_rate": 0.00010586410279896619, "loss": 0.2728897571563721, "memory(GiB)": 91.64, "step": 5375, "token_acc": 0.9074920858248329, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.5077387693469234, "grad_norm": 0.23502488434314728, "learning_rate": 0.00010570831455817116, "loss": 0.2788903474807739, "memory(GiB)": 91.64, "step": 5380, "token_acc": 0.9112850619699935, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5082106455266138, "grad_norm": 0.22300395369529724, "learning_rate": 0.00010555251241677086, "loss": 0.26743249893188475, "memory(GiB)": 91.64, "step": 5385, "token_acc": 0.9102902374670184, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5086825217063042, "grad_norm": 0.3851175606250763, "learning_rate": 0.00010539669675416694, "loss": 0.26730880737304685, "memory(GiB)": 91.64, "step": 5390, "token_acc": 0.9054170249355116, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5091543978859947, "grad_norm": 0.303202360868454, "learning_rate": 0.00010524086794979402, "loss": 0.2683709144592285, "memory(GiB)": 91.64, "step": 5395, "token_acc": 0.9025044722719141, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.5096262740656852, "grad_norm": 0.36325204372406006, "learning_rate": 0.00010508502638311873, "loss": 0.27001941204071045, "memory(GiB)": 91.64, "step": 5400, "token_acc": 0.928078250863061, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5100981502453756, "grad_norm": 0.7915918827056885, "learning_rate": 0.00010492917243363867, "loss": 0.271225643157959, "memory(GiB)": 91.64, "step": 5405, "token_acc": 0.9089173711480775, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.5105700264250661, "grad_norm": 0.2819935083389282, "learning_rate": 0.00010477330648088171, "loss": 0.2733079671859741, "memory(GiB)": 91.64, "step": 5410, "token_acc": 0.8977853492333902, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.5110419026047565, "grad_norm": 0.22760671377182007, "learning_rate": 0.00010461742890440493, "loss": 0.2703261375427246, "memory(GiB)": 91.64, "step": 5415, "token_acc": 0.9160453808752026, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.5115137787844469, "grad_norm": 0.7035602927207947, "learning_rate": 0.00010446154008379367, "loss": 0.2706472873687744, "memory(GiB)": 91.64, "step": 5420, "token_acc": 0.9036195286195287, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.5119856549641374, "grad_norm": 0.7053150534629822, "learning_rate": 0.00010430564039866067, "loss": 0.27515950202941897, "memory(GiB)": 91.64, "step": 5425, "token_acc": 0.9001291433491175, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.5124575311438279, "grad_norm": 0.4593672454357147, "learning_rate": 0.00010414973022864514, "loss": 0.27523515224456785, "memory(GiB)": 91.64, "step": 5430, "token_acc": 0.8985200845665962, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.5129294073235183, "grad_norm": 0.22665126621723175, "learning_rate": 0.00010399380995341181, "loss": 0.2573527812957764, "memory(GiB)": 91.64, "step": 5435, "token_acc": 0.9121645172533984, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.5134012835032088, "grad_norm": 0.5087513327598572, "learning_rate": 0.00010383787995265004, "loss": 0.27467942237854004, "memory(GiB)": 91.64, "step": 5440, "token_acc": 0.8909090909090909, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.5138731596828993, "grad_norm": 0.48231035470962524, "learning_rate": 0.00010368194060607283, "loss": 0.27229771614074705, "memory(GiB)": 91.64, "step": 5445, "token_acc": 0.8985872855701312, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.5143450358625896, "grad_norm": 0.5203849673271179, "learning_rate": 0.00010352599229341597, "loss": 0.2672194242477417, "memory(GiB)": 91.64, "step": 5450, "token_acc": 0.9112739112739112, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5148169120422801, "grad_norm": 0.5567079782485962, "learning_rate": 0.0001033700353944371, "loss": 0.2730486154556274, "memory(GiB)": 91.64, "step": 5455, "token_acc": 0.900904033379694, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.5152887882219706, "grad_norm": 0.4877086281776428, "learning_rate": 0.0001032140702889147, "loss": 0.2705501079559326, "memory(GiB)": 91.64, "step": 5460, "token_acc": 0.9008064516129032, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.515760664401661, "grad_norm": 0.3445092737674713, "learning_rate": 0.00010305809735664735, "loss": 0.2583261728286743, "memory(GiB)": 91.64, "step": 5465, "token_acc": 0.9392538791680423, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.5162325405813515, "grad_norm": 0.41431504487991333, "learning_rate": 0.00010290211697745258, "loss": 0.2675126075744629, "memory(GiB)": 91.64, "step": 5470, "token_acc": 0.917844232665133, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.516704416761042, "grad_norm": 0.24442262947559357, "learning_rate": 0.00010274612953116605, "loss": 0.26557936668396, "memory(GiB)": 91.64, "step": 5475, "token_acc": 0.9098092643051771, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.5171762929407323, "grad_norm": 0.6545706391334534, "learning_rate": 0.00010259013539764074, "loss": 0.27161517143249514, "memory(GiB)": 91.64, "step": 5480, "token_acc": 0.9188552188552188, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.5176481691204228, "grad_norm": 0.2782509922981262, "learning_rate": 0.00010243413495674583, "loss": 0.262584924697876, "memory(GiB)": 91.64, "step": 5485, "token_acc": 0.8945548833189283, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.5181200453001132, "grad_norm": 0.2864302694797516, "learning_rate": 0.00010227812858836585, "loss": 0.2646970510482788, "memory(GiB)": 91.64, "step": 5490, "token_acc": 0.9124331550802139, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.5185919214798037, "grad_norm": 0.5489168763160706, "learning_rate": 0.00010212211667239982, "loss": 0.26138916015625, "memory(GiB)": 91.64, "step": 5495, "token_acc": 0.9172252987467211, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5190637976594942, "grad_norm": 0.828153669834137, "learning_rate": 0.00010196609958876027, "loss": 0.2727668762207031, "memory(GiB)": 91.64, "step": 5500, "token_acc": 0.9049342105263158, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.5195356738391846, "grad_norm": 0.3575778901576996, "learning_rate": 0.00010181007771737221, "loss": 0.2691819667816162, "memory(GiB)": 91.64, "step": 5505, "token_acc": 0.9117647058823529, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.520007550018875, "grad_norm": 0.340814471244812, "learning_rate": 0.00010165405143817242, "loss": 0.278232479095459, "memory(GiB)": 91.64, "step": 5510, "token_acc": 0.8898630136986302, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.5204794261985655, "grad_norm": 0.27852863073349, "learning_rate": 0.00010149802113110843, "loss": 0.2711049556732178, "memory(GiB)": 91.64, "step": 5515, "token_acc": 0.8898026315789473, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5209513023782559, "grad_norm": 0.40802115201950073, "learning_rate": 0.00010134198717613743, "loss": 0.27061238288879397, "memory(GiB)": 91.64, "step": 5520, "token_acc": 0.9073778345576493, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5214231785579464, "grad_norm": 0.3736279010772705, "learning_rate": 0.00010118594995322563, "loss": 0.26925182342529297, "memory(GiB)": 91.64, "step": 5525, "token_acc": 0.9193942354665364, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5218950547376369, "grad_norm": 0.28507253527641296, "learning_rate": 0.00010102990984234721, "loss": 0.27002944946289065, "memory(GiB)": 91.64, "step": 5530, "token_acc": 0.9087465564738292, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5223669309173273, "grad_norm": 0.2574266493320465, "learning_rate": 0.00010087386722348325, "loss": 0.26802835464477537, "memory(GiB)": 91.64, "step": 5535, "token_acc": 0.9000989119683481, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.5228388070970177, "grad_norm": 0.3153584599494934, "learning_rate": 0.00010071782247662108, "loss": 0.2677382230758667, "memory(GiB)": 91.64, "step": 5540, "token_acc": 0.9037955655768508, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.5233106832767082, "grad_norm": 0.5408716797828674, "learning_rate": 0.0001005617759817531, "loss": 0.2700139045715332, "memory(GiB)": 91.64, "step": 5545, "token_acc": 0.9161554192229039, "train_speed(iter/s)": 0.138536 }, { "epoch": 0.5237825594563986, "grad_norm": 0.5670604705810547, "learning_rate": 0.00010040572811887604, "loss": 0.264970064163208, "memory(GiB)": 91.64, "step": 5550, "token_acc": 0.923982869379015, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.5242544356360891, "grad_norm": 0.5169808864593506, "learning_rate": 0.00010024967926798994, "loss": 0.264431095123291, "memory(GiB)": 91.64, "step": 5555, "token_acc": 0.9123539232053423, "train_speed(iter/s)": 0.138536 }, { "epoch": 0.5247263118157796, "grad_norm": 0.6065265536308289, "learning_rate": 0.0001000936298090972, "loss": 0.2742233037948608, "memory(GiB)": 91.64, "step": 5560, "token_acc": 0.8927628103539356, "train_speed(iter/s)": 0.138537 }, { "epoch": 0.52519818799547, "grad_norm": 0.21818573772907257, "learning_rate": 9.993758012220181e-05, "loss": 0.2681767225265503, "memory(GiB)": 91.64, "step": 5565, "token_acc": 0.9015407741450583, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.5256700641751605, "grad_norm": 0.5659834146499634, "learning_rate": 9.978153058730823e-05, "loss": 0.2633455514907837, "memory(GiB)": 91.64, "step": 5570, "token_acc": 0.9158934450763244, "train_speed(iter/s)": 0.138539 }, { "epoch": 0.5261419403548508, "grad_norm": 0.23959246277809143, "learning_rate": 9.962548158442054e-05, "loss": 0.2634448528289795, "memory(GiB)": 91.64, "step": 5575, "token_acc": 0.9061841180604358, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.5266138165345413, "grad_norm": 0.2925381660461426, "learning_rate": 9.946943349354159e-05, "loss": 0.2646299362182617, "memory(GiB)": 91.64, "step": 5580, "token_acc": 0.9134487350199734, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.5270856927142318, "grad_norm": 0.3260766565799713, "learning_rate": 9.93133866946719e-05, "loss": 0.2640314340591431, "memory(GiB)": 91.64, "step": 5585, "token_acc": 0.8965307364576993, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.5275575688939222, "grad_norm": 0.33577093482017517, "learning_rate": 9.915734156780904e-05, "loss": 0.26520462036132814, "memory(GiB)": 91.64, "step": 5590, "token_acc": 0.9146467251160392, "train_speed(iter/s)": 0.13854 }, { "epoch": 0.5280294450736127, "grad_norm": 0.3756278157234192, "learning_rate": 9.900129849294627e-05, "loss": 0.2686309337615967, "memory(GiB)": 91.64, "step": 5595, "token_acc": 0.8980466888994759, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.5285013212533032, "grad_norm": 0.636249303817749, "learning_rate": 9.884525785007204e-05, "loss": 0.27318830490112306, "memory(GiB)": 91.64, "step": 5600, "token_acc": 0.8904034896401308, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.5289731974329935, "grad_norm": 0.6412723064422607, "learning_rate": 9.868922001916877e-05, "loss": 0.26247034072875974, "memory(GiB)": 91.64, "step": 5605, "token_acc": 0.9019680653546231, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.529445073612684, "grad_norm": 0.3598019480705261, "learning_rate": 9.853318538021206e-05, "loss": 0.26594886779785154, "memory(GiB)": 91.64, "step": 5610, "token_acc": 0.8935340022296544, "train_speed(iter/s)": 0.138542 }, { "epoch": 0.5299169497923745, "grad_norm": 0.7006165385246277, "learning_rate": 9.837715431316974e-05, "loss": 0.26999516487121583, "memory(GiB)": 91.64, "step": 5615, "token_acc": 0.9212376933895922, "train_speed(iter/s)": 0.138544 }, { "epoch": 0.5303888259720649, "grad_norm": 0.6085411310195923, "learning_rate": 9.8221127198001e-05, "loss": 0.26465139389038084, "memory(GiB)": 91.64, "step": 5620, "token_acc": 0.9219022687609075, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5308607021517554, "grad_norm": 0.5289167761802673, "learning_rate": 9.806510441465532e-05, "loss": 0.26235690116882326, "memory(GiB)": 91.64, "step": 5625, "token_acc": 0.9046015712682379, "train_speed(iter/s)": 0.138547 }, { "epoch": 0.5313325783314459, "grad_norm": 0.3731206953525543, "learning_rate": 9.790908634307165e-05, "loss": 0.26745176315307617, "memory(GiB)": 91.64, "step": 5630, "token_acc": 0.9178789300797747, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.5318044545111362, "grad_norm": 0.30575793981552124, "learning_rate": 9.775307336317752e-05, "loss": 0.26017489433288576, "memory(GiB)": 91.64, "step": 5635, "token_acc": 0.89728, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5322763306908267, "grad_norm": 0.5659154653549194, "learning_rate": 9.759706585488797e-05, "loss": 0.26600961685180663, "memory(GiB)": 91.64, "step": 5640, "token_acc": 0.8857142857142857, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5327482068705172, "grad_norm": 0.2851637601852417, "learning_rate": 9.744106419810478e-05, "loss": 0.2690946340560913, "memory(GiB)": 91.64, "step": 5645, "token_acc": 0.9198931909212283, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5332200830502076, "grad_norm": 0.2344711571931839, "learning_rate": 9.728506877271551e-05, "loss": 0.2627574920654297, "memory(GiB)": 91.64, "step": 5650, "token_acc": 0.9128968811463894, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5336919592298981, "grad_norm": 0.4054241478443146, "learning_rate": 9.712907995859248e-05, "loss": 0.26556243896484377, "memory(GiB)": 91.64, "step": 5655, "token_acc": 0.8993012741471434, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5341638354095886, "grad_norm": 0.2403021603822708, "learning_rate": 9.697309813559192e-05, "loss": 0.26658334732055666, "memory(GiB)": 91.64, "step": 5660, "token_acc": 0.8986765922249793, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.534635711589279, "grad_norm": 0.3669694662094116, "learning_rate": 9.681712368355308e-05, "loss": 0.2626574277877808, "memory(GiB)": 91.64, "step": 5665, "token_acc": 0.9059067972692193, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5351075877689694, "grad_norm": 0.3157739043235779, "learning_rate": 9.666115698229721e-05, "loss": 0.260296106338501, "memory(GiB)": 91.64, "step": 5670, "token_acc": 0.9010388190267906, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.5355794639486599, "grad_norm": 0.2455492615699768, "learning_rate": 9.650519841162666e-05, "loss": 0.25946090221405027, "memory(GiB)": 91.64, "step": 5675, "token_acc": 0.9096751160299893, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.5360513401283503, "grad_norm": 0.3795280158519745, "learning_rate": 9.63492483513241e-05, "loss": 0.2626684904098511, "memory(GiB)": 91.64, "step": 5680, "token_acc": 0.8940092165898618, "train_speed(iter/s)": 0.13855 }, { "epoch": 0.5365232163080408, "grad_norm": 0.44789838790893555, "learning_rate": 9.619330718115141e-05, "loss": 0.26631550788879393, "memory(GiB)": 91.64, "step": 5685, "token_acc": 0.919800634345265, "train_speed(iter/s)": 0.13855 }, { "epoch": 0.5369950924877313, "grad_norm": 0.6232290267944336, "learning_rate": 9.603737528084878e-05, "loss": 0.260367751121521, "memory(GiB)": 91.64, "step": 5690, "token_acc": 0.9210836277974087, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5374669686674217, "grad_norm": 0.7171441316604614, "learning_rate": 9.588145303013383e-05, "loss": 0.26139035224914553, "memory(GiB)": 91.64, "step": 5695, "token_acc": 0.9144079885877318, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5379388448471121, "grad_norm": 0.8749316334724426, "learning_rate": 9.572554080870074e-05, "loss": 0.2580615520477295, "memory(GiB)": 91.64, "step": 5700, "token_acc": 0.9133631713554987, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5384107210268025, "grad_norm": 0.3951604664325714, "learning_rate": 9.556963899621929e-05, "loss": 0.26193459033966066, "memory(GiB)": 91.64, "step": 5705, "token_acc": 0.9229357798165138, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.538882597206493, "grad_norm": 0.9120936989784241, "learning_rate": 9.541374797233381e-05, "loss": 0.26309173107147216, "memory(GiB)": 91.64, "step": 5710, "token_acc": 0.8990442054958184, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.5393544733861835, "grad_norm": 0.33477917313575745, "learning_rate": 9.52578681166624e-05, "loss": 0.2714076995849609, "memory(GiB)": 91.64, "step": 5715, "token_acc": 0.9238231098430814, "train_speed(iter/s)": 0.138549 }, { "epoch": 0.5398263495658739, "grad_norm": 0.41128993034362793, "learning_rate": 9.510199980879603e-05, "loss": 0.26154122352600095, "memory(GiB)": 91.64, "step": 5720, "token_acc": 0.8976274165202109, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5402982257455644, "grad_norm": 0.41506433486938477, "learning_rate": 9.494614342829742e-05, "loss": 0.2645676612854004, "memory(GiB)": 91.64, "step": 5725, "token_acc": 0.9031833727966324, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5407701019252548, "grad_norm": 0.31078341603279114, "learning_rate": 9.479029935470034e-05, "loss": 0.26885018348693845, "memory(GiB)": 91.64, "step": 5730, "token_acc": 0.9037735849056604, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5412419781049452, "grad_norm": 0.24483777582645416, "learning_rate": 9.46344679675086e-05, "loss": 0.2597140073776245, "memory(GiB)": 91.64, "step": 5735, "token_acc": 0.8987915407854985, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5417138542846357, "grad_norm": 0.43131428956985474, "learning_rate": 9.447864964619511e-05, "loss": 0.2613609075546265, "memory(GiB)": 91.64, "step": 5740, "token_acc": 0.9058786741713571, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5421857304643262, "grad_norm": 0.3337703347206116, "learning_rate": 9.432284477020086e-05, "loss": 0.2597992420196533, "memory(GiB)": 91.64, "step": 5745, "token_acc": 0.9099797707349966, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5426576066440166, "grad_norm": 0.47471845149993896, "learning_rate": 9.416705371893426e-05, "loss": 0.26606192588806155, "memory(GiB)": 91.64, "step": 5750, "token_acc": 0.9284017645062775, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5431294828237071, "grad_norm": 0.2559286057949066, "learning_rate": 9.401127687176991e-05, "loss": 0.2615029811859131, "memory(GiB)": 91.64, "step": 5755, "token_acc": 0.8925468678555099, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5436013590033975, "grad_norm": 0.24070371687412262, "learning_rate": 9.385551460804787e-05, "loss": 0.26202309131622314, "memory(GiB)": 91.64, "step": 5760, "token_acc": 0.8897075754087037, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5440732351830879, "grad_norm": 0.31694546341896057, "learning_rate": 9.369976730707275e-05, "loss": 0.2607280731201172, "memory(GiB)": 91.64, "step": 5765, "token_acc": 0.9033432638199271, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5445451113627784, "grad_norm": 0.3087446093559265, "learning_rate": 9.354403534811269e-05, "loss": 0.2601593255996704, "memory(GiB)": 91.64, "step": 5770, "token_acc": 0.8965087281795511, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5450169875424689, "grad_norm": 0.2989960312843323, "learning_rate": 9.33883191103984e-05, "loss": 0.2624382972717285, "memory(GiB)": 91.64, "step": 5775, "token_acc": 0.9023532593995132, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5454888637221593, "grad_norm": 0.33407121896743774, "learning_rate": 9.323261897312238e-05, "loss": 0.2598482847213745, "memory(GiB)": 91.64, "step": 5780, "token_acc": 0.8982714650042505, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5459607399018498, "grad_norm": 0.5114820003509521, "learning_rate": 9.307693531543792e-05, "loss": 0.26101438999176024, "memory(GiB)": 91.64, "step": 5785, "token_acc": 0.9108391608391608, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5464326160815403, "grad_norm": 0.37111696600914, "learning_rate": 9.29212685164581e-05, "loss": 0.26595659255981446, "memory(GiB)": 91.64, "step": 5790, "token_acc": 0.8989431968295905, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5469044922612306, "grad_norm": 0.24424168467521667, "learning_rate": 9.276561895525507e-05, "loss": 0.2628682851791382, "memory(GiB)": 91.64, "step": 5795, "token_acc": 0.8922287390029325, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5473763684409211, "grad_norm": 0.5358099341392517, "learning_rate": 9.260998701085897e-05, "loss": 0.2708090305328369, "memory(GiB)": 91.64, "step": 5800, "token_acc": 0.9153335934451814, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5478482446206115, "grad_norm": 0.22064965963363647, "learning_rate": 9.245437306225696e-05, "loss": 0.2638305902481079, "memory(GiB)": 91.64, "step": 5805, "token_acc": 0.8872464764523892, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.548320120800302, "grad_norm": 0.38728588819503784, "learning_rate": 9.229877748839242e-05, "loss": 0.26497840881347656, "memory(GiB)": 91.64, "step": 5810, "token_acc": 0.9151977131967604, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5487919969799925, "grad_norm": 0.5592960119247437, "learning_rate": 9.214320066816403e-05, "loss": 0.2640266418457031, "memory(GiB)": 91.64, "step": 5815, "token_acc": 0.9124352331606218, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.549263873159683, "grad_norm": 0.39271870255470276, "learning_rate": 9.198764298042472e-05, "loss": 0.2661734580993652, "memory(GiB)": 91.64, "step": 5820, "token_acc": 0.9188461538461539, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5497357493393733, "grad_norm": 0.3182789981365204, "learning_rate": 9.183210480398096e-05, "loss": 0.2613093614578247, "memory(GiB)": 91.64, "step": 5825, "token_acc": 0.8979396262577863, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5502076255190638, "grad_norm": 1.0553961992263794, "learning_rate": 9.167658651759154e-05, "loss": 0.2589289665222168, "memory(GiB)": 91.64, "step": 5830, "token_acc": 0.9174974217944311, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.5506795016987542, "grad_norm": 0.27244263887405396, "learning_rate": 9.152108849996696e-05, "loss": 0.2616077423095703, "memory(GiB)": 91.64, "step": 5835, "token_acc": 0.9227053140096618, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.5511513778784447, "grad_norm": 0.3820434808731079, "learning_rate": 9.136561112976828e-05, "loss": 0.2627591133117676, "memory(GiB)": 91.64, "step": 5840, "token_acc": 0.898062015503876, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5516232540581352, "grad_norm": 0.22392357885837555, "learning_rate": 9.121015478560628e-05, "loss": 0.2643141508102417, "memory(GiB)": 91.64, "step": 5845, "token_acc": 0.9144553072625698, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5520951302378256, "grad_norm": 0.36011257767677307, "learning_rate": 9.105471984604055e-05, "loss": 0.26393847465515136, "memory(GiB)": 91.64, "step": 5850, "token_acc": 0.9219836710009072, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.552567006417516, "grad_norm": 0.2702450752258301, "learning_rate": 9.089930668957862e-05, "loss": 0.25948729515075686, "memory(GiB)": 91.64, "step": 5855, "token_acc": 0.9018810371123538, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5530388825972065, "grad_norm": 0.5624324679374695, "learning_rate": 9.074391569467492e-05, "loss": 0.26597013473510744, "memory(GiB)": 91.64, "step": 5860, "token_acc": 0.9016867469879518, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5535107587768969, "grad_norm": 0.6906477808952332, "learning_rate": 9.058854723972986e-05, "loss": 0.2574812412261963, "memory(GiB)": 91.64, "step": 5865, "token_acc": 0.9240750966316952, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5539826349565874, "grad_norm": 0.21588723361492157, "learning_rate": 9.043320170308907e-05, "loss": 0.2587729930877686, "memory(GiB)": 91.64, "step": 5870, "token_acc": 0.9095205941931127, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5544545111362779, "grad_norm": 0.6740458011627197, "learning_rate": 9.027787946304223e-05, "loss": 0.26523623466491697, "memory(GiB)": 91.64, "step": 5875, "token_acc": 0.9050131926121372, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5549263873159683, "grad_norm": 0.3452465534210205, "learning_rate": 9.012258089782248e-05, "loss": 0.26502606868743894, "memory(GiB)": 91.64, "step": 5880, "token_acc": 0.9122871946706144, "train_speed(iter/s)": 0.138551 }, { "epoch": 0.5553982634956587, "grad_norm": 0.3662663400173187, "learning_rate": 8.996730638560519e-05, "loss": 0.263625431060791, "memory(GiB)": 91.64, "step": 5885, "token_acc": 0.9177910260433009, "train_speed(iter/s)": 0.138552 }, { "epoch": 0.5558701396753492, "grad_norm": 0.36303842067718506, "learning_rate": 8.981205630450713e-05, "loss": 0.2587179183959961, "memory(GiB)": 91.64, "step": 5890, "token_acc": 0.8908523908523909, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5563420158550396, "grad_norm": 0.3700637221336365, "learning_rate": 8.965683103258563e-05, "loss": 0.2564589023590088, "memory(GiB)": 91.64, "step": 5895, "token_acc": 0.9156102861282144, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5568138920347301, "grad_norm": 0.4616394340991974, "learning_rate": 8.95016309478376e-05, "loss": 0.2621131896972656, "memory(GiB)": 91.64, "step": 5900, "token_acc": 0.9194208372678627, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5572857682144206, "grad_norm": 0.4272395670413971, "learning_rate": 8.934645642819858e-05, "loss": 0.26262176036834717, "memory(GiB)": 91.64, "step": 5905, "token_acc": 0.901246719160105, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.557757644394111, "grad_norm": 0.2230866253376007, "learning_rate": 8.919130785154195e-05, "loss": 0.26306858062744143, "memory(GiB)": 91.64, "step": 5910, "token_acc": 0.9148486980999296, "train_speed(iter/s)": 0.138555 }, { "epoch": 0.5582295205738015, "grad_norm": 0.5155897736549377, "learning_rate": 8.903618559567779e-05, "loss": 0.2663133144378662, "memory(GiB)": 91.64, "step": 5915, "token_acc": 0.9127292940522512, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.5587013967534918, "grad_norm": 0.23849859833717346, "learning_rate": 8.88810900383522e-05, "loss": 0.25657382011413576, "memory(GiB)": 91.64, "step": 5920, "token_acc": 0.9311404857055026, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5591732729331823, "grad_norm": 0.4121880531311035, "learning_rate": 8.872602155724616e-05, "loss": 0.25624220371246337, "memory(GiB)": 91.64, "step": 5925, "token_acc": 0.9055555555555556, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5596451491128728, "grad_norm": 0.4594009816646576, "learning_rate": 8.857098052997477e-05, "loss": 0.2594911098480225, "memory(GiB)": 91.64, "step": 5930, "token_acc": 0.9104712041884817, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.5601170252925632, "grad_norm": 0.306629478931427, "learning_rate": 8.841596733408627e-05, "loss": 0.2655156373977661, "memory(GiB)": 91.64, "step": 5935, "token_acc": 0.9011486251305256, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5605889014722537, "grad_norm": 0.5396053791046143, "learning_rate": 8.826098234706117e-05, "loss": 0.25810458660125735, "memory(GiB)": 91.64, "step": 5940, "token_acc": 0.8991660348749052, "train_speed(iter/s)": 0.138554 }, { "epoch": 0.5610607776519442, "grad_norm": 0.4360372722148895, "learning_rate": 8.810602594631121e-05, "loss": 0.26143407821655273, "memory(GiB)": 91.64, "step": 5945, "token_acc": 0.9059360730593607, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.5615326538316345, "grad_norm": 0.3832603394985199, "learning_rate": 8.795109850917857e-05, "loss": 0.25672688484191897, "memory(GiB)": 91.64, "step": 5950, "token_acc": 0.9179431072210066, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.562004530011325, "grad_norm": 0.6274591088294983, "learning_rate": 8.779620041293486e-05, "loss": 0.25992960929870607, "memory(GiB)": 91.64, "step": 5955, "token_acc": 0.8999081726354453, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.5624764061910155, "grad_norm": 0.7016596794128418, "learning_rate": 8.764133203478027e-05, "loss": 0.2578299522399902, "memory(GiB)": 91.64, "step": 5960, "token_acc": 0.9278547539417105, "train_speed(iter/s)": 0.138556 }, { "epoch": 0.5629482823707059, "grad_norm": 0.30035027861595154, "learning_rate": 8.748649375184258e-05, "loss": 0.2687530040740967, "memory(GiB)": 91.64, "step": 5965, "token_acc": 0.9160012775471096, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.5634201585503964, "grad_norm": 0.6111456155776978, "learning_rate": 8.73316859411764e-05, "loss": 0.25793159008026123, "memory(GiB)": 91.64, "step": 5970, "token_acc": 0.9109185441941074, "train_speed(iter/s)": 0.138558 }, { "epoch": 0.5638920347300869, "grad_norm": 0.3605799674987793, "learning_rate": 8.7176908979762e-05, "loss": 0.2613699436187744, "memory(GiB)": 91.64, "step": 5975, "token_acc": 0.9057390189163194, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.5643639109097772, "grad_norm": 0.22072555124759674, "learning_rate": 8.702216324450458e-05, "loss": 0.2566020965576172, "memory(GiB)": 91.64, "step": 5980, "token_acc": 0.9135802469135802, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.5648357870894677, "grad_norm": 0.5042877793312073, "learning_rate": 8.686744911223332e-05, "loss": 0.26034910678863527, "memory(GiB)": 91.64, "step": 5985, "token_acc": 0.9065708418891171, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5653076632691582, "grad_norm": 0.2110811173915863, "learning_rate": 8.671276695970043e-05, "loss": 0.2575195789337158, "memory(GiB)": 91.64, "step": 5990, "token_acc": 0.8952899961074348, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5657795394488486, "grad_norm": 0.44462695717811584, "learning_rate": 8.655811716358014e-05, "loss": 0.262726354598999, "memory(GiB)": 91.64, "step": 5995, "token_acc": 0.8959136468774094, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5662514156285391, "grad_norm": 0.2724670171737671, "learning_rate": 8.640350010046811e-05, "loss": 0.25894389152526853, "memory(GiB)": 91.64, "step": 6000, "token_acc": 0.9075779036827195, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5667232918082296, "grad_norm": 0.20576444268226624, "learning_rate": 8.624891614688014e-05, "loss": 0.2585927963256836, "memory(GiB)": 91.64, "step": 6005, "token_acc": 0.9124197810494526, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.56719516798792, "grad_norm": 0.2223316729068756, "learning_rate": 8.609436567925137e-05, "loss": 0.2674827575683594, "memory(GiB)": 91.64, "step": 6010, "token_acc": 0.9234624145785877, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5676670441676104, "grad_norm": 0.39043989777565, "learning_rate": 8.593984907393551e-05, "loss": 0.26184422969818116, "memory(GiB)": 91.64, "step": 6015, "token_acc": 0.8997484728710026, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5681389203473008, "grad_norm": 0.4472939968109131, "learning_rate": 8.578536670720373e-05, "loss": 0.2619999885559082, "memory(GiB)": 91.64, "step": 6020, "token_acc": 0.9160671462829736, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5686107965269913, "grad_norm": 0.28536486625671387, "learning_rate": 8.56309189552438e-05, "loss": 0.2595273494720459, "memory(GiB)": 91.64, "step": 6025, "token_acc": 0.9195612431444241, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5690826727066818, "grad_norm": 0.5671156644821167, "learning_rate": 8.547650619415934e-05, "loss": 0.26369786262512207, "memory(GiB)": 91.64, "step": 6030, "token_acc": 0.9050097592713078, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5695545488863722, "grad_norm": 0.20459724962711334, "learning_rate": 8.532212879996864e-05, "loss": 0.2596536636352539, "memory(GiB)": 91.64, "step": 6035, "token_acc": 0.9215686274509803, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5700264250660627, "grad_norm": 0.36021432280540466, "learning_rate": 8.516778714860387e-05, "loss": 0.25885491371154784, "memory(GiB)": 91.64, "step": 6040, "token_acc": 0.9114039073148569, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5704983012457531, "grad_norm": 0.21195665001869202, "learning_rate": 8.501348161591018e-05, "loss": 0.26376259326934814, "memory(GiB)": 91.64, "step": 6045, "token_acc": 0.9074463609591923, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.5709701774254435, "grad_norm": 0.4079072177410126, "learning_rate": 8.485921257764476e-05, "loss": 0.2624048233032227, "memory(GiB)": 91.64, "step": 6050, "token_acc": 0.918562201628756, "train_speed(iter/s)": 0.138561 }, { "epoch": 0.571442053605134, "grad_norm": 0.4321339726448059, "learning_rate": 8.470498040947601e-05, "loss": 0.26624159812927245, "memory(GiB)": 91.64, "step": 6055, "token_acc": 0.8944233892799134, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5719139297848245, "grad_norm": 0.586856484413147, "learning_rate": 8.455078548698243e-05, "loss": 0.2600421667098999, "memory(GiB)": 91.64, "step": 6060, "token_acc": 0.8978065802592223, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5723858059645149, "grad_norm": 0.2767311930656433, "learning_rate": 8.439662818565186e-05, "loss": 0.25961735248565676, "memory(GiB)": 91.64, "step": 6065, "token_acc": 0.9028112449799197, "train_speed(iter/s)": 0.138559 }, { "epoch": 0.5728576821442054, "grad_norm": 0.32214125990867615, "learning_rate": 8.424250888088056e-05, "loss": 0.2518421173095703, "memory(GiB)": 91.64, "step": 6070, "token_acc": 0.9209470304975923, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5733295583238958, "grad_norm": 0.32131657004356384, "learning_rate": 8.408842794797225e-05, "loss": 0.2633669376373291, "memory(GiB)": 91.64, "step": 6075, "token_acc": 0.8879159369527145, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5738014345035862, "grad_norm": 0.21564066410064697, "learning_rate": 8.39343857621371e-05, "loss": 0.2537181854248047, "memory(GiB)": 91.64, "step": 6080, "token_acc": 0.9242782773308093, "train_speed(iter/s)": 0.138561 }, { "epoch": 0.5742733106832767, "grad_norm": 0.6542291045188904, "learning_rate": 8.378038269849113e-05, "loss": 0.2556138277053833, "memory(GiB)": 91.64, "step": 6085, "token_acc": 0.9143944197844007, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.5747451868629672, "grad_norm": 0.375399112701416, "learning_rate": 8.362641913205497e-05, "loss": 0.26028482913970946, "memory(GiB)": 91.64, "step": 6090, "token_acc": 0.9022589052997394, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.5752170630426576, "grad_norm": 0.3971591293811798, "learning_rate": 8.347249543775303e-05, "loss": 0.25295219421386717, "memory(GiB)": 91.64, "step": 6095, "token_acc": 0.9101796407185628, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.5756889392223481, "grad_norm": 0.2439550906419754, "learning_rate": 8.331861199041272e-05, "loss": 0.2594885349273682, "memory(GiB)": 91.64, "step": 6100, "token_acc": 0.9213917525773195, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5761608154020385, "grad_norm": 0.4823835492134094, "learning_rate": 8.31647691647634e-05, "loss": 0.2540708541870117, "memory(GiB)": 91.64, "step": 6105, "token_acc": 0.9006509078451524, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.5766326915817289, "grad_norm": 0.40031829476356506, "learning_rate": 8.301096733543545e-05, "loss": 0.25965514183044436, "memory(GiB)": 91.64, "step": 6110, "token_acc": 0.9109712230215827, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5771045677614194, "grad_norm": 0.21735447645187378, "learning_rate": 8.285720687695953e-05, "loss": 0.25909032821655276, "memory(GiB)": 91.64, "step": 6115, "token_acc": 0.8939785740672331, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5775764439411099, "grad_norm": 0.8802348375320435, "learning_rate": 8.270348816376553e-05, "loss": 0.25847816467285156, "memory(GiB)": 91.64, "step": 6120, "token_acc": 0.8921238124425376, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5780483201208003, "grad_norm": 0.410442054271698, "learning_rate": 8.25498115701816e-05, "loss": 0.2583799362182617, "memory(GiB)": 91.64, "step": 6125, "token_acc": 0.9224385572933291, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5785201963004908, "grad_norm": 0.34100407361984253, "learning_rate": 8.23961774704334e-05, "loss": 0.2645753860473633, "memory(GiB)": 91.64, "step": 6130, "token_acc": 0.9133333333333333, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5789920724801813, "grad_norm": 0.45933786034584045, "learning_rate": 8.224258623864311e-05, "loss": 0.2588545322418213, "memory(GiB)": 91.64, "step": 6135, "token_acc": 0.8987271721084671, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5794639486598716, "grad_norm": 0.2021590918302536, "learning_rate": 8.208903824882843e-05, "loss": 0.2566136598587036, "memory(GiB)": 91.64, "step": 6140, "token_acc": 0.9168081494057725, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5799358248395621, "grad_norm": 0.46117132902145386, "learning_rate": 8.193553387490194e-05, "loss": 0.2587829351425171, "memory(GiB)": 91.64, "step": 6145, "token_acc": 0.8906326630701324, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5804077010192525, "grad_norm": 0.492724746465683, "learning_rate": 8.17820734906698e-05, "loss": 0.2539858341217041, "memory(GiB)": 91.64, "step": 6150, "token_acc": 0.9108153078202995, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.580879577198943, "grad_norm": 0.3408304452896118, "learning_rate": 8.162865746983122e-05, "loss": 0.261328125, "memory(GiB)": 91.64, "step": 6155, "token_acc": 0.901056338028169, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5813514533786335, "grad_norm": 0.2859143018722534, "learning_rate": 8.147528618597729e-05, "loss": 0.2623132228851318, "memory(GiB)": 91.64, "step": 6160, "token_acc": 0.9008238276299113, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.5818233295583239, "grad_norm": 0.4189459979534149, "learning_rate": 8.132196001259011e-05, "loss": 0.25851998329162595, "memory(GiB)": 91.64, "step": 6165, "token_acc": 0.9078947368421053, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.5822952057380143, "grad_norm": 0.5357767939567566, "learning_rate": 8.116867932304204e-05, "loss": 0.25238189697265623, "memory(GiB)": 91.64, "step": 6170, "token_acc": 0.8951935914552737, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5827670819177048, "grad_norm": 0.255214661359787, "learning_rate": 8.101544449059466e-05, "loss": 0.2591865062713623, "memory(GiB)": 91.64, "step": 6175, "token_acc": 0.896551724137931, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5832389580973952, "grad_norm": 0.42737099528312683, "learning_rate": 8.086225588839782e-05, "loss": 0.2616575241088867, "memory(GiB)": 91.64, "step": 6180, "token_acc": 0.910024650780608, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5837108342770857, "grad_norm": 0.28640568256378174, "learning_rate": 8.070911388948885e-05, "loss": 0.2526390790939331, "memory(GiB)": 91.64, "step": 6185, "token_acc": 0.9003815175922001, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.5841827104567762, "grad_norm": 0.4053497314453125, "learning_rate": 8.055601886679156e-05, "loss": 0.25293493270874023, "memory(GiB)": 91.64, "step": 6190, "token_acc": 0.8978351690087353, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5846545866364666, "grad_norm": 0.32031095027923584, "learning_rate": 8.040297119311536e-05, "loss": 0.2545334815979004, "memory(GiB)": 91.64, "step": 6195, "token_acc": 0.908307210031348, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.585126462816157, "grad_norm": 0.30353373289108276, "learning_rate": 8.024997124115437e-05, "loss": 0.2560997486114502, "memory(GiB)": 91.64, "step": 6200, "token_acc": 0.9118942731277533, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5855983389958475, "grad_norm": 0.43289270997047424, "learning_rate": 8.009701938348654e-05, "loss": 0.2583000183105469, "memory(GiB)": 91.64, "step": 6205, "token_acc": 0.9062111801242236, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5860702151755379, "grad_norm": 0.38920220732688904, "learning_rate": 7.994411599257268e-05, "loss": 0.2500426769256592, "memory(GiB)": 91.64, "step": 6210, "token_acc": 0.8985663082437276, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5865420913552284, "grad_norm": 0.2825988233089447, "learning_rate": 7.97912614407555e-05, "loss": 0.2535118579864502, "memory(GiB)": 91.64, "step": 6215, "token_acc": 0.9290194783757015, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5870139675349189, "grad_norm": 0.2607544958591461, "learning_rate": 7.963845610025892e-05, "loss": 0.2580404281616211, "memory(GiB)": 91.64, "step": 6220, "token_acc": 0.9124552327894946, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5874858437146093, "grad_norm": 0.6544619202613831, "learning_rate": 7.948570034318685e-05, "loss": 0.2521751880645752, "memory(GiB)": 91.64, "step": 6225, "token_acc": 0.9135643988018828, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5879577198942998, "grad_norm": 0.6524026393890381, "learning_rate": 7.933299454152266e-05, "loss": 0.25135116577148436, "memory(GiB)": 91.64, "step": 6230, "token_acc": 0.9067930489731437, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5884295960739901, "grad_norm": 0.3173801004886627, "learning_rate": 7.91803390671279e-05, "loss": 0.25176496505737306, "memory(GiB)": 91.64, "step": 6235, "token_acc": 0.8964194373401535, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5889014722536806, "grad_norm": 0.4476158618927002, "learning_rate": 7.902773429174166e-05, "loss": 0.2545851469039917, "memory(GiB)": 91.64, "step": 6240, "token_acc": 0.922882427307206, "train_speed(iter/s)": 0.138567 }, { "epoch": 0.5893733484333711, "grad_norm": 0.2693001925945282, "learning_rate": 7.88751805869795e-05, "loss": 0.2574381113052368, "memory(GiB)": 91.64, "step": 6245, "token_acc": 0.8921568627450981, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.5898452246130615, "grad_norm": 0.37155503034591675, "learning_rate": 7.872267832433272e-05, "loss": 0.25994248390197755, "memory(GiB)": 91.64, "step": 6250, "token_acc": 0.9076329076329076, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.590317100792752, "grad_norm": 0.6347530484199524, "learning_rate": 7.85702278751672e-05, "loss": 0.2505256175994873, "memory(GiB)": 91.64, "step": 6255, "token_acc": 0.9176829268292683, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5907889769724425, "grad_norm": 0.2157605141401291, "learning_rate": 7.841782961072284e-05, "loss": 0.25982060432434084, "memory(GiB)": 91.64, "step": 6260, "token_acc": 0.9095300834431269, "train_speed(iter/s)": 0.138565 }, { "epoch": 0.5912608531521328, "grad_norm": 0.6124857664108276, "learning_rate": 7.826548390211225e-05, "loss": 0.25920767784118653, "memory(GiB)": 91.64, "step": 6265, "token_acc": 0.9093830334190232, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5917327293318233, "grad_norm": 0.2416924685239792, "learning_rate": 7.811319112032027e-05, "loss": 0.25081772804260255, "memory(GiB)": 91.64, "step": 6270, "token_acc": 0.9212386401884888, "train_speed(iter/s)": 0.138567 }, { "epoch": 0.5922046055115138, "grad_norm": 0.8983920216560364, "learning_rate": 7.796095163620267e-05, "loss": 0.2576131343841553, "memory(GiB)": 91.64, "step": 6275, "token_acc": 0.8962986598596043, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.5926764816912042, "grad_norm": 0.4004858732223511, "learning_rate": 7.780876582048553e-05, "loss": 0.25931851863861083, "memory(GiB)": 91.64, "step": 6280, "token_acc": 0.9084588644264194, "train_speed(iter/s)": 0.138568 }, { "epoch": 0.5931483578708947, "grad_norm": 0.39955568313598633, "learning_rate": 7.76566340437642e-05, "loss": 0.25876388549804685, "memory(GiB)": 91.64, "step": 6285, "token_acc": 0.9037249283667622, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.5936202340505852, "grad_norm": 0.5433376431465149, "learning_rate": 7.750455667650251e-05, "loss": 0.2523482799530029, "memory(GiB)": 91.64, "step": 6290, "token_acc": 0.9070858751759737, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.5940921102302755, "grad_norm": 0.4515962600708008, "learning_rate": 7.735253408903174e-05, "loss": 0.2509950637817383, "memory(GiB)": 91.64, "step": 6295, "token_acc": 0.9048223350253807, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.594563986409966, "grad_norm": 0.3393830358982086, "learning_rate": 7.72005666515497e-05, "loss": 0.25605719089508056, "memory(GiB)": 91.64, "step": 6300, "token_acc": 0.9163781624500665, "train_speed(iter/s)": 0.138569 }, { "epoch": 0.5950358625896565, "grad_norm": 0.24305680394172668, "learning_rate": 7.704865473412008e-05, "loss": 0.2521644592285156, "memory(GiB)": 91.64, "step": 6305, "token_acc": 0.9120942257971847, "train_speed(iter/s)": 0.138571 }, { "epoch": 0.5955077387693469, "grad_norm": 0.5273348689079285, "learning_rate": 7.689679870667121e-05, "loss": 0.25645806789398196, "memory(GiB)": 91.64, "step": 6310, "token_acc": 0.9080547112462006, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.5959796149490374, "grad_norm": 0.826224148273468, "learning_rate": 7.674499893899533e-05, "loss": 0.25773797035217283, "memory(GiB)": 91.64, "step": 6315, "token_acc": 0.9154564315352697, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.5964514911287279, "grad_norm": 0.3442542552947998, "learning_rate": 7.659325580074782e-05, "loss": 0.2533260345458984, "memory(GiB)": 91.64, "step": 6320, "token_acc": 0.9165663858804654, "train_speed(iter/s)": 0.138571 }, { "epoch": 0.5969233673084182, "grad_norm": 0.5301949381828308, "learning_rate": 7.644156966144603e-05, "loss": 0.2529233455657959, "memory(GiB)": 91.64, "step": 6325, "token_acc": 0.9026500811249324, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.5973952434881087, "grad_norm": 0.34612637758255005, "learning_rate": 7.628994089046851e-05, "loss": 0.25222182273864746, "memory(GiB)": 91.64, "step": 6330, "token_acc": 0.9079539769884942, "train_speed(iter/s)": 0.138572 }, { "epoch": 0.5978671196677992, "grad_norm": 0.5262756943702698, "learning_rate": 7.61383698570542e-05, "loss": 0.25571882724761963, "memory(GiB)": 91.64, "step": 6335, "token_acc": 0.9050715214564369, "train_speed(iter/s)": 0.138573 }, { "epoch": 0.5983389958474896, "grad_norm": 0.37142181396484375, "learning_rate": 7.598685693030136e-05, "loss": 0.2572377920150757, "memory(GiB)": 91.64, "step": 6340, "token_acc": 0.9146341463414634, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.5988108720271801, "grad_norm": 0.4435655176639557, "learning_rate": 7.583540247916672e-05, "loss": 0.24800877571105956, "memory(GiB)": 91.64, "step": 6345, "token_acc": 0.9018944519621109, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.5992827482068706, "grad_norm": 0.7729746103286743, "learning_rate": 7.568400687246474e-05, "loss": 0.25422685146331786, "memory(GiB)": 91.64, "step": 6350, "token_acc": 0.9128616242593238, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.599754624386561, "grad_norm": 0.27863532304763794, "learning_rate": 7.553267047886651e-05, "loss": 0.25974535942077637, "memory(GiB)": 91.64, "step": 6355, "token_acc": 0.9166051660516605, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.6002265005662514, "grad_norm": 0.23578110337257385, "learning_rate": 7.53813936668989e-05, "loss": 0.25558838844299314, "memory(GiB)": 91.64, "step": 6360, "token_acc": 0.9315726290516206, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.6006983767459418, "grad_norm": 0.6068395972251892, "learning_rate": 7.523017680494372e-05, "loss": 0.25866079330444336, "memory(GiB)": 91.64, "step": 6365, "token_acc": 0.9188640973630832, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.6011702529256323, "grad_norm": 0.32254558801651, "learning_rate": 7.507902026123678e-05, "loss": 0.2493062973022461, "memory(GiB)": 91.64, "step": 6370, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.6016421291053228, "grad_norm": 0.5741229057312012, "learning_rate": 7.492792440386709e-05, "loss": 0.26273245811462403, "memory(GiB)": 91.64, "step": 6375, "token_acc": 0.9025764895330113, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.6021140052850132, "grad_norm": 0.21148309111595154, "learning_rate": 7.477688960077575e-05, "loss": 0.26000070571899414, "memory(GiB)": 91.64, "step": 6380, "token_acc": 0.8957715133531158, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.6025858814647037, "grad_norm": 0.31708240509033203, "learning_rate": 7.462591621975523e-05, "loss": 0.2541205406188965, "memory(GiB)": 91.64, "step": 6385, "token_acc": 0.9128664495114006, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6030577576443941, "grad_norm": 0.5386173725128174, "learning_rate": 7.447500462844848e-05, "loss": 0.2534413576126099, "memory(GiB)": 91.64, "step": 6390, "token_acc": 0.9030408773678963, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6035296338240845, "grad_norm": 0.28694167733192444, "learning_rate": 7.432415519434791e-05, "loss": 0.2476402759552002, "memory(GiB)": 91.64, "step": 6395, "token_acc": 0.9071058475203553, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.604001510003775, "grad_norm": 0.3457031548023224, "learning_rate": 7.417336828479462e-05, "loss": 0.2557513236999512, "memory(GiB)": 91.64, "step": 6400, "token_acc": 0.8938704028021016, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.6044733861834655, "grad_norm": 0.3015283942222595, "learning_rate": 7.402264426697742e-05, "loss": 0.25288589000701905, "memory(GiB)": 91.64, "step": 6405, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6049452623631559, "grad_norm": 0.340631365776062, "learning_rate": 7.387198350793201e-05, "loss": 0.2512622594833374, "memory(GiB)": 91.64, "step": 6410, "token_acc": 0.92037691401649, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6054171385428464, "grad_norm": 1.1782653331756592, "learning_rate": 7.372138637453998e-05, "loss": 0.25229225158691404, "memory(GiB)": 91.64, "step": 6415, "token_acc": 0.908903403231351, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.6058890147225368, "grad_norm": 0.2346450537443161, "learning_rate": 7.357085323352806e-05, "loss": 0.25162057876586913, "memory(GiB)": 91.64, "step": 6420, "token_acc": 0.912405513561583, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6063608909022272, "grad_norm": 0.28029438853263855, "learning_rate": 7.342038445146709e-05, "loss": 0.24912467002868652, "memory(GiB)": 91.64, "step": 6425, "token_acc": 0.9149034038638455, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.6068327670819177, "grad_norm": 0.7123743295669556, "learning_rate": 7.326998039477118e-05, "loss": 0.2548022985458374, "memory(GiB)": 91.64, "step": 6430, "token_acc": 0.9224393132030787, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6073046432616082, "grad_norm": 0.26868027448654175, "learning_rate": 7.311964142969688e-05, "loss": 0.2451089382171631, "memory(GiB)": 91.64, "step": 6435, "token_acc": 0.9096236890808143, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.6077765194412986, "grad_norm": 0.2344752699136734, "learning_rate": 7.296936792234221e-05, "loss": 0.24809615612030028, "memory(GiB)": 91.64, "step": 6440, "token_acc": 0.9250493096646942, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6082483956209891, "grad_norm": 0.3093951642513275, "learning_rate": 7.281916023864577e-05, "loss": 0.25421953201293945, "memory(GiB)": 91.64, "step": 6445, "token_acc": 0.9148073022312373, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6087202718006794, "grad_norm": 0.9016293883323669, "learning_rate": 7.266901874438585e-05, "loss": 0.2528842926025391, "memory(GiB)": 91.64, "step": 6450, "token_acc": 0.9130286493860846, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6091921479803699, "grad_norm": 0.45874106884002686, "learning_rate": 7.251894380517967e-05, "loss": 0.25236220359802247, "memory(GiB)": 91.64, "step": 6455, "token_acc": 0.9134799235181644, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.6096640241600604, "grad_norm": 0.3354821503162384, "learning_rate": 7.236893578648218e-05, "loss": 0.2504927158355713, "memory(GiB)": 91.64, "step": 6460, "token_acc": 0.9105378704720087, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.6101359003397508, "grad_norm": 0.3122103214263916, "learning_rate": 7.221899505358561e-05, "loss": 0.24795224666595458, "memory(GiB)": 91.64, "step": 6465, "token_acc": 0.9104912572855953, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.6106077765194413, "grad_norm": 0.28339579701423645, "learning_rate": 7.206912197161815e-05, "loss": 0.2503954887390137, "memory(GiB)": 91.64, "step": 6470, "token_acc": 0.9248035914702581, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.6110796526991318, "grad_norm": 0.9332698583602905, "learning_rate": 7.191931690554334e-05, "loss": 0.2518021583557129, "memory(GiB)": 91.64, "step": 6475, "token_acc": 0.9148079306071871, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.6115515288788222, "grad_norm": 0.2940748929977417, "learning_rate": 7.176958022015902e-05, "loss": 0.250186824798584, "memory(GiB)": 91.64, "step": 6480, "token_acc": 0.9148569458807307, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.6120234050585126, "grad_norm": 0.5708451867103577, "learning_rate": 7.161991228009663e-05, "loss": 0.24937090873718262, "memory(GiB)": 91.64, "step": 6485, "token_acc": 0.9102803738317757, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6124952812382031, "grad_norm": 0.5467776656150818, "learning_rate": 7.147031344982007e-05, "loss": 0.2573434829711914, "memory(GiB)": 91.64, "step": 6490, "token_acc": 0.9211538461538461, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.6129671574178935, "grad_norm": 0.48039039969444275, "learning_rate": 7.132078409362506e-05, "loss": 0.2514265298843384, "memory(GiB)": 91.64, "step": 6495, "token_acc": 0.9051139864448552, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.613439033597584, "grad_norm": 0.5072046518325806, "learning_rate": 7.117132457563807e-05, "loss": 0.24883434772491456, "memory(GiB)": 91.64, "step": 6500, "token_acc": 0.9116967175219602, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6139109097772745, "grad_norm": 0.6466028094291687, "learning_rate": 7.102193525981555e-05, "loss": 0.2516045570373535, "memory(GiB)": 91.64, "step": 6505, "token_acc": 0.9102815979043877, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.6143827859569649, "grad_norm": 0.4448561370372772, "learning_rate": 7.087261650994295e-05, "loss": 0.24750699996948242, "memory(GiB)": 91.64, "step": 6510, "token_acc": 0.9200743494423792, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.6148546621366553, "grad_norm": 0.3403247594833374, "learning_rate": 7.072336868963387e-05, "loss": 0.2533865451812744, "memory(GiB)": 91.64, "step": 6515, "token_acc": 0.9022582921665491, "train_speed(iter/s)": 0.138579 }, { "epoch": 0.6153265383163458, "grad_norm": 0.25941842794418335, "learning_rate": 7.057419216232925e-05, "loss": 0.25099682807922363, "memory(GiB)": 91.64, "step": 6520, "token_acc": 0.916248552682362, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.6157984144960362, "grad_norm": 0.39571791887283325, "learning_rate": 7.042508729129644e-05, "loss": 0.2497119903564453, "memory(GiB)": 91.64, "step": 6525, "token_acc": 0.9218089602704987, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.6162702906757267, "grad_norm": 0.2697203457355499, "learning_rate": 7.027605443962821e-05, "loss": 0.2517711639404297, "memory(GiB)": 91.64, "step": 6530, "token_acc": 0.9127028539451595, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.6167421668554172, "grad_norm": 0.4376463294029236, "learning_rate": 7.012709397024195e-05, "loss": 0.2458188056945801, "memory(GiB)": 91.64, "step": 6535, "token_acc": 0.9233644859813084, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6172140430351076, "grad_norm": 0.9600175023078918, "learning_rate": 6.997820624587888e-05, "loss": 0.2551449775695801, "memory(GiB)": 91.64, "step": 6540, "token_acc": 0.9109266943291839, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.617685919214798, "grad_norm": 0.2927698493003845, "learning_rate": 6.982939162910297e-05, "loss": 0.2430652618408203, "memory(GiB)": 91.64, "step": 6545, "token_acc": 0.9067005937234945, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6181577953944885, "grad_norm": 0.2956832945346832, "learning_rate": 6.968065048230028e-05, "loss": 0.2454047679901123, "memory(GiB)": 91.64, "step": 6550, "token_acc": 0.9283831282952548, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6186296715741789, "grad_norm": 0.41990306973457336, "learning_rate": 6.953198316767784e-05, "loss": 0.25021138191223147, "memory(GiB)": 91.64, "step": 6555, "token_acc": 0.9067796610169492, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6191015477538694, "grad_norm": 0.31326499581336975, "learning_rate": 6.938339004726297e-05, "loss": 0.25416412353515627, "memory(GiB)": 91.64, "step": 6560, "token_acc": 0.9209631728045325, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.6195734239335599, "grad_norm": 0.5134010910987854, "learning_rate": 6.923487148290228e-05, "loss": 0.24505879878997802, "memory(GiB)": 91.64, "step": 6565, "token_acc": 0.9120978704123244, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6200453001132503, "grad_norm": 0.5389367341995239, "learning_rate": 6.908642783626083e-05, "loss": 0.24259617328643798, "memory(GiB)": 91.64, "step": 6570, "token_acc": 0.9113731456827691, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6205171762929408, "grad_norm": 0.29177892208099365, "learning_rate": 6.893805946882122e-05, "loss": 0.25206589698791504, "memory(GiB)": 91.64, "step": 6575, "token_acc": 0.9010067114093959, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6209890524726311, "grad_norm": 0.37285518646240234, "learning_rate": 6.87897667418828e-05, "loss": 0.2487583875656128, "memory(GiB)": 91.64, "step": 6580, "token_acc": 0.9034863945578231, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6214609286523216, "grad_norm": 0.3356618285179138, "learning_rate": 6.864155001656068e-05, "loss": 0.24705860614776612, "memory(GiB)": 91.64, "step": 6585, "token_acc": 0.90715667311412, "train_speed(iter/s)": 0.138581 }, { "epoch": 0.6219328048320121, "grad_norm": 0.44045567512512207, "learning_rate": 6.849340965378488e-05, "loss": 0.24823305606842042, "memory(GiB)": 91.64, "step": 6590, "token_acc": 0.9090614886731392, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6224046810117025, "grad_norm": 0.22942085564136505, "learning_rate": 6.83453460142995e-05, "loss": 0.24726405143737792, "memory(GiB)": 91.64, "step": 6595, "token_acc": 0.9070940932027308, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.622876557191393, "grad_norm": 0.2592119872570038, "learning_rate": 6.819735945866177e-05, "loss": 0.25738024711608887, "memory(GiB)": 91.64, "step": 6600, "token_acc": 0.9033684926845866, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.6233484333710835, "grad_norm": 0.224277526140213, "learning_rate": 6.80494503472412e-05, "loss": 0.2494365930557251, "memory(GiB)": 91.64, "step": 6605, "token_acc": 0.9118002416431735, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6238203095507738, "grad_norm": 0.4567587971687317, "learning_rate": 6.790161904021884e-05, "loss": 0.24388408660888672, "memory(GiB)": 91.64, "step": 6610, "token_acc": 0.919981498612396, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6242921857304643, "grad_norm": 0.3329204320907593, "learning_rate": 6.775386589758612e-05, "loss": 0.2464517116546631, "memory(GiB)": 91.64, "step": 6615, "token_acc": 0.917174959871589, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6247640619101548, "grad_norm": 0.20031793415546417, "learning_rate": 6.760619127914417e-05, "loss": 0.24638218879699708, "memory(GiB)": 91.64, "step": 6620, "token_acc": 0.9080194722008711, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.6252359380898452, "grad_norm": 0.362870991230011, "learning_rate": 6.745859554450296e-05, "loss": 0.24988138675689697, "memory(GiB)": 91.64, "step": 6625, "token_acc": 0.9098639455782312, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6257078142695357, "grad_norm": 1.1485202312469482, "learning_rate": 6.731107905308025e-05, "loss": 0.24473962783813477, "memory(GiB)": 91.64, "step": 6630, "token_acc": 0.9165175909361956, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.6261796904492262, "grad_norm": 0.2645990252494812, "learning_rate": 6.716364216410095e-05, "loss": 0.25117623805999756, "memory(GiB)": 91.64, "step": 6635, "token_acc": 0.9164531009738596, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.6266515666289165, "grad_norm": 0.2435101419687271, "learning_rate": 6.70162852365961e-05, "loss": 0.2497401237487793, "memory(GiB)": 91.64, "step": 6640, "token_acc": 0.9139902014153511, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.627123442808607, "grad_norm": 0.7363473773002625, "learning_rate": 6.686900862940199e-05, "loss": 0.2504051685333252, "memory(GiB)": 91.64, "step": 6645, "token_acc": 0.9123672230652504, "train_speed(iter/s)": 0.138584 }, { "epoch": 0.6275953189882975, "grad_norm": 0.4245203137397766, "learning_rate": 6.672181270115929e-05, "loss": 0.24365406036376952, "memory(GiB)": 91.64, "step": 6650, "token_acc": 0.9070602313522138, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.6280671951679879, "grad_norm": 0.38381725549697876, "learning_rate": 6.657469781031229e-05, "loss": 0.2454462766647339, "memory(GiB)": 91.64, "step": 6655, "token_acc": 0.911594602038006, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.6285390713476784, "grad_norm": 0.20383363962173462, "learning_rate": 6.64276643151079e-05, "loss": 0.24686357975006104, "memory(GiB)": 91.64, "step": 6660, "token_acc": 0.9073148568832349, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.6290109475273689, "grad_norm": 0.221710205078125, "learning_rate": 6.628071257359473e-05, "loss": 0.24501872062683105, "memory(GiB)": 91.64, "step": 6665, "token_acc": 0.9191333536771438, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.6294828237070592, "grad_norm": 0.6653693914413452, "learning_rate": 6.613384294362248e-05, "loss": 0.24365825653076173, "memory(GiB)": 91.64, "step": 6670, "token_acc": 0.9140116478245974, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.6299546998867497, "grad_norm": 0.5823807716369629, "learning_rate": 6.598705578284081e-05, "loss": 0.2461719036102295, "memory(GiB)": 91.64, "step": 6675, "token_acc": 0.9245418613007546, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.6304265760664401, "grad_norm": 0.34033286571502686, "learning_rate": 6.58403514486985e-05, "loss": 0.2465500831604004, "memory(GiB)": 91.64, "step": 6680, "token_acc": 0.904896090172596, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.6308984522461306, "grad_norm": 0.33391931653022766, "learning_rate": 6.569373029844273e-05, "loss": 0.2485738754272461, "memory(GiB)": 91.64, "step": 6685, "token_acc": 0.8928571428571429, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.6313703284258211, "grad_norm": 0.3390878736972809, "learning_rate": 6.554719268911804e-05, "loss": 0.2519662380218506, "memory(GiB)": 91.64, "step": 6690, "token_acc": 0.8982584784601283, "train_speed(iter/s)": 0.138586 }, { "epoch": 0.6318422046055115, "grad_norm": 0.2536347508430481, "learning_rate": 6.540073897756557e-05, "loss": 0.24952611923217774, "memory(GiB)": 91.64, "step": 6695, "token_acc": 0.8970688479890934, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.632314080785202, "grad_norm": 0.2514791786670685, "learning_rate": 6.52543695204222e-05, "loss": 0.249678373336792, "memory(GiB)": 91.64, "step": 6700, "token_acc": 0.9129193433261956, "train_speed(iter/s)": 0.138588 }, { "epoch": 0.6327859569648924, "grad_norm": 0.30157020688056946, "learning_rate": 6.510808467411955e-05, "loss": 0.24402194023132323, "memory(GiB)": 91.64, "step": 6705, "token_acc": 0.8997335109926715, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.6332578331445828, "grad_norm": 0.3998345732688904, "learning_rate": 6.496188479488328e-05, "loss": 0.24702603816986085, "memory(GiB)": 91.64, "step": 6710, "token_acc": 0.9077490774907749, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.6337297093242733, "grad_norm": 0.7433375716209412, "learning_rate": 6.481577023873204e-05, "loss": 0.2497015953063965, "memory(GiB)": 91.64, "step": 6715, "token_acc": 0.9197431781701445, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.6342015855039638, "grad_norm": 0.5230844020843506, "learning_rate": 6.466974136147679e-05, "loss": 0.24684548377990723, "memory(GiB)": 91.64, "step": 6720, "token_acc": 0.9207169941399518, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.6346734616836542, "grad_norm": 0.6465916633605957, "learning_rate": 6.45237985187199e-05, "loss": 0.24689052104949952, "memory(GiB)": 91.64, "step": 6725, "token_acc": 0.9153567110036276, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.6351453378633447, "grad_norm": 0.23393841087818146, "learning_rate": 6.43779420658541e-05, "loss": 0.23820953369140624, "memory(GiB)": 91.64, "step": 6730, "token_acc": 0.9156939040207522, "train_speed(iter/s)": 0.13859 }, { "epoch": 0.6356172140430351, "grad_norm": 0.31033238768577576, "learning_rate": 6.42321723580618e-05, "loss": 0.2415005683898926, "memory(GiB)": 91.64, "step": 6735, "token_acc": 0.9279962103268593, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.6360890902227255, "grad_norm": 0.4967711865901947, "learning_rate": 6.408648975031423e-05, "loss": 0.25005640983581545, "memory(GiB)": 91.64, "step": 6740, "token_acc": 0.9208301306687163, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.636560966402416, "grad_norm": 0.5336925983428955, "learning_rate": 6.394089459737043e-05, "loss": 0.25364394187927247, "memory(GiB)": 91.64, "step": 6745, "token_acc": 0.9160662824207493, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6370328425821065, "grad_norm": 0.4167347848415375, "learning_rate": 6.379538725377649e-05, "loss": 0.242067289352417, "memory(GiB)": 91.64, "step": 6750, "token_acc": 0.9000430848772081, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6375047187617969, "grad_norm": 0.7526382207870483, "learning_rate": 6.364996807386474e-05, "loss": 0.251052188873291, "memory(GiB)": 91.64, "step": 6755, "token_acc": 0.913547532295462, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6379765949414874, "grad_norm": 0.31620025634765625, "learning_rate": 6.350463741175281e-05, "loss": 0.24103710651397706, "memory(GiB)": 91.64, "step": 6760, "token_acc": 0.9160739687055477, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6384484711211778, "grad_norm": 0.5015328526496887, "learning_rate": 6.335939562134268e-05, "loss": 0.24773306846618653, "memory(GiB)": 91.64, "step": 6765, "token_acc": 0.9099279423538831, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6389203473008682, "grad_norm": 0.2772028148174286, "learning_rate": 6.321424305631998e-05, "loss": 0.24385676383972169, "memory(GiB)": 91.64, "step": 6770, "token_acc": 0.9089005235602095, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6393922234805587, "grad_norm": 0.6564515829086304, "learning_rate": 6.306918007015307e-05, "loss": 0.2472226619720459, "memory(GiB)": 91.64, "step": 6775, "token_acc": 0.9095378564405113, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6398640996602492, "grad_norm": 0.29450663924217224, "learning_rate": 6.292420701609214e-05, "loss": 0.25525641441345215, "memory(GiB)": 91.64, "step": 6780, "token_acc": 0.8881748071979434, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6403359758399396, "grad_norm": 0.33771711587905884, "learning_rate": 6.277932424716844e-05, "loss": 0.24898838996887207, "memory(GiB)": 91.64, "step": 6785, "token_acc": 0.9064565327910523, "train_speed(iter/s)": 0.138592 }, { "epoch": 0.6408078520196301, "grad_norm": 0.4698493182659149, "learning_rate": 6.263453211619328e-05, "loss": 0.24726357460021972, "memory(GiB)": 91.64, "step": 6790, "token_acc": 0.900737379466818, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6412797281993206, "grad_norm": 0.27207568287849426, "learning_rate": 6.248983097575734e-05, "loss": 0.2391824960708618, "memory(GiB)": 91.64, "step": 6795, "token_acc": 0.9296657381615598, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6417516043790109, "grad_norm": 0.2685665190219879, "learning_rate": 6.234522117822964e-05, "loss": 0.24474749565124512, "memory(GiB)": 91.64, "step": 6800, "token_acc": 0.9061456245824984, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6422234805587014, "grad_norm": 0.37602898478507996, "learning_rate": 6.220070307575681e-05, "loss": 0.24138176441192627, "memory(GiB)": 91.64, "step": 6805, "token_acc": 0.9087617668356264, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6426953567383918, "grad_norm": 0.3656831681728363, "learning_rate": 6.205627702026217e-05, "loss": 0.24532785415649414, "memory(GiB)": 91.64, "step": 6810, "token_acc": 0.915625, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6431672329180823, "grad_norm": 0.7820502519607544, "learning_rate": 6.191194336344499e-05, "loss": 0.24389877319335937, "memory(GiB)": 91.64, "step": 6815, "token_acc": 0.9069687607277721, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6436391090977728, "grad_norm": 0.28132694959640503, "learning_rate": 6.176770245677937e-05, "loss": 0.23709509372711182, "memory(GiB)": 91.64, "step": 6820, "token_acc": 0.9139106286291341, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6441109852774632, "grad_norm": 0.29679086804389954, "learning_rate": 6.162355465151366e-05, "loss": 0.2428110122680664, "memory(GiB)": 91.64, "step": 6825, "token_acc": 0.9155524278676987, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6445828614571536, "grad_norm": 0.6224273443222046, "learning_rate": 6.147950029866946e-05, "loss": 0.25038561820983884, "memory(GiB)": 91.64, "step": 6830, "token_acc": 0.90463645943098, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.6450547376368441, "grad_norm": 0.21147441864013672, "learning_rate": 6.13355397490408e-05, "loss": 0.24030847549438478, "memory(GiB)": 91.64, "step": 6835, "token_acc": 0.9303013993541442, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6455266138165345, "grad_norm": 0.3700733184814453, "learning_rate": 6.119167335319326e-05, "loss": 0.24275476932525636, "memory(GiB)": 91.64, "step": 6840, "token_acc": 0.9167386920195909, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.645998489996225, "grad_norm": 0.3230724632740021, "learning_rate": 6.104790146146326e-05, "loss": 0.24337000846862794, "memory(GiB)": 91.64, "step": 6845, "token_acc": 0.9195822454308094, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6464703661759155, "grad_norm": 0.5377890467643738, "learning_rate": 6.0904224423956935e-05, "loss": 0.24264154434204102, "memory(GiB)": 91.64, "step": 6850, "token_acc": 0.9083885209713024, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6469422423556059, "grad_norm": 0.2459423542022705, "learning_rate": 6.07606425905495e-05, "loss": 0.24474921226501464, "memory(GiB)": 91.64, "step": 6855, "token_acc": 0.9048361934477379, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6474141185352963, "grad_norm": 0.21430309116840363, "learning_rate": 6.061715631088436e-05, "loss": 0.24547672271728516, "memory(GiB)": 91.64, "step": 6860, "token_acc": 0.9136972866949984, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6478859947149868, "grad_norm": 0.1994255632162094, "learning_rate": 6.047376593437214e-05, "loss": 0.24437365531921387, "memory(GiB)": 91.64, "step": 6865, "token_acc": 0.8946328613763582, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6483578708946772, "grad_norm": 0.21017318964004517, "learning_rate": 6.033047181019007e-05, "loss": 0.2440267562866211, "memory(GiB)": 91.64, "step": 6870, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6488297470743677, "grad_norm": 0.21352751553058624, "learning_rate": 6.0187274287280915e-05, "loss": 0.2388768672943115, "memory(GiB)": 91.64, "step": 6875, "token_acc": 0.9148382298252138, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6493016232540582, "grad_norm": 0.37781575322151184, "learning_rate": 6.004417371435216e-05, "loss": 0.2506894111633301, "memory(GiB)": 91.64, "step": 6880, "token_acc": 0.916063059224542, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.6497734994337486, "grad_norm": 0.5697845220565796, "learning_rate": 5.990117043987524e-05, "loss": 0.24643681049346924, "memory(GiB)": 91.64, "step": 6885, "token_acc": 0.9218500797448166, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.650245375613439, "grad_norm": 0.27268853783607483, "learning_rate": 5.975826481208469e-05, "loss": 0.24230141639709474, "memory(GiB)": 91.64, "step": 6890, "token_acc": 0.9050925925925926, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6507172517931294, "grad_norm": 0.23473475873470306, "learning_rate": 5.961545717897716e-05, "loss": 0.24397435188293456, "memory(GiB)": 91.64, "step": 6895, "token_acc": 0.9104010025062657, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6511891279728199, "grad_norm": 0.2682229280471802, "learning_rate": 5.9472747888310834e-05, "loss": 0.24428434371948243, "memory(GiB)": 91.64, "step": 6900, "token_acc": 0.9191026512576479, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6516610041525104, "grad_norm": 0.3811211585998535, "learning_rate": 5.933013728760423e-05, "loss": 0.24194138050079345, "memory(GiB)": 91.64, "step": 6905, "token_acc": 0.9222492190211732, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6521328803322008, "grad_norm": 0.36602070927619934, "learning_rate": 5.9187625724135674e-05, "loss": 0.2407762050628662, "memory(GiB)": 91.64, "step": 6910, "token_acc": 0.922350162385592, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6526047565118913, "grad_norm": 0.34567978978157043, "learning_rate": 5.904521354494228e-05, "loss": 0.24706146717071534, "memory(GiB)": 91.64, "step": 6915, "token_acc": 0.9051838723969872, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6530766326915818, "grad_norm": 0.748266339302063, "learning_rate": 5.890290109681911e-05, "loss": 0.24392051696777345, "memory(GiB)": 91.64, "step": 6920, "token_acc": 0.9111389236545682, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6535485088712721, "grad_norm": 0.6312034130096436, "learning_rate": 5.8760688726318394e-05, "loss": 0.2393946170806885, "memory(GiB)": 91.64, "step": 6925, "token_acc": 0.9175446633073752, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6540203850509626, "grad_norm": 0.39340659976005554, "learning_rate": 5.861857677974871e-05, "loss": 0.24171264171600343, "memory(GiB)": 91.64, "step": 6930, "token_acc": 0.9208353569694027, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6544922612306531, "grad_norm": 0.27433183789253235, "learning_rate": 5.8476565603174025e-05, "loss": 0.24087843894958497, "memory(GiB)": 91.64, "step": 6935, "token_acc": 0.9106471816283925, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6549641374103435, "grad_norm": 0.2510198950767517, "learning_rate": 5.833465554241291e-05, "loss": 0.24894437789916993, "memory(GiB)": 91.64, "step": 6940, "token_acc": 0.9257688229056203, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.655436013590034, "grad_norm": 0.4624379277229309, "learning_rate": 5.8192846943037724e-05, "loss": 0.2392751693725586, "memory(GiB)": 91.64, "step": 6945, "token_acc": 0.9086819613135403, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6559078897697245, "grad_norm": 0.664070725440979, "learning_rate": 5.805114015037383e-05, "loss": 0.24224374294281006, "memory(GiB)": 91.64, "step": 6950, "token_acc": 0.9146216768916156, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6563797659494148, "grad_norm": 0.250935435295105, "learning_rate": 5.790953550949845e-05, "loss": 0.23738400936126708, "memory(GiB)": 91.64, "step": 6955, "token_acc": 0.9275690357627886, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6568516421291053, "grad_norm": 0.6119559407234192, "learning_rate": 5.7768033365240346e-05, "loss": 0.2418374538421631, "memory(GiB)": 91.64, "step": 6960, "token_acc": 0.8984538236523193, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6573235183087958, "grad_norm": 0.32473883032798767, "learning_rate": 5.7626634062178474e-05, "loss": 0.24251232147216797, "memory(GiB)": 91.64, "step": 6965, "token_acc": 0.9, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6577953944884862, "grad_norm": 0.6449810862541199, "learning_rate": 5.748533794464142e-05, "loss": 0.24386320114135743, "memory(GiB)": 91.64, "step": 6970, "token_acc": 0.9190297232661429, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6582672706681767, "grad_norm": 0.6496239900588989, "learning_rate": 5.7344145356706515e-05, "loss": 0.24536077976226806, "memory(GiB)": 91.64, "step": 6975, "token_acc": 0.9164449175093135, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6587391468478672, "grad_norm": 0.48430728912353516, "learning_rate": 5.7203056642199e-05, "loss": 0.23882853984832764, "memory(GiB)": 91.64, "step": 6980, "token_acc": 0.9236111111111112, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6592110230275575, "grad_norm": 0.27332404255867004, "learning_rate": 5.7062072144691036e-05, "loss": 0.23896045684814454, "memory(GiB)": 91.64, "step": 6985, "token_acc": 0.9155635062611807, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.659682899207248, "grad_norm": 0.31327372789382935, "learning_rate": 5.692119220750123e-05, "loss": 0.24076180458068847, "memory(GiB)": 91.64, "step": 6990, "token_acc": 0.9125720094883091, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6601547753869385, "grad_norm": 0.3141052722930908, "learning_rate": 5.678041717369331e-05, "loss": 0.23503921031951905, "memory(GiB)": 91.64, "step": 6995, "token_acc": 0.9165507649513213, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6606266515666289, "grad_norm": 0.8892892003059387, "learning_rate": 5.663974738607576e-05, "loss": 0.24884920120239257, "memory(GiB)": 91.64, "step": 7000, "token_acc": 0.917921146953405, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6610985277463194, "grad_norm": 0.8716257214546204, "learning_rate": 5.649918318720069e-05, "loss": 0.2401883602142334, "memory(GiB)": 91.64, "step": 7005, "token_acc": 0.8837485172004745, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6615704039260099, "grad_norm": 0.28352469205856323, "learning_rate": 5.635872491936301e-05, "loss": 0.24602279663085938, "memory(GiB)": 91.64, "step": 7010, "token_acc": 0.9251700680272109, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6620422801057002, "grad_norm": 0.6487071514129639, "learning_rate": 5.621837292459975e-05, "loss": 0.24283573627471924, "memory(GiB)": 91.64, "step": 7015, "token_acc": 0.9180874722016308, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.6625141562853907, "grad_norm": 0.27189901471138, "learning_rate": 5.6078127544689275e-05, "loss": 0.2412208080291748, "memory(GiB)": 91.64, "step": 7020, "token_acc": 0.9183193277310925, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6629860324650811, "grad_norm": 0.576571524143219, "learning_rate": 5.593798912115007e-05, "loss": 0.23776733875274658, "memory(GiB)": 91.64, "step": 7025, "token_acc": 0.9100346020761245, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6634579086447716, "grad_norm": 0.31800100207328796, "learning_rate": 5.579795799524033e-05, "loss": 0.23961338996887208, "memory(GiB)": 91.64, "step": 7030, "token_acc": 0.9148170365926814, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6639297848244621, "grad_norm": 0.592032253742218, "learning_rate": 5.565803450795696e-05, "loss": 0.24274992942810059, "memory(GiB)": 91.64, "step": 7035, "token_acc": 0.9148219441770934, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6644016610041525, "grad_norm": 0.20974688231945038, "learning_rate": 5.551821900003461e-05, "loss": 0.23541603088378907, "memory(GiB)": 91.64, "step": 7040, "token_acc": 0.9282661782661783, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.664873537183843, "grad_norm": 0.9004971981048584, "learning_rate": 5.5378511811945246e-05, "loss": 0.24225046634674072, "memory(GiB)": 91.64, "step": 7045, "token_acc": 0.906993775173929, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.6653454133635334, "grad_norm": 0.2866557538509369, "learning_rate": 5.5238913283896766e-05, "loss": 0.23856678009033203, "memory(GiB)": 91.64, "step": 7050, "token_acc": 0.9058961343225302, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6658172895432238, "grad_norm": 0.7507352232933044, "learning_rate": 5.509942375583267e-05, "loss": 0.24381327629089355, "memory(GiB)": 91.64, "step": 7055, "token_acc": 0.9071883530482256, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6662891657229143, "grad_norm": 0.23248225450515747, "learning_rate": 5.496004356743093e-05, "loss": 0.24313607215881347, "memory(GiB)": 91.64, "step": 7060, "token_acc": 0.9168689320388349, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6667610419026048, "grad_norm": 0.20760686695575714, "learning_rate": 5.482077305810334e-05, "loss": 0.23505353927612305, "memory(GiB)": 91.64, "step": 7065, "token_acc": 0.9141703130259172, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6672329180822952, "grad_norm": 0.30243703722953796, "learning_rate": 5.468161256699443e-05, "loss": 0.2383075475692749, "memory(GiB)": 91.64, "step": 7070, "token_acc": 0.9148854961832061, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6677047942619857, "grad_norm": 0.35814082622528076, "learning_rate": 5.454256243298112e-05, "loss": 0.24843888282775878, "memory(GiB)": 91.64, "step": 7075, "token_acc": 0.9190017513134852, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6681766704416761, "grad_norm": 0.411588191986084, "learning_rate": 5.440362299467128e-05, "loss": 0.2393017053604126, "memory(GiB)": 91.64, "step": 7080, "token_acc": 0.9128503075871497, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.6686485466213665, "grad_norm": 0.23460780084133148, "learning_rate": 5.4264794590403404e-05, "loss": 0.23937084674835205, "memory(GiB)": 91.64, "step": 7085, "token_acc": 0.916003293988471, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.669120422801057, "grad_norm": 0.5503364205360413, "learning_rate": 5.412607755824559e-05, "loss": 0.24756169319152832, "memory(GiB)": 91.64, "step": 7090, "token_acc": 0.9075514874141877, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.6695922989807475, "grad_norm": 0.32460644841194153, "learning_rate": 5.3987472235994615e-05, "loss": 0.24046099185943604, "memory(GiB)": 91.64, "step": 7095, "token_acc": 0.9222476314929762, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6700641751604379, "grad_norm": 0.8033245801925659, "learning_rate": 5.3848978961175325e-05, "loss": 0.2427436590194702, "memory(GiB)": 91.64, "step": 7100, "token_acc": 0.9128919860627178, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6705360513401284, "grad_norm": 0.28919094800949097, "learning_rate": 5.3710598071039774e-05, "loss": 0.2408435344696045, "memory(GiB)": 91.64, "step": 7105, "token_acc": 0.9220098643649816, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6710079275198187, "grad_norm": 0.3446965217590332, "learning_rate": 5.357232990256618e-05, "loss": 0.2420729637145996, "memory(GiB)": 91.64, "step": 7110, "token_acc": 0.9189504373177843, "train_speed(iter/s)": 0.138596 }, { "epoch": 0.6714798036995092, "grad_norm": 0.5533409714698792, "learning_rate": 5.3434174792458357e-05, "loss": 0.24679412841796874, "memory(GiB)": 91.64, "step": 7115, "token_acc": 0.9165575916230366, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6719516798791997, "grad_norm": 0.26969775557518005, "learning_rate": 5.3296133077144864e-05, "loss": 0.23531513214111327, "memory(GiB)": 91.64, "step": 7120, "token_acc": 0.9101084295208115, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.6724235560588901, "grad_norm": 0.3181051015853882, "learning_rate": 5.315820509277796e-05, "loss": 0.2495020866394043, "memory(GiB)": 91.64, "step": 7125, "token_acc": 0.9126819126819127, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.6728954322385806, "grad_norm": 0.6126598119735718, "learning_rate": 5.302039117523307e-05, "loss": 0.2389207363128662, "memory(GiB)": 91.64, "step": 7130, "token_acc": 0.9185185185185185, "train_speed(iter/s)": 0.138598 }, { "epoch": 0.6733673084182711, "grad_norm": 0.22201423346996307, "learning_rate": 5.288269166010788e-05, "loss": 0.24875764846801757, "memory(GiB)": 91.64, "step": 7135, "token_acc": 0.9052910052910053, "train_speed(iter/s)": 0.138599 }, { "epoch": 0.6738391845979615, "grad_norm": 0.3069898188114166, "learning_rate": 5.274510688272141e-05, "loss": 0.24061965942382812, "memory(GiB)": 91.64, "step": 7140, "token_acc": 0.9072276159654801, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6743110607776519, "grad_norm": 0.24082788825035095, "learning_rate": 5.260763717811328e-05, "loss": 0.2444239616394043, "memory(GiB)": 91.64, "step": 7145, "token_acc": 0.9239920687376074, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6747829369573424, "grad_norm": 0.4293016791343689, "learning_rate": 5.247028288104301e-05, "loss": 0.24262595176696777, "memory(GiB)": 91.64, "step": 7150, "token_acc": 0.9193776520509194, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6752548131370328, "grad_norm": 0.3337520658969879, "learning_rate": 5.233304432598886e-05, "loss": 0.2384474754333496, "memory(GiB)": 91.64, "step": 7155, "token_acc": 0.9188795925791197, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6757266893167233, "grad_norm": 0.28636276721954346, "learning_rate": 5.2195921847147436e-05, "loss": 0.24127793312072754, "memory(GiB)": 91.64, "step": 7160, "token_acc": 0.9061994609164421, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6761985654964138, "grad_norm": 0.2952304482460022, "learning_rate": 5.2058915778432614e-05, "loss": 0.23610930442810057, "memory(GiB)": 91.64, "step": 7165, "token_acc": 0.9076154806491885, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6766704416761042, "grad_norm": 0.2610105574131012, "learning_rate": 5.1922026453474795e-05, "loss": 0.2389233112335205, "memory(GiB)": 91.64, "step": 7170, "token_acc": 0.9191176470588235, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6771423178557946, "grad_norm": 0.23289845883846283, "learning_rate": 5.178525420562013e-05, "loss": 0.24023265838623048, "memory(GiB)": 91.64, "step": 7175, "token_acc": 0.9193548387096774, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6776141940354851, "grad_norm": 0.3138931691646576, "learning_rate": 5.164859936792955e-05, "loss": 0.2391728162765503, "memory(GiB)": 91.64, "step": 7180, "token_acc": 0.9241106719367589, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6780860702151755, "grad_norm": 0.5057432651519775, "learning_rate": 5.1512062273178195e-05, "loss": 0.2443333864212036, "memory(GiB)": 91.64, "step": 7185, "token_acc": 0.9109893871961657, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.678557946394866, "grad_norm": 0.3931172788143158, "learning_rate": 5.137564325385447e-05, "loss": 0.2416548252105713, "memory(GiB)": 91.64, "step": 7190, "token_acc": 0.9179136383069688, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6790298225745565, "grad_norm": 0.847093939781189, "learning_rate": 5.123934264215918e-05, "loss": 0.2375951290130615, "memory(GiB)": 91.64, "step": 7195, "token_acc": 0.9275525525525525, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.6795016987542469, "grad_norm": 0.19797252118587494, "learning_rate": 5.110316077000487e-05, "loss": 0.24020733833312988, "memory(GiB)": 91.64, "step": 7200, "token_acc": 0.8846153846153846, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6799735749339373, "grad_norm": 0.38781481981277466, "learning_rate": 5.096709796901491e-05, "loss": 0.23613905906677246, "memory(GiB)": 91.64, "step": 7205, "token_acc": 0.9090549624357454, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6804454511136278, "grad_norm": 0.34402525424957275, "learning_rate": 5.083115457052263e-05, "loss": 0.24098021984100343, "memory(GiB)": 91.64, "step": 7210, "token_acc": 0.9180212014134276, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6809173272933182, "grad_norm": 0.20066817104816437, "learning_rate": 5.0695330905570735e-05, "loss": 0.23880929946899415, "memory(GiB)": 91.64, "step": 7215, "token_acc": 0.9213587715216379, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6813892034730087, "grad_norm": 0.31116795539855957, "learning_rate": 5.055962730491028e-05, "loss": 0.24228744506835936, "memory(GiB)": 91.64, "step": 7220, "token_acc": 0.9155663655316192, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6818610796526992, "grad_norm": 0.282959908246994, "learning_rate": 5.042404409899995e-05, "loss": 0.24117763042449952, "memory(GiB)": 91.64, "step": 7225, "token_acc": 0.9193548387096774, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6823329558323896, "grad_norm": 0.49499738216400146, "learning_rate": 5.0288581618005274e-05, "loss": 0.24113068580627442, "memory(GiB)": 91.64, "step": 7230, "token_acc": 0.9124833407374501, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.68280483201208, "grad_norm": 0.37821346521377563, "learning_rate": 5.015324019179781e-05, "loss": 0.24250617027282714, "memory(GiB)": 91.64, "step": 7235, "token_acc": 0.9230293663060278, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6832767081917704, "grad_norm": 0.43186768889427185, "learning_rate": 5.001802014995425e-05, "loss": 0.23106989860534669, "memory(GiB)": 91.64, "step": 7240, "token_acc": 0.9164609053497942, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6837485843714609, "grad_norm": 0.42559754848480225, "learning_rate": 4.988292182175577e-05, "loss": 0.23471336364746093, "memory(GiB)": 91.64, "step": 7245, "token_acc": 0.928995756718529, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6842204605511514, "grad_norm": 0.4017716646194458, "learning_rate": 4.9747945536187145e-05, "loss": 0.24154629707336425, "memory(GiB)": 91.64, "step": 7250, "token_acc": 0.9225834046193327, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6846923367308418, "grad_norm": 0.45907652378082275, "learning_rate": 4.961309162193595e-05, "loss": 0.22962424755096436, "memory(GiB)": 91.64, "step": 7255, "token_acc": 0.929345470307759, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6851642129105323, "grad_norm": 0.23512178659439087, "learning_rate": 4.94783604073918e-05, "loss": 0.23903675079345704, "memory(GiB)": 91.64, "step": 7260, "token_acc": 0.9203929539295393, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6856360890902228, "grad_norm": 0.5371173024177551, "learning_rate": 4.9343752220645424e-05, "loss": 0.23918862342834474, "memory(GiB)": 91.64, "step": 7265, "token_acc": 0.9101217975640488, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6861079652699131, "grad_norm": 0.5968282222747803, "learning_rate": 4.9209267389488036e-05, "loss": 0.24033589363098146, "memory(GiB)": 91.64, "step": 7270, "token_acc": 0.9201339072214252, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6865798414496036, "grad_norm": 0.5909680724143982, "learning_rate": 4.907490624141046e-05, "loss": 0.23803811073303222, "memory(GiB)": 91.64, "step": 7275, "token_acc": 0.9005710446758481, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6870517176292941, "grad_norm": 0.6105820536613464, "learning_rate": 4.894066910360231e-05, "loss": 0.23955843448638917, "memory(GiB)": 91.64, "step": 7280, "token_acc": 0.9219009637753407, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6875235938089845, "grad_norm": 0.34672901034355164, "learning_rate": 4.880655630295122e-05, "loss": 0.23337287902832032, "memory(GiB)": 91.64, "step": 7285, "token_acc": 0.9275103980986333, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.687995469988675, "grad_norm": 0.2837556302547455, "learning_rate": 4.867256816604211e-05, "loss": 0.24297127723693848, "memory(GiB)": 91.64, "step": 7290, "token_acc": 0.9024296182028538, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6884673461683655, "grad_norm": 0.31120648980140686, "learning_rate": 4.853870501915616e-05, "loss": 0.23241846561431884, "memory(GiB)": 91.64, "step": 7295, "token_acc": 0.9221556886227545, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6889392223480558, "grad_norm": 0.5619298219680786, "learning_rate": 4.8404967188270336e-05, "loss": 0.23225302696228028, "memory(GiB)": 91.64, "step": 7300, "token_acc": 0.9281452013923421, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6894110985277463, "grad_norm": 0.4053754508495331, "learning_rate": 4.827135499905638e-05, "loss": 0.23050668239593505, "memory(GiB)": 91.64, "step": 7305, "token_acc": 0.915, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6898829747074368, "grad_norm": 0.48405569791793823, "learning_rate": 4.8137868776880104e-05, "loss": 0.23096683025360107, "memory(GiB)": 91.64, "step": 7310, "token_acc": 0.919170243204578, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6903548508871272, "grad_norm": 0.28916484117507935, "learning_rate": 4.800450884680054e-05, "loss": 0.23934123516082764, "memory(GiB)": 91.64, "step": 7315, "token_acc": 0.8993939393939394, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6908267270668177, "grad_norm": 0.3169102668762207, "learning_rate": 4.7871275533569223e-05, "loss": 0.23298006057739257, "memory(GiB)": 91.64, "step": 7320, "token_acc": 0.9264836138175376, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6912986032465082, "grad_norm": 0.405086487531662, "learning_rate": 4.7738169161629273e-05, "loss": 0.23260374069213868, "memory(GiB)": 91.64, "step": 7325, "token_acc": 0.8946572580645161, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6917704794261985, "grad_norm": 0.3432481288909912, "learning_rate": 4.760519005511477e-05, "loss": 0.2355022668838501, "memory(GiB)": 91.64, "step": 7330, "token_acc": 0.9095427435387674, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.692242355605889, "grad_norm": 0.3307795524597168, "learning_rate": 4.747233853784986e-05, "loss": 0.24247050285339355, "memory(GiB)": 91.64, "step": 7335, "token_acc": 0.9070431472081218, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.6927142317855794, "grad_norm": 0.26367953419685364, "learning_rate": 4.733961493334798e-05, "loss": 0.23971378803253174, "memory(GiB)": 91.64, "step": 7340, "token_acc": 0.9123732251521298, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6931861079652699, "grad_norm": 0.220316544175148, "learning_rate": 4.720701956481112e-05, "loss": 0.22885804176330565, "memory(GiB)": 91.64, "step": 7345, "token_acc": 0.9171151776103337, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.6936579841449604, "grad_norm": 0.4796167016029358, "learning_rate": 4.70745527551289e-05, "loss": 0.23552498817443848, "memory(GiB)": 91.64, "step": 7350, "token_acc": 0.9133454106280193, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6941298603246508, "grad_norm": 0.4051739573478699, "learning_rate": 4.694221482687797e-05, "loss": 0.230513858795166, "memory(GiB)": 91.64, "step": 7355, "token_acc": 0.9137493658041603, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6946017365043413, "grad_norm": 0.3922402262687683, "learning_rate": 4.681000610232112e-05, "loss": 0.23725688457489014, "memory(GiB)": 91.64, "step": 7360, "token_acc": 0.9128390596745027, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6950736126840317, "grad_norm": 0.2533682584762573, "learning_rate": 4.66779269034065e-05, "loss": 0.237007737159729, "memory(GiB)": 91.64, "step": 7365, "token_acc": 0.919661733615222, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6955454888637221, "grad_norm": 0.2674064040184021, "learning_rate": 4.654597755176682e-05, "loss": 0.23746719360351562, "memory(GiB)": 91.64, "step": 7370, "token_acc": 0.9228115567054765, "train_speed(iter/s)": 0.138603 }, { "epoch": 0.6960173650434126, "grad_norm": 0.5427283644676208, "learning_rate": 4.6414158368718665e-05, "loss": 0.24357161521911622, "memory(GiB)": 91.64, "step": 7375, "token_acc": 0.9070453707119144, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6964892412231031, "grad_norm": 0.4000316262245178, "learning_rate": 4.628246967526151e-05, "loss": 0.2293656587600708, "memory(GiB)": 91.64, "step": 7380, "token_acc": 0.9324394017534812, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6969611174027935, "grad_norm": 0.3474954068660736, "learning_rate": 4.61509117920772e-05, "loss": 0.23463306427001954, "memory(GiB)": 91.64, "step": 7385, "token_acc": 0.9113486325055015, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.697432993582484, "grad_norm": 0.41696134209632874, "learning_rate": 4.601948503952896e-05, "loss": 0.23325471878051757, "memory(GiB)": 91.64, "step": 7390, "token_acc": 0.935026138909634, "train_speed(iter/s)": 0.138601 }, { "epoch": 0.6979048697621744, "grad_norm": 0.6173218488693237, "learning_rate": 4.5888189737660735e-05, "loss": 0.2361754894256592, "memory(GiB)": 91.64, "step": 7395, "token_acc": 0.9114997350291468, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6983767459418648, "grad_norm": 0.4587777256965637, "learning_rate": 4.5757026206196354e-05, "loss": 0.23703014850616455, "memory(GiB)": 91.64, "step": 7400, "token_acc": 0.9132492113564669, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.6988486221215553, "grad_norm": 0.7898734211921692, "learning_rate": 4.562599476453878e-05, "loss": 0.23952670097351075, "memory(GiB)": 91.64, "step": 7405, "token_acc": 0.9085151301900071, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.6993204983012458, "grad_norm": 0.36151617765426636, "learning_rate": 4.549509573176923e-05, "loss": 0.2393695592880249, "memory(GiB)": 91.64, "step": 7410, "token_acc": 0.9187757504414361, "train_speed(iter/s)": 0.138605 }, { "epoch": 0.6997923744809362, "grad_norm": 0.40499287843704224, "learning_rate": 4.53643294266466e-05, "loss": 0.23288025856018066, "memory(GiB)": 91.64, "step": 7415, "token_acc": 0.9126254180602007, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.7002642506606267, "grad_norm": 0.39258766174316406, "learning_rate": 4.523369616760653e-05, "loss": 0.23379309177398683, "memory(GiB)": 91.64, "step": 7420, "token_acc": 0.9255467659376454, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.700736126840317, "grad_norm": 0.5925548076629639, "learning_rate": 4.510319627276066e-05, "loss": 0.24018988609313965, "memory(GiB)": 91.64, "step": 7425, "token_acc": 0.9100609756097561, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.7012080030200075, "grad_norm": 0.2632533311843872, "learning_rate": 4.497283005989592e-05, "loss": 0.2343803882598877, "memory(GiB)": 91.64, "step": 7430, "token_acc": 0.911731843575419, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.701679879199698, "grad_norm": 0.5552768111228943, "learning_rate": 4.484259784647359e-05, "loss": 0.23283910751342773, "memory(GiB)": 91.64, "step": 7435, "token_acc": 0.9176574196389256, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7021517553793885, "grad_norm": 0.3536672294139862, "learning_rate": 4.471249994962875e-05, "loss": 0.24048154354095458, "memory(GiB)": 91.64, "step": 7440, "token_acc": 0.9188626907073509, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7026236315590789, "grad_norm": 0.7183988094329834, "learning_rate": 4.458253668616936e-05, "loss": 0.23343441486358643, "memory(GiB)": 91.64, "step": 7445, "token_acc": 0.9140362659503022, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.7030955077387694, "grad_norm": 0.4441908001899719, "learning_rate": 4.445270837257554e-05, "loss": 0.22676398754119872, "memory(GiB)": 91.64, "step": 7450, "token_acc": 0.9258134490238612, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.7035673839184597, "grad_norm": 0.29138508439064026, "learning_rate": 4.432301532499877e-05, "loss": 0.2306809425354004, "memory(GiB)": 91.64, "step": 7455, "token_acc": 0.9092585761711546, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.7040392600981502, "grad_norm": 0.20300675928592682, "learning_rate": 4.419345785926119e-05, "loss": 0.22768373489379884, "memory(GiB)": 91.64, "step": 7460, "token_acc": 0.911993097497843, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.7045111362778407, "grad_norm": 0.4421274662017822, "learning_rate": 4.406403629085465e-05, "loss": 0.23306775093078613, "memory(GiB)": 91.64, "step": 7465, "token_acc": 0.9188296366210477, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.7049830124575311, "grad_norm": 0.21357952058315277, "learning_rate": 4.3934750934940196e-05, "loss": 0.23424277305603028, "memory(GiB)": 91.64, "step": 7470, "token_acc": 0.9328947368421052, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.7054548886372216, "grad_norm": 0.436585009098053, "learning_rate": 4.380560210634715e-05, "loss": 0.2297410249710083, "memory(GiB)": 91.64, "step": 7475, "token_acc": 0.9225950782997763, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.7059267648169121, "grad_norm": 0.28046926856040955, "learning_rate": 4.367659011957227e-05, "loss": 0.23267920017242433, "memory(GiB)": 91.64, "step": 7480, "token_acc": 0.9165763813651138, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7063986409966025, "grad_norm": 0.5179422497749329, "learning_rate": 4.354771528877926e-05, "loss": 0.23168692588806153, "memory(GiB)": 91.64, "step": 7485, "token_acc": 0.923728813559322, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7068705171762929, "grad_norm": 0.3157169818878174, "learning_rate": 4.3418977927797724e-05, "loss": 0.2363651752471924, "memory(GiB)": 91.64, "step": 7490, "token_acc": 0.9275045537340619, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7073423933559834, "grad_norm": 0.4149136543273926, "learning_rate": 4.329037835012245e-05, "loss": 0.2356886863708496, "memory(GiB)": 91.64, "step": 7495, "token_acc": 0.9102065249925172, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7078142695356738, "grad_norm": 0.3638668656349182, "learning_rate": 4.316191686891282e-05, "loss": 0.22782864570617675, "memory(GiB)": 91.64, "step": 7500, "token_acc": 0.9282261063592413, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.7082861457153643, "grad_norm": 0.3706573247909546, "learning_rate": 4.30335937969919e-05, "loss": 0.2303562879562378, "memory(GiB)": 91.64, "step": 7505, "token_acc": 0.9246684350132626, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.7087580218950548, "grad_norm": 0.4828733205795288, "learning_rate": 4.290540944684558e-05, "loss": 0.23863065242767334, "memory(GiB)": 91.64, "step": 7510, "token_acc": 0.9191153238546603, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.7092298980747452, "grad_norm": 0.6060589551925659, "learning_rate": 4.277736413062219e-05, "loss": 0.22998156547546386, "memory(GiB)": 91.64, "step": 7515, "token_acc": 0.9242005527043032, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7097017742544356, "grad_norm": 0.4992319941520691, "learning_rate": 4.264945816013125e-05, "loss": 0.24006481170654298, "memory(GiB)": 91.64, "step": 7520, "token_acc": 0.9305866547245858, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.7101736504341261, "grad_norm": 0.40544596314430237, "learning_rate": 4.2521691846843095e-05, "loss": 0.2283543348312378, "memory(GiB)": 91.64, "step": 7525, "token_acc": 0.9242761692650334, "train_speed(iter/s)": 0.138611 }, { "epoch": 0.7106455266138165, "grad_norm": 0.5109364986419678, "learning_rate": 4.239406550188791e-05, "loss": 0.2309938907623291, "memory(GiB)": 91.64, "step": 7530, "token_acc": 0.9235931853381518, "train_speed(iter/s)": 0.138611 }, { "epoch": 0.711117402793507, "grad_norm": 0.422530859708786, "learning_rate": 4.2266579436055084e-05, "loss": 0.23394412994384767, "memory(GiB)": 91.64, "step": 7535, "token_acc": 0.9018055115616092, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.7115892789731975, "grad_norm": 0.2911304831504822, "learning_rate": 4.213923395979236e-05, "loss": 0.2337871789932251, "memory(GiB)": 91.64, "step": 7540, "token_acc": 0.910948905109489, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.7120611551528879, "grad_norm": 0.5636393427848816, "learning_rate": 4.201202938320519e-05, "loss": 0.23299179077148438, "memory(GiB)": 91.64, "step": 7545, "token_acc": 0.9072039072039072, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.7125330313325783, "grad_norm": 0.458686500787735, "learning_rate": 4.188496601605577e-05, "loss": 0.23804445266723634, "memory(GiB)": 91.64, "step": 7550, "token_acc": 0.9233138281490607, "train_speed(iter/s)": 0.13861 }, { "epoch": 0.7130049075122687, "grad_norm": 0.26119813323020935, "learning_rate": 4.17580441677626e-05, "loss": 0.23722286224365235, "memory(GiB)": 91.64, "step": 7555, "token_acc": 0.9108138238573021, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.7134767836919592, "grad_norm": 0.32606083154678345, "learning_rate": 4.16312641473995e-05, "loss": 0.2353299617767334, "memory(GiB)": 91.64, "step": 7560, "token_acc": 0.9210182767624021, "train_speed(iter/s)": 0.138609 }, { "epoch": 0.7139486598716497, "grad_norm": 0.37780630588531494, "learning_rate": 4.15046262636948e-05, "loss": 0.2375786304473877, "memory(GiB)": 91.64, "step": 7565, "token_acc": 0.9266109785202864, "train_speed(iter/s)": 0.138611 }, { "epoch": 0.7144205360513401, "grad_norm": 0.5097290873527527, "learning_rate": 4.1378130825030926e-05, "loss": 0.23275210857391357, "memory(GiB)": 91.64, "step": 7570, "token_acc": 0.9178931061192874, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.7148924122310306, "grad_norm": 0.2814135253429413, "learning_rate": 4.12517781394433e-05, "loss": 0.236267614364624, "memory(GiB)": 91.64, "step": 7575, "token_acc": 0.9209017959495606, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.715364288410721, "grad_norm": 0.6145960092544556, "learning_rate": 4.1125568514619675e-05, "loss": 0.2347486972808838, "memory(GiB)": 91.64, "step": 7580, "token_acc": 0.925731760594146, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7158361645904114, "grad_norm": 0.5332577228546143, "learning_rate": 4.0999502257899515e-05, "loss": 0.24189887046813965, "memory(GiB)": 91.64, "step": 7585, "token_acc": 0.9104374784705477, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7163080407701019, "grad_norm": 0.7018850445747375, "learning_rate": 4.087357967627317e-05, "loss": 0.23237390518188478, "memory(GiB)": 91.64, "step": 7590, "token_acc": 0.9236180904522613, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7167799169497924, "grad_norm": 0.37541598081588745, "learning_rate": 4.0747801076380965e-05, "loss": 0.2391916275024414, "memory(GiB)": 91.64, "step": 7595, "token_acc": 0.9177265500794912, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7172517931294828, "grad_norm": 0.26603105664253235, "learning_rate": 4.062216676451285e-05, "loss": 0.22683272361755372, "memory(GiB)": 91.64, "step": 7600, "token_acc": 0.9210950080515298, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7177236693091733, "grad_norm": 0.5723817348480225, "learning_rate": 4.049667704660728e-05, "loss": 0.23589439392089845, "memory(GiB)": 91.64, "step": 7605, "token_acc": 0.9155238617663193, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.7181955454888638, "grad_norm": 0.26500552892684937, "learning_rate": 4.037133222825052e-05, "loss": 0.22756319046020507, "memory(GiB)": 91.64, "step": 7610, "token_acc": 0.9262192580241767, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7186674216685541, "grad_norm": 0.7966447472572327, "learning_rate": 4.0246132614676145e-05, "loss": 0.2435863971710205, "memory(GiB)": 91.64, "step": 7615, "token_acc": 0.9128586609989373, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7191392978482446, "grad_norm": 1.105246663093567, "learning_rate": 4.012107851076406e-05, "loss": 0.22101092338562012, "memory(GiB)": 91.64, "step": 7620, "token_acc": 0.9139150943396226, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7196111740279351, "grad_norm": 0.37064602971076965, "learning_rate": 3.999617022103975e-05, "loss": 0.2291620969772339, "memory(GiB)": 91.64, "step": 7625, "token_acc": 0.9221767115272089, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7200830502076255, "grad_norm": 0.43661198019981384, "learning_rate": 3.987140804967384e-05, "loss": 0.2348928689956665, "memory(GiB)": 91.64, "step": 7630, "token_acc": 0.9234449760765551, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.720554926387316, "grad_norm": 0.7201928496360779, "learning_rate": 3.9746792300480894e-05, "loss": 0.2301774024963379, "memory(GiB)": 91.64, "step": 7635, "token_acc": 0.915521978021978, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7210268025670065, "grad_norm": 0.3067832291126251, "learning_rate": 3.962232327691905e-05, "loss": 0.23211939334869386, "memory(GiB)": 91.64, "step": 7640, "token_acc": 0.9235436893203883, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7214986787466968, "grad_norm": 0.3120873272418976, "learning_rate": 3.949800128208915e-05, "loss": 0.22577478885650634, "memory(GiB)": 91.64, "step": 7645, "token_acc": 0.9192324854975458, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7219705549263873, "grad_norm": 0.22580601274967194, "learning_rate": 3.93738266187339e-05, "loss": 0.23252127170562745, "memory(GiB)": 91.64, "step": 7650, "token_acc": 0.9091406677613574, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7224424311060778, "grad_norm": 0.39276060461997986, "learning_rate": 3.92497995892373e-05, "loss": 0.2360863208770752, "memory(GiB)": 91.64, "step": 7655, "token_acc": 0.9083969465648855, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7229143072857682, "grad_norm": 0.36733052134513855, "learning_rate": 3.912592049562395e-05, "loss": 0.23193964958190919, "memory(GiB)": 91.64, "step": 7660, "token_acc": 0.914519906323185, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7233861834654587, "grad_norm": 0.36261504888534546, "learning_rate": 3.9002189639557974e-05, "loss": 0.23163235187530518, "memory(GiB)": 91.64, "step": 7665, "token_acc": 0.9202302631578947, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7238580596451492, "grad_norm": 0.4824898838996887, "learning_rate": 3.8878607322342674e-05, "loss": 0.23579974174499513, "memory(GiB)": 91.64, "step": 7670, "token_acc": 0.9020848845867461, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7243299358248395, "grad_norm": 0.4362608790397644, "learning_rate": 3.8755173844919624e-05, "loss": 0.23200278282165526, "memory(GiB)": 91.64, "step": 7675, "token_acc": 0.9163013152650459, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.72480181200453, "grad_norm": 0.3005385100841522, "learning_rate": 3.863188950786786e-05, "loss": 0.22539281845092773, "memory(GiB)": 91.64, "step": 7680, "token_acc": 0.91974479516454, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7252736881842204, "grad_norm": 0.28257620334625244, "learning_rate": 3.8508754611403296e-05, "loss": 0.23311495780944824, "memory(GiB)": 91.64, "step": 7685, "token_acc": 0.9152061855670103, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7257455643639109, "grad_norm": 0.36101335287094116, "learning_rate": 3.838576945537806e-05, "loss": 0.23008966445922852, "memory(GiB)": 91.64, "step": 7690, "token_acc": 0.9269406392694064, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7262174405436014, "grad_norm": 0.25905391573905945, "learning_rate": 3.82629343392794e-05, "loss": 0.23752379417419434, "memory(GiB)": 91.64, "step": 7695, "token_acc": 0.930064308681672, "train_speed(iter/s)": 0.138618 }, { "epoch": 0.7266893167232918, "grad_norm": 0.3700575828552246, "learning_rate": 3.814024956222936e-05, "loss": 0.2280106782913208, "memory(GiB)": 91.64, "step": 7700, "token_acc": 0.928319209039548, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7271611929029823, "grad_norm": 0.21974368393421173, "learning_rate": 3.801771542298387e-05, "loss": 0.22454090118408204, "memory(GiB)": 91.64, "step": 7705, "token_acc": 0.9206798866855525, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7276330690826727, "grad_norm": 0.23629778623580933, "learning_rate": 3.78953322199319e-05, "loss": 0.23039534091949462, "memory(GiB)": 91.64, "step": 7710, "token_acc": 0.9159847244953628, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7281049452623631, "grad_norm": 0.2492135465145111, "learning_rate": 3.777310025109512e-05, "loss": 0.22857446670532228, "memory(GiB)": 91.64, "step": 7715, "token_acc": 0.9081383164512749, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7285768214420536, "grad_norm": 0.2390630841255188, "learning_rate": 3.7651019814126654e-05, "loss": 0.22294931411743163, "memory(GiB)": 91.64, "step": 7720, "token_acc": 0.9189723320158103, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7290486976217441, "grad_norm": 0.2751132547855377, "learning_rate": 3.752909120631079e-05, "loss": 0.23300666809082032, "memory(GiB)": 91.64, "step": 7725, "token_acc": 0.9150349650349651, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7295205738014345, "grad_norm": 0.35565805435180664, "learning_rate": 3.740731472456208e-05, "loss": 0.2383554220199585, "memory(GiB)": 91.64, "step": 7730, "token_acc": 0.9274079320113314, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.729992449981125, "grad_norm": 0.4752665162086487, "learning_rate": 3.7285690665424523e-05, "loss": 0.2266439437866211, "memory(GiB)": 91.64, "step": 7735, "token_acc": 0.9231578947368421, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7304643261608154, "grad_norm": 0.49981334805488586, "learning_rate": 3.7164219325070995e-05, "loss": 0.234313440322876, "memory(GiB)": 91.64, "step": 7740, "token_acc": 0.9175717070453913, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7309362023405058, "grad_norm": 0.35681086778640747, "learning_rate": 3.704290099930261e-05, "loss": 0.23388543128967285, "memory(GiB)": 91.64, "step": 7745, "token_acc": 0.9287226534932957, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7314080785201963, "grad_norm": 0.23404952883720398, "learning_rate": 3.692173598354765e-05, "loss": 0.23158960342407225, "memory(GiB)": 91.64, "step": 7750, "token_acc": 0.9207547169811321, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7318799546998868, "grad_norm": 0.26125267148017883, "learning_rate": 3.680072457286121e-05, "loss": 0.22669458389282227, "memory(GiB)": 91.64, "step": 7755, "token_acc": 0.919360568383659, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7323518308795772, "grad_norm": 0.2665402293205261, "learning_rate": 3.667986706192431e-05, "loss": 0.23177189826965333, "memory(GiB)": 91.64, "step": 7760, "token_acc": 0.9216549295774648, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7328237070592677, "grad_norm": 0.42883795499801636, "learning_rate": 3.6559163745043126e-05, "loss": 0.22768354415893555, "memory(GiB)": 91.64, "step": 7765, "token_acc": 0.9188311688311688, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.733295583238958, "grad_norm": 0.25692644715309143, "learning_rate": 3.643861491614841e-05, "loss": 0.23015880584716797, "memory(GiB)": 91.64, "step": 7770, "token_acc": 0.9187208527648234, "train_speed(iter/s)": 0.138618 }, { "epoch": 0.7337674594186485, "grad_norm": 0.21508167684078217, "learning_rate": 3.6318220868794784e-05, "loss": 0.2296093225479126, "memory(GiB)": 91.64, "step": 7775, "token_acc": 0.9230500174886324, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.734239335598339, "grad_norm": 0.29474809765815735, "learning_rate": 3.6197981896159804e-05, "loss": 0.23098914623260497, "memory(GiB)": 91.64, "step": 7780, "token_acc": 0.9146655231560892, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7347112117780294, "grad_norm": 0.3672167658805847, "learning_rate": 3.6077898291043485e-05, "loss": 0.23493998050689696, "memory(GiB)": 91.64, "step": 7785, "token_acc": 0.9073152049023363, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.7351830879577199, "grad_norm": 0.5175244808197021, "learning_rate": 3.595797034586753e-05, "loss": 0.22836103439331054, "memory(GiB)": 91.64, "step": 7790, "token_acc": 0.922854387656702, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.7356549641374104, "grad_norm": 0.2874203026294708, "learning_rate": 3.583819835267446e-05, "loss": 0.22799487113952638, "memory(GiB)": 91.64, "step": 7795, "token_acc": 0.9126323751891074, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.7361268403171007, "grad_norm": 0.40861421823501587, "learning_rate": 3.571858260312715e-05, "loss": 0.22677245140075683, "memory(GiB)": 91.64, "step": 7800, "token_acc": 0.9164278892072588, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7365987164967912, "grad_norm": 0.3449516296386719, "learning_rate": 3.559912338850795e-05, "loss": 0.2347282886505127, "memory(GiB)": 91.64, "step": 7805, "token_acc": 0.9178125, "train_speed(iter/s)": 0.138613 }, { "epoch": 0.7370705926764817, "grad_norm": 0.31171733140945435, "learning_rate": 3.5479820999718036e-05, "loss": 0.23100967407226564, "memory(GiB)": 91.64, "step": 7810, "token_acc": 0.9182941410920685, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7375424688561721, "grad_norm": 0.5504468679428101, "learning_rate": 3.536067572727671e-05, "loss": 0.23290205001831055, "memory(GiB)": 91.64, "step": 7815, "token_acc": 0.9189412737799835, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.7380143450358626, "grad_norm": 0.8677312731742859, "learning_rate": 3.5241687861320593e-05, "loss": 0.22933337688446045, "memory(GiB)": 91.64, "step": 7820, "token_acc": 0.9203821656050956, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7384862212155531, "grad_norm": 0.2175053507089615, "learning_rate": 3.512285769160307e-05, "loss": 0.2278818130493164, "memory(GiB)": 91.64, "step": 7825, "token_acc": 0.9262792714657415, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7389580973952435, "grad_norm": 0.37307730317115784, "learning_rate": 3.50041855074935e-05, "loss": 0.22902565002441405, "memory(GiB)": 91.64, "step": 7830, "token_acc": 0.9135725429017161, "train_speed(iter/s)": 0.138616 }, { "epoch": 0.7394299735749339, "grad_norm": 0.37443387508392334, "learning_rate": 3.488567159797652e-05, "loss": 0.22925407886505128, "memory(GiB)": 91.64, "step": 7835, "token_acc": 0.9244406922752216, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.7399018497546244, "grad_norm": 0.3315834701061249, "learning_rate": 3.4767316251651326e-05, "loss": 0.22882957458496095, "memory(GiB)": 91.64, "step": 7840, "token_acc": 0.914161008729389, "train_speed(iter/s)": 0.138619 }, { "epoch": 0.7403737259343148, "grad_norm": 0.2746642529964447, "learning_rate": 3.4649119756731055e-05, "loss": 0.22931833267211915, "memory(GiB)": 91.64, "step": 7845, "token_acc": 0.9125051588939331, "train_speed(iter/s)": 0.138618 }, { "epoch": 0.7408456021140053, "grad_norm": 0.6551487445831299, "learning_rate": 3.453108240104188e-05, "loss": 0.23314218521118163, "memory(GiB)": 91.64, "step": 7850, "token_acc": 0.9189952904238619, "train_speed(iter/s)": 0.138618 }, { "epoch": 0.7413174782936958, "grad_norm": 0.4284408986568451, "learning_rate": 3.4413204472022576e-05, "loss": 0.22884924411773683, "memory(GiB)": 91.64, "step": 7855, "token_acc": 0.9177083333333333, "train_speed(iter/s)": 0.138619 }, { "epoch": 0.7417893544733862, "grad_norm": 0.2609077990055084, "learning_rate": 3.429548625672365e-05, "loss": 0.22607100009918213, "memory(GiB)": 91.64, "step": 7860, "token_acc": 0.9041233657391887, "train_speed(iter/s)": 0.138619 }, { "epoch": 0.7422612306530766, "grad_norm": 0.7349159717559814, "learning_rate": 3.417792804180666e-05, "loss": 0.23114871978759766, "memory(GiB)": 91.64, "step": 7865, "token_acc": 0.8985565356856455, "train_speed(iter/s)": 0.13862 }, { "epoch": 0.742733106832767, "grad_norm": 0.2900051772594452, "learning_rate": 3.406053011354357e-05, "loss": 0.22995190620422362, "memory(GiB)": 91.64, "step": 7870, "token_acc": 0.9049773755656109, "train_speed(iter/s)": 0.138622 }, { "epoch": 0.7432049830124575, "grad_norm": 0.44393131136894226, "learning_rate": 3.394329275781604e-05, "loss": 0.2241837501525879, "memory(GiB)": 91.64, "step": 7875, "token_acc": 0.9156214367160775, "train_speed(iter/s)": 0.138623 }, { "epoch": 0.743676859192148, "grad_norm": 0.7309471368789673, "learning_rate": 3.3826216260114604e-05, "loss": 0.22879295349121093, "memory(GiB)": 91.64, "step": 7880, "token_acc": 0.9144460028050491, "train_speed(iter/s)": 0.138624 }, { "epoch": 0.7441487353718385, "grad_norm": 0.24232062697410583, "learning_rate": 3.370930090553821e-05, "loss": 0.2347412347793579, "memory(GiB)": 91.64, "step": 7885, "token_acc": 0.9253786479497599, "train_speed(iter/s)": 0.138625 }, { "epoch": 0.7446206115515289, "grad_norm": 0.8616470694541931, "learning_rate": 3.3592546978793327e-05, "loss": 0.23368425369262696, "memory(GiB)": 91.64, "step": 7890, "token_acc": 0.9122157588577472, "train_speed(iter/s)": 0.138626 }, { "epoch": 0.7450924877312193, "grad_norm": 0.5295524597167969, "learning_rate": 3.347595476419335e-05, "loss": 0.23362338542938232, "memory(GiB)": 91.64, "step": 7895, "token_acc": 0.910377358490566, "train_speed(iter/s)": 0.138626 }, { "epoch": 0.7455643639109097, "grad_norm": 0.2784343659877777, "learning_rate": 3.335952454565787e-05, "loss": 0.23343799114227295, "memory(GiB)": 91.64, "step": 7900, "token_acc": 0.8851380973257343, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.7460362400906002, "grad_norm": 0.29195478558540344, "learning_rate": 3.324325660671205e-05, "loss": 0.22837295532226562, "memory(GiB)": 91.64, "step": 7905, "token_acc": 0.9181434599156119, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.7465081162702907, "grad_norm": 0.3233549892902374, "learning_rate": 3.312715123048572e-05, "loss": 0.2293907642364502, "memory(GiB)": 91.64, "step": 7910, "token_acc": 0.9086402266288952, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.7469799924499811, "grad_norm": 0.25969669222831726, "learning_rate": 3.3011208699713015e-05, "loss": 0.22522926330566406, "memory(GiB)": 91.64, "step": 7915, "token_acc": 0.922077922077922, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.7474518686296716, "grad_norm": 0.2135898917913437, "learning_rate": 3.2895429296731426e-05, "loss": 0.22411694526672363, "memory(GiB)": 91.64, "step": 7920, "token_acc": 0.9312614259597807, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7479237448093621, "grad_norm": 0.218247190117836, "learning_rate": 3.2779813303481256e-05, "loss": 0.22187356948852538, "memory(GiB)": 91.64, "step": 7925, "token_acc": 0.9370489174017642, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7483956209890524, "grad_norm": 0.21325546503067017, "learning_rate": 3.2664361001504864e-05, "loss": 0.22855515480041505, "memory(GiB)": 91.64, "step": 7930, "token_acc": 0.9257692307692308, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7488674971687429, "grad_norm": 0.24248625338077545, "learning_rate": 3.2549072671945924e-05, "loss": 0.22276439666748046, "memory(GiB)": 91.64, "step": 7935, "token_acc": 0.9157955865272939, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7493393733484334, "grad_norm": 0.2705487906932831, "learning_rate": 3.243394859554891e-05, "loss": 0.2265183687210083, "memory(GiB)": 91.64, "step": 7940, "token_acc": 0.9164098613251156, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7498112495281238, "grad_norm": 0.2557106614112854, "learning_rate": 3.231898905265829e-05, "loss": 0.22840962409973145, "memory(GiB)": 91.64, "step": 7945, "token_acc": 0.9238232123607618, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7502831257078143, "grad_norm": 0.36380380392074585, "learning_rate": 3.220419432321783e-05, "loss": 0.23431000709533692, "memory(GiB)": 91.64, "step": 7950, "token_acc": 0.9212121212121213, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7507550018875048, "grad_norm": 0.2633918821811676, "learning_rate": 3.2089564686770004e-05, "loss": 0.22273101806640624, "memory(GiB)": 91.64, "step": 7955, "token_acc": 0.9136276391554703, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7512268780671951, "grad_norm": 0.3842732906341553, "learning_rate": 3.197510042245524e-05, "loss": 0.2252732992172241, "memory(GiB)": 91.64, "step": 7960, "token_acc": 0.9360613810741688, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7516987542468856, "grad_norm": 0.27877315878868103, "learning_rate": 3.186080180901121e-05, "loss": 0.2262340545654297, "memory(GiB)": 91.64, "step": 7965, "token_acc": 0.920619554695063, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7521706304265761, "grad_norm": 0.23689651489257812, "learning_rate": 3.1746669124772264e-05, "loss": 0.22646026611328124, "memory(GiB)": 91.64, "step": 7970, "token_acc": 0.8999530295913575, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7526425066062665, "grad_norm": 0.5697574615478516, "learning_rate": 3.1632702647668664e-05, "loss": 0.22735657691955566, "memory(GiB)": 91.64, "step": 7975, "token_acc": 0.9176543980037429, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.753114382785957, "grad_norm": 0.7894372940063477, "learning_rate": 3.1518902655225954e-05, "loss": 0.22902073860168456, "memory(GiB)": 91.64, "step": 7980, "token_acc": 0.9288161400837457, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7535862589656475, "grad_norm": 0.26597410440444946, "learning_rate": 3.1405269424564244e-05, "loss": 0.2152813196182251, "memory(GiB)": 91.64, "step": 7985, "token_acc": 0.9042145593869731, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7540581351453378, "grad_norm": 0.5665609240531921, "learning_rate": 3.1291803232397576e-05, "loss": 0.22755122184753418, "memory(GiB)": 91.64, "step": 7990, "token_acc": 0.9204496325118893, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7545300113250283, "grad_norm": 0.2860982120037079, "learning_rate": 3.117850435503315e-05, "loss": 0.2274768829345703, "memory(GiB)": 91.64, "step": 7995, "token_acc": 0.916327716443928, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7550018875047187, "grad_norm": 0.3887801170349121, "learning_rate": 3.106537306837084e-05, "loss": 0.2311309814453125, "memory(GiB)": 91.64, "step": 8000, "token_acc": 0.9066347469220246, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7554737636844092, "grad_norm": 0.3887653648853302, "learning_rate": 3.095240964790233e-05, "loss": 0.2242722988128662, "memory(GiB)": 91.64, "step": 8005, "token_acc": 0.9280346820809249, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7559456398640997, "grad_norm": 0.3202626705169678, "learning_rate": 3.083961436871057e-05, "loss": 0.22988688945770264, "memory(GiB)": 91.64, "step": 8010, "token_acc": 0.924455205811138, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7564175160437902, "grad_norm": 0.316771537065506, "learning_rate": 3.072698750546906e-05, "loss": 0.22806041240692138, "memory(GiB)": 91.64, "step": 8015, "token_acc": 0.9110389610389611, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7568893922234805, "grad_norm": 0.3148418366909027, "learning_rate": 3.061452933244112e-05, "loss": 0.2297208547592163, "memory(GiB)": 91.64, "step": 8020, "token_acc": 0.9350031705770451, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.757361268403171, "grad_norm": 0.2092905342578888, "learning_rate": 3.0502240123479366e-05, "loss": 0.2229100227355957, "memory(GiB)": 91.64, "step": 8025, "token_acc": 0.9143859649122807, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7578331445828614, "grad_norm": 0.43824535608291626, "learning_rate": 3.0390120152024915e-05, "loss": 0.2252873659133911, "memory(GiB)": 91.64, "step": 8030, "token_acc": 0.9152276295133438, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7583050207625519, "grad_norm": 0.35322973132133484, "learning_rate": 3.0278169691106785e-05, "loss": 0.22799272537231446, "memory(GiB)": 91.64, "step": 8035, "token_acc": 0.9206668582926129, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7587768969422424, "grad_norm": 0.521074652671814, "learning_rate": 3.016638901334118e-05, "loss": 0.22861776351928711, "memory(GiB)": 91.64, "step": 8040, "token_acc": 0.9213483146067416, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7592487731219328, "grad_norm": 0.38683828711509705, "learning_rate": 3.0054778390930925e-05, "loss": 0.22503888607025146, "memory(GiB)": 91.64, "step": 8045, "token_acc": 0.9049630411826821, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7597206493016233, "grad_norm": 0.20460493862628937, "learning_rate": 2.9943338095664632e-05, "loss": 0.22331924438476564, "memory(GiB)": 91.64, "step": 8050, "token_acc": 0.9197936210131332, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7601925254813137, "grad_norm": 0.31732818484306335, "learning_rate": 2.9832068398916212e-05, "loss": 0.23599157333374024, "memory(GiB)": 91.64, "step": 8055, "token_acc": 0.9237121510027526, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7606644016610041, "grad_norm": 0.7717975974082947, "learning_rate": 2.972096957164413e-05, "loss": 0.23142099380493164, "memory(GiB)": 91.64, "step": 8060, "token_acc": 0.928115552569701, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7611362778406946, "grad_norm": 0.44531887769699097, "learning_rate": 2.961004188439077e-05, "loss": 0.2313159227371216, "memory(GiB)": 91.64, "step": 8065, "token_acc": 0.9150915963943007, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7616081540203851, "grad_norm": 0.3463146984577179, "learning_rate": 2.9499285607281725e-05, "loss": 0.22837791442871094, "memory(GiB)": 91.64, "step": 8070, "token_acc": 0.9326069410815173, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7620800302000755, "grad_norm": 0.2808953523635864, "learning_rate": 2.9388701010025243e-05, "loss": 0.2212012529373169, "memory(GiB)": 91.64, "step": 8075, "token_acc": 0.9138222372423116, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.762551906379766, "grad_norm": 0.3945287764072418, "learning_rate": 2.9278288361911423e-05, "loss": 0.2277526378631592, "memory(GiB)": 91.64, "step": 8080, "token_acc": 0.9124365482233503, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7630237825594564, "grad_norm": 0.8451266288757324, "learning_rate": 2.9168047931811683e-05, "loss": 0.2179340124130249, "memory(GiB)": 91.64, "step": 8085, "token_acc": 0.9067501739735561, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7634956587391468, "grad_norm": 0.43402817845344543, "learning_rate": 2.9057979988178087e-05, "loss": 0.2228691577911377, "memory(GiB)": 91.64, "step": 8090, "token_acc": 0.9226860254083484, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7639675349188373, "grad_norm": 0.25339168310165405, "learning_rate": 2.894808479904263e-05, "loss": 0.22564988136291503, "memory(GiB)": 91.64, "step": 8095, "token_acc": 0.9321125502456454, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7644394110985278, "grad_norm": 0.902864396572113, "learning_rate": 2.883836263201669e-05, "loss": 0.22950098514556885, "memory(GiB)": 91.64, "step": 8100, "token_acc": 0.9148484848484848, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7649112872782182, "grad_norm": 0.28512996435165405, "learning_rate": 2.8728813754290196e-05, "loss": 0.22164077758789064, "memory(GiB)": 91.64, "step": 8105, "token_acc": 0.9126436781609195, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7653831634579087, "grad_norm": 0.21365024149417877, "learning_rate": 2.8619438432631185e-05, "loss": 0.22118642330169677, "memory(GiB)": 91.64, "step": 8110, "token_acc": 0.9141651031894934, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.765855039637599, "grad_norm": 0.26842251420021057, "learning_rate": 2.8510236933385048e-05, "loss": 0.22353811264038087, "memory(GiB)": 91.64, "step": 8115, "token_acc": 0.9207392197125257, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7663269158172895, "grad_norm": 0.4451298415660858, "learning_rate": 2.8401209522473804e-05, "loss": 0.22560317516326905, "memory(GiB)": 91.64, "step": 8120, "token_acc": 0.9227367325702394, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.76679879199698, "grad_norm": 0.3383225202560425, "learning_rate": 2.8292356465395687e-05, "loss": 0.2258366107940674, "memory(GiB)": 91.64, "step": 8125, "token_acc": 0.9120879120879121, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7672706681766704, "grad_norm": 0.2160254716873169, "learning_rate": 2.8183678027224292e-05, "loss": 0.223410964012146, "memory(GiB)": 91.64, "step": 8130, "token_acc": 0.8991111111111111, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7677425443563609, "grad_norm": 0.34852781891822815, "learning_rate": 2.8075174472607902e-05, "loss": 0.21970810890197753, "memory(GiB)": 91.64, "step": 8135, "token_acc": 0.9127594158339739, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7682144205360514, "grad_norm": 0.6557506918907166, "learning_rate": 2.7966846065769036e-05, "loss": 0.23110716342926024, "memory(GiB)": 91.64, "step": 8140, "token_acc": 0.9102831594634874, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7686862967157417, "grad_norm": 0.6803750991821289, "learning_rate": 2.7858693070503718e-05, "loss": 0.22070074081420898, "memory(GiB)": 91.64, "step": 8145, "token_acc": 0.9156667809393212, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7691581728954322, "grad_norm": 0.44910258054733276, "learning_rate": 2.7750715750180655e-05, "loss": 0.22234528064727782, "memory(GiB)": 91.64, "step": 8150, "token_acc": 0.9155890804597702, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7696300490751227, "grad_norm": 0.35992804169654846, "learning_rate": 2.7642914367741e-05, "loss": 0.2249147415161133, "memory(GiB)": 91.64, "step": 8155, "token_acc": 0.9267461669505963, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7701019252548131, "grad_norm": 0.5502942204475403, "learning_rate": 2.753528918569732e-05, "loss": 0.22030355930328369, "memory(GiB)": 91.64, "step": 8160, "token_acc": 0.9201444622792937, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7705738014345036, "grad_norm": 0.6990233659744263, "learning_rate": 2.742784046613309e-05, "loss": 0.22877583503723145, "memory(GiB)": 91.64, "step": 8165, "token_acc": 0.9318885448916409, "train_speed(iter/s)": 0.138628 }, { "epoch": 0.7710456776141941, "grad_norm": 0.24115338921546936, "learning_rate": 2.7320568470702145e-05, "loss": 0.2234677791595459, "memory(GiB)": 91.64, "step": 8170, "token_acc": 0.9176334106728539, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.7715175537938845, "grad_norm": 0.4765714108943939, "learning_rate": 2.721347346062797e-05, "loss": 0.22813882827758789, "memory(GiB)": 91.64, "step": 8175, "token_acc": 0.9207409656847859, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7719894299735749, "grad_norm": 0.41965359449386597, "learning_rate": 2.7106555696702952e-05, "loss": 0.22575123310089112, "memory(GiB)": 91.64, "step": 8180, "token_acc": 0.9118468288791384, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7724613061532654, "grad_norm": 0.32254064083099365, "learning_rate": 2.6999815439288044e-05, "loss": 0.22551360130310058, "memory(GiB)": 91.64, "step": 8185, "token_acc": 0.9281748785565579, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7729331823329558, "grad_norm": 0.2386188954114914, "learning_rate": 2.6893252948311766e-05, "loss": 0.23226511478424072, "memory(GiB)": 91.64, "step": 8190, "token_acc": 0.9106433677521842, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.7734050585126463, "grad_norm": 0.47129666805267334, "learning_rate": 2.6786868483269856e-05, "loss": 0.21903567314147948, "memory(GiB)": 91.64, "step": 8195, "token_acc": 0.9255419415645617, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7738769346923368, "grad_norm": 0.24712376296520233, "learning_rate": 2.668066230322449e-05, "loss": 0.2212503433227539, "memory(GiB)": 91.64, "step": 8200, "token_acc": 0.9146115906288532, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7743488108720272, "grad_norm": 0.24980393052101135, "learning_rate": 2.657463466680372e-05, "loss": 0.22688717842102052, "memory(GiB)": 91.64, "step": 8205, "token_acc": 0.9009870450339297, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7748206870517176, "grad_norm": 0.3085843026638031, "learning_rate": 2.6468785832200793e-05, "loss": 0.2235793113708496, "memory(GiB)": 91.64, "step": 8210, "token_acc": 0.9282822440717178, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.775292563231408, "grad_norm": 0.22891342639923096, "learning_rate": 2.6363116057173588e-05, "loss": 0.21712026596069336, "memory(GiB)": 91.64, "step": 8215, "token_acc": 0.9137645107794361, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7757644394110985, "grad_norm": 0.35751280188560486, "learning_rate": 2.6257625599043844e-05, "loss": 0.22840476036071777, "memory(GiB)": 91.64, "step": 8220, "token_acc": 0.9042871385842473, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.776236315590789, "grad_norm": 0.36552250385284424, "learning_rate": 2.6152314714696757e-05, "loss": 0.21927356719970703, "memory(GiB)": 91.64, "step": 8225, "token_acc": 0.9125596184419714, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7767081917704795, "grad_norm": 0.2750298082828522, "learning_rate": 2.60471836605802e-05, "loss": 0.21922688484191893, "memory(GiB)": 91.64, "step": 8230, "token_acc": 0.9250343878954608, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7771800679501699, "grad_norm": 0.4435797333717346, "learning_rate": 2.5942232692704017e-05, "loss": 0.22108829021453857, "memory(GiB)": 91.64, "step": 8235, "token_acc": 0.9235737351991389, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7776519441298603, "grad_norm": 0.48604586720466614, "learning_rate": 2.5837462066639718e-05, "loss": 0.23142728805541993, "memory(GiB)": 91.64, "step": 8240, "token_acc": 0.9114088159031979, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7781238203095507, "grad_norm": 0.243556946516037, "learning_rate": 2.573287203751955e-05, "loss": 0.22401127815246583, "memory(GiB)": 91.64, "step": 8245, "token_acc": 0.9081429990069514, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7785956964892412, "grad_norm": 0.3457433581352234, "learning_rate": 2.562846286003592e-05, "loss": 0.22924907207489015, "memory(GiB)": 91.64, "step": 8250, "token_acc": 0.914388705316912, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7790675726689317, "grad_norm": 0.2969830334186554, "learning_rate": 2.5524234788440905e-05, "loss": 0.226761794090271, "memory(GiB)": 91.64, "step": 8255, "token_acc": 0.9296254256526674, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7795394488486221, "grad_norm": 0.5631513595581055, "learning_rate": 2.5420188076545603e-05, "loss": 0.21923332214355468, "memory(GiB)": 91.64, "step": 8260, "token_acc": 0.9157566302652106, "train_speed(iter/s)": 0.138631 }, { "epoch": 0.7800113250283126, "grad_norm": 0.24374043941497803, "learning_rate": 2.531632297771931e-05, "loss": 0.22521307468414306, "memory(GiB)": 91.64, "step": 8265, "token_acc": 0.9289855072463769, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7804832012080031, "grad_norm": 0.6146330833435059, "learning_rate": 2.5212639744889312e-05, "loss": 0.22694334983825684, "memory(GiB)": 91.64, "step": 8270, "token_acc": 0.9116894197952219, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7809550773876934, "grad_norm": 0.21018651127815247, "learning_rate": 2.5109138630539797e-05, "loss": 0.22090816497802734, "memory(GiB)": 91.64, "step": 8275, "token_acc": 0.9197469197469198, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7814269535673839, "grad_norm": 0.2725118100643158, "learning_rate": 2.5005819886711578e-05, "loss": 0.21987648010253907, "memory(GiB)": 91.64, "step": 8280, "token_acc": 0.9179578359156718, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7818988297470744, "grad_norm": 0.6226215362548828, "learning_rate": 2.4902683765001355e-05, "loss": 0.21619763374328613, "memory(GiB)": 91.64, "step": 8285, "token_acc": 0.9423529411764706, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7823707059267648, "grad_norm": 0.2744337320327759, "learning_rate": 2.4799730516561147e-05, "loss": 0.22009749412536622, "memory(GiB)": 91.64, "step": 8290, "token_acc": 0.9216516675489677, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7828425821064553, "grad_norm": 0.284661203622818, "learning_rate": 2.4696960392097523e-05, "loss": 0.22542271614074708, "memory(GiB)": 91.64, "step": 8295, "token_acc": 0.9180474697716077, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7833144582861458, "grad_norm": 0.25613096356391907, "learning_rate": 2.4594373641871314e-05, "loss": 0.22844905853271485, "memory(GiB)": 91.64, "step": 8300, "token_acc": 0.9212598425196851, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7837863344658361, "grad_norm": 0.5812061429023743, "learning_rate": 2.4491970515696626e-05, "loss": 0.22294883728027343, "memory(GiB)": 91.64, "step": 8305, "token_acc": 0.9196940726577438, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7842582106455266, "grad_norm": 0.28404125571250916, "learning_rate": 2.4389751262940498e-05, "loss": 0.22034523487091065, "memory(GiB)": 91.64, "step": 8310, "token_acc": 0.915719325754606, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.784730086825217, "grad_norm": 0.3765624463558197, "learning_rate": 2.4287716132522243e-05, "loss": 0.22550301551818847, "memory(GiB)": 91.64, "step": 8315, "token_acc": 0.923728813559322, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7852019630049075, "grad_norm": 0.4880436956882477, "learning_rate": 2.41858653729127e-05, "loss": 0.22433314323425294, "memory(GiB)": 91.64, "step": 8320, "token_acc": 0.9113504556752279, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.785673839184598, "grad_norm": 0.5990275144577026, "learning_rate": 2.4084199232133797e-05, "loss": 0.22993867397308348, "memory(GiB)": 91.64, "step": 8325, "token_acc": 0.9129451667608819, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7861457153642885, "grad_norm": 0.559020459651947, "learning_rate": 2.3982717957757995e-05, "loss": 0.22906913757324218, "memory(GiB)": 91.64, "step": 8330, "token_acc": 0.9164596273291925, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7866175915439788, "grad_norm": 0.3202402591705322, "learning_rate": 2.3881421796907366e-05, "loss": 0.22067337036132811, "memory(GiB)": 91.64, "step": 8335, "token_acc": 0.9253401986024273, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7870894677236693, "grad_norm": 0.34719038009643555, "learning_rate": 2.378031099625334e-05, "loss": 0.223028564453125, "memory(GiB)": 91.64, "step": 8340, "token_acc": 0.9278810408921933, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7875613439033597, "grad_norm": 0.39138731360435486, "learning_rate": 2.3679385802015987e-05, "loss": 0.2211979389190674, "memory(GiB)": 91.64, "step": 8345, "token_acc": 0.9202751922298664, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7880332200830502, "grad_norm": 0.24618585407733917, "learning_rate": 2.3578646459963272e-05, "loss": 0.2210986375808716, "memory(GiB)": 91.64, "step": 8350, "token_acc": 0.9225880993645291, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7885050962627407, "grad_norm": 0.4435372054576874, "learning_rate": 2.347809321541069e-05, "loss": 0.2195359230041504, "memory(GiB)": 91.64, "step": 8355, "token_acc": 0.9158878504672897, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7889769724424311, "grad_norm": 0.21401335299015045, "learning_rate": 2.337772631322054e-05, "loss": 0.2184643030166626, "memory(GiB)": 91.64, "step": 8360, "token_acc": 0.935247582771755, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.7894488486221215, "grad_norm": 0.2693486511707306, "learning_rate": 2.327754599780132e-05, "loss": 0.22365641593933105, "memory(GiB)": 91.64, "step": 8365, "token_acc": 0.9265658747300216, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.789920724801812, "grad_norm": 0.3792926073074341, "learning_rate": 2.317755251310719e-05, "loss": 0.21845946311950684, "memory(GiB)": 91.64, "step": 8370, "token_acc": 0.9219151670951157, "train_speed(iter/s)": 0.138633 }, { "epoch": 0.7903926009815024, "grad_norm": 0.23595775663852692, "learning_rate": 2.3077746102637364e-05, "loss": 0.22609164714813232, "memory(GiB)": 91.64, "step": 8375, "token_acc": 0.9150650960942344, "train_speed(iter/s)": 0.138635 }, { "epoch": 0.7908644771611929, "grad_norm": 0.32832691073417664, "learning_rate": 2.297812700943539e-05, "loss": 0.2214564323425293, "memory(GiB)": 91.64, "step": 8380, "token_acc": 0.9153902261123268, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7913363533408834, "grad_norm": 0.36022377014160156, "learning_rate": 2.2878695476088873e-05, "loss": 0.2199798345565796, "memory(GiB)": 91.64, "step": 8385, "token_acc": 0.9154995331465919, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7918082295205738, "grad_norm": 0.449059396982193, "learning_rate": 2.2779451744728474e-05, "loss": 0.2180879831314087, "memory(GiB)": 91.64, "step": 8390, "token_acc": 0.9329475833900612, "train_speed(iter/s)": 0.138635 }, { "epoch": 0.7922801057002643, "grad_norm": 0.28043511509895325, "learning_rate": 2.268039605702765e-05, "loss": 0.22389302253723145, "memory(GiB)": 91.64, "step": 8395, "token_acc": 0.932570977917981, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.7927519818799547, "grad_norm": 0.27153950929641724, "learning_rate": 2.2581528654201943e-05, "loss": 0.2266615390777588, "memory(GiB)": 91.64, "step": 8400, "token_acc": 0.929364278506559, "train_speed(iter/s)": 0.138635 }, { "epoch": 0.7932238580596451, "grad_norm": 0.22254140675067902, "learning_rate": 2.2482849777008308e-05, "loss": 0.22057442665100097, "memory(GiB)": 91.64, "step": 8405, "token_acc": 0.9215481171548117, "train_speed(iter/s)": 0.138636 }, { "epoch": 0.7936957342393356, "grad_norm": 0.24786394834518433, "learning_rate": 2.2384359665744657e-05, "loss": 0.21846373081207277, "memory(GiB)": 91.64, "step": 8410, "token_acc": 0.916236062007071, "train_speed(iter/s)": 0.138636 }, { "epoch": 0.7941676104190261, "grad_norm": 0.3908824622631073, "learning_rate": 2.2286058560249325e-05, "loss": 0.22614388465881347, "memory(GiB)": 91.64, "step": 8415, "token_acc": 0.9074074074074074, "train_speed(iter/s)": 0.138637 }, { "epoch": 0.7946394865987165, "grad_norm": 0.8846114277839661, "learning_rate": 2.2187946699900218e-05, "loss": 0.2331876277923584, "memory(GiB)": 91.64, "step": 8420, "token_acc": 0.9200567644276254, "train_speed(iter/s)": 0.138637 }, { "epoch": 0.795111362778407, "grad_norm": 0.59001225233078, "learning_rate": 2.2090024323614524e-05, "loss": 0.22399048805236815, "memory(GiB)": 91.64, "step": 8425, "token_acc": 0.923191278493558, "train_speed(iter/s)": 0.138636 }, { "epoch": 0.7955832389580973, "grad_norm": 0.5200862288475037, "learning_rate": 2.1992291669847974e-05, "loss": 0.22100448608398438, "memory(GiB)": 91.64, "step": 8430, "token_acc": 0.9250924784217016, "train_speed(iter/s)": 0.138637 }, { "epoch": 0.7960551151377878, "grad_norm": 0.3765367865562439, "learning_rate": 2.189474897659426e-05, "loss": 0.21650943756103516, "memory(GiB)": 91.64, "step": 8435, "token_acc": 0.9229422066549913, "train_speed(iter/s)": 0.138638 }, { "epoch": 0.7965269913174783, "grad_norm": 0.5452553033828735, "learning_rate": 2.1797396481384546e-05, "loss": 0.22537777423858643, "memory(GiB)": 91.64, "step": 8440, "token_acc": 0.9138381201044387, "train_speed(iter/s)": 0.138638 }, { "epoch": 0.7969988674971688, "grad_norm": 0.2730084955692291, "learning_rate": 2.1700234421286804e-05, "loss": 0.2295698881149292, "memory(GiB)": 91.64, "step": 8445, "token_acc": 0.9063561377971858, "train_speed(iter/s)": 0.138639 }, { "epoch": 0.7974707436768592, "grad_norm": 0.2253396362066269, "learning_rate": 2.1603263032905284e-05, "loss": 0.22352323532104493, "memory(GiB)": 91.64, "step": 8450, "token_acc": 0.939419795221843, "train_speed(iter/s)": 0.138639 }, { "epoch": 0.7979426198565497, "grad_norm": 0.9259458184242249, "learning_rate": 2.1506482552379915e-05, "loss": 0.22939915657043458, "memory(GiB)": 91.64, "step": 8455, "token_acc": 0.9181561618062088, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.79841449603624, "grad_norm": 0.34216970205307007, "learning_rate": 2.1409893215385758e-05, "loss": 0.21892695426940917, "memory(GiB)": 91.64, "step": 8460, "token_acc": 0.9082747853239657, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.7988863722159305, "grad_norm": 0.30453988909721375, "learning_rate": 2.1313495257132333e-05, "loss": 0.22130227088928223, "memory(GiB)": 91.64, "step": 8465, "token_acc": 0.9300937766410913, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.799358248395621, "grad_norm": 0.36281198263168335, "learning_rate": 2.121728891236322e-05, "loss": 0.2198082685470581, "memory(GiB)": 91.64, "step": 8470, "token_acc": 0.9177984274481773, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.7998301245753114, "grad_norm": 0.23792099952697754, "learning_rate": 2.112127441535534e-05, "loss": 0.2212691307067871, "memory(GiB)": 91.64, "step": 8475, "token_acc": 0.9101941747572816, "train_speed(iter/s)": 0.138639 }, { "epoch": 0.8003020007550019, "grad_norm": 0.3405323028564453, "learning_rate": 2.1025451999918454e-05, "loss": 0.2203970432281494, "memory(GiB)": 91.64, "step": 8480, "token_acc": 0.925, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.8007738769346924, "grad_norm": 0.24904760718345642, "learning_rate": 2.0929821899394588e-05, "loss": 0.22575736045837402, "memory(GiB)": 91.64, "step": 8485, "token_acc": 0.886896551724138, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8012457531143827, "grad_norm": 0.2277214378118515, "learning_rate": 2.0834384346657386e-05, "loss": 0.22250080108642578, "memory(GiB)": 91.64, "step": 8490, "token_acc": 0.9332630839480155, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8017176292940732, "grad_norm": 0.354336142539978, "learning_rate": 2.0739139574111677e-05, "loss": 0.21643447875976562, "memory(GiB)": 91.64, "step": 8495, "token_acc": 0.9221705426356589, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.8021895054737637, "grad_norm": 0.2935597002506256, "learning_rate": 2.0644087813692815e-05, "loss": 0.22014429569244384, "memory(GiB)": 91.64, "step": 8500, "token_acc": 0.920774647887324, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8026613816534541, "grad_norm": 0.27267611026763916, "learning_rate": 2.0549229296866158e-05, "loss": 0.2266261339187622, "memory(GiB)": 91.64, "step": 8505, "token_acc": 0.9265078560567663, "train_speed(iter/s)": 0.138641 }, { "epoch": 0.8031332578331446, "grad_norm": 0.250590980052948, "learning_rate": 2.0454564254626473e-05, "loss": 0.21946268081665038, "memory(GiB)": 91.64, "step": 8510, "token_acc": 0.9360582760016187, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.8036051340128351, "grad_norm": 0.2068144679069519, "learning_rate": 2.0360092917497408e-05, "loss": 0.2195216417312622, "memory(GiB)": 91.64, "step": 8515, "token_acc": 0.9289740698985344, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.8040770101925255, "grad_norm": 0.3886139392852783, "learning_rate": 2.0265815515530838e-05, "loss": 0.2155540943145752, "memory(GiB)": 91.64, "step": 8520, "token_acc": 0.9433656957928802, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.8045488863722159, "grad_norm": 0.31128185987472534, "learning_rate": 2.0171732278306464e-05, "loss": 0.2209153413772583, "memory(GiB)": 91.64, "step": 8525, "token_acc": 0.9294670846394985, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8050207625519064, "grad_norm": 0.5342454314231873, "learning_rate": 2.007784343493112e-05, "loss": 0.22390148639678956, "memory(GiB)": 91.64, "step": 8530, "token_acc": 0.9230158730158731, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8054926387315968, "grad_norm": 0.4349924325942993, "learning_rate": 1.998414921403827e-05, "loss": 0.21895236968994142, "memory(GiB)": 91.64, "step": 8535, "token_acc": 0.9248291571753986, "train_speed(iter/s)": 0.138642 }, { "epoch": 0.8059645149112873, "grad_norm": 0.5242533683776855, "learning_rate": 1.989064984378747e-05, "loss": 0.21892497539520264, "memory(GiB)": 91.64, "step": 8540, "token_acc": 0.930921052631579, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.8064363910909778, "grad_norm": 0.34894755482673645, "learning_rate": 1.9797345551863765e-05, "loss": 0.21972031593322755, "memory(GiB)": 91.64, "step": 8545, "token_acc": 0.9258809234507898, "train_speed(iter/s)": 0.138644 }, { "epoch": 0.8069082672706682, "grad_norm": 0.46887141466140747, "learning_rate": 1.9704236565477117e-05, "loss": 0.21499874591827392, "memory(GiB)": 91.64, "step": 8550, "token_acc": 0.9317157712305026, "train_speed(iter/s)": 0.138644 }, { "epoch": 0.8073801434503586, "grad_norm": 0.414568692445755, "learning_rate": 1.9611323111361935e-05, "loss": 0.22043170928955078, "memory(GiB)": 91.64, "step": 8555, "token_acc": 0.9253034547152195, "train_speed(iter/s)": 0.138644 }, { "epoch": 0.807852019630049, "grad_norm": 0.23252899944782257, "learning_rate": 1.951860541577647e-05, "loss": 0.21883907318115234, "memory(GiB)": 91.64, "step": 8560, "token_acc": 0.9171251719394773, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.8083238958097395, "grad_norm": 0.2740820348262787, "learning_rate": 1.9426083704502273e-05, "loss": 0.22658867835998536, "memory(GiB)": 91.64, "step": 8565, "token_acc": 0.9211643420254699, "train_speed(iter/s)": 0.138644 }, { "epoch": 0.80879577198943, "grad_norm": 0.44111067056655884, "learning_rate": 1.9333758202843655e-05, "loss": 0.22524876594543458, "memory(GiB)": 91.64, "step": 8570, "token_acc": 0.9118528027385537, "train_speed(iter/s)": 0.138644 }, { "epoch": 0.8092676481691204, "grad_norm": 0.20151256024837494, "learning_rate": 1.924162913562707e-05, "loss": 0.21728076934814453, "memory(GiB)": 91.64, "step": 8575, "token_acc": 0.9278195488721804, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.8097395243488109, "grad_norm": 0.5388370752334595, "learning_rate": 1.9149696727200695e-05, "loss": 0.22715296745300292, "memory(GiB)": 91.64, "step": 8580, "token_acc": 0.9294187425860023, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.8102114005285013, "grad_norm": 0.27115631103515625, "learning_rate": 1.9057961201433772e-05, "loss": 0.2197357416152954, "memory(GiB)": 91.64, "step": 8585, "token_acc": 0.9310043668122271, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.8106832767081917, "grad_norm": 0.21140803396701813, "learning_rate": 1.896642278171612e-05, "loss": 0.2216893196105957, "memory(GiB)": 91.64, "step": 8590, "token_acc": 0.9260104302477183, "train_speed(iter/s)": 0.138646 }, { "epoch": 0.8111551528878822, "grad_norm": 0.33303895592689514, "learning_rate": 1.8875081690957575e-05, "loss": 0.22209582328796387, "memory(GiB)": 91.64, "step": 8595, "token_acc": 0.9165911151405258, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8116270290675727, "grad_norm": 0.2280055433511734, "learning_rate": 1.8783938151587465e-05, "loss": 0.2175816535949707, "memory(GiB)": 91.64, "step": 8600, "token_acc": 0.9249368459040058, "train_speed(iter/s)": 0.138646 }, { "epoch": 0.8120989052472631, "grad_norm": 0.3590617775917053, "learning_rate": 1.8692992385553975e-05, "loss": 0.2196737289428711, "memory(GiB)": 91.64, "step": 8605, "token_acc": 0.9189448441247002, "train_speed(iter/s)": 0.138646 }, { "epoch": 0.8125707814269536, "grad_norm": 0.6457542777061462, "learning_rate": 1.860224461432377e-05, "loss": 0.21764006614685058, "memory(GiB)": 91.64, "step": 8610, "token_acc": 0.9244288224956063, "train_speed(iter/s)": 0.138646 }, { "epoch": 0.8130426576066441, "grad_norm": 0.46899905800819397, "learning_rate": 1.8511695058881316e-05, "loss": 0.2232901096343994, "memory(GiB)": 91.64, "step": 8615, "token_acc": 0.9269063611220861, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8135145337863344, "grad_norm": 0.28184252977371216, "learning_rate": 1.8421343939728442e-05, "loss": 0.21828012466430663, "memory(GiB)": 91.64, "step": 8620, "token_acc": 0.9019536903039074, "train_speed(iter/s)": 0.138648 }, { "epoch": 0.8139864099660249, "grad_norm": 0.3113933205604553, "learning_rate": 1.833119147688369e-05, "loss": 0.21824917793273926, "memory(GiB)": 91.64, "step": 8625, "token_acc": 0.9256303862112991, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8144582861457154, "grad_norm": 0.30904653668403625, "learning_rate": 1.8241237889881934e-05, "loss": 0.2253275156021118, "memory(GiB)": 91.64, "step": 8630, "token_acc": 0.9330543933054394, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8149301623254058, "grad_norm": 0.6179171204566956, "learning_rate": 1.815148339777363e-05, "loss": 0.22154507637023926, "memory(GiB)": 91.64, "step": 8635, "token_acc": 0.9194214876033058, "train_speed(iter/s)": 0.138648 }, { "epoch": 0.8154020385050963, "grad_norm": 0.32482269406318665, "learning_rate": 1.8061928219124503e-05, "loss": 0.21969285011291503, "memory(GiB)": 91.64, "step": 8640, "token_acc": 0.9162815982603969, "train_speed(iter/s)": 0.138648 }, { "epoch": 0.8158739146847868, "grad_norm": 0.3604651093482971, "learning_rate": 1.79725725720149e-05, "loss": 0.21565718650817872, "memory(GiB)": 91.64, "step": 8645, "token_acc": 0.924759080800593, "train_speed(iter/s)": 0.138649 }, { "epoch": 0.8163457908644771, "grad_norm": 0.6190539002418518, "learning_rate": 1.7883416674039278e-05, "loss": 0.2202209711074829, "memory(GiB)": 91.64, "step": 8650, "token_acc": 0.9248197734294542, "train_speed(iter/s)": 0.138649 }, { "epoch": 0.8168176670441676, "grad_norm": 0.2979640066623688, "learning_rate": 1.7794460742305696e-05, "loss": 0.22345561981201173, "memory(GiB)": 91.64, "step": 8655, "token_acc": 0.9106302916274694, "train_speed(iter/s)": 0.13865 }, { "epoch": 0.817289543223858, "grad_norm": 0.49348747730255127, "learning_rate": 1.770570499343517e-05, "loss": 0.21452643871307372, "memory(GiB)": 91.64, "step": 8660, "token_acc": 0.9195678271308524, "train_speed(iter/s)": 0.13865 }, { "epoch": 0.8177614194035485, "grad_norm": 0.3675593137741089, "learning_rate": 1.7617149643561358e-05, "loss": 0.21637289524078368, "memory(GiB)": 91.64, "step": 8665, "token_acc": 0.9341085271317829, "train_speed(iter/s)": 0.138651 }, { "epoch": 0.818233295583239, "grad_norm": 0.5269376039505005, "learning_rate": 1.752879490832985e-05, "loss": 0.21624937057495117, "memory(GiB)": 91.64, "step": 8670, "token_acc": 0.9225589225589226, "train_speed(iter/s)": 0.138652 }, { "epoch": 0.8187051717629295, "grad_norm": 0.6057656407356262, "learning_rate": 1.744064100289773e-05, "loss": 0.21797895431518555, "memory(GiB)": 91.64, "step": 8675, "token_acc": 0.9184810126582279, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.8191770479426198, "grad_norm": 0.44021740555763245, "learning_rate": 1.7352688141933036e-05, "loss": 0.2198997974395752, "memory(GiB)": 91.64, "step": 8680, "token_acc": 0.9260405916752666, "train_speed(iter/s)": 0.138652 }, { "epoch": 0.8196489241223103, "grad_norm": 0.24512818455696106, "learning_rate": 1.726493653961425e-05, "loss": 0.2200550079345703, "memory(GiB)": 91.64, "step": 8685, "token_acc": 0.9201101928374655, "train_speed(iter/s)": 0.138654 }, { "epoch": 0.8201208003020007, "grad_norm": 0.25874602794647217, "learning_rate": 1.717738640962968e-05, "loss": 0.21884992122650146, "memory(GiB)": 91.64, "step": 8690, "token_acc": 0.9136848713119899, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.8205926764816912, "grad_norm": 0.36624372005462646, "learning_rate": 1.7090037965177098e-05, "loss": 0.22657575607299804, "memory(GiB)": 91.64, "step": 8695, "token_acc": 0.917910447761194, "train_speed(iter/s)": 0.138655 }, { "epoch": 0.8210645526613817, "grad_norm": 0.4566001296043396, "learning_rate": 1.7002891418963107e-05, "loss": 0.21981406211853027, "memory(GiB)": 91.64, "step": 8700, "token_acc": 0.9316739873108834, "train_speed(iter/s)": 0.138655 }, { "epoch": 0.8215364288410721, "grad_norm": 0.5014268755912781, "learning_rate": 1.691594698320267e-05, "loss": 0.21487929821014404, "memory(GiB)": 91.64, "step": 8705, "token_acc": 0.9231936854887675, "train_speed(iter/s)": 0.138654 }, { "epoch": 0.8220083050207625, "grad_norm": 0.5701547861099243, "learning_rate": 1.6829204869618585e-05, "loss": 0.2149423837661743, "memory(GiB)": 91.64, "step": 8710, "token_acc": 0.9118796992481203, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.822480181200453, "grad_norm": 0.5961725115776062, "learning_rate": 1.6742665289440973e-05, "loss": 0.21888768672943115, "memory(GiB)": 91.64, "step": 8715, "token_acc": 0.9206291148500366, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.8229520573801434, "grad_norm": 0.5988127589225769, "learning_rate": 1.665632845340669e-05, "loss": 0.22203760147094725, "memory(GiB)": 91.64, "step": 8720, "token_acc": 0.9163961038961039, "train_speed(iter/s)": 0.138654 }, { "epoch": 0.8234239335598339, "grad_norm": 0.2811583876609802, "learning_rate": 1.6570194571758955e-05, "loss": 0.21281654834747316, "memory(GiB)": 91.64, "step": 8725, "token_acc": 0.9121813031161473, "train_speed(iter/s)": 0.138655 }, { "epoch": 0.8238958097395244, "grad_norm": 0.39711859822273254, "learning_rate": 1.648426385424675e-05, "loss": 0.22025790214538574, "memory(GiB)": 91.64, "step": 8730, "token_acc": 0.9323593073593074, "train_speed(iter/s)": 0.138656 }, { "epoch": 0.8243676859192148, "grad_norm": 0.36098819971084595, "learning_rate": 1.6398536510124285e-05, "loss": 0.21392159461975097, "memory(GiB)": 91.64, "step": 8735, "token_acc": 0.9101847872797594, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8248395620989053, "grad_norm": 0.46026375889778137, "learning_rate": 1.631301274815058e-05, "loss": 0.21499810218811036, "memory(GiB)": 91.64, "step": 8740, "token_acc": 0.9302019315188762, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8253114382785957, "grad_norm": 0.34468331933021545, "learning_rate": 1.622769277658882e-05, "loss": 0.21223993301391603, "memory(GiB)": 91.64, "step": 8745, "token_acc": 0.9186937687437521, "train_speed(iter/s)": 0.138656 }, { "epoch": 0.8257833144582861, "grad_norm": 0.4011945426464081, "learning_rate": 1.614257680320601e-05, "loss": 0.22296814918518065, "memory(GiB)": 91.64, "step": 8750, "token_acc": 0.9173285198555957, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8262551906379766, "grad_norm": 0.23414309322834015, "learning_rate": 1.605766503527236e-05, "loss": 0.2172119140625, "memory(GiB)": 91.64, "step": 8755, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8267270668176671, "grad_norm": 0.49239152669906616, "learning_rate": 1.597295767956081e-05, "loss": 0.2250286340713501, "memory(GiB)": 91.64, "step": 8760, "token_acc": 0.9303750919342976, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8271989429973575, "grad_norm": 0.29153236746788025, "learning_rate": 1.5888454942346498e-05, "loss": 0.21357007026672364, "memory(GiB)": 91.64, "step": 8765, "token_acc": 0.9249068501003153, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.827670819177048, "grad_norm": 0.39535972476005554, "learning_rate": 1.5804157029406364e-05, "loss": 0.21768288612365722, "memory(GiB)": 91.64, "step": 8770, "token_acc": 0.918443696221474, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8281426953567383, "grad_norm": 0.3631436228752136, "learning_rate": 1.5720064146018455e-05, "loss": 0.21306240558624268, "memory(GiB)": 91.64, "step": 8775, "token_acc": 0.9182437547312642, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8286145715364288, "grad_norm": 0.2444126009941101, "learning_rate": 1.563617649696162e-05, "loss": 0.21740403175354003, "memory(GiB)": 91.64, "step": 8780, "token_acc": 0.9132075471698113, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8290864477161193, "grad_norm": 0.30875638127326965, "learning_rate": 1.555249428651494e-05, "loss": 0.21742725372314453, "memory(GiB)": 91.64, "step": 8785, "token_acc": 0.9158807996982271, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8295583238958097, "grad_norm": 0.7618734240531921, "learning_rate": 1.5469017718457124e-05, "loss": 0.21721193790435792, "memory(GiB)": 91.64, "step": 8790, "token_acc": 0.9266528925619835, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8300302000755002, "grad_norm": 0.25342825055122375, "learning_rate": 1.5385746996066263e-05, "loss": 0.21593549251556396, "memory(GiB)": 91.64, "step": 8795, "token_acc": 0.9261596718207636, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8305020762551907, "grad_norm": 0.3047173321247101, "learning_rate": 1.5302682322119087e-05, "loss": 0.21290826797485352, "memory(GiB)": 91.64, "step": 8800, "token_acc": 0.9214285714285714, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.830973952434881, "grad_norm": 0.18959245085716248, "learning_rate": 1.5219823898890551e-05, "loss": 0.21209537982940674, "memory(GiB)": 91.64, "step": 8805, "token_acc": 0.9263112267013437, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8314458286145715, "grad_norm": 0.35109010338783264, "learning_rate": 1.5137171928153393e-05, "loss": 0.22048661708831788, "memory(GiB)": 91.64, "step": 8810, "token_acc": 0.9231553893233594, "train_speed(iter/s)": 0.13866 }, { "epoch": 0.831917704794262, "grad_norm": 0.4367806315422058, "learning_rate": 1.5054726611177627e-05, "loss": 0.21188702583312988, "memory(GiB)": 91.64, "step": 8815, "token_acc": 0.9250555731978406, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8323895809739524, "grad_norm": 0.2685738801956177, "learning_rate": 1.4972488148729958e-05, "loss": 0.21199412345886232, "memory(GiB)": 91.64, "step": 8820, "token_acc": 0.9164345403899722, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8328614571536429, "grad_norm": 0.7678811550140381, "learning_rate": 1.4890456741073488e-05, "loss": 0.21964569091796876, "memory(GiB)": 91.64, "step": 8825, "token_acc": 0.9182865370770338, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8333333333333334, "grad_norm": 0.3738093972206116, "learning_rate": 1.4808632587967031e-05, "loss": 0.2381913185119629, "memory(GiB)": 91.64, "step": 8830, "token_acc": 0.9197786998616874, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8338052095130238, "grad_norm": 0.4026806652545929, "learning_rate": 1.4727015888664685e-05, "loss": 0.22088391780853273, "memory(GiB)": 91.64, "step": 8835, "token_acc": 0.9287822878228782, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8342770856927142, "grad_norm": 0.44328606128692627, "learning_rate": 1.4645606841915415e-05, "loss": 0.2210165023803711, "memory(GiB)": 91.64, "step": 8840, "token_acc": 0.9258569299552906, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8347489618724047, "grad_norm": 0.29257553815841675, "learning_rate": 1.456440564596252e-05, "loss": 0.21825973987579345, "memory(GiB)": 91.64, "step": 8845, "token_acc": 0.9290515309932785, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8352208380520951, "grad_norm": 0.3918415904045105, "learning_rate": 1.4483412498543081e-05, "loss": 0.22037510871887206, "memory(GiB)": 91.64, "step": 8850, "token_acc": 0.9130610594419734, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8356927142317856, "grad_norm": 0.34599733352661133, "learning_rate": 1.4402627596887696e-05, "loss": 0.2142866611480713, "memory(GiB)": 91.64, "step": 8855, "token_acc": 0.9203539823008849, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8361645904114761, "grad_norm": 0.33332359790802, "learning_rate": 1.4322051137719684e-05, "loss": 0.21822817325592042, "memory(GiB)": 91.64, "step": 8860, "token_acc": 0.9312896405919662, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8366364665911665, "grad_norm": 0.31388983130455017, "learning_rate": 1.4241683317254884e-05, "loss": 0.2175750255584717, "memory(GiB)": 91.64, "step": 8865, "token_acc": 0.921377183967112, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8371083427708569, "grad_norm": 0.6400064826011658, "learning_rate": 1.4161524331201059e-05, "loss": 0.20982787609100342, "memory(GiB)": 91.64, "step": 8870, "token_acc": 0.9286498353457738, "train_speed(iter/s)": 0.13866 }, { "epoch": 0.8375802189505474, "grad_norm": 0.21114827692508698, "learning_rate": 1.4081574374757323e-05, "loss": 0.21725311279296874, "memory(GiB)": 91.64, "step": 8875, "token_acc": 0.9271402550091075, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8380520951302378, "grad_norm": 0.26979145407676697, "learning_rate": 1.4001833642613948e-05, "loss": 0.21486730575561525, "memory(GiB)": 91.64, "step": 8880, "token_acc": 0.9132149901380671, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8385239713099283, "grad_norm": 0.2545173466205597, "learning_rate": 1.3922302328951597e-05, "loss": 0.2135646104812622, "memory(GiB)": 91.64, "step": 8885, "token_acc": 0.9220452640402347, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8389958474896188, "grad_norm": 0.5725402235984802, "learning_rate": 1.3842980627440972e-05, "loss": 0.2180727243423462, "memory(GiB)": 91.64, "step": 8890, "token_acc": 0.9074926747593135, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8394677236693092, "grad_norm": 0.4995214343070984, "learning_rate": 1.3763868731242357e-05, "loss": 0.2170236587524414, "memory(GiB)": 91.64, "step": 8895, "token_acc": 0.9211840228245364, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.8399395998489996, "grad_norm": 0.5268911123275757, "learning_rate": 1.3684966833005164e-05, "loss": 0.21324462890625, "memory(GiB)": 91.64, "step": 8900, "token_acc": 0.9303507880020335, "train_speed(iter/s)": 0.138663 }, { "epoch": 0.84041147602869, "grad_norm": 0.22460459172725677, "learning_rate": 1.3606275124867317e-05, "loss": 0.2095392942428589, "memory(GiB)": 91.64, "step": 8905, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.138663 }, { "epoch": 0.8408833522083805, "grad_norm": 0.4552537202835083, "learning_rate": 1.3527793798455046e-05, "loss": 0.21751093864440918, "memory(GiB)": 91.64, "step": 8910, "token_acc": 0.9306561334211104, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.841355228388071, "grad_norm": 0.5160773396492004, "learning_rate": 1.3449523044882184e-05, "loss": 0.2187131404876709, "memory(GiB)": 91.64, "step": 8915, "token_acc": 0.9276982186517639, "train_speed(iter/s)": 0.138663 }, { "epoch": 0.8418271045677614, "grad_norm": 0.8382816910743713, "learning_rate": 1.3371463054749766e-05, "loss": 0.21896607875823976, "memory(GiB)": 91.64, "step": 8920, "token_acc": 0.9211409395973155, "train_speed(iter/s)": 0.138663 }, { "epoch": 0.8422989807474519, "grad_norm": 0.3334463834762573, "learning_rate": 1.3293614018145639e-05, "loss": 0.2141813278198242, "memory(GiB)": 91.64, "step": 8925, "token_acc": 0.9138248847926267, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.8427708569271423, "grad_norm": 0.26530754566192627, "learning_rate": 1.3215976124643947e-05, "loss": 0.2144141674041748, "memory(GiB)": 91.64, "step": 8930, "token_acc": 0.9184426229508197, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.8432427331068327, "grad_norm": 0.21236403286457062, "learning_rate": 1.3138549563304581e-05, "loss": 0.21677255630493164, "memory(GiB)": 91.64, "step": 8935, "token_acc": 0.9428851174934726, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.8437146092865232, "grad_norm": 0.3409329950809479, "learning_rate": 1.3061334522672964e-05, "loss": 0.22018632888793946, "memory(GiB)": 91.64, "step": 8940, "token_acc": 0.918456817185445, "train_speed(iter/s)": 0.138662 }, { "epoch": 0.8441864854662137, "grad_norm": 0.47499603033065796, "learning_rate": 1.2984331190779276e-05, "loss": 0.2202282428741455, "memory(GiB)": 91.64, "step": 8945, "token_acc": 0.9200565970993987, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8446583616459041, "grad_norm": 0.4795122444629669, "learning_rate": 1.2907539755138232e-05, "loss": 0.2215047597885132, "memory(GiB)": 91.64, "step": 8950, "token_acc": 0.9223080417991822, "train_speed(iter/s)": 0.13866 }, { "epoch": 0.8451302378255946, "grad_norm": 0.443649560213089, "learning_rate": 1.2830960402748581e-05, "loss": 0.22158918380737305, "memory(GiB)": 91.64, "step": 8955, "token_acc": 0.8966992665036675, "train_speed(iter/s)": 0.138661 }, { "epoch": 0.8456021140052851, "grad_norm": 0.4760352671146393, "learning_rate": 1.2754593320092523e-05, "loss": 0.21788718700408935, "memory(GiB)": 91.64, "step": 8960, "token_acc": 0.9285157265401611, "train_speed(iter/s)": 0.13866 }, { "epoch": 0.8460739901849754, "grad_norm": 0.4041005074977875, "learning_rate": 1.2678438693135386e-05, "loss": 0.217704439163208, "memory(GiB)": 91.64, "step": 8965, "token_acc": 0.9372652141247183, "train_speed(iter/s)": 0.138659 }, { "epoch": 0.8465458663646659, "grad_norm": 0.22946274280548096, "learning_rate": 1.260249670732524e-05, "loss": 0.21619582176208496, "memory(GiB)": 91.64, "step": 8970, "token_acc": 0.9303710490151168, "train_speed(iter/s)": 0.138658 }, { "epoch": 0.8470177425443564, "grad_norm": 0.49980804324150085, "learning_rate": 1.2526767547592177e-05, "loss": 0.21017656326293946, "memory(GiB)": 91.64, "step": 8975, "token_acc": 0.9205999117776797, "train_speed(iter/s)": 0.138657 }, { "epoch": 0.8474896187240468, "grad_norm": 0.41217175126075745, "learning_rate": 1.2451251398348107e-05, "loss": 0.20879015922546387, "memory(GiB)": 91.64, "step": 8980, "token_acc": 0.9199461823074335, "train_speed(iter/s)": 0.138656 }, { "epoch": 0.8479614949037373, "grad_norm": 0.21734531223773956, "learning_rate": 1.2375948443486274e-05, "loss": 0.22095990180969238, "memory(GiB)": 91.64, "step": 8985, "token_acc": 0.925754775107825, "train_speed(iter/s)": 0.138654 }, { "epoch": 0.8484333710834278, "grad_norm": 0.8602133989334106, "learning_rate": 1.2300858866380638e-05, "loss": 0.22048001289367675, "memory(GiB)": 91.64, "step": 8990, "token_acc": 0.920877998979071, "train_speed(iter/s)": 0.138653 }, { "epoch": 0.8489052472631181, "grad_norm": 0.3685482442378998, "learning_rate": 1.222598284988563e-05, "loss": 0.21594226360321045, "memory(GiB)": 91.64, "step": 8995, "token_acc": 0.9338983050847458, "train_speed(iter/s)": 0.138652 }, { "epoch": 0.8493771234428086, "grad_norm": 0.24085231125354767, "learning_rate": 1.2151320576335701e-05, "loss": 0.21290385723114014, "memory(GiB)": 91.64, "step": 9000, "token_acc": 0.9308624376336422, "train_speed(iter/s)": 0.138651 }, { "epoch": 0.849848999622499, "grad_norm": 0.3610924482345581, "learning_rate": 1.2076872227544645e-05, "loss": 0.21317293643951415, "memory(GiB)": 91.64, "step": 9005, "token_acc": 0.9221343873517787, "train_speed(iter/s)": 0.13865 }, { "epoch": 0.8503208758021895, "grad_norm": 0.4309537410736084, "learning_rate": 1.2002637984805432e-05, "loss": 0.21564769744873047, "memory(GiB)": 91.64, "step": 9010, "token_acc": 0.9249384741591469, "train_speed(iter/s)": 0.13865 }, { "epoch": 0.85079275198188, "grad_norm": 0.2729756534099579, "learning_rate": 1.1928618028889626e-05, "loss": 0.21591358184814452, "memory(GiB)": 91.64, "step": 9015, "token_acc": 0.928030303030303, "train_speed(iter/s)": 0.138648 }, { "epoch": 0.8512646281615704, "grad_norm": 0.7843363881111145, "learning_rate": 1.1854812540046933e-05, "loss": 0.21337780952453614, "memory(GiB)": 91.64, "step": 9020, "token_acc": 0.9305054151624549, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8517365043412608, "grad_norm": 0.3829854428768158, "learning_rate": 1.1781221698004851e-05, "loss": 0.2175013542175293, "memory(GiB)": 91.64, "step": 9025, "token_acc": 0.9183318853171155, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8522083805209513, "grad_norm": 0.2974933981895447, "learning_rate": 1.1707845681968143e-05, "loss": 0.21187739372253417, "memory(GiB)": 91.64, "step": 9030, "token_acc": 0.9268082663605052, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8526802567006417, "grad_norm": 0.5384204387664795, "learning_rate": 1.1634684670618468e-05, "loss": 0.2191821575164795, "memory(GiB)": 91.64, "step": 9035, "token_acc": 0.9259259259259259, "train_speed(iter/s)": 0.138647 }, { "epoch": 0.8531521328803322, "grad_norm": 0.4722369909286499, "learning_rate": 1.1561738842113912e-05, "loss": 0.21865737438201904, "memory(GiB)": 91.64, "step": 9040, "token_acc": 0.93042071197411, "train_speed(iter/s)": 0.138645 }, { "epoch": 0.8536240090600227, "grad_norm": 0.29009294509887695, "learning_rate": 1.1489008374088516e-05, "loss": 0.2190547466278076, "memory(GiB)": 91.64, "step": 9045, "token_acc": 0.9174520636984076, "train_speed(iter/s)": 0.138643 }, { "epoch": 0.8540958852397131, "grad_norm": 0.43121811747550964, "learning_rate": 1.1416493443651921e-05, "loss": 0.21074953079223632, "memory(GiB)": 91.64, "step": 9050, "token_acc": 0.9237262586674706, "train_speed(iter/s)": 0.138641 }, { "epoch": 0.8545677614194035, "grad_norm": 0.30366286635398865, "learning_rate": 1.1344194227388948e-05, "loss": 0.21621460914611818, "memory(GiB)": 91.64, "step": 9055, "token_acc": 0.9195775792038993, "train_speed(iter/s)": 0.13864 }, { "epoch": 0.855039637599094, "grad_norm": 0.35432592034339905, "learning_rate": 1.1272110901359024e-05, "loss": 0.2122971534729004, "memory(GiB)": 91.64, "step": 9060, "token_acc": 0.9211165048543689, "train_speed(iter/s)": 0.138639 }, { "epoch": 0.8555115137787844, "grad_norm": 0.32513248920440674, "learning_rate": 1.1200243641095908e-05, "loss": 0.2123556613922119, "memory(GiB)": 91.64, "step": 9065, "token_acc": 0.9351747463359639, "train_speed(iter/s)": 0.138637 }, { "epoch": 0.8559833899584749, "grad_norm": 0.3691011369228363, "learning_rate": 1.1128592621607226e-05, "loss": 0.21590576171875, "memory(GiB)": 91.64, "step": 9070, "token_acc": 0.9240752757949383, "train_speed(iter/s)": 0.138636 }, { "epoch": 0.8564552661381654, "grad_norm": 0.2052968144416809, "learning_rate": 1.1057158017373947e-05, "loss": 0.21484103202819824, "memory(GiB)": 91.64, "step": 9075, "token_acc": 0.9237958303378864, "train_speed(iter/s)": 0.138635 }, { "epoch": 0.8569271423178558, "grad_norm": 0.38909971714019775, "learning_rate": 1.0985940002350103e-05, "loss": 0.2229299545288086, "memory(GiB)": 91.64, "step": 9080, "token_acc": 0.9117383512544803, "train_speed(iter/s)": 0.138634 }, { "epoch": 0.8573990184975463, "grad_norm": 0.8026570081710815, "learning_rate": 1.0914938749962323e-05, "loss": 0.22169604301452636, "memory(GiB)": 91.64, "step": 9085, "token_acc": 0.9359975961538461, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.8578708946772367, "grad_norm": 0.3203129768371582, "learning_rate": 1.0844154433109299e-05, "loss": 0.21136178970336914, "memory(GiB)": 91.64, "step": 9090, "token_acc": 0.9140328697850821, "train_speed(iter/s)": 0.138632 }, { "epoch": 0.8583427708569271, "grad_norm": 0.21245694160461426, "learning_rate": 1.0773587224161507e-05, "loss": 0.21133153438568114, "memory(GiB)": 91.64, "step": 9095, "token_acc": 0.9113320079522863, "train_speed(iter/s)": 0.13863 }, { "epoch": 0.8588146470366176, "grad_norm": 0.25271087884902954, "learning_rate": 1.0703237294960744e-05, "loss": 0.21964166164398194, "memory(GiB)": 91.64, "step": 9100, "token_acc": 0.9271875, "train_speed(iter/s)": 0.138629 }, { "epoch": 0.859286523216308, "grad_norm": 0.5589426755905151, "learning_rate": 1.063310481681965e-05, "loss": 0.2167053699493408, "memory(GiB)": 91.64, "step": 9105, "token_acc": 0.9252544529262087, "train_speed(iter/s)": 0.138628 }, { "epoch": 0.8597583993959985, "grad_norm": 0.37324556708335876, "learning_rate": 1.056318996052138e-05, "loss": 0.22221214771270753, "memory(GiB)": 91.64, "step": 9110, "token_acc": 0.9235668789808917, "train_speed(iter/s)": 0.138628 }, { "epoch": 0.860230275575689, "grad_norm": 0.3813531994819641, "learning_rate": 1.0493492896319135e-05, "loss": 0.21518683433532715, "memory(GiB)": 91.64, "step": 9115, "token_acc": 0.9208173690932312, "train_speed(iter/s)": 0.138627 }, { "epoch": 0.8607021517553793, "grad_norm": 0.3796219229698181, "learning_rate": 1.042401379393575e-05, "loss": 0.2043860912322998, "memory(GiB)": 91.64, "step": 9120, "token_acc": 0.9289118347895154, "train_speed(iter/s)": 0.138626 }, { "epoch": 0.8611740279350698, "grad_norm": 0.3149944245815277, "learning_rate": 1.0354752822563307e-05, "loss": 0.21240837574005128, "memory(GiB)": 91.64, "step": 9125, "token_acc": 0.939908256880734, "train_speed(iter/s)": 0.138625 }, { "epoch": 0.8616459041147603, "grad_norm": 0.21904854476451874, "learning_rate": 1.0285710150862715e-05, "loss": 0.2140800952911377, "memory(GiB)": 91.64, "step": 9130, "token_acc": 0.9252788104089219, "train_speed(iter/s)": 0.138625 }, { "epoch": 0.8621177802944507, "grad_norm": 0.4739190638065338, "learning_rate": 1.0216885946963239e-05, "loss": 0.2191236734390259, "memory(GiB)": 91.64, "step": 9135, "token_acc": 0.9306414848583523, "train_speed(iter/s)": 0.138626 }, { "epoch": 0.8625896564741412, "grad_norm": 0.28348055481910706, "learning_rate": 1.0148280378462182e-05, "loss": 0.2117250919342041, "memory(GiB)": 91.64, "step": 9140, "token_acc": 0.9304769603880356, "train_speed(iter/s)": 0.138624 }, { "epoch": 0.8630615326538317, "grad_norm": 0.2477513998746872, "learning_rate": 1.007989361242445e-05, "loss": 0.2174776792526245, "memory(GiB)": 91.64, "step": 9145, "token_acc": 0.9298039215686275, "train_speed(iter/s)": 0.138623 }, { "epoch": 0.863533408833522, "grad_norm": 0.29344943165779114, "learning_rate": 1.00117258153821e-05, "loss": 0.20976610183715821, "memory(GiB)": 91.64, "step": 9150, "token_acc": 0.9243150684931507, "train_speed(iter/s)": 0.138622 }, { "epoch": 0.8640052850132125, "grad_norm": 0.3104238510131836, "learning_rate": 9.943777153334e-06, "loss": 0.21416120529174804, "memory(GiB)": 91.64, "step": 9155, "token_acc": 0.9261862917398945, "train_speed(iter/s)": 0.13862 }, { "epoch": 0.864477161192903, "grad_norm": 0.2621361017227173, "learning_rate": 9.876047791745335e-06, "loss": 0.21070308685302735, "memory(GiB)": 91.64, "step": 9160, "token_acc": 0.9147621988882026, "train_speed(iter/s)": 0.13862 }, { "epoch": 0.8649490373725934, "grad_norm": 0.30771052837371826, "learning_rate": 9.808537895547309e-06, "loss": 0.20619337558746337, "memory(GiB)": 91.64, "step": 9165, "token_acc": 0.9170305676855895, "train_speed(iter/s)": 0.138618 }, { "epoch": 0.8654209135522839, "grad_norm": 0.3718501925468445, "learning_rate": 9.741247629136696e-06, "loss": 0.2108161449432373, "memory(GiB)": 91.64, "step": 9170, "token_acc": 0.9279187817258884, "train_speed(iter/s)": 0.138617 }, { "epoch": 0.8658927897319744, "grad_norm": 0.6464650630950928, "learning_rate": 9.67417715637542e-06, "loss": 0.21423704624176027, "memory(GiB)": 91.64, "step": 9175, "token_acc": 0.9234731420161884, "train_speed(iter/s)": 0.138615 }, { "epoch": 0.8663646659116648, "grad_norm": 0.5365249514579773, "learning_rate": 9.607326640590164e-06, "loss": 0.21575627326965333, "memory(GiB)": 91.64, "step": 9180, "token_acc": 0.9094117647058824, "train_speed(iter/s)": 0.138614 }, { "epoch": 0.8668365420913552, "grad_norm": 0.19745229184627533, "learning_rate": 9.540696244572033e-06, "loss": 0.2120530366897583, "memory(GiB)": 91.64, "step": 9185, "token_acc": 0.9127725856697819, "train_speed(iter/s)": 0.138612 }, { "epoch": 0.8673084182710457, "grad_norm": 0.3388816714286804, "learning_rate": 9.474286130576026e-06, "loss": 0.2139723300933838, "memory(GiB)": 91.64, "step": 9190, "token_acc": 0.9162839985870717, "train_speed(iter/s)": 0.138611 }, { "epoch": 0.8677802944507361, "grad_norm": 0.4545825719833374, "learning_rate": 9.408096460320792e-06, "loss": 0.21589879989624022, "memory(GiB)": 91.64, "step": 9195, "token_acc": 0.9250776397515528, "train_speed(iter/s)": 0.138608 }, { "epoch": 0.8682521706304266, "grad_norm": 0.24916082620620728, "learning_rate": 9.342127394988132e-06, "loss": 0.21355302333831788, "memory(GiB)": 91.64, "step": 9200, "token_acc": 0.925, "train_speed(iter/s)": 0.138607 }, { "epoch": 0.8687240468101171, "grad_norm": 0.7581355571746826, "learning_rate": 9.276379095222665e-06, "loss": 0.216577410697937, "memory(GiB)": 91.64, "step": 9205, "token_acc": 0.91288056206089, "train_speed(iter/s)": 0.138606 }, { "epoch": 0.8691959229898075, "grad_norm": 0.608823835849762, "learning_rate": 9.210851721131398e-06, "loss": 0.21935479640960692, "memory(GiB)": 91.64, "step": 9210, "token_acc": 0.9302580999450851, "train_speed(iter/s)": 0.138604 }, { "epoch": 0.8696677991694979, "grad_norm": 0.22969260811805725, "learning_rate": 9.145545432283353e-06, "loss": 0.21476612091064454, "memory(GiB)": 91.64, "step": 9215, "token_acc": 0.9197501201345507, "train_speed(iter/s)": 0.138602 }, { "epoch": 0.8701396753491883, "grad_norm": 1.0408731698989868, "learning_rate": 9.080460387709145e-06, "loss": 0.2159780502319336, "memory(GiB)": 91.64, "step": 9220, "token_acc": 0.9308084486525856, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.8706115515288788, "grad_norm": 0.20538538694381714, "learning_rate": 9.015596745900679e-06, "loss": 0.21535878181457518, "memory(GiB)": 91.64, "step": 9225, "token_acc": 0.929406574012795, "train_speed(iter/s)": 0.1386 }, { "epoch": 0.8710834277085693, "grad_norm": 0.3050077557563782, "learning_rate": 8.950954664810695e-06, "loss": 0.22024221420288087, "memory(GiB)": 91.64, "step": 9230, "token_acc": 0.9159280830590023, "train_speed(iter/s)": 0.138597 }, { "epoch": 0.8715553038882597, "grad_norm": 0.25116536021232605, "learning_rate": 8.886534301852368e-06, "loss": 0.21032414436340333, "memory(GiB)": 91.64, "step": 9235, "token_acc": 0.9265141318977119, "train_speed(iter/s)": 0.138595 }, { "epoch": 0.8720271800679502, "grad_norm": 0.3281894028186798, "learning_rate": 8.822335813899018e-06, "loss": 0.21384072303771973, "memory(GiB)": 91.64, "step": 9240, "token_acc": 0.9211908931698775, "train_speed(iter/s)": 0.138594 }, { "epoch": 0.8724990562476406, "grad_norm": 0.20488016307353973, "learning_rate": 8.758359357283574e-06, "loss": 0.21094036102294922, "memory(GiB)": 91.64, "step": 9245, "token_acc": 0.9269041769041769, "train_speed(iter/s)": 0.138593 }, { "epoch": 0.872970932427331, "grad_norm": 0.4258732497692108, "learning_rate": 8.69460508779838e-06, "loss": 0.21204228401184083, "memory(GiB)": 91.64, "step": 9250, "token_acc": 0.9133466135458167, "train_speed(iter/s)": 0.138591 }, { "epoch": 0.8734428086070215, "grad_norm": 0.39470452070236206, "learning_rate": 8.631073160694658e-06, "loss": 0.2152315616607666, "memory(GiB)": 91.64, "step": 9255, "token_acc": 0.9243814289685849, "train_speed(iter/s)": 0.138589 }, { "epoch": 0.873914684786712, "grad_norm": 0.26770398020744324, "learning_rate": 8.567763730682221e-06, "loss": 0.21648941040039063, "memory(GiB)": 91.64, "step": 9260, "token_acc": 0.9256169621133125, "train_speed(iter/s)": 0.138587 }, { "epoch": 0.8743865609664024, "grad_norm": 0.31089380383491516, "learning_rate": 8.50467695192907e-06, "loss": 0.20960922241210939, "memory(GiB)": 91.64, "step": 9265, "token_acc": 0.9156517816869644, "train_speed(iter/s)": 0.138585 }, { "epoch": 0.8748584371460929, "grad_norm": 0.23713769018650055, "learning_rate": 8.441812978061015e-06, "loss": 0.211074161529541, "memory(GiB)": 91.64, "step": 9270, "token_acc": 0.9206798866855525, "train_speed(iter/s)": 0.138583 }, { "epoch": 0.8753303133257833, "grad_norm": 0.38536888360977173, "learning_rate": 8.379171962161259e-06, "loss": 0.21424272060394287, "memory(GiB)": 91.64, "step": 9275, "token_acc": 0.9308885754583921, "train_speed(iter/s)": 0.138582 }, { "epoch": 0.8758021895054737, "grad_norm": 0.5602840781211853, "learning_rate": 8.316754056770138e-06, "loss": 0.21376981735229492, "memory(GiB)": 91.64, "step": 9280, "token_acc": 0.9224137931034483, "train_speed(iter/s)": 0.13858 }, { "epoch": 0.8762740656851642, "grad_norm": 0.4108741879463196, "learning_rate": 8.254559413884633e-06, "loss": 0.21517577171325683, "memory(GiB)": 91.64, "step": 9285, "token_acc": 0.9342483242898181, "train_speed(iter/s)": 0.138578 }, { "epoch": 0.8767459418648547, "grad_norm": 0.2960684895515442, "learning_rate": 8.192588184958073e-06, "loss": 0.21525194644927978, "memory(GiB)": 91.64, "step": 9290, "token_acc": 0.9293150684931507, "train_speed(iter/s)": 0.138577 }, { "epoch": 0.8772178180445451, "grad_norm": 0.21378456056118011, "learning_rate": 8.130840520899719e-06, "loss": 0.20404720306396484, "memory(GiB)": 91.64, "step": 9295, "token_acc": 0.9324137931034483, "train_speed(iter/s)": 0.138576 }, { "epoch": 0.8776896942242356, "grad_norm": 0.3066118061542511, "learning_rate": 8.069316572074448e-06, "loss": 0.2152477979660034, "memory(GiB)": 91.64, "step": 9300, "token_acc": 0.9345850999394306, "train_speed(iter/s)": 0.138575 }, { "epoch": 0.8781615704039261, "grad_norm": 0.33726122975349426, "learning_rate": 8.008016488302306e-06, "loss": 0.21046628952026367, "memory(GiB)": 91.64, "step": 9305, "token_acc": 0.9202271498107085, "train_speed(iter/s)": 0.138571 }, { "epoch": 0.8786334465836164, "grad_norm": 0.38846316933631897, "learning_rate": 7.946940418858251e-06, "loss": 0.2146662950515747, "memory(GiB)": 91.64, "step": 9310, "token_acc": 0.9275923718712753, "train_speed(iter/s)": 0.138568 }, { "epoch": 0.8791053227633069, "grad_norm": 0.5795027017593384, "learning_rate": 7.886088512471678e-06, "loss": 0.2132624387741089, "memory(GiB)": 91.64, "step": 9315, "token_acc": 0.923013923013923, "train_speed(iter/s)": 0.138566 }, { "epoch": 0.8795771989429974, "grad_norm": 0.28636613488197327, "learning_rate": 7.825460917326177e-06, "loss": 0.21249852180480958, "memory(GiB)": 91.64, "step": 9320, "token_acc": 0.9045996592844975, "train_speed(iter/s)": 0.138564 }, { "epoch": 0.8800490751226878, "grad_norm": 0.24777673184871674, "learning_rate": 7.765057781059059e-06, "loss": 0.20986220836639405, "memory(GiB)": 91.64, "step": 9325, "token_acc": 0.9265103697024346, "train_speed(iter/s)": 0.138563 }, { "epoch": 0.8805209513023783, "grad_norm": 0.4328806698322296, "learning_rate": 7.704879250761021e-06, "loss": 0.21023178100585938, "memory(GiB)": 91.64, "step": 9330, "token_acc": 0.9189572116746954, "train_speed(iter/s)": 0.138562 }, { "epoch": 0.8809928274820688, "grad_norm": 0.7614713311195374, "learning_rate": 7.644925472975873e-06, "loss": 0.220062255859375, "memory(GiB)": 91.64, "step": 9335, "token_acc": 0.9254629629629629, "train_speed(iter/s)": 0.138561 }, { "epoch": 0.8814647036617591, "grad_norm": 0.6232017874717712, "learning_rate": 7.585196593700105e-06, "loss": 0.2095489978790283, "memory(GiB)": 91.64, "step": 9340, "token_acc": 0.9332292073408824, "train_speed(iter/s)": 0.13856 }, { "epoch": 0.8819365798414496, "grad_norm": 0.396986186504364, "learning_rate": 7.525692758382463e-06, "loss": 0.21920385360717773, "memory(GiB)": 91.64, "step": 9345, "token_acc": 0.9284452296819788, "train_speed(iter/s)": 0.138557 }, { "epoch": 0.88240845602114, "grad_norm": 0.8557764887809753, "learning_rate": 7.466414111923814e-06, "loss": 0.2085169553756714, "memory(GiB)": 91.64, "step": 9350, "token_acc": 0.9175862068965517, "train_speed(iter/s)": 0.138553 }, { "epoch": 0.8828803322008305, "grad_norm": 0.29277458786964417, "learning_rate": 7.407360798676577e-06, "loss": 0.21147971153259276, "memory(GiB)": 91.64, "step": 9355, "token_acc": 0.9102040816326531, "train_speed(iter/s)": 0.13855 }, { "epoch": 0.883352208380521, "grad_norm": 0.5351800322532654, "learning_rate": 7.348532962444421e-06, "loss": 0.21797070503234864, "memory(GiB)": 91.64, "step": 9360, "token_acc": 0.9135326261652202, "train_speed(iter/s)": 0.138548 }, { "epoch": 0.8838240845602114, "grad_norm": 0.2421572506427765, "learning_rate": 7.28993074648201e-06, "loss": 0.21367790699005126, "memory(GiB)": 91.64, "step": 9365, "token_acc": 0.9238447319778188, "train_speed(iter/s)": 0.138546 }, { "epoch": 0.8842959607399018, "grad_norm": 0.43830910325050354, "learning_rate": 7.231554293494547e-06, "loss": 0.214955472946167, "memory(GiB)": 91.64, "step": 9370, "token_acc": 0.9115107913669065, "train_speed(iter/s)": 0.138544 }, { "epoch": 0.8847678369195923, "grad_norm": 0.368368536233902, "learning_rate": 7.173403745637497e-06, "loss": 0.20797204971313477, "memory(GiB)": 91.64, "step": 9375, "token_acc": 0.9261330194231902, "train_speed(iter/s)": 0.138543 }, { "epoch": 0.8852397130992827, "grad_norm": 0.682052493095398, "learning_rate": 7.115479244516199e-06, "loss": 0.2070131778717041, "memory(GiB)": 91.64, "step": 9380, "token_acc": 0.9177962396152164, "train_speed(iter/s)": 0.138541 }, { "epoch": 0.8857115892789732, "grad_norm": 0.325766384601593, "learning_rate": 7.0577809311855425e-06, "loss": 0.21118898391723634, "memory(GiB)": 91.64, "step": 9385, "token_acc": 0.9205384281372123, "train_speed(iter/s)": 0.138539 }, { "epoch": 0.8861834654586637, "grad_norm": 0.2540743947029114, "learning_rate": 7.00030894614957e-06, "loss": 0.21211137771606445, "memory(GiB)": 91.64, "step": 9390, "token_acc": 0.9358490566037736, "train_speed(iter/s)": 0.138538 }, { "epoch": 0.8866553416383541, "grad_norm": 0.27310407161712646, "learning_rate": 6.94306342936123e-06, "loss": 0.21425786018371581, "memory(GiB)": 91.64, "step": 9395, "token_acc": 0.9352612553401248, "train_speed(iter/s)": 0.138535 }, { "epoch": 0.8871272178180446, "grad_norm": 0.2509388327598572, "learning_rate": 6.88604452022199e-06, "loss": 0.20922818183898925, "memory(GiB)": 91.64, "step": 9400, "token_acc": 0.9203892083149049, "train_speed(iter/s)": 0.138534 }, { "epoch": 0.887599093997735, "grad_norm": 0.3544827103614807, "learning_rate": 6.829252357581462e-06, "loss": 0.22144675254821777, "memory(GiB)": 91.64, "step": 9405, "token_acc": 0.9165378670788253, "train_speed(iter/s)": 0.138532 }, { "epoch": 0.8880709701774254, "grad_norm": 0.5197407603263855, "learning_rate": 6.772687079737139e-06, "loss": 0.21385698318481444, "memory(GiB)": 91.64, "step": 9410, "token_acc": 0.923288424525708, "train_speed(iter/s)": 0.138529 }, { "epoch": 0.8885428463571159, "grad_norm": 0.23393069207668304, "learning_rate": 6.716348824433949e-06, "loss": 0.2115368127822876, "memory(GiB)": 91.64, "step": 9415, "token_acc": 0.9165676959619953, "train_speed(iter/s)": 0.138527 }, { "epoch": 0.8890147225368064, "grad_norm": 0.5718517303466797, "learning_rate": 6.660237728864039e-06, "loss": 0.22244482040405272, "memory(GiB)": 91.64, "step": 9420, "token_acc": 0.9183126883160362, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.8894865987164968, "grad_norm": 0.37451472878456116, "learning_rate": 6.604353929666384e-06, "loss": 0.2113950252532959, "memory(GiB)": 91.64, "step": 9425, "token_acc": 0.9068877551020408, "train_speed(iter/s)": 0.138526 }, { "epoch": 0.8899584748961873, "grad_norm": 0.23787033557891846, "learning_rate": 6.548697562926431e-06, "loss": 0.21541337966918944, "memory(GiB)": 91.64, "step": 9430, "token_acc": 0.9162337662337663, "train_speed(iter/s)": 0.138524 }, { "epoch": 0.8904303510758776, "grad_norm": 0.5333724617958069, "learning_rate": 6.49326876417583e-06, "loss": 0.20914788246154786, "memory(GiB)": 91.64, "step": 9435, "token_acc": 0.9208791208791208, "train_speed(iter/s)": 0.138522 }, { "epoch": 0.8909022272555681, "grad_norm": 0.24550431966781616, "learning_rate": 6.438067668392045e-06, "loss": 0.20611090660095216, "memory(GiB)": 91.64, "step": 9440, "token_acc": 0.9329292929292929, "train_speed(iter/s)": 0.138521 }, { "epoch": 0.8913741034352586, "grad_norm": 0.39244842529296875, "learning_rate": 6.383094409998036e-06, "loss": 0.21288225650787354, "memory(GiB)": 91.64, "step": 9445, "token_acc": 0.9141078838174274, "train_speed(iter/s)": 0.13852 }, { "epoch": 0.891845979614949, "grad_norm": 0.2563187777996063, "learning_rate": 6.3283491228619875e-06, "loss": 0.21205806732177734, "memory(GiB)": 91.64, "step": 9450, "token_acc": 0.9324095978371071, "train_speed(iter/s)": 0.138519 }, { "epoch": 0.8923178557946395, "grad_norm": 0.2973138093948364, "learning_rate": 6.273831940296904e-06, "loss": 0.20941767692565919, "memory(GiB)": 91.64, "step": 9455, "token_acc": 0.9111757105943152, "train_speed(iter/s)": 0.138516 }, { "epoch": 0.89278973197433, "grad_norm": 0.4202001392841339, "learning_rate": 6.219542995060313e-06, "loss": 0.21974601745605468, "memory(GiB)": 91.64, "step": 9460, "token_acc": 0.9231898238747553, "train_speed(iter/s)": 0.138515 }, { "epoch": 0.8932616081540203, "grad_norm": 0.7754381895065308, "learning_rate": 6.165482419353996e-06, "loss": 0.20998201370239258, "memory(GiB)": 91.64, "step": 9465, "token_acc": 0.9214920071047957, "train_speed(iter/s)": 0.138514 }, { "epoch": 0.8937334843337108, "grad_norm": 0.2756913900375366, "learning_rate": 6.1116503448236054e-06, "loss": 0.2183704376220703, "memory(GiB)": 91.64, "step": 9470, "token_acc": 0.9101675332177932, "train_speed(iter/s)": 0.138512 }, { "epoch": 0.8942053605134013, "grad_norm": 0.2889973223209381, "learning_rate": 6.058046902558301e-06, "loss": 0.2072589874267578, "memory(GiB)": 91.64, "step": 9475, "token_acc": 0.9353671147880042, "train_speed(iter/s)": 0.138509 }, { "epoch": 0.8946772366930917, "grad_norm": 0.3864864110946655, "learning_rate": 6.004672223090568e-06, "loss": 0.21283507347106934, "memory(GiB)": 91.64, "step": 9480, "token_acc": 0.9110769230769231, "train_speed(iter/s)": 0.138507 }, { "epoch": 0.8951491128727822, "grad_norm": 0.25138232111930847, "learning_rate": 5.951526436395782e-06, "loss": 0.21158528327941895, "memory(GiB)": 91.64, "step": 9485, "token_acc": 0.9071403447062961, "train_speed(iter/s)": 0.138506 }, { "epoch": 0.8956209890524727, "grad_norm": 0.4256957173347473, "learning_rate": 5.898609671891897e-06, "loss": 0.21031410694122316, "memory(GiB)": 91.64, "step": 9490, "token_acc": 0.9060980172091283, "train_speed(iter/s)": 0.138504 }, { "epoch": 0.896092865232163, "grad_norm": 0.361007422208786, "learning_rate": 5.845922058439268e-06, "loss": 0.21310253143310548, "memory(GiB)": 91.64, "step": 9495, "token_acc": 0.9223516361619523, "train_speed(iter/s)": 0.138501 }, { "epoch": 0.8965647414118535, "grad_norm": 0.5840691328048706, "learning_rate": 5.793463724340109e-06, "loss": 0.22230072021484376, "memory(GiB)": 91.64, "step": 9500, "token_acc": 0.9187408491947291, "train_speed(iter/s)": 0.1385 }, { "epoch": 0.897036617591544, "grad_norm": 0.2761625647544861, "learning_rate": 5.741234797338391e-06, "loss": 0.2137458324432373, "memory(GiB)": 91.64, "step": 9505, "token_acc": 0.9283000949667616, "train_speed(iter/s)": 0.138499 }, { "epoch": 0.8975084937712344, "grad_norm": 0.6072384715080261, "learning_rate": 5.689235404619387e-06, "loss": 0.21013424396514893, "memory(GiB)": 91.64, "step": 9510, "token_acc": 0.9139955569660425, "train_speed(iter/s)": 0.138498 }, { "epoch": 0.8979803699509249, "grad_norm": 0.25448301434516907, "learning_rate": 5.637465672809483e-06, "loss": 0.2110156536102295, "memory(GiB)": 91.64, "step": 9515, "token_acc": 0.9319535904672311, "train_speed(iter/s)": 0.138496 }, { "epoch": 0.8984522461306154, "grad_norm": 0.3504413962364197, "learning_rate": 5.585925727975727e-06, "loss": 0.20918526649475097, "memory(GiB)": 91.64, "step": 9520, "token_acc": 0.9220079410096427, "train_speed(iter/s)": 0.138495 }, { "epoch": 0.8989241223103058, "grad_norm": 0.3817376494407654, "learning_rate": 5.534615695625689e-06, "loss": 0.21109294891357422, "memory(GiB)": 91.64, "step": 9525, "token_acc": 0.9241635687732342, "train_speed(iter/s)": 0.138493 }, { "epoch": 0.8993959984899962, "grad_norm": 0.7123754620552063, "learning_rate": 5.4835357007069765e-06, "loss": 0.21664328575134278, "memory(GiB)": 91.64, "step": 9530, "token_acc": 0.9258387286639199, "train_speed(iter/s)": 0.138492 }, { "epoch": 0.8998678746696867, "grad_norm": 0.4179804027080536, "learning_rate": 5.43268586760709e-06, "loss": 0.2076176404953003, "memory(GiB)": 91.64, "step": 9535, "token_acc": 0.9145885286783042, "train_speed(iter/s)": 0.13849 }, { "epoch": 0.9003397508493771, "grad_norm": 0.6187090873718262, "learning_rate": 5.382066320153046e-06, "loss": 0.21339232921600343, "memory(GiB)": 91.64, "step": 9540, "token_acc": 0.9352548036758563, "train_speed(iter/s)": 0.138487 }, { "epoch": 0.9008116270290676, "grad_norm": 0.23456265032291412, "learning_rate": 5.331677181611006e-06, "loss": 0.21850805282592772, "memory(GiB)": 91.64, "step": 9545, "token_acc": 0.9417892156862745, "train_speed(iter/s)": 0.138486 }, { "epoch": 0.901283503208758, "grad_norm": 0.3491338789463043, "learning_rate": 5.281518574686162e-06, "loss": 0.21774368286132811, "memory(GiB)": 91.64, "step": 9550, "token_acc": 0.9337481698389458, "train_speed(iter/s)": 0.138484 }, { "epoch": 0.9017553793884485, "grad_norm": 0.35306277871131897, "learning_rate": 5.231590621522275e-06, "loss": 0.21966407299041749, "memory(GiB)": 91.64, "step": 9555, "token_acc": 0.9345238095238095, "train_speed(iter/s)": 0.138482 }, { "epoch": 0.9022272555681389, "grad_norm": 0.3036733567714691, "learning_rate": 5.18189344370138e-06, "loss": 0.21493420600891114, "memory(GiB)": 91.64, "step": 9560, "token_acc": 0.9334840167904424, "train_speed(iter/s)": 0.138479 }, { "epoch": 0.9026991317478293, "grad_norm": 0.38445791602134705, "learning_rate": 5.132427162243625e-06, "loss": 0.20835418701171876, "memory(GiB)": 91.64, "step": 9565, "token_acc": 0.9253386190948133, "train_speed(iter/s)": 0.138477 }, { "epoch": 0.9031710079275198, "grad_norm": 0.5530850291252136, "learning_rate": 5.083191897606843e-06, "loss": 0.21398649215698243, "memory(GiB)": 91.64, "step": 9570, "token_acc": 0.9331123832479663, "train_speed(iter/s)": 0.138475 }, { "epoch": 0.9036428841072103, "grad_norm": 0.2753889858722687, "learning_rate": 5.034187769686283e-06, "loss": 0.2125246524810791, "memory(GiB)": 91.64, "step": 9575, "token_acc": 0.9422946367956552, "train_speed(iter/s)": 0.138474 }, { "epoch": 0.9041147602869007, "grad_norm": 0.332717627286911, "learning_rate": 4.985414897814444e-06, "loss": 0.2124195098876953, "memory(GiB)": 91.64, "step": 9580, "token_acc": 0.9092827004219409, "train_speed(iter/s)": 0.138473 }, { "epoch": 0.9045866364665912, "grad_norm": 0.2855324149131775, "learning_rate": 4.936873400760544e-06, "loss": 0.21741337776184083, "memory(GiB)": 91.64, "step": 9585, "token_acc": 0.9338592233009708, "train_speed(iter/s)": 0.138472 }, { "epoch": 0.9050585126462816, "grad_norm": 0.2555350065231323, "learning_rate": 4.888563396730461e-06, "loss": 0.21048321723937988, "memory(GiB)": 91.64, "step": 9590, "token_acc": 0.921410365335599, "train_speed(iter/s)": 0.138469 }, { "epoch": 0.905530388825972, "grad_norm": 0.24384169280529022, "learning_rate": 4.840485003366324e-06, "loss": 0.21017694473266602, "memory(GiB)": 91.64, "step": 9595, "token_acc": 0.9220325833979829, "train_speed(iter/s)": 0.138468 }, { "epoch": 0.9060022650056625, "grad_norm": 0.5031404495239258, "learning_rate": 4.79263833774628e-06, "loss": 0.21278533935546876, "memory(GiB)": 91.64, "step": 9600, "token_acc": 0.9458646616541353, "train_speed(iter/s)": 0.138465 }, { "epoch": 0.906474141185353, "grad_norm": 0.26602670550346375, "learning_rate": 4.745023516384117e-06, "loss": 0.2099222183227539, "memory(GiB)": 91.64, "step": 9605, "token_acc": 0.9246897732135216, "train_speed(iter/s)": 0.138464 }, { "epoch": 0.9069460173650434, "grad_norm": 0.3199900984764099, "learning_rate": 4.6976406552291515e-06, "loss": 0.2130331039428711, "memory(GiB)": 91.64, "step": 9610, "token_acc": 0.9132743362831859, "train_speed(iter/s)": 0.138463 }, { "epoch": 0.9074178935447339, "grad_norm": 0.4659852385520935, "learning_rate": 4.650489869665731e-06, "loss": 0.21802825927734376, "memory(GiB)": 91.64, "step": 9615, "token_acc": 0.9254470426409904, "train_speed(iter/s)": 0.13846 }, { "epoch": 0.9078897697244243, "grad_norm": 0.23172196745872498, "learning_rate": 4.603571274513141e-06, "loss": 0.21299545764923095, "memory(GiB)": 91.64, "step": 9620, "token_acc": 0.9347454731807311, "train_speed(iter/s)": 0.138459 }, { "epoch": 0.9083616459041147, "grad_norm": 0.24915564060211182, "learning_rate": 4.556884984025234e-06, "loss": 0.2165134906768799, "memory(GiB)": 91.64, "step": 9625, "token_acc": 0.9073196419167984, "train_speed(iter/s)": 0.138457 }, { "epoch": 0.9088335220838052, "grad_norm": 0.26748067140579224, "learning_rate": 4.510431111890134e-06, "loss": 0.2091744899749756, "memory(GiB)": 91.64, "step": 9630, "token_acc": 0.9328651685393259, "train_speed(iter/s)": 0.138456 }, { "epoch": 0.9093053982634957, "grad_norm": 0.4371824860572815, "learning_rate": 4.4642097712299995e-06, "loss": 0.21007180213928223, "memory(GiB)": 91.64, "step": 9635, "token_acc": 0.9227008860372746, "train_speed(iter/s)": 0.138454 }, { "epoch": 0.9097772744431861, "grad_norm": 0.4615022540092468, "learning_rate": 4.418221074600792e-06, "loss": 0.20864152908325195, "memory(GiB)": 91.64, "step": 9640, "token_acc": 0.92157712305026, "train_speed(iter/s)": 0.138453 }, { "epoch": 0.9102491506228766, "grad_norm": 0.5096688866615295, "learning_rate": 4.372465133991888e-06, "loss": 0.21734046936035156, "memory(GiB)": 91.64, "step": 9645, "token_acc": 0.927038626609442, "train_speed(iter/s)": 0.138452 }, { "epoch": 0.9107210268025671, "grad_norm": 0.21919603645801544, "learning_rate": 4.326942060825889e-06, "loss": 0.21668176651000975, "memory(GiB)": 91.64, "step": 9650, "token_acc": 0.9174714661984197, "train_speed(iter/s)": 0.13845 }, { "epoch": 0.9111929029822574, "grad_norm": 0.761786937713623, "learning_rate": 4.281651965958355e-06, "loss": 0.21059024333953857, "memory(GiB)": 91.64, "step": 9655, "token_acc": 0.9291750503018109, "train_speed(iter/s)": 0.138448 }, { "epoch": 0.9116647791619479, "grad_norm": 0.22924518585205078, "learning_rate": 4.236594959677454e-06, "loss": 0.21478679180145263, "memory(GiB)": 91.64, "step": 9660, "token_acc": 0.9087285771223595, "train_speed(iter/s)": 0.138446 }, { "epoch": 0.9121366553416383, "grad_norm": 0.568040132522583, "learning_rate": 4.191771151703794e-06, "loss": 0.2100062847137451, "memory(GiB)": 91.64, "step": 9665, "token_acc": 0.9274411424160556, "train_speed(iter/s)": 0.138444 }, { "epoch": 0.9126085315213288, "grad_norm": 0.5014088153839111, "learning_rate": 4.147180651190085e-06, "loss": 0.21394610404968262, "memory(GiB)": 91.64, "step": 9670, "token_acc": 0.9418457648546145, "train_speed(iter/s)": 0.138442 }, { "epoch": 0.9130804077010193, "grad_norm": 0.4113982617855072, "learning_rate": 4.102823566720926e-06, "loss": 0.21270480155944824, "memory(GiB)": 91.64, "step": 9675, "token_acc": 0.9344614558152028, "train_speed(iter/s)": 0.13844 }, { "epoch": 0.9135522838807097, "grad_norm": 0.36646685004234314, "learning_rate": 4.058700006312488e-06, "loss": 0.20875248908996583, "memory(GiB)": 91.64, "step": 9680, "token_acc": 0.9254823685961411, "train_speed(iter/s)": 0.138438 }, { "epoch": 0.9140241600604001, "grad_norm": 0.37394624948501587, "learning_rate": 4.014810077412279e-06, "loss": 0.20997467041015624, "memory(GiB)": 91.64, "step": 9685, "token_acc": 0.9180238870792616, "train_speed(iter/s)": 0.138438 }, { "epoch": 0.9144960362400906, "grad_norm": 0.28262215852737427, "learning_rate": 3.9711538868988815e-06, "loss": 0.2138798475265503, "memory(GiB)": 91.64, "step": 9690, "token_acc": 0.9245947850599013, "train_speed(iter/s)": 0.138438 }, { "epoch": 0.914967912419781, "grad_norm": 0.24222835898399353, "learning_rate": 3.927731541081692e-06, "loss": 0.21181824207305908, "memory(GiB)": 91.64, "step": 9695, "token_acc": 0.9092514124293786, "train_speed(iter/s)": 0.138437 }, { "epoch": 0.9154397885994715, "grad_norm": 0.2124110907316208, "learning_rate": 3.884543145700659e-06, "loss": 0.209627103805542, "memory(GiB)": 91.64, "step": 9700, "token_acc": 0.9199381761978361, "train_speed(iter/s)": 0.138436 }, { "epoch": 0.915911664779162, "grad_norm": 0.38374340534210205, "learning_rate": 3.841588805926033e-06, "loss": 0.21204769611358643, "memory(GiB)": 91.64, "step": 9705, "token_acc": 0.9301503094606542, "train_speed(iter/s)": 0.138434 }, { "epoch": 0.9163835409588524, "grad_norm": 0.22321577370166779, "learning_rate": 3.7988686263580985e-06, "loss": 0.21288986206054689, "memory(GiB)": 91.64, "step": 9710, "token_acc": 0.915719696969697, "train_speed(iter/s)": 0.138432 }, { "epoch": 0.9168554171385428, "grad_norm": 0.3166010081768036, "learning_rate": 3.7563827110269177e-06, "loss": 0.20771732330322265, "memory(GiB)": 91.64, "step": 9715, "token_acc": 0.9292553191489362, "train_speed(iter/s)": 0.138432 }, { "epoch": 0.9173272933182333, "grad_norm": 0.4344588816165924, "learning_rate": 3.714131163392065e-06, "loss": 0.212508225440979, "memory(GiB)": 91.64, "step": 9720, "token_acc": 0.9188432835820896, "train_speed(iter/s)": 0.13843 }, { "epoch": 0.9177991694979237, "grad_norm": 0.3643015921115875, "learning_rate": 3.6721140863424817e-06, "loss": 0.21447608470916749, "memory(GiB)": 91.64, "step": 9725, "token_acc": 0.9209919261822376, "train_speed(iter/s)": 0.138429 }, { "epoch": 0.9182710456776142, "grad_norm": 0.45395082235336304, "learning_rate": 3.6303315821960227e-06, "loss": 0.21562774181365968, "memory(GiB)": 91.64, "step": 9730, "token_acc": 0.9170403587443946, "train_speed(iter/s)": 0.138427 }, { "epoch": 0.9187429218573047, "grad_norm": 0.6889760494232178, "learning_rate": 3.5887837526993983e-06, "loss": 0.22067337036132811, "memory(GiB)": 91.64, "step": 9735, "token_acc": 0.9127798507462687, "train_speed(iter/s)": 0.138426 }, { "epoch": 0.9192147980369951, "grad_norm": 0.3173013925552368, "learning_rate": 3.5474706990278217e-06, "loss": 0.21371016502380372, "memory(GiB)": 91.64, "step": 9740, "token_acc": 0.923474329996771, "train_speed(iter/s)": 0.138422 }, { "epoch": 0.9196866742166856, "grad_norm": 0.25628921389579773, "learning_rate": 3.506392521784796e-06, "loss": 0.21721768379211426, "memory(GiB)": 91.64, "step": 9745, "token_acc": 0.9235836627140975, "train_speed(iter/s)": 0.13842 }, { "epoch": 0.920158550396376, "grad_norm": 0.26340246200561523, "learning_rate": 3.4655493210018484e-06, "loss": 0.20722723007202148, "memory(GiB)": 91.64, "step": 9750, "token_acc": 0.9288334556126192, "train_speed(iter/s)": 0.138418 }, { "epoch": 0.9206304265760664, "grad_norm": 0.5831912159919739, "learning_rate": 3.424941196138376e-06, "loss": 0.21827468872070313, "memory(GiB)": 91.64, "step": 9755, "token_acc": 0.9169483341380975, "train_speed(iter/s)": 0.138417 }, { "epoch": 0.9211023027557569, "grad_norm": 0.22615396976470947, "learning_rate": 3.384568246081221e-06, "loss": 0.2103797435760498, "memory(GiB)": 91.64, "step": 9760, "token_acc": 0.9311565696302124, "train_speed(iter/s)": 0.138415 }, { "epoch": 0.9215741789354474, "grad_norm": 0.4777851700782776, "learning_rate": 3.3444305691446075e-06, "loss": 0.21615819931030272, "memory(GiB)": 91.64, "step": 9765, "token_acc": 0.9326971371170266, "train_speed(iter/s)": 0.138413 }, { "epoch": 0.9220460551151378, "grad_norm": 0.2671146094799042, "learning_rate": 3.3045282630698506e-06, "loss": 0.21225442886352539, "memory(GiB)": 91.64, "step": 9770, "token_acc": 0.9126576366184026, "train_speed(iter/s)": 0.138412 }, { "epoch": 0.9225179312948283, "grad_norm": 0.24419555068016052, "learning_rate": 3.264861425025034e-06, "loss": 0.20959196090698243, "memory(GiB)": 91.64, "step": 9775, "token_acc": 0.9362338093656593, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.9229898074745186, "grad_norm": 0.3162088394165039, "learning_rate": 3.2254301516049025e-06, "loss": 0.21189465522766113, "memory(GiB)": 91.64, "step": 9780, "token_acc": 0.9261028378758078, "train_speed(iter/s)": 0.13841 }, { "epoch": 0.9234616836542091, "grad_norm": 0.2638489305973053, "learning_rate": 3.1862345388305237e-06, "loss": 0.2071277379989624, "memory(GiB)": 91.64, "step": 9785, "token_acc": 0.9252124645892351, "train_speed(iter/s)": 0.138408 }, { "epoch": 0.9239335598338996, "grad_norm": 0.4801633656024933, "learning_rate": 3.1472746821491373e-06, "loss": 0.2107628345489502, "memory(GiB)": 91.64, "step": 9790, "token_acc": 0.9388661202185792, "train_speed(iter/s)": 0.138406 }, { "epoch": 0.92440543601359, "grad_norm": 0.3137304484844208, "learning_rate": 3.1085506764338524e-06, "loss": 0.20806145668029785, "memory(GiB)": 91.64, "step": 9795, "token_acc": 0.9370592844084618, "train_speed(iter/s)": 0.138404 }, { "epoch": 0.9248773121932805, "grad_norm": 0.4165758788585663, "learning_rate": 3.070062615983449e-06, "loss": 0.21570463180541993, "memory(GiB)": 91.64, "step": 9800, "token_acc": 0.9288702928870293, "train_speed(iter/s)": 0.138403 }, { "epoch": 0.925349188372971, "grad_norm": 0.26783013343811035, "learning_rate": 3.031810594522133e-06, "loss": 0.21047744750976563, "memory(GiB)": 91.64, "step": 9805, "token_acc": 0.9246785058175138, "train_speed(iter/s)": 0.138401 }, { "epoch": 0.9258210645526613, "grad_norm": 0.3021276593208313, "learning_rate": 2.99379470519936e-06, "loss": 0.20863080024719238, "memory(GiB)": 91.64, "step": 9810, "token_acc": 0.9242424242424242, "train_speed(iter/s)": 0.138398 }, { "epoch": 0.9262929407323518, "grad_norm": 0.7376922965049744, "learning_rate": 2.9560150405895325e-06, "loss": 0.21465864181518554, "memory(GiB)": 91.64, "step": 9815, "token_acc": 0.9263358778625954, "train_speed(iter/s)": 0.138396 }, { "epoch": 0.9267648169120423, "grad_norm": 0.33614635467529297, "learning_rate": 2.9184716926918263e-06, "loss": 0.20645856857299805, "memory(GiB)": 91.64, "step": 9820, "token_acc": 0.9207180254300673, "train_speed(iter/s)": 0.138394 }, { "epoch": 0.9272366930917327, "grad_norm": 0.24106520414352417, "learning_rate": 2.8811647529299436e-06, "loss": 0.20869898796081543, "memory(GiB)": 91.64, "step": 9825, "token_acc": 0.9237107110161733, "train_speed(iter/s)": 0.138393 }, { "epoch": 0.9277085692714232, "grad_norm": 0.2242364138364792, "learning_rate": 2.8440943121518905e-06, "loss": 0.20895557403564452, "memory(GiB)": 91.64, "step": 9830, "token_acc": 0.9298651252408477, "train_speed(iter/s)": 0.138391 }, { "epoch": 0.9281804454511137, "grad_norm": 0.401591420173645, "learning_rate": 2.807260460629768e-06, "loss": 0.20607337951660157, "memory(GiB)": 91.64, "step": 9835, "token_acc": 0.9263485477178424, "train_speed(iter/s)": 0.138391 }, { "epoch": 0.928652321630804, "grad_norm": 0.37716448307037354, "learning_rate": 2.7706632880595716e-06, "loss": 0.21465234756469725, "memory(GiB)": 91.64, "step": 9840, "token_acc": 0.9250949257852952, "train_speed(iter/s)": 0.138389 }, { "epoch": 0.9291241978104945, "grad_norm": 0.5281177759170532, "learning_rate": 2.734302883560902e-06, "loss": 0.21172585487365722, "memory(GiB)": 91.64, "step": 9845, "token_acc": 0.9257907542579076, "train_speed(iter/s)": 0.138387 }, { "epoch": 0.929596073990185, "grad_norm": 0.30686765909194946, "learning_rate": 2.6981793356768314e-06, "loss": 0.21002044677734374, "memory(GiB)": 91.64, "step": 9850, "token_acc": 0.9194915254237288, "train_speed(iter/s)": 0.138385 }, { "epoch": 0.9300679501698754, "grad_norm": 0.42280927300453186, "learning_rate": 2.662292732373639e-06, "loss": 0.2111431121826172, "memory(GiB)": 91.64, "step": 9855, "token_acc": 0.9319787985865724, "train_speed(iter/s)": 0.138384 }, { "epoch": 0.9305398263495659, "grad_norm": 0.19301030039787292, "learning_rate": 2.6266431610405984e-06, "loss": 0.20500037670135499, "memory(GiB)": 91.64, "step": 9860, "token_acc": 0.9368821292775665, "train_speed(iter/s)": 0.138383 }, { "epoch": 0.9310117025292564, "grad_norm": 0.27901899814605713, "learning_rate": 2.591230708489778e-06, "loss": 0.2096851348876953, "memory(GiB)": 91.64, "step": 9865, "token_acc": 0.928513731825525, "train_speed(iter/s)": 0.138382 }, { "epoch": 0.9314835787089468, "grad_norm": 0.26552677154541016, "learning_rate": 2.5560554609558417e-06, "loss": 0.2152176856994629, "memory(GiB)": 91.64, "step": 9870, "token_acc": 0.9420463629096723, "train_speed(iter/s)": 0.13838 }, { "epoch": 0.9319554548886372, "grad_norm": 0.3852667808532715, "learning_rate": 2.5211175040958048e-06, "loss": 0.20902786254882813, "memory(GiB)": 91.64, "step": 9875, "token_acc": 0.9132697311361665, "train_speed(iter/s)": 0.138379 }, { "epoch": 0.9324273310683276, "grad_norm": 0.3729971647262573, "learning_rate": 2.4864169229888654e-06, "loss": 0.2123638153076172, "memory(GiB)": 91.64, "step": 9880, "token_acc": 0.9168326693227091, "train_speed(iter/s)": 0.138378 }, { "epoch": 0.9328992072480181, "grad_norm": 0.3718613386154175, "learning_rate": 2.4519538021361422e-06, "loss": 0.21329360008239745, "memory(GiB)": 91.64, "step": 9885, "token_acc": 0.9229032258064516, "train_speed(iter/s)": 0.138376 }, { "epoch": 0.9333710834277086, "grad_norm": 0.23476502299308777, "learning_rate": 2.417728225460525e-06, "loss": 0.20669341087341309, "memory(GiB)": 91.64, "step": 9890, "token_acc": 0.9244951712028094, "train_speed(iter/s)": 0.138376 }, { "epoch": 0.933842959607399, "grad_norm": 0.3839211165904999, "learning_rate": 2.3837402763064567e-06, "loss": 0.20854783058166504, "memory(GiB)": 91.64, "step": 9895, "token_acc": 0.9435273675065161, "train_speed(iter/s)": 0.138374 }, { "epoch": 0.9343148357870895, "grad_norm": 0.4082486033439636, "learning_rate": 2.349990037439709e-06, "loss": 0.21600706577301027, "memory(GiB)": 91.64, "step": 9900, "token_acc": 0.931433659839715, "train_speed(iter/s)": 0.138373 }, { "epoch": 0.9347867119667799, "grad_norm": 0.23630106449127197, "learning_rate": 2.3164775910471834e-06, "loss": 0.2045605182647705, "memory(GiB)": 91.64, "step": 9905, "token_acc": 0.9155844155844156, "train_speed(iter/s)": 0.13837 }, { "epoch": 0.9352585881464703, "grad_norm": 0.26476147770881653, "learning_rate": 2.283203018736757e-06, "loss": 0.21763415336608888, "memory(GiB)": 91.64, "step": 9910, "token_acc": 0.9334708612686952, "train_speed(iter/s)": 0.13837 }, { "epoch": 0.9357304643261608, "grad_norm": 0.23421037197113037, "learning_rate": 2.2501664015369906e-06, "loss": 0.21492888927459716, "memory(GiB)": 91.64, "step": 9915, "token_acc": 0.9112426035502958, "train_speed(iter/s)": 0.138369 }, { "epoch": 0.9362023405058513, "grad_norm": 0.5269423127174377, "learning_rate": 2.2173678198970316e-06, "loss": 0.2087617874145508, "memory(GiB)": 91.64, "step": 9920, "token_acc": 0.9160095989029825, "train_speed(iter/s)": 0.138366 }, { "epoch": 0.9366742166855417, "grad_norm": 0.3009715676307678, "learning_rate": 2.1848073536863577e-06, "loss": 0.20663909912109374, "memory(GiB)": 91.64, "step": 9925, "token_acc": 0.920963260265514, "train_speed(iter/s)": 0.138365 }, { "epoch": 0.9371460928652322, "grad_norm": 0.31081992387771606, "learning_rate": 2.152485082194633e-06, "loss": 0.21198580265045167, "memory(GiB)": 91.64, "step": 9930, "token_acc": 0.9265060240963855, "train_speed(iter/s)": 0.138362 }, { "epoch": 0.9376179690449226, "grad_norm": 0.21676135063171387, "learning_rate": 2.120401084131418e-06, "loss": 0.2091569185256958, "memory(GiB)": 91.64, "step": 9935, "token_acc": 0.9232542819499341, "train_speed(iter/s)": 0.13836 }, { "epoch": 0.938089845224613, "grad_norm": 0.6019723415374756, "learning_rate": 2.0885554376261164e-06, "loss": 0.2074450969696045, "memory(GiB)": 91.64, "step": 9940, "token_acc": 0.9200394866732478, "train_speed(iter/s)": 0.13836 }, { "epoch": 0.9385617214043035, "grad_norm": 0.37886330485343933, "learning_rate": 2.056948220227639e-06, "loss": 0.2108628273010254, "memory(GiB)": 91.64, "step": 9945, "token_acc": 0.9312602291325696, "train_speed(iter/s)": 0.138359 }, { "epoch": 0.939033597583994, "grad_norm": 0.2426460087299347, "learning_rate": 2.0255795089043296e-06, "loss": 0.21312596797943115, "memory(GiB)": 91.64, "step": 9950, "token_acc": 0.929093567251462, "train_speed(iter/s)": 0.138357 }, { "epoch": 0.9395054737636844, "grad_norm": 0.38751596212387085, "learning_rate": 1.994449380043717e-06, "loss": 0.2076961040496826, "memory(GiB)": 91.64, "step": 9955, "token_acc": 0.9129967776584318, "train_speed(iter/s)": 0.138358 }, { "epoch": 0.9399773499433749, "grad_norm": 0.2836040258407593, "learning_rate": 1.9635579094523514e-06, "loss": 0.20892977714538574, "memory(GiB)": 91.64, "step": 9960, "token_acc": 0.9155970439517698, "train_speed(iter/s)": 0.138356 }, { "epoch": 0.9404492261230654, "grad_norm": 0.34263989329338074, "learning_rate": 1.932905172355637e-06, "loss": 0.21328263282775878, "memory(GiB)": 91.64, "step": 9965, "token_acc": 0.9321761491481839, "train_speed(iter/s)": 0.138354 }, { "epoch": 0.9409211023027557, "grad_norm": 0.3382331430912018, "learning_rate": 1.902491243397575e-06, "loss": 0.20347208976745607, "memory(GiB)": 91.64, "step": 9970, "token_acc": 0.9281078382426361, "train_speed(iter/s)": 0.138352 }, { "epoch": 0.9413929784824462, "grad_norm": 0.4317174255847931, "learning_rate": 1.8723161966406777e-06, "loss": 0.21226215362548828, "memory(GiB)": 91.64, "step": 9975, "token_acc": 0.9255702280912365, "train_speed(iter/s)": 0.138351 }, { "epoch": 0.9418648546621367, "grad_norm": 0.252453088760376, "learning_rate": 1.842380105565711e-06, "loss": 0.20707168579101562, "memory(GiB)": 91.64, "step": 9980, "token_acc": 0.930406852248394, "train_speed(iter/s)": 0.138351 }, { "epoch": 0.9423367308418271, "grad_norm": 0.2883821725845337, "learning_rate": 1.8126830430715724e-06, "loss": 0.20754437446594237, "memory(GiB)": 91.64, "step": 9985, "token_acc": 0.9166115155526141, "train_speed(iter/s)": 0.13835 }, { "epoch": 0.9428086070215176, "grad_norm": 0.2774185538291931, "learning_rate": 1.7832250814750817e-06, "loss": 0.21275861263275148, "memory(GiB)": 91.64, "step": 9990, "token_acc": 0.9190020505809979, "train_speed(iter/s)": 0.138351 }, { "epoch": 0.943280483201208, "grad_norm": 0.54234379529953, "learning_rate": 1.7540062925108126e-06, "loss": 0.21790072917938233, "memory(GiB)": 91.64, "step": 9995, "token_acc": 0.9157351676698194, "train_speed(iter/s)": 0.13835 }, { "epoch": 0.9437523593808984, "grad_norm": 0.3133432865142822, "learning_rate": 1.7250267473309046e-06, "loss": 0.21021485328674316, "memory(GiB)": 91.64, "step": 10000, "token_acc": 0.9144818976279651, "train_speed(iter/s)": 0.138348 }, { "epoch": 0.9442242355605889, "grad_norm": 0.8445714712142944, "learning_rate": 1.696286516504908e-06, "loss": 0.2085047721862793, "memory(GiB)": 91.64, "step": 10005, "token_acc": 0.9371900826446281, "train_speed(iter/s)": 0.138347 }, { "epoch": 0.9446961117402793, "grad_norm": 0.2897244691848755, "learning_rate": 1.6677856700196394e-06, "loss": 0.21313183307647704, "memory(GiB)": 91.64, "step": 10010, "token_acc": 0.9148644009612084, "train_speed(iter/s)": 0.138345 }, { "epoch": 0.9451679879199698, "grad_norm": 0.3938073515892029, "learning_rate": 1.6395242772789144e-06, "loss": 0.2124699354171753, "memory(GiB)": 91.64, "step": 10015, "token_acc": 0.927461139896373, "train_speed(iter/s)": 0.138344 }, { "epoch": 0.9456398640996603, "grad_norm": 0.2608243525028229, "learning_rate": 1.6115024071034933e-06, "loss": 0.20247375965118408, "memory(GiB)": 91.64, "step": 10020, "token_acc": 0.9147230320699709, "train_speed(iter/s)": 0.138342 }, { "epoch": 0.9461117402793507, "grad_norm": 0.44675490260124207, "learning_rate": 1.58372012773087e-06, "loss": 0.214943265914917, "memory(GiB)": 91.64, "step": 10025, "token_acc": 0.9279670706608736, "train_speed(iter/s)": 0.13834 }, { "epoch": 0.9465836164590411, "grad_norm": 0.4210216701030731, "learning_rate": 1.5561775068150485e-06, "loss": 0.2119581937789917, "memory(GiB)": 91.64, "step": 10030, "token_acc": 0.9224646226415094, "train_speed(iter/s)": 0.138339 }, { "epoch": 0.9470554926387316, "grad_norm": 0.33085545897483826, "learning_rate": 1.5288746114264673e-06, "loss": 0.21336703300476073, "memory(GiB)": 91.64, "step": 10035, "token_acc": 0.9265560165975104, "train_speed(iter/s)": 0.138337 }, { "epoch": 0.947527368818422, "grad_norm": 0.3594004213809967, "learning_rate": 1.501811508051787e-06, "loss": 0.2055363416671753, "memory(GiB)": 91.64, "step": 10040, "token_acc": 0.9253539253539254, "train_speed(iter/s)": 0.138335 }, { "epoch": 0.9479992449981125, "grad_norm": 0.29170048236846924, "learning_rate": 1.474988262593735e-06, "loss": 0.20636603832244874, "memory(GiB)": 91.64, "step": 10045, "token_acc": 0.9267399267399268, "train_speed(iter/s)": 0.138333 }, { "epoch": 0.948471121177803, "grad_norm": 0.29537680745124817, "learning_rate": 1.448404940370951e-06, "loss": 0.22197353839874268, "memory(GiB)": 91.64, "step": 10050, "token_acc": 0.9287945034353529, "train_speed(iter/s)": 0.138332 }, { "epoch": 0.9489429973574934, "grad_norm": 0.29160556197166443, "learning_rate": 1.4220616061178415e-06, "loss": 0.20441300868988038, "memory(GiB)": 91.64, "step": 10055, "token_acc": 0.929786066922655, "train_speed(iter/s)": 0.138332 }, { "epoch": 0.9494148735371838, "grad_norm": 0.36549004912376404, "learning_rate": 1.3959583239843698e-06, "loss": 0.21138055324554444, "memory(GiB)": 91.64, "step": 10060, "token_acc": 0.9201907790143085, "train_speed(iter/s)": 0.138329 }, { "epoch": 0.9498867497168743, "grad_norm": 0.24625816941261292, "learning_rate": 1.3700951575359666e-06, "loss": 0.21180267333984376, "memory(GiB)": 91.64, "step": 10065, "token_acc": 0.9221022993899578, "train_speed(iter/s)": 0.138328 }, { "epoch": 0.9503586258965647, "grad_norm": 0.24190473556518555, "learning_rate": 1.3444721697533413e-06, "loss": 0.20847978591918945, "memory(GiB)": 91.64, "step": 10070, "token_acc": 0.9261583011583011, "train_speed(iter/s)": 0.138326 }, { "epoch": 0.9508305020762552, "grad_norm": 0.44247832894325256, "learning_rate": 1.3190894230323159e-06, "loss": 0.2141636848449707, "memory(GiB)": 91.64, "step": 10075, "token_acc": 0.9151750972762646, "train_speed(iter/s)": 0.138325 }, { "epoch": 0.9513023782559457, "grad_norm": 0.4810398817062378, "learning_rate": 1.2939469791837133e-06, "loss": 0.211592960357666, "memory(GiB)": 91.64, "step": 10080, "token_acc": 0.9310824921684651, "train_speed(iter/s)": 0.138323 }, { "epoch": 0.9517742544356361, "grad_norm": 0.26484912633895874, "learning_rate": 1.2690448994331472e-06, "loss": 0.21042494773864745, "memory(GiB)": 91.64, "step": 10085, "token_acc": 0.9333983105912931, "train_speed(iter/s)": 0.13832 }, { "epoch": 0.9522461306153266, "grad_norm": 0.24709878861904144, "learning_rate": 1.2443832444209547e-06, "loss": 0.2084174394607544, "memory(GiB)": 91.64, "step": 10090, "token_acc": 0.9254068716094033, "train_speed(iter/s)": 0.138319 }, { "epoch": 0.952718006795017, "grad_norm": 0.29317525029182434, "learning_rate": 1.2199620742019636e-06, "loss": 0.20613834857940674, "memory(GiB)": 91.64, "step": 10095, "token_acc": 0.9246799642750819, "train_speed(iter/s)": 0.138318 }, { "epoch": 0.9531898829747074, "grad_norm": 0.374560683965683, "learning_rate": 1.195781448245392e-06, "loss": 0.21635422706604004, "memory(GiB)": 91.64, "step": 10100, "token_acc": 0.926530612244898, "train_speed(iter/s)": 0.138317 }, { "epoch": 0.9536617591543979, "grad_norm": 0.33670246601104736, "learning_rate": 1.1718414254347276e-06, "loss": 0.20765471458435059, "memory(GiB)": 91.64, "step": 10105, "token_acc": 0.928082191780822, "train_speed(iter/s)": 0.138315 }, { "epoch": 0.9541336353340883, "grad_norm": 0.21765998005867004, "learning_rate": 1.1481420640675257e-06, "loss": 0.20929303169250488, "memory(GiB)": 91.64, "step": 10110, "token_acc": 0.9243661366566395, "train_speed(iter/s)": 0.138314 }, { "epoch": 0.9546055115137788, "grad_norm": 0.7290151715278625, "learning_rate": 1.124683421855277e-06, "loss": 0.20725011825561523, "memory(GiB)": 91.64, "step": 10115, "token_acc": 0.927360774818402, "train_speed(iter/s)": 0.138314 }, { "epoch": 0.9550773876934693, "grad_norm": 0.26937779784202576, "learning_rate": 1.1014655559233312e-06, "loss": 0.21137325763702391, "memory(GiB)": 91.64, "step": 10120, "token_acc": 0.9266569200779727, "train_speed(iter/s)": 0.138314 }, { "epoch": 0.9555492638731596, "grad_norm": 0.26850059628486633, "learning_rate": 1.0784885228106722e-06, "loss": 0.21040570735931396, "memory(GiB)": 91.64, "step": 10125, "token_acc": 0.9151398264223722, "train_speed(iter/s)": 0.138313 }, { "epoch": 0.9560211400528501, "grad_norm": 0.34640467166900635, "learning_rate": 1.055752378469832e-06, "loss": 0.2133777141571045, "memory(GiB)": 91.64, "step": 10130, "token_acc": 0.9248380129589633, "train_speed(iter/s)": 0.138312 }, { "epoch": 0.9564930162325406, "grad_norm": 0.24151375889778137, "learning_rate": 1.0332571782667555e-06, "loss": 0.21562702655792237, "memory(GiB)": 91.64, "step": 10135, "token_acc": 0.9167294649585531, "train_speed(iter/s)": 0.13831 }, { "epoch": 0.956964892412231, "grad_norm": 0.8157694935798645, "learning_rate": 1.0110029769806462e-06, "loss": 0.21337783336639404, "memory(GiB)": 91.64, "step": 10140, "token_acc": 0.9196428571428571, "train_speed(iter/s)": 0.138309 }, { "epoch": 0.9574367685919215, "grad_norm": 0.24254101514816284, "learning_rate": 9.889898288038103e-07, "loss": 0.2101999282836914, "memory(GiB)": 91.64, "step": 10145, "token_acc": 0.9198137609932747, "train_speed(iter/s)": 0.138309 }, { "epoch": 0.957908644771612, "grad_norm": 0.6977924108505249, "learning_rate": 9.67217787341601e-07, "loss": 0.2110898494720459, "memory(GiB)": 91.64, "step": 10150, "token_acc": 0.9282487377092745, "train_speed(iter/s)": 0.138307 }, { "epoch": 0.9583805209513023, "grad_norm": 0.5975959300994873, "learning_rate": 9.456869056122197e-07, "loss": 0.20567688941955567, "memory(GiB)": 91.64, "step": 10155, "token_acc": 0.9243664717348928, "train_speed(iter/s)": 0.138306 }, { "epoch": 0.9588523971309928, "grad_norm": 0.32901281118392944, "learning_rate": 9.243972360465702e-07, "loss": 0.20593414306640626, "memory(GiB)": 91.64, "step": 10160, "token_acc": 0.9356405585913783, "train_speed(iter/s)": 0.138304 }, { "epoch": 0.9593242733106833, "grad_norm": 0.2798580527305603, "learning_rate": 9.033488304882487e-07, "loss": 0.20811738967895507, "memory(GiB)": 91.64, "step": 10165, "token_acc": 0.9180633147113594, "train_speed(iter/s)": 0.138303 }, { "epoch": 0.9597961494903737, "grad_norm": 0.1956169605255127, "learning_rate": 8.825417401932545e-07, "loss": 0.21179871559143065, "memory(GiB)": 91.64, "step": 10170, "token_acc": 0.9301929625425652, "train_speed(iter/s)": 0.138302 }, { "epoch": 0.9602680256700642, "grad_norm": 0.272049218416214, "learning_rate": 8.619760158300016e-07, "loss": 0.207623291015625, "memory(GiB)": 91.64, "step": 10175, "token_acc": 0.9189681335356601, "train_speed(iter/s)": 0.138302 }, { "epoch": 0.9607399018497547, "grad_norm": 0.3947182893753052, "learning_rate": 8.416517074791297e-07, "loss": 0.2093808650970459, "memory(GiB)": 91.64, "step": 10180, "token_acc": 0.9274025587622732, "train_speed(iter/s)": 0.138301 }, { "epoch": 0.961211778029445, "grad_norm": 0.2339448779821396, "learning_rate": 8.215688646333819e-07, "loss": 0.20636515617370604, "memory(GiB)": 91.64, "step": 10185, "token_acc": 0.918429003021148, "train_speed(iter/s)": 0.1383 }, { "epoch": 0.9616836542091355, "grad_norm": 0.2749667763710022, "learning_rate": 8.01727536197483e-07, "loss": 0.21485419273376466, "memory(GiB)": 91.64, "step": 10190, "token_acc": 0.9255688391824142, "train_speed(iter/s)": 0.138297 }, { "epoch": 0.962155530388826, "grad_norm": 0.2768957316875458, "learning_rate": 7.821277704880947e-07, "loss": 0.20581846237182616, "memory(GiB)": 91.64, "step": 10195, "token_acc": 0.9279426149331594, "train_speed(iter/s)": 0.138295 }, { "epoch": 0.9626274065685164, "grad_norm": 0.5955285429954529, "learning_rate": 7.627696152335717e-07, "loss": 0.20913143157958985, "memory(GiB)": 91.64, "step": 10200, "token_acc": 0.921988855550793, "train_speed(iter/s)": 0.138294 }, { "epoch": 0.9630992827482069, "grad_norm": 0.2856890857219696, "learning_rate": 7.436531175739392e-07, "loss": 0.21088147163391113, "memory(GiB)": 91.64, "step": 10205, "token_acc": 0.9269616026711185, "train_speed(iter/s)": 0.138292 }, { "epoch": 0.9635711589278974, "grad_norm": 0.272771418094635, "learning_rate": 7.247783240607598e-07, "loss": 0.2031481981277466, "memory(GiB)": 91.64, "step": 10210, "token_acc": 0.9186991869918699, "train_speed(iter/s)": 0.13829 }, { "epoch": 0.9640430351075878, "grad_norm": 0.42939260601997375, "learning_rate": 7.061452806569668e-07, "loss": 0.2058316707611084, "memory(GiB)": 91.64, "step": 10215, "token_acc": 0.9244166940519224, "train_speed(iter/s)": 0.138288 }, { "epoch": 0.9645149112872782, "grad_norm": 0.3867746889591217, "learning_rate": 6.877540327368648e-07, "loss": 0.2134486675262451, "memory(GiB)": 91.64, "step": 10220, "token_acc": 0.9220824598183088, "train_speed(iter/s)": 0.138287 }, { "epoch": 0.9649867874669686, "grad_norm": 0.23862211406230927, "learning_rate": 6.696046250858845e-07, "loss": 0.2061309814453125, "memory(GiB)": 91.64, "step": 10225, "token_acc": 0.9267634154573068, "train_speed(iter/s)": 0.138285 }, { "epoch": 0.9654586636466591, "grad_norm": 0.3633194863796234, "learning_rate": 6.516971019005724e-07, "loss": 0.21052196025848388, "memory(GiB)": 91.64, "step": 10230, "token_acc": 0.9288199936728884, "train_speed(iter/s)": 0.138284 }, { "epoch": 0.9659305398263496, "grad_norm": 0.24097710847854614, "learning_rate": 6.340315067884461e-07, "loss": 0.21546194553375245, "memory(GiB)": 91.64, "step": 10235, "token_acc": 0.9337719298245614, "train_speed(iter/s)": 0.138283 }, { "epoch": 0.96640241600604, "grad_norm": 0.23538899421691895, "learning_rate": 6.166078827678945e-07, "loss": 0.21139774322509766, "memory(GiB)": 91.64, "step": 10240, "token_acc": 0.942993907745866, "train_speed(iter/s)": 0.138282 }, { "epoch": 0.9668742921857305, "grad_norm": 0.19995267689228058, "learning_rate": 5.994262722680332e-07, "loss": 0.21091582775115966, "memory(GiB)": 91.64, "step": 10245, "token_acc": 0.9313725490196079, "train_speed(iter/s)": 0.138281 }, { "epoch": 0.9673461683654209, "grad_norm": 0.4463655948638916, "learning_rate": 5.824867171287163e-07, "loss": 0.21558599472045897, "memory(GiB)": 91.64, "step": 10250, "token_acc": 0.9165417291354323, "train_speed(iter/s)": 0.13828 }, { "epoch": 0.9678180445451113, "grad_norm": 0.22608503699302673, "learning_rate": 5.6578925860028e-07, "loss": 0.20667457580566406, "memory(GiB)": 91.64, "step": 10255, "token_acc": 0.9164754474529601, "train_speed(iter/s)": 0.138279 }, { "epoch": 0.9682899207248018, "grad_norm": 0.21197392046451569, "learning_rate": 5.493339373435657e-07, "loss": 0.20172109603881835, "memory(GiB)": 91.64, "step": 10260, "token_acc": 0.919234360410831, "train_speed(iter/s)": 0.138277 }, { "epoch": 0.9687617969044923, "grad_norm": 0.4548327624797821, "learning_rate": 5.331207934297422e-07, "loss": 0.20822181701660156, "memory(GiB)": 91.64, "step": 10265, "token_acc": 0.93, "train_speed(iter/s)": 0.138276 }, { "epoch": 0.9692336730841827, "grad_norm": 0.2541370689868927, "learning_rate": 5.171498663402718e-07, "loss": 0.2125007390975952, "memory(GiB)": 91.64, "step": 10270, "token_acc": 0.9267723880597015, "train_speed(iter/s)": 0.138276 }, { "epoch": 0.9697055492638732, "grad_norm": 0.2773514986038208, "learning_rate": 5.014211949667446e-07, "loss": 0.20577967166900635, "memory(GiB)": 91.64, "step": 10275, "token_acc": 0.9172289698605488, "train_speed(iter/s)": 0.138275 }, { "epoch": 0.9701774254435636, "grad_norm": 0.2945961654186249, "learning_rate": 4.859348176108669e-07, "loss": 0.2093435525894165, "memory(GiB)": 91.64, "step": 10280, "token_acc": 0.9211669770328988, "train_speed(iter/s)": 0.138274 }, { "epoch": 0.970649301623254, "grad_norm": 0.43366917967796326, "learning_rate": 4.7069077198428345e-07, "loss": 0.2111149787902832, "memory(GiB)": 91.64, "step": 10285, "token_acc": 0.9272727272727272, "train_speed(iter/s)": 0.138274 }, { "epoch": 0.9711211778029445, "grad_norm": 0.42656323313713074, "learning_rate": 4.556890952085446e-07, "loss": 0.21514317989349366, "memory(GiB)": 91.64, "step": 10290, "token_acc": 0.9248013620885358, "train_speed(iter/s)": 0.138273 }, { "epoch": 0.971593053982635, "grad_norm": 0.3890469968318939, "learning_rate": 4.4092982381499505e-07, "loss": 0.20530645847320556, "memory(GiB)": 91.64, "step": 10295, "token_acc": 0.9163179916317992, "train_speed(iter/s)": 0.138271 }, { "epoch": 0.9720649301623254, "grad_norm": 0.3857133090496063, "learning_rate": 4.264129937446848e-07, "loss": 0.2022775650024414, "memory(GiB)": 91.64, "step": 10300, "token_acc": 0.9249260355029586, "train_speed(iter/s)": 0.13827 }, { "epoch": 0.9725368063420159, "grad_norm": 0.47042417526245117, "learning_rate": 4.121386403482586e-07, "loss": 0.21275124549865723, "memory(GiB)": 91.64, "step": 10305, "token_acc": 0.9202175883952856, "train_speed(iter/s)": 0.138269 }, { "epoch": 0.9730086825217064, "grad_norm": 0.3687644898891449, "learning_rate": 3.981067983859554e-07, "loss": 0.21237845420837403, "memory(GiB)": 91.64, "step": 10310, "token_acc": 0.920115149334293, "train_speed(iter/s)": 0.138267 }, { "epoch": 0.9734805587013967, "grad_norm": 0.1944044530391693, "learning_rate": 3.8431750202738704e-07, "loss": 0.2046431064605713, "memory(GiB)": 91.64, "step": 10315, "token_acc": 0.9339069221744232, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.9739524348810872, "grad_norm": 0.30766624212265015, "learning_rate": 3.707707848515707e-07, "loss": 0.2141507625579834, "memory(GiB)": 91.64, "step": 10320, "token_acc": 0.9316535433070866, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.9744243110607776, "grad_norm": 0.3184703290462494, "learning_rate": 3.5746667984682956e-07, "loss": 0.21402463912963868, "memory(GiB)": 91.64, "step": 10325, "token_acc": 0.9198218262806236, "train_speed(iter/s)": 0.138265 }, { "epoch": 0.9748961872404681, "grad_norm": 0.21421895921230316, "learning_rate": 3.444052194106262e-07, "loss": 0.2053246021270752, "memory(GiB)": 91.64, "step": 10330, "token_acc": 0.9339525283797729, "train_speed(iter/s)": 0.138263 }, { "epoch": 0.9753680634201586, "grad_norm": 0.3974616825580597, "learning_rate": 3.3158643534960677e-07, "loss": 0.21539621353149413, "memory(GiB)": 91.64, "step": 10335, "token_acc": 0.9283607248209018, "train_speed(iter/s)": 0.138262 }, { "epoch": 0.975839939599849, "grad_norm": 0.7123221755027771, "learning_rate": 3.1901035887942356e-07, "loss": 0.20816359519958497, "memory(GiB)": 91.64, "step": 10340, "token_acc": 0.9127652408218255, "train_speed(iter/s)": 0.138261 }, { "epoch": 0.9763118157795394, "grad_norm": 0.26935598254203796, "learning_rate": 3.066770206247349e-07, "loss": 0.21007375717163085, "memory(GiB)": 91.64, "step": 10345, "token_acc": 0.9361033519553073, "train_speed(iter/s)": 0.13826 }, { "epoch": 0.9767836919592299, "grad_norm": 0.6770565509796143, "learning_rate": 2.94586450619061e-07, "loss": 0.21003360748291017, "memory(GiB)": 91.64, "step": 10350, "token_acc": 0.9277614447711046, "train_speed(iter/s)": 0.138259 }, { "epoch": 0.9772555681389203, "grad_norm": 0.2321297973394394, "learning_rate": 2.8273867830477254e-07, "loss": 0.20995206832885743, "memory(GiB)": 91.64, "step": 10355, "token_acc": 0.9150858175248419, "train_speed(iter/s)": 0.138257 }, { "epoch": 0.9777274443186108, "grad_norm": 0.4965348541736603, "learning_rate": 2.711337325329577e-07, "loss": 0.21208806037902833, "memory(GiB)": 91.64, "step": 10360, "token_acc": 0.9319755600814664, "train_speed(iter/s)": 0.138255 }, { "epoch": 0.9781993204983013, "grad_norm": 0.37109440565109253, "learning_rate": 2.5977164156343327e-07, "loss": 0.20955781936645507, "memory(GiB)": 91.64, "step": 10365, "token_acc": 0.933184023889511, "train_speed(iter/s)": 0.138254 }, { "epoch": 0.9786711966779917, "grad_norm": 0.401061087846756, "learning_rate": 2.486524330645779e-07, "loss": 0.21407065391540528, "memory(GiB)": 91.64, "step": 10370, "token_acc": 0.9242843951985226, "train_speed(iter/s)": 0.138251 }, { "epoch": 0.9791430728576821, "grad_norm": 0.7057275176048279, "learning_rate": 2.3777613411335443e-07, "loss": 0.20971500873565674, "memory(GiB)": 91.64, "step": 10375, "token_acc": 0.9272334293948127, "train_speed(iter/s)": 0.13825 }, { "epoch": 0.9796149490373726, "grad_norm": 0.4812077581882477, "learning_rate": 2.271427711951768e-07, "loss": 0.21029391288757324, "memory(GiB)": 91.64, "step": 10380, "token_acc": 0.9272134709397066, "train_speed(iter/s)": 0.138248 }, { "epoch": 0.980086825217063, "grad_norm": 0.5195487141609192, "learning_rate": 2.167523702038876e-07, "loss": 0.21241250038146972, "memory(GiB)": 91.64, "step": 10385, "token_acc": 0.9222193414570911, "train_speed(iter/s)": 0.138246 }, { "epoch": 0.9805587013967535, "grad_norm": 0.2308487743139267, "learning_rate": 2.0660495644168055e-07, "loss": 0.20739927291870117, "memory(GiB)": 91.64, "step": 10390, "token_acc": 0.9294685990338164, "train_speed(iter/s)": 0.138245 }, { "epoch": 0.981030577576444, "grad_norm": 0.5138281583786011, "learning_rate": 1.967005546190448e-07, "loss": 0.21623821258544923, "memory(GiB)": 91.64, "step": 10395, "token_acc": 0.9200546634779638, "train_speed(iter/s)": 0.138244 }, { "epoch": 0.9815024537561344, "grad_norm": 0.27799543738365173, "learning_rate": 1.870391888546652e-07, "loss": 0.21060950756073, "memory(GiB)": 91.64, "step": 10400, "token_acc": 0.9342560553633218, "train_speed(iter/s)": 0.138243 }, { "epoch": 0.9819743299358248, "grad_norm": 0.480560302734375, "learning_rate": 1.7762088267544442e-07, "loss": 0.21494081020355224, "memory(GiB)": 91.64, "step": 10405, "token_acc": 0.9232977850697293, "train_speed(iter/s)": 0.138244 }, { "epoch": 0.9824462061155153, "grad_norm": 0.24424020946025848, "learning_rate": 1.6844565901636966e-07, "loss": 0.2057812452316284, "memory(GiB)": 91.64, "step": 10410, "token_acc": 0.9184561810795124, "train_speed(iter/s)": 0.138242 }, { "epoch": 0.9829180822952057, "grad_norm": 0.2741668224334717, "learning_rate": 1.5951354022047948e-07, "loss": 0.21181230545043944, "memory(GiB)": 91.64, "step": 10415, "token_acc": 0.925390625, "train_speed(iter/s)": 0.138242 }, { "epoch": 0.9833899584748962, "grad_norm": 0.267378032207489, "learning_rate": 1.508245480388415e-07, "loss": 0.20557844638824463, "memory(GiB)": 91.64, "step": 10420, "token_acc": 0.9227083998722453, "train_speed(iter/s)": 0.138241 }, { "epoch": 0.9838618346545867, "grad_norm": 0.24558934569358826, "learning_rate": 1.4237870363046358e-07, "loss": 0.20829353332519532, "memory(GiB)": 91.64, "step": 10425, "token_acc": 0.9241808827511508, "train_speed(iter/s)": 0.13824 }, { "epoch": 0.9843337108342771, "grad_norm": 0.26023128628730774, "learning_rate": 1.3417602756222724e-07, "loss": 0.20859556198120116, "memory(GiB)": 91.64, "step": 10430, "token_acc": 0.9226679555340744, "train_speed(iter/s)": 0.138238 }, { "epoch": 0.9848055870139676, "grad_norm": 0.4792412221431732, "learning_rate": 1.262165398089099e-07, "loss": 0.21122725009918214, "memory(GiB)": 91.64, "step": 10435, "token_acc": 0.9246612466124661, "train_speed(iter/s)": 0.138237 }, { "epoch": 0.9852774631936579, "grad_norm": 0.2937432825565338, "learning_rate": 1.1850025975304046e-07, "loss": 0.20611369609832764, "memory(GiB)": 91.64, "step": 10440, "token_acc": 0.9364161849710982, "train_speed(iter/s)": 0.138236 }, { "epoch": 0.9857493393733484, "grad_norm": 0.5766665935516357, "learning_rate": 1.1102720618493268e-07, "loss": 0.20731678009033203, "memory(GiB)": 91.64, "step": 10445, "token_acc": 0.9293218720152817, "train_speed(iter/s)": 0.138236 }, { "epoch": 0.9862212155530389, "grad_norm": 0.34089088439941406, "learning_rate": 1.037973973025963e-07, "loss": 0.2109663963317871, "memory(GiB)": 91.64, "step": 10450, "token_acc": 0.934560327198364, "train_speed(iter/s)": 0.138234 }, { "epoch": 0.9866930917327293, "grad_norm": 0.36588117480278015, "learning_rate": 9.681085071170382e-08, "loss": 0.20971145629882812, "memory(GiB)": 91.64, "step": 10455, "token_acc": 0.9096446700507614, "train_speed(iter/s)": 0.138232 }, { "epoch": 0.9871649679124198, "grad_norm": 0.21309404075145721, "learning_rate": 9.0067583425546e-08, "loss": 0.21097216606140137, "memory(GiB)": 91.64, "step": 10460, "token_acc": 0.9385533707865169, "train_speed(iter/s)": 0.138232 }, { "epoch": 0.9876368440921103, "grad_norm": 0.24648167192935944, "learning_rate": 8.356761186499862e-08, "loss": 0.21074647903442384, "memory(GiB)": 91.64, "step": 10465, "token_acc": 0.9346246973365617, "train_speed(iter/s)": 0.138231 }, { "epoch": 0.9881087202718006, "grad_norm": 0.43389177322387695, "learning_rate": 7.731095185846693e-08, "loss": 0.20950679779052733, "memory(GiB)": 91.64, "step": 10470, "token_acc": 0.9093137254901961, "train_speed(iter/s)": 0.13823 }, { "epoch": 0.9885805964514911, "grad_norm": 0.4236377477645874, "learning_rate": 7.129761864185236e-08, "loss": 0.21015970706939696, "memory(GiB)": 91.64, "step": 10475, "token_acc": 0.923942153186931, "train_speed(iter/s)": 0.138229 }, { "epoch": 0.9890524726311816, "grad_norm": 0.39386889338493347, "learning_rate": 6.552762685854141e-08, "loss": 0.21105661392211914, "memory(GiB)": 91.64, "step": 10480, "token_acc": 0.9331564986737401, "train_speed(iter/s)": 0.138228 }, { "epoch": 0.989524348810872, "grad_norm": 0.24592046439647675, "learning_rate": 6.000099055932795e-08, "loss": 0.21296677589416504, "memory(GiB)": 91.64, "step": 10485, "token_acc": 0.9173738276016079, "train_speed(iter/s)": 0.138227 }, { "epoch": 0.9899962249905625, "grad_norm": 0.3638093173503876, "learning_rate": 5.471772320240209e-08, "loss": 0.20483295917510985, "memory(GiB)": 91.64, "step": 10490, "token_acc": 0.9202294056308655, "train_speed(iter/s)": 0.138229 }, { "epoch": 0.990468101170253, "grad_norm": 0.20603786408901215, "learning_rate": 4.9677837653316904e-08, "loss": 0.20644736289978027, "memory(GiB)": 91.64, "step": 10495, "token_acc": 0.9233983286908078, "train_speed(iter/s)": 0.138228 }, { "epoch": 0.9909399773499433, "grad_norm": 0.6620780229568481, "learning_rate": 4.4881346184943994e-08, "loss": 0.2090602159500122, "memory(GiB)": 91.64, "step": 10500, "token_acc": 0.9377893518518519, "train_speed(iter/s)": 0.138227 }, { "epoch": 0.9914118535296338, "grad_norm": 0.48382601141929626, "learning_rate": 4.032826047747351e-08, "loss": 0.20607643127441405, "memory(GiB)": 91.64, "step": 10505, "token_acc": 0.9327878433664524, "train_speed(iter/s)": 0.138226 }, { "epoch": 0.9918837297093243, "grad_norm": 0.304166704416275, "learning_rate": 3.601859161834753e-08, "loss": 0.20815153121948243, "memory(GiB)": 91.64, "step": 10510, "token_acc": 0.9228538283062645, "train_speed(iter/s)": 0.138224 }, { "epoch": 0.9923556058890147, "grad_norm": 0.25076785683631897, "learning_rate": 3.195235010226005e-08, "loss": 0.20654242038726806, "memory(GiB)": 91.64, "step": 10515, "token_acc": 0.9157088122605364, "train_speed(iter/s)": 0.138223 }, { "epoch": 0.9928274820687052, "grad_norm": 0.27873122692108154, "learning_rate": 2.8129545831112604e-08, "loss": 0.21101999282836914, "memory(GiB)": 91.64, "step": 10520, "token_acc": 0.9243013795542978, "train_speed(iter/s)": 0.138222 }, { "epoch": 0.9932993582483957, "grad_norm": 0.3841204047203064, "learning_rate": 2.455018811403642e-08, "loss": 0.20729808807373046, "memory(GiB)": 91.64, "step": 10525, "token_acc": 0.9227557411273486, "train_speed(iter/s)": 0.138221 }, { "epoch": 0.9937712344280861, "grad_norm": 0.5184294581413269, "learning_rate": 2.121428566727035e-08, "loss": 0.21497209072113038, "memory(GiB)": 91.64, "step": 10530, "token_acc": 0.9206409767264403, "train_speed(iter/s)": 0.13822 }, { "epoch": 0.9942431106077765, "grad_norm": 0.2996355891227722, "learning_rate": 1.8121846614260752e-08, "loss": 0.20966591835021972, "memory(GiB)": 91.64, "step": 10535, "token_acc": 0.9370962257735465, "train_speed(iter/s)": 0.138218 }, { "epoch": 0.994714986787467, "grad_norm": 0.5140137076377869, "learning_rate": 1.5272878485561582e-08, "loss": 0.21541638374328614, "memory(GiB)": 91.64, "step": 10540, "token_acc": 0.9206145966709347, "train_speed(iter/s)": 0.138217 }, { "epoch": 0.9951868629671574, "grad_norm": 0.3749140501022339, "learning_rate": 1.2667388218834398e-08, "loss": 0.2071479320526123, "memory(GiB)": 91.64, "step": 10545, "token_acc": 0.9281074058033781, "train_speed(iter/s)": 0.138215 }, { "epoch": 0.9956587391468479, "grad_norm": 0.23596055805683136, "learning_rate": 1.0305382158848353e-08, "loss": 0.2132049322128296, "memory(GiB)": 91.64, "step": 10550, "token_acc": 0.9148795776971297, "train_speed(iter/s)": 0.138214 }, { "epoch": 0.9961306153265383, "grad_norm": 0.5079225301742554, "learning_rate": 8.186866057435793e-09, "loss": 0.2057969093322754, "memory(GiB)": 91.64, "step": 10555, "token_acc": 0.9209541627689429, "train_speed(iter/s)": 0.138213 }, { "epoch": 0.9966024915062288, "grad_norm": 0.2810031771659851, "learning_rate": 6.311845073492251e-09, "loss": 0.2084169864654541, "memory(GiB)": 91.64, "step": 10560, "token_acc": 0.9232763089683774, "train_speed(iter/s)": 0.138213 }, { "epoch": 0.9970743676859192, "grad_norm": 0.6900485157966614, "learning_rate": 4.680323772998651e-09, "loss": 0.2122826337814331, "memory(GiB)": 91.64, "step": 10565, "token_acc": 0.9240121580547113, "train_speed(iter/s)": 0.138211 }, { "epoch": 0.9975462438656096, "grad_norm": 0.22658671438694, "learning_rate": 3.2923061289324987e-09, "loss": 0.2080448627471924, "memory(GiB)": 91.64, "step": 10570, "token_acc": 0.9372056514913658, "train_speed(iter/s)": 0.138209 }, { "epoch": 0.9980181200453001, "grad_norm": 0.20944856107234955, "learning_rate": 2.1477955213455857e-09, "loss": 0.2094266891479492, "memory(GiB)": 91.64, "step": 10575, "token_acc": 0.9141078838174274, "train_speed(iter/s)": 0.138208 }, { "epoch": 0.9984899962249906, "grad_norm": 0.336431086063385, "learning_rate": 1.2467947372751808e-09, "loss": 0.21478500366210937, "memory(GiB)": 91.64, "step": 10580, "token_acc": 0.9218197879858657, "train_speed(iter/s)": 0.138207 }, { "epoch": 0.998961872404681, "grad_norm": 0.6330317854881287, "learning_rate": 5.893059708106385e-10, "loss": 0.21838765144348143, "memory(GiB)": 91.64, "step": 10585, "token_acc": 0.9174628450106157, "train_speed(iter/s)": 0.138207 }, { "epoch": 0.9994337485843715, "grad_norm": 0.22374173998832703, "learning_rate": 1.7533082302678695e-10, "loss": 0.20774707794189454, "memory(GiB)": 91.64, "step": 10590, "token_acc": 0.9141304347826087, "train_speed(iter/s)": 0.138207 }, { "epoch": 0.9999056247640619, "grad_norm": 0.23590397834777832, "learning_rate": 4.870302028336937e-12, "loss": 0.21752536296844482, "memory(GiB)": 91.64, "step": 10595, "token_acc": 0.9202557200538358, "train_speed(iter/s)": 0.138204 }, { "epoch": 1.0, "eval_loss": 0.23324698209762573, "eval_runtime": 3.6087, "eval_samples_per_second": 27.711, "eval_steps_per_second": 0.554, "eval_token_acc": 0.9133481698944079, "step": 10596 } ], "logging_steps": 5, "max_steps": 10596, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1111111111, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6446246652105405e+20, "train_batch_size": 4, "trial_name": null, "trial_params": null }