diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,21243 @@ +{ + "best_global_step": 10596, + "best_metric": 0.23324698, + "best_model_checkpoint": "/data/oss_bucket_0/xwt/output/citywalker/4d815960480fc88bcc76f00e7fcc7bace26a4251-1-ep/v0-20251014-003550/checkpoint-10596", + "epoch": 1.0, + "eval_steps": 1111111111, + "global_step": 10596, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.437523593808984e-05, + "grad_norm": 84.30111694335938, + "learning_rate": 3.773584905660378e-07, + "loss": 3.5155749320983887, + "memory(GiB)": 34.14, + "step": 1, + "token_acc": 0.610738255033557, + "train_speed(iter/s)": 0.00942 + }, + { + "epoch": 0.00047187617969044924, + "grad_norm": 106.11502075195312, + "learning_rate": 1.8867924528301887e-06, + "loss": 3.807619571685791, + "memory(GiB)": 63.23, + "step": 5, + "token_acc": 0.6100302637267618, + "train_speed(iter/s)": 0.03435 + }, + { + "epoch": 0.0009437523593808985, + "grad_norm": 116.6819839477539, + "learning_rate": 3.7735849056603773e-06, + "loss": 3.0834783554077148, + "memory(GiB)": 73.6, + "step": 10, + "token_acc": 0.6099345398536773, + "train_speed(iter/s)": 0.05377 + }, + { + "epoch": 0.0014156285390713476, + "grad_norm": 48.40726089477539, + "learning_rate": 5.660377358490566e-06, + "loss": 2.1324913024902346, + "memory(GiB)": 73.6, + "step": 15, + "token_acc": 0.6634066829665851, + "train_speed(iter/s)": 0.066562 + }, + { + "epoch": 0.001887504718761797, + "grad_norm": 42.45460891723633, + "learning_rate": 7.547169811320755e-06, + "loss": 1.2360769271850587, + "memory(GiB)": 73.6, + "step": 20, + "token_acc": 0.8050813815005955, + "train_speed(iter/s)": 0.076262 + }, + { + "epoch": 0.002359380898452246, + "grad_norm": 70.99395751953125, + "learning_rate": 9.433962264150944e-06, + "loss": 1.01270751953125, + "memory(GiB)": 91.64, + "step": 25, + "token_acc": 0.8459764814387826, + "train_speed(iter/s)": 0.083391 + }, + { + "epoch": 0.0028312570781426952, + "grad_norm": 19.72198486328125, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.9193931579589844, + "memory(GiB)": 91.64, + "step": 30, + "token_acc": 0.8607944732297064, + "train_speed(iter/s)": 0.089177 + }, + { + "epoch": 0.003303133257833145, + "grad_norm": 20.32855987548828, + "learning_rate": 1.320754716981132e-05, + "loss": 0.7122109413146973, + "memory(GiB)": 91.64, + "step": 35, + "token_acc": 0.8821861304459913, + "train_speed(iter/s)": 0.093733 + }, + { + "epoch": 0.003775009437523594, + "grad_norm": 21.04654312133789, + "learning_rate": 1.509433962264151e-05, + "loss": 0.6282638549804688, + "memory(GiB)": 91.64, + "step": 40, + "token_acc": 0.8689788053949904, + "train_speed(iter/s)": 0.097197 + }, + { + "epoch": 0.004246885617214043, + "grad_norm": 9.78100872039795, + "learning_rate": 1.69811320754717e-05, + "loss": 0.4371612548828125, + "memory(GiB)": 91.64, + "step": 45, + "token_acc": 0.9013560223344855, + "train_speed(iter/s)": 0.100185 + }, + { + "epoch": 0.004718761796904492, + "grad_norm": 9.462187767028809, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.3639880657196045, + "memory(GiB)": 91.64, + "step": 50, + "token_acc": 0.9021005251312828, + "train_speed(iter/s)": 0.102924 + }, + { + "epoch": 0.005190637976594942, + "grad_norm": 2.678065061569214, + "learning_rate": 2.0754716981132076e-05, + "loss": 0.33274307250976565, + "memory(GiB)": 91.64, + "step": 55, + "token_acc": 0.8931506849315068, + "train_speed(iter/s)": 0.105295 + }, + { + "epoch": 0.0056625141562853904, + "grad_norm": 2.3717150688171387, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.35691156387329104, + "memory(GiB)": 91.64, + "step": 60, + "token_acc": 0.8906422018348624, + "train_speed(iter/s)": 0.107475 + }, + { + "epoch": 0.00613439033597584, + "grad_norm": 22.78520965576172, + "learning_rate": 2.4528301886792453e-05, + "loss": 0.4278131008148193, + "memory(GiB)": 91.64, + "step": 65, + "token_acc": 0.871261378413524, + "train_speed(iter/s)": 0.109376 + }, + { + "epoch": 0.00660626651566629, + "grad_norm": 2.964755058288574, + "learning_rate": 2.641509433962264e-05, + "loss": 0.39853944778442385, + "memory(GiB)": 91.64, + "step": 70, + "token_acc": 0.877030162412993, + "train_speed(iter/s)": 0.111001 + }, + { + "epoch": 0.007078142695356738, + "grad_norm": 2.3992581367492676, + "learning_rate": 2.830188679245283e-05, + "loss": 0.3416656494140625, + "memory(GiB)": 91.64, + "step": 75, + "token_acc": 0.8882584024443474, + "train_speed(iter/s)": 0.112467 + }, + { + "epoch": 0.007550018875047188, + "grad_norm": 2.020433187484741, + "learning_rate": 3.018867924528302e-05, + "loss": 0.35348923206329347, + "memory(GiB)": 91.64, + "step": 80, + "token_acc": 0.888859132297804, + "train_speed(iter/s)": 0.113855 + }, + { + "epoch": 0.008021895054737637, + "grad_norm": 3.7929301261901855, + "learning_rate": 3.207547169811321e-05, + "loss": 0.34899582862854006, + "memory(GiB)": 91.64, + "step": 85, + "token_acc": 0.8849840255591054, + "train_speed(iter/s)": 0.114958 + }, + { + "epoch": 0.008493771234428085, + "grad_norm": 1.895822525024414, + "learning_rate": 3.39622641509434e-05, + "loss": 0.38731689453125, + "memory(GiB)": 91.64, + "step": 90, + "token_acc": 0.8807424593967518, + "train_speed(iter/s)": 0.116143 + }, + { + "epoch": 0.008965647414118535, + "grad_norm": 2.664459705352783, + "learning_rate": 3.5849056603773584e-05, + "loss": 0.36236350536346434, + "memory(GiB)": 91.64, + "step": 95, + "token_acc": 0.8755798090040928, + "train_speed(iter/s)": 0.117219 + }, + { + "epoch": 0.009437523593808984, + "grad_norm": 6.293849468231201, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.4948585033416748, + "memory(GiB)": 91.64, + "step": 100, + "token_acc": 0.8815642458100559, + "train_speed(iter/s)": 0.118142 + }, + { + "epoch": 0.009909399773499434, + "grad_norm": 1.9665182828903198, + "learning_rate": 3.962264150943397e-05, + "loss": 0.41594877243041994, + "memory(GiB)": 91.64, + "step": 105, + "token_acc": 0.8591625883632409, + "train_speed(iter/s)": 0.118993 + }, + { + "epoch": 0.010381275953189883, + "grad_norm": 4.004781723022461, + "learning_rate": 4.150943396226415e-05, + "loss": 0.35399832725524905, + "memory(GiB)": 91.64, + "step": 110, + "token_acc": 0.8827626233313988, + "train_speed(iter/s)": 0.119709 + }, + { + "epoch": 0.010853152132880333, + "grad_norm": 1.3648808002471924, + "learning_rate": 4.3396226415094345e-05, + "loss": 0.36987504959106443, + "memory(GiB)": 91.64, + "step": 115, + "token_acc": 0.8744955609362389, + "train_speed(iter/s)": 0.120437 + }, + { + "epoch": 0.011325028312570781, + "grad_norm": 1.6580424308776855, + "learning_rate": 4.528301886792453e-05, + "loss": 0.3571889877319336, + "memory(GiB)": 91.64, + "step": 120, + "token_acc": 0.8779239766081871, + "train_speed(iter/s)": 0.121126 + }, + { + "epoch": 0.01179690449226123, + "grad_norm": 3.6433212757110596, + "learning_rate": 4.716981132075472e-05, + "loss": 0.3812561511993408, + "memory(GiB)": 91.64, + "step": 125, + "token_acc": 0.8725761772853186, + "train_speed(iter/s)": 0.121769 + }, + { + "epoch": 0.01226878067195168, + "grad_norm": 4.834733009338379, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.37772574424743655, + "memory(GiB)": 91.64, + "step": 130, + "token_acc": 0.8891786179921773, + "train_speed(iter/s)": 0.122353 + }, + { + "epoch": 0.01274065685164213, + "grad_norm": 1.4346174001693726, + "learning_rate": 5.09433962264151e-05, + "loss": 0.3767831802368164, + "memory(GiB)": 91.64, + "step": 135, + "token_acc": 0.8672922252010724, + "train_speed(iter/s)": 0.122876 + }, + { + "epoch": 0.01321253303133258, + "grad_norm": 1.908555269241333, + "learning_rate": 5.283018867924528e-05, + "loss": 0.3630094051361084, + "memory(GiB)": 91.64, + "step": 140, + "token_acc": 0.8912050256996003, + "train_speed(iter/s)": 0.123352 + }, + { + "epoch": 0.013684409211023027, + "grad_norm": 1.579922080039978, + "learning_rate": 5.4716981132075475e-05, + "loss": 0.38754441738128664, + "memory(GiB)": 91.64, + "step": 145, + "token_acc": 0.8529411764705882, + "train_speed(iter/s)": 0.12383 + }, + { + "epoch": 0.014156285390713477, + "grad_norm": 2.1948578357696533, + "learning_rate": 5.660377358490566e-05, + "loss": 0.38694057464599607, + "memory(GiB)": 91.64, + "step": 150, + "token_acc": 0.8721523291397484, + "train_speed(iter/s)": 0.124241 + }, + { + "epoch": 0.014628161570403926, + "grad_norm": 2.0328688621520996, + "learning_rate": 5.849056603773585e-05, + "loss": 0.3814719200134277, + "memory(GiB)": 91.64, + "step": 155, + "token_acc": 0.8973154362416107, + "train_speed(iter/s)": 0.1247 + }, + { + "epoch": 0.015100037750094376, + "grad_norm": 1.0338304042816162, + "learning_rate": 6.037735849056604e-05, + "loss": 0.46609911918640134, + "memory(GiB)": 91.64, + "step": 160, + "token_acc": 0.8663366336633663, + "train_speed(iter/s)": 0.12509 + }, + { + "epoch": 0.015571913929784825, + "grad_norm": 2.470008134841919, + "learning_rate": 6.226415094339622e-05, + "loss": 0.37815041542053224, + "memory(GiB)": 91.64, + "step": 165, + "token_acc": 0.8509316770186336, + "train_speed(iter/s)": 0.125425 + }, + { + "epoch": 0.016043790109475275, + "grad_norm": 5.2904839515686035, + "learning_rate": 6.415094339622641e-05, + "loss": 0.40090017318725585, + "memory(GiB)": 91.64, + "step": 170, + "token_acc": 0.8588604286461056, + "train_speed(iter/s)": 0.125806 + }, + { + "epoch": 0.016515666289165724, + "grad_norm": 2.9606592655181885, + "learning_rate": 6.60377358490566e-05, + "loss": 0.39040379524230956, + "memory(GiB)": 91.64, + "step": 175, + "token_acc": 0.8624469589816125, + "train_speed(iter/s)": 0.126167 + }, + { + "epoch": 0.01698754246885617, + "grad_norm": 2.9512269496917725, + "learning_rate": 6.79245283018868e-05, + "loss": 0.40850982666015623, + "memory(GiB)": 91.64, + "step": 180, + "token_acc": 0.8886337543053962, + "train_speed(iter/s)": 0.126417 + }, + { + "epoch": 0.01745941864854662, + "grad_norm": 3.298267126083374, + "learning_rate": 6.981132075471698e-05, + "loss": 0.37546262741088865, + "memory(GiB)": 91.64, + "step": 185, + "token_acc": 0.8762331838565023, + "train_speed(iter/s)": 0.126711 + }, + { + "epoch": 0.01793129482823707, + "grad_norm": 11.315258026123047, + "learning_rate": 7.169811320754717e-05, + "loss": 0.3741611480712891, + "memory(GiB)": 91.64, + "step": 190, + "token_acc": 0.8901128425577647, + "train_speed(iter/s)": 0.127037 + }, + { + "epoch": 0.01840317100792752, + "grad_norm": 1.4289817810058594, + "learning_rate": 7.358490566037736e-05, + "loss": 0.4472553253173828, + "memory(GiB)": 91.64, + "step": 195, + "token_acc": 0.8647993536224078, + "train_speed(iter/s)": 0.127361 + }, + { + "epoch": 0.01887504718761797, + "grad_norm": 21.130821228027344, + "learning_rate": 7.547169811320755e-05, + "loss": 0.43447041511535645, + "memory(GiB)": 91.64, + "step": 200, + "token_acc": 0.8622900763358778, + "train_speed(iter/s)": 0.127616 + }, + { + "epoch": 0.01934692336730842, + "grad_norm": 2.252457857131958, + "learning_rate": 7.735849056603774e-05, + "loss": 0.4470320701599121, + "memory(GiB)": 91.64, + "step": 205, + "token_acc": 0.8723309608540926, + "train_speed(iter/s)": 0.127869 + }, + { + "epoch": 0.019818799546998868, + "grad_norm": 2.8665921688079834, + "learning_rate": 7.924528301886794e-05, + "loss": 0.3971832275390625, + "memory(GiB)": 91.64, + "step": 210, + "token_acc": 0.8759057971014492, + "train_speed(iter/s)": 0.128077 + }, + { + "epoch": 0.020290675726689317, + "grad_norm": 3.2112598419189453, + "learning_rate": 8.113207547169813e-05, + "loss": 0.3845433235168457, + "memory(GiB)": 91.64, + "step": 215, + "token_acc": 0.8769537745261058, + "train_speed(iter/s)": 0.12831 + }, + { + "epoch": 0.020762551906379767, + "grad_norm": 1.5424221754074097, + "learning_rate": 8.30188679245283e-05, + "loss": 0.4125929832458496, + "memory(GiB)": 91.64, + "step": 220, + "token_acc": 0.8703056768558952, + "train_speed(iter/s)": 0.128563 + }, + { + "epoch": 0.021234428086070217, + "grad_norm": 1.9207730293273926, + "learning_rate": 8.49056603773585e-05, + "loss": 0.3798954010009766, + "memory(GiB)": 91.64, + "step": 225, + "token_acc": 0.8849710982658959, + "train_speed(iter/s)": 0.128725 + }, + { + "epoch": 0.021706304265760666, + "grad_norm": 3.5698459148406982, + "learning_rate": 8.679245283018869e-05, + "loss": 0.4055330753326416, + "memory(GiB)": 91.64, + "step": 230, + "token_acc": 0.8681796233703525, + "train_speed(iter/s)": 0.128949 + }, + { + "epoch": 0.022178180445451112, + "grad_norm": 4.7955732345581055, + "learning_rate": 8.867924528301888e-05, + "loss": 0.3989933967590332, + "memory(GiB)": 91.64, + "step": 235, + "token_acc": 0.8699075659020883, + "train_speed(iter/s)": 0.129146 + }, + { + "epoch": 0.022650056625141562, + "grad_norm": 1.4037288427352905, + "learning_rate": 9.056603773584906e-05, + "loss": 0.40497756004333496, + "memory(GiB)": 91.64, + "step": 240, + "token_acc": 0.8680724520140578, + "train_speed(iter/s)": 0.129334 + }, + { + "epoch": 0.02312193280483201, + "grad_norm": 3.678190231323242, + "learning_rate": 9.245283018867925e-05, + "loss": 0.41619534492492677, + "memory(GiB)": 91.64, + "step": 245, + "token_acc": 0.8728121353558926, + "train_speed(iter/s)": 0.129504 + }, + { + "epoch": 0.02359380898452246, + "grad_norm": 2.952890634536743, + "learning_rate": 9.433962264150944e-05, + "loss": 0.43350043296813967, + "memory(GiB)": 91.64, + "step": 250, + "token_acc": 0.8761645962732919, + "train_speed(iter/s)": 0.129679 + }, + { + "epoch": 0.02406568516421291, + "grad_norm": 1.438887357711792, + "learning_rate": 9.622641509433963e-05, + "loss": 0.44051084518432615, + "memory(GiB)": 91.64, + "step": 255, + "token_acc": 0.8877816291161178, + "train_speed(iter/s)": 0.129866 + }, + { + "epoch": 0.02453756134390336, + "grad_norm": 5.033264636993408, + "learning_rate": 9.811320754716981e-05, + "loss": 0.520989179611206, + "memory(GiB)": 91.64, + "step": 260, + "token_acc": 0.8630212648439658, + "train_speed(iter/s)": 0.129983 + }, + { + "epoch": 0.02500943752359381, + "grad_norm": 1.3177125453948975, + "learning_rate": 0.0001, + "loss": 0.4122615337371826, + "memory(GiB)": 91.64, + "step": 265, + "token_acc": 0.8586235489220564, + "train_speed(iter/s)": 0.130162 + }, + { + "epoch": 0.02548131370328426, + "grad_norm": 1.7470556497573853, + "learning_rate": 0.0001018867924528302, + "loss": 0.3909600734710693, + "memory(GiB)": 91.64, + "step": 270, + "token_acc": 0.862627197039778, + "train_speed(iter/s)": 0.130349 + }, + { + "epoch": 0.02595318988297471, + "grad_norm": 1.922580361366272, + "learning_rate": 0.00010377358490566037, + "loss": 0.40008974075317383, + "memory(GiB)": 91.64, + "step": 275, + "token_acc": 0.8686131386861314, + "train_speed(iter/s)": 0.130529 + }, + { + "epoch": 0.02642506606266516, + "grad_norm": 1.2326915264129639, + "learning_rate": 0.00010566037735849057, + "loss": 0.406398344039917, + "memory(GiB)": 91.64, + "step": 280, + "token_acc": 0.8661087866108786, + "train_speed(iter/s)": 0.130651 + }, + { + "epoch": 0.026896942242355604, + "grad_norm": 1.0071676969528198, + "learning_rate": 0.00010754716981132076, + "loss": 0.42357187271118163, + "memory(GiB)": 91.64, + "step": 285, + "token_acc": 0.868785399622404, + "train_speed(iter/s)": 0.130776 + }, + { + "epoch": 0.027368818422046054, + "grad_norm": 1.469139814376831, + "learning_rate": 0.00010943396226415095, + "loss": 0.40146756172180176, + "memory(GiB)": 91.64, + "step": 290, + "token_acc": 0.8522532800912721, + "train_speed(iter/s)": 0.130922 + }, + { + "epoch": 0.027840694601736504, + "grad_norm": 1.8155490159988403, + "learning_rate": 0.00011132075471698113, + "loss": 0.40570869445800783, + "memory(GiB)": 91.64, + "step": 295, + "token_acc": 0.8601811736904293, + "train_speed(iter/s)": 0.131047 + }, + { + "epoch": 0.028312570781426953, + "grad_norm": 0.9587322473526001, + "learning_rate": 0.00011320754716981132, + "loss": 0.4192817687988281, + "memory(GiB)": 91.64, + "step": 300, + "token_acc": 0.8656987295825771, + "train_speed(iter/s)": 0.131187 + }, + { + "epoch": 0.028784446961117403, + "grad_norm": 1.1372607946395874, + "learning_rate": 0.00011509433962264151, + "loss": 0.40810341835021974, + "memory(GiB)": 91.64, + "step": 305, + "token_acc": 0.8578063594140765, + "train_speed(iter/s)": 0.131329 + }, + { + "epoch": 0.029256323140807852, + "grad_norm": 0.9617637991905212, + "learning_rate": 0.0001169811320754717, + "loss": 0.41143293380737306, + "memory(GiB)": 91.64, + "step": 310, + "token_acc": 0.8715596330275229, + "train_speed(iter/s)": 0.131434 + }, + { + "epoch": 0.029728199320498302, + "grad_norm": 3.1000680923461914, + "learning_rate": 0.00011886792452830188, + "loss": 0.42690815925598147, + "memory(GiB)": 91.64, + "step": 315, + "token_acc": 0.8557730723132241, + "train_speed(iter/s)": 0.131554 + }, + { + "epoch": 0.03020007550018875, + "grad_norm": 1.8936156034469604, + "learning_rate": 0.00012075471698113207, + "loss": 0.4507251739501953, + "memory(GiB)": 91.64, + "step": 320, + "token_acc": 0.8282674772036475, + "train_speed(iter/s)": 0.131656 + }, + { + "epoch": 0.0306719516798792, + "grad_norm": 2.192671298980713, + "learning_rate": 0.00012264150943396227, + "loss": 0.42845516204833983, + "memory(GiB)": 91.64, + "step": 325, + "token_acc": 0.8745208280092001, + "train_speed(iter/s)": 0.131757 + }, + { + "epoch": 0.03114382785956965, + "grad_norm": 1.4483352899551392, + "learning_rate": 0.00012452830188679244, + "loss": 0.41372270584106446, + "memory(GiB)": 91.64, + "step": 330, + "token_acc": 0.8579447322970639, + "train_speed(iter/s)": 0.131828 + }, + { + "epoch": 0.0316157040392601, + "grad_norm": 0.6700084805488586, + "learning_rate": 0.00012641509433962265, + "loss": 0.3887781620025635, + "memory(GiB)": 91.64, + "step": 335, + "token_acc": 0.881233595800525, + "train_speed(iter/s)": 0.131926 + }, + { + "epoch": 0.03208758021895055, + "grad_norm": 0.9793855547904968, + "learning_rate": 0.00012830188679245283, + "loss": 0.3957367897033691, + "memory(GiB)": 91.64, + "step": 340, + "token_acc": 0.8718430951101558, + "train_speed(iter/s)": 0.132006 + }, + { + "epoch": 0.032559456398640996, + "grad_norm": 1.0212165117263794, + "learning_rate": 0.000130188679245283, + "loss": 0.401882266998291, + "memory(GiB)": 91.64, + "step": 345, + "token_acc": 0.8426698450536353, + "train_speed(iter/s)": 0.132057 + }, + { + "epoch": 0.03303133257833145, + "grad_norm": 1.0362790822982788, + "learning_rate": 0.0001320754716981132, + "loss": 0.40561609268188475, + "memory(GiB)": 91.64, + "step": 350, + "token_acc": 0.8569958847736625, + "train_speed(iter/s)": 0.132164 + }, + { + "epoch": 0.033503208758021895, + "grad_norm": 0.9072962999343872, + "learning_rate": 0.0001339622641509434, + "loss": 0.409895133972168, + "memory(GiB)": 91.64, + "step": 355, + "token_acc": 0.8754699248120301, + "train_speed(iter/s)": 0.132257 + }, + { + "epoch": 0.03397508493771234, + "grad_norm": 1.0432530641555786, + "learning_rate": 0.0001358490566037736, + "loss": 0.4178511619567871, + "memory(GiB)": 91.64, + "step": 360, + "token_acc": 0.8715596330275229, + "train_speed(iter/s)": 0.132366 + }, + { + "epoch": 0.034446961117402794, + "grad_norm": 1.1534943580627441, + "learning_rate": 0.00013773584905660377, + "loss": 0.41347360610961914, + "memory(GiB)": 91.64, + "step": 365, + "token_acc": 0.8692509855453351, + "train_speed(iter/s)": 0.132474 + }, + { + "epoch": 0.03491883729709324, + "grad_norm": 1.7700964212417603, + "learning_rate": 0.00013962264150943395, + "loss": 0.417569637298584, + "memory(GiB)": 91.64, + "step": 370, + "token_acc": 0.8729519977004886, + "train_speed(iter/s)": 0.132563 + }, + { + "epoch": 0.03539071347678369, + "grad_norm": 1.860262393951416, + "learning_rate": 0.00014150943396226416, + "loss": 0.4185757637023926, + "memory(GiB)": 91.64, + "step": 375, + "token_acc": 0.8906384196965359, + "train_speed(iter/s)": 0.132637 + }, + { + "epoch": 0.03586258965647414, + "grad_norm": 1.596077561378479, + "learning_rate": 0.00014339622641509434, + "loss": 0.411014461517334, + "memory(GiB)": 91.64, + "step": 380, + "token_acc": 0.8643669149353195, + "train_speed(iter/s)": 0.132728 + }, + { + "epoch": 0.03633446583616459, + "grad_norm": 0.9201560020446777, + "learning_rate": 0.00014528301886792451, + "loss": 0.42378597259521483, + "memory(GiB)": 91.64, + "step": 385, + "token_acc": 0.8777292576419214, + "train_speed(iter/s)": 0.132789 + }, + { + "epoch": 0.03680634201585504, + "grad_norm": 0.9091364741325378, + "learning_rate": 0.00014716981132075472, + "loss": 0.41143045425415037, + "memory(GiB)": 91.64, + "step": 390, + "token_acc": 0.8801906058543226, + "train_speed(iter/s)": 0.132888 + }, + { + "epoch": 0.03727821819554549, + "grad_norm": 1.1616473197937012, + "learning_rate": 0.0001490566037735849, + "loss": 0.4186095237731934, + "memory(GiB)": 91.64, + "step": 395, + "token_acc": 0.8518700787401575, + "train_speed(iter/s)": 0.132959 + }, + { + "epoch": 0.03775009437523594, + "grad_norm": 0.7037743330001831, + "learning_rate": 0.0001509433962264151, + "loss": 0.4113043785095215, + "memory(GiB)": 91.64, + "step": 400, + "token_acc": 0.8516409912926992, + "train_speed(iter/s)": 0.133015 + }, + { + "epoch": 0.03822197055492639, + "grad_norm": 0.7240108847618103, + "learning_rate": 0.0001528301886792453, + "loss": 0.4177090167999268, + "memory(GiB)": 91.64, + "step": 405, + "token_acc": 0.8457389428263214, + "train_speed(iter/s)": 0.133099 + }, + { + "epoch": 0.03869384673461684, + "grad_norm": 1.0540796518325806, + "learning_rate": 0.0001547169811320755, + "loss": 0.42041950225830077, + "memory(GiB)": 91.64, + "step": 410, + "token_acc": 0.8516787080322992, + "train_speed(iter/s)": 0.133188 + }, + { + "epoch": 0.03916572291430728, + "grad_norm": 0.980476975440979, + "learning_rate": 0.00015660377358490567, + "loss": 0.4235046863555908, + "memory(GiB)": 91.64, + "step": 415, + "token_acc": 0.8591037545417844, + "train_speed(iter/s)": 0.133243 + }, + { + "epoch": 0.039637599093997736, + "grad_norm": 0.9957146644592285, + "learning_rate": 0.00015849056603773587, + "loss": 0.426760196685791, + "memory(GiB)": 91.64, + "step": 420, + "token_acc": 0.837030191004313, + "train_speed(iter/s)": 0.13331 + }, + { + "epoch": 0.04010947527368818, + "grad_norm": 1.0860295295715332, + "learning_rate": 0.00016037735849056605, + "loss": 0.42039642333984373, + "memory(GiB)": 91.64, + "step": 425, + "token_acc": 0.8553571428571428, + "train_speed(iter/s)": 0.13337 + }, + { + "epoch": 0.040581351453378635, + "grad_norm": 0.6787883043289185, + "learning_rate": 0.00016226415094339625, + "loss": 0.4263429641723633, + "memory(GiB)": 91.64, + "step": 430, + "token_acc": 0.8620312072269367, + "train_speed(iter/s)": 0.133447 + }, + { + "epoch": 0.04105322763306908, + "grad_norm": 0.7591102123260498, + "learning_rate": 0.00016415094339622643, + "loss": 0.41852893829345705, + "memory(GiB)": 91.64, + "step": 435, + "token_acc": 0.8613795401532822, + "train_speed(iter/s)": 0.133505 + }, + { + "epoch": 0.041525103812759534, + "grad_norm": 0.6089534759521484, + "learning_rate": 0.0001660377358490566, + "loss": 0.42598543167114256, + "memory(GiB)": 91.64, + "step": 440, + "token_acc": 0.8578378378378378, + "train_speed(iter/s)": 0.133578 + }, + { + "epoch": 0.04199697999244998, + "grad_norm": 1.8172188997268677, + "learning_rate": 0.00016792452830188682, + "loss": 0.44203596115112304, + "memory(GiB)": 91.64, + "step": 445, + "token_acc": 0.8689731321310269, + "train_speed(iter/s)": 0.133639 + }, + { + "epoch": 0.04246885617214043, + "grad_norm": 1.0775083303451538, + "learning_rate": 0.000169811320754717, + "loss": 0.43241491317749026, + "memory(GiB)": 91.64, + "step": 450, + "token_acc": 0.8645731108930323, + "train_speed(iter/s)": 0.133695 + }, + { + "epoch": 0.04294073235183088, + "grad_norm": 1.2232269048690796, + "learning_rate": 0.00017169811320754717, + "loss": 0.42380399703979493, + "memory(GiB)": 91.64, + "step": 455, + "token_acc": 0.8739164696611506, + "train_speed(iter/s)": 0.133754 + }, + { + "epoch": 0.04341260853152133, + "grad_norm": 0.6929795145988464, + "learning_rate": 0.00017358490566037738, + "loss": 0.425693416595459, + "memory(GiB)": 91.64, + "step": 460, + "token_acc": 0.8682867557715674, + "train_speed(iter/s)": 0.13382 + }, + { + "epoch": 0.04388448471121178, + "grad_norm": 0.7897017002105713, + "learning_rate": 0.00017547169811320756, + "loss": 0.41992640495300293, + "memory(GiB)": 91.64, + "step": 465, + "token_acc": 0.8354898336414048, + "train_speed(iter/s)": 0.133876 + }, + { + "epoch": 0.044356360890902224, + "grad_norm": 0.5486307740211487, + "learning_rate": 0.00017735849056603776, + "loss": 0.42015676498413085, + "memory(GiB)": 91.64, + "step": 470, + "token_acc": 0.8480052753049786, + "train_speed(iter/s)": 0.133918 + }, + { + "epoch": 0.04482823707059268, + "grad_norm": 1.3324936628341675, + "learning_rate": 0.00017924528301886794, + "loss": 0.417246150970459, + "memory(GiB)": 91.64, + "step": 475, + "token_acc": 0.8778747026169706, + "train_speed(iter/s)": 0.133961 + }, + { + "epoch": 0.045300113250283124, + "grad_norm": 0.9353549480438232, + "learning_rate": 0.00018113207547169812, + "loss": 0.4723100185394287, + "memory(GiB)": 91.64, + "step": 480, + "token_acc": 0.8507697141061892, + "train_speed(iter/s)": 0.134019 + }, + { + "epoch": 0.04577198942997358, + "grad_norm": 1.1144438982009888, + "learning_rate": 0.00018301886792452832, + "loss": 0.45328826904296876, + "memory(GiB)": 91.64, + "step": 485, + "token_acc": 0.8691880638445524, + "train_speed(iter/s)": 0.134049 + }, + { + "epoch": 0.04624386560966402, + "grad_norm": 1.5882467031478882, + "learning_rate": 0.0001849056603773585, + "loss": 0.42916183471679686, + "memory(GiB)": 91.64, + "step": 490, + "token_acc": 0.8745184369840396, + "train_speed(iter/s)": 0.134102 + }, + { + "epoch": 0.046715741789354476, + "grad_norm": 1.1314642429351807, + "learning_rate": 0.00018679245283018868, + "loss": 0.41251192092895506, + "memory(GiB)": 91.64, + "step": 495, + "token_acc": 0.8797017960013555, + "train_speed(iter/s)": 0.13416 + }, + { + "epoch": 0.04718761796904492, + "grad_norm": 0.6526018381118774, + "learning_rate": 0.00018867924528301889, + "loss": 0.42115144729614257, + "memory(GiB)": 91.64, + "step": 500, + "token_acc": 0.8817605371130175, + "train_speed(iter/s)": 0.134197 + }, + { + "epoch": 0.047659494148735375, + "grad_norm": 0.7289260029792786, + "learning_rate": 0.00019056603773584906, + "loss": 0.4194014549255371, + "memory(GiB)": 91.64, + "step": 505, + "token_acc": 0.8838260869565218, + "train_speed(iter/s)": 0.134245 + }, + { + "epoch": 0.04813137032842582, + "grad_norm": 1.5535837411880493, + "learning_rate": 0.00019245283018867927, + "loss": 0.42951335906982424, + "memory(GiB)": 91.64, + "step": 510, + "token_acc": 0.8675577156743621, + "train_speed(iter/s)": 0.134314 + }, + { + "epoch": 0.04860324650811627, + "grad_norm": 0.6319753527641296, + "learning_rate": 0.00019433962264150945, + "loss": 0.42447052001953123, + "memory(GiB)": 91.64, + "step": 515, + "token_acc": 0.8672985781990521, + "train_speed(iter/s)": 0.134352 + }, + { + "epoch": 0.04907512268780672, + "grad_norm": 0.7045039534568787, + "learning_rate": 0.00019622641509433963, + "loss": 0.43744239807128904, + "memory(GiB)": 91.64, + "step": 520, + "token_acc": 0.8579258010118044, + "train_speed(iter/s)": 0.134408 + }, + { + "epoch": 0.049546998867497166, + "grad_norm": 0.7517572045326233, + "learning_rate": 0.00019811320754716983, + "loss": 0.4918349266052246, + "memory(GiB)": 91.64, + "step": 525, + "token_acc": 0.8434403487911217, + "train_speed(iter/s)": 0.134457 + }, + { + "epoch": 0.05001887504718762, + "grad_norm": 1.5722893476486206, + "learning_rate": 0.0002, + "loss": 0.44678421020507814, + "memory(GiB)": 91.64, + "step": 530, + "token_acc": 0.8664154103852596, + "train_speed(iter/s)": 0.134489 + }, + { + "epoch": 0.050490751226878065, + "grad_norm": 1.0157049894332886, + "learning_rate": 0.00019999987824247315, + "loss": 0.42899537086486816, + "memory(GiB)": 91.64, + "step": 535, + "token_acc": 0.8645294725956567, + "train_speed(iter/s)": 0.134533 + }, + { + "epoch": 0.05096262740656852, + "grad_norm": 0.9441537261009216, + "learning_rate": 0.00019999951297018905, + "loss": 0.41001176834106445, + "memory(GiB)": 91.64, + "step": 540, + "token_acc": 0.8450834879406308, + "train_speed(iter/s)": 0.134584 + }, + { + "epoch": 0.051434503586258964, + "grad_norm": 0.6020461320877075, + "learning_rate": 0.0001999989041840372, + "loss": 0.41864852905273436, + "memory(GiB)": 91.64, + "step": 545, + "token_acc": 0.8733031674208145, + "train_speed(iter/s)": 0.134636 + }, + { + "epoch": 0.05190637976594942, + "grad_norm": 0.7992700338363647, + "learning_rate": 0.0001999980518855001, + "loss": 0.4274590969085693, + "memory(GiB)": 91.64, + "step": 550, + "token_acc": 0.8683141503046716, + "train_speed(iter/s)": 0.134693 + }, + { + "epoch": 0.052378255945639864, + "grad_norm": 0.6813804507255554, + "learning_rate": 0.00019999695607665326, + "loss": 0.4231581687927246, + "memory(GiB)": 91.64, + "step": 555, + "token_acc": 0.8547925608011445, + "train_speed(iter/s)": 0.134736 + }, + { + "epoch": 0.05285013212533032, + "grad_norm": 1.6035652160644531, + "learning_rate": 0.00019999561676016506, + "loss": 0.4291111946105957, + "memory(GiB)": 91.64, + "step": 560, + "token_acc": 0.8530872959545777, + "train_speed(iter/s)": 0.134778 + }, + { + "epoch": 0.05332200830502076, + "grad_norm": 1.459104299545288, + "learning_rate": 0.00019999403393929695, + "loss": 0.438527774810791, + "memory(GiB)": 91.64, + "step": 565, + "token_acc": 0.8590631364562118, + "train_speed(iter/s)": 0.134828 + }, + { + "epoch": 0.05379388448471121, + "grad_norm": 1.4410645961761475, + "learning_rate": 0.0001999922076179034, + "loss": 0.4256152153015137, + "memory(GiB)": 91.64, + "step": 570, + "token_acc": 0.8751229105211407, + "train_speed(iter/s)": 0.134867 + }, + { + "epoch": 0.05426576066440166, + "grad_norm": 1.2330466508865356, + "learning_rate": 0.00019999013780043175, + "loss": 0.4717870712280273, + "memory(GiB)": 91.64, + "step": 575, + "token_acc": 0.8500267809319765, + "train_speed(iter/s)": 0.134918 + }, + { + "epoch": 0.05473763684409211, + "grad_norm": 0.652735710144043, + "learning_rate": 0.0001999878244919223, + "loss": 0.4285730361938477, + "memory(GiB)": 91.64, + "step": 580, + "token_acc": 0.8572246065808298, + "train_speed(iter/s)": 0.134957 + }, + { + "epoch": 0.05520951302378256, + "grad_norm": 1.1914304494857788, + "learning_rate": 0.0001999852676980083, + "loss": 0.4361170768737793, + "memory(GiB)": 91.64, + "step": 585, + "token_acc": 0.8608202443280978, + "train_speed(iter/s)": 0.134985 + }, + { + "epoch": 0.05568138920347301, + "grad_norm": 1.619393229484558, + "learning_rate": 0.00019998246742491596, + "loss": 0.4270325660705566, + "memory(GiB)": 91.64, + "step": 590, + "token_acc": 0.8668300653594772, + "train_speed(iter/s)": 0.13503 + }, + { + "epoch": 0.05615326538316346, + "grad_norm": 0.7896348237991333, + "learning_rate": 0.00019997942367946437, + "loss": 0.43086681365966795, + "memory(GiB)": 91.64, + "step": 595, + "token_acc": 0.8616211374832065, + "train_speed(iter/s)": 0.135073 + }, + { + "epoch": 0.056625141562853906, + "grad_norm": 0.5002231597900391, + "learning_rate": 0.00019997613646906544, + "loss": 0.4146383285522461, + "memory(GiB)": 91.64, + "step": 600, + "token_acc": 0.8878950506857484, + "train_speed(iter/s)": 0.135104 + }, + { + "epoch": 0.05709701774254436, + "grad_norm": 0.8216480612754822, + "learning_rate": 0.00019997260580172408, + "loss": 0.4246044158935547, + "memory(GiB)": 91.64, + "step": 605, + "token_acc": 0.8645484949832776, + "train_speed(iter/s)": 0.135117 + }, + { + "epoch": 0.057568893922234805, + "grad_norm": 0.4890948534011841, + "learning_rate": 0.000199968831686038, + "loss": 0.42087130546569823, + "memory(GiB)": 91.64, + "step": 610, + "token_acc": 0.8492407809110629, + "train_speed(iter/s)": 0.135152 + }, + { + "epoch": 0.05804077010192525, + "grad_norm": 0.6820136308670044, + "learning_rate": 0.00019996481413119772, + "loss": 0.41388683319091796, + "memory(GiB)": 91.64, + "step": 615, + "token_acc": 0.8731758165392633, + "train_speed(iter/s)": 0.135191 + }, + { + "epoch": 0.058512646281615704, + "grad_norm": 1.1169610023498535, + "learning_rate": 0.00019996055314698658, + "loss": 0.4246358394622803, + "memory(GiB)": 91.64, + "step": 620, + "token_acc": 0.8521199586349535, + "train_speed(iter/s)": 0.135201 + }, + { + "epoch": 0.05898452246130615, + "grad_norm": 0.9080762267112732, + "learning_rate": 0.0001999560487437808, + "loss": 0.41969666481018064, + "memory(GiB)": 91.64, + "step": 625, + "token_acc": 0.842851667305481, + "train_speed(iter/s)": 0.135239 + }, + { + "epoch": 0.059456398640996604, + "grad_norm": 0.5982246398925781, + "learning_rate": 0.0001999513009325491, + "loss": 0.4150944709777832, + "memory(GiB)": 91.64, + "step": 630, + "token_acc": 0.8643006263048016, + "train_speed(iter/s)": 0.135284 + }, + { + "epoch": 0.05992827482068705, + "grad_norm": 0.8043394088745117, + "learning_rate": 0.00019994630972485332, + "loss": 0.42877888679504395, + "memory(GiB)": 91.64, + "step": 635, + "token_acc": 0.8495774647887324, + "train_speed(iter/s)": 0.135316 + }, + { + "epoch": 0.0604001510003775, + "grad_norm": 0.7642725706100464, + "learning_rate": 0.00019994107513284767, + "loss": 0.41966772079467773, + "memory(GiB)": 91.64, + "step": 640, + "token_acc": 0.8726851851851852, + "train_speed(iter/s)": 0.135354 + }, + { + "epoch": 0.06087202718006795, + "grad_norm": 0.5839939713478088, + "learning_rate": 0.00019993559716927924, + "loss": 0.41250057220458985, + "memory(GiB)": 91.64, + "step": 645, + "token_acc": 0.8587731811697575, + "train_speed(iter/s)": 0.135396 + }, + { + "epoch": 0.0613439033597584, + "grad_norm": 0.7859387397766113, + "learning_rate": 0.00019992987584748764, + "loss": 0.4213667392730713, + "memory(GiB)": 91.64, + "step": 650, + "token_acc": 0.8893985728848114, + "train_speed(iter/s)": 0.135438 + }, + { + "epoch": 0.06181577953944885, + "grad_norm": 0.5287458300590515, + "learning_rate": 0.00019992391118140517, + "loss": 0.41655263900756834, + "memory(GiB)": 91.64, + "step": 655, + "token_acc": 0.8686520376175548, + "train_speed(iter/s)": 0.135466 + }, + { + "epoch": 0.0622876557191393, + "grad_norm": 0.9565641283988953, + "learning_rate": 0.00019991770318555672, + "loss": 0.4196015357971191, + "memory(GiB)": 91.64, + "step": 660, + "token_acc": 0.8530906011854361, + "train_speed(iter/s)": 0.135483 + }, + { + "epoch": 0.06275953189882974, + "grad_norm": 0.6690832376480103, + "learning_rate": 0.00019991125187505965, + "loss": 0.41780900955200195, + "memory(GiB)": 91.64, + "step": 665, + "token_acc": 0.8785834738617201, + "train_speed(iter/s)": 0.135505 + }, + { + "epoch": 0.0632314080785202, + "grad_norm": 0.4967804253101349, + "learning_rate": 0.0001999045572656239, + "loss": 0.41567211151123046, + "memory(GiB)": 91.64, + "step": 670, + "token_acc": 0.8707653701380176, + "train_speed(iter/s)": 0.135538 + }, + { + "epoch": 0.06370328425821065, + "grad_norm": 0.5052830576896667, + "learning_rate": 0.00019989761937355186, + "loss": 0.4151804447174072, + "memory(GiB)": 91.64, + "step": 675, + "token_acc": 0.8579277864992151, + "train_speed(iter/s)": 0.135555 + }, + { + "epoch": 0.0641751604379011, + "grad_norm": 0.9486806392669678, + "learning_rate": 0.0001998904382157383, + "loss": 0.4172024726867676, + "memory(GiB)": 91.64, + "step": 680, + "token_acc": 0.8693062368605466, + "train_speed(iter/s)": 0.135583 + }, + { + "epoch": 0.06464703661759154, + "grad_norm": 0.8576740622520447, + "learning_rate": 0.00019988301380967046, + "loss": 0.40970048904418943, + "memory(GiB)": 91.64, + "step": 685, + "token_acc": 0.8697332817935833, + "train_speed(iter/s)": 0.135611 + }, + { + "epoch": 0.06511891279728199, + "grad_norm": 1.150675654411316, + "learning_rate": 0.0001998753461734279, + "loss": 0.42277183532714846, + "memory(GiB)": 91.64, + "step": 690, + "token_acc": 0.8751705320600273, + "train_speed(iter/s)": 0.135635 + }, + { + "epoch": 0.06559078897697244, + "grad_norm": 1.0340559482574463, + "learning_rate": 0.0001998674353256824, + "loss": 0.41498851776123047, + "memory(GiB)": 91.64, + "step": 695, + "token_acc": 0.8739859383450513, + "train_speed(iter/s)": 0.135653 + }, + { + "epoch": 0.0660626651566629, + "grad_norm": 1.0053695440292358, + "learning_rate": 0.00019985928128569814, + "loss": 0.41504592895507814, + "memory(GiB)": 91.64, + "step": 700, + "token_acc": 0.8800152846771112, + "train_speed(iter/s)": 0.135671 + }, + { + "epoch": 0.06653454133635334, + "grad_norm": 0.6694364547729492, + "learning_rate": 0.00019985088407333137, + "loss": 0.41004395484924316, + "memory(GiB)": 91.64, + "step": 705, + "token_acc": 0.8829944002357796, + "train_speed(iter/s)": 0.1357 + }, + { + "epoch": 0.06700641751604379, + "grad_norm": 0.6840929388999939, + "learning_rate": 0.0001998422437090306, + "loss": 0.4107684135437012, + "memory(GiB)": 91.64, + "step": 710, + "token_acc": 0.8496487119437939, + "train_speed(iter/s)": 0.135697 + }, + { + "epoch": 0.06747829369573424, + "grad_norm": 0.8968580365180969, + "learning_rate": 0.00019983336021383642, + "loss": 0.40598034858703613, + "memory(GiB)": 91.64, + "step": 715, + "token_acc": 0.8637623762376238, + "train_speed(iter/s)": 0.135716 + }, + { + "epoch": 0.06795016987542468, + "grad_norm": 1.14011549949646, + "learning_rate": 0.0001998242336093815, + "loss": 0.4146572113037109, + "memory(GiB)": 91.64, + "step": 720, + "token_acc": 0.8786955196586407, + "train_speed(iter/s)": 0.135739 + }, + { + "epoch": 0.06842204605511513, + "grad_norm": 1.8799290657043457, + "learning_rate": 0.00019981486391789044, + "loss": 0.4101266860961914, + "memory(GiB)": 91.64, + "step": 725, + "token_acc": 0.8825644098262433, + "train_speed(iter/s)": 0.13576 + }, + { + "epoch": 0.06889392223480559, + "grad_norm": 1.097159743309021, + "learning_rate": 0.00019980525116217987, + "loss": 0.41389617919921873, + "memory(GiB)": 91.64, + "step": 730, + "token_acc": 0.8438617401668653, + "train_speed(iter/s)": 0.135793 + }, + { + "epoch": 0.06936579841449604, + "grad_norm": 0.39971891045570374, + "learning_rate": 0.00019979539536565835, + "loss": 0.4141077518463135, + "memory(GiB)": 91.64, + "step": 735, + "token_acc": 0.8802670004171882, + "train_speed(iter/s)": 0.135817 + }, + { + "epoch": 0.06983767459418648, + "grad_norm": 3.0273921489715576, + "learning_rate": 0.00019978529655232614, + "loss": 0.43627090454101564, + "memory(GiB)": 91.64, + "step": 740, + "token_acc": 0.8638635695383711, + "train_speed(iter/s)": 0.135832 + }, + { + "epoch": 0.07030955077387693, + "grad_norm": 1.658718228340149, + "learning_rate": 0.00019977495474677543, + "loss": 0.4568758010864258, + "memory(GiB)": 91.64, + "step": 745, + "token_acc": 0.8712955122777307, + "train_speed(iter/s)": 0.135857 + }, + { + "epoch": 0.07078142695356739, + "grad_norm": 0.8811335563659668, + "learning_rate": 0.00019976436997419004, + "loss": 0.405039119720459, + "memory(GiB)": 91.64, + "step": 750, + "token_acc": 0.8927808628791115, + "train_speed(iter/s)": 0.135875 + }, + { + "epoch": 0.07125330313325784, + "grad_norm": 0.4463726878166199, + "learning_rate": 0.00019975354226034554, + "loss": 0.3967690706253052, + "memory(GiB)": 91.64, + "step": 755, + "token_acc": 0.8465842167255595, + "train_speed(iter/s)": 0.135889 + }, + { + "epoch": 0.07172517931294828, + "grad_norm": 1.1370724439620972, + "learning_rate": 0.00019974247163160897, + "loss": 0.4012022018432617, + "memory(GiB)": 91.64, + "step": 760, + "token_acc": 0.8686626746506986, + "train_speed(iter/s)": 0.135918 + }, + { + "epoch": 0.07219705549263873, + "grad_norm": 0.6569880843162537, + "learning_rate": 0.00019973115811493903, + "loss": 0.39702539443969725, + "memory(GiB)": 91.64, + "step": 765, + "token_acc": 0.8665835411471322, + "train_speed(iter/s)": 0.135947 + }, + { + "epoch": 0.07266893167232918, + "grad_norm": 0.48011693358421326, + "learning_rate": 0.00019971960173788581, + "loss": 0.4035386085510254, + "memory(GiB)": 91.64, + "step": 770, + "token_acc": 0.8578104138851802, + "train_speed(iter/s)": 0.135961 + }, + { + "epoch": 0.07314080785201962, + "grad_norm": 0.4577121436595917, + "learning_rate": 0.00019970780252859087, + "loss": 0.39451889991760253, + "memory(GiB)": 91.64, + "step": 775, + "token_acc": 0.8687527162103433, + "train_speed(iter/s)": 0.135979 + }, + { + "epoch": 0.07361268403171008, + "grad_norm": 0.7687115669250488, + "learning_rate": 0.000199695760515787, + "loss": 0.40027799606323244, + "memory(GiB)": 91.64, + "step": 780, + "token_acc": 0.8462029355456286, + "train_speed(iter/s)": 0.135999 + }, + { + "epoch": 0.07408456021140053, + "grad_norm": 0.5802925825119019, + "learning_rate": 0.00019968347572879835, + "loss": 0.3972629070281982, + "memory(GiB)": 91.64, + "step": 785, + "token_acc": 0.8566473988439306, + "train_speed(iter/s)": 0.136023 + }, + { + "epoch": 0.07455643639109098, + "grad_norm": 0.4147838056087494, + "learning_rate": 0.0001996709481975402, + "loss": 0.40576868057250975, + "memory(GiB)": 91.64, + "step": 790, + "token_acc": 0.8650012010569301, + "train_speed(iter/s)": 0.136051 + }, + { + "epoch": 0.07502831257078142, + "grad_norm": 0.7321649789810181, + "learning_rate": 0.00019965817795251903, + "loss": 0.3949580669403076, + "memory(GiB)": 91.64, + "step": 795, + "token_acc": 0.8702346041055719, + "train_speed(iter/s)": 0.136065 + }, + { + "epoch": 0.07550018875047187, + "grad_norm": 0.42227479815483093, + "learning_rate": 0.00019964516502483224, + "loss": 0.40514750480651857, + "memory(GiB)": 91.64, + "step": 800, + "token_acc": 0.8411989795918368, + "train_speed(iter/s)": 0.136081 + }, + { + "epoch": 0.07597206493016233, + "grad_norm": 1.0944310426712036, + "learning_rate": 0.0001996319094461683, + "loss": 0.3979844808578491, + "memory(GiB)": 91.64, + "step": 805, + "token_acc": 0.8827761320355481, + "train_speed(iter/s)": 0.136098 + }, + { + "epoch": 0.07644394110985278, + "grad_norm": 1.040534496307373, + "learning_rate": 0.00019961841124880656, + "loss": 0.39571661949157716, + "memory(GiB)": 91.64, + "step": 810, + "token_acc": 0.8898480662983426, + "train_speed(iter/s)": 0.13611 + }, + { + "epoch": 0.07691581728954322, + "grad_norm": 0.605493426322937, + "learning_rate": 0.00019960467046561712, + "loss": 0.4014116287231445, + "memory(GiB)": 91.64, + "step": 815, + "token_acc": 0.8638888888888889, + "train_speed(iter/s)": 0.136124 + }, + { + "epoch": 0.07738769346923367, + "grad_norm": 0.7245599627494812, + "learning_rate": 0.0001995906871300609, + "loss": 0.39299564361572265, + "memory(GiB)": 91.64, + "step": 820, + "token_acc": 0.8532883642495784, + "train_speed(iter/s)": 0.136151 + }, + { + "epoch": 0.07785956964892413, + "grad_norm": 0.6854239702224731, + "learning_rate": 0.00019957646127618937, + "loss": 0.403260326385498, + "memory(GiB)": 91.64, + "step": 825, + "token_acc": 0.8646728971962617, + "train_speed(iter/s)": 0.136172 + }, + { + "epoch": 0.07833144582861457, + "grad_norm": 0.603828489780426, + "learning_rate": 0.00019956199293864467, + "loss": 0.41519527435302733, + "memory(GiB)": 91.64, + "step": 830, + "token_acc": 0.8636763412489006, + "train_speed(iter/s)": 0.13618 + }, + { + "epoch": 0.07880332200830502, + "grad_norm": 1.0101585388183594, + "learning_rate": 0.00019954728215265937, + "loss": 0.3980675935745239, + "memory(GiB)": 91.64, + "step": 835, + "token_acc": 0.8703146374829002, + "train_speed(iter/s)": 0.136184 + }, + { + "epoch": 0.07927519818799547, + "grad_norm": 0.4053173065185547, + "learning_rate": 0.00019953232895405644, + "loss": 0.3960963010787964, + "memory(GiB)": 91.64, + "step": 840, + "token_acc": 0.8587474472430224, + "train_speed(iter/s)": 0.136185 + }, + { + "epoch": 0.07974707436768592, + "grad_norm": 0.5084025859832764, + "learning_rate": 0.0001995171333792492, + "loss": 0.4032759666442871, + "memory(GiB)": 91.64, + "step": 845, + "token_acc": 0.8585987261146497, + "train_speed(iter/s)": 0.136191 + }, + { + "epoch": 0.08021895054737636, + "grad_norm": 0.4399580955505371, + "learning_rate": 0.0001995016954652411, + "loss": 0.39526617527008057, + "memory(GiB)": 91.64, + "step": 850, + "token_acc": 0.8773885350318471, + "train_speed(iter/s)": 0.136206 + }, + { + "epoch": 0.08069082672706682, + "grad_norm": 0.9100202322006226, + "learning_rate": 0.00019948601524962588, + "loss": 0.4031196117401123, + "memory(GiB)": 91.64, + "step": 855, + "token_acc": 0.8691389599317988, + "train_speed(iter/s)": 0.13623 + }, + { + "epoch": 0.08116270290675727, + "grad_norm": 0.5204232335090637, + "learning_rate": 0.00019947009277058712, + "loss": 0.38583035469055177, + "memory(GiB)": 91.64, + "step": 860, + "token_acc": 0.8842271293375394, + "train_speed(iter/s)": 0.136259 + }, + { + "epoch": 0.08163457908644772, + "grad_norm": 0.770131528377533, + "learning_rate": 0.00019945392806689855, + "loss": 0.3988801956176758, + "memory(GiB)": 91.64, + "step": 865, + "token_acc": 0.8591459528362014, + "train_speed(iter/s)": 0.136277 + }, + { + "epoch": 0.08210645526613816, + "grad_norm": 1.0950678586959839, + "learning_rate": 0.00019943752117792358, + "loss": 0.40749850273132326, + "memory(GiB)": 91.64, + "step": 870, + "token_acc": 0.8809041835357625, + "train_speed(iter/s)": 0.136294 + }, + { + "epoch": 0.08257833144582861, + "grad_norm": 0.7339961528778076, + "learning_rate": 0.00019942087214361548, + "loss": 0.40170702934265134, + "memory(GiB)": 91.64, + "step": 875, + "token_acc": 0.8847161572052402, + "train_speed(iter/s)": 0.136308 + }, + { + "epoch": 0.08305020762551907, + "grad_norm": 0.5357945561408997, + "learning_rate": 0.0001994039810045172, + "loss": 0.3935965061187744, + "memory(GiB)": 91.64, + "step": 880, + "token_acc": 0.8578924355050285, + "train_speed(iter/s)": 0.136328 + }, + { + "epoch": 0.08352208380520951, + "grad_norm": 0.9235381484031677, + "learning_rate": 0.0001993868478017611, + "loss": 0.39939312934875487, + "memory(GiB)": 91.64, + "step": 885, + "token_acc": 0.8347355769230769, + "train_speed(iter/s)": 0.136345 + }, + { + "epoch": 0.08399395998489996, + "grad_norm": 0.5018760561943054, + "learning_rate": 0.00019936947257706921, + "loss": 0.398266339302063, + "memory(GiB)": 91.64, + "step": 890, + "token_acc": 0.8601160013647219, + "train_speed(iter/s)": 0.136359 + }, + { + "epoch": 0.08446583616459041, + "grad_norm": 0.48619768023490906, + "learning_rate": 0.00019935185537275278, + "loss": 0.39243621826171876, + "memory(GiB)": 91.64, + "step": 895, + "token_acc": 0.8709032773780975, + "train_speed(iter/s)": 0.136385 + }, + { + "epoch": 0.08493771234428087, + "grad_norm": 0.6260401606559753, + "learning_rate": 0.00019933399623171236, + "loss": 0.39727630615234377, + "memory(GiB)": 91.64, + "step": 900, + "token_acc": 0.8716502115655853, + "train_speed(iter/s)": 0.136395 + }, + { + "epoch": 0.0854095885239713, + "grad_norm": 0.4643126428127289, + "learning_rate": 0.00019931589519743765, + "loss": 0.39346966743469236, + "memory(GiB)": 91.64, + "step": 905, + "token_acc": 0.8780300115429012, + "train_speed(iter/s)": 0.136405 + }, + { + "epoch": 0.08588146470366176, + "grad_norm": 0.5169233679771423, + "learning_rate": 0.00019929755231400735, + "loss": 0.3928957939147949, + "memory(GiB)": 91.64, + "step": 910, + "token_acc": 0.8716216216216216, + "train_speed(iter/s)": 0.136426 + }, + { + "epoch": 0.08635334088335221, + "grad_norm": 0.39350777864456177, + "learning_rate": 0.00019927896762608922, + "loss": 0.3948735952377319, + "memory(GiB)": 91.64, + "step": 915, + "token_acc": 0.8647242455775234, + "train_speed(iter/s)": 0.136437 + }, + { + "epoch": 0.08682521706304266, + "grad_norm": 0.40696701407432556, + "learning_rate": 0.0001992601411789397, + "loss": 0.38743617534637453, + "memory(GiB)": 91.64, + "step": 920, + "token_acc": 0.8676893576222435, + "train_speed(iter/s)": 0.136452 + }, + { + "epoch": 0.0872970932427331, + "grad_norm": 0.4113665521144867, + "learning_rate": 0.00019924107301840408, + "loss": 0.3946674823760986, + "memory(GiB)": 91.64, + "step": 925, + "token_acc": 0.8602645198389879, + "train_speed(iter/s)": 0.136484 + }, + { + "epoch": 0.08776896942242356, + "grad_norm": 0.3710317015647888, + "learning_rate": 0.00019922176319091617, + "loss": 0.3960568904876709, + "memory(GiB)": 91.64, + "step": 930, + "token_acc": 0.8746690203000883, + "train_speed(iter/s)": 0.136487 + }, + { + "epoch": 0.08824084560211401, + "grad_norm": 0.4853060245513916, + "learning_rate": 0.0001992022117434983, + "loss": 0.39677085876464846, + "memory(GiB)": 91.64, + "step": 935, + "token_acc": 0.8561736770691994, + "train_speed(iter/s)": 0.136506 + }, + { + "epoch": 0.08871272178180445, + "grad_norm": 0.49833860993385315, + "learning_rate": 0.0001991824187237612, + "loss": 0.39057235717773436, + "memory(GiB)": 91.64, + "step": 940, + "token_acc": 0.852015732546706, + "train_speed(iter/s)": 0.136515 + }, + { + "epoch": 0.0891845979614949, + "grad_norm": 0.8697565793991089, + "learning_rate": 0.00019916238417990386, + "loss": 0.39734203815460206, + "memory(GiB)": 91.64, + "step": 945, + "token_acc": 0.8645515558267236, + "train_speed(iter/s)": 0.136517 + }, + { + "epoch": 0.08965647414118535, + "grad_norm": 0.3841949701309204, + "learning_rate": 0.0001991421081607134, + "loss": 0.39315037727355956, + "memory(GiB)": 91.64, + "step": 950, + "token_acc": 0.8595624558927312, + "train_speed(iter/s)": 0.136535 + }, + { + "epoch": 0.09012835032087581, + "grad_norm": 0.6174625158309937, + "learning_rate": 0.00019912159071556497, + "loss": 0.3937983512878418, + "memory(GiB)": 91.64, + "step": 955, + "token_acc": 0.8652637332604537, + "train_speed(iter/s)": 0.136548 + }, + { + "epoch": 0.09060022650056625, + "grad_norm": 0.5916112065315247, + "learning_rate": 0.0001991008318944217, + "loss": 0.3945147037506104, + "memory(GiB)": 91.64, + "step": 960, + "token_acc": 0.8693599160545645, + "train_speed(iter/s)": 0.136565 + }, + { + "epoch": 0.0910721026802567, + "grad_norm": 0.6667875051498413, + "learning_rate": 0.00019907983174783433, + "loss": 0.38831090927124023, + "memory(GiB)": 91.64, + "step": 965, + "token_acc": 0.8872151409810738, + "train_speed(iter/s)": 0.13657 + }, + { + "epoch": 0.09154397885994715, + "grad_norm": 0.4363083839416504, + "learning_rate": 0.00019905859032694147, + "loss": 0.3933609962463379, + "memory(GiB)": 91.64, + "step": 970, + "token_acc": 0.8592730661696178, + "train_speed(iter/s)": 0.136591 + }, + { + "epoch": 0.09201585503963759, + "grad_norm": 0.5822863578796387, + "learning_rate": 0.00019903710768346918, + "loss": 0.38769237995147704, + "memory(GiB)": 91.64, + "step": 975, + "token_acc": 0.8806196840826246, + "train_speed(iter/s)": 0.136602 + }, + { + "epoch": 0.09248773121932805, + "grad_norm": 0.342798113822937, + "learning_rate": 0.00019901538386973085, + "loss": 0.39563870429992676, + "memory(GiB)": 91.64, + "step": 980, + "token_acc": 0.8935498421290031, + "train_speed(iter/s)": 0.13661 + }, + { + "epoch": 0.0929596073990185, + "grad_norm": 0.38594508171081543, + "learning_rate": 0.0001989934189386273, + "loss": 0.39223246574401854, + "memory(GiB)": 91.64, + "step": 985, + "token_acc": 0.8759615384615385, + "train_speed(iter/s)": 0.136622 + }, + { + "epoch": 0.09343148357870895, + "grad_norm": 0.4435591399669647, + "learning_rate": 0.00019897121294364643, + "loss": 0.39308857917785645, + "memory(GiB)": 91.64, + "step": 990, + "token_acc": 0.8816035968527538, + "train_speed(iter/s)": 0.136635 + }, + { + "epoch": 0.09390335975839939, + "grad_norm": 0.5884724855422974, + "learning_rate": 0.0001989487659388632, + "loss": 0.3950310707092285, + "memory(GiB)": 91.64, + "step": 995, + "token_acc": 0.8725490196078431, + "train_speed(iter/s)": 0.136656 + }, + { + "epoch": 0.09437523593808984, + "grad_norm": 0.785302460193634, + "learning_rate": 0.00019892607797893943, + "loss": 0.3945254564285278, + "memory(GiB)": 91.64, + "step": 1000, + "token_acc": 0.8808167141500475, + "train_speed(iter/s)": 0.136669 + }, + { + "epoch": 0.0948471121177803, + "grad_norm": 0.4744601547718048, + "learning_rate": 0.00019890314911912368, + "loss": 0.3849745750427246, + "memory(GiB)": 91.64, + "step": 1005, + "token_acc": 0.8613861386138614, + "train_speed(iter/s)": 0.13669 + }, + { + "epoch": 0.09531898829747075, + "grad_norm": 0.5876829624176025, + "learning_rate": 0.00019887997941525124, + "loss": 0.3877379894256592, + "memory(GiB)": 91.64, + "step": 1010, + "token_acc": 0.8721071863580999, + "train_speed(iter/s)": 0.1367 + }, + { + "epoch": 0.09579086447716119, + "grad_norm": 0.8457826375961304, + "learning_rate": 0.00019885656892374378, + "loss": 0.3987894535064697, + "memory(GiB)": 91.64, + "step": 1015, + "token_acc": 0.8600823045267489, + "train_speed(iter/s)": 0.136711 + }, + { + "epoch": 0.09626274065685164, + "grad_norm": 1.2091189622879028, + "learning_rate": 0.00019883291770160942, + "loss": 0.38491311073303225, + "memory(GiB)": 91.64, + "step": 1020, + "token_acc": 0.8802992518703242, + "train_speed(iter/s)": 0.136726 + }, + { + "epoch": 0.0967346168365421, + "grad_norm": 0.43290698528289795, + "learning_rate": 0.0001988090258064424, + "loss": 0.3924283027648926, + "memory(GiB)": 91.64, + "step": 1025, + "token_acc": 0.8670157068062827, + "train_speed(iter/s)": 0.136735 + }, + { + "epoch": 0.09720649301623253, + "grad_norm": 0.4230733811855316, + "learning_rate": 0.00019878489329642308, + "loss": 0.38915348052978516, + "memory(GiB)": 91.64, + "step": 1030, + "token_acc": 0.8749527767283718, + "train_speed(iter/s)": 0.136743 + }, + { + "epoch": 0.09767836919592299, + "grad_norm": 0.5611448884010315, + "learning_rate": 0.00019876052023031778, + "loss": 0.3783283233642578, + "memory(GiB)": 91.64, + "step": 1035, + "token_acc": 0.8774747852073216, + "train_speed(iter/s)": 0.13675 + }, + { + "epoch": 0.09815024537561344, + "grad_norm": 0.8954288363456726, + "learning_rate": 0.00019873590666747855, + "loss": 0.38250117301940917, + "memory(GiB)": 91.64, + "step": 1040, + "token_acc": 0.8612850082372323, + "train_speed(iter/s)": 0.136771 + }, + { + "epoch": 0.0986221215553039, + "grad_norm": 0.601012110710144, + "learning_rate": 0.00019871105266784317, + "loss": 0.38401503562927247, + "memory(GiB)": 91.64, + "step": 1045, + "token_acc": 0.8982758620689655, + "train_speed(iter/s)": 0.13678 + }, + { + "epoch": 0.09909399773499433, + "grad_norm": 0.5251648426055908, + "learning_rate": 0.00019868595829193486, + "loss": 0.38367199897766113, + "memory(GiB)": 91.64, + "step": 1050, + "token_acc": 0.8643162393162394, + "train_speed(iter/s)": 0.136804 + }, + { + "epoch": 0.09956587391468479, + "grad_norm": 0.553338348865509, + "learning_rate": 0.00019866062360086216, + "loss": 0.383012580871582, + "memory(GiB)": 91.64, + "step": 1055, + "token_acc": 0.8829745596868884, + "train_speed(iter/s)": 0.13682 + }, + { + "epoch": 0.10003775009437524, + "grad_norm": 0.48636507987976074, + "learning_rate": 0.00019863504865631892, + "loss": 0.39072608947753906, + "memory(GiB)": 91.64, + "step": 1060, + "token_acc": 0.8856960408684547, + "train_speed(iter/s)": 0.136826 + }, + { + "epoch": 0.10050962627406569, + "grad_norm": 0.560430645942688, + "learning_rate": 0.00019860923352058393, + "loss": 0.3910938262939453, + "memory(GiB)": 91.64, + "step": 1065, + "token_acc": 0.8764075067024129, + "train_speed(iter/s)": 0.136831 + }, + { + "epoch": 0.10098150245375613, + "grad_norm": 0.5854150652885437, + "learning_rate": 0.00019858317825652096, + "loss": 0.38489365577697754, + "memory(GiB)": 91.64, + "step": 1070, + "token_acc": 0.8923697270471465, + "train_speed(iter/s)": 0.136832 + }, + { + "epoch": 0.10145337863344658, + "grad_norm": 0.47770190238952637, + "learning_rate": 0.00019855688292757848, + "loss": 0.39474029541015626, + "memory(GiB)": 91.64, + "step": 1075, + "token_acc": 0.8581871345029239, + "train_speed(iter/s)": 0.136842 + }, + { + "epoch": 0.10192525481313704, + "grad_norm": 0.6249523162841797, + "learning_rate": 0.00019853034759778957, + "loss": 0.37869269847869874, + "memory(GiB)": 91.64, + "step": 1080, + "token_acc": 0.8848101265822785, + "train_speed(iter/s)": 0.136845 + }, + { + "epoch": 0.10239713099282748, + "grad_norm": 0.5587074756622314, + "learning_rate": 0.00019850357233177176, + "loss": 0.37889723777770995, + "memory(GiB)": 91.64, + "step": 1085, + "token_acc": 0.8695814648729447, + "train_speed(iter/s)": 0.136852 + }, + { + "epoch": 0.10286900717251793, + "grad_norm": 0.456506609916687, + "learning_rate": 0.00019847655719472688, + "loss": 0.37969346046447755, + "memory(GiB)": 91.64, + "step": 1090, + "token_acc": 0.8711864406779661, + "train_speed(iter/s)": 0.136854 + }, + { + "epoch": 0.10334088335220838, + "grad_norm": 0.48825180530548096, + "learning_rate": 0.00019844930225244083, + "loss": 0.3927449226379395, + "memory(GiB)": 91.64, + "step": 1095, + "token_acc": 0.8670253651037664, + "train_speed(iter/s)": 0.136853 + }, + { + "epoch": 0.10381275953189883, + "grad_norm": 0.33617979288101196, + "learning_rate": 0.0001984218075712835, + "loss": 0.3827697277069092, + "memory(GiB)": 91.64, + "step": 1100, + "token_acc": 0.8889289578074288, + "train_speed(iter/s)": 0.136858 + }, + { + "epoch": 0.10428463571158927, + "grad_norm": 0.5517711639404297, + "learning_rate": 0.00019839407321820858, + "loss": 0.3828376293182373, + "memory(GiB)": 91.64, + "step": 1105, + "token_acc": 0.8939336131247615, + "train_speed(iter/s)": 0.136873 + }, + { + "epoch": 0.10475651189127973, + "grad_norm": 1.1416659355163574, + "learning_rate": 0.0001983660992607534, + "loss": 0.39458913803100587, + "memory(GiB)": 91.64, + "step": 1110, + "token_acc": 0.8489618218352311, + "train_speed(iter/s)": 0.136884 + }, + { + "epoch": 0.10522838807097018, + "grad_norm": 0.3741232752799988, + "learning_rate": 0.00019833788576703875, + "loss": 0.3905258893966675, + "memory(GiB)": 91.64, + "step": 1115, + "token_acc": 0.8970338983050847, + "train_speed(iter/s)": 0.136893 + }, + { + "epoch": 0.10570026425066063, + "grad_norm": 0.4323633015155792, + "learning_rate": 0.00019830943280576874, + "loss": 0.38057544231414797, + "memory(GiB)": 91.64, + "step": 1120, + "token_acc": 0.8544902093180283, + "train_speed(iter/s)": 0.136906 + }, + { + "epoch": 0.10617214043035107, + "grad_norm": 0.5611393451690674, + "learning_rate": 0.0001982807404462306, + "loss": 0.3737280607223511, + "memory(GiB)": 91.64, + "step": 1125, + "token_acc": 0.8825597749648383, + "train_speed(iter/s)": 0.136914 + }, + { + "epoch": 0.10664401661004153, + "grad_norm": 0.39299675822257996, + "learning_rate": 0.00019825180875829456, + "loss": 0.37208285331726076, + "memory(GiB)": 91.64, + "step": 1130, + "token_acc": 0.8761904761904762, + "train_speed(iter/s)": 0.136918 + }, + { + "epoch": 0.10711589278973198, + "grad_norm": 0.6676709055900574, + "learning_rate": 0.0001982226378124136, + "loss": 0.37977089881896975, + "memory(GiB)": 91.64, + "step": 1135, + "token_acc": 0.883206106870229, + "train_speed(iter/s)": 0.136928 + }, + { + "epoch": 0.10758776896942242, + "grad_norm": 0.4939993619918823, + "learning_rate": 0.00019819322767962344, + "loss": 0.3877007007598877, + "memory(GiB)": 91.64, + "step": 1140, + "token_acc": 0.8383121732636296, + "train_speed(iter/s)": 0.136939 + }, + { + "epoch": 0.10805964514911287, + "grad_norm": 0.3608226776123047, + "learning_rate": 0.00019816357843154212, + "loss": 0.37889995574951174, + "memory(GiB)": 91.64, + "step": 1145, + "token_acc": 0.8914505283381364, + "train_speed(iter/s)": 0.136944 + }, + { + "epoch": 0.10853152132880332, + "grad_norm": 0.45509615540504456, + "learning_rate": 0.00019813369014037003, + "loss": 0.38171145915985105, + "memory(GiB)": 91.64, + "step": 1150, + "token_acc": 0.8768656716417911, + "train_speed(iter/s)": 0.136949 + }, + { + "epoch": 0.10900339750849378, + "grad_norm": 0.5381651520729065, + "learning_rate": 0.00019810356287888967, + "loss": 0.38211042881011964, + "memory(GiB)": 91.64, + "step": 1155, + "token_acc": 0.8691718858733473, + "train_speed(iter/s)": 0.136952 + }, + { + "epoch": 0.10947527368818422, + "grad_norm": 0.5979215502738953, + "learning_rate": 0.00019807319672046546, + "loss": 0.37435040473937986, + "memory(GiB)": 91.64, + "step": 1160, + "token_acc": 0.8914702953866578, + "train_speed(iter/s)": 0.136955 + }, + { + "epoch": 0.10994714986787467, + "grad_norm": 0.46846890449523926, + "learning_rate": 0.00019804259173904356, + "loss": 0.3730193614959717, + "memory(GiB)": 91.64, + "step": 1165, + "token_acc": 0.872072072072072, + "train_speed(iter/s)": 0.136965 + }, + { + "epoch": 0.11041902604756512, + "grad_norm": 0.6530776619911194, + "learning_rate": 0.0001980117480091517, + "loss": 0.3740133762359619, + "memory(GiB)": 91.64, + "step": 1170, + "token_acc": 0.8564527260179434, + "train_speed(iter/s)": 0.136979 + }, + { + "epoch": 0.11089090222725557, + "grad_norm": 1.131178379058838, + "learning_rate": 0.000197980665605899, + "loss": 0.3640794277191162, + "memory(GiB)": 91.64, + "step": 1175, + "token_acc": 0.8569848875783266, + "train_speed(iter/s)": 0.136983 + }, + { + "epoch": 0.11136277840694601, + "grad_norm": 0.4460451602935791, + "learning_rate": 0.00019794934460497582, + "loss": 0.3784611225128174, + "memory(GiB)": 91.64, + "step": 1180, + "token_acc": 0.8822531387852053, + "train_speed(iter/s)": 0.136986 + }, + { + "epoch": 0.11183465458663647, + "grad_norm": 0.6955581903457642, + "learning_rate": 0.00019791778508265352, + "loss": 0.3845529556274414, + "memory(GiB)": 91.64, + "step": 1185, + "token_acc": 0.8754593711719069, + "train_speed(iter/s)": 0.137001 + }, + { + "epoch": 0.11230653076632692, + "grad_norm": 0.39989179372787476, + "learning_rate": 0.0001978859871157842, + "loss": 0.3827672958374023, + "memory(GiB)": 91.64, + "step": 1190, + "token_acc": 0.8808139534883721, + "train_speed(iter/s)": 0.137003 + }, + { + "epoch": 0.11277840694601736, + "grad_norm": 0.3942891061306, + "learning_rate": 0.0001978539507818008, + "loss": 0.3772392511367798, + "memory(GiB)": 91.64, + "step": 1195, + "token_acc": 0.8851182197496523, + "train_speed(iter/s)": 0.137013 + }, + { + "epoch": 0.11325028312570781, + "grad_norm": 0.422818660736084, + "learning_rate": 0.00019782167615871657, + "loss": 0.37031795978546145, + "memory(GiB)": 91.64, + "step": 1200, + "token_acc": 0.8915779283639884, + "train_speed(iter/s)": 0.137021 + }, + { + "epoch": 0.11372215930539827, + "grad_norm": 0.8060660362243652, + "learning_rate": 0.00019778916332512507, + "loss": 0.3858052730560303, + "memory(GiB)": 91.64, + "step": 1205, + "token_acc": 0.894151417294582, + "train_speed(iter/s)": 0.137033 + }, + { + "epoch": 0.11419403548508872, + "grad_norm": 0.44119134545326233, + "learning_rate": 0.00019775641236019996, + "loss": 0.37988340854644775, + "memory(GiB)": 91.64, + "step": 1210, + "token_acc": 0.883441258094357, + "train_speed(iter/s)": 0.137044 + }, + { + "epoch": 0.11466591166477916, + "grad_norm": 0.654055655002594, + "learning_rate": 0.00019772342334369478, + "loss": 0.3706467866897583, + "memory(GiB)": 91.64, + "step": 1215, + "token_acc": 0.8836206896551724, + "train_speed(iter/s)": 0.137053 + }, + { + "epoch": 0.11513778784446961, + "grad_norm": 0.45024436712265015, + "learning_rate": 0.00019769019635594272, + "loss": 0.3729224443435669, + "memory(GiB)": 91.64, + "step": 1220, + "token_acc": 0.8666930379746836, + "train_speed(iter/s)": 0.137064 + }, + { + "epoch": 0.11560966402416006, + "grad_norm": 0.3812342882156372, + "learning_rate": 0.00019765673147785652, + "loss": 0.38361666202545164, + "memory(GiB)": 91.64, + "step": 1225, + "token_acc": 0.8751983077736647, + "train_speed(iter/s)": 0.13708 + }, + { + "epoch": 0.1160815402038505, + "grad_norm": 0.39537158608436584, + "learning_rate": 0.0001976230287909282, + "loss": 0.3797783136367798, + "memory(GiB)": 91.64, + "step": 1230, + "token_acc": 0.8721294363256785, + "train_speed(iter/s)": 0.137091 + }, + { + "epoch": 0.11655341638354096, + "grad_norm": 0.5020424127578735, + "learning_rate": 0.00019758908837722884, + "loss": 0.37000179290771484, + "memory(GiB)": 91.64, + "step": 1235, + "token_acc": 0.8931271477663231, + "train_speed(iter/s)": 0.137094 + }, + { + "epoch": 0.11702529256323141, + "grad_norm": 0.4574194550514221, + "learning_rate": 0.00019755491031940854, + "loss": 0.36977810859680177, + "memory(GiB)": 91.64, + "step": 1240, + "token_acc": 0.8930348258706468, + "train_speed(iter/s)": 0.137102 + }, + { + "epoch": 0.11749716874292186, + "grad_norm": 0.6839103698730469, + "learning_rate": 0.0001975204947006959, + "loss": 0.3748194932937622, + "memory(GiB)": 91.64, + "step": 1245, + "token_acc": 0.8971393791844188, + "train_speed(iter/s)": 0.13711 + }, + { + "epoch": 0.1179690449226123, + "grad_norm": 0.43332648277282715, + "learning_rate": 0.0001974858416048982, + "loss": 0.36992840766906737, + "memory(GiB)": 91.64, + "step": 1250, + "token_acc": 0.8771353482260184, + "train_speed(iter/s)": 0.13711 + }, + { + "epoch": 0.11844092110230275, + "grad_norm": 0.7626131176948547, + "learning_rate": 0.00019745095111640094, + "loss": 0.3885170936584473, + "memory(GiB)": 91.64, + "step": 1255, + "token_acc": 0.8769186214885607, + "train_speed(iter/s)": 0.137124 + }, + { + "epoch": 0.11891279728199321, + "grad_norm": 1.1946239471435547, + "learning_rate": 0.00019741582332016773, + "loss": 0.3764191150665283, + "memory(GiB)": 91.64, + "step": 1260, + "token_acc": 0.8837127845884414, + "train_speed(iter/s)": 0.137128 + }, + { + "epoch": 0.11938467346168366, + "grad_norm": 0.37812870740890503, + "learning_rate": 0.00019738045830173997, + "loss": 0.3727047204971313, + "memory(GiB)": 91.64, + "step": 1265, + "token_acc": 0.868295994568907, + "train_speed(iter/s)": 0.137132 + }, + { + "epoch": 0.1198565496413741, + "grad_norm": 0.9704969525337219, + "learning_rate": 0.0001973448561472369, + "loss": 0.3743635892868042, + "memory(GiB)": 91.64, + "step": 1270, + "token_acc": 0.8887052341597796, + "train_speed(iter/s)": 0.137142 + }, + { + "epoch": 0.12032842582106455, + "grad_norm": 0.5081287622451782, + "learning_rate": 0.00019730901694335503, + "loss": 0.4186855316162109, + "memory(GiB)": 91.64, + "step": 1275, + "token_acc": 0.9019308943089431, + "train_speed(iter/s)": 0.137157 + }, + { + "epoch": 0.120800302000755, + "grad_norm": 0.50217205286026, + "learning_rate": 0.0001972729407773683, + "loss": 0.37697782516479494, + "memory(GiB)": 91.64, + "step": 1280, + "token_acc": 0.874384236453202, + "train_speed(iter/s)": 0.137164 + }, + { + "epoch": 0.12127217818044544, + "grad_norm": 0.7912167310714722, + "learning_rate": 0.0001972366277371276, + "loss": 0.3852388381958008, + "memory(GiB)": 91.64, + "step": 1285, + "token_acc": 0.8706407137064072, + "train_speed(iter/s)": 0.137174 + }, + { + "epoch": 0.1217440543601359, + "grad_norm": 0.42602792382240295, + "learning_rate": 0.00019720007791106057, + "loss": 0.3708258390426636, + "memory(GiB)": 91.64, + "step": 1290, + "token_acc": 0.8843969261610425, + "train_speed(iter/s)": 0.13718 + }, + { + "epoch": 0.12221593053982635, + "grad_norm": 0.3974127471446991, + "learning_rate": 0.00019716329138817158, + "loss": 0.3643842935562134, + "memory(GiB)": 91.64, + "step": 1295, + "token_acc": 0.8741738688357905, + "train_speed(iter/s)": 0.137184 + }, + { + "epoch": 0.1226878067195168, + "grad_norm": 0.4702013432979584, + "learning_rate": 0.0001971262682580414, + "loss": 0.3616140604019165, + "memory(GiB)": 91.64, + "step": 1300, + "token_acc": 0.886223191566703, + "train_speed(iter/s)": 0.13719 + }, + { + "epoch": 0.12315968289920724, + "grad_norm": 0.43112415075302124, + "learning_rate": 0.00019708900861082685, + "loss": 0.3715237855911255, + "memory(GiB)": 91.64, + "step": 1305, + "token_acc": 0.886991461577097, + "train_speed(iter/s)": 0.137195 + }, + { + "epoch": 0.1236315590788977, + "grad_norm": 0.5010024905204773, + "learning_rate": 0.00019705151253726082, + "loss": 0.37075207233428953, + "memory(GiB)": 91.64, + "step": 1310, + "token_acc": 0.8941935483870967, + "train_speed(iter/s)": 0.137196 + }, + { + "epoch": 0.12410343525858815, + "grad_norm": 0.6552342772483826, + "learning_rate": 0.0001970137801286519, + "loss": 0.3803473234176636, + "memory(GiB)": 91.64, + "step": 1315, + "token_acc": 0.8743248109470652, + "train_speed(iter/s)": 0.137199 + }, + { + "epoch": 0.1245753114382786, + "grad_norm": 0.45841383934020996, + "learning_rate": 0.00019697581147688417, + "loss": 0.3710304260253906, + "memory(GiB)": 91.64, + "step": 1320, + "token_acc": 0.867526746381372, + "train_speed(iter/s)": 0.137207 + }, + { + "epoch": 0.12504718761796904, + "grad_norm": 0.4373835027217865, + "learning_rate": 0.00019693760667441703, + "loss": 0.35978107452392577, + "memory(GiB)": 91.64, + "step": 1325, + "token_acc": 0.8925126320624713, + "train_speed(iter/s)": 0.137217 + }, + { + "epoch": 0.12551906379765948, + "grad_norm": 0.3811391592025757, + "learning_rate": 0.00019689916581428488, + "loss": 0.3601937770843506, + "memory(GiB)": 91.64, + "step": 1330, + "token_acc": 0.8777089783281734, + "train_speed(iter/s)": 0.137222 + }, + { + "epoch": 0.12599093997734995, + "grad_norm": 0.40880128741264343, + "learning_rate": 0.00019686048899009704, + "loss": 0.3690077066421509, + "memory(GiB)": 91.64, + "step": 1335, + "token_acc": 0.8848944591029023, + "train_speed(iter/s)": 0.137221 + }, + { + "epoch": 0.1264628161570404, + "grad_norm": 0.6039553880691528, + "learning_rate": 0.0001968215762960374, + "loss": 0.37348055839538574, + "memory(GiB)": 91.64, + "step": 1340, + "token_acc": 0.870300204022151, + "train_speed(iter/s)": 0.137226 + }, + { + "epoch": 0.12693469233673085, + "grad_norm": 1.1906999349594116, + "learning_rate": 0.00019678242782686421, + "loss": 0.36064743995666504, + "memory(GiB)": 91.64, + "step": 1345, + "token_acc": 0.8814722395508422, + "train_speed(iter/s)": 0.137241 + }, + { + "epoch": 0.1274065685164213, + "grad_norm": 0.6271125674247742, + "learning_rate": 0.00019674304367790993, + "loss": 0.3587361812591553, + "memory(GiB)": 91.64, + "step": 1350, + "token_acc": 0.8639104220499569, + "train_speed(iter/s)": 0.137247 + }, + { + "epoch": 0.12787844469611173, + "grad_norm": 0.34620094299316406, + "learning_rate": 0.0001967034239450808, + "loss": 0.3652570486068726, + "memory(GiB)": 91.64, + "step": 1355, + "token_acc": 0.879513492968453, + "train_speed(iter/s)": 0.137257 + }, + { + "epoch": 0.1283503208758022, + "grad_norm": 0.5999593734741211, + "learning_rate": 0.00019666356872485695, + "loss": 0.36589975357055665, + "memory(GiB)": 91.64, + "step": 1360, + "token_acc": 0.8735487919673675, + "train_speed(iter/s)": 0.137273 + }, + { + "epoch": 0.12882219705549264, + "grad_norm": 0.4182213842868805, + "learning_rate": 0.00019662347811429172, + "loss": 0.3619654178619385, + "memory(GiB)": 91.64, + "step": 1365, + "token_acc": 0.8859732824427481, + "train_speed(iter/s)": 0.137286 + }, + { + "epoch": 0.12929407323518308, + "grad_norm": 0.7008164525032043, + "learning_rate": 0.0001965831522110119, + "loss": 0.36653695106506345, + "memory(GiB)": 91.64, + "step": 1370, + "token_acc": 0.8621755253399258, + "train_speed(iter/s)": 0.137294 + }, + { + "epoch": 0.12976594941487354, + "grad_norm": 0.5119627118110657, + "learning_rate": 0.00019654259111321704, + "loss": 0.3641792297363281, + "memory(GiB)": 91.64, + "step": 1375, + "token_acc": 0.8765696784073507, + "train_speed(iter/s)": 0.137302 + }, + { + "epoch": 0.13023782559456398, + "grad_norm": 0.6845554709434509, + "learning_rate": 0.00019650179491967955, + "loss": 0.36969287395477296, + "memory(GiB)": 91.64, + "step": 1380, + "token_acc": 0.8856742883807747, + "train_speed(iter/s)": 0.1373 + }, + { + "epoch": 0.13070970177425442, + "grad_norm": 0.7057746052742004, + "learning_rate": 0.00019646076372974432, + "loss": 0.364498496055603, + "memory(GiB)": 91.64, + "step": 1385, + "token_acc": 0.8957236842105263, + "train_speed(iter/s)": 0.137308 + }, + { + "epoch": 0.1311815779539449, + "grad_norm": 0.5785896182060242, + "learning_rate": 0.0001964194976433285, + "loss": 0.3685713768005371, + "memory(GiB)": 91.64, + "step": 1390, + "token_acc": 0.8923533778767632, + "train_speed(iter/s)": 0.137314 + }, + { + "epoch": 0.13165345413363533, + "grad_norm": 0.43366122245788574, + "learning_rate": 0.00019637799676092114, + "loss": 0.3690282106399536, + "memory(GiB)": 91.64, + "step": 1395, + "token_acc": 0.8743633276740238, + "train_speed(iter/s)": 0.137317 + }, + { + "epoch": 0.1321253303133258, + "grad_norm": 0.3319828510284424, + "learning_rate": 0.0001963362611835832, + "loss": 0.3694582462310791, + "memory(GiB)": 91.64, + "step": 1400, + "token_acc": 0.8864421416234888, + "train_speed(iter/s)": 0.137319 + }, + { + "epoch": 0.13259720649301623, + "grad_norm": 0.42545682191848755, + "learning_rate": 0.00019629429101294707, + "loss": 0.3619790315628052, + "memory(GiB)": 91.64, + "step": 1405, + "token_acc": 0.8672329012069736, + "train_speed(iter/s)": 0.137334 + }, + { + "epoch": 0.13306908267270667, + "grad_norm": 0.6269343495368958, + "learning_rate": 0.00019625208635121646, + "loss": 0.3626497983932495, + "memory(GiB)": 91.64, + "step": 1410, + "token_acc": 0.8965665236051502, + "train_speed(iter/s)": 0.137348 + }, + { + "epoch": 0.13354095885239714, + "grad_norm": 0.5690509676933289, + "learning_rate": 0.00019620964730116601, + "loss": 0.35851593017578126, + "memory(GiB)": 91.64, + "step": 1415, + "token_acc": 0.8846794338051623, + "train_speed(iter/s)": 0.137357 + }, + { + "epoch": 0.13401283503208758, + "grad_norm": 0.592526912689209, + "learning_rate": 0.00019616697396614128, + "loss": 0.3695559501647949, + "memory(GiB)": 91.64, + "step": 1420, + "token_acc": 0.8799270072992701, + "train_speed(iter/s)": 0.137358 + }, + { + "epoch": 0.13448471121177802, + "grad_norm": 0.5720744132995605, + "learning_rate": 0.0001961240664500582, + "loss": 0.36823060512542727, + "memory(GiB)": 91.64, + "step": 1425, + "token_acc": 0.8664506839452844, + "train_speed(iter/s)": 0.137366 + }, + { + "epoch": 0.13495658739146849, + "grad_norm": 0.854423463344574, + "learning_rate": 0.00019608092485740307, + "loss": 0.3638261318206787, + "memory(GiB)": 91.64, + "step": 1430, + "token_acc": 0.8540372670807453, + "train_speed(iter/s)": 0.13737 + }, + { + "epoch": 0.13542846357115892, + "grad_norm": 0.40354272723197937, + "learning_rate": 0.00019603754929323214, + "loss": 0.36725308895111086, + "memory(GiB)": 91.64, + "step": 1435, + "token_acc": 0.8717277486910995, + "train_speed(iter/s)": 0.13738 + }, + { + "epoch": 0.13590033975084936, + "grad_norm": 0.5269042253494263, + "learning_rate": 0.00019599393986317147, + "loss": 0.3656820297241211, + "memory(GiB)": 91.64, + "step": 1440, + "token_acc": 0.8932565232124704, + "train_speed(iter/s)": 0.137388 + }, + { + "epoch": 0.13637221593053983, + "grad_norm": 0.47906067967414856, + "learning_rate": 0.00019595009667341655, + "loss": 0.3657586097717285, + "memory(GiB)": 91.64, + "step": 1445, + "token_acc": 0.8870967741935484, + "train_speed(iter/s)": 0.137395 + }, + { + "epoch": 0.13684409211023027, + "grad_norm": 0.821492075920105, + "learning_rate": 0.00019590601983073214, + "loss": 0.3700244665145874, + "memory(GiB)": 91.64, + "step": 1450, + "token_acc": 0.8615896041100031, + "train_speed(iter/s)": 0.137399 + }, + { + "epoch": 0.13731596828992074, + "grad_norm": 1.1818538904190063, + "learning_rate": 0.00019586170944245202, + "loss": 0.36403641700744627, + "memory(GiB)": 91.64, + "step": 1455, + "token_acc": 0.8834304746044963, + "train_speed(iter/s)": 0.137405 + }, + { + "epoch": 0.13778784446961118, + "grad_norm": 0.3865146040916443, + "learning_rate": 0.00019581716561647866, + "loss": 0.3664146661758423, + "memory(GiB)": 91.64, + "step": 1460, + "token_acc": 0.8741547708489857, + "train_speed(iter/s)": 0.137411 + }, + { + "epoch": 0.13825972064930162, + "grad_norm": 0.8478065729141235, + "learning_rate": 0.00019577238846128295, + "loss": 0.37954490184783934, + "memory(GiB)": 91.64, + "step": 1465, + "token_acc": 0.8746763335059554, + "train_speed(iter/s)": 0.137423 + }, + { + "epoch": 0.13873159682899208, + "grad_norm": 0.49488478899002075, + "learning_rate": 0.000195727378085904, + "loss": 0.3570873737335205, + "memory(GiB)": 91.64, + "step": 1470, + "token_acc": 0.8735310576385003, + "train_speed(iter/s)": 0.137431 + }, + { + "epoch": 0.13920347300868252, + "grad_norm": 0.6807460784912109, + "learning_rate": 0.0001956821345999489, + "loss": 0.3615212917327881, + "memory(GiB)": 91.64, + "step": 1475, + "token_acc": 0.8776844070961718, + "train_speed(iter/s)": 0.137442 + }, + { + "epoch": 0.13967534918837296, + "grad_norm": 0.9388037323951721, + "learning_rate": 0.0001956366581135923, + "loss": 0.3591162204742432, + "memory(GiB)": 91.64, + "step": 1480, + "token_acc": 0.8671586715867159, + "train_speed(iter/s)": 0.137452 + }, + { + "epoch": 0.14014722536806343, + "grad_norm": 0.8437421321868896, + "learning_rate": 0.0001955909487375763, + "loss": 0.353442907333374, + "memory(GiB)": 91.64, + "step": 1485, + "token_acc": 0.8862820205889395, + "train_speed(iter/s)": 0.137459 + }, + { + "epoch": 0.14061910154775387, + "grad_norm": 0.6493034362792969, + "learning_rate": 0.00019554500658321015, + "loss": 0.3546589851379395, + "memory(GiB)": 91.64, + "step": 1490, + "token_acc": 0.879837067209776, + "train_speed(iter/s)": 0.137462 + }, + { + "epoch": 0.1410909777274443, + "grad_norm": 0.38441404700279236, + "learning_rate": 0.00019549883176236987, + "loss": 0.36052756309509276, + "memory(GiB)": 91.64, + "step": 1495, + "token_acc": 0.8771869639794168, + "train_speed(iter/s)": 0.137469 + }, + { + "epoch": 0.14156285390713477, + "grad_norm": 0.3363651633262634, + "learning_rate": 0.00019545242438749808, + "loss": 0.36542329788208006, + "memory(GiB)": 91.64, + "step": 1500, + "token_acc": 0.871866295264624, + "train_speed(iter/s)": 0.137472 + }, + { + "epoch": 0.1420347300868252, + "grad_norm": 0.371928334236145, + "learning_rate": 0.0001954057845716038, + "loss": 0.3581106662750244, + "memory(GiB)": 91.64, + "step": 1505, + "token_acc": 0.8747779751332149, + "train_speed(iter/s)": 0.137476 + }, + { + "epoch": 0.14250660626651568, + "grad_norm": 0.44092419743537903, + "learning_rate": 0.00019535891242826193, + "loss": 0.3675301313400269, + "memory(GiB)": 91.64, + "step": 1510, + "token_acc": 0.8932142857142857, + "train_speed(iter/s)": 0.137482 + }, + { + "epoch": 0.14297848244620612, + "grad_norm": 0.5470117926597595, + "learning_rate": 0.00019531180807161322, + "loss": 0.3621679306030273, + "memory(GiB)": 91.64, + "step": 1515, + "token_acc": 0.8902282636573481, + "train_speed(iter/s)": 0.137486 + }, + { + "epoch": 0.14345035862589656, + "grad_norm": 0.5375911593437195, + "learning_rate": 0.0001952644716163639, + "loss": 0.36718566417694093, + "memory(GiB)": 91.64, + "step": 1520, + "token_acc": 0.8784158415841584, + "train_speed(iter/s)": 0.137495 + }, + { + "epoch": 0.14392223480558702, + "grad_norm": 0.7067252993583679, + "learning_rate": 0.00019521690317778528, + "loss": 0.3592665672302246, + "memory(GiB)": 91.64, + "step": 1525, + "token_acc": 0.8725556304787593, + "train_speed(iter/s)": 0.137504 + }, + { + "epoch": 0.14439411098527746, + "grad_norm": 0.6723697185516357, + "learning_rate": 0.0001951691028717138, + "loss": 0.3603172779083252, + "memory(GiB)": 91.64, + "step": 1530, + "token_acc": 0.8805710306406686, + "train_speed(iter/s)": 0.13751 + }, + { + "epoch": 0.1448659871649679, + "grad_norm": 0.5974426865577698, + "learning_rate": 0.0001951210708145503, + "loss": 0.3683944225311279, + "memory(GiB)": 91.64, + "step": 1535, + "token_acc": 0.8440899202320522, + "train_speed(iter/s)": 0.137524 + }, + { + "epoch": 0.14533786334465837, + "grad_norm": 0.32669320702552795, + "learning_rate": 0.00019507280712326006, + "loss": 0.3616074562072754, + "memory(GiB)": 91.64, + "step": 1540, + "token_acc": 0.9022123893805309, + "train_speed(iter/s)": 0.137534 + }, + { + "epoch": 0.1458097395243488, + "grad_norm": 0.6083944439888, + "learning_rate": 0.00019502431191537249, + "loss": 0.3704042673110962, + "memory(GiB)": 91.64, + "step": 1545, + "token_acc": 0.8691735213474344, + "train_speed(iter/s)": 0.137545 + }, + { + "epoch": 0.14628161570403925, + "grad_norm": 0.4255918860435486, + "learning_rate": 0.0001949755853089807, + "loss": 0.3596953392028809, + "memory(GiB)": 91.64, + "step": 1550, + "token_acc": 0.866793893129771, + "train_speed(iter/s)": 0.137553 + }, + { + "epoch": 0.14675349188372971, + "grad_norm": 0.5203797221183777, + "learning_rate": 0.00019492662742274134, + "loss": 0.3596514701843262, + "memory(GiB)": 91.64, + "step": 1555, + "token_acc": 0.8798060270176654, + "train_speed(iter/s)": 0.137558 + }, + { + "epoch": 0.14722536806342015, + "grad_norm": 0.556632936000824, + "learning_rate": 0.0001948774383758742, + "loss": 0.362532377243042, + "memory(GiB)": 91.64, + "step": 1560, + "token_acc": 0.8814923189465984, + "train_speed(iter/s)": 0.137567 + }, + { + "epoch": 0.14769724424311062, + "grad_norm": 0.6500077843666077, + "learning_rate": 0.00019482801828816197, + "loss": 0.36057684421539304, + "memory(GiB)": 91.64, + "step": 1565, + "token_acc": 0.8865593410707601, + "train_speed(iter/s)": 0.137567 + }, + { + "epoch": 0.14816912042280106, + "grad_norm": 0.43490278720855713, + "learning_rate": 0.0001947783672799501, + "loss": 0.37216577529907224, + "memory(GiB)": 91.64, + "step": 1570, + "token_acc": 0.8793157076205288, + "train_speed(iter/s)": 0.137577 + }, + { + "epoch": 0.1486409966024915, + "grad_norm": 0.31350913643836975, + "learning_rate": 0.0001947284854721462, + "loss": 0.3571339130401611, + "memory(GiB)": 91.64, + "step": 1575, + "token_acc": 0.8714285714285714, + "train_speed(iter/s)": 0.137584 + }, + { + "epoch": 0.14911287278218197, + "grad_norm": 0.49556922912597656, + "learning_rate": 0.00019467837298622003, + "loss": 0.36325485706329347, + "memory(GiB)": 91.64, + "step": 1580, + "token_acc": 0.8804733727810651, + "train_speed(iter/s)": 0.137588 + }, + { + "epoch": 0.1495847489618724, + "grad_norm": 0.3206709623336792, + "learning_rate": 0.00019462802994420298, + "loss": 0.3566303730010986, + "memory(GiB)": 91.64, + "step": 1585, + "token_acc": 0.8815612382234186, + "train_speed(iter/s)": 0.137592 + }, + { + "epoch": 0.15005662514156284, + "grad_norm": 0.4770624041557312, + "learning_rate": 0.000194577456468688, + "loss": 0.3560822010040283, + "memory(GiB)": 91.64, + "step": 1590, + "token_acc": 0.878257328990228, + "train_speed(iter/s)": 0.137597 + }, + { + "epoch": 0.1505285013212533, + "grad_norm": 0.36648330092430115, + "learning_rate": 0.00019452665268282905, + "loss": 0.35216608047485354, + "memory(GiB)": 91.64, + "step": 1595, + "token_acc": 0.8765673981191222, + "train_speed(iter/s)": 0.137598 + }, + { + "epoch": 0.15100037750094375, + "grad_norm": 0.34637942910194397, + "learning_rate": 0.00019447561871034107, + "loss": 0.35890846252441405, + "memory(GiB)": 91.64, + "step": 1600, + "token_acc": 0.8925100057175529, + "train_speed(iter/s)": 0.137604 + }, + { + "epoch": 0.1514722536806342, + "grad_norm": 0.5771172642707825, + "learning_rate": 0.00019442435467549937, + "loss": 0.3579749345779419, + "memory(GiB)": 91.64, + "step": 1605, + "token_acc": 0.8734729493891797, + "train_speed(iter/s)": 0.137605 + }, + { + "epoch": 0.15194412986032466, + "grad_norm": 0.382007360458374, + "learning_rate": 0.0001943728607031397, + "loss": 0.35507354736328123, + "memory(GiB)": 91.64, + "step": 1610, + "token_acc": 0.8889200561009818, + "train_speed(iter/s)": 0.13761 + }, + { + "epoch": 0.1524160060400151, + "grad_norm": 0.3741127848625183, + "learning_rate": 0.00019432113691865755, + "loss": 0.3627027988433838, + "memory(GiB)": 91.64, + "step": 1615, + "token_acc": 0.8872416891284816, + "train_speed(iter/s)": 0.137613 + }, + { + "epoch": 0.15288788221970556, + "grad_norm": 0.9472887516021729, + "learning_rate": 0.00019426918344800815, + "loss": 0.36210317611694337, + "memory(GiB)": 91.64, + "step": 1620, + "token_acc": 0.8653122648607976, + "train_speed(iter/s)": 0.137618 + }, + { + "epoch": 0.153359758399396, + "grad_norm": 0.4501311480998993, + "learning_rate": 0.00019421700041770602, + "loss": 0.3565349578857422, + "memory(GiB)": 91.64, + "step": 1625, + "token_acc": 0.8737430167597765, + "train_speed(iter/s)": 0.137627 + }, + { + "epoch": 0.15383163457908644, + "grad_norm": 0.7638270854949951, + "learning_rate": 0.0001941645879548247, + "loss": 0.3586350679397583, + "memory(GiB)": 91.64, + "step": 1630, + "token_acc": 0.8927091963545982, + "train_speed(iter/s)": 0.137633 + }, + { + "epoch": 0.1543035107587769, + "grad_norm": 0.32415205240249634, + "learning_rate": 0.00019411194618699644, + "loss": 0.3630037307739258, + "memory(GiB)": 91.64, + "step": 1635, + "token_acc": 0.8924870466321243, + "train_speed(iter/s)": 0.137637 + }, + { + "epoch": 0.15477538693846735, + "grad_norm": 0.46527740359306335, + "learning_rate": 0.00019405907524241184, + "loss": 0.36022109985351564, + "memory(GiB)": 91.64, + "step": 1640, + "token_acc": 0.892293114339861, + "train_speed(iter/s)": 0.137643 + }, + { + "epoch": 0.15524726311815779, + "grad_norm": 0.47606101632118225, + "learning_rate": 0.00019400597524981965, + "loss": 0.3650399684906006, + "memory(GiB)": 91.64, + "step": 1645, + "token_acc": 0.8674858850880106, + "train_speed(iter/s)": 0.137657 + }, + { + "epoch": 0.15571913929784825, + "grad_norm": 0.45343896746635437, + "learning_rate": 0.0001939526463385263, + "loss": 0.353916597366333, + "memory(GiB)": 91.64, + "step": 1650, + "token_acc": 0.8914905768132496, + "train_speed(iter/s)": 0.13767 + }, + { + "epoch": 0.1561910154775387, + "grad_norm": 0.7184441685676575, + "learning_rate": 0.00019389908863839573, + "loss": 0.3685162544250488, + "memory(GiB)": 91.64, + "step": 1655, + "token_acc": 0.8822937625754527, + "train_speed(iter/s)": 0.13768 + }, + { + "epoch": 0.15666289165722913, + "grad_norm": 0.5629715919494629, + "learning_rate": 0.00019384530227984902, + "loss": 0.3554409027099609, + "memory(GiB)": 91.64, + "step": 1660, + "token_acc": 0.8831908831908832, + "train_speed(iter/s)": 0.137687 + }, + { + "epoch": 0.1571347678369196, + "grad_norm": 0.39644697308540344, + "learning_rate": 0.00019379128739386404, + "loss": 0.351816725730896, + "memory(GiB)": 91.64, + "step": 1665, + "token_acc": 0.8597662771285476, + "train_speed(iter/s)": 0.137691 + }, + { + "epoch": 0.15760664401661004, + "grad_norm": 0.6162962317466736, + "learning_rate": 0.00019373704411197517, + "loss": 0.3518479585647583, + "memory(GiB)": 91.64, + "step": 1670, + "token_acc": 0.8764940239043825, + "train_speed(iter/s)": 0.137697 + }, + { + "epoch": 0.1580785201963005, + "grad_norm": 0.30298227071762085, + "learning_rate": 0.000193682572566273, + "loss": 0.35238142013549806, + "memory(GiB)": 91.64, + "step": 1675, + "token_acc": 0.8691389599317988, + "train_speed(iter/s)": 0.137704 + }, + { + "epoch": 0.15855039637599094, + "grad_norm": 0.5133036375045776, + "learning_rate": 0.00019362787288940383, + "loss": 0.36153383255004884, + "memory(GiB)": 91.64, + "step": 1680, + "token_acc": 0.894973436861463, + "train_speed(iter/s)": 0.137709 + }, + { + "epoch": 0.15902227255568138, + "grad_norm": 0.2759738266468048, + "learning_rate": 0.0001935729452145697, + "loss": 0.3581871509552002, + "memory(GiB)": 91.64, + "step": 1685, + "token_acc": 0.8885003885003885, + "train_speed(iter/s)": 0.137718 + }, + { + "epoch": 0.15949414873537185, + "grad_norm": 0.38993126153945923, + "learning_rate": 0.0001935177896755278, + "loss": 0.35147097110748293, + "memory(GiB)": 91.64, + "step": 1690, + "token_acc": 0.887071240105541, + "train_speed(iter/s)": 0.137727 + }, + { + "epoch": 0.1599660249150623, + "grad_norm": 0.6348713040351868, + "learning_rate": 0.00019346240640659012, + "loss": 0.3537603378295898, + "memory(GiB)": 91.64, + "step": 1695, + "token_acc": 0.8848167539267016, + "train_speed(iter/s)": 0.137723 + }, + { + "epoch": 0.16043790109475273, + "grad_norm": 0.4917113184928894, + "learning_rate": 0.00019340679554262323, + "loss": 0.35138711929321287, + "memory(GiB)": 91.64, + "step": 1700, + "token_acc": 0.8826676176890157, + "train_speed(iter/s)": 0.137738 + }, + { + "epoch": 0.1609097772744432, + "grad_norm": 0.5605958104133606, + "learning_rate": 0.000193350957219048, + "loss": 0.35388185977935793, + "memory(GiB)": 91.64, + "step": 1705, + "token_acc": 0.8839228295819935, + "train_speed(iter/s)": 0.137745 + }, + { + "epoch": 0.16138165345413363, + "grad_norm": 0.4609486162662506, + "learning_rate": 0.0001932948915718391, + "loss": 0.3516982555389404, + "memory(GiB)": 91.64, + "step": 1710, + "token_acc": 0.8835051546391752, + "train_speed(iter/s)": 0.137747 + }, + { + "epoch": 0.16185352963382407, + "grad_norm": 0.4357410967350006, + "learning_rate": 0.00019323859873752493, + "loss": 0.3436026096343994, + "memory(GiB)": 91.64, + "step": 1715, + "token_acc": 0.8933823529411765, + "train_speed(iter/s)": 0.137753 + }, + { + "epoch": 0.16232540581351454, + "grad_norm": 1.353634238243103, + "learning_rate": 0.0001931820788531869, + "loss": 0.3458723068237305, + "memory(GiB)": 91.64, + "step": 1720, + "token_acc": 0.8818590704647676, + "train_speed(iter/s)": 0.13776 + }, + { + "epoch": 0.16279728199320498, + "grad_norm": 1.2291808128356934, + "learning_rate": 0.0001931253320564595, + "loss": 0.35339980125427245, + "memory(GiB)": 91.64, + "step": 1725, + "token_acc": 0.8876443822191109, + "train_speed(iter/s)": 0.137759 + }, + { + "epoch": 0.16326915817289545, + "grad_norm": 0.3997421860694885, + "learning_rate": 0.0001930683584855297, + "loss": 0.35820183753967283, + "memory(GiB)": 91.64, + "step": 1730, + "token_acc": 0.8761955366631243, + "train_speed(iter/s)": 0.137767 + }, + { + "epoch": 0.16374103435258588, + "grad_norm": 0.3965195119380951, + "learning_rate": 0.00019301115827913672, + "loss": 0.3507367134094238, + "memory(GiB)": 91.64, + "step": 1735, + "token_acc": 0.887131252672082, + "train_speed(iter/s)": 0.137769 + }, + { + "epoch": 0.16421291053227632, + "grad_norm": 0.6176219582557678, + "learning_rate": 0.0001929537315765717, + "loss": 0.3522993326187134, + "memory(GiB)": 91.64, + "step": 1740, + "token_acc": 0.9011488111140796, + "train_speed(iter/s)": 0.13777 + }, + { + "epoch": 0.1646847867119668, + "grad_norm": 0.5894827842712402, + "learning_rate": 0.00019289607851767727, + "loss": 0.3515509843826294, + "memory(GiB)": 91.64, + "step": 1745, + "token_acc": 0.9020016680567139, + "train_speed(iter/s)": 0.13777 + }, + { + "epoch": 0.16515666289165723, + "grad_norm": 0.5238110423088074, + "learning_rate": 0.00019283819924284732, + "loss": 0.3614342212677002, + "memory(GiB)": 91.64, + "step": 1750, + "token_acc": 0.868234415826801, + "train_speed(iter/s)": 0.13777 + }, + { + "epoch": 0.16562853907134767, + "grad_norm": 0.6640599370002747, + "learning_rate": 0.0001927800938930266, + "loss": 0.3606734752655029, + "memory(GiB)": 91.64, + "step": 1755, + "token_acc": 0.8917274939172749, + "train_speed(iter/s)": 0.13777 + }, + { + "epoch": 0.16610041525103814, + "grad_norm": 0.3279666602611542, + "learning_rate": 0.00019272176260971038, + "loss": 0.3470527410507202, + "memory(GiB)": 91.64, + "step": 1760, + "token_acc": 0.8820662768031189, + "train_speed(iter/s)": 0.137768 + }, + { + "epoch": 0.16657229143072858, + "grad_norm": 0.557361900806427, + "learning_rate": 0.00019266320553494413, + "loss": 0.36066641807556155, + "memory(GiB)": 91.64, + "step": 1765, + "token_acc": 0.8856569709127382, + "train_speed(iter/s)": 0.137766 + }, + { + "epoch": 0.16704416761041901, + "grad_norm": 0.5631670951843262, + "learning_rate": 0.00019260442281132314, + "loss": 0.347049617767334, + "memory(GiB)": 91.64, + "step": 1770, + "token_acc": 0.8933909000332115, + "train_speed(iter/s)": 0.137773 + }, + { + "epoch": 0.16751604379010948, + "grad_norm": 0.3367120921611786, + "learning_rate": 0.00019254541458199218, + "loss": 0.3531044483184814, + "memory(GiB)": 91.64, + "step": 1775, + "token_acc": 0.866046511627907, + "train_speed(iter/s)": 0.137778 + }, + { + "epoch": 0.16798791996979992, + "grad_norm": 0.30985838174819946, + "learning_rate": 0.00019248618099064517, + "loss": 0.3543743133544922, + "memory(GiB)": 91.64, + "step": 1780, + "token_acc": 0.8813512004466778, + "train_speed(iter/s)": 0.137783 + }, + { + "epoch": 0.1684597961494904, + "grad_norm": 0.29574140906333923, + "learning_rate": 0.00019242672218152483, + "loss": 0.3577969312667847, + "memory(GiB)": 91.64, + "step": 1785, + "token_acc": 0.8939130434782608, + "train_speed(iter/s)": 0.137789 + }, + { + "epoch": 0.16893167232918083, + "grad_norm": 0.5875484943389893, + "learning_rate": 0.00019236703829942232, + "loss": 0.350512170791626, + "memory(GiB)": 91.64, + "step": 1790, + "token_acc": 0.8707829408020369, + "train_speed(iter/s)": 0.137795 + }, + { + "epoch": 0.16940354850887127, + "grad_norm": 1.0582468509674072, + "learning_rate": 0.0001923071294896768, + "loss": 0.3436901330947876, + "memory(GiB)": 91.64, + "step": 1795, + "token_acc": 0.8852157943067034, + "train_speed(iter/s)": 0.137799 + }, + { + "epoch": 0.16987542468856173, + "grad_norm": 0.3438156843185425, + "learning_rate": 0.00019224699589817537, + "loss": 0.34713518619537354, + "memory(GiB)": 91.64, + "step": 1800, + "token_acc": 0.8709556057185854, + "train_speed(iter/s)": 0.137799 + }, + { + "epoch": 0.17034730086825217, + "grad_norm": 0.38124313950538635, + "learning_rate": 0.00019218663767135233, + "loss": 0.3550222396850586, + "memory(GiB)": 91.64, + "step": 1805, + "token_acc": 0.8765864332603939, + "train_speed(iter/s)": 0.137801 + }, + { + "epoch": 0.1708191770479426, + "grad_norm": 0.4326140582561493, + "learning_rate": 0.00019212605495618897, + "loss": 0.3468668460845947, + "memory(GiB)": 91.64, + "step": 1810, + "token_acc": 0.8850841555426582, + "train_speed(iter/s)": 0.137805 + }, + { + "epoch": 0.17129105322763308, + "grad_norm": 0.43024584650993347, + "learning_rate": 0.0001920652479002134, + "loss": 0.34256269931793215, + "memory(GiB)": 91.64, + "step": 1815, + "token_acc": 0.870197904540163, + "train_speed(iter/s)": 0.137809 + }, + { + "epoch": 0.17176292940732352, + "grad_norm": 0.3962891101837158, + "learning_rate": 0.00019200421665149998, + "loss": 0.35373740196228026, + "memory(GiB)": 91.64, + "step": 1820, + "token_acc": 0.8898639754278193, + "train_speed(iter/s)": 0.137813 + }, + { + "epoch": 0.17223480558701396, + "grad_norm": 0.3830450177192688, + "learning_rate": 0.00019194296135866893, + "loss": 0.3469409704208374, + "memory(GiB)": 91.64, + "step": 1825, + "token_acc": 0.8796900489396411, + "train_speed(iter/s)": 0.137819 + }, + { + "epoch": 0.17270668176670442, + "grad_norm": 0.5176311731338501, + "learning_rate": 0.0001918814821708861, + "loss": 0.3452960968017578, + "memory(GiB)": 91.64, + "step": 1830, + "token_acc": 0.8795876288659794, + "train_speed(iter/s)": 0.137818 + }, + { + "epoch": 0.17317855794639486, + "grad_norm": 0.3469870984554291, + "learning_rate": 0.00019181977923786258, + "loss": 0.35071775913238523, + "memory(GiB)": 91.64, + "step": 1835, + "token_acc": 0.8695652173913043, + "train_speed(iter/s)": 0.137825 + }, + { + "epoch": 0.17365043412608533, + "grad_norm": 0.3066750764846802, + "learning_rate": 0.00019175785270985433, + "loss": 0.3505476713180542, + "memory(GiB)": 91.64, + "step": 1840, + "token_acc": 0.8618200567156483, + "train_speed(iter/s)": 0.137819 + }, + { + "epoch": 0.17412231030577577, + "grad_norm": 0.6881127953529358, + "learning_rate": 0.00019169570273766176, + "loss": 0.3488409996032715, + "memory(GiB)": 91.64, + "step": 1845, + "token_acc": 0.8822618125484121, + "train_speed(iter/s)": 0.137815 + }, + { + "epoch": 0.1745941864854662, + "grad_norm": 0.5931531190872192, + "learning_rate": 0.0001916333294726294, + "loss": 0.3492976188659668, + "memory(GiB)": 91.64, + "step": 1850, + "token_acc": 0.89794921875, + "train_speed(iter/s)": 0.137818 + }, + { + "epoch": 0.17506606266515667, + "grad_norm": 0.6179839968681335, + "learning_rate": 0.00019157073306664554, + "loss": 0.3676631212234497, + "memory(GiB)": 91.64, + "step": 1855, + "token_acc": 0.8740814963259853, + "train_speed(iter/s)": 0.13782 + }, + { + "epoch": 0.1755379388448471, + "grad_norm": 0.5834555625915527, + "learning_rate": 0.00019150791367214182, + "loss": 0.3476147174835205, + "memory(GiB)": 91.64, + "step": 1860, + "token_acc": 0.8826677645121449, + "train_speed(iter/s)": 0.137819 + }, + { + "epoch": 0.17600981502453755, + "grad_norm": 0.35329657793045044, + "learning_rate": 0.000191444871442093, + "loss": 0.35233583450317385, + "memory(GiB)": 91.64, + "step": 1865, + "token_acc": 0.8933369156367544, + "train_speed(iter/s)": 0.13782 + }, + { + "epoch": 0.17648169120422802, + "grad_norm": 0.35915401577949524, + "learning_rate": 0.00019138160653001633, + "loss": 0.34854936599731445, + "memory(GiB)": 91.64, + "step": 1870, + "token_acc": 0.8787451533309835, + "train_speed(iter/s)": 0.137821 + }, + { + "epoch": 0.17695356738391846, + "grad_norm": 0.8864762783050537, + "learning_rate": 0.00019131811908997142, + "loss": 0.34482736587524415, + "memory(GiB)": 91.64, + "step": 1875, + "token_acc": 0.8787006578947368, + "train_speed(iter/s)": 0.137827 + }, + { + "epoch": 0.1774254435636089, + "grad_norm": 0.4602474868297577, + "learning_rate": 0.00019125440927655974, + "loss": 0.3450265645980835, + "memory(GiB)": 91.64, + "step": 1880, + "token_acc": 0.8929677134011499, + "train_speed(iter/s)": 0.137831 + }, + { + "epoch": 0.17789731974329936, + "grad_norm": 0.35482195019721985, + "learning_rate": 0.00019119047724492426, + "loss": 0.33423638343811035, + "memory(GiB)": 91.64, + "step": 1885, + "token_acc": 0.8888008452192288, + "train_speed(iter/s)": 0.137832 + }, + { + "epoch": 0.1783691959229898, + "grad_norm": 0.9185347557067871, + "learning_rate": 0.00019112632315074915, + "loss": 0.35080528259277344, + "memory(GiB)": 91.64, + "step": 1890, + "token_acc": 0.8741188318227593, + "train_speed(iter/s)": 0.137838 + }, + { + "epoch": 0.17884107210268024, + "grad_norm": 0.3791806399822235, + "learning_rate": 0.00019106194715025926, + "loss": 0.3456167459487915, + "memory(GiB)": 91.64, + "step": 1895, + "token_acc": 0.8879935535858179, + "train_speed(iter/s)": 0.137842 + }, + { + "epoch": 0.1793129482823707, + "grad_norm": 0.5742256045341492, + "learning_rate": 0.00019099734940021982, + "loss": 0.3473632335662842, + "memory(GiB)": 91.64, + "step": 1900, + "token_acc": 0.8950050968399592, + "train_speed(iter/s)": 0.13785 + }, + { + "epoch": 0.17978482446206115, + "grad_norm": 0.49934887886047363, + "learning_rate": 0.00019093253005793607, + "loss": 0.3430349111557007, + "memory(GiB)": 91.64, + "step": 1905, + "token_acc": 0.8664504716981132, + "train_speed(iter/s)": 0.137855 + }, + { + "epoch": 0.18025670064175162, + "grad_norm": 0.5481332540512085, + "learning_rate": 0.00019086748928125294, + "loss": 0.3437246322631836, + "memory(GiB)": 91.64, + "step": 1910, + "token_acc": 0.8839852892009361, + "train_speed(iter/s)": 0.137858 + }, + { + "epoch": 0.18072857682144206, + "grad_norm": 0.28932511806488037, + "learning_rate": 0.00019080222722855442, + "loss": 0.3524455547332764, + "memory(GiB)": 91.64, + "step": 1915, + "token_acc": 0.8890915724188252, + "train_speed(iter/s)": 0.137863 + }, + { + "epoch": 0.1812004530011325, + "grad_norm": 0.8510509133338928, + "learning_rate": 0.00019073674405876347, + "loss": 0.3491034030914307, + "memory(GiB)": 91.64, + "step": 1920, + "token_acc": 0.8880655226209049, + "train_speed(iter/s)": 0.137866 + }, + { + "epoch": 0.18167232918082296, + "grad_norm": 0.316115140914917, + "learning_rate": 0.00019067103993134152, + "loss": 0.3515012264251709, + "memory(GiB)": 91.64, + "step": 1925, + "token_acc": 0.8913354303688876, + "train_speed(iter/s)": 0.137862 + }, + { + "epoch": 0.1821442053605134, + "grad_norm": 0.3926486670970917, + "learning_rate": 0.00019060511500628794, + "loss": 0.35095829963684083, + "memory(GiB)": 91.64, + "step": 1930, + "token_acc": 0.8682614555256065, + "train_speed(iter/s)": 0.137868 + }, + { + "epoch": 0.18261608154020384, + "grad_norm": 0.35603636503219604, + "learning_rate": 0.00019053896944413984, + "loss": 0.34353179931640626, + "memory(GiB)": 91.64, + "step": 1935, + "token_acc": 0.8846398980242193, + "train_speed(iter/s)": 0.13787 + }, + { + "epoch": 0.1830879577198943, + "grad_norm": 0.43732625246047974, + "learning_rate": 0.0001904726034059717, + "loss": 0.3492013454437256, + "memory(GiB)": 91.64, + "step": 1940, + "token_acc": 0.8862512363996043, + "train_speed(iter/s)": 0.137869 + }, + { + "epoch": 0.18355983389958475, + "grad_norm": 0.4010016918182373, + "learning_rate": 0.00019040601705339473, + "loss": 0.34664440155029297, + "memory(GiB)": 91.64, + "step": 1945, + "token_acc": 0.9094759131815775, + "train_speed(iter/s)": 0.137868 + }, + { + "epoch": 0.18403171007927518, + "grad_norm": 0.6890151500701904, + "learning_rate": 0.00019033921054855676, + "loss": 0.34554276466369627, + "memory(GiB)": 91.64, + "step": 1950, + "token_acc": 0.8837018837018837, + "train_speed(iter/s)": 0.137871 + }, + { + "epoch": 0.18450358625896565, + "grad_norm": 0.36494889855384827, + "learning_rate": 0.00019027218405414168, + "loss": 0.34601216316223143, + "memory(GiB)": 91.64, + "step": 1955, + "token_acc": 0.8904270986745213, + "train_speed(iter/s)": 0.137876 + }, + { + "epoch": 0.1849754624386561, + "grad_norm": 0.3291149437427521, + "learning_rate": 0.0001902049377333691, + "loss": 0.34858098030090334, + "memory(GiB)": 91.64, + "step": 1960, + "token_acc": 0.8801042571676803, + "train_speed(iter/s)": 0.137876 + }, + { + "epoch": 0.18544733861834656, + "grad_norm": 0.3290327489376068, + "learning_rate": 0.0001901374717499939, + "loss": 0.343774151802063, + "memory(GiB)": 91.64, + "step": 1965, + "token_acc": 0.8665939658306071, + "train_speed(iter/s)": 0.137879 + }, + { + "epoch": 0.185919214798037, + "grad_norm": 0.7721136212348938, + "learning_rate": 0.000190069786268306, + "loss": 0.3448851823806763, + "memory(GiB)": 91.64, + "step": 1970, + "token_acc": 0.907328730748805, + "train_speed(iter/s)": 0.137886 + }, + { + "epoch": 0.18639109097772744, + "grad_norm": 0.42297878861427307, + "learning_rate": 0.00019000188145312964, + "loss": 0.34144585132598876, + "memory(GiB)": 91.64, + "step": 1975, + "token_acc": 0.8630643967431533, + "train_speed(iter/s)": 0.137895 + }, + { + "epoch": 0.1868629671574179, + "grad_norm": 0.35999032855033875, + "learning_rate": 0.00018993375746982331, + "loss": 0.3448899030685425, + "memory(GiB)": 91.64, + "step": 1980, + "token_acc": 0.8727481793790725, + "train_speed(iter/s)": 0.137894 + }, + { + "epoch": 0.18733484333710834, + "grad_norm": 0.5476382374763489, + "learning_rate": 0.00018986541448427915, + "loss": 0.34505505561828614, + "memory(GiB)": 91.64, + "step": 1985, + "token_acc": 0.8727154046997389, + "train_speed(iter/s)": 0.137901 + }, + { + "epoch": 0.18780671951679878, + "grad_norm": 0.459293931722641, + "learning_rate": 0.00018979685266292263, + "loss": 0.34939863681793215, + "memory(GiB)": 91.64, + "step": 1990, + "token_acc": 0.859118086696562, + "train_speed(iter/s)": 0.137908 + }, + { + "epoch": 0.18827859569648925, + "grad_norm": 0.3321842551231384, + "learning_rate": 0.00018972807217271207, + "loss": 0.35269508361816404, + "memory(GiB)": 91.64, + "step": 1995, + "token_acc": 0.8570105003088326, + "train_speed(iter/s)": 0.137911 + }, + { + "epoch": 0.1887504718761797, + "grad_norm": 0.301276296377182, + "learning_rate": 0.00018965907318113838, + "loss": 0.34697985649108887, + "memory(GiB)": 91.64, + "step": 2000, + "token_acc": 0.8835132117603275, + "train_speed(iter/s)": 0.13791 + }, + { + "epoch": 0.18922234805587013, + "grad_norm": 0.2910401523113251, + "learning_rate": 0.00018958985585622445, + "loss": 0.3419046878814697, + "memory(GiB)": 91.64, + "step": 2005, + "token_acc": 0.8818718764198091, + "train_speed(iter/s)": 0.137915 + }, + { + "epoch": 0.1896942242355606, + "grad_norm": 0.7058345079421997, + "learning_rate": 0.00018952042036652486, + "loss": 0.34183340072631835, + "memory(GiB)": 91.64, + "step": 2010, + "token_acc": 0.8762214983713354, + "train_speed(iter/s)": 0.137919 + }, + { + "epoch": 0.19016610041525103, + "grad_norm": 0.30306434631347656, + "learning_rate": 0.00018945076688112552, + "loss": 0.3396963834762573, + "memory(GiB)": 91.64, + "step": 2015, + "token_acc": 0.8915574963609898, + "train_speed(iter/s)": 0.137921 + }, + { + "epoch": 0.1906379765949415, + "grad_norm": 0.48909690976142883, + "learning_rate": 0.0001893808955696432, + "loss": 0.3448690176010132, + "memory(GiB)": 91.64, + "step": 2020, + "token_acc": 0.8720196353436185, + "train_speed(iter/s)": 0.137919 + }, + { + "epoch": 0.19110985277463194, + "grad_norm": 0.6389379501342773, + "learning_rate": 0.00018931080660222497, + "loss": 0.3356884002685547, + "memory(GiB)": 91.64, + "step": 2025, + "token_acc": 0.8754716981132076, + "train_speed(iter/s)": 0.137926 + }, + { + "epoch": 0.19158172895432238, + "grad_norm": 0.39773041009902954, + "learning_rate": 0.00018924050014954805, + "loss": 0.34584047794342043, + "memory(GiB)": 91.64, + "step": 2030, + "token_acc": 0.9035667107001321, + "train_speed(iter/s)": 0.137923 + }, + { + "epoch": 0.19205360513401284, + "grad_norm": 0.30064910650253296, + "learning_rate": 0.00018916997638281923, + "loss": 0.34599852561950684, + "memory(GiB)": 91.64, + "step": 2035, + "token_acc": 0.8658097686375321, + "train_speed(iter/s)": 0.137924 + }, + { + "epoch": 0.19252548131370328, + "grad_norm": 0.3374609351158142, + "learning_rate": 0.00018909923547377454, + "loss": 0.35094680786132815, + "memory(GiB)": 91.64, + "step": 2040, + "token_acc": 0.8824691358024691, + "train_speed(iter/s)": 0.137931 + }, + { + "epoch": 0.19299735749339372, + "grad_norm": 0.30539119243621826, + "learning_rate": 0.00018902827759467868, + "loss": 0.3404444932937622, + "memory(GiB)": 91.64, + "step": 2045, + "token_acc": 0.8939899833055092, + "train_speed(iter/s)": 0.137935 + }, + { + "epoch": 0.1934692336730842, + "grad_norm": 0.2880310118198395, + "learning_rate": 0.00018895710291832484, + "loss": 0.3389493703842163, + "memory(GiB)": 91.64, + "step": 2050, + "token_acc": 0.872093023255814, + "train_speed(iter/s)": 0.137938 + }, + { + "epoch": 0.19394110985277463, + "grad_norm": 0.4194709360599518, + "learning_rate": 0.00018888571161803402, + "loss": 0.3463752269744873, + "memory(GiB)": 91.64, + "step": 2055, + "token_acc": 0.883762732174955, + "train_speed(iter/s)": 0.137944 + }, + { + "epoch": 0.19441298603246507, + "grad_norm": 1.0667372941970825, + "learning_rate": 0.00018881410386765478, + "loss": 0.3408158540725708, + "memory(GiB)": 91.64, + "step": 2060, + "token_acc": 0.8831054256726952, + "train_speed(iter/s)": 0.137947 + }, + { + "epoch": 0.19488486221215554, + "grad_norm": 0.5454498529434204, + "learning_rate": 0.00018874227984156278, + "loss": 0.35147881507873535, + "memory(GiB)": 91.64, + "step": 2065, + "token_acc": 0.8823058446757406, + "train_speed(iter/s)": 0.137952 + }, + { + "epoch": 0.19535673839184597, + "grad_norm": 0.3480747640132904, + "learning_rate": 0.00018867023971466036, + "loss": 0.34201037883758545, + "memory(GiB)": 91.64, + "step": 2070, + "token_acc": 0.8867823765020026, + "train_speed(iter/s)": 0.137954 + }, + { + "epoch": 0.19582861457153644, + "grad_norm": 0.4492743909358978, + "learning_rate": 0.00018859798366237604, + "loss": 0.33846278190612794, + "memory(GiB)": 91.64, + "step": 2075, + "token_acc": 0.8773491592482691, + "train_speed(iter/s)": 0.137958 + }, + { + "epoch": 0.19630049075122688, + "grad_norm": 0.5267034769058228, + "learning_rate": 0.0001885255118606642, + "loss": 0.3435330390930176, + "memory(GiB)": 91.64, + "step": 2080, + "token_acc": 0.8721130644605308, + "train_speed(iter/s)": 0.137965 + }, + { + "epoch": 0.19677236693091732, + "grad_norm": 0.28133124113082886, + "learning_rate": 0.0001884528244860046, + "loss": 0.3497642755508423, + "memory(GiB)": 91.64, + "step": 2085, + "token_acc": 0.8880718954248366, + "train_speed(iter/s)": 0.137966 + }, + { + "epoch": 0.1972442431106078, + "grad_norm": 0.2900506556034088, + "learning_rate": 0.0001883799217154019, + "loss": 0.3394758224487305, + "memory(GiB)": 91.64, + "step": 2090, + "token_acc": 0.8921568627450981, + "train_speed(iter/s)": 0.137962 + }, + { + "epoch": 0.19771611929029823, + "grad_norm": 0.8465926647186279, + "learning_rate": 0.00018830680372638537, + "loss": 0.3450798034667969, + "memory(GiB)": 91.64, + "step": 2095, + "token_acc": 0.8832747041893189, + "train_speed(iter/s)": 0.137967 + }, + { + "epoch": 0.19818799546998866, + "grad_norm": 0.5081422328948975, + "learning_rate": 0.00018823347069700828, + "loss": 0.33031363487243653, + "memory(GiB)": 91.64, + "step": 2100, + "token_acc": 0.8912106135986733, + "train_speed(iter/s)": 0.137971 + }, + { + "epoch": 0.19865987164967913, + "grad_norm": 0.4367571175098419, + "learning_rate": 0.00018815992280584763, + "loss": 0.3436918258666992, + "memory(GiB)": 91.64, + "step": 2105, + "token_acc": 0.8999052731291443, + "train_speed(iter/s)": 0.137972 + }, + { + "epoch": 0.19913174782936957, + "grad_norm": 0.3542596399784088, + "learning_rate": 0.00018808616023200357, + "loss": 0.3388987064361572, + "memory(GiB)": 91.64, + "step": 2110, + "token_acc": 0.8867167919799499, + "train_speed(iter/s)": 0.137975 + }, + { + "epoch": 0.19960362400906, + "grad_norm": 0.5012041926383972, + "learning_rate": 0.00018801218315509912, + "loss": 0.3462409019470215, + "memory(GiB)": 91.64, + "step": 2115, + "token_acc": 0.8946236559139785, + "train_speed(iter/s)": 0.137983 + }, + { + "epoch": 0.20007550018875048, + "grad_norm": 0.6917479634284973, + "learning_rate": 0.00018793799175527954, + "loss": 0.34777753353118895, + "memory(GiB)": 91.64, + "step": 2120, + "token_acc": 0.886994775914215, + "train_speed(iter/s)": 0.137984 + }, + { + "epoch": 0.20054737636844092, + "grad_norm": 0.299472451210022, + "learning_rate": 0.00018786358621321211, + "loss": 0.3416252136230469, + "memory(GiB)": 91.64, + "step": 2125, + "token_acc": 0.879359095193214, + "train_speed(iter/s)": 0.137994 + }, + { + "epoch": 0.20101925254813138, + "grad_norm": 0.542940080165863, + "learning_rate": 0.0001877889667100855, + "loss": 0.3419647216796875, + "memory(GiB)": 91.64, + "step": 2130, + "token_acc": 0.8806986382474837, + "train_speed(iter/s)": 0.137995 + }, + { + "epoch": 0.20149112872782182, + "grad_norm": 0.34102553129196167, + "learning_rate": 0.00018771413342760944, + "loss": 0.3418309688568115, + "memory(GiB)": 91.64, + "step": 2135, + "token_acc": 0.8886278195488722, + "train_speed(iter/s)": 0.137999 + }, + { + "epoch": 0.20196300490751226, + "grad_norm": 0.4004146158695221, + "learning_rate": 0.00018763908654801422, + "loss": 0.33717515468597414, + "memory(GiB)": 91.64, + "step": 2140, + "token_acc": 0.894689870593485, + "train_speed(iter/s)": 0.138001 + }, + { + "epoch": 0.20243488108720273, + "grad_norm": 0.5726518034934998, + "learning_rate": 0.0001875638262540503, + "loss": 0.3408660888671875, + "memory(GiB)": 91.64, + "step": 2145, + "token_acc": 0.8878548161935784, + "train_speed(iter/s)": 0.138006 + }, + { + "epoch": 0.20290675726689317, + "grad_norm": 0.4096904397010803, + "learning_rate": 0.00018748835272898781, + "loss": 0.3478860378265381, + "memory(GiB)": 91.64, + "step": 2150, + "token_acc": 0.8895281933256617, + "train_speed(iter/s)": 0.13801 + }, + { + "epoch": 0.2033786334465836, + "grad_norm": 0.2845887243747711, + "learning_rate": 0.0001874126661566162, + "loss": 0.3438119888305664, + "memory(GiB)": 91.64, + "step": 2155, + "token_acc": 0.8817876021143681, + "train_speed(iter/s)": 0.138015 + }, + { + "epoch": 0.20385050962627407, + "grad_norm": 0.49546805024147034, + "learning_rate": 0.00018733676672124362, + "loss": 0.3439002990722656, + "memory(GiB)": 91.64, + "step": 2160, + "token_acc": 0.8995042812077513, + "train_speed(iter/s)": 0.138018 + }, + { + "epoch": 0.2043223858059645, + "grad_norm": 0.6575082540512085, + "learning_rate": 0.00018726065460769663, + "loss": 0.3439802885055542, + "memory(GiB)": 91.64, + "step": 2165, + "token_acc": 0.8819188191881919, + "train_speed(iter/s)": 0.138019 + }, + { + "epoch": 0.20479426198565495, + "grad_norm": 0.678426206111908, + "learning_rate": 0.00018718433000131966, + "loss": 0.33720600605010986, + "memory(GiB)": 91.64, + "step": 2170, + "token_acc": 0.8794220229197808, + "train_speed(iter/s)": 0.138026 + }, + { + "epoch": 0.20526613816534542, + "grad_norm": 0.5582923293113708, + "learning_rate": 0.00018710779308797468, + "loss": 0.3416036605834961, + "memory(GiB)": 91.64, + "step": 2175, + "token_acc": 0.8883531157270029, + "train_speed(iter/s)": 0.138031 + }, + { + "epoch": 0.20573801434503586, + "grad_norm": 0.39158037304878235, + "learning_rate": 0.00018703104405404055, + "loss": 0.335191011428833, + "memory(GiB)": 91.64, + "step": 2180, + "token_acc": 0.9014705882352941, + "train_speed(iter/s)": 0.138035 + }, + { + "epoch": 0.20620989052472632, + "grad_norm": 0.45636487007141113, + "learning_rate": 0.00018695408308641272, + "loss": 0.34262187480926515, + "memory(GiB)": 91.64, + "step": 2185, + "token_acc": 0.883441258094357, + "train_speed(iter/s)": 0.138032 + }, + { + "epoch": 0.20668176670441676, + "grad_norm": 0.8102078437805176, + "learning_rate": 0.00018687691037250277, + "loss": 0.33262019157409667, + "memory(GiB)": 91.64, + "step": 2190, + "token_acc": 0.8790444511641972, + "train_speed(iter/s)": 0.138034 + }, + { + "epoch": 0.2071536428841072, + "grad_norm": 0.31004661321640015, + "learning_rate": 0.0001867995261002378, + "loss": 0.34704036712646485, + "memory(GiB)": 91.64, + "step": 2195, + "token_acc": 0.895397489539749, + "train_speed(iter/s)": 0.138038 + }, + { + "epoch": 0.20762551906379767, + "grad_norm": 0.8132814168930054, + "learning_rate": 0.00018672193045806023, + "loss": 0.34265220165252686, + "memory(GiB)": 91.64, + "step": 2200, + "token_acc": 0.8636037329504667, + "train_speed(iter/s)": 0.138039 + }, + { + "epoch": 0.2080973952434881, + "grad_norm": 0.7821739315986633, + "learning_rate": 0.00018664412363492708, + "loss": 0.33681278228759765, + "memory(GiB)": 91.64, + "step": 2205, + "token_acc": 0.8768713718301253, + "train_speed(iter/s)": 0.13804 + }, + { + "epoch": 0.20856927142317855, + "grad_norm": 0.6573207974433899, + "learning_rate": 0.00018656610582030975, + "loss": 0.3311905860900879, + "memory(GiB)": 91.64, + "step": 2210, + "token_acc": 0.8870541611624835, + "train_speed(iter/s)": 0.138041 + }, + { + "epoch": 0.20904114760286902, + "grad_norm": 0.5945848226547241, + "learning_rate": 0.0001864878772041933, + "loss": 0.34422693252563474, + "memory(GiB)": 91.64, + "step": 2215, + "token_acc": 0.8831013916500994, + "train_speed(iter/s)": 0.138044 + }, + { + "epoch": 0.20951302378255945, + "grad_norm": 0.5761186480522156, + "learning_rate": 0.00018640943797707622, + "loss": 0.3473784923553467, + "memory(GiB)": 91.64, + "step": 2220, + "token_acc": 0.8858757062146893, + "train_speed(iter/s)": 0.138045 + }, + { + "epoch": 0.2099848999622499, + "grad_norm": 0.3242824077606201, + "learning_rate": 0.00018633078832996978, + "loss": 0.33851065635681155, + "memory(GiB)": 91.64, + "step": 2225, + "token_acc": 0.8659020732245258, + "train_speed(iter/s)": 0.13805 + }, + { + "epoch": 0.21045677614194036, + "grad_norm": 0.5747153759002686, + "learning_rate": 0.0001862519284543978, + "loss": 0.33396263122558595, + "memory(GiB)": 91.64, + "step": 2230, + "token_acc": 0.8875242404654169, + "train_speed(iter/s)": 0.138054 + }, + { + "epoch": 0.2109286523216308, + "grad_norm": 0.742813229560852, + "learning_rate": 0.00018617285854239586, + "loss": 0.3415235996246338, + "memory(GiB)": 91.64, + "step": 2235, + "token_acc": 0.8529603122966819, + "train_speed(iter/s)": 0.138056 + }, + { + "epoch": 0.21140052850132127, + "grad_norm": 0.5336731672286987, + "learning_rate": 0.00018609357878651115, + "loss": 0.33643336296081544, + "memory(GiB)": 91.64, + "step": 2240, + "token_acc": 0.9009042954031651, + "train_speed(iter/s)": 0.138057 + }, + { + "epoch": 0.2118724046810117, + "grad_norm": 0.3660552203655243, + "learning_rate": 0.00018601408937980182, + "loss": 0.3392070770263672, + "memory(GiB)": 91.64, + "step": 2245, + "token_acc": 0.8835978835978836, + "train_speed(iter/s)": 0.138053 + }, + { + "epoch": 0.21234428086070214, + "grad_norm": 0.5543057322502136, + "learning_rate": 0.00018593439051583653, + "loss": 0.3458813428878784, + "memory(GiB)": 91.64, + "step": 2250, + "token_acc": 0.8912521440823328, + "train_speed(iter/s)": 0.138054 + }, + { + "epoch": 0.2128161570403926, + "grad_norm": 0.2876913845539093, + "learning_rate": 0.00018585448238869393, + "loss": 0.33101418018341067, + "memory(GiB)": 91.64, + "step": 2255, + "token_acc": 0.8919026725169525, + "train_speed(iter/s)": 0.138055 + }, + { + "epoch": 0.21328803322008305, + "grad_norm": 0.3627128303050995, + "learning_rate": 0.00018577436519296247, + "loss": 0.3388057231903076, + "memory(GiB)": 91.64, + "step": 2260, + "token_acc": 0.8724961479198767, + "train_speed(iter/s)": 0.138057 + }, + { + "epoch": 0.2137599093997735, + "grad_norm": 0.5142070055007935, + "learning_rate": 0.00018569403912373951, + "loss": 0.3349132061004639, + "memory(GiB)": 91.64, + "step": 2265, + "token_acc": 0.8903488898957861, + "train_speed(iter/s)": 0.138059 + }, + { + "epoch": 0.21423178557946396, + "grad_norm": 0.29270824790000916, + "learning_rate": 0.00018561350437663115, + "loss": 0.34359285831451414, + "memory(GiB)": 91.64, + "step": 2270, + "token_acc": 0.8882921589688507, + "train_speed(iter/s)": 0.138061 + }, + { + "epoch": 0.2147036617591544, + "grad_norm": 1.000394344329834, + "learning_rate": 0.00018553276114775157, + "loss": 0.34334120750427244, + "memory(GiB)": 91.64, + "step": 2275, + "token_acc": 0.9124653739612189, + "train_speed(iter/s)": 0.138058 + }, + { + "epoch": 0.21517553793884484, + "grad_norm": 0.6676534414291382, + "learning_rate": 0.00018545180963372272, + "loss": 0.33390681743621825, + "memory(GiB)": 91.64, + "step": 2280, + "token_acc": 0.8856531049250536, + "train_speed(iter/s)": 0.138056 + }, + { + "epoch": 0.2156474141185353, + "grad_norm": 0.3300536870956421, + "learning_rate": 0.00018537065003167377, + "loss": 0.3315050840377808, + "memory(GiB)": 91.64, + "step": 2285, + "token_acc": 0.8751619870410368, + "train_speed(iter/s)": 0.138061 + }, + { + "epoch": 0.21611929029822574, + "grad_norm": 0.4177890121936798, + "learning_rate": 0.0001852892825392405, + "loss": 0.3441403865814209, + "memory(GiB)": 91.64, + "step": 2290, + "token_acc": 0.8829141370338248, + "train_speed(iter/s)": 0.138065 + }, + { + "epoch": 0.2165911664779162, + "grad_norm": 0.3767399787902832, + "learning_rate": 0.00018520770735456504, + "loss": 0.3340781211853027, + "memory(GiB)": 91.64, + "step": 2295, + "token_acc": 0.8791012838801712, + "train_speed(iter/s)": 0.138068 + }, + { + "epoch": 0.21706304265760665, + "grad_norm": 0.5728538632392883, + "learning_rate": 0.0001851259246762952, + "loss": 0.33160576820373533, + "memory(GiB)": 91.64, + "step": 2300, + "token_acc": 0.8773809523809524, + "train_speed(iter/s)": 0.13807 + }, + { + "epoch": 0.2175349188372971, + "grad_norm": 0.38368016481399536, + "learning_rate": 0.00018504393470358417, + "loss": 0.33443965911865237, + "memory(GiB)": 91.64, + "step": 2305, + "token_acc": 0.894983866236433, + "train_speed(iter/s)": 0.138071 + }, + { + "epoch": 0.21800679501698755, + "grad_norm": 0.8141961097717285, + "learning_rate": 0.00018496173763608986, + "loss": 0.3331787109375, + "memory(GiB)": 91.64, + "step": 2310, + "token_acc": 0.8896752706078268, + "train_speed(iter/s)": 0.138073 + }, + { + "epoch": 0.218478671196678, + "grad_norm": 0.4129309356212616, + "learning_rate": 0.00018487933367397448, + "loss": 0.33819682598114015, + "memory(GiB)": 91.64, + "step": 2315, + "token_acc": 0.8782961460446247, + "train_speed(iter/s)": 0.138076 + }, + { + "epoch": 0.21895054737636843, + "grad_norm": 0.45642325282096863, + "learning_rate": 0.0001847967230179041, + "loss": 0.34285683631896974, + "memory(GiB)": 91.64, + "step": 2320, + "token_acc": 0.8848589522164652, + "train_speed(iter/s)": 0.138083 + }, + { + "epoch": 0.2194224235560589, + "grad_norm": 0.40785184502601624, + "learning_rate": 0.0001847139058690481, + "loss": 0.3394650459289551, + "memory(GiB)": 91.64, + "step": 2325, + "token_acc": 0.8692338547934216, + "train_speed(iter/s)": 0.138082 + }, + { + "epoch": 0.21989429973574934, + "grad_norm": 0.2887260913848877, + "learning_rate": 0.0001846308824290787, + "loss": 0.3331992864608765, + "memory(GiB)": 91.64, + "step": 2330, + "token_acc": 0.8966675277706019, + "train_speed(iter/s)": 0.13808 + }, + { + "epoch": 0.22036617591543978, + "grad_norm": 0.7396420836448669, + "learning_rate": 0.0001845476529001705, + "loss": 0.33391461372375486, + "memory(GiB)": 91.64, + "step": 2335, + "token_acc": 0.8978511367175335, + "train_speed(iter/s)": 0.138083 + }, + { + "epoch": 0.22083805209513024, + "grad_norm": 0.2597915232181549, + "learning_rate": 0.00018446421748499986, + "loss": 0.3247242450714111, + "memory(GiB)": 91.64, + "step": 2340, + "token_acc": 0.8870259481037924, + "train_speed(iter/s)": 0.138086 + }, + { + "epoch": 0.22130992827482068, + "grad_norm": 0.5094233155250549, + "learning_rate": 0.0001843805763867447, + "loss": 0.3364422798156738, + "memory(GiB)": 91.64, + "step": 2345, + "token_acc": 0.878769782024485, + "train_speed(iter/s)": 0.13809 + }, + { + "epoch": 0.22178180445451115, + "grad_norm": 0.35699471831321716, + "learning_rate": 0.00018429672980908355, + "loss": 0.3271955490112305, + "memory(GiB)": 91.64, + "step": 2350, + "token_acc": 0.8881599500156201, + "train_speed(iter/s)": 0.138088 + }, + { + "epoch": 0.2222536806342016, + "grad_norm": 0.31184324622154236, + "learning_rate": 0.00018421267795619555, + "loss": 0.335361909866333, + "memory(GiB)": 91.64, + "step": 2355, + "token_acc": 0.880623346074684, + "train_speed(iter/s)": 0.138091 + }, + { + "epoch": 0.22272555681389203, + "grad_norm": 0.599033772945404, + "learning_rate": 0.00018412842103275956, + "loss": 0.328480863571167, + "memory(GiB)": 91.64, + "step": 2360, + "token_acc": 0.8927937522186723, + "train_speed(iter/s)": 0.138092 + }, + { + "epoch": 0.2231974329935825, + "grad_norm": 0.5339512825012207, + "learning_rate": 0.00018404395924395388, + "loss": 0.33516457080841067, + "memory(GiB)": 91.64, + "step": 2365, + "token_acc": 0.9051851851851852, + "train_speed(iter/s)": 0.138093 + }, + { + "epoch": 0.22366930917327293, + "grad_norm": 0.5576615929603577, + "learning_rate": 0.0001839592927954557, + "loss": 0.333830738067627, + "memory(GiB)": 91.64, + "step": 2370, + "token_acc": 0.874083519285942, + "train_speed(iter/s)": 0.138096 + }, + { + "epoch": 0.22414118535296337, + "grad_norm": 0.32573401927948, + "learning_rate": 0.00018387442189344056, + "loss": 0.3350250244140625, + "memory(GiB)": 91.64, + "step": 2375, + "token_acc": 0.8787170063481456, + "train_speed(iter/s)": 0.138099 + }, + { + "epoch": 0.22461306153265384, + "grad_norm": 0.7854357361793518, + "learning_rate": 0.00018378934674458187, + "loss": 0.33852076530456543, + "memory(GiB)": 91.64, + "step": 2380, + "token_acc": 0.904169079328315, + "train_speed(iter/s)": 0.138101 + }, + { + "epoch": 0.22508493771234428, + "grad_norm": 0.4120338559150696, + "learning_rate": 0.00018370406755605046, + "loss": 0.33258886337280275, + "memory(GiB)": 91.64, + "step": 2385, + "token_acc": 0.8675496688741722, + "train_speed(iter/s)": 0.138098 + }, + { + "epoch": 0.22555681389203472, + "grad_norm": 0.6627610325813293, + "learning_rate": 0.00018361858453551393, + "loss": 0.329588794708252, + "memory(GiB)": 91.64, + "step": 2390, + "token_acc": 0.8796651552145099, + "train_speed(iter/s)": 0.138103 + }, + { + "epoch": 0.22602869007172519, + "grad_norm": 1.0246161222457886, + "learning_rate": 0.00018353289789113636, + "loss": 0.3348996639251709, + "memory(GiB)": 91.64, + "step": 2395, + "token_acc": 0.895040369088812, + "train_speed(iter/s)": 0.138106 + }, + { + "epoch": 0.22650056625141562, + "grad_norm": 0.34904682636260986, + "learning_rate": 0.0001834470078315776, + "loss": 0.33976428508758544, + "memory(GiB)": 91.64, + "step": 2400, + "token_acc": 0.8983688833124216, + "train_speed(iter/s)": 0.138109 + }, + { + "epoch": 0.22697244243110606, + "grad_norm": 0.3885119557380676, + "learning_rate": 0.00018336091456599288, + "loss": 0.3324185609817505, + "memory(GiB)": 91.64, + "step": 2405, + "token_acc": 0.8879310344827587, + "train_speed(iter/s)": 0.138113 + }, + { + "epoch": 0.22744431861079653, + "grad_norm": 0.8067770600318909, + "learning_rate": 0.00018327461830403228, + "loss": 0.3337501049041748, + "memory(GiB)": 91.64, + "step": 2410, + "token_acc": 0.8831425187524674, + "train_speed(iter/s)": 0.138115 + }, + { + "epoch": 0.22791619479048697, + "grad_norm": 1.321143388748169, + "learning_rate": 0.00018318811925584013, + "loss": 0.34863355159759524, + "memory(GiB)": 91.64, + "step": 2415, + "token_acc": 0.8935230618253189, + "train_speed(iter/s)": 0.138113 + }, + { + "epoch": 0.22838807097017744, + "grad_norm": 0.2674782872200012, + "learning_rate": 0.00018310141763205472, + "loss": 0.3419090747833252, + "memory(GiB)": 91.64, + "step": 2420, + "token_acc": 0.8874555160142349, + "train_speed(iter/s)": 0.13811 + }, + { + "epoch": 0.22885994714986788, + "grad_norm": 0.6279721260070801, + "learning_rate": 0.0001830145136438075, + "loss": 0.33653717041015624, + "memory(GiB)": 91.64, + "step": 2425, + "token_acc": 0.8700137551581844, + "train_speed(iter/s)": 0.138108 + }, + { + "epoch": 0.22933182332955832, + "grad_norm": 0.4765380322933197, + "learning_rate": 0.00018292740750272277, + "loss": 0.33394408226013184, + "memory(GiB)": 91.64, + "step": 2430, + "token_acc": 0.8937568455640745, + "train_speed(iter/s)": 0.138109 + }, + { + "epoch": 0.22980369950924878, + "grad_norm": 0.4631507098674774, + "learning_rate": 0.0001828400994209171, + "loss": 0.33988327980041505, + "memory(GiB)": 91.64, + "step": 2435, + "token_acc": 0.8815399802566634, + "train_speed(iter/s)": 0.138114 + }, + { + "epoch": 0.23027557568893922, + "grad_norm": 0.6181381344795227, + "learning_rate": 0.0001827525896109988, + "loss": 0.3365536451339722, + "memory(GiB)": 91.64, + "step": 2440, + "token_acc": 0.8993469074145217, + "train_speed(iter/s)": 0.138118 + }, + { + "epoch": 0.23074745186862966, + "grad_norm": 0.8235430121421814, + "learning_rate": 0.0001826648782860675, + "loss": 0.33196301460266114, + "memory(GiB)": 91.64, + "step": 2445, + "token_acc": 0.8881469115191987, + "train_speed(iter/s)": 0.13812 + }, + { + "epoch": 0.23121932804832013, + "grad_norm": 0.3884918689727783, + "learning_rate": 0.00018257696565971337, + "loss": 0.34457688331604003, + "memory(GiB)": 91.64, + "step": 2450, + "token_acc": 0.8957507082152975, + "train_speed(iter/s)": 0.138123 + }, + { + "epoch": 0.23169120422801057, + "grad_norm": 1.2805403470993042, + "learning_rate": 0.00018248885194601698, + "loss": 0.3333995580673218, + "memory(GiB)": 91.64, + "step": 2455, + "token_acc": 0.8563068920676203, + "train_speed(iter/s)": 0.138118 + }, + { + "epoch": 0.232163080407701, + "grad_norm": 0.6470995545387268, + "learning_rate": 0.0001824005373595484, + "loss": 0.3331939697265625, + "memory(GiB)": 91.64, + "step": 2460, + "token_acc": 0.8931245745405038, + "train_speed(iter/s)": 0.138119 + }, + { + "epoch": 0.23263495658739147, + "grad_norm": 0.41226083040237427, + "learning_rate": 0.00018231202211536703, + "loss": 0.33048851490020753, + "memory(GiB)": 91.64, + "step": 2465, + "token_acc": 0.9143766271333527, + "train_speed(iter/s)": 0.138121 + }, + { + "epoch": 0.2331068327670819, + "grad_norm": 0.3055603504180908, + "learning_rate": 0.0001822233064290208, + "loss": 0.33061680793762205, + "memory(GiB)": 91.64, + "step": 2470, + "token_acc": 0.8926151761517616, + "train_speed(iter/s)": 0.138122 + }, + { + "epoch": 0.23357870894677238, + "grad_norm": 0.44368642568588257, + "learning_rate": 0.0001821343905165457, + "loss": 0.33341374397277834, + "memory(GiB)": 91.64, + "step": 2475, + "token_acc": 0.904647983595352, + "train_speed(iter/s)": 0.138122 + }, + { + "epoch": 0.23405058512646282, + "grad_norm": 0.6508908867835999, + "learning_rate": 0.00018204527459446542, + "loss": 0.3335702419281006, + "memory(GiB)": 91.64, + "step": 2480, + "token_acc": 0.8927335640138409, + "train_speed(iter/s)": 0.138129 + }, + { + "epoch": 0.23452246130615326, + "grad_norm": 0.6149075031280518, + "learning_rate": 0.00018195595887979062, + "loss": 0.32854518890380857, + "memory(GiB)": 91.64, + "step": 2485, + "token_acc": 0.8804051421893261, + "train_speed(iter/s)": 0.138129 + }, + { + "epoch": 0.23499433748584372, + "grad_norm": 0.6348846554756165, + "learning_rate": 0.0001818664435900185, + "loss": 0.328797721862793, + "memory(GiB)": 91.64, + "step": 2490, + "token_acc": 0.8710801393728222, + "train_speed(iter/s)": 0.138132 + }, + { + "epoch": 0.23546621366553416, + "grad_norm": 0.37644708156585693, + "learning_rate": 0.00018177672894313234, + "loss": 0.3341526508331299, + "memory(GiB)": 91.64, + "step": 2495, + "token_acc": 0.8936955063715627, + "train_speed(iter/s)": 0.138135 + }, + { + "epoch": 0.2359380898452246, + "grad_norm": 0.4598802924156189, + "learning_rate": 0.00018168681515760068, + "loss": 0.3312446355819702, + "memory(GiB)": 91.64, + "step": 2500, + "token_acc": 0.897131552917903, + "train_speed(iter/s)": 0.138137 + }, + { + "epoch": 0.23640996602491507, + "grad_norm": 0.48607075214385986, + "learning_rate": 0.00018159670245237726, + "loss": 0.3310050964355469, + "memory(GiB)": 91.64, + "step": 2505, + "token_acc": 0.884657634566093, + "train_speed(iter/s)": 0.138142 + }, + { + "epoch": 0.2368818422046055, + "grad_norm": 0.38543701171875, + "learning_rate": 0.0001815063910469, + "loss": 0.3297031164169312, + "memory(GiB)": 91.64, + "step": 2510, + "token_acc": 0.9097881665449233, + "train_speed(iter/s)": 0.138138 + }, + { + "epoch": 0.23735371838429595, + "grad_norm": 0.5178254842758179, + "learning_rate": 0.00018141588116109077, + "loss": 0.32350549697875974, + "memory(GiB)": 91.64, + "step": 2515, + "token_acc": 0.890393567498942, + "train_speed(iter/s)": 0.13814 + }, + { + "epoch": 0.23782559456398641, + "grad_norm": 0.33711934089660645, + "learning_rate": 0.0001813251730153548, + "loss": 0.3261995553970337, + "memory(GiB)": 91.64, + "step": 2520, + "token_acc": 0.8786370597243491, + "train_speed(iter/s)": 0.138146 + }, + { + "epoch": 0.23829747074367685, + "grad_norm": 0.6296564340591431, + "learning_rate": 0.00018123426683058007, + "loss": 0.3304123401641846, + "memory(GiB)": 91.64, + "step": 2525, + "token_acc": 0.8883516483516484, + "train_speed(iter/s)": 0.13815 + }, + { + "epoch": 0.23876934692336732, + "grad_norm": 0.6025164127349854, + "learning_rate": 0.0001811431628281368, + "loss": 0.323737645149231, + "memory(GiB)": 91.64, + "step": 2530, + "token_acc": 0.8825503355704698, + "train_speed(iter/s)": 0.138151 + }, + { + "epoch": 0.23924122310305776, + "grad_norm": 0.4461984932422638, + "learning_rate": 0.000181051861229877, + "loss": 0.3296360015869141, + "memory(GiB)": 91.64, + "step": 2535, + "token_acc": 0.8823049741777657, + "train_speed(iter/s)": 0.138152 + }, + { + "epoch": 0.2397130992827482, + "grad_norm": 0.3749321401119232, + "learning_rate": 0.00018096036225813373, + "loss": 0.3249626636505127, + "memory(GiB)": 91.64, + "step": 2540, + "token_acc": 0.8823712948517941, + "train_speed(iter/s)": 0.138157 + }, + { + "epoch": 0.24018497546243867, + "grad_norm": 0.4789837598800659, + "learning_rate": 0.00018086866613572085, + "loss": 0.32210140228271483, + "memory(GiB)": 91.64, + "step": 2545, + "token_acc": 0.8996778647031753, + "train_speed(iter/s)": 0.138159 + }, + { + "epoch": 0.2406568516421291, + "grad_norm": 0.30677059292793274, + "learning_rate": 0.00018077677308593216, + "loss": 0.3380124568939209, + "memory(GiB)": 91.64, + "step": 2550, + "token_acc": 0.8955823293172691, + "train_speed(iter/s)": 0.138164 + }, + { + "epoch": 0.24112872782181954, + "grad_norm": 0.26206114888191223, + "learning_rate": 0.00018068468333254107, + "loss": 0.33399908542633056, + "memory(GiB)": 91.64, + "step": 2555, + "token_acc": 0.8851699279093718, + "train_speed(iter/s)": 0.138167 + }, + { + "epoch": 0.24160060400151, + "grad_norm": 0.3598458170890808, + "learning_rate": 0.00018059239709980002, + "loss": 0.33064751625061034, + "memory(GiB)": 91.64, + "step": 2560, + "token_acc": 0.8942148760330578, + "train_speed(iter/s)": 0.138168 + }, + { + "epoch": 0.24207248018120045, + "grad_norm": 0.32919731736183167, + "learning_rate": 0.00018049991461243988, + "loss": 0.33213248252868655, + "memory(GiB)": 91.64, + "step": 2565, + "token_acc": 0.8634217217580821, + "train_speed(iter/s)": 0.138164 + }, + { + "epoch": 0.2425443563608909, + "grad_norm": 0.37082603573799133, + "learning_rate": 0.00018040723609566943, + "loss": 0.33206088542938234, + "memory(GiB)": 91.64, + "step": 2570, + "token_acc": 0.8974438902743143, + "train_speed(iter/s)": 0.138169 + }, + { + "epoch": 0.24301623254058136, + "grad_norm": 0.875857412815094, + "learning_rate": 0.00018031436177517478, + "loss": 0.3363888502120972, + "memory(GiB)": 91.64, + "step": 2575, + "token_acc": 0.8790530108080288, + "train_speed(iter/s)": 0.138168 + }, + { + "epoch": 0.2434881087202718, + "grad_norm": 0.5113961696624756, + "learning_rate": 0.0001802212918771189, + "loss": 0.3324090003967285, + "memory(GiB)": 91.64, + "step": 2580, + "token_acc": 0.886832363828662, + "train_speed(iter/s)": 0.13817 + }, + { + "epoch": 0.24395998489996226, + "grad_norm": 0.4555577337741852, + "learning_rate": 0.000180128026628141, + "loss": 0.3298499345779419, + "memory(GiB)": 91.64, + "step": 2585, + "token_acc": 0.8806023664395841, + "train_speed(iter/s)": 0.138172 + }, + { + "epoch": 0.2444318610796527, + "grad_norm": 0.7389398217201233, + "learning_rate": 0.00018003456625535603, + "loss": 0.32846970558166505, + "memory(GiB)": 91.64, + "step": 2590, + "token_acc": 0.8857536132140399, + "train_speed(iter/s)": 0.138176 + }, + { + "epoch": 0.24490373725934314, + "grad_norm": 0.36795300245285034, + "learning_rate": 0.000179940910986354, + "loss": 0.32522149085998536, + "memory(GiB)": 91.64, + "step": 2595, + "token_acc": 0.8899871078642029, + "train_speed(iter/s)": 0.138179 + }, + { + "epoch": 0.2453756134390336, + "grad_norm": 0.5684471726417542, + "learning_rate": 0.00017984706104919965, + "loss": 0.3347635746002197, + "memory(GiB)": 91.64, + "step": 2600, + "token_acc": 0.8730314960629921, + "train_speed(iter/s)": 0.138185 + }, + { + "epoch": 0.24584748961872405, + "grad_norm": 0.4991260766983032, + "learning_rate": 0.00017975301667243166, + "loss": 0.32751965522766113, + "memory(GiB)": 91.64, + "step": 2605, + "token_acc": 0.8909626719056974, + "train_speed(iter/s)": 0.13819 + }, + { + "epoch": 0.24631936579841449, + "grad_norm": 0.5270681381225586, + "learning_rate": 0.00017965877808506228, + "loss": 0.32968854904174805, + "memory(GiB)": 91.64, + "step": 2610, + "token_acc": 0.8979523329976502, + "train_speed(iter/s)": 0.138192 + }, + { + "epoch": 0.24679124197810495, + "grad_norm": 0.713347315788269, + "learning_rate": 0.00017956434551657667, + "loss": 0.32762675285339354, + "memory(GiB)": 91.64, + "step": 2615, + "token_acc": 0.8938199917046868, + "train_speed(iter/s)": 0.138197 + }, + { + "epoch": 0.2472631181577954, + "grad_norm": 0.32395628094673157, + "learning_rate": 0.00017946971919693229, + "loss": 0.3311192989349365, + "memory(GiB)": 91.64, + "step": 2620, + "token_acc": 0.8778821520068317, + "train_speed(iter/s)": 0.138197 + }, + { + "epoch": 0.24773499433748583, + "grad_norm": 0.4176148474216461, + "learning_rate": 0.0001793748993565585, + "loss": 0.3304103136062622, + "memory(GiB)": 91.64, + "step": 2625, + "token_acc": 0.8812527185732927, + "train_speed(iter/s)": 0.138198 + }, + { + "epoch": 0.2482068705171763, + "grad_norm": 0.5663833618164062, + "learning_rate": 0.0001792798862263559, + "loss": 0.32198286056518555, + "memory(GiB)": 91.64, + "step": 2630, + "token_acc": 0.8690159574468085, + "train_speed(iter/s)": 0.138197 + }, + { + "epoch": 0.24867874669686674, + "grad_norm": 0.3792598247528076, + "learning_rate": 0.0001791846800376958, + "loss": 0.31867480278015137, + "memory(GiB)": 91.64, + "step": 2635, + "token_acc": 0.8866906474820144, + "train_speed(iter/s)": 0.1382 + }, + { + "epoch": 0.2491506228765572, + "grad_norm": 0.32338786125183105, + "learning_rate": 0.00017908928102241953, + "loss": 0.3264533519744873, + "memory(GiB)": 91.64, + "step": 2640, + "token_acc": 0.8988023952095808, + "train_speed(iter/s)": 0.138202 + }, + { + "epoch": 0.24962249905624764, + "grad_norm": 1.4472591876983643, + "learning_rate": 0.00017899368941283808, + "loss": 0.3345726490020752, + "memory(GiB)": 91.64, + "step": 2645, + "token_acc": 0.8777016341591988, + "train_speed(iter/s)": 0.138203 + }, + { + "epoch": 0.2500943752359381, + "grad_norm": 0.6152178645133972, + "learning_rate": 0.00017889790544173143, + "loss": 0.3289012432098389, + "memory(GiB)": 91.64, + "step": 2650, + "token_acc": 0.8993955094991365, + "train_speed(iter/s)": 0.138207 + }, + { + "epoch": 0.25056625141562855, + "grad_norm": 0.3531012237071991, + "learning_rate": 0.00017880192934234792, + "loss": 0.3324916362762451, + "memory(GiB)": 91.64, + "step": 2655, + "token_acc": 0.9044006069802731, + "train_speed(iter/s)": 0.13821 + }, + { + "epoch": 0.25103812759531896, + "grad_norm": 0.6609277725219727, + "learning_rate": 0.00017870576134840381, + "loss": 0.327985954284668, + "memory(GiB)": 91.64, + "step": 2660, + "token_acc": 0.8779822767552828, + "train_speed(iter/s)": 0.13821 + }, + { + "epoch": 0.2515100037750094, + "grad_norm": 0.297342449426651, + "learning_rate": 0.00017860940169408274, + "loss": 0.3250416278839111, + "memory(GiB)": 91.64, + "step": 2665, + "token_acc": 0.8967880085653105, + "train_speed(iter/s)": 0.138215 + }, + { + "epoch": 0.2519818799546999, + "grad_norm": 0.4424315094947815, + "learning_rate": 0.00017851285061403483, + "loss": 0.3348080158233643, + "memory(GiB)": 91.64, + "step": 2670, + "token_acc": 0.875974025974026, + "train_speed(iter/s)": 0.138217 + }, + { + "epoch": 0.25245375613439036, + "grad_norm": 0.3924655020236969, + "learning_rate": 0.0001784161083433766, + "loss": 0.32940819263458254, + "memory(GiB)": 91.64, + "step": 2675, + "token_acc": 0.8746097814776275, + "train_speed(iter/s)": 0.138219 + }, + { + "epoch": 0.2529256323140808, + "grad_norm": 0.4169272482395172, + "learning_rate": 0.00017831917511769, + "loss": 0.32938084602355955, + "memory(GiB)": 91.64, + "step": 2680, + "token_acc": 0.8867924528301887, + "train_speed(iter/s)": 0.138222 + }, + { + "epoch": 0.25339750849377124, + "grad_norm": 0.5509438514709473, + "learning_rate": 0.000178222051173022, + "loss": 0.32615439891815184, + "memory(GiB)": 91.64, + "step": 2685, + "token_acc": 0.8933631618195377, + "train_speed(iter/s)": 0.138225 + }, + { + "epoch": 0.2538693846734617, + "grad_norm": 0.27454128861427307, + "learning_rate": 0.00017812473674588407, + "loss": 0.3298367977142334, + "memory(GiB)": 91.64, + "step": 2690, + "token_acc": 0.8921161825726142, + "train_speed(iter/s)": 0.138226 + }, + { + "epoch": 0.2543412608531521, + "grad_norm": 0.2513178884983063, + "learning_rate": 0.0001780272320732515, + "loss": 0.3351443767547607, + "memory(GiB)": 91.64, + "step": 2695, + "token_acc": 0.8785100286532951, + "train_speed(iter/s)": 0.138227 + }, + { + "epoch": 0.2548131370328426, + "grad_norm": 0.3216034471988678, + "learning_rate": 0.00017792953739256278, + "loss": 0.3156256198883057, + "memory(GiB)": 91.64, + "step": 2700, + "token_acc": 0.8910359634997316, + "train_speed(iter/s)": 0.13823 + }, + { + "epoch": 0.25528501321253305, + "grad_norm": 0.7129026651382446, + "learning_rate": 0.0001778316529417192, + "loss": 0.3225963354110718, + "memory(GiB)": 91.64, + "step": 2705, + "token_acc": 0.8915942028985507, + "train_speed(iter/s)": 0.138232 + }, + { + "epoch": 0.25575688939222346, + "grad_norm": 1.063512921333313, + "learning_rate": 0.0001777335789590842, + "loss": 0.3296067237854004, + "memory(GiB)": 91.64, + "step": 2710, + "token_acc": 0.8891454965357968, + "train_speed(iter/s)": 0.138231 + }, + { + "epoch": 0.25622876557191393, + "grad_norm": 0.6636164784431458, + "learning_rate": 0.0001776353156834826, + "loss": 0.32224364280700685, + "memory(GiB)": 91.64, + "step": 2715, + "token_acc": 0.8841752721183366, + "train_speed(iter/s)": 0.138229 + }, + { + "epoch": 0.2567006417516044, + "grad_norm": 0.5418056845664978, + "learning_rate": 0.00017753686335420028, + "loss": 0.32490553855896, + "memory(GiB)": 91.64, + "step": 2720, + "token_acc": 0.8908418131359852, + "train_speed(iter/s)": 0.13823 + }, + { + "epoch": 0.2571725179312948, + "grad_norm": 0.984113872051239, + "learning_rate": 0.0001774382222109835, + "loss": 0.3314798831939697, + "memory(GiB)": 91.64, + "step": 2725, + "token_acc": 0.8758210822646231, + "train_speed(iter/s)": 0.138232 + }, + { + "epoch": 0.2576443941109853, + "grad_norm": 0.35621482133865356, + "learning_rate": 0.00017733939249403835, + "loss": 0.3266749382019043, + "memory(GiB)": 91.64, + "step": 2730, + "token_acc": 0.8859649122807017, + "train_speed(iter/s)": 0.138236 + }, + { + "epoch": 0.25811627029067574, + "grad_norm": 0.31070077419281006, + "learning_rate": 0.00017724037444402993, + "loss": 0.3295797109603882, + "memory(GiB)": 91.64, + "step": 2735, + "token_acc": 0.8840749414519906, + "train_speed(iter/s)": 0.138237 + }, + { + "epoch": 0.25858814647036615, + "grad_norm": 0.36651989817619324, + "learning_rate": 0.00017714116830208228, + "loss": 0.3238197326660156, + "memory(GiB)": 91.64, + "step": 2740, + "token_acc": 0.8655913978494624, + "train_speed(iter/s)": 0.13824 + }, + { + "epoch": 0.2590600226500566, + "grad_norm": 0.3921261131763458, + "learning_rate": 0.00017704177430977712, + "loss": 0.3325662612915039, + "memory(GiB)": 91.64, + "step": 2745, + "token_acc": 0.8959709379128138, + "train_speed(iter/s)": 0.138242 + }, + { + "epoch": 0.2595318988297471, + "grad_norm": 0.2784421741962433, + "learning_rate": 0.00017694219270915387, + "loss": 0.318132209777832, + "memory(GiB)": 91.64, + "step": 2750, + "token_acc": 0.8974579922447221, + "train_speed(iter/s)": 0.138246 + }, + { + "epoch": 0.2600037750094375, + "grad_norm": 0.536296010017395, + "learning_rate": 0.0001768424237427087, + "loss": 0.327205753326416, + "memory(GiB)": 91.64, + "step": 2755, + "token_acc": 0.9128829826504246, + "train_speed(iter/s)": 0.138246 + }, + { + "epoch": 0.26047565118912797, + "grad_norm": 0.29977917671203613, + "learning_rate": 0.00017674246765339406, + "loss": 0.3198941707611084, + "memory(GiB)": 91.64, + "step": 2760, + "token_acc": 0.8957388939256573, + "train_speed(iter/s)": 0.13825 + }, + { + "epoch": 0.26094752736881843, + "grad_norm": 0.26089954376220703, + "learning_rate": 0.00017664232468461808, + "loss": 0.3271759510040283, + "memory(GiB)": 91.64, + "step": 2765, + "token_acc": 0.8782735208535403, + "train_speed(iter/s)": 0.13825 + }, + { + "epoch": 0.26141940354850884, + "grad_norm": 0.6381005644798279, + "learning_rate": 0.00017654199508024396, + "loss": 0.31565151214599607, + "memory(GiB)": 91.64, + "step": 2770, + "token_acc": 0.8883669521967394, + "train_speed(iter/s)": 0.138249 + }, + { + "epoch": 0.2618912797281993, + "grad_norm": 0.35093823075294495, + "learning_rate": 0.0001764414790845894, + "loss": 0.32278971672058104, + "memory(GiB)": 91.64, + "step": 2775, + "token_acc": 0.8895833333333333, + "train_speed(iter/s)": 0.138248 + }, + { + "epoch": 0.2623631559078898, + "grad_norm": 0.5740118026733398, + "learning_rate": 0.00017634077694242599, + "loss": 0.32536165714263915, + "memory(GiB)": 91.64, + "step": 2780, + "token_acc": 0.8868584758942457, + "train_speed(iter/s)": 0.138251 + }, + { + "epoch": 0.26283503208758024, + "grad_norm": 0.30373477935791016, + "learning_rate": 0.00017623988889897856, + "loss": 0.31938059329986573, + "memory(GiB)": 91.64, + "step": 2785, + "token_acc": 0.8855127509495387, + "train_speed(iter/s)": 0.138254 + }, + { + "epoch": 0.26330690826727066, + "grad_norm": 0.400481641292572, + "learning_rate": 0.00017613881519992474, + "loss": 0.3272983551025391, + "memory(GiB)": 91.64, + "step": 2790, + "token_acc": 0.8951557093425605, + "train_speed(iter/s)": 0.138251 + }, + { + "epoch": 0.2637787844469611, + "grad_norm": 0.26797062158584595, + "learning_rate": 0.00017603755609139413, + "loss": 0.32006087303161623, + "memory(GiB)": 91.64, + "step": 2795, + "token_acc": 0.8942536552193131, + "train_speed(iter/s)": 0.138254 + }, + { + "epoch": 0.2642506606266516, + "grad_norm": 0.3913300633430481, + "learning_rate": 0.00017593611181996802, + "loss": 0.316060733795166, + "memory(GiB)": 91.64, + "step": 2800, + "token_acc": 0.8809629959875167, + "train_speed(iter/s)": 0.138253 + }, + { + "epoch": 0.264722536806342, + "grad_norm": 0.6603167057037354, + "learning_rate": 0.00017583448263267835, + "loss": 0.32715344429016113, + "memory(GiB)": 91.64, + "step": 2805, + "token_acc": 0.8789531079607416, + "train_speed(iter/s)": 0.138254 + }, + { + "epoch": 0.26519441298603247, + "grad_norm": 0.5087538957595825, + "learning_rate": 0.00017573266877700755, + "loss": 0.32497029304504393, + "memory(GiB)": 91.64, + "step": 2810, + "token_acc": 0.8834763354553004, + "train_speed(iter/s)": 0.138253 + }, + { + "epoch": 0.26566628916572294, + "grad_norm": 0.5204217433929443, + "learning_rate": 0.00017563067050088772, + "loss": 0.33654916286468506, + "memory(GiB)": 91.64, + "step": 2815, + "token_acc": 0.871804654711942, + "train_speed(iter/s)": 0.13826 + }, + { + "epoch": 0.26613816534541335, + "grad_norm": 0.7432185411453247, + "learning_rate": 0.0001755284880527, + "loss": 0.3227522611618042, + "memory(GiB)": 91.64, + "step": 2820, + "token_acc": 0.8913313977602655, + "train_speed(iter/s)": 0.138262 + }, + { + "epoch": 0.2666100415251038, + "grad_norm": 0.7067577242851257, + "learning_rate": 0.00017542612168127395, + "loss": 0.3221942186355591, + "memory(GiB)": 91.64, + "step": 2825, + "token_acc": 0.9026233881725211, + "train_speed(iter/s)": 0.138265 + }, + { + "epoch": 0.2670819177047943, + "grad_norm": 0.29249584674835205, + "learning_rate": 0.0001753235716358872, + "loss": 0.32654409408569335, + "memory(GiB)": 91.64, + "step": 2830, + "token_acc": 0.9053850025163563, + "train_speed(iter/s)": 0.138265 + }, + { + "epoch": 0.2675537938844847, + "grad_norm": 0.2350318282842636, + "learning_rate": 0.00017522083816626452, + "loss": 0.3282480239868164, + "memory(GiB)": 91.64, + "step": 2835, + "token_acc": 0.8849056603773585, + "train_speed(iter/s)": 0.138263 + }, + { + "epoch": 0.26802567006417516, + "grad_norm": 0.3909781873226166, + "learning_rate": 0.00017511792152257735, + "loss": 0.32073369026184084, + "memory(GiB)": 91.64, + "step": 2840, + "token_acc": 0.8862385321100917, + "train_speed(iter/s)": 0.138259 + }, + { + "epoch": 0.2684975462438656, + "grad_norm": 0.42215001583099365, + "learning_rate": 0.0001750148219554432, + "loss": 0.3243825435638428, + "memory(GiB)": 91.64, + "step": 2845, + "token_acc": 0.8947040498442368, + "train_speed(iter/s)": 0.138263 + }, + { + "epoch": 0.26896942242355604, + "grad_norm": 0.5986440181732178, + "learning_rate": 0.00017491153971592506, + "loss": 0.3247550010681152, + "memory(GiB)": 91.64, + "step": 2850, + "token_acc": 0.8752844788347747, + "train_speed(iter/s)": 0.138259 + }, + { + "epoch": 0.2694412986032465, + "grad_norm": 0.3086279630661011, + "learning_rate": 0.00017480807505553076, + "loss": 0.3214439868927002, + "memory(GiB)": 91.64, + "step": 2855, + "token_acc": 0.8910550458715596, + "train_speed(iter/s)": 0.138259 + }, + { + "epoch": 0.26991317478293697, + "grad_norm": 0.275035560131073, + "learning_rate": 0.00017470442822621228, + "loss": 0.32530651092529295, + "memory(GiB)": 91.64, + "step": 2860, + "token_acc": 0.8911111111111111, + "train_speed(iter/s)": 0.138259 + }, + { + "epoch": 0.2703850509626274, + "grad_norm": 0.6449964046478271, + "learning_rate": 0.00017460059948036527, + "loss": 0.32490763664245603, + "memory(GiB)": 91.64, + "step": 2865, + "token_acc": 0.882903981264637, + "train_speed(iter/s)": 0.138266 + }, + { + "epoch": 0.27085692714231785, + "grad_norm": 0.8364949226379395, + "learning_rate": 0.00017449658907082833, + "loss": 0.32590255737304685, + "memory(GiB)": 91.64, + "step": 2870, + "token_acc": 0.8773987206823027, + "train_speed(iter/s)": 0.138269 + }, + { + "epoch": 0.2713288033220083, + "grad_norm": 1.0155036449432373, + "learning_rate": 0.0001743923972508825, + "loss": 0.32789173126220705, + "memory(GiB)": 91.64, + "step": 2875, + "token_acc": 0.889273356401384, + "train_speed(iter/s)": 0.138268 + }, + { + "epoch": 0.2718006795016987, + "grad_norm": 0.6500275731086731, + "learning_rate": 0.00017428802427425053, + "loss": 0.3223546504974365, + "memory(GiB)": 91.64, + "step": 2880, + "token_acc": 0.8844172569220863, + "train_speed(iter/s)": 0.138272 + }, + { + "epoch": 0.2722725556813892, + "grad_norm": 0.5770022869110107, + "learning_rate": 0.00017418347039509634, + "loss": 0.3223582744598389, + "memory(GiB)": 91.64, + "step": 2885, + "token_acc": 0.8766788766788767, + "train_speed(iter/s)": 0.138279 + }, + { + "epoch": 0.27274443186107966, + "grad_norm": 0.4336351454257965, + "learning_rate": 0.00017407873586802435, + "loss": 0.324751091003418, + "memory(GiB)": 91.64, + "step": 2890, + "token_acc": 0.8822988505747127, + "train_speed(iter/s)": 0.138273 + }, + { + "epoch": 0.27321630804077013, + "grad_norm": 0.49264365434646606, + "learning_rate": 0.00017397382094807892, + "loss": 0.315723180770874, + "memory(GiB)": 91.64, + "step": 2895, + "token_acc": 0.8913617502829121, + "train_speed(iter/s)": 0.138277 + }, + { + "epoch": 0.27368818422046054, + "grad_norm": 0.47523409128189087, + "learning_rate": 0.00017386872589074366, + "loss": 0.32211527824401853, + "memory(GiB)": 91.64, + "step": 2900, + "token_acc": 0.8896848137535817, + "train_speed(iter/s)": 0.13828 + }, + { + "epoch": 0.274160060400151, + "grad_norm": 0.5986055135726929, + "learning_rate": 0.00017376345095194084, + "loss": 0.3235922336578369, + "memory(GiB)": 91.64, + "step": 2905, + "token_acc": 0.882665832290363, + "train_speed(iter/s)": 0.138282 + }, + { + "epoch": 0.2746319365798415, + "grad_norm": 0.25783222913742065, + "learning_rate": 0.0001736579963880308, + "loss": 0.31179332733154297, + "memory(GiB)": 91.64, + "step": 2910, + "token_acc": 0.8972863302054274, + "train_speed(iter/s)": 0.138281 + }, + { + "epoch": 0.2751038127595319, + "grad_norm": 0.7452314496040344, + "learning_rate": 0.0001735523624558113, + "loss": 0.3252346277236938, + "memory(GiB)": 91.64, + "step": 2915, + "token_acc": 0.884393063583815, + "train_speed(iter/s)": 0.138284 + }, + { + "epoch": 0.27557568893922235, + "grad_norm": 0.4005107879638672, + "learning_rate": 0.00017344654941251682, + "loss": 0.3287190437316895, + "memory(GiB)": 91.64, + "step": 2920, + "token_acc": 0.8755208333333333, + "train_speed(iter/s)": 0.138286 + }, + { + "epoch": 0.2760475651189128, + "grad_norm": 0.7920652031898499, + "learning_rate": 0.00017334055751581812, + "loss": 0.323880934715271, + "memory(GiB)": 91.64, + "step": 2925, + "token_acc": 0.8801882755669662, + "train_speed(iter/s)": 0.138289 + }, + { + "epoch": 0.27651944129860323, + "grad_norm": 0.34472015500068665, + "learning_rate": 0.0001732343870238213, + "loss": 0.3178147554397583, + "memory(GiB)": 91.64, + "step": 2930, + "token_acc": 0.8979652020053082, + "train_speed(iter/s)": 0.138289 + }, + { + "epoch": 0.2769913174782937, + "grad_norm": 0.38036808371543884, + "learning_rate": 0.00017312803819506762, + "loss": 0.3199058771133423, + "memory(GiB)": 91.64, + "step": 2935, + "token_acc": 0.9014567266495287, + "train_speed(iter/s)": 0.138292 + }, + { + "epoch": 0.27746319365798416, + "grad_norm": 0.36448225378990173, + "learning_rate": 0.00017302151128853244, + "loss": 0.31929521560668944, + "memory(GiB)": 91.64, + "step": 2940, + "token_acc": 0.8818827708703375, + "train_speed(iter/s)": 0.138299 + }, + { + "epoch": 0.2779350698376746, + "grad_norm": 0.3512881100177765, + "learning_rate": 0.00017291480656362479, + "loss": 0.31873791217803954, + "memory(GiB)": 91.64, + "step": 2945, + "token_acc": 0.9066164154103853, + "train_speed(iter/s)": 0.1383 + }, + { + "epoch": 0.27840694601736504, + "grad_norm": 0.4474186301231384, + "learning_rate": 0.00017280792428018678, + "loss": 0.3215645313262939, + "memory(GiB)": 91.64, + "step": 2950, + "token_acc": 0.8794084186575654, + "train_speed(iter/s)": 0.138302 + }, + { + "epoch": 0.2788788221970555, + "grad_norm": 0.5548481345176697, + "learning_rate": 0.0001727008646984928, + "loss": 0.3308894634246826, + "memory(GiB)": 91.64, + "step": 2955, + "token_acc": 0.9011608623548922, + "train_speed(iter/s)": 0.1383 + }, + { + "epoch": 0.2793506983767459, + "grad_norm": 0.5263611078262329, + "learning_rate": 0.00017259362807924914, + "loss": 0.3184781074523926, + "memory(GiB)": 91.64, + "step": 2960, + "token_acc": 0.9090909090909091, + "train_speed(iter/s)": 0.138303 + }, + { + "epoch": 0.2798225745564364, + "grad_norm": 0.49656400084495544, + "learning_rate": 0.000172486214683593, + "loss": 0.3242170333862305, + "memory(GiB)": 91.64, + "step": 2965, + "token_acc": 0.8780850431162652, + "train_speed(iter/s)": 0.138303 + }, + { + "epoch": 0.28029445073612685, + "grad_norm": 0.33543533086776733, + "learning_rate": 0.00017237862477309225, + "loss": 0.3202751636505127, + "memory(GiB)": 91.64, + "step": 2970, + "token_acc": 0.900549115314216, + "train_speed(iter/s)": 0.138304 + }, + { + "epoch": 0.28076632691581727, + "grad_norm": 0.2617323100566864, + "learning_rate": 0.00017227085860974453, + "loss": 0.31397452354431155, + "memory(GiB)": 91.64, + "step": 2975, + "token_acc": 0.8893352812271731, + "train_speed(iter/s)": 0.138308 + }, + { + "epoch": 0.28123820309550773, + "grad_norm": 0.33117854595184326, + "learning_rate": 0.0001721629164559766, + "loss": 0.313932466506958, + "memory(GiB)": 91.64, + "step": 2980, + "token_acc": 0.8594104308390023, + "train_speed(iter/s)": 0.138311 + }, + { + "epoch": 0.2817100792751982, + "grad_norm": 0.27345171570777893, + "learning_rate": 0.00017205479857464387, + "loss": 0.3203251361846924, + "memory(GiB)": 91.64, + "step": 2985, + "token_acc": 0.8858647936786654, + "train_speed(iter/s)": 0.138315 + }, + { + "epoch": 0.2821819554548886, + "grad_norm": 0.30517563223838806, + "learning_rate": 0.0001719465052290297, + "loss": 0.3147796630859375, + "memory(GiB)": 91.64, + "step": 2990, + "token_acc": 0.8925373134328358, + "train_speed(iter/s)": 0.138317 + }, + { + "epoch": 0.2826538316345791, + "grad_norm": 0.6084278225898743, + "learning_rate": 0.00017183803668284467, + "loss": 0.32270588874816897, + "memory(GiB)": 91.64, + "step": 2995, + "token_acc": 0.8847248576850095, + "train_speed(iter/s)": 0.138318 + }, + { + "epoch": 0.28312570781426954, + "grad_norm": 0.9644866585731506, + "learning_rate": 0.000171729393200226, + "loss": 0.315791392326355, + "memory(GiB)": 91.64, + "step": 3000, + "token_acc": 0.8951439621830684, + "train_speed(iter/s)": 0.13832 + }, + { + "epoch": 0.28359758399396, + "grad_norm": 0.4065193235874176, + "learning_rate": 0.00017162057504573695, + "loss": 0.31624255180358884, + "memory(GiB)": 91.64, + "step": 3005, + "token_acc": 0.8852857721929469, + "train_speed(iter/s)": 0.138323 + }, + { + "epoch": 0.2840694601736504, + "grad_norm": 0.6837791800498962, + "learning_rate": 0.00017151158248436608, + "loss": 0.32163176536560056, + "memory(GiB)": 91.64, + "step": 3010, + "token_acc": 0.8927978758712247, + "train_speed(iter/s)": 0.138325 + }, + { + "epoch": 0.2845413363533409, + "grad_norm": 0.3094852566719055, + "learning_rate": 0.0001714024157815267, + "loss": 0.3260573625564575, + "memory(GiB)": 91.64, + "step": 3015, + "token_acc": 0.9311740890688259, + "train_speed(iter/s)": 0.138327 + }, + { + "epoch": 0.28501321253303136, + "grad_norm": 0.45613107085227966, + "learning_rate": 0.00017129307520305615, + "loss": 0.3153514385223389, + "memory(GiB)": 91.64, + "step": 3020, + "token_acc": 0.8873579056148811, + "train_speed(iter/s)": 0.138327 + }, + { + "epoch": 0.28548508871272177, + "grad_norm": 0.3602701425552368, + "learning_rate": 0.00017118356101521523, + "loss": 0.3154455184936523, + "memory(GiB)": 91.64, + "step": 3025, + "token_acc": 0.8888233559422, + "train_speed(iter/s)": 0.138328 + }, + { + "epoch": 0.28595696489241224, + "grad_norm": 0.35357511043548584, + "learning_rate": 0.00017107387348468746, + "loss": 0.3162027597427368, + "memory(GiB)": 91.64, + "step": 3030, + "token_acc": 0.8909919383105503, + "train_speed(iter/s)": 0.138331 + }, + { + "epoch": 0.2864288410721027, + "grad_norm": 0.6155738234519958, + "learning_rate": 0.0001709640128785785, + "loss": 0.3166818141937256, + "memory(GiB)": 91.64, + "step": 3035, + "token_acc": 0.8842105263157894, + "train_speed(iter/s)": 0.13833 + }, + { + "epoch": 0.2869007172517931, + "grad_norm": 0.47968873381614685, + "learning_rate": 0.00017085397946441542, + "loss": 0.3206371784210205, + "memory(GiB)": 91.64, + "step": 3040, + "token_acc": 0.8922330097087379, + "train_speed(iter/s)": 0.138328 + }, + { + "epoch": 0.2873725934314836, + "grad_norm": 0.4527382254600525, + "learning_rate": 0.00017074377351014618, + "loss": 0.3101963996887207, + "memory(GiB)": 91.64, + "step": 3045, + "token_acc": 0.9003378378378378, + "train_speed(iter/s)": 0.138329 + }, + { + "epoch": 0.28784446961117405, + "grad_norm": 0.3803810775279999, + "learning_rate": 0.0001706333952841389, + "loss": 0.31847975254058836, + "memory(GiB)": 91.64, + "step": 3050, + "token_acc": 0.9010079193664506, + "train_speed(iter/s)": 0.13833 + }, + { + "epoch": 0.28831634579086446, + "grad_norm": 0.4184848666191101, + "learning_rate": 0.0001705228450551811, + "loss": 0.32082357406616213, + "memory(GiB)": 91.64, + "step": 3055, + "token_acc": 0.90530058177117, + "train_speed(iter/s)": 0.13833 + }, + { + "epoch": 0.2887882219705549, + "grad_norm": 0.6643010377883911, + "learning_rate": 0.00017041212309247926, + "loss": 0.3204943180084229, + "memory(GiB)": 91.64, + "step": 3060, + "token_acc": 0.9075657894736842, + "train_speed(iter/s)": 0.138332 + }, + { + "epoch": 0.2892600981502454, + "grad_norm": 1.0790079832077026, + "learning_rate": 0.00017030122966565808, + "loss": 0.3205895900726318, + "memory(GiB)": 91.64, + "step": 3065, + "token_acc": 0.884206529992407, + "train_speed(iter/s)": 0.138335 + }, + { + "epoch": 0.2897319743299358, + "grad_norm": 0.6009517908096313, + "learning_rate": 0.00017019016504475967, + "loss": 0.32358787059783933, + "memory(GiB)": 91.64, + "step": 3070, + "token_acc": 0.8528336380255942, + "train_speed(iter/s)": 0.138335 + }, + { + "epoch": 0.29020385050962627, + "grad_norm": 0.23621824383735657, + "learning_rate": 0.00017007892950024315, + "loss": 0.32037298679351806, + "memory(GiB)": 91.64, + "step": 3075, + "token_acc": 0.8835883588358836, + "train_speed(iter/s)": 0.138333 + }, + { + "epoch": 0.29067572668931674, + "grad_norm": 0.26536548137664795, + "learning_rate": 0.00016996752330298383, + "loss": 0.31683764457702634, + "memory(GiB)": 91.64, + "step": 3080, + "token_acc": 0.8833831695856471, + "train_speed(iter/s)": 0.138335 + }, + { + "epoch": 0.29114760286900715, + "grad_norm": 0.6876998543739319, + "learning_rate": 0.0001698559467242725, + "loss": 0.33230607509613036, + "memory(GiB)": 91.64, + "step": 3085, + "token_acc": 0.8910573842120594, + "train_speed(iter/s)": 0.138336 + }, + { + "epoch": 0.2916194790486976, + "grad_norm": 0.7065960168838501, + "learning_rate": 0.000169744200035815, + "loss": 0.3083855628967285, + "memory(GiB)": 91.64, + "step": 3090, + "token_acc": 0.9105636179547756, + "train_speed(iter/s)": 0.138336 + }, + { + "epoch": 0.2920913552283881, + "grad_norm": 0.3593423068523407, + "learning_rate": 0.0001696322835097313, + "loss": 0.30929980278015134, + "memory(GiB)": 91.64, + "step": 3095, + "token_acc": 0.8987049028677151, + "train_speed(iter/s)": 0.138338 + }, + { + "epoch": 0.2925632314080785, + "grad_norm": 0.3623899519443512, + "learning_rate": 0.00016952019741855502, + "loss": 0.321060037612915, + "memory(GiB)": 91.64, + "step": 3100, + "token_acc": 0.8926689027311931, + "train_speed(iter/s)": 0.13834 + }, + { + "epoch": 0.29303510758776896, + "grad_norm": 0.2958219051361084, + "learning_rate": 0.0001694079420352326, + "loss": 0.3155770778656006, + "memory(GiB)": 91.64, + "step": 3105, + "token_acc": 0.8705526116578349, + "train_speed(iter/s)": 0.138342 + }, + { + "epoch": 0.29350698376745943, + "grad_norm": 0.26606494188308716, + "learning_rate": 0.00016929551763312283, + "loss": 0.3177908420562744, + "memory(GiB)": 91.64, + "step": 3110, + "token_acc": 0.9123818307585889, + "train_speed(iter/s)": 0.138341 + }, + { + "epoch": 0.2939788599471499, + "grad_norm": 0.2279764711856842, + "learning_rate": 0.00016918292448599612, + "loss": 0.3189659118652344, + "memory(GiB)": 91.64, + "step": 3115, + "token_acc": 0.902122641509434, + "train_speed(iter/s)": 0.138342 + }, + { + "epoch": 0.2944507361268403, + "grad_norm": 0.3087345063686371, + "learning_rate": 0.00016907016286803363, + "loss": 0.3192793369293213, + "memory(GiB)": 91.64, + "step": 3120, + "token_acc": 0.8875233769703447, + "train_speed(iter/s)": 0.13834 + }, + { + "epoch": 0.2949226123065308, + "grad_norm": 0.36169660091400146, + "learning_rate": 0.00016895723305382693, + "loss": 0.3232297897338867, + "memory(GiB)": 91.64, + "step": 3125, + "token_acc": 0.9030837004405287, + "train_speed(iter/s)": 0.138341 + }, + { + "epoch": 0.29539448848622124, + "grad_norm": 0.4390930235385895, + "learning_rate": 0.0001688441353183771, + "loss": 0.31847243309020995, + "memory(GiB)": 91.64, + "step": 3130, + "token_acc": 0.8952380952380953, + "train_speed(iter/s)": 0.138342 + }, + { + "epoch": 0.29586636466591165, + "grad_norm": 0.3723819851875305, + "learning_rate": 0.0001687308699370942, + "loss": 0.3224406480789185, + "memory(GiB)": 91.64, + "step": 3135, + "token_acc": 0.9039268955297605, + "train_speed(iter/s)": 0.138343 + }, + { + "epoch": 0.2963382408456021, + "grad_norm": 0.36210358142852783, + "learning_rate": 0.00016861743718579638, + "loss": 0.3119763612747192, + "memory(GiB)": 91.64, + "step": 3140, + "token_acc": 0.8790560471976401, + "train_speed(iter/s)": 0.138345 + }, + { + "epoch": 0.2968101170252926, + "grad_norm": 0.3111349046230316, + "learning_rate": 0.00016850383734070957, + "loss": 0.3166660785675049, + "memory(GiB)": 91.64, + "step": 3145, + "token_acc": 0.8888489208633094, + "train_speed(iter/s)": 0.138349 + }, + { + "epoch": 0.297281993204983, + "grad_norm": 0.41482797265052795, + "learning_rate": 0.00016839007067846645, + "loss": 0.31719019412994387, + "memory(GiB)": 91.64, + "step": 3150, + "token_acc": 0.8931875525651808, + "train_speed(iter/s)": 0.138351 + }, + { + "epoch": 0.29775386938467346, + "grad_norm": 0.8108663558959961, + "learning_rate": 0.00016827613747610597, + "loss": 0.31843905448913573, + "memory(GiB)": 91.64, + "step": 3155, + "token_acc": 0.8946395563770795, + "train_speed(iter/s)": 0.138352 + }, + { + "epoch": 0.29822574556436393, + "grad_norm": 0.3246685564517975, + "learning_rate": 0.0001681620380110726, + "loss": 0.3225980758666992, + "memory(GiB)": 91.64, + "step": 3160, + "token_acc": 0.904508541024923, + "train_speed(iter/s)": 0.138354 + }, + { + "epoch": 0.29869762174405434, + "grad_norm": 0.26903632283210754, + "learning_rate": 0.00016804777256121576, + "loss": 0.31510913372039795, + "memory(GiB)": 91.64, + "step": 3165, + "token_acc": 0.8867300537412154, + "train_speed(iter/s)": 0.138354 + }, + { + "epoch": 0.2991694979237448, + "grad_norm": 0.5990667343139648, + "learning_rate": 0.000167933341404789, + "loss": 0.3187410831451416, + "memory(GiB)": 91.64, + "step": 3170, + "token_acc": 0.8661604176554343, + "train_speed(iter/s)": 0.138357 + }, + { + "epoch": 0.2996413741034353, + "grad_norm": 0.39676856994628906, + "learning_rate": 0.00016781874482044943, + "loss": 0.3103852510452271, + "memory(GiB)": 91.64, + "step": 3175, + "token_acc": 0.9174556213017752, + "train_speed(iter/s)": 0.138358 + }, + { + "epoch": 0.3001132502831257, + "grad_norm": 0.30802735686302185, + "learning_rate": 0.00016770398308725698, + "loss": 0.3097995281219482, + "memory(GiB)": 91.64, + "step": 3180, + "token_acc": 0.8779197884530631, + "train_speed(iter/s)": 0.138361 + }, + { + "epoch": 0.30058512646281615, + "grad_norm": 0.3544689416885376, + "learning_rate": 0.00016758905648467373, + "loss": 0.31096925735473635, + "memory(GiB)": 91.64, + "step": 3185, + "token_acc": 0.8993464052287582, + "train_speed(iter/s)": 0.138365 + }, + { + "epoch": 0.3010570026425066, + "grad_norm": 0.3460041582584381, + "learning_rate": 0.00016747396529256326, + "loss": 0.3142538070678711, + "memory(GiB)": 91.64, + "step": 3190, + "token_acc": 0.8960473078120137, + "train_speed(iter/s)": 0.138364 + }, + { + "epoch": 0.30152887882219703, + "grad_norm": 0.2710016369819641, + "learning_rate": 0.00016735870979118995, + "loss": 0.32133386135101316, + "memory(GiB)": 91.64, + "step": 3195, + "token_acc": 0.8818143986683312, + "train_speed(iter/s)": 0.138366 + }, + { + "epoch": 0.3020007550018875, + "grad_norm": 0.2878091335296631, + "learning_rate": 0.0001672432902612183, + "loss": 0.3245548248291016, + "memory(GiB)": 91.64, + "step": 3200, + "token_acc": 0.8998724489795918, + "train_speed(iter/s)": 0.138364 + }, + { + "epoch": 0.30247263118157797, + "grad_norm": 0.38925909996032715, + "learning_rate": 0.0001671277069837122, + "loss": 0.3146018981933594, + "memory(GiB)": 91.64, + "step": 3205, + "token_acc": 0.8907684871918802, + "train_speed(iter/s)": 0.138366 + }, + { + "epoch": 0.3029445073612684, + "grad_norm": 0.31076857447624207, + "learning_rate": 0.0001670119602401344, + "loss": 0.3132633209228516, + "memory(GiB)": 91.64, + "step": 3210, + "token_acc": 0.8922895821071218, + "train_speed(iter/s)": 0.13837 + }, + { + "epoch": 0.30341638354095884, + "grad_norm": 0.40293845534324646, + "learning_rate": 0.00016689605031234566, + "loss": 0.31130781173706057, + "memory(GiB)": 91.64, + "step": 3215, + "token_acc": 0.9012658227848102, + "train_speed(iter/s)": 0.138374 + }, + { + "epoch": 0.3038882597206493, + "grad_norm": 0.45072442293167114, + "learning_rate": 0.000166779977482604, + "loss": 0.31590077877044676, + "memory(GiB)": 91.64, + "step": 3220, + "token_acc": 0.9161572052401746, + "train_speed(iter/s)": 0.138376 + }, + { + "epoch": 0.3043601359003397, + "grad_norm": 0.2620558440685272, + "learning_rate": 0.00016666374203356431, + "loss": 0.32024450302124025, + "memory(GiB)": 91.64, + "step": 3225, + "token_acc": 0.8846487424111015, + "train_speed(iter/s)": 0.138379 + }, + { + "epoch": 0.3048320120800302, + "grad_norm": 0.6706470251083374, + "learning_rate": 0.00016654734424827742, + "loss": 0.3171579122543335, + "memory(GiB)": 91.64, + "step": 3230, + "token_acc": 0.9007765314926661, + "train_speed(iter/s)": 0.138382 + }, + { + "epoch": 0.30530388825972066, + "grad_norm": 0.500370442867279, + "learning_rate": 0.00016643078441018938, + "loss": 0.31276688575744627, + "memory(GiB)": 91.64, + "step": 3235, + "token_acc": 0.8980530973451327, + "train_speed(iter/s)": 0.138383 + }, + { + "epoch": 0.3057757644394111, + "grad_norm": 0.6752060055732727, + "learning_rate": 0.000166314062803141, + "loss": 0.31355462074279783, + "memory(GiB)": 91.64, + "step": 3240, + "token_acc": 0.9082494969818914, + "train_speed(iter/s)": 0.138384 + }, + { + "epoch": 0.30624764061910154, + "grad_norm": 0.7671592831611633, + "learning_rate": 0.00016619717971136697, + "loss": 0.3132540941238403, + "memory(GiB)": 91.64, + "step": 3245, + "token_acc": 0.8937125748502994, + "train_speed(iter/s)": 0.138384 + }, + { + "epoch": 0.306719516798792, + "grad_norm": 0.46400538086891174, + "learning_rate": 0.00016608013541949518, + "loss": 0.31029553413391114, + "memory(GiB)": 91.64, + "step": 3250, + "token_acc": 0.8741035856573706, + "train_speed(iter/s)": 0.138384 + }, + { + "epoch": 0.30719139297848247, + "grad_norm": 0.2673157751560211, + "learning_rate": 0.00016596293021254612, + "loss": 0.3108152151107788, + "memory(GiB)": 91.64, + "step": 3255, + "token_acc": 0.8941227312013829, + "train_speed(iter/s)": 0.138388 + }, + { + "epoch": 0.3076632691581729, + "grad_norm": 0.32742491364479065, + "learning_rate": 0.00016584556437593213, + "loss": 0.3182346343994141, + "memory(GiB)": 91.64, + "step": 3260, + "token_acc": 0.9013292433537833, + "train_speed(iter/s)": 0.138392 + }, + { + "epoch": 0.30813514533786335, + "grad_norm": 0.47153425216674805, + "learning_rate": 0.00016572803819545664, + "loss": 0.31491539478302, + "memory(GiB)": 91.64, + "step": 3265, + "token_acc": 0.895458440445587, + "train_speed(iter/s)": 0.13839 + }, + { + "epoch": 0.3086070215175538, + "grad_norm": 0.5636700391769409, + "learning_rate": 0.00016561035195731364, + "loss": 0.31096749305725097, + "memory(GiB)": 91.64, + "step": 3270, + "token_acc": 0.8844444444444445, + "train_speed(iter/s)": 0.138391 + }, + { + "epoch": 0.3090788976972442, + "grad_norm": 0.24482609331607819, + "learning_rate": 0.00016549250594808683, + "loss": 0.31171326637268065, + "memory(GiB)": 91.64, + "step": 3275, + "token_acc": 0.899330811754437, + "train_speed(iter/s)": 0.138394 + }, + { + "epoch": 0.3095507738769347, + "grad_norm": 0.32658451795578003, + "learning_rate": 0.00016537450045474894, + "loss": 0.3069904804229736, + "memory(GiB)": 91.64, + "step": 3280, + "token_acc": 0.9060240963855422, + "train_speed(iter/s)": 0.138394 + }, + { + "epoch": 0.31002265005662516, + "grad_norm": 0.3008076846599579, + "learning_rate": 0.00016525633576466116, + "loss": 0.3172896862030029, + "memory(GiB)": 91.64, + "step": 3285, + "token_acc": 0.8990936555891239, + "train_speed(iter/s)": 0.138396 + }, + { + "epoch": 0.31049452623631557, + "grad_norm": 0.3459542393684387, + "learning_rate": 0.00016513801216557226, + "loss": 0.3143473148345947, + "memory(GiB)": 91.64, + "step": 3290, + "token_acc": 0.903512157468159, + "train_speed(iter/s)": 0.138398 + }, + { + "epoch": 0.31096640241600604, + "grad_norm": 0.2709652781486511, + "learning_rate": 0.00016501952994561804, + "loss": 0.3146618366241455, + "memory(GiB)": 91.64, + "step": 3295, + "token_acc": 0.8988359201773836, + "train_speed(iter/s)": 0.138401 + }, + { + "epoch": 0.3114382785956965, + "grad_norm": 0.31961753964424133, + "learning_rate": 0.00016490088939332054, + "loss": 0.3087768077850342, + "memory(GiB)": 91.64, + "step": 3300, + "token_acc": 0.8871085214857976, + "train_speed(iter/s)": 0.138401 + }, + { + "epoch": 0.3119101547753869, + "grad_norm": 0.32156920433044434, + "learning_rate": 0.0001647820907975874, + "loss": 0.3088067531585693, + "memory(GiB)": 91.64, + "step": 3305, + "token_acc": 0.8805100182149362, + "train_speed(iter/s)": 0.138401 + }, + { + "epoch": 0.3123820309550774, + "grad_norm": 0.37048351764678955, + "learning_rate": 0.000164663134447711, + "loss": 0.3027732133865356, + "memory(GiB)": 91.64, + "step": 3310, + "token_acc": 0.9139474444123592, + "train_speed(iter/s)": 0.138404 + }, + { + "epoch": 0.31285390713476785, + "grad_norm": 1.188414216041565, + "learning_rate": 0.00016454402063336804, + "loss": 0.30920934677124023, + "memory(GiB)": 91.64, + "step": 3315, + "token_acc": 0.8970201577563541, + "train_speed(iter/s)": 0.138405 + }, + { + "epoch": 0.31332578331445826, + "grad_norm": 0.27507731318473816, + "learning_rate": 0.00016442474964461853, + "loss": 0.31197390556335447, + "memory(GiB)": 91.64, + "step": 3320, + "token_acc": 0.8936553713049747, + "train_speed(iter/s)": 0.138404 + }, + { + "epoch": 0.31379765949414873, + "grad_norm": 0.21555930376052856, + "learning_rate": 0.0001643053217719053, + "loss": 0.31116595268249514, + "memory(GiB)": 91.64, + "step": 3325, + "token_acc": 0.898416166029492, + "train_speed(iter/s)": 0.138404 + }, + { + "epoch": 0.3142695356738392, + "grad_norm": 0.44676095247268677, + "learning_rate": 0.00016418573730605322, + "loss": 0.31005539894104006, + "memory(GiB)": 91.64, + "step": 3330, + "token_acc": 0.8707945597709377, + "train_speed(iter/s)": 0.138407 + }, + { + "epoch": 0.3147414118535296, + "grad_norm": 0.33203697204589844, + "learning_rate": 0.00016406599653826843, + "loss": 0.3174571990966797, + "memory(GiB)": 91.64, + "step": 3335, + "token_acc": 0.8960258780036968, + "train_speed(iter/s)": 0.138408 + }, + { + "epoch": 0.3152132880332201, + "grad_norm": 0.4387269914150238, + "learning_rate": 0.00016394609976013778, + "loss": 0.3156848430633545, + "memory(GiB)": 91.64, + "step": 3340, + "token_acc": 0.8951187335092349, + "train_speed(iter/s)": 0.138406 + }, + { + "epoch": 0.31568516421291054, + "grad_norm": 0.2519628703594208, + "learning_rate": 0.00016382604726362793, + "loss": 0.3058452606201172, + "memory(GiB)": 91.64, + "step": 3345, + "token_acc": 0.8932913102206214, + "train_speed(iter/s)": 0.138403 + }, + { + "epoch": 0.316157040392601, + "grad_norm": 0.4981694221496582, + "learning_rate": 0.00016370583934108477, + "loss": 0.31234307289123536, + "memory(GiB)": 91.64, + "step": 3350, + "token_acc": 0.9024309024309024, + "train_speed(iter/s)": 0.138404 + }, + { + "epoch": 0.3166289165722914, + "grad_norm": 0.24707676470279694, + "learning_rate": 0.00016358547628523272, + "loss": 0.3052816867828369, + "memory(GiB)": 91.64, + "step": 3355, + "token_acc": 0.9038121686442817, + "train_speed(iter/s)": 0.138405 + }, + { + "epoch": 0.3171007927519819, + "grad_norm": 0.3029780685901642, + "learning_rate": 0.00016346495838917395, + "loss": 0.30600829124450685, + "memory(GiB)": 91.64, + "step": 3360, + "token_acc": 0.898513251454428, + "train_speed(iter/s)": 0.138405 + }, + { + "epoch": 0.31757266893167235, + "grad_norm": 0.6097707152366638, + "learning_rate": 0.0001633442859463876, + "loss": 0.30912506580352783, + "memory(GiB)": 91.64, + "step": 3365, + "token_acc": 0.9050100200400801, + "train_speed(iter/s)": 0.138405 + }, + { + "epoch": 0.31804454511136276, + "grad_norm": 0.39238670468330383, + "learning_rate": 0.00016322345925072934, + "loss": 0.3076608180999756, + "memory(GiB)": 91.64, + "step": 3370, + "token_acc": 0.8900841908325537, + "train_speed(iter/s)": 0.138407 + }, + { + "epoch": 0.31851642129105323, + "grad_norm": 0.764627993106842, + "learning_rate": 0.00016310247859643032, + "loss": 0.3063749551773071, + "memory(GiB)": 91.64, + "step": 3375, + "token_acc": 0.8825581395348837, + "train_speed(iter/s)": 0.138408 + }, + { + "epoch": 0.3189882974707437, + "grad_norm": 0.3981649875640869, + "learning_rate": 0.00016298134427809662, + "loss": 0.31308808326721194, + "memory(GiB)": 91.64, + "step": 3380, + "token_acc": 0.8966457594764112, + "train_speed(iter/s)": 0.13841 + }, + { + "epoch": 0.3194601736504341, + "grad_norm": 0.44580742716789246, + "learning_rate": 0.00016286005659070857, + "loss": 0.31591091156005857, + "memory(GiB)": 91.64, + "step": 3385, + "token_acc": 0.8994082840236687, + "train_speed(iter/s)": 0.138411 + }, + { + "epoch": 0.3199320498301246, + "grad_norm": 0.4073268175125122, + "learning_rate": 0.00016273861582961994, + "loss": 0.3123687982559204, + "memory(GiB)": 91.64, + "step": 3390, + "token_acc": 0.888494528246081, + "train_speed(iter/s)": 0.13841 + }, + { + "epoch": 0.32040392600981504, + "grad_norm": 0.3851412534713745, + "learning_rate": 0.00016261702229055725, + "loss": 0.31351797580718993, + "memory(GiB)": 91.64, + "step": 3395, + "token_acc": 0.887447539107211, + "train_speed(iter/s)": 0.138414 + }, + { + "epoch": 0.32087580218950545, + "grad_norm": 0.5207167267799377, + "learning_rate": 0.00016249527626961907, + "loss": 0.3095412731170654, + "memory(GiB)": 91.64, + "step": 3400, + "token_acc": 0.8894736842105263, + "train_speed(iter/s)": 0.138413 + }, + { + "epoch": 0.3213476783691959, + "grad_norm": 0.5070729851722717, + "learning_rate": 0.00016237337806327532, + "loss": 0.3108300447463989, + "memory(GiB)": 91.64, + "step": 3405, + "token_acc": 0.902782433791485, + "train_speed(iter/s)": 0.138413 + }, + { + "epoch": 0.3218195545488864, + "grad_norm": 0.46263793110847473, + "learning_rate": 0.0001622513279683665, + "loss": 0.3083954811096191, + "memory(GiB)": 91.64, + "step": 3410, + "token_acc": 0.8961522548613984, + "train_speed(iter/s)": 0.138414 + }, + { + "epoch": 0.3222914307285768, + "grad_norm": 0.5839296579360962, + "learning_rate": 0.0001621291262821029, + "loss": 0.3138519287109375, + "memory(GiB)": 91.64, + "step": 3415, + "token_acc": 0.8892988929889298, + "train_speed(iter/s)": 0.138412 + }, + { + "epoch": 0.32276330690826727, + "grad_norm": 0.2847861647605896, + "learning_rate": 0.00016200677330206403, + "loss": 0.30753250122070314, + "memory(GiB)": 91.64, + "step": 3420, + "token_acc": 0.9061032863849765, + "train_speed(iter/s)": 0.13841 + }, + { + "epoch": 0.32323518308795773, + "grad_norm": 0.339787095785141, + "learning_rate": 0.00016188426932619784, + "loss": 0.31532652378082277, + "memory(GiB)": 91.64, + "step": 3425, + "token_acc": 0.892657793044225, + "train_speed(iter/s)": 0.138412 + }, + { + "epoch": 0.32370705926764815, + "grad_norm": 0.48385143280029297, + "learning_rate": 0.00016176161465281997, + "loss": 0.31803653240203855, + "memory(GiB)": 91.64, + "step": 3430, + "token_acc": 0.8876627051499717, + "train_speed(iter/s)": 0.138413 + }, + { + "epoch": 0.3241789354473386, + "grad_norm": 0.8372563719749451, + "learning_rate": 0.000161638809580613, + "loss": 0.30901002883911133, + "memory(GiB)": 91.64, + "step": 3435, + "token_acc": 0.8927664974619289, + "train_speed(iter/s)": 0.138414 + }, + { + "epoch": 0.3246508116270291, + "grad_norm": 0.29702985286712646, + "learning_rate": 0.00016151585440862573, + "loss": 0.3036106824874878, + "memory(GiB)": 91.64, + "step": 3440, + "token_acc": 0.9132579054250073, + "train_speed(iter/s)": 0.138415 + }, + { + "epoch": 0.3251226878067195, + "grad_norm": 0.42178598046302795, + "learning_rate": 0.0001613927494362726, + "loss": 0.3027750015258789, + "memory(GiB)": 91.64, + "step": 3445, + "token_acc": 0.8794378698224852, + "train_speed(iter/s)": 0.138414 + }, + { + "epoch": 0.32559456398640996, + "grad_norm": 0.6362924575805664, + "learning_rate": 0.00016126949496333263, + "loss": 0.3112166881561279, + "memory(GiB)": 91.64, + "step": 3450, + "token_acc": 0.9103899249212885, + "train_speed(iter/s)": 0.138416 + }, + { + "epoch": 0.3260664401661004, + "grad_norm": 0.5326300263404846, + "learning_rate": 0.00016114609128994908, + "loss": 0.31546103954315186, + "memory(GiB)": 91.64, + "step": 3455, + "token_acc": 0.9140065146579804, + "train_speed(iter/s)": 0.138418 + }, + { + "epoch": 0.3265383163457909, + "grad_norm": 0.34611740708351135, + "learning_rate": 0.00016102253871662852, + "loss": 0.31009516716003416, + "memory(GiB)": 91.64, + "step": 3460, + "token_acc": 0.9055606617647058, + "train_speed(iter/s)": 0.138418 + }, + { + "epoch": 0.3270101925254813, + "grad_norm": 0.27188488841056824, + "learning_rate": 0.00016089883754423997, + "loss": 0.31755566596984863, + "memory(GiB)": 91.64, + "step": 3465, + "token_acc": 0.8814242526032919, + "train_speed(iter/s)": 0.13842 + }, + { + "epoch": 0.32748206870517177, + "grad_norm": 0.24603348970413208, + "learning_rate": 0.00016077498807401448, + "loss": 0.3162256717681885, + "memory(GiB)": 91.64, + "step": 3470, + "token_acc": 0.8958736299161831, + "train_speed(iter/s)": 0.138421 + }, + { + "epoch": 0.32795394488486224, + "grad_norm": 0.3293556272983551, + "learning_rate": 0.0001606509906075441, + "loss": 0.3149267196655273, + "memory(GiB)": 91.64, + "step": 3475, + "token_acc": 0.904655075715087, + "train_speed(iter/s)": 0.138423 + }, + { + "epoch": 0.32842582106455265, + "grad_norm": 0.2843257784843445, + "learning_rate": 0.00016052684544678138, + "loss": 0.3054977893829346, + "memory(GiB)": 91.64, + "step": 3480, + "token_acc": 0.8907051282051283, + "train_speed(iter/s)": 0.138427 + }, + { + "epoch": 0.3288976972442431, + "grad_norm": 0.47237107157707214, + "learning_rate": 0.00016040255289403844, + "loss": 0.30878467559814454, + "memory(GiB)": 91.64, + "step": 3485, + "token_acc": 0.9008951406649617, + "train_speed(iter/s)": 0.138428 + }, + { + "epoch": 0.3293695734239336, + "grad_norm": 0.4679430425167084, + "learning_rate": 0.00016027811325198637, + "loss": 0.31091535091400146, + "memory(GiB)": 91.64, + "step": 3490, + "token_acc": 0.9059921062073915, + "train_speed(iter/s)": 0.138427 + }, + { + "epoch": 0.329841449603624, + "grad_norm": 0.9541602730751038, + "learning_rate": 0.0001601535268236544, + "loss": 0.31114275455474855, + "memory(GiB)": 91.64, + "step": 3495, + "token_acc": 0.8911082474226805, + "train_speed(iter/s)": 0.138429 + }, + { + "epoch": 0.33031332578331446, + "grad_norm": 1.2509143352508545, + "learning_rate": 0.00016002879391242928, + "loss": 0.3070805311203003, + "memory(GiB)": 91.64, + "step": 3500, + "token_acc": 0.8869294605809128, + "train_speed(iter/s)": 0.13843 + }, + { + "epoch": 0.3307852019630049, + "grad_norm": 0.5359411835670471, + "learning_rate": 0.00015990391482205443, + "loss": 0.30284867286682127, + "memory(GiB)": 91.64, + "step": 3505, + "token_acc": 0.8969344608879493, + "train_speed(iter/s)": 0.138433 + }, + { + "epoch": 0.33125707814269534, + "grad_norm": 0.2854415476322174, + "learning_rate": 0.00015977888985662918, + "loss": 0.318118691444397, + "memory(GiB)": 91.64, + "step": 3510, + "token_acc": 0.8925649235147635, + "train_speed(iter/s)": 0.138433 + }, + { + "epoch": 0.3317289543223858, + "grad_norm": 0.3062131404876709, + "learning_rate": 0.0001596537193206082, + "loss": 0.30570647716522215, + "memory(GiB)": 91.64, + "step": 3515, + "token_acc": 0.8835709436524101, + "train_speed(iter/s)": 0.138437 + }, + { + "epoch": 0.33220083050207627, + "grad_norm": 0.35242435336112976, + "learning_rate": 0.00015952840351880058, + "loss": 0.30632739067077636, + "memory(GiB)": 91.64, + "step": 3520, + "token_acc": 0.8913419913419913, + "train_speed(iter/s)": 0.138435 + }, + { + "epoch": 0.3326727066817667, + "grad_norm": 0.3083515167236328, + "learning_rate": 0.00015940294275636912, + "loss": 0.3100026845932007, + "memory(GiB)": 91.64, + "step": 3525, + "token_acc": 0.8972868217054264, + "train_speed(iter/s)": 0.138437 + }, + { + "epoch": 0.33314458286145715, + "grad_norm": 0.38937556743621826, + "learning_rate": 0.00015927733733882968, + "loss": 0.3004534006118774, + "memory(GiB)": 91.64, + "step": 3530, + "token_acc": 0.8906178489702518, + "train_speed(iter/s)": 0.138442 + }, + { + "epoch": 0.3336164590411476, + "grad_norm": 0.4373650550842285, + "learning_rate": 0.0001591515875720504, + "loss": 0.3044088363647461, + "memory(GiB)": 91.64, + "step": 3535, + "token_acc": 0.8975325565455792, + "train_speed(iter/s)": 0.138446 + }, + { + "epoch": 0.33408833522083803, + "grad_norm": 0.3125515282154083, + "learning_rate": 0.00015902569376225083, + "loss": 0.30311577320098876, + "memory(GiB)": 91.64, + "step": 3540, + "token_acc": 0.9021792966815255, + "train_speed(iter/s)": 0.138448 + }, + { + "epoch": 0.3345602114005285, + "grad_norm": 0.6560575366020203, + "learning_rate": 0.00015889965621600138, + "loss": 0.309435510635376, + "memory(GiB)": 91.64, + "step": 3545, + "token_acc": 0.9111111111111111, + "train_speed(iter/s)": 0.13845 + }, + { + "epoch": 0.33503208758021896, + "grad_norm": 0.2778153121471405, + "learning_rate": 0.00015877347524022247, + "loss": 0.30498385429382324, + "memory(GiB)": 91.64, + "step": 3550, + "token_acc": 0.8926521239954076, + "train_speed(iter/s)": 0.138452 + }, + { + "epoch": 0.3355039637599094, + "grad_norm": 0.513312041759491, + "learning_rate": 0.00015864715114218372, + "loss": 0.30378971099853513, + "memory(GiB)": 91.64, + "step": 3555, + "token_acc": 0.9048288795124239, + "train_speed(iter/s)": 0.138454 + }, + { + "epoch": 0.33597583993959984, + "grad_norm": 0.9006767272949219, + "learning_rate": 0.00015852068422950337, + "loss": 0.30725460052490233, + "memory(GiB)": 91.64, + "step": 3560, + "token_acc": 0.8941216913028532, + "train_speed(iter/s)": 0.138452 + }, + { + "epoch": 0.3364477161192903, + "grad_norm": 0.65769362449646, + "learning_rate": 0.00015839407481014738, + "loss": 0.3100194692611694, + "memory(GiB)": 91.64, + "step": 3565, + "token_acc": 0.892912571132954, + "train_speed(iter/s)": 0.138455 + }, + { + "epoch": 0.3369195922989808, + "grad_norm": 0.7264442443847656, + "learning_rate": 0.0001582673231924287, + "loss": 0.30505690574645994, + "memory(GiB)": 91.64, + "step": 3570, + "token_acc": 0.8920722135007849, + "train_speed(iter/s)": 0.138456 + }, + { + "epoch": 0.3373914684786712, + "grad_norm": 0.3359721004962921, + "learning_rate": 0.0001581404296850067, + "loss": 0.3063464403152466, + "memory(GiB)": 91.64, + "step": 3575, + "token_acc": 0.9028258362168397, + "train_speed(iter/s)": 0.138459 + }, + { + "epoch": 0.33786334465836165, + "grad_norm": 0.49838733673095703, + "learning_rate": 0.0001580133945968861, + "loss": 0.3045190334320068, + "memory(GiB)": 91.64, + "step": 3580, + "token_acc": 0.8754628071356446, + "train_speed(iter/s)": 0.138459 + }, + { + "epoch": 0.3383352208380521, + "grad_norm": 0.31267309188842773, + "learning_rate": 0.00015788621823741646, + "loss": 0.304337739944458, + "memory(GiB)": 91.64, + "step": 3585, + "token_acc": 0.889163322012967, + "train_speed(iter/s)": 0.138462 + }, + { + "epoch": 0.33880709701774253, + "grad_norm": 0.41228607296943665, + "learning_rate": 0.0001577589009162914, + "loss": 0.3088655948638916, + "memory(GiB)": 91.64, + "step": 3590, + "token_acc": 0.8918482647296206, + "train_speed(iter/s)": 0.138463 + }, + { + "epoch": 0.339278973197433, + "grad_norm": 0.5077301263809204, + "learning_rate": 0.0001576314429435477, + "loss": 0.3072696924209595, + "memory(GiB)": 91.64, + "step": 3595, + "token_acc": 0.8791666666666667, + "train_speed(iter/s)": 0.138468 + }, + { + "epoch": 0.33975084937712347, + "grad_norm": 0.368539035320282, + "learning_rate": 0.00015750384462956477, + "loss": 0.3104322671890259, + "memory(GiB)": 91.64, + "step": 3600, + "token_acc": 0.8836734693877552, + "train_speed(iter/s)": 0.138469 + }, + { + "epoch": 0.3402227255568139, + "grad_norm": 0.5741416215896606, + "learning_rate": 0.00015737610628506368, + "loss": 0.3075693607330322, + "memory(GiB)": 91.64, + "step": 3605, + "token_acc": 0.8923333333333333, + "train_speed(iter/s)": 0.138469 + }, + { + "epoch": 0.34069460173650434, + "grad_norm": 0.5280699133872986, + "learning_rate": 0.00015724822822110656, + "loss": 0.3020766735076904, + "memory(GiB)": 91.64, + "step": 3610, + "token_acc": 0.91324028668427, + "train_speed(iter/s)": 0.138473 + }, + { + "epoch": 0.3411664779161948, + "grad_norm": 0.65427565574646, + "learning_rate": 0.00015712021074909573, + "loss": 0.3077415466308594, + "memory(GiB)": 91.64, + "step": 3615, + "token_acc": 0.889433962264151, + "train_speed(iter/s)": 0.138475 + }, + { + "epoch": 0.3416383540958852, + "grad_norm": 0.25699713826179504, + "learning_rate": 0.00015699205418077302, + "loss": 0.30338454246520996, + "memory(GiB)": 91.64, + "step": 3620, + "token_acc": 0.8946078431372549, + "train_speed(iter/s)": 0.138476 + }, + { + "epoch": 0.3421102302755757, + "grad_norm": 0.382522314786911, + "learning_rate": 0.00015686375882821885, + "loss": 0.30158405303955077, + "memory(GiB)": 91.64, + "step": 3625, + "token_acc": 0.9050859598853869, + "train_speed(iter/s)": 0.138474 + }, + { + "epoch": 0.34258210645526616, + "grad_norm": 0.5537758469581604, + "learning_rate": 0.00015673532500385192, + "loss": 0.31232216358184817, + "memory(GiB)": 91.64, + "step": 3630, + "token_acc": 0.8968973747016706, + "train_speed(iter/s)": 0.138476 + }, + { + "epoch": 0.34305398263495657, + "grad_norm": 0.4347175061702728, + "learning_rate": 0.0001566067530204278, + "loss": 0.30861697196960447, + "memory(GiB)": 91.64, + "step": 3635, + "token_acc": 0.9140995260663507, + "train_speed(iter/s)": 0.138474 + }, + { + "epoch": 0.34352585881464703, + "grad_norm": 1.3980445861816406, + "learning_rate": 0.00015647804319103862, + "loss": 0.3118103265762329, + "memory(GiB)": 91.64, + "step": 3640, + "token_acc": 0.891973445986723, + "train_speed(iter/s)": 0.138474 + }, + { + "epoch": 0.3439977349943375, + "grad_norm": 0.7892172336578369, + "learning_rate": 0.00015634919582911225, + "loss": 0.2955352783203125, + "memory(GiB)": 91.64, + "step": 3645, + "token_acc": 0.8974358974358975, + "train_speed(iter/s)": 0.138472 + }, + { + "epoch": 0.3444696111740279, + "grad_norm": 0.25731360912323, + "learning_rate": 0.0001562202112484114, + "loss": 0.30662920475006106, + "memory(GiB)": 91.64, + "step": 3650, + "token_acc": 0.9029054799558661, + "train_speed(iter/s)": 0.138474 + }, + { + "epoch": 0.3449414873537184, + "grad_norm": 0.25891074538230896, + "learning_rate": 0.00015609108976303283, + "loss": 0.30108160972595216, + "memory(GiB)": 91.64, + "step": 3655, + "token_acc": 0.9025487256371814, + "train_speed(iter/s)": 0.138475 + }, + { + "epoch": 0.34541336353340885, + "grad_norm": 0.9629558324813843, + "learning_rate": 0.00015596183168740694, + "loss": 0.30233757495880126, + "memory(GiB)": 91.64, + "step": 3660, + "token_acc": 0.8685909608811242, + "train_speed(iter/s)": 0.138477 + }, + { + "epoch": 0.34588523971309926, + "grad_norm": 0.5921810865402222, + "learning_rate": 0.00015583243733629655, + "loss": 0.3014842510223389, + "memory(GiB)": 91.64, + "step": 3665, + "token_acc": 0.9083843617522374, + "train_speed(iter/s)": 0.138475 + }, + { + "epoch": 0.3463571158927897, + "grad_norm": 0.5052902698516846, + "learning_rate": 0.00015570290702479638, + "loss": 0.29955530166625977, + "memory(GiB)": 91.64, + "step": 3670, + "token_acc": 0.8916758544652701, + "train_speed(iter/s)": 0.138475 + }, + { + "epoch": 0.3468289920724802, + "grad_norm": 0.5260510444641113, + "learning_rate": 0.00015557324106833223, + "loss": 0.30066709518432616, + "memory(GiB)": 91.64, + "step": 3675, + "token_acc": 0.9076858813700919, + "train_speed(iter/s)": 0.138477 + }, + { + "epoch": 0.34730086825217066, + "grad_norm": 0.33802786469459534, + "learning_rate": 0.00015544343978266025, + "loss": 0.2984708547592163, + "memory(GiB)": 91.64, + "step": 3680, + "token_acc": 0.901496693351897, + "train_speed(iter/s)": 0.138479 + }, + { + "epoch": 0.34777274443186107, + "grad_norm": 0.32484322786331177, + "learning_rate": 0.00015531350348386606, + "loss": 0.3036123037338257, + "memory(GiB)": 91.64, + "step": 3685, + "token_acc": 0.9054111631870473, + "train_speed(iter/s)": 0.138482 + }, + { + "epoch": 0.34824462061155154, + "grad_norm": 0.3622114956378937, + "learning_rate": 0.00015518343248836417, + "loss": 0.29522085189819336, + "memory(GiB)": 91.64, + "step": 3690, + "token_acc": 0.9036262659261679, + "train_speed(iter/s)": 0.13848 + }, + { + "epoch": 0.348716496791242, + "grad_norm": 0.4862135946750641, + "learning_rate": 0.000155053227112897, + "loss": 0.2977942705154419, + "memory(GiB)": 91.64, + "step": 3695, + "token_acc": 0.9008559201141226, + "train_speed(iter/s)": 0.138482 + }, + { + "epoch": 0.3491883729709324, + "grad_norm": 0.5323036313056946, + "learning_rate": 0.00015492288767453424, + "loss": 0.31482396125793455, + "memory(GiB)": 91.64, + "step": 3700, + "token_acc": 0.9135959339263025, + "train_speed(iter/s)": 0.138484 + }, + { + "epoch": 0.3496602491506229, + "grad_norm": 0.45398956537246704, + "learning_rate": 0.00015479241449067207, + "loss": 0.3046382427215576, + "memory(GiB)": 91.64, + "step": 3705, + "token_acc": 0.8907425580634609, + "train_speed(iter/s)": 0.138487 + }, + { + "epoch": 0.35013212533031335, + "grad_norm": 0.24290789663791656, + "learning_rate": 0.00015466180787903228, + "loss": 0.30463085174560545, + "memory(GiB)": 91.64, + "step": 3710, + "token_acc": 0.8997484728710026, + "train_speed(iter/s)": 0.138489 + }, + { + "epoch": 0.35060400151000376, + "grad_norm": 0.6027255058288574, + "learning_rate": 0.00015453106815766169, + "loss": 0.3037456512451172, + "memory(GiB)": 91.64, + "step": 3715, + "token_acc": 0.8926788685524126, + "train_speed(iter/s)": 0.13849 + }, + { + "epoch": 0.3510758776896942, + "grad_norm": 0.3494771718978882, + "learning_rate": 0.00015440019564493112, + "loss": 0.30643885135650634, + "memory(GiB)": 91.64, + "step": 3720, + "token_acc": 0.8992887624466572, + "train_speed(iter/s)": 0.138489 + }, + { + "epoch": 0.3515477538693847, + "grad_norm": 0.45546987652778625, + "learning_rate": 0.00015426919065953496, + "loss": 0.3024001598358154, + "memory(GiB)": 91.64, + "step": 3725, + "token_acc": 0.9199255121042831, + "train_speed(iter/s)": 0.138493 + }, + { + "epoch": 0.3520196300490751, + "grad_norm": 0.2523688077926636, + "learning_rate": 0.00015413805352048997, + "loss": 0.30108323097229006, + "memory(GiB)": 91.64, + "step": 3730, + "token_acc": 0.9001024590163934, + "train_speed(iter/s)": 0.138496 + }, + { + "epoch": 0.35249150622876557, + "grad_norm": 0.4431523382663727, + "learning_rate": 0.00015400678454713487, + "loss": 0.3021445989608765, + "memory(GiB)": 91.64, + "step": 3735, + "token_acc": 0.9001941747572816, + "train_speed(iter/s)": 0.138497 + }, + { + "epoch": 0.35296338240845604, + "grad_norm": 0.6595969200134277, + "learning_rate": 0.00015387538405912937, + "loss": 0.3013103485107422, + "memory(GiB)": 91.64, + "step": 3740, + "token_acc": 0.9106609099966788, + "train_speed(iter/s)": 0.138499 + }, + { + "epoch": 0.35343525858814645, + "grad_norm": 0.2660990059375763, + "learning_rate": 0.00015374385237645343, + "loss": 0.29910807609558104, + "memory(GiB)": 91.64, + "step": 3745, + "token_acc": 0.8930566640063847, + "train_speed(iter/s)": 0.138502 + }, + { + "epoch": 0.3539071347678369, + "grad_norm": 0.30778956413269043, + "learning_rate": 0.00015361218981940647, + "loss": 0.30529217720031737, + "memory(GiB)": 91.64, + "step": 3750, + "token_acc": 0.8790162633875446, + "train_speed(iter/s)": 0.138502 + }, + { + "epoch": 0.3543790109475274, + "grad_norm": 0.3072812557220459, + "learning_rate": 0.0001534803967086067, + "loss": 0.2946580410003662, + "memory(GiB)": 91.64, + "step": 3755, + "token_acc": 0.8934729064039408, + "train_speed(iter/s)": 0.138503 + }, + { + "epoch": 0.3548508871272178, + "grad_norm": 0.3878733515739441, + "learning_rate": 0.00015334847336499015, + "loss": 0.30007166862487794, + "memory(GiB)": 91.64, + "step": 3760, + "token_acc": 0.9072721498888536, + "train_speed(iter/s)": 0.138504 + }, + { + "epoch": 0.35532276330690826, + "grad_norm": 0.2825472950935364, + "learning_rate": 0.00015321642010981, + "loss": 0.2999789953231812, + "memory(GiB)": 91.64, + "step": 3765, + "token_acc": 0.9049180327868852, + "train_speed(iter/s)": 0.138508 + }, + { + "epoch": 0.35579463948659873, + "grad_norm": 0.3111068904399872, + "learning_rate": 0.0001530842372646358, + "loss": 0.29370770454406736, + "memory(GiB)": 91.64, + "step": 3770, + "token_acc": 0.9016277423920736, + "train_speed(iter/s)": 0.138503 + }, + { + "epoch": 0.35626651566628914, + "grad_norm": 0.42809635400772095, + "learning_rate": 0.00015295192515135274, + "loss": 0.29727604389190676, + "memory(GiB)": 91.64, + "step": 3775, + "token_acc": 0.9122373300370828, + "train_speed(iter/s)": 0.138504 + }, + { + "epoch": 0.3567383918459796, + "grad_norm": 0.8906384110450745, + "learning_rate": 0.0001528194840921607, + "loss": 0.3030253887176514, + "memory(GiB)": 91.64, + "step": 3780, + "token_acc": 0.8868296529968455, + "train_speed(iter/s)": 0.138503 + }, + { + "epoch": 0.3572102680256701, + "grad_norm": 0.7155807018280029, + "learning_rate": 0.00015268691440957355, + "loss": 0.29479668140411375, + "memory(GiB)": 91.64, + "step": 3785, + "token_acc": 0.9038887132469174, + "train_speed(iter/s)": 0.138504 + }, + { + "epoch": 0.3576821442053605, + "grad_norm": 0.4430237114429474, + "learning_rate": 0.0001525542164264185, + "loss": 0.30679366588592527, + "memory(GiB)": 91.64, + "step": 3790, + "token_acc": 0.9016441573693482, + "train_speed(iter/s)": 0.138505 + }, + { + "epoch": 0.35815402038505095, + "grad_norm": 0.8023039102554321, + "learning_rate": 0.0001524213904658351, + "loss": 0.3044567108154297, + "memory(GiB)": 91.64, + "step": 3795, + "token_acc": 0.8872512896094326, + "train_speed(iter/s)": 0.138505 + }, + { + "epoch": 0.3586258965647414, + "grad_norm": 0.6200763583183289, + "learning_rate": 0.00015228843685127452, + "loss": 0.30850720405578613, + "memory(GiB)": 91.64, + "step": 3800, + "token_acc": 0.8852781880846874, + "train_speed(iter/s)": 0.138505 + }, + { + "epoch": 0.3590977727444319, + "grad_norm": 0.7332703471183777, + "learning_rate": 0.00015215535590649886, + "loss": 0.30007758140563967, + "memory(GiB)": 91.64, + "step": 3805, + "token_acc": 0.9086795557383792, + "train_speed(iter/s)": 0.138506 + }, + { + "epoch": 0.3595696489241223, + "grad_norm": 0.5468603372573853, + "learning_rate": 0.00015202214795558022, + "loss": 0.2976674556732178, + "memory(GiB)": 91.64, + "step": 3810, + "token_acc": 0.8955123911587408, + "train_speed(iter/s)": 0.138505 + }, + { + "epoch": 0.36004152510381277, + "grad_norm": 0.4006412625312805, + "learning_rate": 0.00015188881332290003, + "loss": 0.2963697910308838, + "memory(GiB)": 91.64, + "step": 3815, + "token_acc": 0.9008033531260915, + "train_speed(iter/s)": 0.138505 + }, + { + "epoch": 0.36051340128350323, + "grad_norm": 0.2284918576478958, + "learning_rate": 0.00015175535233314823, + "loss": 0.29793496131896974, + "memory(GiB)": 91.64, + "step": 3820, + "token_acc": 0.8894009216589862, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.36098527746319364, + "grad_norm": 0.8408511877059937, + "learning_rate": 0.00015162176531132235, + "loss": 0.31119661331176757, + "memory(GiB)": 91.64, + "step": 3825, + "token_acc": 0.8848153926157046, + "train_speed(iter/s)": 0.138506 + }, + { + "epoch": 0.3614571536428841, + "grad_norm": 0.23569151759147644, + "learning_rate": 0.00015148805258272696, + "loss": 0.3092831611633301, + "memory(GiB)": 91.64, + "step": 3830, + "token_acc": 0.8938014737754659, + "train_speed(iter/s)": 0.138509 + }, + { + "epoch": 0.3619290298225746, + "grad_norm": 0.3385840356349945, + "learning_rate": 0.0001513542144729726, + "loss": 0.29678735733032224, + "memory(GiB)": 91.64, + "step": 3835, + "token_acc": 0.8935003915426781, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.362400906002265, + "grad_norm": 0.35799628496170044, + "learning_rate": 0.00015122025130797536, + "loss": 0.29270572662353517, + "memory(GiB)": 91.64, + "step": 3840, + "token_acc": 0.9016266460108443, + "train_speed(iter/s)": 0.138508 + }, + { + "epoch": 0.36287278218195546, + "grad_norm": 0.34791991114616394, + "learning_rate": 0.00015108616341395558, + "loss": 0.2929104804992676, + "memory(GiB)": 91.64, + "step": 3845, + "token_acc": 0.9117132867132867, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.3633446583616459, + "grad_norm": 0.2428075075149536, + "learning_rate": 0.0001509519511174375, + "loss": 0.29956960678100586, + "memory(GiB)": 91.64, + "step": 3850, + "token_acc": 0.9037393557941503, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.36381653454133633, + "grad_norm": 0.38686317205429077, + "learning_rate": 0.00015081761474524828, + "loss": 0.29664180278778074, + "memory(GiB)": 91.64, + "step": 3855, + "token_acc": 0.9018382352941177, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.3642884107210268, + "grad_norm": 0.6946428418159485, + "learning_rate": 0.00015068315462451722, + "loss": 0.297865891456604, + "memory(GiB)": 91.64, + "step": 3860, + "token_acc": 0.9015151515151515, + "train_speed(iter/s)": 0.138509 + }, + { + "epoch": 0.36476028690071727, + "grad_norm": 0.24777008593082428, + "learning_rate": 0.00015054857108267496, + "loss": 0.2987982749938965, + "memory(GiB)": 91.64, + "step": 3865, + "token_acc": 0.8901398839986353, + "train_speed(iter/s)": 0.138511 + }, + { + "epoch": 0.3652321630804077, + "grad_norm": 0.5574231147766113, + "learning_rate": 0.00015041386444745268, + "loss": 0.3010772705078125, + "memory(GiB)": 91.64, + "step": 3870, + "token_acc": 0.8945620589456206, + "train_speed(iter/s)": 0.138511 + }, + { + "epoch": 0.36570403926009815, + "grad_norm": 0.648216962814331, + "learning_rate": 0.00015027903504688127, + "loss": 0.3043731927871704, + "memory(GiB)": 91.64, + "step": 3875, + "token_acc": 0.9052585832246849, + "train_speed(iter/s)": 0.138513 + }, + { + "epoch": 0.3661759154397886, + "grad_norm": 1.228064775466919, + "learning_rate": 0.00015014408320929062, + "loss": 0.30399317741394044, + "memory(GiB)": 91.64, + "step": 3880, + "token_acc": 0.9066859066859067, + "train_speed(iter/s)": 0.138512 + }, + { + "epoch": 0.366647791619479, + "grad_norm": 0.36719319224357605, + "learning_rate": 0.00015000900926330886, + "loss": 0.3041665077209473, + "memory(GiB)": 91.64, + "step": 3885, + "token_acc": 0.89, + "train_speed(iter/s)": 0.138513 + }, + { + "epoch": 0.3671196677991695, + "grad_norm": 0.5185071229934692, + "learning_rate": 0.0001498738135378613, + "loss": 0.2928229570388794, + "memory(GiB)": 91.64, + "step": 3890, + "token_acc": 0.9091915836101883, + "train_speed(iter/s)": 0.138514 + }, + { + "epoch": 0.36759154397885996, + "grad_norm": 0.3124889135360718, + "learning_rate": 0.00014973849636216993, + "loss": 0.3014270067214966, + "memory(GiB)": 91.64, + "step": 3895, + "token_acc": 0.908899420747762, + "train_speed(iter/s)": 0.138515 + }, + { + "epoch": 0.36806342015855037, + "grad_norm": 0.2866859436035156, + "learning_rate": 0.0001496030580657524, + "loss": 0.2979059934616089, + "memory(GiB)": 91.64, + "step": 3900, + "token_acc": 0.9056129572745043, + "train_speed(iter/s)": 0.138516 + }, + { + "epoch": 0.36853529633824084, + "grad_norm": 0.725142776966095, + "learning_rate": 0.00014946749897842135, + "loss": 0.30335102081298826, + "memory(GiB)": 91.64, + "step": 3905, + "token_acc": 0.9037496309418365, + "train_speed(iter/s)": 0.138514 + }, + { + "epoch": 0.3690071725179313, + "grad_norm": 0.7367982268333435, + "learning_rate": 0.0001493318194302836, + "loss": 0.3035327911376953, + "memory(GiB)": 91.64, + "step": 3910, + "token_acc": 0.9125295508274232, + "train_speed(iter/s)": 0.138515 + }, + { + "epoch": 0.36947904869762177, + "grad_norm": 0.31670787930488586, + "learning_rate": 0.00014919601975173924, + "loss": 0.29859652519226076, + "memory(GiB)": 91.64, + "step": 3915, + "token_acc": 0.9066422594142259, + "train_speed(iter/s)": 0.138515 + }, + { + "epoch": 0.3699509248773122, + "grad_norm": 0.6017026305198669, + "learning_rate": 0.00014906010027348096, + "loss": 0.3074479579925537, + "memory(GiB)": 91.64, + "step": 3920, + "token_acc": 0.8810875410815656, + "train_speed(iter/s)": 0.138514 + }, + { + "epoch": 0.37042280105700265, + "grad_norm": 0.5601099133491516, + "learning_rate": 0.00014892406132649316, + "loss": 0.2999934434890747, + "memory(GiB)": 91.64, + "step": 3925, + "token_acc": 0.8974358974358975, + "train_speed(iter/s)": 0.138517 + }, + { + "epoch": 0.3708946772366931, + "grad_norm": 0.5873445868492126, + "learning_rate": 0.00014878790324205108, + "loss": 0.2997703552246094, + "memory(GiB)": 91.64, + "step": 3930, + "token_acc": 0.913626209977662, + "train_speed(iter/s)": 0.138516 + }, + { + "epoch": 0.3713665534163835, + "grad_norm": 0.27552977204322815, + "learning_rate": 0.00014865162635172024, + "loss": 0.3029902935028076, + "memory(GiB)": 91.64, + "step": 3935, + "token_acc": 0.906828119744366, + "train_speed(iter/s)": 0.138515 + }, + { + "epoch": 0.371838429596074, + "grad_norm": 0.35505324602127075, + "learning_rate": 0.00014851523098735535, + "loss": 0.30098762512207033, + "memory(GiB)": 91.64, + "step": 3940, + "token_acc": 0.8870905587668594, + "train_speed(iter/s)": 0.138517 + }, + { + "epoch": 0.37231030577576446, + "grad_norm": 0.9836472868919373, + "learning_rate": 0.00014837871748109963, + "loss": 0.3018435001373291, + "memory(GiB)": 91.64, + "step": 3945, + "token_acc": 0.8946090335114133, + "train_speed(iter/s)": 0.138517 + }, + { + "epoch": 0.37278218195545487, + "grad_norm": 0.26430022716522217, + "learning_rate": 0.00014824208616538405, + "loss": 0.2975289344787598, + "memory(GiB)": 91.64, + "step": 3950, + "token_acc": 0.9091860769432086, + "train_speed(iter/s)": 0.138518 + }, + { + "epoch": 0.37325405813514534, + "grad_norm": 0.6483102440834045, + "learning_rate": 0.00014810533737292646, + "loss": 0.29604153633117675, + "memory(GiB)": 91.64, + "step": 3955, + "token_acc": 0.882892606583918, + "train_speed(iter/s)": 0.138519 + }, + { + "epoch": 0.3737259343148358, + "grad_norm": 0.4567039906978607, + "learning_rate": 0.0001479684714367307, + "loss": 0.2948923587799072, + "memory(GiB)": 91.64, + "step": 3960, + "token_acc": 0.9090909090909091, + "train_speed(iter/s)": 0.138519 + }, + { + "epoch": 0.3741978104945262, + "grad_norm": 0.232547327876091, + "learning_rate": 0.00014783148869008592, + "loss": 0.2958275079727173, + "memory(GiB)": 91.64, + "step": 3965, + "token_acc": 0.8747252747252747, + "train_speed(iter/s)": 0.13852 + }, + { + "epoch": 0.3746696866742167, + "grad_norm": 0.34699806571006775, + "learning_rate": 0.0001476943894665658, + "loss": 0.2959395408630371, + "memory(GiB)": 91.64, + "step": 3970, + "token_acc": 0.8965679360601787, + "train_speed(iter/s)": 0.138519 + }, + { + "epoch": 0.37514156285390715, + "grad_norm": 0.41914603114128113, + "learning_rate": 0.00014755717410002748, + "loss": 0.3048529624938965, + "memory(GiB)": 91.64, + "step": 3975, + "token_acc": 0.9008640291041382, + "train_speed(iter/s)": 0.138521 + }, + { + "epoch": 0.37561343903359756, + "grad_norm": 0.30545568466186523, + "learning_rate": 0.00014741984292461117, + "loss": 0.300109338760376, + "memory(GiB)": 91.64, + "step": 3980, + "token_acc": 0.9146689497716894, + "train_speed(iter/s)": 0.138523 + }, + { + "epoch": 0.37608531521328803, + "grad_norm": 0.7996503114700317, + "learning_rate": 0.00014728239627473884, + "loss": 0.3002124309539795, + "memory(GiB)": 91.64, + "step": 3985, + "token_acc": 0.9075589792970631, + "train_speed(iter/s)": 0.138524 + }, + { + "epoch": 0.3765571913929785, + "grad_norm": 0.601633608341217, + "learning_rate": 0.00014714483448511384, + "loss": 0.30334570407867434, + "memory(GiB)": 91.64, + "step": 3990, + "token_acc": 0.906820723071189, + "train_speed(iter/s)": 0.138525 + }, + { + "epoch": 0.3770290675726689, + "grad_norm": 0.23861028254032135, + "learning_rate": 0.00014700715789071978, + "loss": 0.3003624200820923, + "memory(GiB)": 91.64, + "step": 3995, + "token_acc": 0.904647983595352, + "train_speed(iter/s)": 0.138526 + }, + { + "epoch": 0.3775009437523594, + "grad_norm": 0.46522191166877747, + "learning_rate": 0.00014686936682681994, + "loss": 0.2874744415283203, + "memory(GiB)": 91.64, + "step": 4000, + "token_acc": 0.8961826614341777, + "train_speed(iter/s)": 0.138526 + }, + { + "epoch": 0.37797281993204984, + "grad_norm": 0.4399687945842743, + "learning_rate": 0.0001467314616289563, + "loss": 0.2904350280761719, + "memory(GiB)": 91.64, + "step": 4005, + "token_acc": 0.8935816428333888, + "train_speed(iter/s)": 0.138526 + }, + { + "epoch": 0.37844469611174025, + "grad_norm": 0.5468761920928955, + "learning_rate": 0.00014659344263294875, + "loss": 0.2920623779296875, + "memory(GiB)": 91.64, + "step": 4010, + "token_acc": 0.8952150211992732, + "train_speed(iter/s)": 0.138529 + }, + { + "epoch": 0.3789165722914307, + "grad_norm": 0.31320616602897644, + "learning_rate": 0.00014645531017489432, + "loss": 0.313277530670166, + "memory(GiB)": 91.64, + "step": 4015, + "token_acc": 0.885201793721973, + "train_speed(iter/s)": 0.138529 + }, + { + "epoch": 0.3793884484711212, + "grad_norm": 0.230007141828537, + "learning_rate": 0.00014631706459116637, + "loss": 0.30183398723602295, + "memory(GiB)": 91.64, + "step": 4020, + "token_acc": 0.8790294627383015, + "train_speed(iter/s)": 0.13853 + }, + { + "epoch": 0.37986032465081165, + "grad_norm": 0.24103610217571259, + "learning_rate": 0.00014617870621841375, + "loss": 0.2933482646942139, + "memory(GiB)": 91.64, + "step": 4025, + "token_acc": 0.8915499322187076, + "train_speed(iter/s)": 0.138529 + }, + { + "epoch": 0.38033220083050207, + "grad_norm": 0.6546054482460022, + "learning_rate": 0.0001460402353935598, + "loss": 0.2997136116027832, + "memory(GiB)": 91.64, + "step": 4030, + "token_acc": 0.8830313014827018, + "train_speed(iter/s)": 0.13853 + }, + { + "epoch": 0.38080407701019253, + "grad_norm": 0.5257068276405334, + "learning_rate": 0.0001459016524538019, + "loss": 0.3064009428024292, + "memory(GiB)": 91.64, + "step": 4035, + "token_acc": 0.8834586466165414, + "train_speed(iter/s)": 0.13853 + }, + { + "epoch": 0.381275953189883, + "grad_norm": 0.7437881231307983, + "learning_rate": 0.0001457629577366104, + "loss": 0.30060720443725586, + "memory(GiB)": 91.64, + "step": 4040, + "token_acc": 0.9072522392372147, + "train_speed(iter/s)": 0.138529 + }, + { + "epoch": 0.3817478293695734, + "grad_norm": 0.6455546617507935, + "learning_rate": 0.0001456241515797278, + "loss": 0.298064136505127, + "memory(GiB)": 91.64, + "step": 4045, + "token_acc": 0.9005726184279021, + "train_speed(iter/s)": 0.138531 + }, + { + "epoch": 0.3822197055492639, + "grad_norm": 0.5941305756568909, + "learning_rate": 0.00014548523432116785, + "loss": 0.30365958213806155, + "memory(GiB)": 91.64, + "step": 4050, + "token_acc": 0.904631217838765, + "train_speed(iter/s)": 0.138532 + }, + { + "epoch": 0.38269158172895434, + "grad_norm": 0.7511333227157593, + "learning_rate": 0.0001453462062992152, + "loss": 0.2938352108001709, + "memory(GiB)": 91.64, + "step": 4055, + "token_acc": 0.9002293577981652, + "train_speed(iter/s)": 0.138532 + }, + { + "epoch": 0.38316345790864476, + "grad_norm": 0.2584506571292877, + "learning_rate": 0.0001452070678524239, + "loss": 0.2902865171432495, + "memory(GiB)": 91.64, + "step": 4060, + "token_acc": 0.8862512363996043, + "train_speed(iter/s)": 0.138534 + }, + { + "epoch": 0.3836353340883352, + "grad_norm": 0.38080471754074097, + "learning_rate": 0.000145067819319617, + "loss": 0.30122551918029783, + "memory(GiB)": 91.64, + "step": 4065, + "token_acc": 0.9068033550792172, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.3841072102680257, + "grad_norm": 0.3039911687374115, + "learning_rate": 0.0001449284610398857, + "loss": 0.29466953277587893, + "memory(GiB)": 91.64, + "step": 4070, + "token_acc": 0.9058471454880295, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.3845790864477161, + "grad_norm": 0.34186533093452454, + "learning_rate": 0.00014478899335258836, + "loss": 0.29807510375976565, + "memory(GiB)": 91.64, + "step": 4075, + "token_acc": 0.885883347421809, + "train_speed(iter/s)": 0.138539 + }, + { + "epoch": 0.38505096262740657, + "grad_norm": 0.6868099570274353, + "learning_rate": 0.00014464941659734977, + "loss": 0.29289746284484863, + "memory(GiB)": 91.64, + "step": 4080, + "token_acc": 0.8864503816793893, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.38552283880709703, + "grad_norm": 0.4959952235221863, + "learning_rate": 0.00014450973111406037, + "loss": 0.30514447689056395, + "memory(GiB)": 91.64, + "step": 4085, + "token_acc": 0.8926809210526315, + "train_speed(iter/s)": 0.138538 + }, + { + "epoch": 0.38599471498678745, + "grad_norm": 0.5920107364654541, + "learning_rate": 0.00014436993724287534, + "loss": 0.30403847694396974, + "memory(GiB)": 91.64, + "step": 4090, + "token_acc": 0.893070044709389, + "train_speed(iter/s)": 0.138538 + }, + { + "epoch": 0.3864665911664779, + "grad_norm": 0.2796635925769806, + "learning_rate": 0.00014423003532421376, + "loss": 0.2975569248199463, + "memory(GiB)": 91.64, + "step": 4095, + "token_acc": 0.9021779254337394, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.3869384673461684, + "grad_norm": 0.2953856289386749, + "learning_rate": 0.00014409002569875794, + "loss": 0.29867355823516845, + "memory(GiB)": 91.64, + "step": 4100, + "token_acc": 0.910139030179722, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.3874103435258588, + "grad_norm": 0.42904919385910034, + "learning_rate": 0.00014394990870745234, + "loss": 0.2972731590270996, + "memory(GiB)": 91.64, + "step": 4105, + "token_acc": 0.8958496476115897, + "train_speed(iter/s)": 0.138538 + }, + { + "epoch": 0.38788221970554926, + "grad_norm": 0.3960139751434326, + "learning_rate": 0.0001438096846915029, + "loss": 0.2955931663513184, + "memory(GiB)": 91.64, + "step": 4110, + "token_acc": 0.8847637415621986, + "train_speed(iter/s)": 0.138539 + }, + { + "epoch": 0.3883540958852397, + "grad_norm": 0.46903130412101746, + "learning_rate": 0.00014366935399237626, + "loss": 0.2966940402984619, + "memory(GiB)": 91.64, + "step": 4115, + "token_acc": 0.914792603698151, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.38882597206493014, + "grad_norm": 0.35994887351989746, + "learning_rate": 0.00014352891695179878, + "loss": 0.29332523345947265, + "memory(GiB)": 91.64, + "step": 4120, + "token_acc": 0.8934058898847631, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.3892978482446206, + "grad_norm": 0.4265725314617157, + "learning_rate": 0.00014338837391175582, + "loss": 0.29866001605987547, + "memory(GiB)": 91.64, + "step": 4125, + "token_acc": 0.8938466025080198, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.38976972442431107, + "grad_norm": 0.41101741790771484, + "learning_rate": 0.0001432477252144908, + "loss": 0.2899683952331543, + "memory(GiB)": 91.64, + "step": 4130, + "token_acc": 0.9041523571651576, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.39024160060400154, + "grad_norm": 0.4724404215812683, + "learning_rate": 0.00014310697120250448, + "loss": 0.30640535354614257, + "memory(GiB)": 91.64, + "step": 4135, + "token_acc": 0.8994540491355778, + "train_speed(iter/s)": 0.138541 + }, + { + "epoch": 0.39071347678369195, + "grad_norm": 0.21818110346794128, + "learning_rate": 0.0001429661122185541, + "loss": 0.2890320301055908, + "memory(GiB)": 91.64, + "step": 4140, + "token_acc": 0.9128375177640928, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.3911853529633824, + "grad_norm": 0.2800583243370056, + "learning_rate": 0.00014282514860565246, + "loss": 0.3026628017425537, + "memory(GiB)": 91.64, + "step": 4145, + "token_acc": 0.8859737638748738, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.3916572291430729, + "grad_norm": 0.3000319302082062, + "learning_rate": 0.00014268408070706713, + "loss": 0.2907330274581909, + "memory(GiB)": 91.64, + "step": 4150, + "token_acc": 0.8899253731343284, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.3921291053227633, + "grad_norm": 0.21889916062355042, + "learning_rate": 0.00014254290886631977, + "loss": 0.2907184362411499, + "memory(GiB)": 91.64, + "step": 4155, + "token_acc": 0.9149686520376176, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.39260098150245376, + "grad_norm": 0.6156848669052124, + "learning_rate": 0.00014240163342718506, + "loss": 0.29220128059387207, + "memory(GiB)": 91.64, + "step": 4160, + "token_acc": 0.8869187019069923, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.3930728576821442, + "grad_norm": 0.30616524815559387, + "learning_rate": 0.00014226025473368988, + "loss": 0.30183541774749756, + "memory(GiB)": 91.64, + "step": 4165, + "token_acc": 0.9006644518272425, + "train_speed(iter/s)": 0.138541 + }, + { + "epoch": 0.39354473386183464, + "grad_norm": 0.3282259404659271, + "learning_rate": 0.0001421187731301127, + "loss": 0.3005667209625244, + "memory(GiB)": 91.64, + "step": 4170, + "token_acc": 0.8975103734439834, + "train_speed(iter/s)": 0.138543 + }, + { + "epoch": 0.3940166100415251, + "grad_norm": 0.28773191571235657, + "learning_rate": 0.0001419771889609825, + "loss": 0.3000694751739502, + "memory(GiB)": 91.64, + "step": 4175, + "token_acc": 0.8820326678765881, + "train_speed(iter/s)": 0.138543 + }, + { + "epoch": 0.3944884862212156, + "grad_norm": 0.3656611144542694, + "learning_rate": 0.00014183550257107803, + "loss": 0.2959104299545288, + "memory(GiB)": 91.64, + "step": 4180, + "token_acc": 0.8863755917937928, + "train_speed(iter/s)": 0.138543 + }, + { + "epoch": 0.394960362400906, + "grad_norm": 0.45343348383903503, + "learning_rate": 0.00014169371430542698, + "loss": 0.2970226526260376, + "memory(GiB)": 91.64, + "step": 4185, + "token_acc": 0.8920325203252033, + "train_speed(iter/s)": 0.138545 + }, + { + "epoch": 0.39543223858059645, + "grad_norm": 0.2663649916648865, + "learning_rate": 0.00014155182450930516, + "loss": 0.2905903339385986, + "memory(GiB)": 91.64, + "step": 4190, + "token_acc": 0.9216404247528378, + "train_speed(iter/s)": 0.138544 + }, + { + "epoch": 0.3959041147602869, + "grad_norm": 0.3120744526386261, + "learning_rate": 0.00014140983352823558, + "loss": 0.2967799186706543, + "memory(GiB)": 91.64, + "step": 4195, + "token_acc": 0.9013350700097688, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.39637599093997733, + "grad_norm": 0.2620052397251129, + "learning_rate": 0.0001412677417079876, + "loss": 0.2900829792022705, + "memory(GiB)": 91.64, + "step": 4200, + "token_acc": 0.8982475975127191, + "train_speed(iter/s)": 0.138548 + }, + { + "epoch": 0.3968478671196678, + "grad_norm": 0.2274962216615677, + "learning_rate": 0.00014112554939457625, + "loss": 0.29586215019226075, + "memory(GiB)": 91.64, + "step": 4205, + "token_acc": 0.9071954210956664, + "train_speed(iter/s)": 0.138548 + }, + { + "epoch": 0.39731974329935826, + "grad_norm": 0.3711623549461365, + "learning_rate": 0.00014098325693426118, + "loss": 0.2935636520385742, + "memory(GiB)": 91.64, + "step": 4210, + "token_acc": 0.8952965235173824, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.3977916194790487, + "grad_norm": 0.9405967593193054, + "learning_rate": 0.00014084086467354597, + "loss": 0.2912130355834961, + "memory(GiB)": 91.64, + "step": 4215, + "token_acc": 0.9064059900166389, + "train_speed(iter/s)": 0.13855 + }, + { + "epoch": 0.39826349565873914, + "grad_norm": 0.33666670322418213, + "learning_rate": 0.00014069837295917721, + "loss": 0.29177026748657225, + "memory(GiB)": 91.64, + "step": 4220, + "token_acc": 0.9011857707509882, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.3987353718384296, + "grad_norm": 0.22122500836849213, + "learning_rate": 0.00014055578213814366, + "loss": 0.2964980125427246, + "memory(GiB)": 91.64, + "step": 4225, + "token_acc": 0.8981173864894795, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.39920724801812, + "grad_norm": 0.37318891286849976, + "learning_rate": 0.00014041309255767548, + "loss": 0.29087071418762206, + "memory(GiB)": 91.64, + "step": 4230, + "token_acc": 0.9126625211984172, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.3996791241978105, + "grad_norm": 0.24224835634231567, + "learning_rate": 0.00014027030456524318, + "loss": 0.2874420166015625, + "memory(GiB)": 91.64, + "step": 4235, + "token_acc": 0.895663514835344, + "train_speed(iter/s)": 0.138555 + }, + { + "epoch": 0.40015100037750095, + "grad_norm": 0.44189468026161194, + "learning_rate": 0.00014012741850855714, + "loss": 0.2921741962432861, + "memory(GiB)": 91.64, + "step": 4240, + "token_acc": 0.9087829033098775, + "train_speed(iter/s)": 0.138555 + }, + { + "epoch": 0.40062287655719137, + "grad_norm": 0.28330740332603455, + "learning_rate": 0.00013998443473556632, + "loss": 0.2978023052215576, + "memory(GiB)": 91.64, + "step": 4245, + "token_acc": 0.8874931731294374, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.40109475273688183, + "grad_norm": 0.2942737936973572, + "learning_rate": 0.00013984135359445778, + "loss": 0.289691686630249, + "memory(GiB)": 91.64, + "step": 4250, + "token_acc": 0.8960396039603961, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.4015666289165723, + "grad_norm": 0.9360629320144653, + "learning_rate": 0.00013969817543365562, + "loss": 0.2971461772918701, + "memory(GiB)": 91.64, + "step": 4255, + "token_acc": 0.9020383328262853, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.40203850509626277, + "grad_norm": 0.4696950316429138, + "learning_rate": 0.00013955490060182024, + "loss": 0.2846244812011719, + "memory(GiB)": 91.64, + "step": 4260, + "token_acc": 0.8936451897616946, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.4025103812759532, + "grad_norm": 0.24814550578594208, + "learning_rate": 0.0001394115294478474, + "loss": 0.2938679695129395, + "memory(GiB)": 91.64, + "step": 4265, + "token_acc": 0.9173832923832924, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.40298225745564364, + "grad_norm": 0.40624096989631653, + "learning_rate": 0.00013926806232086744, + "loss": 0.29901700019836425, + "memory(GiB)": 91.64, + "step": 4270, + "token_acc": 0.9090644973852411, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.4034541336353341, + "grad_norm": 0.4126349687576294, + "learning_rate": 0.00013912449957024443, + "loss": 0.30084829330444335, + "memory(GiB)": 91.64, + "step": 4275, + "token_acc": 0.8965719308526223, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.4039260098150245, + "grad_norm": 0.3654210865497589, + "learning_rate": 0.00013898084154557528, + "loss": 0.29240951538085935, + "memory(GiB)": 91.64, + "step": 4280, + "token_acc": 0.8849469496021221, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.404397885994715, + "grad_norm": 0.4598270356655121, + "learning_rate": 0.00013883708859668885, + "loss": 0.29373817443847655, + "memory(GiB)": 91.64, + "step": 4285, + "token_acc": 0.9065555957986237, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.40486976217440546, + "grad_norm": 0.33815762400627136, + "learning_rate": 0.0001386932410736453, + "loss": 0.29302225112915037, + "memory(GiB)": 91.64, + "step": 4290, + "token_acc": 0.8969513731418494, + "train_speed(iter/s)": 0.138561 + }, + { + "epoch": 0.40534163835409587, + "grad_norm": 0.5209315419197083, + "learning_rate": 0.00013854929932673494, + "loss": 0.2958113431930542, + "memory(GiB)": 91.64, + "step": 4295, + "token_acc": 0.8755449861276259, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.40581351453378633, + "grad_norm": 0.2827267050743103, + "learning_rate": 0.0001384052637064776, + "loss": 0.28914942741394045, + "memory(GiB)": 91.64, + "step": 4300, + "token_acc": 0.9065727699530517, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.4062853907134768, + "grad_norm": 0.6436609625816345, + "learning_rate": 0.00013826113456362176, + "loss": 0.2951486825942993, + "memory(GiB)": 91.64, + "step": 4305, + "token_acc": 0.8828323993886907, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.4067572668931672, + "grad_norm": 0.8339882493019104, + "learning_rate": 0.00013811691224914347, + "loss": 0.28337812423706055, + "memory(GiB)": 91.64, + "step": 4310, + "token_acc": 0.8929663608562691, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.4072291430728577, + "grad_norm": 0.3736589848995209, + "learning_rate": 0.0001379725971142459, + "loss": 0.28997302055358887, + "memory(GiB)": 91.64, + "step": 4315, + "token_acc": 0.8899193548387097, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.40770101925254815, + "grad_norm": 0.3151237964630127, + "learning_rate": 0.000137828189510358, + "loss": 0.2918028116226196, + "memory(GiB)": 91.64, + "step": 4320, + "token_acc": 0.8964912280701754, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.40817289543223856, + "grad_norm": 0.3310835361480713, + "learning_rate": 0.0001376836897891341, + "loss": 0.2858395576477051, + "memory(GiB)": 91.64, + "step": 4325, + "token_acc": 0.8929663608562691, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.408644771611929, + "grad_norm": 0.4066063165664673, + "learning_rate": 0.0001375390983024528, + "loss": 0.293898868560791, + "memory(GiB)": 91.64, + "step": 4330, + "token_acc": 0.9106681432262828, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.4091166477916195, + "grad_norm": 0.6316620111465454, + "learning_rate": 0.00013739441540241607, + "loss": 0.29479825496673584, + "memory(GiB)": 91.64, + "step": 4335, + "token_acc": 0.9061522419186653, + "train_speed(iter/s)": 0.138568 + }, + { + "epoch": 0.4095885239713099, + "grad_norm": 0.7944101095199585, + "learning_rate": 0.00013724964144134856, + "loss": 0.2885154962539673, + "memory(GiB)": 91.64, + "step": 4340, + "token_acc": 0.9099597585513078, + "train_speed(iter/s)": 0.138567 + }, + { + "epoch": 0.41006040015100037, + "grad_norm": 0.3424244523048401, + "learning_rate": 0.00013710477677179674, + "loss": 0.2906686544418335, + "memory(GiB)": 91.64, + "step": 4345, + "token_acc": 0.9176111595466434, + "train_speed(iter/s)": 0.138568 + }, + { + "epoch": 0.41053227633069084, + "grad_norm": 0.4654749631881714, + "learning_rate": 0.00013695982174652779, + "loss": 0.2884217262268066, + "memory(GiB)": 91.64, + "step": 4350, + "token_acc": 0.9127239320165366, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.41100415251038125, + "grad_norm": 0.22995208203792572, + "learning_rate": 0.00013681477671852903, + "loss": 0.2872136354446411, + "memory(GiB)": 91.64, + "step": 4355, + "token_acc": 0.9024103468547913, + "train_speed(iter/s)": 0.13857 + }, + { + "epoch": 0.4114760286900717, + "grad_norm": 0.4856892228126526, + "learning_rate": 0.00013666964204100702, + "loss": 0.2831977605819702, + "memory(GiB)": 91.64, + "step": 4360, + "token_acc": 0.894754539340955, + "train_speed(iter/s)": 0.13857 + }, + { + "epoch": 0.4119479048697622, + "grad_norm": 0.406127393245697, + "learning_rate": 0.00013652441806738644, + "loss": 0.2858266830444336, + "memory(GiB)": 91.64, + "step": 4365, + "token_acc": 0.9002932551319648, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.41241978104945265, + "grad_norm": 0.3872334957122803, + "learning_rate": 0.0001363791051513096, + "loss": 0.2899020195007324, + "memory(GiB)": 91.64, + "step": 4370, + "token_acc": 0.8910081743869209, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.41289165722914306, + "grad_norm": 0.3237299621105194, + "learning_rate": 0.0001362337036466353, + "loss": 0.28980767726898193, + "memory(GiB)": 91.64, + "step": 4375, + "token_acc": 0.8978611959842864, + "train_speed(iter/s)": 0.138573 + }, + { + "epoch": 0.41336353340883353, + "grad_norm": 0.5206886529922485, + "learning_rate": 0.00013608821390743812, + "loss": 0.28344340324401857, + "memory(GiB)": 91.64, + "step": 4380, + "token_acc": 0.88828089375285, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.413835409588524, + "grad_norm": 0.39191269874572754, + "learning_rate": 0.0001359426362880074, + "loss": 0.29194035530090334, + "memory(GiB)": 91.64, + "step": 4385, + "token_acc": 0.9174977334542158, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.4143072857682144, + "grad_norm": 0.41927480697631836, + "learning_rate": 0.00013579697114284665, + "loss": 0.2890349864959717, + "memory(GiB)": 91.64, + "step": 4390, + "token_acc": 0.8929208804283165, + "train_speed(iter/s)": 0.138574 + }, + { + "epoch": 0.4147791619479049, + "grad_norm": 0.28609418869018555, + "learning_rate": 0.0001356512188266724, + "loss": 0.2916177034378052, + "memory(GiB)": 91.64, + "step": 4395, + "token_acc": 0.8874501992031872, + "train_speed(iter/s)": 0.138574 + }, + { + "epoch": 0.41525103812759534, + "grad_norm": 0.20930610597133636, + "learning_rate": 0.00013550537969441343, + "loss": 0.2926508903503418, + "memory(GiB)": 91.64, + "step": 4400, + "token_acc": 0.9018830525272548, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.41572291430728575, + "grad_norm": 0.4391603171825409, + "learning_rate": 0.00013535945410121002, + "loss": 0.29059476852416993, + "memory(GiB)": 91.64, + "step": 4405, + "token_acc": 0.9014660756904194, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.4161947904869762, + "grad_norm": 0.3281775116920471, + "learning_rate": 0.000135213442402413, + "loss": 0.2879380226135254, + "memory(GiB)": 91.64, + "step": 4410, + "token_acc": 0.8988439306358381, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.4166666666666667, + "grad_norm": 0.33387717604637146, + "learning_rate": 0.00013506734495358276, + "loss": 0.29560070037841796, + "memory(GiB)": 91.64, + "step": 4415, + "token_acc": 0.9064398541919806, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.4171385428463571, + "grad_norm": 0.4845304489135742, + "learning_rate": 0.0001349211621104886, + "loss": 0.2906494140625, + "memory(GiB)": 91.64, + "step": 4420, + "token_acc": 0.8972520908004779, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.41761041902604756, + "grad_norm": 0.33037856221199036, + "learning_rate": 0.0001347748942291078, + "loss": 0.28696584701538086, + "memory(GiB)": 91.64, + "step": 4425, + "token_acc": 0.9130630630630631, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.41808229520573803, + "grad_norm": 0.2779523432254791, + "learning_rate": 0.00013462854166562463, + "loss": 0.29009506702423093, + "memory(GiB)": 91.64, + "step": 4430, + "token_acc": 0.9063520871143376, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.41855417138542844, + "grad_norm": 0.4156259000301361, + "learning_rate": 0.00013448210477642956, + "loss": 0.2897838354110718, + "memory(GiB)": 91.64, + "step": 4435, + "token_acc": 0.8880031885213232, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4190260475651189, + "grad_norm": 0.24767661094665527, + "learning_rate": 0.00013433558391811858, + "loss": 0.29002995491027833, + "memory(GiB)": 91.64, + "step": 4440, + "token_acc": 0.8957399103139013, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.4194979237448094, + "grad_norm": 0.30752333998680115, + "learning_rate": 0.00013418897944749195, + "loss": 0.29051032066345217, + "memory(GiB)": 91.64, + "step": 4445, + "token_acc": 0.8970005659309565, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.4199697999244998, + "grad_norm": 0.45796823501586914, + "learning_rate": 0.00013404229172155364, + "loss": 0.2903867244720459, + "memory(GiB)": 91.64, + "step": 4450, + "token_acc": 0.8912252325111201, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.42044167610419025, + "grad_norm": 0.39090868830680847, + "learning_rate": 0.00013389552109751036, + "loss": 0.2896425724029541, + "memory(GiB)": 91.64, + "step": 4455, + "token_acc": 0.904796511627907, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.4209135522838807, + "grad_norm": 0.5254302620887756, + "learning_rate": 0.00013374866793277066, + "loss": 0.2874057054519653, + "memory(GiB)": 91.64, + "step": 4460, + "token_acc": 0.8978494623655914, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.42138542846357113, + "grad_norm": 0.5833097100257874, + "learning_rate": 0.00013360173258494416, + "loss": 0.28268094062805177, + "memory(GiB)": 91.64, + "step": 4465, + "token_acc": 0.9087018544935807, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.4218573046432616, + "grad_norm": 0.2850300371646881, + "learning_rate": 0.00013345471541184042, + "loss": 0.28884856700897216, + "memory(GiB)": 91.64, + "step": 4470, + "token_acc": 0.9155321782178217, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.42232918082295207, + "grad_norm": 1.1341931819915771, + "learning_rate": 0.00013330761677146852, + "loss": 0.2828017473220825, + "memory(GiB)": 91.64, + "step": 4475, + "token_acc": 0.9020037570444583, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.42280105700264253, + "grad_norm": 1.6812070608139038, + "learning_rate": 0.00013316043702203575, + "loss": 0.29272284507751467, + "memory(GiB)": 91.64, + "step": 4480, + "token_acc": 0.897029702970297, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.42327293318233294, + "grad_norm": 0.2931070029735565, + "learning_rate": 0.00013301317652194693, + "loss": 0.28561973571777344, + "memory(GiB)": 91.64, + "step": 4485, + "token_acc": 0.898884239888424, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.4237448093620234, + "grad_norm": 0.40677410364151, + "learning_rate": 0.00013286583562980355, + "loss": 0.28868684768676756, + "memory(GiB)": 91.64, + "step": 4490, + "token_acc": 0.8961675579322638, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.4242166855417139, + "grad_norm": 0.5539590716362, + "learning_rate": 0.00013271841470440288, + "loss": 0.2832665920257568, + "memory(GiB)": 91.64, + "step": 4495, + "token_acc": 0.8930443900734009, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.4246885617214043, + "grad_norm": 0.2117510586977005, + "learning_rate": 0.0001325709141047371, + "loss": 0.28936703205108644, + "memory(GiB)": 91.64, + "step": 4500, + "token_acc": 0.8862788963460104, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.42516043790109476, + "grad_norm": 0.4565034508705139, + "learning_rate": 0.00013242333418999228, + "loss": 0.28919024467468263, + "memory(GiB)": 91.64, + "step": 4505, + "token_acc": 0.9185295578738202, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4256323140807852, + "grad_norm": 0.8646222949028015, + "learning_rate": 0.00013227567531954784, + "loss": 0.2934823513031006, + "memory(GiB)": 91.64, + "step": 4510, + "token_acc": 0.9073763621123219, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.42610419026047563, + "grad_norm": 0.48075807094573975, + "learning_rate": 0.00013212793785297527, + "loss": 0.29014410972595217, + "memory(GiB)": 91.64, + "step": 4515, + "token_acc": 0.8953418027828192, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4265760664401661, + "grad_norm": 0.5212975740432739, + "learning_rate": 0.00013198012215003758, + "loss": 0.28605401515960693, + "memory(GiB)": 91.64, + "step": 4520, + "token_acc": 0.8984655566438132, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.42704794261985657, + "grad_norm": 0.38360151648521423, + "learning_rate": 0.00013183222857068828, + "loss": 0.2868018388748169, + "memory(GiB)": 91.64, + "step": 4525, + "token_acc": 0.8966820663586729, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.427519818799547, + "grad_norm": 0.3686319887638092, + "learning_rate": 0.00013168425747507042, + "loss": 0.281552791595459, + "memory(GiB)": 91.64, + "step": 4530, + "token_acc": 0.8858676207513417, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.42799169497923745, + "grad_norm": 1.0960360765457153, + "learning_rate": 0.00013153620922351598, + "loss": 0.28768799304962156, + "memory(GiB)": 91.64, + "step": 4535, + "token_acc": 0.9006509078451524, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.4284635711589279, + "grad_norm": 0.9694100022315979, + "learning_rate": 0.00013138808417654472, + "loss": 0.28849072456359864, + "memory(GiB)": 91.64, + "step": 4540, + "token_acc": 0.8936080740117746, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.4289354473386183, + "grad_norm": 0.32766884565353394, + "learning_rate": 0.00013123988269486336, + "loss": 0.28848419189453123, + "memory(GiB)": 91.64, + "step": 4545, + "token_acc": 0.9198760513501549, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.4294073235183088, + "grad_norm": 0.9172677397727966, + "learning_rate": 0.00013109160513936492, + "loss": 0.2858105659484863, + "memory(GiB)": 91.64, + "step": 4550, + "token_acc": 0.8982770046388336, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.42987919969799926, + "grad_norm": 0.4844764769077301, + "learning_rate": 0.0001309432518711275, + "loss": 0.29333882331848143, + "memory(GiB)": 91.64, + "step": 4555, + "token_acc": 0.8893459481694775, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.43035107587768967, + "grad_norm": 0.5546042323112488, + "learning_rate": 0.00013079482325141365, + "loss": 0.29157898426055906, + "memory(GiB)": 91.64, + "step": 4560, + "token_acc": 0.903052805280528, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.43082295205738014, + "grad_norm": 0.224000945687294, + "learning_rate": 0.0001306463196416694, + "loss": 0.2820766448974609, + "memory(GiB)": 91.64, + "step": 4565, + "token_acc": 0.9094988780852655, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.4312948282370706, + "grad_norm": 0.2916657328605652, + "learning_rate": 0.00013049774140352346, + "loss": 0.28515851497650146, + "memory(GiB)": 91.64, + "step": 4570, + "token_acc": 0.9044019564250778, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.431766704416761, + "grad_norm": 0.31271326541900635, + "learning_rate": 0.00013034908889878613, + "loss": 0.290648365020752, + "memory(GiB)": 91.64, + "step": 4575, + "token_acc": 0.9117174959871589, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4322385805964515, + "grad_norm": 0.48812609910964966, + "learning_rate": 0.00013020036248944863, + "loss": 0.2845763206481934, + "memory(GiB)": 91.64, + "step": 4580, + "token_acc": 0.911256242796773, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.43271045677614195, + "grad_norm": 0.490957647562027, + "learning_rate": 0.0001300515625376822, + "loss": 0.28296747207641604, + "memory(GiB)": 91.64, + "step": 4585, + "token_acc": 0.9053865475858219, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4331823329558324, + "grad_norm": 0.6826327443122864, + "learning_rate": 0.00012990268940583715, + "loss": 0.28066396713256836, + "memory(GiB)": 91.64, + "step": 4590, + "token_acc": 0.912207625760974, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.43365420913552283, + "grad_norm": 0.3873187005519867, + "learning_rate": 0.0001297537434564419, + "loss": 0.2871824026107788, + "memory(GiB)": 91.64, + "step": 4595, + "token_acc": 0.9020979020979021, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4341260853152133, + "grad_norm": 0.24573656916618347, + "learning_rate": 0.00012960472505220227, + "loss": 0.2789003849029541, + "memory(GiB)": 91.64, + "step": 4600, + "token_acc": 0.9040368271954674, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.43459796149490376, + "grad_norm": 0.5334983468055725, + "learning_rate": 0.00012945563455600052, + "loss": 0.287949800491333, + "memory(GiB)": 91.64, + "step": 4605, + "token_acc": 0.886, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4350698376745942, + "grad_norm": 0.22217601537704468, + "learning_rate": 0.00012930647233089451, + "loss": 0.28804755210876465, + "memory(GiB)": 91.64, + "step": 4610, + "token_acc": 0.8871165644171779, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.43554171385428464, + "grad_norm": 0.8007298707962036, + "learning_rate": 0.0001291572387401166, + "loss": 0.2913137197494507, + "memory(GiB)": 91.64, + "step": 4615, + "token_acc": 0.8878835562549174, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4360135900339751, + "grad_norm": 0.763749361038208, + "learning_rate": 0.0001290079341470731, + "loss": 0.27843732833862306, + "memory(GiB)": 91.64, + "step": 4620, + "token_acc": 0.9164843180160467, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4364854662136655, + "grad_norm": 0.36388909816741943, + "learning_rate": 0.0001288585589153432, + "loss": 0.28447413444519043, + "memory(GiB)": 91.64, + "step": 4625, + "token_acc": 0.8996680191811139, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.436957342393356, + "grad_norm": 0.6230308413505554, + "learning_rate": 0.00012870911340867806, + "loss": 0.28026676177978516, + "memory(GiB)": 91.64, + "step": 4630, + "token_acc": 0.895593220338983, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.43742921857304645, + "grad_norm": 0.40676748752593994, + "learning_rate": 0.00012855959799099997, + "loss": 0.2786916971206665, + "memory(GiB)": 91.64, + "step": 4635, + "token_acc": 0.8946716232961586, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.43790109475273686, + "grad_norm": 0.3990239202976227, + "learning_rate": 0.0001284100130264015, + "loss": 0.2920806646347046, + "memory(GiB)": 91.64, + "step": 4640, + "token_acc": 0.9059865092748736, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.43837297093242733, + "grad_norm": 0.2727039158344269, + "learning_rate": 0.0001282603588791445, + "loss": 0.27799243927001954, + "memory(GiB)": 91.64, + "step": 4645, + "token_acc": 0.9078553954879043, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.4388448471121178, + "grad_norm": 0.6722660064697266, + "learning_rate": 0.00012811063591365942, + "loss": 0.28724350929260256, + "memory(GiB)": 91.64, + "step": 4650, + "token_acc": 0.9086161879895561, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.4393167232918082, + "grad_norm": 0.5787277817726135, + "learning_rate": 0.0001279608444945442, + "loss": 0.2847604274749756, + "memory(GiB)": 91.64, + "step": 4655, + "token_acc": 0.9099365750528541, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.4397885994714987, + "grad_norm": 0.7865330576896667, + "learning_rate": 0.00012781098498656343, + "loss": 0.2823522090911865, + "memory(GiB)": 91.64, + "step": 4660, + "token_acc": 0.8874614594039054, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.44026047565118914, + "grad_norm": 0.2751135528087616, + "learning_rate": 0.00012766105775464769, + "loss": 0.28550019264221194, + "memory(GiB)": 91.64, + "step": 4665, + "token_acc": 0.9065579340808698, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.44073235183087955, + "grad_norm": 0.3212601840496063, + "learning_rate": 0.00012751106316389227, + "loss": 0.29028480052947997, + "memory(GiB)": 91.64, + "step": 4670, + "token_acc": 0.8936355710549259, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.44120422801057, + "grad_norm": 0.574639081954956, + "learning_rate": 0.0001273610015795566, + "loss": 0.28751068115234374, + "memory(GiB)": 91.64, + "step": 4675, + "token_acc": 0.8857025809094633, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.4416761041902605, + "grad_norm": 0.5098394155502319, + "learning_rate": 0.00012721087336706326, + "loss": 0.2824862480163574, + "memory(GiB)": 91.64, + "step": 4680, + "token_acc": 0.9175931981687377, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.4421479803699509, + "grad_norm": 0.6681429147720337, + "learning_rate": 0.00012706067889199697, + "loss": 0.2874873876571655, + "memory(GiB)": 91.64, + "step": 4685, + "token_acc": 0.9023062139654068, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.44261985654964137, + "grad_norm": 0.28191789984703064, + "learning_rate": 0.00012691041852010398, + "loss": 0.28571357727050783, + "memory(GiB)": 91.64, + "step": 4690, + "token_acc": 0.9098571763053149, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.44309173272933183, + "grad_norm": 0.5222494602203369, + "learning_rate": 0.00012676009261729086, + "loss": 0.2767521858215332, + "memory(GiB)": 91.64, + "step": 4695, + "token_acc": 0.9009402283411686, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.4435636089090223, + "grad_norm": 0.22164411842823029, + "learning_rate": 0.00012660970154962383, + "loss": 0.27899010181427003, + "memory(GiB)": 91.64, + "step": 4700, + "token_acc": 0.8977528089887641, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.4440354850887127, + "grad_norm": 0.2828960716724396, + "learning_rate": 0.00012645924568332773, + "loss": 0.28249554634094237, + "memory(GiB)": 91.64, + "step": 4705, + "token_acc": 0.9085754783841248, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.4445073612684032, + "grad_norm": 0.4036547541618347, + "learning_rate": 0.00012630872538478536, + "loss": 0.28681211471557616, + "memory(GiB)": 91.64, + "step": 4710, + "token_acc": 0.8995236032914682, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.44497923744809365, + "grad_norm": 0.39314863085746765, + "learning_rate": 0.00012615814102053617, + "loss": 0.28807053565979, + "memory(GiB)": 91.64, + "step": 4715, + "token_acc": 0.9016070842899311, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.44545111362778406, + "grad_norm": 0.5945762991905212, + "learning_rate": 0.00012600749295727583, + "loss": 0.2825813293457031, + "memory(GiB)": 91.64, + "step": 4720, + "token_acc": 0.8978658536585366, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.4459229898074745, + "grad_norm": 0.3599962294101715, + "learning_rate": 0.00012585678156185507, + "loss": 0.2848550319671631, + "memory(GiB)": 91.64, + "step": 4725, + "token_acc": 0.9100689655172414, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.446394865987165, + "grad_norm": 0.41364148259162903, + "learning_rate": 0.0001257060072012788, + "loss": 0.27649877071380613, + "memory(GiB)": 91.64, + "step": 4730, + "token_acc": 0.9065370070232307, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.4468667421668554, + "grad_norm": 0.3855108916759491, + "learning_rate": 0.00012555517024270525, + "loss": 0.2867574214935303, + "memory(GiB)": 91.64, + "step": 4735, + "token_acc": 0.9041860465116279, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.44733861834654587, + "grad_norm": 0.4569513499736786, + "learning_rate": 0.00012540427105344517, + "loss": 0.29142746925354, + "memory(GiB)": 91.64, + "step": 4740, + "token_acc": 0.8947759346372864, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.44781049452623634, + "grad_norm": 0.36718836426734924, + "learning_rate": 0.00012525331000096078, + "loss": 0.27325663566589353, + "memory(GiB)": 91.64, + "step": 4745, + "token_acc": 0.9051918735891648, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.44828237070592675, + "grad_norm": 0.45342808961868286, + "learning_rate": 0.000125102287452865, + "loss": 0.2808860778808594, + "memory(GiB)": 91.64, + "step": 4750, + "token_acc": 0.901932712956335, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.4487542468856172, + "grad_norm": 0.39637887477874756, + "learning_rate": 0.00012495120377692038, + "loss": 0.27970137596130373, + "memory(GiB)": 91.64, + "step": 4755, + "token_acc": 0.8976339932399807, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.4492261230653077, + "grad_norm": 0.5339449048042297, + "learning_rate": 0.0001248000593410385, + "loss": 0.27558255195617676, + "memory(GiB)": 91.64, + "step": 4760, + "token_acc": 0.9057873485868102, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.4496979992449981, + "grad_norm": 0.3682711720466614, + "learning_rate": 0.0001246488545132788, + "loss": 0.28574070930480955, + "memory(GiB)": 91.64, + "step": 4765, + "token_acc": 0.9038128249566725, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.45016987542468856, + "grad_norm": 0.25929683446884155, + "learning_rate": 0.0001244975896618478, + "loss": 0.29129462242126464, + "memory(GiB)": 91.64, + "step": 4770, + "token_acc": 0.9121370067014147, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.450641751604379, + "grad_norm": 0.37731558084487915, + "learning_rate": 0.0001243462651550982, + "loss": 0.2848989725112915, + "memory(GiB)": 91.64, + "step": 4775, + "token_acc": 0.906934306569343, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.45111362778406944, + "grad_norm": 0.29577717185020447, + "learning_rate": 0.00012419488136152784, + "loss": 0.28867268562316895, + "memory(GiB)": 91.64, + "step": 4780, + "token_acc": 0.9120754716981132, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.4515855039637599, + "grad_norm": 0.2625930607318878, + "learning_rate": 0.00012404343864977918, + "loss": 0.2829215288162231, + "memory(GiB)": 91.64, + "step": 4785, + "token_acc": 0.8994281870164816, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.45205738014345037, + "grad_norm": 0.4252516031265259, + "learning_rate": 0.00012389193738863795, + "loss": 0.27890982627868655, + "memory(GiB)": 91.64, + "step": 4790, + "token_acc": 0.9049789621318373, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.4525292563231408, + "grad_norm": 0.24441945552825928, + "learning_rate": 0.0001237403779470326, + "loss": 0.27833251953125, + "memory(GiB)": 91.64, + "step": 4795, + "token_acc": 0.907057462398766, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.45300113250283125, + "grad_norm": 0.894290566444397, + "learning_rate": 0.00012358876069403312, + "loss": 0.2981924057006836, + "memory(GiB)": 91.64, + "step": 4800, + "token_acc": 0.9085677749360613, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.4534730086825217, + "grad_norm": 0.49250420928001404, + "learning_rate": 0.0001234370859988503, + "loss": 0.2824810743331909, + "memory(GiB)": 91.64, + "step": 4805, + "token_acc": 0.9156916724019271, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.45394488486221213, + "grad_norm": 0.34194937348365784, + "learning_rate": 0.00012328535423083498, + "loss": 0.2780169486999512, + "memory(GiB)": 91.64, + "step": 4810, + "token_acc": 0.9040823099900431, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.4544167610419026, + "grad_norm": 0.2699131667613983, + "learning_rate": 0.0001231335657594768, + "loss": 0.27982387542724607, + "memory(GiB)": 91.64, + "step": 4815, + "token_acc": 0.9047619047619048, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.45488863722159306, + "grad_norm": 0.3031172454357147, + "learning_rate": 0.0001229817209544035, + "loss": 0.28308122158050536, + "memory(GiB)": 91.64, + "step": 4820, + "token_acc": 0.9010152284263959, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.45536051340128353, + "grad_norm": 0.2868700921535492, + "learning_rate": 0.00012282982018538006, + "loss": 0.28293166160583494, + "memory(GiB)": 91.64, + "step": 4825, + "token_acc": 0.8836694540088539, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.45583238958097394, + "grad_norm": 0.3862973749637604, + "learning_rate": 0.00012267786382230778, + "loss": 0.28501114845275877, + "memory(GiB)": 91.64, + "step": 4830, + "token_acc": 0.9142234068330506, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.4563042657606644, + "grad_norm": 0.3525838255882263, + "learning_rate": 0.00012252585223522318, + "loss": 0.2820533037185669, + "memory(GiB)": 91.64, + "step": 4835, + "token_acc": 0.903065964694952, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4567761419403549, + "grad_norm": 0.28131887316703796, + "learning_rate": 0.00012237378579429742, + "loss": 0.2824002742767334, + "memory(GiB)": 91.64, + "step": 4840, + "token_acc": 0.9106837606837607, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4572480181200453, + "grad_norm": 0.6042118072509766, + "learning_rate": 0.00012222166486983518, + "loss": 0.28409135341644287, + "memory(GiB)": 91.64, + "step": 4845, + "token_acc": 0.9043731778425655, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.45771989429973575, + "grad_norm": 0.3936363756656647, + "learning_rate": 0.00012206948983227375, + "loss": 0.28306241035461427, + "memory(GiB)": 91.64, + "step": 4850, + "token_acc": 0.8994461014060503, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.4581917704794262, + "grad_norm": 0.5257007479667664, + "learning_rate": 0.00012191726105218233, + "loss": 0.289243221282959, + "memory(GiB)": 91.64, + "step": 4855, + "token_acc": 0.8839373163565132, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.45866364665911663, + "grad_norm": 0.21987028419971466, + "learning_rate": 0.00012176497890026088, + "loss": 0.2828011989593506, + "memory(GiB)": 91.64, + "step": 4860, + "token_acc": 0.9015120555782591, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.4591355228388071, + "grad_norm": 0.5803897976875305, + "learning_rate": 0.00012161264374733936, + "loss": 0.27418339252471924, + "memory(GiB)": 91.64, + "step": 4865, + "token_acc": 0.8991545893719807, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.45960739901849756, + "grad_norm": 0.45357078313827515, + "learning_rate": 0.0001214602559643768, + "loss": 0.28390045166015626, + "memory(GiB)": 91.64, + "step": 4870, + "token_acc": 0.9008333333333334, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.460079275198188, + "grad_norm": 0.454944372177124, + "learning_rate": 0.00012130781592246041, + "loss": 0.278179407119751, + "memory(GiB)": 91.64, + "step": 4875, + "token_acc": 0.8991797676008202, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.46055115137787844, + "grad_norm": 0.2925609052181244, + "learning_rate": 0.00012115532399280463, + "loss": 0.27666945457458497, + "memory(GiB)": 91.64, + "step": 4880, + "token_acc": 0.8829588014981273, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.4610230275575689, + "grad_norm": 0.41853830218315125, + "learning_rate": 0.00012100278054675025, + "loss": 0.2862995624542236, + "memory(GiB)": 91.64, + "step": 4885, + "token_acc": 0.9114873035066505, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.4614949037372593, + "grad_norm": 0.9547513127326965, + "learning_rate": 0.00012085018595576353, + "loss": 0.2757422924041748, + "memory(GiB)": 91.64, + "step": 4890, + "token_acc": 0.9112970711297071, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.4619667799169498, + "grad_norm": 0.3806958496570587, + "learning_rate": 0.00012069754059143528, + "loss": 0.2767773628234863, + "memory(GiB)": 91.64, + "step": 4895, + "token_acc": 0.8986072423398329, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.46243865609664025, + "grad_norm": 0.23360499739646912, + "learning_rate": 0.00012054484482547996, + "loss": 0.28561379909515383, + "memory(GiB)": 91.64, + "step": 4900, + "token_acc": 0.9079869219990658, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.46291053227633067, + "grad_norm": 0.42152461409568787, + "learning_rate": 0.0001203920990297347, + "loss": 0.2833158254623413, + "memory(GiB)": 91.64, + "step": 4905, + "token_acc": 0.9091880341880342, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.46338240845602113, + "grad_norm": 1.3780053853988647, + "learning_rate": 0.00012023930357615854, + "loss": 0.2830458641052246, + "memory(GiB)": 91.64, + "step": 4910, + "token_acc": 0.9006878761822872, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.4638542846357116, + "grad_norm": 0.7108725905418396, + "learning_rate": 0.0001200864588368314, + "loss": 0.27931234836578367, + "memory(GiB)": 91.64, + "step": 4915, + "token_acc": 0.9094964945825367, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.464326160815402, + "grad_norm": 0.2742263674736023, + "learning_rate": 0.00011993356518395322, + "loss": 0.27413043975830076, + "memory(GiB)": 91.64, + "step": 4920, + "token_acc": 0.9130901287553648, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.4647980369950925, + "grad_norm": 0.3077310621738434, + "learning_rate": 0.0001197806229898431, + "loss": 0.2840799570083618, + "memory(GiB)": 91.64, + "step": 4925, + "token_acc": 0.9001019367991845, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.46526991317478295, + "grad_norm": 0.2271108776330948, + "learning_rate": 0.00011962763262693826, + "loss": 0.280320930480957, + "memory(GiB)": 91.64, + "step": 4930, + "token_acc": 0.8968949044585988, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.4657417893544734, + "grad_norm": 0.3191837966442108, + "learning_rate": 0.0001194745944677933, + "loss": 0.28061847686767577, + "memory(GiB)": 91.64, + "step": 4935, + "token_acc": 0.8997225525168451, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4662136655341638, + "grad_norm": 0.9220530390739441, + "learning_rate": 0.00011932150888507911, + "loss": 0.2785890340805054, + "memory(GiB)": 91.64, + "step": 4940, + "token_acc": 0.914657481821661, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4666855417138543, + "grad_norm": 0.2771691083908081, + "learning_rate": 0.00011916837625158221, + "loss": 0.2860894680023193, + "memory(GiB)": 91.64, + "step": 4945, + "token_acc": 0.901593252108716, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.46715741789354476, + "grad_norm": 0.32764509320259094, + "learning_rate": 0.00011901519694020358, + "loss": 0.281624960899353, + "memory(GiB)": 91.64, + "step": 4950, + "token_acc": 0.8995609220636663, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.46762929407323517, + "grad_norm": 0.25029462575912476, + "learning_rate": 0.00011886197132395791, + "loss": 0.28546116352081297, + "memory(GiB)": 91.64, + "step": 4955, + "token_acc": 0.9098073555166375, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.46810117025292564, + "grad_norm": 0.39748215675354004, + "learning_rate": 0.00011870869977597263, + "loss": 0.27789499759674074, + "memory(GiB)": 91.64, + "step": 4960, + "token_acc": 0.8938492063492064, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.4685730464326161, + "grad_norm": 0.23799273371696472, + "learning_rate": 0.00011855538266948702, + "loss": 0.27652902603149415, + "memory(GiB)": 91.64, + "step": 4965, + "token_acc": 0.9004487964096287, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.4690449226123065, + "grad_norm": 0.5671650171279907, + "learning_rate": 0.00011840202037785138, + "loss": 0.27930173873901365, + "memory(GiB)": 91.64, + "step": 4970, + "token_acc": 0.9092416079569001, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.469516798791997, + "grad_norm": 0.27737873792648315, + "learning_rate": 0.00011824861327452587, + "loss": 0.28255581855773926, + "memory(GiB)": 91.64, + "step": 4975, + "token_acc": 0.9156298600311042, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.46998867497168745, + "grad_norm": 0.3608155846595764, + "learning_rate": 0.00011809516173307997, + "loss": 0.2820130348205566, + "memory(GiB)": 91.64, + "step": 4980, + "token_acc": 0.8924418604651163, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.47046055115137786, + "grad_norm": 0.25761914253234863, + "learning_rate": 0.00011794166612719126, + "loss": 0.27811760902404786, + "memory(GiB)": 91.64, + "step": 4985, + "token_acc": 0.8917102315160568, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.4709324273310683, + "grad_norm": 0.31159085035324097, + "learning_rate": 0.00011778812683064464, + "loss": 0.27784423828125, + "memory(GiB)": 91.64, + "step": 4990, + "token_acc": 0.9108947959565705, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.4714043035107588, + "grad_norm": 0.2755048871040344, + "learning_rate": 0.00011763454421733138, + "loss": 0.27713913917541505, + "memory(GiB)": 91.64, + "step": 4995, + "token_acc": 0.9096820809248555, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.4718761796904492, + "grad_norm": 0.5099897384643555, + "learning_rate": 0.00011748091866124833, + "loss": 0.28005452156066896, + "memory(GiB)": 91.64, + "step": 5000, + "token_acc": 0.8963607594936709, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.47234805587013967, + "grad_norm": 0.34181883931159973, + "learning_rate": 0.0001173272505364968, + "loss": 0.28417515754699707, + "memory(GiB)": 91.64, + "step": 5005, + "token_acc": 0.9090192989365892, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.47281993204983014, + "grad_norm": 0.3527189791202545, + "learning_rate": 0.0001171735402172818, + "loss": 0.27751028537750244, + "memory(GiB)": 91.64, + "step": 5010, + "token_acc": 0.8898207056101793, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.47329180822952055, + "grad_norm": 0.25944942235946655, + "learning_rate": 0.00011701978807791114, + "loss": 0.27676069736480713, + "memory(GiB)": 91.64, + "step": 5015, + "token_acc": 0.896022549326652, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.473763684409211, + "grad_norm": 0.30124396085739136, + "learning_rate": 0.00011686599449279436, + "loss": 0.28163583278656007, + "memory(GiB)": 91.64, + "step": 5020, + "token_acc": 0.894380118610562, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.4742355605889015, + "grad_norm": 0.3558872640132904, + "learning_rate": 0.00011671215983644203, + "loss": 0.27531468868255615, + "memory(GiB)": 91.64, + "step": 5025, + "token_acc": 0.9087152516904583, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4747074367685919, + "grad_norm": 0.33486005663871765, + "learning_rate": 0.00011655828448346473, + "loss": 0.2796565294265747, + "memory(GiB)": 91.64, + "step": 5030, + "token_acc": 0.9095816464237517, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.47517931294828236, + "grad_norm": 0.33892741799354553, + "learning_rate": 0.00011640436880857208, + "loss": 0.2787603855133057, + "memory(GiB)": 91.64, + "step": 5035, + "token_acc": 0.8977215189873418, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.47565118912797283, + "grad_norm": 0.840950608253479, + "learning_rate": 0.00011625041318657186, + "loss": 0.277506947517395, + "memory(GiB)": 91.64, + "step": 5040, + "token_acc": 0.9116925592804579, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4761230653076633, + "grad_norm": 0.30797073245048523, + "learning_rate": 0.00011609641799236928, + "loss": 0.27592084407806394, + "memory(GiB)": 91.64, + "step": 5045, + "token_acc": 0.9042925278219396, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.4765949414873537, + "grad_norm": 0.4821353852748871, + "learning_rate": 0.00011594238360096577, + "loss": 0.2765143871307373, + "memory(GiB)": 91.64, + "step": 5050, + "token_acc": 0.8992502343017807, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.4770668176670442, + "grad_norm": 0.7837891578674316, + "learning_rate": 0.00011578831038745826, + "loss": 0.2858426570892334, + "memory(GiB)": 91.64, + "step": 5055, + "token_acc": 0.9004637887977167, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.47753869384673464, + "grad_norm": 0.5565283894538879, + "learning_rate": 0.0001156341987270382, + "loss": 0.27644643783569334, + "memory(GiB)": 91.64, + "step": 5060, + "token_acc": 0.8971206729213846, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.47801057002642505, + "grad_norm": 0.5778424143791199, + "learning_rate": 0.00011548004899499076, + "loss": 0.27022864818573, + "memory(GiB)": 91.64, + "step": 5065, + "token_acc": 0.8980617372577172, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.4784824462061155, + "grad_norm": 0.21273185312747955, + "learning_rate": 0.00011532586156669368, + "loss": 0.2737504720687866, + "memory(GiB)": 91.64, + "step": 5070, + "token_acc": 0.8996960486322189, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.478954322385806, + "grad_norm": 0.20052437484264374, + "learning_rate": 0.00011517163681761653, + "loss": 0.28348593711853026, + "memory(GiB)": 91.64, + "step": 5075, + "token_acc": 0.9174463401210787, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.4794261985654964, + "grad_norm": 0.35244038701057434, + "learning_rate": 0.00011501737512331987, + "loss": 0.27732019424438475, + "memory(GiB)": 91.64, + "step": 5080, + "token_acc": 0.8944756864042342, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.47989807474518686, + "grad_norm": 0.65986168384552, + "learning_rate": 0.0001148630768594541, + "loss": 0.2808716058731079, + "memory(GiB)": 91.64, + "step": 5085, + "token_acc": 0.8806818181818182, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.48036995092487733, + "grad_norm": 0.3498198390007019, + "learning_rate": 0.00011470874240175873, + "loss": 0.2739971876144409, + "memory(GiB)": 91.64, + "step": 5090, + "token_acc": 0.9123263888888888, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.48084182710456774, + "grad_norm": 0.5786554217338562, + "learning_rate": 0.0001145543721260614, + "loss": 0.2853843212127686, + "memory(GiB)": 91.64, + "step": 5095, + "token_acc": 0.9068446464072707, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.4813137032842582, + "grad_norm": 0.6657483577728271, + "learning_rate": 0.00011439996640827694, + "loss": 0.28014469146728516, + "memory(GiB)": 91.64, + "step": 5100, + "token_acc": 0.8972332015810277, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.4817855794639487, + "grad_norm": 0.5863536596298218, + "learning_rate": 0.00011424552562440658, + "loss": 0.27579662799835203, + "memory(GiB)": 91.64, + "step": 5105, + "token_acc": 0.9050228310502283, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.4822574556436391, + "grad_norm": 0.21976174414157867, + "learning_rate": 0.00011409105015053683, + "loss": 0.2734682083129883, + "memory(GiB)": 91.64, + "step": 5110, + "token_acc": 0.8935144609991236, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.48272933182332955, + "grad_norm": 0.3334028720855713, + "learning_rate": 0.00011393654036283875, + "loss": 0.27859272956848147, + "memory(GiB)": 91.64, + "step": 5115, + "token_acc": 0.8913043478260869, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.48320120800302, + "grad_norm": 0.4304635226726532, + "learning_rate": 0.0001137819966375669, + "loss": 0.27210822105407717, + "memory(GiB)": 91.64, + "step": 5120, + "token_acc": 0.9054319371727748, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.48367308418271043, + "grad_norm": 0.6442833542823792, + "learning_rate": 0.00011362741935105849, + "loss": 0.2701150894165039, + "memory(GiB)": 91.64, + "step": 5125, + "token_acc": 0.890727035263387, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.4841449603624009, + "grad_norm": 0.38582414388656616, + "learning_rate": 0.00011347280887973259, + "loss": 0.2726860046386719, + "memory(GiB)": 91.64, + "step": 5130, + "token_acc": 0.9164477141355754, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.48461683654209137, + "grad_norm": 0.2989153563976288, + "learning_rate": 0.0001133181656000889, + "loss": 0.2714090347290039, + "memory(GiB)": 91.64, + "step": 5135, + "token_acc": 0.8948035487959443, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.4850887127217818, + "grad_norm": 0.2917657196521759, + "learning_rate": 0.00011316348988870705, + "loss": 0.2744471073150635, + "memory(GiB)": 91.64, + "step": 5140, + "token_acc": 0.9163398692810457, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.48556058890147225, + "grad_norm": 0.3109903037548065, + "learning_rate": 0.00011300878212224577, + "loss": 0.28097503185272216, + "memory(GiB)": 91.64, + "step": 5145, + "token_acc": 0.9047619047619048, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.4860324650811627, + "grad_norm": 0.33942723274230957, + "learning_rate": 0.00011285404267744171, + "loss": 0.27203121185302737, + "memory(GiB)": 91.64, + "step": 5150, + "token_acc": 0.9211481359287363, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.4865043412608532, + "grad_norm": 0.4545697271823883, + "learning_rate": 0.00011269927193110869, + "loss": 0.2756700038909912, + "memory(GiB)": 91.64, + "step": 5155, + "token_acc": 0.9127371273712737, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.4869762174405436, + "grad_norm": 0.2911210358142853, + "learning_rate": 0.00011254447026013682, + "loss": 0.27875099182128904, + "memory(GiB)": 91.64, + "step": 5160, + "token_acc": 0.8810038944180009, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.48744809362023406, + "grad_norm": 0.25021690130233765, + "learning_rate": 0.00011238963804149148, + "loss": 0.26958017349243163, + "memory(GiB)": 91.64, + "step": 5165, + "token_acc": 0.8997645475950219, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.4879199697999245, + "grad_norm": 0.38456693291664124, + "learning_rate": 0.00011223477565221236, + "loss": 0.27497286796569825, + "memory(GiB)": 91.64, + "step": 5170, + "token_acc": 0.890403015366773, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.48839184597961494, + "grad_norm": 0.8317140936851501, + "learning_rate": 0.00011207988346941273, + "loss": 0.27993662357330323, + "memory(GiB)": 91.64, + "step": 5175, + "token_acc": 0.9226804123711341, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.4888637221593054, + "grad_norm": 0.31843581795692444, + "learning_rate": 0.00011192496187027843, + "loss": 0.2792136430740356, + "memory(GiB)": 91.64, + "step": 5180, + "token_acc": 0.9134140870345446, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.48933559833899587, + "grad_norm": 0.3232461214065552, + "learning_rate": 0.00011177001123206681, + "loss": 0.26682229042053224, + "memory(GiB)": 91.64, + "step": 5185, + "token_acc": 0.9106923392052437, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.4898074745186863, + "grad_norm": 0.388791561126709, + "learning_rate": 0.00011161503193210599, + "loss": 0.27460460662841796, + "memory(GiB)": 91.64, + "step": 5190, + "token_acc": 0.90976, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.49027935069837675, + "grad_norm": 0.6369457244873047, + "learning_rate": 0.00011146002434779394, + "loss": 0.27512354850769044, + "memory(GiB)": 91.64, + "step": 5195, + "token_acc": 0.8958634654324559, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.4907512268780672, + "grad_norm": 0.3902687430381775, + "learning_rate": 0.00011130498885659744, + "loss": 0.2719013214111328, + "memory(GiB)": 91.64, + "step": 5200, + "token_acc": 0.9022752704214845, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.4912231030577576, + "grad_norm": 0.44649550318717957, + "learning_rate": 0.00011114992583605126, + "loss": 0.2780723571777344, + "memory(GiB)": 91.64, + "step": 5205, + "token_acc": 0.9057301293900185, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4916949792374481, + "grad_norm": 0.3157523572444916, + "learning_rate": 0.00011099483566375717, + "loss": 0.27315502166748046, + "memory(GiB)": 91.64, + "step": 5210, + "token_acc": 0.9008341056533827, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.49216685541713856, + "grad_norm": 0.37949439883232117, + "learning_rate": 0.00011083971871738311, + "loss": 0.2804953813552856, + "memory(GiB)": 91.64, + "step": 5215, + "token_acc": 0.8958660387231816, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.49263873159682897, + "grad_norm": 0.5504710078239441, + "learning_rate": 0.0001106845753746622, + "loss": 0.2684622764587402, + "memory(GiB)": 91.64, + "step": 5220, + "token_acc": 0.9050916496945011, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.49311060777651944, + "grad_norm": 0.6442100405693054, + "learning_rate": 0.00011052940601339181, + "loss": 0.27536282539367674, + "memory(GiB)": 91.64, + "step": 5225, + "token_acc": 0.9010629599345871, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4935824839562099, + "grad_norm": 0.42355185747146606, + "learning_rate": 0.0001103742110114327, + "loss": 0.28316774368286135, + "memory(GiB)": 91.64, + "step": 5230, + "token_acc": 0.9025316455696203, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.4940543601359003, + "grad_norm": 0.29734212160110474, + "learning_rate": 0.00011021899074670811, + "loss": 0.26995747089385985, + "memory(GiB)": 91.64, + "step": 5235, + "token_acc": 0.9120280264694434, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.4945262363155908, + "grad_norm": 0.30176839232444763, + "learning_rate": 0.00011006374559720268, + "loss": 0.27476816177368163, + "memory(GiB)": 91.64, + "step": 5240, + "token_acc": 0.9170903402424716, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.49499811249528125, + "grad_norm": 0.6972224116325378, + "learning_rate": 0.00010990847594096176, + "loss": 0.26695716381073, + "memory(GiB)": 91.64, + "step": 5245, + "token_acc": 0.9249381358262304, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.49546998867497166, + "grad_norm": 0.6553791761398315, + "learning_rate": 0.00010975318215609035, + "loss": 0.27399606704711915, + "memory(GiB)": 91.64, + "step": 5250, + "token_acc": 0.9224839400428265, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.49594186485466213, + "grad_norm": 0.5006653070449829, + "learning_rate": 0.00010959786462075214, + "loss": 0.27437796592712405, + "memory(GiB)": 91.64, + "step": 5255, + "token_acc": 0.8942583732057416, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4964137410343526, + "grad_norm": 0.3082018196582794, + "learning_rate": 0.00010944252371316874, + "loss": 0.2674814224243164, + "memory(GiB)": 91.64, + "step": 5260, + "token_acc": 0.9183520599250936, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.49688561721404306, + "grad_norm": 0.22847510874271393, + "learning_rate": 0.00010928715981161868, + "loss": 0.2763264894485474, + "memory(GiB)": 91.64, + "step": 5265, + "token_acc": 0.8880643166357453, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4973574933937335, + "grad_norm": 0.4131425619125366, + "learning_rate": 0.0001091317732944364, + "loss": 0.2672610282897949, + "memory(GiB)": 91.64, + "step": 5270, + "token_acc": 0.9162442674390538, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.49782936957342394, + "grad_norm": 0.3631506562232971, + "learning_rate": 0.00010897636454001145, + "loss": 0.276334285736084, + "memory(GiB)": 91.64, + "step": 5275, + "token_acc": 0.9140866873065016, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.4983012457531144, + "grad_norm": 0.49582716822624207, + "learning_rate": 0.00010882093392678761, + "loss": 0.270448637008667, + "memory(GiB)": 91.64, + "step": 5280, + "token_acc": 0.8968347010550997, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.4987731219328048, + "grad_norm": 0.35381758213043213, + "learning_rate": 0.00010866548183326176, + "loss": 0.27590155601501465, + "memory(GiB)": 91.64, + "step": 5285, + "token_acc": 0.9073126692747517, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.4992449981124953, + "grad_norm": 0.2577316462993622, + "learning_rate": 0.00010851000863798313, + "loss": 0.27100181579589844, + "memory(GiB)": 91.64, + "step": 5290, + "token_acc": 0.9157062891010866, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.49971687429218575, + "grad_norm": 0.6241110563278198, + "learning_rate": 0.00010835451471955245, + "loss": 0.27226576805114744, + "memory(GiB)": 91.64, + "step": 5295, + "token_acc": 0.9051878354203936, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.5001887504718762, + "grad_norm": 0.43986672163009644, + "learning_rate": 0.00010819900045662073, + "loss": 0.27079594135284424, + "memory(GiB)": 91.64, + "step": 5300, + "token_acc": 0.9174471037114117, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.5006606266515666, + "grad_norm": 0.2602379322052002, + "learning_rate": 0.00010804346622788866, + "loss": 0.27014808654785155, + "memory(GiB)": 91.64, + "step": 5305, + "token_acc": 0.9015012815818382, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.5011325028312571, + "grad_norm": 0.45675429701805115, + "learning_rate": 0.00010788791241210547, + "loss": 0.27579355239868164, + "memory(GiB)": 91.64, + "step": 5310, + "token_acc": 0.8968147151188874, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.5016043790109476, + "grad_norm": 0.29645124077796936, + "learning_rate": 0.00010773233938806812, + "loss": 0.26997838020324705, + "memory(GiB)": 91.64, + "step": 5315, + "token_acc": 0.9151072569602922, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.5020762551906379, + "grad_norm": 0.4871494174003601, + "learning_rate": 0.00010757674753462039, + "loss": 0.26801414489746095, + "memory(GiB)": 91.64, + "step": 5320, + "token_acc": 0.9187116564417178, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.5025481313703284, + "grad_norm": 0.2918551564216614, + "learning_rate": 0.00010742113723065181, + "loss": 0.27881925106048583, + "memory(GiB)": 91.64, + "step": 5325, + "token_acc": 0.9141835518474374, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.5030200075500189, + "grad_norm": 0.5010596513748169, + "learning_rate": 0.00010726550885509689, + "loss": 0.2730778455734253, + "memory(GiB)": 91.64, + "step": 5330, + "token_acc": 0.9, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.5034918837297093, + "grad_norm": 0.26482099294662476, + "learning_rate": 0.00010710986278693424, + "loss": 0.27079339027404786, + "memory(GiB)": 91.64, + "step": 5335, + "token_acc": 0.9089954497724886, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.5039637599093998, + "grad_norm": 0.2955012321472168, + "learning_rate": 0.00010695419940518536, + "loss": 0.2673619747161865, + "memory(GiB)": 91.64, + "step": 5340, + "token_acc": 0.9062937062937063, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.5044356360890903, + "grad_norm": 0.2763923704624176, + "learning_rate": 0.00010679851908891405, + "loss": 0.279115629196167, + "memory(GiB)": 91.64, + "step": 5345, + "token_acc": 0.8722680913064594, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.5049075122687807, + "grad_norm": 0.2069929838180542, + "learning_rate": 0.00010664282221722538, + "loss": 0.2716416835784912, + "memory(GiB)": 91.64, + "step": 5350, + "token_acc": 0.9151846785225718, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.5053793884484711, + "grad_norm": 0.5250474810600281, + "learning_rate": 0.00010648710916926458, + "loss": 0.2689443826675415, + "memory(GiB)": 91.64, + "step": 5355, + "token_acc": 0.9039106145251397, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.5058512646281615, + "grad_norm": 0.5674706697463989, + "learning_rate": 0.00010633138032421638, + "loss": 0.27478585243225095, + "memory(GiB)": 91.64, + "step": 5360, + "token_acc": 0.9083916083916084, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.506323140807852, + "grad_norm": 0.3561114966869354, + "learning_rate": 0.00010617563606130403, + "loss": 0.27205119132995603, + "memory(GiB)": 91.64, + "step": 5365, + "token_acc": 0.8959741404642962, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.5067950169875425, + "grad_norm": 0.43625959753990173, + "learning_rate": 0.0001060198767597882, + "loss": 0.27315943241119384, + "memory(GiB)": 91.64, + "step": 5370, + "token_acc": 0.9110037944118662, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.507266893167233, + "grad_norm": 0.27587762475013733, + "learning_rate": 0.00010586410279896619, + "loss": 0.2728897571563721, + "memory(GiB)": 91.64, + "step": 5375, + "token_acc": 0.9074920858248329, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.5077387693469234, + "grad_norm": 0.23502488434314728, + "learning_rate": 0.00010570831455817116, + "loss": 0.2788903474807739, + "memory(GiB)": 91.64, + "step": 5380, + "token_acc": 0.9112850619699935, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5082106455266138, + "grad_norm": 0.22300395369529724, + "learning_rate": 0.00010555251241677086, + "loss": 0.26743249893188475, + "memory(GiB)": 91.64, + "step": 5385, + "token_acc": 0.9102902374670184, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5086825217063042, + "grad_norm": 0.3851175606250763, + "learning_rate": 0.00010539669675416694, + "loss": 0.26730880737304685, + "memory(GiB)": 91.64, + "step": 5390, + "token_acc": 0.9054170249355116, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5091543978859947, + "grad_norm": 0.303202360868454, + "learning_rate": 0.00010524086794979402, + "loss": 0.2683709144592285, + "memory(GiB)": 91.64, + "step": 5395, + "token_acc": 0.9025044722719141, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.5096262740656852, + "grad_norm": 0.36325204372406006, + "learning_rate": 0.00010508502638311873, + "loss": 0.27001941204071045, + "memory(GiB)": 91.64, + "step": 5400, + "token_acc": 0.928078250863061, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5100981502453756, + "grad_norm": 0.7915918827056885, + "learning_rate": 0.00010492917243363867, + "loss": 0.271225643157959, + "memory(GiB)": 91.64, + "step": 5405, + "token_acc": 0.9089173711480775, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.5105700264250661, + "grad_norm": 0.2819935083389282, + "learning_rate": 0.00010477330648088171, + "loss": 0.2733079671859741, + "memory(GiB)": 91.64, + "step": 5410, + "token_acc": 0.8977853492333902, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.5110419026047565, + "grad_norm": 0.22760671377182007, + "learning_rate": 0.00010461742890440493, + "loss": 0.2703261375427246, + "memory(GiB)": 91.64, + "step": 5415, + "token_acc": 0.9160453808752026, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.5115137787844469, + "grad_norm": 0.7035602927207947, + "learning_rate": 0.00010446154008379367, + "loss": 0.2706472873687744, + "memory(GiB)": 91.64, + "step": 5420, + "token_acc": 0.9036195286195287, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.5119856549641374, + "grad_norm": 0.7053150534629822, + "learning_rate": 0.00010430564039866067, + "loss": 0.27515950202941897, + "memory(GiB)": 91.64, + "step": 5425, + "token_acc": 0.9001291433491175, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.5124575311438279, + "grad_norm": 0.4593672454357147, + "learning_rate": 0.00010414973022864514, + "loss": 0.27523515224456785, + "memory(GiB)": 91.64, + "step": 5430, + "token_acc": 0.8985200845665962, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.5129294073235183, + "grad_norm": 0.22665126621723175, + "learning_rate": 0.00010399380995341181, + "loss": 0.2573527812957764, + "memory(GiB)": 91.64, + "step": 5435, + "token_acc": 0.9121645172533984, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.5134012835032088, + "grad_norm": 0.5087513327598572, + "learning_rate": 0.00010383787995265004, + "loss": 0.27467942237854004, + "memory(GiB)": 91.64, + "step": 5440, + "token_acc": 0.8909090909090909, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.5138731596828993, + "grad_norm": 0.48231035470962524, + "learning_rate": 0.00010368194060607283, + "loss": 0.27229771614074705, + "memory(GiB)": 91.64, + "step": 5445, + "token_acc": 0.8985872855701312, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.5143450358625896, + "grad_norm": 0.5203849673271179, + "learning_rate": 0.00010352599229341597, + "loss": 0.2672194242477417, + "memory(GiB)": 91.64, + "step": 5450, + "token_acc": 0.9112739112739112, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5148169120422801, + "grad_norm": 0.5567079782485962, + "learning_rate": 0.0001033700353944371, + "loss": 0.2730486154556274, + "memory(GiB)": 91.64, + "step": 5455, + "token_acc": 0.900904033379694, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.5152887882219706, + "grad_norm": 0.4877086281776428, + "learning_rate": 0.0001032140702889147, + "loss": 0.2705501079559326, + "memory(GiB)": 91.64, + "step": 5460, + "token_acc": 0.9008064516129032, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.515760664401661, + "grad_norm": 0.3445092737674713, + "learning_rate": 0.00010305809735664735, + "loss": 0.2583261728286743, + "memory(GiB)": 91.64, + "step": 5465, + "token_acc": 0.9392538791680423, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.5162325405813515, + "grad_norm": 0.41431504487991333, + "learning_rate": 0.00010290211697745258, + "loss": 0.2675126075744629, + "memory(GiB)": 91.64, + "step": 5470, + "token_acc": 0.917844232665133, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.516704416761042, + "grad_norm": 0.24442262947559357, + "learning_rate": 0.00010274612953116605, + "loss": 0.26557936668396, + "memory(GiB)": 91.64, + "step": 5475, + "token_acc": 0.9098092643051771, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.5171762929407323, + "grad_norm": 0.6545706391334534, + "learning_rate": 0.00010259013539764074, + "loss": 0.27161517143249514, + "memory(GiB)": 91.64, + "step": 5480, + "token_acc": 0.9188552188552188, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.5176481691204228, + "grad_norm": 0.2782509922981262, + "learning_rate": 0.00010243413495674583, + "loss": 0.262584924697876, + "memory(GiB)": 91.64, + "step": 5485, + "token_acc": 0.8945548833189283, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.5181200453001132, + "grad_norm": 0.2864302694797516, + "learning_rate": 0.00010227812858836585, + "loss": 0.2646970510482788, + "memory(GiB)": 91.64, + "step": 5490, + "token_acc": 0.9124331550802139, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.5185919214798037, + "grad_norm": 0.5489168763160706, + "learning_rate": 0.00010212211667239982, + "loss": 0.26138916015625, + "memory(GiB)": 91.64, + "step": 5495, + "token_acc": 0.9172252987467211, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5190637976594942, + "grad_norm": 0.828153669834137, + "learning_rate": 0.00010196609958876027, + "loss": 0.2727668762207031, + "memory(GiB)": 91.64, + "step": 5500, + "token_acc": 0.9049342105263158, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.5195356738391846, + "grad_norm": 0.3575778901576996, + "learning_rate": 0.00010181007771737221, + "loss": 0.2691819667816162, + "memory(GiB)": 91.64, + "step": 5505, + "token_acc": 0.9117647058823529, + "train_speed(iter/s)": 0.138609 + }, + { + "epoch": 0.520007550018875, + "grad_norm": 0.340814471244812, + "learning_rate": 0.00010165405143817242, + "loss": 0.278232479095459, + "memory(GiB)": 91.64, + "step": 5510, + "token_acc": 0.8898630136986302, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.5204794261985655, + "grad_norm": 0.27852863073349, + "learning_rate": 0.00010149802113110843, + "loss": 0.2711049556732178, + "memory(GiB)": 91.64, + "step": 5515, + "token_acc": 0.8898026315789473, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5209513023782559, + "grad_norm": 0.40802115201950073, + "learning_rate": 0.00010134198717613743, + "loss": 0.27061238288879397, + "memory(GiB)": 91.64, + "step": 5520, + "token_acc": 0.9073778345576493, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5214231785579464, + "grad_norm": 0.3736279010772705, + "learning_rate": 0.00010118594995322563, + "loss": 0.26925182342529297, + "memory(GiB)": 91.64, + "step": 5525, + "token_acc": 0.9193942354665364, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5218950547376369, + "grad_norm": 0.28507253527641296, + "learning_rate": 0.00010102990984234721, + "loss": 0.27002944946289065, + "memory(GiB)": 91.64, + "step": 5530, + "token_acc": 0.9087465564738292, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5223669309173273, + "grad_norm": 0.2574266493320465, + "learning_rate": 0.00010087386722348325, + "loss": 0.26802835464477537, + "memory(GiB)": 91.64, + "step": 5535, + "token_acc": 0.9000989119683481, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.5228388070970177, + "grad_norm": 0.3153584599494934, + "learning_rate": 0.00010071782247662108, + "loss": 0.2677382230758667, + "memory(GiB)": 91.64, + "step": 5540, + "token_acc": 0.9037955655768508, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.5233106832767082, + "grad_norm": 0.5408716797828674, + "learning_rate": 0.0001005617759817531, + "loss": 0.2700139045715332, + "memory(GiB)": 91.64, + "step": 5545, + "token_acc": 0.9161554192229039, + "train_speed(iter/s)": 0.138536 + }, + { + "epoch": 0.5237825594563986, + "grad_norm": 0.5670604705810547, + "learning_rate": 0.00010040572811887604, + "loss": 0.264970064163208, + "memory(GiB)": 91.64, + "step": 5550, + "token_acc": 0.923982869379015, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.5242544356360891, + "grad_norm": 0.5169808864593506, + "learning_rate": 0.00010024967926798994, + "loss": 0.264431095123291, + "memory(GiB)": 91.64, + "step": 5555, + "token_acc": 0.9123539232053423, + "train_speed(iter/s)": 0.138536 + }, + { + "epoch": 0.5247263118157796, + "grad_norm": 0.6065265536308289, + "learning_rate": 0.0001000936298090972, + "loss": 0.2742233037948608, + "memory(GiB)": 91.64, + "step": 5560, + "token_acc": 0.8927628103539356, + "train_speed(iter/s)": 0.138537 + }, + { + "epoch": 0.52519818799547, + "grad_norm": 0.21818573772907257, + "learning_rate": 9.993758012220181e-05, + "loss": 0.2681767225265503, + "memory(GiB)": 91.64, + "step": 5565, + "token_acc": 0.9015407741450583, + "train_speed(iter/s)": 0.138538 + }, + { + "epoch": 0.5256700641751605, + "grad_norm": 0.5659834146499634, + "learning_rate": 9.978153058730823e-05, + "loss": 0.2633455514907837, + "memory(GiB)": 91.64, + "step": 5570, + "token_acc": 0.9158934450763244, + "train_speed(iter/s)": 0.138539 + }, + { + "epoch": 0.5261419403548508, + "grad_norm": 0.23959246277809143, + "learning_rate": 9.962548158442054e-05, + "loss": 0.2634448528289795, + "memory(GiB)": 91.64, + "step": 5575, + "token_acc": 0.9061841180604358, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.5266138165345413, + "grad_norm": 0.2925381660461426, + "learning_rate": 9.946943349354159e-05, + "loss": 0.2646299362182617, + "memory(GiB)": 91.64, + "step": 5580, + "token_acc": 0.9134487350199734, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.5270856927142318, + "grad_norm": 0.3260766565799713, + "learning_rate": 9.93133866946719e-05, + "loss": 0.2640314340591431, + "memory(GiB)": 91.64, + "step": 5585, + "token_acc": 0.8965307364576993, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.5275575688939222, + "grad_norm": 0.33577093482017517, + "learning_rate": 9.915734156780904e-05, + "loss": 0.26520462036132814, + "memory(GiB)": 91.64, + "step": 5590, + "token_acc": 0.9146467251160392, + "train_speed(iter/s)": 0.13854 + }, + { + "epoch": 0.5280294450736127, + "grad_norm": 0.3756278157234192, + "learning_rate": 9.900129849294627e-05, + "loss": 0.2686309337615967, + "memory(GiB)": 91.64, + "step": 5595, + "token_acc": 0.8980466888994759, + "train_speed(iter/s)": 0.138541 + }, + { + "epoch": 0.5285013212533032, + "grad_norm": 0.636249303817749, + "learning_rate": 9.884525785007204e-05, + "loss": 0.27318830490112306, + "memory(GiB)": 91.64, + "step": 5600, + "token_acc": 0.8904034896401308, + "train_speed(iter/s)": 0.138541 + }, + { + "epoch": 0.5289731974329935, + "grad_norm": 0.6412723064422607, + "learning_rate": 9.868922001916877e-05, + "loss": 0.26247034072875974, + "memory(GiB)": 91.64, + "step": 5605, + "token_acc": 0.9019680653546231, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.529445073612684, + "grad_norm": 0.3598019480705261, + "learning_rate": 9.853318538021206e-05, + "loss": 0.26594886779785154, + "memory(GiB)": 91.64, + "step": 5610, + "token_acc": 0.8935340022296544, + "train_speed(iter/s)": 0.138542 + }, + { + "epoch": 0.5299169497923745, + "grad_norm": 0.7006165385246277, + "learning_rate": 9.837715431316974e-05, + "loss": 0.26999516487121583, + "memory(GiB)": 91.64, + "step": 5615, + "token_acc": 0.9212376933895922, + "train_speed(iter/s)": 0.138544 + }, + { + "epoch": 0.5303888259720649, + "grad_norm": 0.6085411310195923, + "learning_rate": 9.8221127198001e-05, + "loss": 0.26465139389038084, + "memory(GiB)": 91.64, + "step": 5620, + "token_acc": 0.9219022687609075, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5308607021517554, + "grad_norm": 0.5289167761802673, + "learning_rate": 9.806510441465532e-05, + "loss": 0.26235690116882326, + "memory(GiB)": 91.64, + "step": 5625, + "token_acc": 0.9046015712682379, + "train_speed(iter/s)": 0.138547 + }, + { + "epoch": 0.5313325783314459, + "grad_norm": 0.3731206953525543, + "learning_rate": 9.790908634307165e-05, + "loss": 0.26745176315307617, + "memory(GiB)": 91.64, + "step": 5630, + "token_acc": 0.9178789300797747, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.5318044545111362, + "grad_norm": 0.30575793981552124, + "learning_rate": 9.775307336317752e-05, + "loss": 0.26017489433288576, + "memory(GiB)": 91.64, + "step": 5635, + "token_acc": 0.89728, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5322763306908267, + "grad_norm": 0.5659154653549194, + "learning_rate": 9.759706585488797e-05, + "loss": 0.26600961685180663, + "memory(GiB)": 91.64, + "step": 5640, + "token_acc": 0.8857142857142857, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5327482068705172, + "grad_norm": 0.2851637601852417, + "learning_rate": 9.744106419810478e-05, + "loss": 0.2690946340560913, + "memory(GiB)": 91.64, + "step": 5645, + "token_acc": 0.9198931909212283, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5332200830502076, + "grad_norm": 0.2344711571931839, + "learning_rate": 9.728506877271551e-05, + "loss": 0.2627574920654297, + "memory(GiB)": 91.64, + "step": 5650, + "token_acc": 0.9128968811463894, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5336919592298981, + "grad_norm": 0.4054241478443146, + "learning_rate": 9.712907995859248e-05, + "loss": 0.26556243896484377, + "memory(GiB)": 91.64, + "step": 5655, + "token_acc": 0.8993012741471434, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5341638354095886, + "grad_norm": 0.2403021603822708, + "learning_rate": 9.697309813559192e-05, + "loss": 0.26658334732055666, + "memory(GiB)": 91.64, + "step": 5660, + "token_acc": 0.8986765922249793, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.534635711589279, + "grad_norm": 0.3669694662094116, + "learning_rate": 9.681712368355308e-05, + "loss": 0.2626574277877808, + "memory(GiB)": 91.64, + "step": 5665, + "token_acc": 0.9059067972692193, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5351075877689694, + "grad_norm": 0.3157739043235779, + "learning_rate": 9.666115698229721e-05, + "loss": 0.260296106338501, + "memory(GiB)": 91.64, + "step": 5670, + "token_acc": 0.9010388190267906, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.5355794639486599, + "grad_norm": 0.2455492615699768, + "learning_rate": 9.650519841162666e-05, + "loss": 0.25946090221405027, + "memory(GiB)": 91.64, + "step": 5675, + "token_acc": 0.9096751160299893, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.5360513401283503, + "grad_norm": 0.3795280158519745, + "learning_rate": 9.63492483513241e-05, + "loss": 0.2626684904098511, + "memory(GiB)": 91.64, + "step": 5680, + "token_acc": 0.8940092165898618, + "train_speed(iter/s)": 0.13855 + }, + { + "epoch": 0.5365232163080408, + "grad_norm": 0.44789838790893555, + "learning_rate": 9.619330718115141e-05, + "loss": 0.26631550788879393, + "memory(GiB)": 91.64, + "step": 5685, + "token_acc": 0.919800634345265, + "train_speed(iter/s)": 0.13855 + }, + { + "epoch": 0.5369950924877313, + "grad_norm": 0.6232290267944336, + "learning_rate": 9.603737528084878e-05, + "loss": 0.260367751121521, + "memory(GiB)": 91.64, + "step": 5690, + "token_acc": 0.9210836277974087, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5374669686674217, + "grad_norm": 0.7171441316604614, + "learning_rate": 9.588145303013383e-05, + "loss": 0.26139035224914553, + "memory(GiB)": 91.64, + "step": 5695, + "token_acc": 0.9144079885877318, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5379388448471121, + "grad_norm": 0.8749316334724426, + "learning_rate": 9.572554080870074e-05, + "loss": 0.2580615520477295, + "memory(GiB)": 91.64, + "step": 5700, + "token_acc": 0.9133631713554987, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5384107210268025, + "grad_norm": 0.3951604664325714, + "learning_rate": 9.556963899621929e-05, + "loss": 0.26193459033966066, + "memory(GiB)": 91.64, + "step": 5705, + "token_acc": 0.9229357798165138, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.538882597206493, + "grad_norm": 0.9120936989784241, + "learning_rate": 9.541374797233381e-05, + "loss": 0.26309173107147216, + "memory(GiB)": 91.64, + "step": 5710, + "token_acc": 0.8990442054958184, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.5393544733861835, + "grad_norm": 0.33477917313575745, + "learning_rate": 9.52578681166624e-05, + "loss": 0.2714076995849609, + "memory(GiB)": 91.64, + "step": 5715, + "token_acc": 0.9238231098430814, + "train_speed(iter/s)": 0.138549 + }, + { + "epoch": 0.5398263495658739, + "grad_norm": 0.41128993034362793, + "learning_rate": 9.510199980879603e-05, + "loss": 0.26154122352600095, + "memory(GiB)": 91.64, + "step": 5720, + "token_acc": 0.8976274165202109, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5402982257455644, + "grad_norm": 0.41506433486938477, + "learning_rate": 9.494614342829742e-05, + "loss": 0.2645676612854004, + "memory(GiB)": 91.64, + "step": 5725, + "token_acc": 0.9031833727966324, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5407701019252548, + "grad_norm": 0.31078341603279114, + "learning_rate": 9.479029935470034e-05, + "loss": 0.26885018348693845, + "memory(GiB)": 91.64, + "step": 5730, + "token_acc": 0.9037735849056604, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5412419781049452, + "grad_norm": 0.24483777582645416, + "learning_rate": 9.46344679675086e-05, + "loss": 0.2597140073776245, + "memory(GiB)": 91.64, + "step": 5735, + "token_acc": 0.8987915407854985, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5417138542846357, + "grad_norm": 0.43131428956985474, + "learning_rate": 9.447864964619511e-05, + "loss": 0.2613609075546265, + "memory(GiB)": 91.64, + "step": 5740, + "token_acc": 0.9058786741713571, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5421857304643262, + "grad_norm": 0.3337703347206116, + "learning_rate": 9.432284477020086e-05, + "loss": 0.2597992420196533, + "memory(GiB)": 91.64, + "step": 5745, + "token_acc": 0.9099797707349966, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5426576066440166, + "grad_norm": 0.47471845149993896, + "learning_rate": 9.416705371893426e-05, + "loss": 0.26606192588806155, + "memory(GiB)": 91.64, + "step": 5750, + "token_acc": 0.9284017645062775, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5431294828237071, + "grad_norm": 0.2559286057949066, + "learning_rate": 9.401127687176991e-05, + "loss": 0.2615029811859131, + "memory(GiB)": 91.64, + "step": 5755, + "token_acc": 0.8925468678555099, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5436013590033975, + "grad_norm": 0.24070371687412262, + "learning_rate": 9.385551460804787e-05, + "loss": 0.26202309131622314, + "memory(GiB)": 91.64, + "step": 5760, + "token_acc": 0.8897075754087037, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5440732351830879, + "grad_norm": 0.31694546341896057, + "learning_rate": 9.369976730707275e-05, + "loss": 0.2607280731201172, + "memory(GiB)": 91.64, + "step": 5765, + "token_acc": 0.9033432638199271, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5445451113627784, + "grad_norm": 0.3087446093559265, + "learning_rate": 9.354403534811269e-05, + "loss": 0.2601593255996704, + "memory(GiB)": 91.64, + "step": 5770, + "token_acc": 0.8965087281795511, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5450169875424689, + "grad_norm": 0.2989960312843323, + "learning_rate": 9.33883191103984e-05, + "loss": 0.2624382972717285, + "memory(GiB)": 91.64, + "step": 5775, + "token_acc": 0.9023532593995132, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5454888637221593, + "grad_norm": 0.33407121896743774, + "learning_rate": 9.323261897312238e-05, + "loss": 0.2598482847213745, + "memory(GiB)": 91.64, + "step": 5780, + "token_acc": 0.8982714650042505, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5459607399018498, + "grad_norm": 0.5114820003509521, + "learning_rate": 9.307693531543792e-05, + "loss": 0.26101438999176024, + "memory(GiB)": 91.64, + "step": 5785, + "token_acc": 0.9108391608391608, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5464326160815403, + "grad_norm": 0.37111696600914, + "learning_rate": 9.29212685164581e-05, + "loss": 0.26595659255981446, + "memory(GiB)": 91.64, + "step": 5790, + "token_acc": 0.8989431968295905, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5469044922612306, + "grad_norm": 0.24424168467521667, + "learning_rate": 9.276561895525507e-05, + "loss": 0.2628682851791382, + "memory(GiB)": 91.64, + "step": 5795, + "token_acc": 0.8922287390029325, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5473763684409211, + "grad_norm": 0.5358099341392517, + "learning_rate": 9.260998701085897e-05, + "loss": 0.2708090305328369, + "memory(GiB)": 91.64, + "step": 5800, + "token_acc": 0.9153335934451814, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5478482446206115, + "grad_norm": 0.22064965963363647, + "learning_rate": 9.245437306225696e-05, + "loss": 0.2638305902481079, + "memory(GiB)": 91.64, + "step": 5805, + "token_acc": 0.8872464764523892, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.548320120800302, + "grad_norm": 0.38728588819503784, + "learning_rate": 9.229877748839242e-05, + "loss": 0.26497840881347656, + "memory(GiB)": 91.64, + "step": 5810, + "token_acc": 0.9151977131967604, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5487919969799925, + "grad_norm": 0.5592960119247437, + "learning_rate": 9.214320066816403e-05, + "loss": 0.2640266418457031, + "memory(GiB)": 91.64, + "step": 5815, + "token_acc": 0.9124352331606218, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.549263873159683, + "grad_norm": 0.39271870255470276, + "learning_rate": 9.198764298042472e-05, + "loss": 0.2661734580993652, + "memory(GiB)": 91.64, + "step": 5820, + "token_acc": 0.9188461538461539, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5497357493393733, + "grad_norm": 0.3182789981365204, + "learning_rate": 9.183210480398096e-05, + "loss": 0.2613093614578247, + "memory(GiB)": 91.64, + "step": 5825, + "token_acc": 0.8979396262577863, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5502076255190638, + "grad_norm": 1.0553961992263794, + "learning_rate": 9.167658651759154e-05, + "loss": 0.2589289665222168, + "memory(GiB)": 91.64, + "step": 5830, + "token_acc": 0.9174974217944311, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.5506795016987542, + "grad_norm": 0.27244263887405396, + "learning_rate": 9.152108849996696e-05, + "loss": 0.2616077423095703, + "memory(GiB)": 91.64, + "step": 5835, + "token_acc": 0.9227053140096618, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.5511513778784447, + "grad_norm": 0.3820434808731079, + "learning_rate": 9.136561112976828e-05, + "loss": 0.2627591133117676, + "memory(GiB)": 91.64, + "step": 5840, + "token_acc": 0.898062015503876, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5516232540581352, + "grad_norm": 0.22392357885837555, + "learning_rate": 9.121015478560628e-05, + "loss": 0.2643141508102417, + "memory(GiB)": 91.64, + "step": 5845, + "token_acc": 0.9144553072625698, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5520951302378256, + "grad_norm": 0.36011257767677307, + "learning_rate": 9.105471984604055e-05, + "loss": 0.26393847465515136, + "memory(GiB)": 91.64, + "step": 5850, + "token_acc": 0.9219836710009072, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.552567006417516, + "grad_norm": 0.2702450752258301, + "learning_rate": 9.089930668957862e-05, + "loss": 0.25948729515075686, + "memory(GiB)": 91.64, + "step": 5855, + "token_acc": 0.9018810371123538, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5530388825972065, + "grad_norm": 0.5624324679374695, + "learning_rate": 9.074391569467492e-05, + "loss": 0.26597013473510744, + "memory(GiB)": 91.64, + "step": 5860, + "token_acc": 0.9016867469879518, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5535107587768969, + "grad_norm": 0.6906477808952332, + "learning_rate": 9.058854723972986e-05, + "loss": 0.2574812412261963, + "memory(GiB)": 91.64, + "step": 5865, + "token_acc": 0.9240750966316952, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5539826349565874, + "grad_norm": 0.21588723361492157, + "learning_rate": 9.043320170308907e-05, + "loss": 0.2587729930877686, + "memory(GiB)": 91.64, + "step": 5870, + "token_acc": 0.9095205941931127, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5544545111362779, + "grad_norm": 0.6740458011627197, + "learning_rate": 9.027787946304223e-05, + "loss": 0.26523623466491697, + "memory(GiB)": 91.64, + "step": 5875, + "token_acc": 0.9050131926121372, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5549263873159683, + "grad_norm": 0.3452465534210205, + "learning_rate": 9.012258089782248e-05, + "loss": 0.26502606868743894, + "memory(GiB)": 91.64, + "step": 5880, + "token_acc": 0.9122871946706144, + "train_speed(iter/s)": 0.138551 + }, + { + "epoch": 0.5553982634956587, + "grad_norm": 0.3662663400173187, + "learning_rate": 8.996730638560519e-05, + "loss": 0.263625431060791, + "memory(GiB)": 91.64, + "step": 5885, + "token_acc": 0.9177910260433009, + "train_speed(iter/s)": 0.138552 + }, + { + "epoch": 0.5558701396753492, + "grad_norm": 0.36303842067718506, + "learning_rate": 8.981205630450713e-05, + "loss": 0.2587179183959961, + "memory(GiB)": 91.64, + "step": 5890, + "token_acc": 0.8908523908523909, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5563420158550396, + "grad_norm": 0.3700637221336365, + "learning_rate": 8.965683103258563e-05, + "loss": 0.2564589023590088, + "memory(GiB)": 91.64, + "step": 5895, + "token_acc": 0.9156102861282144, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5568138920347301, + "grad_norm": 0.4616394340991974, + "learning_rate": 8.95016309478376e-05, + "loss": 0.2621131896972656, + "memory(GiB)": 91.64, + "step": 5900, + "token_acc": 0.9194208372678627, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5572857682144206, + "grad_norm": 0.4272395670413971, + "learning_rate": 8.934645642819858e-05, + "loss": 0.26262176036834717, + "memory(GiB)": 91.64, + "step": 5905, + "token_acc": 0.901246719160105, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.557757644394111, + "grad_norm": 0.2230866253376007, + "learning_rate": 8.919130785154195e-05, + "loss": 0.26306858062744143, + "memory(GiB)": 91.64, + "step": 5910, + "token_acc": 0.9148486980999296, + "train_speed(iter/s)": 0.138555 + }, + { + "epoch": 0.5582295205738015, + "grad_norm": 0.5155897736549377, + "learning_rate": 8.903618559567779e-05, + "loss": 0.2663133144378662, + "memory(GiB)": 91.64, + "step": 5915, + "token_acc": 0.9127292940522512, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.5587013967534918, + "grad_norm": 0.23849859833717346, + "learning_rate": 8.88810900383522e-05, + "loss": 0.25657382011413576, + "memory(GiB)": 91.64, + "step": 5920, + "token_acc": 0.9311404857055026, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5591732729331823, + "grad_norm": 0.4121880531311035, + "learning_rate": 8.872602155724616e-05, + "loss": 0.25624220371246337, + "memory(GiB)": 91.64, + "step": 5925, + "token_acc": 0.9055555555555556, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5596451491128728, + "grad_norm": 0.4594009816646576, + "learning_rate": 8.857098052997477e-05, + "loss": 0.2594911098480225, + "memory(GiB)": 91.64, + "step": 5930, + "token_acc": 0.9104712041884817, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.5601170252925632, + "grad_norm": 0.306629478931427, + "learning_rate": 8.841596733408627e-05, + "loss": 0.2655156373977661, + "memory(GiB)": 91.64, + "step": 5935, + "token_acc": 0.9011486251305256, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5605889014722537, + "grad_norm": 0.5396053791046143, + "learning_rate": 8.826098234706117e-05, + "loss": 0.25810458660125735, + "memory(GiB)": 91.64, + "step": 5940, + "token_acc": 0.8991660348749052, + "train_speed(iter/s)": 0.138554 + }, + { + "epoch": 0.5610607776519442, + "grad_norm": 0.4360372722148895, + "learning_rate": 8.810602594631121e-05, + "loss": 0.26143407821655273, + "memory(GiB)": 91.64, + "step": 5945, + "token_acc": 0.9059360730593607, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.5615326538316345, + "grad_norm": 0.3832603394985199, + "learning_rate": 8.795109850917857e-05, + "loss": 0.25672688484191897, + "memory(GiB)": 91.64, + "step": 5950, + "token_acc": 0.9179431072210066, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.562004530011325, + "grad_norm": 0.6274591088294983, + "learning_rate": 8.779620041293486e-05, + "loss": 0.25992960929870607, + "memory(GiB)": 91.64, + "step": 5955, + "token_acc": 0.8999081726354453, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.5624764061910155, + "grad_norm": 0.7016596794128418, + "learning_rate": 8.764133203478027e-05, + "loss": 0.2578299522399902, + "memory(GiB)": 91.64, + "step": 5960, + "token_acc": 0.9278547539417105, + "train_speed(iter/s)": 0.138556 + }, + { + "epoch": 0.5629482823707059, + "grad_norm": 0.30035027861595154, + "learning_rate": 8.748649375184258e-05, + "loss": 0.2687530040740967, + "memory(GiB)": 91.64, + "step": 5965, + "token_acc": 0.9160012775471096, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.5634201585503964, + "grad_norm": 0.6111456155776978, + "learning_rate": 8.73316859411764e-05, + "loss": 0.25793159008026123, + "memory(GiB)": 91.64, + "step": 5970, + "token_acc": 0.9109185441941074, + "train_speed(iter/s)": 0.138558 + }, + { + "epoch": 0.5638920347300869, + "grad_norm": 0.3605799674987793, + "learning_rate": 8.7176908979762e-05, + "loss": 0.2613699436187744, + "memory(GiB)": 91.64, + "step": 5975, + "token_acc": 0.9057390189163194, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.5643639109097772, + "grad_norm": 0.22072555124759674, + "learning_rate": 8.702216324450458e-05, + "loss": 0.2566020965576172, + "memory(GiB)": 91.64, + "step": 5980, + "token_acc": 0.9135802469135802, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.5648357870894677, + "grad_norm": 0.5042877793312073, + "learning_rate": 8.686744911223332e-05, + "loss": 0.26034910678863527, + "memory(GiB)": 91.64, + "step": 5985, + "token_acc": 0.9065708418891171, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5653076632691582, + "grad_norm": 0.2110811173915863, + "learning_rate": 8.671276695970043e-05, + "loss": 0.2575195789337158, + "memory(GiB)": 91.64, + "step": 5990, + "token_acc": 0.8952899961074348, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5657795394488486, + "grad_norm": 0.44462695717811584, + "learning_rate": 8.655811716358014e-05, + "loss": 0.262726354598999, + "memory(GiB)": 91.64, + "step": 5995, + "token_acc": 0.8959136468774094, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5662514156285391, + "grad_norm": 0.2724670171737671, + "learning_rate": 8.640350010046811e-05, + "loss": 0.25894389152526853, + "memory(GiB)": 91.64, + "step": 6000, + "token_acc": 0.9075779036827195, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5667232918082296, + "grad_norm": 0.20576444268226624, + "learning_rate": 8.624891614688014e-05, + "loss": 0.2585927963256836, + "memory(GiB)": 91.64, + "step": 6005, + "token_acc": 0.9124197810494526, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.56719516798792, + "grad_norm": 0.2223316729068756, + "learning_rate": 8.609436567925137e-05, + "loss": 0.2674827575683594, + "memory(GiB)": 91.64, + "step": 6010, + "token_acc": 0.9234624145785877, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5676670441676104, + "grad_norm": 0.39043989777565, + "learning_rate": 8.593984907393551e-05, + "loss": 0.26184422969818116, + "memory(GiB)": 91.64, + "step": 6015, + "token_acc": 0.8997484728710026, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5681389203473008, + "grad_norm": 0.4472939968109131, + "learning_rate": 8.578536670720373e-05, + "loss": 0.2619999885559082, + "memory(GiB)": 91.64, + "step": 6020, + "token_acc": 0.9160671462829736, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5686107965269913, + "grad_norm": 0.28536486625671387, + "learning_rate": 8.56309189552438e-05, + "loss": 0.2595273494720459, + "memory(GiB)": 91.64, + "step": 6025, + "token_acc": 0.9195612431444241, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5690826727066818, + "grad_norm": 0.5671156644821167, + "learning_rate": 8.547650619415934e-05, + "loss": 0.26369786262512207, + "memory(GiB)": 91.64, + "step": 6030, + "token_acc": 0.9050097592713078, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5695545488863722, + "grad_norm": 0.20459724962711334, + "learning_rate": 8.532212879996864e-05, + "loss": 0.2596536636352539, + "memory(GiB)": 91.64, + "step": 6035, + "token_acc": 0.9215686274509803, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5700264250660627, + "grad_norm": 0.36021432280540466, + "learning_rate": 8.516778714860387e-05, + "loss": 0.25885491371154784, + "memory(GiB)": 91.64, + "step": 6040, + "token_acc": 0.9114039073148569, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5704983012457531, + "grad_norm": 0.21195665001869202, + "learning_rate": 8.501348161591018e-05, + "loss": 0.26376259326934814, + "memory(GiB)": 91.64, + "step": 6045, + "token_acc": 0.9074463609591923, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.5709701774254435, + "grad_norm": 0.4079072177410126, + "learning_rate": 8.485921257764476e-05, + "loss": 0.2624048233032227, + "memory(GiB)": 91.64, + "step": 6050, + "token_acc": 0.918562201628756, + "train_speed(iter/s)": 0.138561 + }, + { + "epoch": 0.571442053605134, + "grad_norm": 0.4321339726448059, + "learning_rate": 8.470498040947601e-05, + "loss": 0.26624159812927245, + "memory(GiB)": 91.64, + "step": 6055, + "token_acc": 0.8944233892799134, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5719139297848245, + "grad_norm": 0.586856484413147, + "learning_rate": 8.455078548698243e-05, + "loss": 0.2600421667098999, + "memory(GiB)": 91.64, + "step": 6060, + "token_acc": 0.8978065802592223, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5723858059645149, + "grad_norm": 0.2767311930656433, + "learning_rate": 8.439662818565186e-05, + "loss": 0.25961735248565676, + "memory(GiB)": 91.64, + "step": 6065, + "token_acc": 0.9028112449799197, + "train_speed(iter/s)": 0.138559 + }, + { + "epoch": 0.5728576821442054, + "grad_norm": 0.32214125990867615, + "learning_rate": 8.424250888088056e-05, + "loss": 0.2518421173095703, + "memory(GiB)": 91.64, + "step": 6070, + "token_acc": 0.9209470304975923, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5733295583238958, + "grad_norm": 0.32131657004356384, + "learning_rate": 8.408842794797225e-05, + "loss": 0.2633669376373291, + "memory(GiB)": 91.64, + "step": 6075, + "token_acc": 0.8879159369527145, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5738014345035862, + "grad_norm": 0.21564066410064697, + "learning_rate": 8.39343857621371e-05, + "loss": 0.2537181854248047, + "memory(GiB)": 91.64, + "step": 6080, + "token_acc": 0.9242782773308093, + "train_speed(iter/s)": 0.138561 + }, + { + "epoch": 0.5742733106832767, + "grad_norm": 0.6542291045188904, + "learning_rate": 8.378038269849113e-05, + "loss": 0.2556138277053833, + "memory(GiB)": 91.64, + "step": 6085, + "token_acc": 0.9143944197844007, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.5747451868629672, + "grad_norm": 0.375399112701416, + "learning_rate": 8.362641913205497e-05, + "loss": 0.26028482913970946, + "memory(GiB)": 91.64, + "step": 6090, + "token_acc": 0.9022589052997394, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.5752170630426576, + "grad_norm": 0.3971591293811798, + "learning_rate": 8.347249543775303e-05, + "loss": 0.25295219421386717, + "memory(GiB)": 91.64, + "step": 6095, + "token_acc": 0.9101796407185628, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.5756889392223481, + "grad_norm": 0.2439550906419754, + "learning_rate": 8.331861199041272e-05, + "loss": 0.2594885349273682, + "memory(GiB)": 91.64, + "step": 6100, + "token_acc": 0.9213917525773195, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5761608154020385, + "grad_norm": 0.4823835492134094, + "learning_rate": 8.31647691647634e-05, + "loss": 0.2540708541870117, + "memory(GiB)": 91.64, + "step": 6105, + "token_acc": 0.9006509078451524, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.5766326915817289, + "grad_norm": 0.40031829476356506, + "learning_rate": 8.301096733543545e-05, + "loss": 0.25965514183044436, + "memory(GiB)": 91.64, + "step": 6110, + "token_acc": 0.9109712230215827, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5771045677614194, + "grad_norm": 0.21735447645187378, + "learning_rate": 8.285720687695953e-05, + "loss": 0.25909032821655276, + "memory(GiB)": 91.64, + "step": 6115, + "token_acc": 0.8939785740672331, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5775764439411099, + "grad_norm": 0.8802348375320435, + "learning_rate": 8.270348816376553e-05, + "loss": 0.25847816467285156, + "memory(GiB)": 91.64, + "step": 6120, + "token_acc": 0.8921238124425376, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5780483201208003, + "grad_norm": 0.410442054271698, + "learning_rate": 8.25498115701816e-05, + "loss": 0.2583799362182617, + "memory(GiB)": 91.64, + "step": 6125, + "token_acc": 0.9224385572933291, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5785201963004908, + "grad_norm": 0.34100407361984253, + "learning_rate": 8.23961774704334e-05, + "loss": 0.2645753860473633, + "memory(GiB)": 91.64, + "step": 6130, + "token_acc": 0.9133333333333333, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5789920724801813, + "grad_norm": 0.45933786034584045, + "learning_rate": 8.224258623864311e-05, + "loss": 0.2588545322418213, + "memory(GiB)": 91.64, + "step": 6135, + "token_acc": 0.8987271721084671, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5794639486598716, + "grad_norm": 0.2021590918302536, + "learning_rate": 8.208903824882843e-05, + "loss": 0.2566136598587036, + "memory(GiB)": 91.64, + "step": 6140, + "token_acc": 0.9168081494057725, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5799358248395621, + "grad_norm": 0.46117132902145386, + "learning_rate": 8.193553387490194e-05, + "loss": 0.2587829351425171, + "memory(GiB)": 91.64, + "step": 6145, + "token_acc": 0.8906326630701324, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5804077010192525, + "grad_norm": 0.492724746465683, + "learning_rate": 8.17820734906698e-05, + "loss": 0.2539858341217041, + "memory(GiB)": 91.64, + "step": 6150, + "token_acc": 0.9108153078202995, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.580879577198943, + "grad_norm": 0.3408304452896118, + "learning_rate": 8.162865746983122e-05, + "loss": 0.261328125, + "memory(GiB)": 91.64, + "step": 6155, + "token_acc": 0.901056338028169, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5813514533786335, + "grad_norm": 0.2859143018722534, + "learning_rate": 8.147528618597729e-05, + "loss": 0.2623132228851318, + "memory(GiB)": 91.64, + "step": 6160, + "token_acc": 0.9008238276299113, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.5818233295583239, + "grad_norm": 0.4189459979534149, + "learning_rate": 8.132196001259011e-05, + "loss": 0.25851998329162595, + "memory(GiB)": 91.64, + "step": 6165, + "token_acc": 0.9078947368421053, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.5822952057380143, + "grad_norm": 0.5357767939567566, + "learning_rate": 8.116867932304204e-05, + "loss": 0.25238189697265623, + "memory(GiB)": 91.64, + "step": 6170, + "token_acc": 0.8951935914552737, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5827670819177048, + "grad_norm": 0.255214661359787, + "learning_rate": 8.101544449059466e-05, + "loss": 0.2591865062713623, + "memory(GiB)": 91.64, + "step": 6175, + "token_acc": 0.896551724137931, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5832389580973952, + "grad_norm": 0.42737099528312683, + "learning_rate": 8.086225588839782e-05, + "loss": 0.2616575241088867, + "memory(GiB)": 91.64, + "step": 6180, + "token_acc": 0.910024650780608, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5837108342770857, + "grad_norm": 0.28640568256378174, + "learning_rate": 8.070911388948885e-05, + "loss": 0.2526390790939331, + "memory(GiB)": 91.64, + "step": 6185, + "token_acc": 0.9003815175922001, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.5841827104567762, + "grad_norm": 0.4053497314453125, + "learning_rate": 8.055601886679156e-05, + "loss": 0.25293493270874023, + "memory(GiB)": 91.64, + "step": 6190, + "token_acc": 0.8978351690087353, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5846545866364666, + "grad_norm": 0.32031095027923584, + "learning_rate": 8.040297119311536e-05, + "loss": 0.2545334815979004, + "memory(GiB)": 91.64, + "step": 6195, + "token_acc": 0.908307210031348, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.585126462816157, + "grad_norm": 0.30353373289108276, + "learning_rate": 8.024997124115437e-05, + "loss": 0.2560997486114502, + "memory(GiB)": 91.64, + "step": 6200, + "token_acc": 0.9118942731277533, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5855983389958475, + "grad_norm": 0.43289270997047424, + "learning_rate": 8.009701938348654e-05, + "loss": 0.2583000183105469, + "memory(GiB)": 91.64, + "step": 6205, + "token_acc": 0.9062111801242236, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5860702151755379, + "grad_norm": 0.38920220732688904, + "learning_rate": 7.994411599257268e-05, + "loss": 0.2500426769256592, + "memory(GiB)": 91.64, + "step": 6210, + "token_acc": 0.8985663082437276, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5865420913552284, + "grad_norm": 0.2825988233089447, + "learning_rate": 7.97912614407555e-05, + "loss": 0.2535118579864502, + "memory(GiB)": 91.64, + "step": 6215, + "token_acc": 0.9290194783757015, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5870139675349189, + "grad_norm": 0.2607544958591461, + "learning_rate": 7.963845610025892e-05, + "loss": 0.2580404281616211, + "memory(GiB)": 91.64, + "step": 6220, + "token_acc": 0.9124552327894946, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5874858437146093, + "grad_norm": 0.6544619202613831, + "learning_rate": 7.948570034318685e-05, + "loss": 0.2521751880645752, + "memory(GiB)": 91.64, + "step": 6225, + "token_acc": 0.9135643988018828, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5879577198942998, + "grad_norm": 0.6524026393890381, + "learning_rate": 7.933299454152266e-05, + "loss": 0.25135116577148436, + "memory(GiB)": 91.64, + "step": 6230, + "token_acc": 0.9067930489731437, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5884295960739901, + "grad_norm": 0.3173801004886627, + "learning_rate": 7.91803390671279e-05, + "loss": 0.25176496505737306, + "memory(GiB)": 91.64, + "step": 6235, + "token_acc": 0.8964194373401535, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5889014722536806, + "grad_norm": 0.4476158618927002, + "learning_rate": 7.902773429174166e-05, + "loss": 0.2545851469039917, + "memory(GiB)": 91.64, + "step": 6240, + "token_acc": 0.922882427307206, + "train_speed(iter/s)": 0.138567 + }, + { + "epoch": 0.5893733484333711, + "grad_norm": 0.2693001925945282, + "learning_rate": 7.88751805869795e-05, + "loss": 0.2574381113052368, + "memory(GiB)": 91.64, + "step": 6245, + "token_acc": 0.8921568627450981, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.5898452246130615, + "grad_norm": 0.37155503034591675, + "learning_rate": 7.872267832433272e-05, + "loss": 0.25994248390197755, + "memory(GiB)": 91.64, + "step": 6250, + "token_acc": 0.9076329076329076, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.590317100792752, + "grad_norm": 0.6347530484199524, + "learning_rate": 7.85702278751672e-05, + "loss": 0.2505256175994873, + "memory(GiB)": 91.64, + "step": 6255, + "token_acc": 0.9176829268292683, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5907889769724425, + "grad_norm": 0.2157605141401291, + "learning_rate": 7.841782961072284e-05, + "loss": 0.25982060432434084, + "memory(GiB)": 91.64, + "step": 6260, + "token_acc": 0.9095300834431269, + "train_speed(iter/s)": 0.138565 + }, + { + "epoch": 0.5912608531521328, + "grad_norm": 0.6124857664108276, + "learning_rate": 7.826548390211225e-05, + "loss": 0.25920767784118653, + "memory(GiB)": 91.64, + "step": 6265, + "token_acc": 0.9093830334190232, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5917327293318233, + "grad_norm": 0.2416924685239792, + "learning_rate": 7.811319112032027e-05, + "loss": 0.25081772804260255, + "memory(GiB)": 91.64, + "step": 6270, + "token_acc": 0.9212386401884888, + "train_speed(iter/s)": 0.138567 + }, + { + "epoch": 0.5922046055115138, + "grad_norm": 0.8983920216560364, + "learning_rate": 7.796095163620267e-05, + "loss": 0.2576131343841553, + "memory(GiB)": 91.64, + "step": 6275, + "token_acc": 0.8962986598596043, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.5926764816912042, + "grad_norm": 0.4004858732223511, + "learning_rate": 7.780876582048553e-05, + "loss": 0.25931851863861083, + "memory(GiB)": 91.64, + "step": 6280, + "token_acc": 0.9084588644264194, + "train_speed(iter/s)": 0.138568 + }, + { + "epoch": 0.5931483578708947, + "grad_norm": 0.39955568313598633, + "learning_rate": 7.76566340437642e-05, + "loss": 0.25876388549804685, + "memory(GiB)": 91.64, + "step": 6285, + "token_acc": 0.9037249283667622, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.5936202340505852, + "grad_norm": 0.5433376431465149, + "learning_rate": 7.750455667650251e-05, + "loss": 0.2523482799530029, + "memory(GiB)": 91.64, + "step": 6290, + "token_acc": 0.9070858751759737, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.5940921102302755, + "grad_norm": 0.4515962600708008, + "learning_rate": 7.735253408903174e-05, + "loss": 0.2509950637817383, + "memory(GiB)": 91.64, + "step": 6295, + "token_acc": 0.9048223350253807, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.594563986409966, + "grad_norm": 0.3393830358982086, + "learning_rate": 7.72005666515497e-05, + "loss": 0.25605719089508056, + "memory(GiB)": 91.64, + "step": 6300, + "token_acc": 0.9163781624500665, + "train_speed(iter/s)": 0.138569 + }, + { + "epoch": 0.5950358625896565, + "grad_norm": 0.24305680394172668, + "learning_rate": 7.704865473412008e-05, + "loss": 0.2521644592285156, + "memory(GiB)": 91.64, + "step": 6305, + "token_acc": 0.9120942257971847, + "train_speed(iter/s)": 0.138571 + }, + { + "epoch": 0.5955077387693469, + "grad_norm": 0.5273348689079285, + "learning_rate": 7.689679870667121e-05, + "loss": 0.25645806789398196, + "memory(GiB)": 91.64, + "step": 6310, + "token_acc": 0.9080547112462006, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.5959796149490374, + "grad_norm": 0.826224148273468, + "learning_rate": 7.674499893899533e-05, + "loss": 0.25773797035217283, + "memory(GiB)": 91.64, + "step": 6315, + "token_acc": 0.9154564315352697, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.5964514911287279, + "grad_norm": 0.3442542552947998, + "learning_rate": 7.659325580074782e-05, + "loss": 0.2533260345458984, + "memory(GiB)": 91.64, + "step": 6320, + "token_acc": 0.9165663858804654, + "train_speed(iter/s)": 0.138571 + }, + { + "epoch": 0.5969233673084182, + "grad_norm": 0.5301949381828308, + "learning_rate": 7.644156966144603e-05, + "loss": 0.2529233455657959, + "memory(GiB)": 91.64, + "step": 6325, + "token_acc": 0.9026500811249324, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.5973952434881087, + "grad_norm": 0.34612637758255005, + "learning_rate": 7.628994089046851e-05, + "loss": 0.25222182273864746, + "memory(GiB)": 91.64, + "step": 6330, + "token_acc": 0.9079539769884942, + "train_speed(iter/s)": 0.138572 + }, + { + "epoch": 0.5978671196677992, + "grad_norm": 0.5262756943702698, + "learning_rate": 7.61383698570542e-05, + "loss": 0.25571882724761963, + "memory(GiB)": 91.64, + "step": 6335, + "token_acc": 0.9050715214564369, + "train_speed(iter/s)": 0.138573 + }, + { + "epoch": 0.5983389958474896, + "grad_norm": 0.37142181396484375, + "learning_rate": 7.598685693030136e-05, + "loss": 0.2572377920150757, + "memory(GiB)": 91.64, + "step": 6340, + "token_acc": 0.9146341463414634, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.5988108720271801, + "grad_norm": 0.4435655176639557, + "learning_rate": 7.583540247916672e-05, + "loss": 0.24800877571105956, + "memory(GiB)": 91.64, + "step": 6345, + "token_acc": 0.9018944519621109, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.5992827482068706, + "grad_norm": 0.7729746103286743, + "learning_rate": 7.568400687246474e-05, + "loss": 0.25422685146331786, + "memory(GiB)": 91.64, + "step": 6350, + "token_acc": 0.9128616242593238, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.599754624386561, + "grad_norm": 0.27863532304763794, + "learning_rate": 7.553267047886651e-05, + "loss": 0.25974535942077637, + "memory(GiB)": 91.64, + "step": 6355, + "token_acc": 0.9166051660516605, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.6002265005662514, + "grad_norm": 0.23578110337257385, + "learning_rate": 7.53813936668989e-05, + "loss": 0.25558838844299314, + "memory(GiB)": 91.64, + "step": 6360, + "token_acc": 0.9315726290516206, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.6006983767459418, + "grad_norm": 0.6068395972251892, + "learning_rate": 7.523017680494372e-05, + "loss": 0.25866079330444336, + "memory(GiB)": 91.64, + "step": 6365, + "token_acc": 0.9188640973630832, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.6011702529256323, + "grad_norm": 0.32254558801651, + "learning_rate": 7.507902026123678e-05, + "loss": 0.2493062973022461, + "memory(GiB)": 91.64, + "step": 6370, + "token_acc": 0.9166666666666666, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.6016421291053228, + "grad_norm": 0.5741229057312012, + "learning_rate": 7.492792440386709e-05, + "loss": 0.26273245811462403, + "memory(GiB)": 91.64, + "step": 6375, + "token_acc": 0.9025764895330113, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.6021140052850132, + "grad_norm": 0.21148309111595154, + "learning_rate": 7.477688960077575e-05, + "loss": 0.26000070571899414, + "memory(GiB)": 91.64, + "step": 6380, + "token_acc": 0.8957715133531158, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.6025858814647037, + "grad_norm": 0.31708240509033203, + "learning_rate": 7.462591621975523e-05, + "loss": 0.2541205406188965, + "memory(GiB)": 91.64, + "step": 6385, + "token_acc": 0.9128664495114006, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6030577576443941, + "grad_norm": 0.5386173725128174, + "learning_rate": 7.447500462844848e-05, + "loss": 0.2534413576126099, + "memory(GiB)": 91.64, + "step": 6390, + "token_acc": 0.9030408773678963, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6035296338240845, + "grad_norm": 0.28694167733192444, + "learning_rate": 7.432415519434791e-05, + "loss": 0.2476402759552002, + "memory(GiB)": 91.64, + "step": 6395, + "token_acc": 0.9071058475203553, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.604001510003775, + "grad_norm": 0.3457031548023224, + "learning_rate": 7.417336828479462e-05, + "loss": 0.2557513236999512, + "memory(GiB)": 91.64, + "step": 6400, + "token_acc": 0.8938704028021016, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.6044733861834655, + "grad_norm": 0.3015283942222595, + "learning_rate": 7.402264426697742e-05, + "loss": 0.25288589000701905, + "memory(GiB)": 91.64, + "step": 6405, + "token_acc": 0.9166666666666666, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6049452623631559, + "grad_norm": 0.340631365776062, + "learning_rate": 7.387198350793201e-05, + "loss": 0.2512622594833374, + "memory(GiB)": 91.64, + "step": 6410, + "token_acc": 0.92037691401649, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6054171385428464, + "grad_norm": 1.1782653331756592, + "learning_rate": 7.372138637453998e-05, + "loss": 0.25229225158691404, + "memory(GiB)": 91.64, + "step": 6415, + "token_acc": 0.908903403231351, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.6058890147225368, + "grad_norm": 0.2346450537443161, + "learning_rate": 7.357085323352806e-05, + "loss": 0.25162057876586913, + "memory(GiB)": 91.64, + "step": 6420, + "token_acc": 0.912405513561583, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6063608909022272, + "grad_norm": 0.28029438853263855, + "learning_rate": 7.342038445146709e-05, + "loss": 0.24912467002868652, + "memory(GiB)": 91.64, + "step": 6425, + "token_acc": 0.9149034038638455, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.6068327670819177, + "grad_norm": 0.7123743295669556, + "learning_rate": 7.326998039477118e-05, + "loss": 0.2548022985458374, + "memory(GiB)": 91.64, + "step": 6430, + "token_acc": 0.9224393132030787, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6073046432616082, + "grad_norm": 0.26868027448654175, + "learning_rate": 7.311964142969688e-05, + "loss": 0.2451089382171631, + "memory(GiB)": 91.64, + "step": 6435, + "token_acc": 0.9096236890808143, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.6077765194412986, + "grad_norm": 0.2344752699136734, + "learning_rate": 7.296936792234221e-05, + "loss": 0.24809615612030028, + "memory(GiB)": 91.64, + "step": 6440, + "token_acc": 0.9250493096646942, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6082483956209891, + "grad_norm": 0.3093951642513275, + "learning_rate": 7.281916023864577e-05, + "loss": 0.25421953201293945, + "memory(GiB)": 91.64, + "step": 6445, + "token_acc": 0.9148073022312373, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6087202718006794, + "grad_norm": 0.9016293883323669, + "learning_rate": 7.266901874438585e-05, + "loss": 0.2528842926025391, + "memory(GiB)": 91.64, + "step": 6450, + "token_acc": 0.9130286493860846, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6091921479803699, + "grad_norm": 0.45874106884002686, + "learning_rate": 7.251894380517967e-05, + "loss": 0.25236220359802247, + "memory(GiB)": 91.64, + "step": 6455, + "token_acc": 0.9134799235181644, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.6096640241600604, + "grad_norm": 0.3354821503162384, + "learning_rate": 7.236893578648218e-05, + "loss": 0.2504927158355713, + "memory(GiB)": 91.64, + "step": 6460, + "token_acc": 0.9105378704720087, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.6101359003397508, + "grad_norm": 0.3122103214263916, + "learning_rate": 7.221899505358561e-05, + "loss": 0.24795224666595458, + "memory(GiB)": 91.64, + "step": 6465, + "token_acc": 0.9104912572855953, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.6106077765194413, + "grad_norm": 0.28339579701423645, + "learning_rate": 7.206912197161815e-05, + "loss": 0.2503954887390137, + "memory(GiB)": 91.64, + "step": 6470, + "token_acc": 0.9248035914702581, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.6110796526991318, + "grad_norm": 0.9332698583602905, + "learning_rate": 7.191931690554334e-05, + "loss": 0.2518021583557129, + "memory(GiB)": 91.64, + "step": 6475, + "token_acc": 0.9148079306071871, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.6115515288788222, + "grad_norm": 0.2940748929977417, + "learning_rate": 7.176958022015902e-05, + "loss": 0.250186824798584, + "memory(GiB)": 91.64, + "step": 6480, + "token_acc": 0.9148569458807307, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.6120234050585126, + "grad_norm": 0.5708451867103577, + "learning_rate": 7.161991228009663e-05, + "loss": 0.24937090873718262, + "memory(GiB)": 91.64, + "step": 6485, + "token_acc": 0.9102803738317757, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6124952812382031, + "grad_norm": 0.5467776656150818, + "learning_rate": 7.147031344982007e-05, + "loss": 0.2573434829711914, + "memory(GiB)": 91.64, + "step": 6490, + "token_acc": 0.9211538461538461, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.6129671574178935, + "grad_norm": 0.48039039969444275, + "learning_rate": 7.132078409362506e-05, + "loss": 0.2514265298843384, + "memory(GiB)": 91.64, + "step": 6495, + "token_acc": 0.9051139864448552, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.613439033597584, + "grad_norm": 0.5072046518325806, + "learning_rate": 7.117132457563807e-05, + "loss": 0.24883434772491456, + "memory(GiB)": 91.64, + "step": 6500, + "token_acc": 0.9116967175219602, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6139109097772745, + "grad_norm": 0.6466028094291687, + "learning_rate": 7.102193525981555e-05, + "loss": 0.2516045570373535, + "memory(GiB)": 91.64, + "step": 6505, + "token_acc": 0.9102815979043877, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.6143827859569649, + "grad_norm": 0.4448561370372772, + "learning_rate": 7.087261650994295e-05, + "loss": 0.24750699996948242, + "memory(GiB)": 91.64, + "step": 6510, + "token_acc": 0.9200743494423792, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.6148546621366553, + "grad_norm": 0.3403247594833374, + "learning_rate": 7.072336868963387e-05, + "loss": 0.2533865451812744, + "memory(GiB)": 91.64, + "step": 6515, + "token_acc": 0.9022582921665491, + "train_speed(iter/s)": 0.138579 + }, + { + "epoch": 0.6153265383163458, + "grad_norm": 0.25941842794418335, + "learning_rate": 7.057419216232925e-05, + "loss": 0.25099682807922363, + "memory(GiB)": 91.64, + "step": 6520, + "token_acc": 0.916248552682362, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.6157984144960362, + "grad_norm": 0.39571791887283325, + "learning_rate": 7.042508729129644e-05, + "loss": 0.2497119903564453, + "memory(GiB)": 91.64, + "step": 6525, + "token_acc": 0.9218089602704987, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.6162702906757267, + "grad_norm": 0.2697203457355499, + "learning_rate": 7.027605443962821e-05, + "loss": 0.2517711639404297, + "memory(GiB)": 91.64, + "step": 6530, + "token_acc": 0.9127028539451595, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.6167421668554172, + "grad_norm": 0.4376463294029236, + "learning_rate": 7.012709397024195e-05, + "loss": 0.2458188056945801, + "memory(GiB)": 91.64, + "step": 6535, + "token_acc": 0.9233644859813084, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6172140430351076, + "grad_norm": 0.9600175023078918, + "learning_rate": 6.997820624587888e-05, + "loss": 0.2551449775695801, + "memory(GiB)": 91.64, + "step": 6540, + "token_acc": 0.9109266943291839, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.617685919214798, + "grad_norm": 0.2927698493003845, + "learning_rate": 6.982939162910297e-05, + "loss": 0.2430652618408203, + "memory(GiB)": 91.64, + "step": 6545, + "token_acc": 0.9067005937234945, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6181577953944885, + "grad_norm": 0.2956832945346832, + "learning_rate": 6.968065048230028e-05, + "loss": 0.2454047679901123, + "memory(GiB)": 91.64, + "step": 6550, + "token_acc": 0.9283831282952548, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6186296715741789, + "grad_norm": 0.41990306973457336, + "learning_rate": 6.953198316767784e-05, + "loss": 0.25021138191223147, + "memory(GiB)": 91.64, + "step": 6555, + "token_acc": 0.9067796610169492, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6191015477538694, + "grad_norm": 0.31326499581336975, + "learning_rate": 6.938339004726297e-05, + "loss": 0.25416412353515627, + "memory(GiB)": 91.64, + "step": 6560, + "token_acc": 0.9209631728045325, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.6195734239335599, + "grad_norm": 0.5134010910987854, + "learning_rate": 6.923487148290228e-05, + "loss": 0.24505879878997802, + "memory(GiB)": 91.64, + "step": 6565, + "token_acc": 0.9120978704123244, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6200453001132503, + "grad_norm": 0.5389367341995239, + "learning_rate": 6.908642783626083e-05, + "loss": 0.24259617328643798, + "memory(GiB)": 91.64, + "step": 6570, + "token_acc": 0.9113731456827691, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6205171762929408, + "grad_norm": 0.29177892208099365, + "learning_rate": 6.893805946882122e-05, + "loss": 0.25206589698791504, + "memory(GiB)": 91.64, + "step": 6575, + "token_acc": 0.9010067114093959, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6209890524726311, + "grad_norm": 0.37285518646240234, + "learning_rate": 6.87897667418828e-05, + "loss": 0.2487583875656128, + "memory(GiB)": 91.64, + "step": 6580, + "token_acc": 0.9034863945578231, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6214609286523216, + "grad_norm": 0.3356618285179138, + "learning_rate": 6.864155001656068e-05, + "loss": 0.24705860614776612, + "memory(GiB)": 91.64, + "step": 6585, + "token_acc": 0.90715667311412, + "train_speed(iter/s)": 0.138581 + }, + { + "epoch": 0.6219328048320121, + "grad_norm": 0.44045567512512207, + "learning_rate": 6.849340965378488e-05, + "loss": 0.24823305606842042, + "memory(GiB)": 91.64, + "step": 6590, + "token_acc": 0.9090614886731392, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6224046810117025, + "grad_norm": 0.22942085564136505, + "learning_rate": 6.83453460142995e-05, + "loss": 0.24726405143737792, + "memory(GiB)": 91.64, + "step": 6595, + "token_acc": 0.9070940932027308, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.622876557191393, + "grad_norm": 0.2592119872570038, + "learning_rate": 6.819735945866177e-05, + "loss": 0.25738024711608887, + "memory(GiB)": 91.64, + "step": 6600, + "token_acc": 0.9033684926845866, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.6233484333710835, + "grad_norm": 0.224277526140213, + "learning_rate": 6.80494503472412e-05, + "loss": 0.2494365930557251, + "memory(GiB)": 91.64, + "step": 6605, + "token_acc": 0.9118002416431735, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6238203095507738, + "grad_norm": 0.4567587971687317, + "learning_rate": 6.790161904021884e-05, + "loss": 0.24388408660888672, + "memory(GiB)": 91.64, + "step": 6610, + "token_acc": 0.919981498612396, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6242921857304643, + "grad_norm": 0.3329204320907593, + "learning_rate": 6.775386589758612e-05, + "loss": 0.2464517116546631, + "memory(GiB)": 91.64, + "step": 6615, + "token_acc": 0.917174959871589, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6247640619101548, + "grad_norm": 0.20031793415546417, + "learning_rate": 6.760619127914417e-05, + "loss": 0.24638218879699708, + "memory(GiB)": 91.64, + "step": 6620, + "token_acc": 0.9080194722008711, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.6252359380898452, + "grad_norm": 0.362870991230011, + "learning_rate": 6.745859554450296e-05, + "loss": 0.24988138675689697, + "memory(GiB)": 91.64, + "step": 6625, + "token_acc": 0.9098639455782312, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6257078142695357, + "grad_norm": 1.1485202312469482, + "learning_rate": 6.731107905308025e-05, + "loss": 0.24473962783813477, + "memory(GiB)": 91.64, + "step": 6630, + "token_acc": 0.9165175909361956, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.6261796904492262, + "grad_norm": 0.2645990252494812, + "learning_rate": 6.716364216410095e-05, + "loss": 0.25117623805999756, + "memory(GiB)": 91.64, + "step": 6635, + "token_acc": 0.9164531009738596, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.6266515666289165, + "grad_norm": 0.2435101419687271, + "learning_rate": 6.70162852365961e-05, + "loss": 0.2497401237487793, + "memory(GiB)": 91.64, + "step": 6640, + "token_acc": 0.9139902014153511, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.627123442808607, + "grad_norm": 0.7363473773002625, + "learning_rate": 6.686900862940199e-05, + "loss": 0.2504051685333252, + "memory(GiB)": 91.64, + "step": 6645, + "token_acc": 0.9123672230652504, + "train_speed(iter/s)": 0.138584 + }, + { + "epoch": 0.6275953189882975, + "grad_norm": 0.4245203137397766, + "learning_rate": 6.672181270115929e-05, + "loss": 0.24365406036376952, + "memory(GiB)": 91.64, + "step": 6650, + "token_acc": 0.9070602313522138, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.6280671951679879, + "grad_norm": 0.38381725549697876, + "learning_rate": 6.657469781031229e-05, + "loss": 0.2454462766647339, + "memory(GiB)": 91.64, + "step": 6655, + "token_acc": 0.911594602038006, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.6285390713476784, + "grad_norm": 0.20383363962173462, + "learning_rate": 6.64276643151079e-05, + "loss": 0.24686357975006104, + "memory(GiB)": 91.64, + "step": 6660, + "token_acc": 0.9073148568832349, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.6290109475273689, + "grad_norm": 0.221710205078125, + "learning_rate": 6.628071257359473e-05, + "loss": 0.24501872062683105, + "memory(GiB)": 91.64, + "step": 6665, + "token_acc": 0.9191333536771438, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.6294828237070592, + "grad_norm": 0.6653693914413452, + "learning_rate": 6.613384294362248e-05, + "loss": 0.24365825653076173, + "memory(GiB)": 91.64, + "step": 6670, + "token_acc": 0.9140116478245974, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.6299546998867497, + "grad_norm": 0.5823807716369629, + "learning_rate": 6.598705578284081e-05, + "loss": 0.2461719036102295, + "memory(GiB)": 91.64, + "step": 6675, + "token_acc": 0.9245418613007546, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.6304265760664401, + "grad_norm": 0.34033286571502686, + "learning_rate": 6.58403514486985e-05, + "loss": 0.2465500831604004, + "memory(GiB)": 91.64, + "step": 6680, + "token_acc": 0.904896090172596, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.6308984522461306, + "grad_norm": 0.33391931653022766, + "learning_rate": 6.569373029844273e-05, + "loss": 0.2485738754272461, + "memory(GiB)": 91.64, + "step": 6685, + "token_acc": 0.8928571428571429, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.6313703284258211, + "grad_norm": 0.3390878736972809, + "learning_rate": 6.554719268911804e-05, + "loss": 0.2519662380218506, + "memory(GiB)": 91.64, + "step": 6690, + "token_acc": 0.8982584784601283, + "train_speed(iter/s)": 0.138586 + }, + { + "epoch": 0.6318422046055115, + "grad_norm": 0.2536347508430481, + "learning_rate": 6.540073897756557e-05, + "loss": 0.24952611923217774, + "memory(GiB)": 91.64, + "step": 6695, + "token_acc": 0.8970688479890934, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.632314080785202, + "grad_norm": 0.2514791786670685, + "learning_rate": 6.52543695204222e-05, + "loss": 0.249678373336792, + "memory(GiB)": 91.64, + "step": 6700, + "token_acc": 0.9129193433261956, + "train_speed(iter/s)": 0.138588 + }, + { + "epoch": 0.6327859569648924, + "grad_norm": 0.30157020688056946, + "learning_rate": 6.510808467411955e-05, + "loss": 0.24402194023132323, + "memory(GiB)": 91.64, + "step": 6705, + "token_acc": 0.8997335109926715, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.6332578331445828, + "grad_norm": 0.3998345732688904, + "learning_rate": 6.496188479488328e-05, + "loss": 0.24702603816986085, + "memory(GiB)": 91.64, + "step": 6710, + "token_acc": 0.9077490774907749, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.6337297093242733, + "grad_norm": 0.7433375716209412, + "learning_rate": 6.481577023873204e-05, + "loss": 0.2497015953063965, + "memory(GiB)": 91.64, + "step": 6715, + "token_acc": 0.9197431781701445, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.6342015855039638, + "grad_norm": 0.5230844020843506, + "learning_rate": 6.466974136147679e-05, + "loss": 0.24684548377990723, + "memory(GiB)": 91.64, + "step": 6720, + "token_acc": 0.9207169941399518, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.6346734616836542, + "grad_norm": 0.6465916633605957, + "learning_rate": 6.45237985187199e-05, + "loss": 0.24689052104949952, + "memory(GiB)": 91.64, + "step": 6725, + "token_acc": 0.9153567110036276, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.6351453378633447, + "grad_norm": 0.23393841087818146, + "learning_rate": 6.43779420658541e-05, + "loss": 0.23820953369140624, + "memory(GiB)": 91.64, + "step": 6730, + "token_acc": 0.9156939040207522, + "train_speed(iter/s)": 0.13859 + }, + { + "epoch": 0.6356172140430351, + "grad_norm": 0.31033238768577576, + "learning_rate": 6.42321723580618e-05, + "loss": 0.2415005683898926, + "memory(GiB)": 91.64, + "step": 6735, + "token_acc": 0.9279962103268593, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.6360890902227255, + "grad_norm": 0.4967711865901947, + "learning_rate": 6.408648975031423e-05, + "loss": 0.25005640983581545, + "memory(GiB)": 91.64, + "step": 6740, + "token_acc": 0.9208301306687163, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.636560966402416, + "grad_norm": 0.5336925983428955, + "learning_rate": 6.394089459737043e-05, + "loss": 0.25364394187927247, + "memory(GiB)": 91.64, + "step": 6745, + "token_acc": 0.9160662824207493, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6370328425821065, + "grad_norm": 0.4167347848415375, + "learning_rate": 6.379538725377649e-05, + "loss": 0.242067289352417, + "memory(GiB)": 91.64, + "step": 6750, + "token_acc": 0.9000430848772081, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6375047187617969, + "grad_norm": 0.7526382207870483, + "learning_rate": 6.364996807386474e-05, + "loss": 0.251052188873291, + "memory(GiB)": 91.64, + "step": 6755, + "token_acc": 0.913547532295462, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6379765949414874, + "grad_norm": 0.31620025634765625, + "learning_rate": 6.350463741175281e-05, + "loss": 0.24103710651397706, + "memory(GiB)": 91.64, + "step": 6760, + "token_acc": 0.9160739687055477, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6384484711211778, + "grad_norm": 0.5015328526496887, + "learning_rate": 6.335939562134268e-05, + "loss": 0.24773306846618653, + "memory(GiB)": 91.64, + "step": 6765, + "token_acc": 0.9099279423538831, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6389203473008682, + "grad_norm": 0.2772028148174286, + "learning_rate": 6.321424305631998e-05, + "loss": 0.24385676383972169, + "memory(GiB)": 91.64, + "step": 6770, + "token_acc": 0.9089005235602095, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6393922234805587, + "grad_norm": 0.6564515829086304, + "learning_rate": 6.306918007015307e-05, + "loss": 0.2472226619720459, + "memory(GiB)": 91.64, + "step": 6775, + "token_acc": 0.9095378564405113, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6398640996602492, + "grad_norm": 0.29450663924217224, + "learning_rate": 6.292420701609214e-05, + "loss": 0.25525641441345215, + "memory(GiB)": 91.64, + "step": 6780, + "token_acc": 0.8881748071979434, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6403359758399396, + "grad_norm": 0.33771711587905884, + "learning_rate": 6.277932424716844e-05, + "loss": 0.24898838996887207, + "memory(GiB)": 91.64, + "step": 6785, + "token_acc": 0.9064565327910523, + "train_speed(iter/s)": 0.138592 + }, + { + "epoch": 0.6408078520196301, + "grad_norm": 0.4698493182659149, + "learning_rate": 6.263453211619328e-05, + "loss": 0.24726357460021972, + "memory(GiB)": 91.64, + "step": 6790, + "token_acc": 0.900737379466818, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6412797281993206, + "grad_norm": 0.27207568287849426, + "learning_rate": 6.248983097575734e-05, + "loss": 0.2391824960708618, + "memory(GiB)": 91.64, + "step": 6795, + "token_acc": 0.9296657381615598, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6417516043790109, + "grad_norm": 0.2685665190219879, + "learning_rate": 6.234522117822964e-05, + "loss": 0.24474749565124512, + "memory(GiB)": 91.64, + "step": 6800, + "token_acc": 0.9061456245824984, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6422234805587014, + "grad_norm": 0.37602898478507996, + "learning_rate": 6.220070307575681e-05, + "loss": 0.24138176441192627, + "memory(GiB)": 91.64, + "step": 6805, + "token_acc": 0.9087617668356264, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6426953567383918, + "grad_norm": 0.3656831681728363, + "learning_rate": 6.205627702026217e-05, + "loss": 0.24532785415649414, + "memory(GiB)": 91.64, + "step": 6810, + "token_acc": 0.915625, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6431672329180823, + "grad_norm": 0.7820502519607544, + "learning_rate": 6.191194336344499e-05, + "loss": 0.24389877319335937, + "memory(GiB)": 91.64, + "step": 6815, + "token_acc": 0.9069687607277721, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6436391090977728, + "grad_norm": 0.28132694959640503, + "learning_rate": 6.176770245677937e-05, + "loss": 0.23709509372711182, + "memory(GiB)": 91.64, + "step": 6820, + "token_acc": 0.9139106286291341, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6441109852774632, + "grad_norm": 0.29679086804389954, + "learning_rate": 6.162355465151366e-05, + "loss": 0.2428110122680664, + "memory(GiB)": 91.64, + "step": 6825, + "token_acc": 0.9155524278676987, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6445828614571536, + "grad_norm": 0.6224273443222046, + "learning_rate": 6.147950029866946e-05, + "loss": 0.25038561820983884, + "memory(GiB)": 91.64, + "step": 6830, + "token_acc": 0.90463645943098, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.6450547376368441, + "grad_norm": 0.21147441864013672, + "learning_rate": 6.13355397490408e-05, + "loss": 0.24030847549438478, + "memory(GiB)": 91.64, + "step": 6835, + "token_acc": 0.9303013993541442, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6455266138165345, + "grad_norm": 0.3700733184814453, + "learning_rate": 6.119167335319326e-05, + "loss": 0.24275476932525636, + "memory(GiB)": 91.64, + "step": 6840, + "token_acc": 0.9167386920195909, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.645998489996225, + "grad_norm": 0.3230724632740021, + "learning_rate": 6.104790146146326e-05, + "loss": 0.24337000846862794, + "memory(GiB)": 91.64, + "step": 6845, + "token_acc": 0.9195822454308094, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6464703661759155, + "grad_norm": 0.5377890467643738, + "learning_rate": 6.0904224423956935e-05, + "loss": 0.24264154434204102, + "memory(GiB)": 91.64, + "step": 6850, + "token_acc": 0.9083885209713024, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6469422423556059, + "grad_norm": 0.2459423542022705, + "learning_rate": 6.07606425905495e-05, + "loss": 0.24474921226501464, + "memory(GiB)": 91.64, + "step": 6855, + "token_acc": 0.9048361934477379, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6474141185352963, + "grad_norm": 0.21430309116840363, + "learning_rate": 6.061715631088436e-05, + "loss": 0.24547672271728516, + "memory(GiB)": 91.64, + "step": 6860, + "token_acc": 0.9136972866949984, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6478859947149868, + "grad_norm": 0.1994255632162094, + "learning_rate": 6.047376593437214e-05, + "loss": 0.24437365531921387, + "memory(GiB)": 91.64, + "step": 6865, + "token_acc": 0.8946328613763582, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6483578708946772, + "grad_norm": 0.21017318964004517, + "learning_rate": 6.033047181019007e-05, + "loss": 0.2440267562866211, + "memory(GiB)": 91.64, + "step": 6870, + "token_acc": 0.9259259259259259, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6488297470743677, + "grad_norm": 0.21352751553058624, + "learning_rate": 6.0187274287280915e-05, + "loss": 0.2388768672943115, + "memory(GiB)": 91.64, + "step": 6875, + "token_acc": 0.9148382298252138, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6493016232540582, + "grad_norm": 0.37781575322151184, + "learning_rate": 6.004417371435216e-05, + "loss": 0.2506894111633301, + "memory(GiB)": 91.64, + "step": 6880, + "token_acc": 0.916063059224542, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.6497734994337486, + "grad_norm": 0.5697845220565796, + "learning_rate": 5.990117043987524e-05, + "loss": 0.24643681049346924, + "memory(GiB)": 91.64, + "step": 6885, + "token_acc": 0.9218500797448166, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.650245375613439, + "grad_norm": 0.27268853783607483, + "learning_rate": 5.975826481208469e-05, + "loss": 0.24230141639709474, + "memory(GiB)": 91.64, + "step": 6890, + "token_acc": 0.9050925925925926, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6507172517931294, + "grad_norm": 0.23473475873470306, + "learning_rate": 5.961545717897716e-05, + "loss": 0.24397435188293456, + "memory(GiB)": 91.64, + "step": 6895, + "token_acc": 0.9104010025062657, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6511891279728199, + "grad_norm": 0.2682229280471802, + "learning_rate": 5.9472747888310834e-05, + "loss": 0.24428434371948243, + "memory(GiB)": 91.64, + "step": 6900, + "token_acc": 0.9191026512576479, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6516610041525104, + "grad_norm": 0.3811211585998535, + "learning_rate": 5.933013728760423e-05, + "loss": 0.24194138050079345, + "memory(GiB)": 91.64, + "step": 6905, + "token_acc": 0.9222492190211732, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6521328803322008, + "grad_norm": 0.36602070927619934, + "learning_rate": 5.9187625724135674e-05, + "loss": 0.2407762050628662, + "memory(GiB)": 91.64, + "step": 6910, + "token_acc": 0.922350162385592, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6526047565118913, + "grad_norm": 0.34567978978157043, + "learning_rate": 5.904521354494228e-05, + "loss": 0.24706146717071534, + "memory(GiB)": 91.64, + "step": 6915, + "token_acc": 0.9051838723969872, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6530766326915818, + "grad_norm": 0.748266339302063, + "learning_rate": 5.890290109681911e-05, + "loss": 0.24392051696777345, + "memory(GiB)": 91.64, + "step": 6920, + "token_acc": 0.9111389236545682, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6535485088712721, + "grad_norm": 0.6312034130096436, + "learning_rate": 5.8760688726318394e-05, + "loss": 0.2393946170806885, + "memory(GiB)": 91.64, + "step": 6925, + "token_acc": 0.9175446633073752, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6540203850509626, + "grad_norm": 0.39340659976005554, + "learning_rate": 5.861857677974871e-05, + "loss": 0.24171264171600343, + "memory(GiB)": 91.64, + "step": 6930, + "token_acc": 0.9208353569694027, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6544922612306531, + "grad_norm": 0.27433183789253235, + "learning_rate": 5.8476565603174025e-05, + "loss": 0.24087843894958497, + "memory(GiB)": 91.64, + "step": 6935, + "token_acc": 0.9106471816283925, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6549641374103435, + "grad_norm": 0.2510198950767517, + "learning_rate": 5.833465554241291e-05, + "loss": 0.24894437789916993, + "memory(GiB)": 91.64, + "step": 6940, + "token_acc": 0.9257688229056203, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.655436013590034, + "grad_norm": 0.4624379277229309, + "learning_rate": 5.8192846943037724e-05, + "loss": 0.2392751693725586, + "memory(GiB)": 91.64, + "step": 6945, + "token_acc": 0.9086819613135403, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6559078897697245, + "grad_norm": 0.664070725440979, + "learning_rate": 5.805114015037383e-05, + "loss": 0.24224374294281006, + "memory(GiB)": 91.64, + "step": 6950, + "token_acc": 0.9146216768916156, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6563797659494148, + "grad_norm": 0.250935435295105, + "learning_rate": 5.790953550949845e-05, + "loss": 0.23738400936126708, + "memory(GiB)": 91.64, + "step": 6955, + "token_acc": 0.9275690357627886, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6568516421291053, + "grad_norm": 0.6119559407234192, + "learning_rate": 5.7768033365240346e-05, + "loss": 0.2418374538421631, + "memory(GiB)": 91.64, + "step": 6960, + "token_acc": 0.8984538236523193, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6573235183087958, + "grad_norm": 0.32473883032798767, + "learning_rate": 5.7626634062178474e-05, + "loss": 0.24251232147216797, + "memory(GiB)": 91.64, + "step": 6965, + "token_acc": 0.9, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6577953944884862, + "grad_norm": 0.6449810862541199, + "learning_rate": 5.748533794464142e-05, + "loss": 0.24386320114135743, + "memory(GiB)": 91.64, + "step": 6970, + "token_acc": 0.9190297232661429, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6582672706681767, + "grad_norm": 0.6496239900588989, + "learning_rate": 5.7344145356706515e-05, + "loss": 0.24536077976226806, + "memory(GiB)": 91.64, + "step": 6975, + "token_acc": 0.9164449175093135, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6587391468478672, + "grad_norm": 0.48430728912353516, + "learning_rate": 5.7203056642199e-05, + "loss": 0.23882853984832764, + "memory(GiB)": 91.64, + "step": 6980, + "token_acc": 0.9236111111111112, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6592110230275575, + "grad_norm": 0.27332404255867004, + "learning_rate": 5.7062072144691036e-05, + "loss": 0.23896045684814454, + "memory(GiB)": 91.64, + "step": 6985, + "token_acc": 0.9155635062611807, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.659682899207248, + "grad_norm": 0.31327372789382935, + "learning_rate": 5.692119220750123e-05, + "loss": 0.24076180458068847, + "memory(GiB)": 91.64, + "step": 6990, + "token_acc": 0.9125720094883091, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6601547753869385, + "grad_norm": 0.3141052722930908, + "learning_rate": 5.678041717369331e-05, + "loss": 0.23503921031951905, + "memory(GiB)": 91.64, + "step": 6995, + "token_acc": 0.9165507649513213, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6606266515666289, + "grad_norm": 0.8892892003059387, + "learning_rate": 5.663974738607576e-05, + "loss": 0.24884920120239257, + "memory(GiB)": 91.64, + "step": 7000, + "token_acc": 0.917921146953405, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6610985277463194, + "grad_norm": 0.8716257214546204, + "learning_rate": 5.649918318720069e-05, + "loss": 0.2401883602142334, + "memory(GiB)": 91.64, + "step": 7005, + "token_acc": 0.8837485172004745, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6615704039260099, + "grad_norm": 0.28352469205856323, + "learning_rate": 5.635872491936301e-05, + "loss": 0.24602279663085938, + "memory(GiB)": 91.64, + "step": 7010, + "token_acc": 0.9251700680272109, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6620422801057002, + "grad_norm": 0.6487071514129639, + "learning_rate": 5.621837292459975e-05, + "loss": 0.24283573627471924, + "memory(GiB)": 91.64, + "step": 7015, + "token_acc": 0.9180874722016308, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.6625141562853907, + "grad_norm": 0.27189901471138, + "learning_rate": 5.6078127544689275e-05, + "loss": 0.2412208080291748, + "memory(GiB)": 91.64, + "step": 7020, + "token_acc": 0.9183193277310925, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6629860324650811, + "grad_norm": 0.576571524143219, + "learning_rate": 5.593798912115007e-05, + "loss": 0.23776733875274658, + "memory(GiB)": 91.64, + "step": 7025, + "token_acc": 0.9100346020761245, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6634579086447716, + "grad_norm": 0.31800100207328796, + "learning_rate": 5.579795799524033e-05, + "loss": 0.23961338996887208, + "memory(GiB)": 91.64, + "step": 7030, + "token_acc": 0.9148170365926814, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6639297848244621, + "grad_norm": 0.592032253742218, + "learning_rate": 5.565803450795696e-05, + "loss": 0.24274992942810059, + "memory(GiB)": 91.64, + "step": 7035, + "token_acc": 0.9148219441770934, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6644016610041525, + "grad_norm": 0.20974688231945038, + "learning_rate": 5.551821900003461e-05, + "loss": 0.23541603088378907, + "memory(GiB)": 91.64, + "step": 7040, + "token_acc": 0.9282661782661783, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.664873537183843, + "grad_norm": 0.9004971981048584, + "learning_rate": 5.5378511811945246e-05, + "loss": 0.24225046634674072, + "memory(GiB)": 91.64, + "step": 7045, + "token_acc": 0.906993775173929, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.6653454133635334, + "grad_norm": 0.2866557538509369, + "learning_rate": 5.5238913283896766e-05, + "loss": 0.23856678009033203, + "memory(GiB)": 91.64, + "step": 7050, + "token_acc": 0.9058961343225302, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6658172895432238, + "grad_norm": 0.7507352232933044, + "learning_rate": 5.509942375583267e-05, + "loss": 0.24381327629089355, + "memory(GiB)": 91.64, + "step": 7055, + "token_acc": 0.9071883530482256, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6662891657229143, + "grad_norm": 0.23248225450515747, + "learning_rate": 5.496004356743093e-05, + "loss": 0.24313607215881347, + "memory(GiB)": 91.64, + "step": 7060, + "token_acc": 0.9168689320388349, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6667610419026048, + "grad_norm": 0.20760686695575714, + "learning_rate": 5.482077305810334e-05, + "loss": 0.23505353927612305, + "memory(GiB)": 91.64, + "step": 7065, + "token_acc": 0.9141703130259172, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6672329180822952, + "grad_norm": 0.30243703722953796, + "learning_rate": 5.468161256699443e-05, + "loss": 0.2383075475692749, + "memory(GiB)": 91.64, + "step": 7070, + "token_acc": 0.9148854961832061, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6677047942619857, + "grad_norm": 0.35814082622528076, + "learning_rate": 5.454256243298112e-05, + "loss": 0.24843888282775878, + "memory(GiB)": 91.64, + "step": 7075, + "token_acc": 0.9190017513134852, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6681766704416761, + "grad_norm": 0.411588191986084, + "learning_rate": 5.440362299467128e-05, + "loss": 0.2393017053604126, + "memory(GiB)": 91.64, + "step": 7080, + "token_acc": 0.9128503075871497, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.6686485466213665, + "grad_norm": 0.23460780084133148, + "learning_rate": 5.4264794590403404e-05, + "loss": 0.23937084674835205, + "memory(GiB)": 91.64, + "step": 7085, + "token_acc": 0.916003293988471, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.669120422801057, + "grad_norm": 0.5503364205360413, + "learning_rate": 5.412607755824559e-05, + "loss": 0.24756169319152832, + "memory(GiB)": 91.64, + "step": 7090, + "token_acc": 0.9075514874141877, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.6695922989807475, + "grad_norm": 0.32460644841194153, + "learning_rate": 5.3987472235994615e-05, + "loss": 0.24046099185943604, + "memory(GiB)": 91.64, + "step": 7095, + "token_acc": 0.9222476314929762, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6700641751604379, + "grad_norm": 0.8033245801925659, + "learning_rate": 5.3848978961175325e-05, + "loss": 0.2427436590194702, + "memory(GiB)": 91.64, + "step": 7100, + "token_acc": 0.9128919860627178, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6705360513401284, + "grad_norm": 0.28919094800949097, + "learning_rate": 5.3710598071039774e-05, + "loss": 0.2408435344696045, + "memory(GiB)": 91.64, + "step": 7105, + "token_acc": 0.9220098643649816, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6710079275198187, + "grad_norm": 0.3446965217590332, + "learning_rate": 5.357232990256618e-05, + "loss": 0.2420729637145996, + "memory(GiB)": 91.64, + "step": 7110, + "token_acc": 0.9189504373177843, + "train_speed(iter/s)": 0.138596 + }, + { + "epoch": 0.6714798036995092, + "grad_norm": 0.5533409714698792, + "learning_rate": 5.3434174792458357e-05, + "loss": 0.24679412841796874, + "memory(GiB)": 91.64, + "step": 7115, + "token_acc": 0.9165575916230366, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6719516798791997, + "grad_norm": 0.26969775557518005, + "learning_rate": 5.3296133077144864e-05, + "loss": 0.23531513214111327, + "memory(GiB)": 91.64, + "step": 7120, + "token_acc": 0.9101084295208115, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.6724235560588901, + "grad_norm": 0.3181051015853882, + "learning_rate": 5.315820509277796e-05, + "loss": 0.2495020866394043, + "memory(GiB)": 91.64, + "step": 7125, + "token_acc": 0.9126819126819127, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.6728954322385806, + "grad_norm": 0.6126598119735718, + "learning_rate": 5.302039117523307e-05, + "loss": 0.2389207363128662, + "memory(GiB)": 91.64, + "step": 7130, + "token_acc": 0.9185185185185185, + "train_speed(iter/s)": 0.138598 + }, + { + "epoch": 0.6733673084182711, + "grad_norm": 0.22201423346996307, + "learning_rate": 5.288269166010788e-05, + "loss": 0.24875764846801757, + "memory(GiB)": 91.64, + "step": 7135, + "token_acc": 0.9052910052910053, + "train_speed(iter/s)": 0.138599 + }, + { + "epoch": 0.6738391845979615, + "grad_norm": 0.3069898188114166, + "learning_rate": 5.274510688272141e-05, + "loss": 0.24061965942382812, + "memory(GiB)": 91.64, + "step": 7140, + "token_acc": 0.9072276159654801, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6743110607776519, + "grad_norm": 0.24082788825035095, + "learning_rate": 5.260763717811328e-05, + "loss": 0.2444239616394043, + "memory(GiB)": 91.64, + "step": 7145, + "token_acc": 0.9239920687376074, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6747829369573424, + "grad_norm": 0.4293016791343689, + "learning_rate": 5.247028288104301e-05, + "loss": 0.24262595176696777, + "memory(GiB)": 91.64, + "step": 7150, + "token_acc": 0.9193776520509194, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6752548131370328, + "grad_norm": 0.3337520658969879, + "learning_rate": 5.233304432598886e-05, + "loss": 0.2384474754333496, + "memory(GiB)": 91.64, + "step": 7155, + "token_acc": 0.9188795925791197, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6757266893167233, + "grad_norm": 0.28636276721954346, + "learning_rate": 5.2195921847147436e-05, + "loss": 0.24127793312072754, + "memory(GiB)": 91.64, + "step": 7160, + "token_acc": 0.9061994609164421, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6761985654964138, + "grad_norm": 0.2952304482460022, + "learning_rate": 5.2058915778432614e-05, + "loss": 0.23610930442810057, + "memory(GiB)": 91.64, + "step": 7165, + "token_acc": 0.9076154806491885, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6766704416761042, + "grad_norm": 0.2610105574131012, + "learning_rate": 5.1922026453474795e-05, + "loss": 0.2389233112335205, + "memory(GiB)": 91.64, + "step": 7170, + "token_acc": 0.9191176470588235, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6771423178557946, + "grad_norm": 0.23289845883846283, + "learning_rate": 5.178525420562013e-05, + "loss": 0.24023265838623048, + "memory(GiB)": 91.64, + "step": 7175, + "token_acc": 0.9193548387096774, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6776141940354851, + "grad_norm": 0.3138931691646576, + "learning_rate": 5.164859936792955e-05, + "loss": 0.2391728162765503, + "memory(GiB)": 91.64, + "step": 7180, + "token_acc": 0.9241106719367589, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6780860702151755, + "grad_norm": 0.5057432651519775, + "learning_rate": 5.1512062273178195e-05, + "loss": 0.2443333864212036, + "memory(GiB)": 91.64, + "step": 7185, + "token_acc": 0.9109893871961657, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.678557946394866, + "grad_norm": 0.3931172788143158, + "learning_rate": 5.137564325385447e-05, + "loss": 0.2416548252105713, + "memory(GiB)": 91.64, + "step": 7190, + "token_acc": 0.9179136383069688, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6790298225745565, + "grad_norm": 0.847093939781189, + "learning_rate": 5.123934264215918e-05, + "loss": 0.2375951290130615, + "memory(GiB)": 91.64, + "step": 7195, + "token_acc": 0.9275525525525525, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.6795016987542469, + "grad_norm": 0.19797252118587494, + "learning_rate": 5.110316077000487e-05, + "loss": 0.24020733833312988, + "memory(GiB)": 91.64, + "step": 7200, + "token_acc": 0.8846153846153846, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6799735749339373, + "grad_norm": 0.38781481981277466, + "learning_rate": 5.096709796901491e-05, + "loss": 0.23613905906677246, + "memory(GiB)": 91.64, + "step": 7205, + "token_acc": 0.9090549624357454, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6804454511136278, + "grad_norm": 0.34402525424957275, + "learning_rate": 5.083115457052263e-05, + "loss": 0.24098021984100343, + "memory(GiB)": 91.64, + "step": 7210, + "token_acc": 0.9180212014134276, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6809173272933182, + "grad_norm": 0.20066817104816437, + "learning_rate": 5.0695330905570735e-05, + "loss": 0.23880929946899415, + "memory(GiB)": 91.64, + "step": 7215, + "token_acc": 0.9213587715216379, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6813892034730087, + "grad_norm": 0.31116795539855957, + "learning_rate": 5.055962730491028e-05, + "loss": 0.24228744506835936, + "memory(GiB)": 91.64, + "step": 7220, + "token_acc": 0.9155663655316192, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6818610796526992, + "grad_norm": 0.282959908246994, + "learning_rate": 5.042404409899995e-05, + "loss": 0.24117763042449952, + "memory(GiB)": 91.64, + "step": 7225, + "token_acc": 0.9193548387096774, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6823329558323896, + "grad_norm": 0.49499738216400146, + "learning_rate": 5.0288581618005274e-05, + "loss": 0.24113068580627442, + "memory(GiB)": 91.64, + "step": 7230, + "token_acc": 0.9124833407374501, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.68280483201208, + "grad_norm": 0.37821346521377563, + "learning_rate": 5.015324019179781e-05, + "loss": 0.24250617027282714, + "memory(GiB)": 91.64, + "step": 7235, + "token_acc": 0.9230293663060278, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6832767081917704, + "grad_norm": 0.43186768889427185, + "learning_rate": 5.001802014995425e-05, + "loss": 0.23106989860534669, + "memory(GiB)": 91.64, + "step": 7240, + "token_acc": 0.9164609053497942, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6837485843714609, + "grad_norm": 0.42559754848480225, + "learning_rate": 4.988292182175577e-05, + "loss": 0.23471336364746093, + "memory(GiB)": 91.64, + "step": 7245, + "token_acc": 0.928995756718529, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6842204605511514, + "grad_norm": 0.4017716646194458, + "learning_rate": 4.9747945536187145e-05, + "loss": 0.24154629707336425, + "memory(GiB)": 91.64, + "step": 7250, + "token_acc": 0.9225834046193327, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6846923367308418, + "grad_norm": 0.45907652378082275, + "learning_rate": 4.961309162193595e-05, + "loss": 0.22962424755096436, + "memory(GiB)": 91.64, + "step": 7255, + "token_acc": 0.929345470307759, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6851642129105323, + "grad_norm": 0.23512178659439087, + "learning_rate": 4.94783604073918e-05, + "loss": 0.23903675079345704, + "memory(GiB)": 91.64, + "step": 7260, + "token_acc": 0.9203929539295393, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6856360890902228, + "grad_norm": 0.5371173024177551, + "learning_rate": 4.9343752220645424e-05, + "loss": 0.23918862342834474, + "memory(GiB)": 91.64, + "step": 7265, + "token_acc": 0.9101217975640488, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6861079652699131, + "grad_norm": 0.5968282222747803, + "learning_rate": 4.9209267389488036e-05, + "loss": 0.24033589363098146, + "memory(GiB)": 91.64, + "step": 7270, + "token_acc": 0.9201339072214252, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6865798414496036, + "grad_norm": 0.5909680724143982, + "learning_rate": 4.907490624141046e-05, + "loss": 0.23803811073303222, + "memory(GiB)": 91.64, + "step": 7275, + "token_acc": 0.9005710446758481, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6870517176292941, + "grad_norm": 0.6105820536613464, + "learning_rate": 4.894066910360231e-05, + "loss": 0.23955843448638917, + "memory(GiB)": 91.64, + "step": 7280, + "token_acc": 0.9219009637753407, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6875235938089845, + "grad_norm": 0.34672901034355164, + "learning_rate": 4.880655630295122e-05, + "loss": 0.23337287902832032, + "memory(GiB)": 91.64, + "step": 7285, + "token_acc": 0.9275103980986333, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.687995469988675, + "grad_norm": 0.2837556302547455, + "learning_rate": 4.867256816604211e-05, + "loss": 0.24297127723693848, + "memory(GiB)": 91.64, + "step": 7290, + "token_acc": 0.9024296182028538, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6884673461683655, + "grad_norm": 0.31120648980140686, + "learning_rate": 4.853870501915616e-05, + "loss": 0.23241846561431884, + "memory(GiB)": 91.64, + "step": 7295, + "token_acc": 0.9221556886227545, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6889392223480558, + "grad_norm": 0.5619298219680786, + "learning_rate": 4.8404967188270336e-05, + "loss": 0.23225302696228028, + "memory(GiB)": 91.64, + "step": 7300, + "token_acc": 0.9281452013923421, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6894110985277463, + "grad_norm": 0.4053754508495331, + "learning_rate": 4.827135499905638e-05, + "loss": 0.23050668239593505, + "memory(GiB)": 91.64, + "step": 7305, + "token_acc": 0.915, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6898829747074368, + "grad_norm": 0.48405569791793823, + "learning_rate": 4.8137868776880104e-05, + "loss": 0.23096683025360107, + "memory(GiB)": 91.64, + "step": 7310, + "token_acc": 0.919170243204578, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6903548508871272, + "grad_norm": 0.28916484117507935, + "learning_rate": 4.800450884680054e-05, + "loss": 0.23934123516082764, + "memory(GiB)": 91.64, + "step": 7315, + "token_acc": 0.8993939393939394, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6908267270668177, + "grad_norm": 0.3169102668762207, + "learning_rate": 4.7871275533569223e-05, + "loss": 0.23298006057739257, + "memory(GiB)": 91.64, + "step": 7320, + "token_acc": 0.9264836138175376, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6912986032465082, + "grad_norm": 0.405086487531662, + "learning_rate": 4.7738169161629273e-05, + "loss": 0.23260374069213868, + "memory(GiB)": 91.64, + "step": 7325, + "token_acc": 0.8946572580645161, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6917704794261985, + "grad_norm": 0.3432481288909912, + "learning_rate": 4.760519005511477e-05, + "loss": 0.2355022668838501, + "memory(GiB)": 91.64, + "step": 7330, + "token_acc": 0.9095427435387674, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.692242355605889, + "grad_norm": 0.3307795524597168, + "learning_rate": 4.747233853784986e-05, + "loss": 0.24247050285339355, + "memory(GiB)": 91.64, + "step": 7335, + "token_acc": 0.9070431472081218, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.6927142317855794, + "grad_norm": 0.26367953419685364, + "learning_rate": 4.733961493334798e-05, + "loss": 0.23971378803253174, + "memory(GiB)": 91.64, + "step": 7340, + "token_acc": 0.9123732251521298, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6931861079652699, + "grad_norm": 0.220316544175148, + "learning_rate": 4.720701956481112e-05, + "loss": 0.22885804176330565, + "memory(GiB)": 91.64, + "step": 7345, + "token_acc": 0.9171151776103337, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.6936579841449604, + "grad_norm": 0.4796167016029358, + "learning_rate": 4.70745527551289e-05, + "loss": 0.23552498817443848, + "memory(GiB)": 91.64, + "step": 7350, + "token_acc": 0.9133454106280193, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6941298603246508, + "grad_norm": 0.4051739573478699, + "learning_rate": 4.694221482687797e-05, + "loss": 0.230513858795166, + "memory(GiB)": 91.64, + "step": 7355, + "token_acc": 0.9137493658041603, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6946017365043413, + "grad_norm": 0.3922402262687683, + "learning_rate": 4.681000610232112e-05, + "loss": 0.23725688457489014, + "memory(GiB)": 91.64, + "step": 7360, + "token_acc": 0.9128390596745027, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6950736126840317, + "grad_norm": 0.2533682584762573, + "learning_rate": 4.66779269034065e-05, + "loss": 0.237007737159729, + "memory(GiB)": 91.64, + "step": 7365, + "token_acc": 0.919661733615222, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6955454888637221, + "grad_norm": 0.2674064040184021, + "learning_rate": 4.654597755176682e-05, + "loss": 0.23746719360351562, + "memory(GiB)": 91.64, + "step": 7370, + "token_acc": 0.9228115567054765, + "train_speed(iter/s)": 0.138603 + }, + { + "epoch": 0.6960173650434126, + "grad_norm": 0.5427283644676208, + "learning_rate": 4.6414158368718665e-05, + "loss": 0.24357161521911622, + "memory(GiB)": 91.64, + "step": 7375, + "token_acc": 0.9070453707119144, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6964892412231031, + "grad_norm": 0.4000316262245178, + "learning_rate": 4.628246967526151e-05, + "loss": 0.2293656587600708, + "memory(GiB)": 91.64, + "step": 7380, + "token_acc": 0.9324394017534812, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6969611174027935, + "grad_norm": 0.3474954068660736, + "learning_rate": 4.61509117920772e-05, + "loss": 0.23463306427001954, + "memory(GiB)": 91.64, + "step": 7385, + "token_acc": 0.9113486325055015, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.697432993582484, + "grad_norm": 0.41696134209632874, + "learning_rate": 4.601948503952896e-05, + "loss": 0.23325471878051757, + "memory(GiB)": 91.64, + "step": 7390, + "token_acc": 0.935026138909634, + "train_speed(iter/s)": 0.138601 + }, + { + "epoch": 0.6979048697621744, + "grad_norm": 0.6173218488693237, + "learning_rate": 4.5888189737660735e-05, + "loss": 0.2361754894256592, + "memory(GiB)": 91.64, + "step": 7395, + "token_acc": 0.9114997350291468, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6983767459418648, + "grad_norm": 0.4587777256965637, + "learning_rate": 4.5757026206196354e-05, + "loss": 0.23703014850616455, + "memory(GiB)": 91.64, + "step": 7400, + "token_acc": 0.9132492113564669, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.6988486221215553, + "grad_norm": 0.7898734211921692, + "learning_rate": 4.562599476453878e-05, + "loss": 0.23952670097351075, + "memory(GiB)": 91.64, + "step": 7405, + "token_acc": 0.9085151301900071, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.6993204983012458, + "grad_norm": 0.36151617765426636, + "learning_rate": 4.549509573176923e-05, + "loss": 0.2393695592880249, + "memory(GiB)": 91.64, + "step": 7410, + "token_acc": 0.9187757504414361, + "train_speed(iter/s)": 0.138605 + }, + { + "epoch": 0.6997923744809362, + "grad_norm": 0.40499287843704224, + "learning_rate": 4.53643294266466e-05, + "loss": 0.23288025856018066, + "memory(GiB)": 91.64, + "step": 7415, + "token_acc": 0.9126254180602007, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.7002642506606267, + "grad_norm": 0.39258766174316406, + "learning_rate": 4.523369616760653e-05, + "loss": 0.23379309177398683, + "memory(GiB)": 91.64, + "step": 7420, + "token_acc": 0.9255467659376454, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.700736126840317, + "grad_norm": 0.5925548076629639, + "learning_rate": 4.510319627276066e-05, + "loss": 0.24018988609313965, + "memory(GiB)": 91.64, + "step": 7425, + "token_acc": 0.9100609756097561, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.7012080030200075, + "grad_norm": 0.2632533311843872, + "learning_rate": 4.497283005989592e-05, + "loss": 0.2343803882598877, + "memory(GiB)": 91.64, + "step": 7430, + "token_acc": 0.911731843575419, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.701679879199698, + "grad_norm": 0.5552768111228943, + "learning_rate": 4.484259784647359e-05, + "loss": 0.23283910751342773, + "memory(GiB)": 91.64, + "step": 7435, + "token_acc": 0.9176574196389256, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7021517553793885, + "grad_norm": 0.3536672294139862, + "learning_rate": 4.471249994962875e-05, + "loss": 0.24048154354095458, + "memory(GiB)": 91.64, + "step": 7440, + "token_acc": 0.9188626907073509, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7026236315590789, + "grad_norm": 0.7183988094329834, + "learning_rate": 4.458253668616936e-05, + "loss": 0.23343441486358643, + "memory(GiB)": 91.64, + "step": 7445, + "token_acc": 0.9140362659503022, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.7030955077387694, + "grad_norm": 0.4441908001899719, + "learning_rate": 4.445270837257554e-05, + "loss": 0.22676398754119872, + "memory(GiB)": 91.64, + "step": 7450, + "token_acc": 0.9258134490238612, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.7035673839184597, + "grad_norm": 0.29138508439064026, + "learning_rate": 4.432301532499877e-05, + "loss": 0.2306809425354004, + "memory(GiB)": 91.64, + "step": 7455, + "token_acc": 0.9092585761711546, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.7040392600981502, + "grad_norm": 0.20300675928592682, + "learning_rate": 4.419345785926119e-05, + "loss": 0.22768373489379884, + "memory(GiB)": 91.64, + "step": 7460, + "token_acc": 0.911993097497843, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.7045111362778407, + "grad_norm": 0.4421274662017822, + "learning_rate": 4.406403629085465e-05, + "loss": 0.23306775093078613, + "memory(GiB)": 91.64, + "step": 7465, + "token_acc": 0.9188296366210477, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.7049830124575311, + "grad_norm": 0.21357952058315277, + "learning_rate": 4.3934750934940196e-05, + "loss": 0.23424277305603028, + "memory(GiB)": 91.64, + "step": 7470, + "token_acc": 0.9328947368421052, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.7054548886372216, + "grad_norm": 0.436585009098053, + "learning_rate": 4.380560210634715e-05, + "loss": 0.2297410249710083, + "memory(GiB)": 91.64, + "step": 7475, + "token_acc": 0.9225950782997763, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.7059267648169121, + "grad_norm": 0.28046926856040955, + "learning_rate": 4.367659011957227e-05, + "loss": 0.23267920017242433, + "memory(GiB)": 91.64, + "step": 7480, + "token_acc": 0.9165763813651138, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7063986409966025, + "grad_norm": 0.5179422497749329, + "learning_rate": 4.354771528877926e-05, + "loss": 0.23168692588806153, + "memory(GiB)": 91.64, + "step": 7485, + "token_acc": 0.923728813559322, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7068705171762929, + "grad_norm": 0.3157169818878174, + "learning_rate": 4.3418977927797724e-05, + "loss": 0.2363651752471924, + "memory(GiB)": 91.64, + "step": 7490, + "token_acc": 0.9275045537340619, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7073423933559834, + "grad_norm": 0.4149136543273926, + "learning_rate": 4.329037835012245e-05, + "loss": 0.2356886863708496, + "memory(GiB)": 91.64, + "step": 7495, + "token_acc": 0.9102065249925172, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7078142695356738, + "grad_norm": 0.3638668656349182, + "learning_rate": 4.316191686891282e-05, + "loss": 0.22782864570617675, + "memory(GiB)": 91.64, + "step": 7500, + "token_acc": 0.9282261063592413, + "train_speed(iter/s)": 0.138609 + }, + { + "epoch": 0.7082861457153643, + "grad_norm": 0.3706573247909546, + "learning_rate": 4.30335937969919e-05, + "loss": 0.2303562879562378, + "memory(GiB)": 91.64, + "step": 7505, + "token_acc": 0.9246684350132626, + "train_speed(iter/s)": 0.13861 + }, + { + "epoch": 0.7087580218950548, + "grad_norm": 0.4828733205795288, + "learning_rate": 4.290540944684558e-05, + "loss": 0.23863065242767334, + "memory(GiB)": 91.64, + "step": 7510, + "token_acc": 0.9191153238546603, + "train_speed(iter/s)": 0.138609 + }, + { + "epoch": 0.7092298980747452, + "grad_norm": 0.6060589551925659, + "learning_rate": 4.277736413062219e-05, + "loss": 0.22998156547546386, + "memory(GiB)": 91.64, + "step": 7515, + "token_acc": 0.9242005527043032, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7097017742544356, + "grad_norm": 0.4992319941520691, + "learning_rate": 4.264945816013125e-05, + "loss": 0.24006481170654298, + "memory(GiB)": 91.64, + "step": 7520, + "token_acc": 0.9305866547245858, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.7101736504341261, + "grad_norm": 0.40544596314430237, + "learning_rate": 4.2521691846843095e-05, + "loss": 0.2283543348312378, + "memory(GiB)": 91.64, + "step": 7525, + "token_acc": 0.9242761692650334, + "train_speed(iter/s)": 0.138611 + }, + { + "epoch": 0.7106455266138165, + "grad_norm": 0.5109364986419678, + "learning_rate": 4.239406550188791e-05, + "loss": 0.2309938907623291, + "memory(GiB)": 91.64, + "step": 7530, + "token_acc": 0.9235931853381518, + "train_speed(iter/s)": 0.138611 + }, + { + "epoch": 0.711117402793507, + "grad_norm": 0.422530859708786, + "learning_rate": 4.2266579436055084e-05, + "loss": 0.23394412994384767, + "memory(GiB)": 91.64, + "step": 7535, + "token_acc": 0.9018055115616092, + "train_speed(iter/s)": 0.13861 + }, + { + "epoch": 0.7115892789731975, + "grad_norm": 0.2911304831504822, + "learning_rate": 4.213923395979236e-05, + "loss": 0.2337871789932251, + "memory(GiB)": 91.64, + "step": 7540, + "token_acc": 0.910948905109489, + "train_speed(iter/s)": 0.13861 + }, + { + "epoch": 0.7120611551528879, + "grad_norm": 0.5636393427848816, + "learning_rate": 4.201202938320519e-05, + "loss": 0.23299179077148438, + "memory(GiB)": 91.64, + "step": 7545, + "token_acc": 0.9072039072039072, + "train_speed(iter/s)": 0.13861 + }, + { + "epoch": 0.7125330313325783, + "grad_norm": 0.458686500787735, + "learning_rate": 4.188496601605577e-05, + "loss": 0.23804445266723634, + "memory(GiB)": 91.64, + "step": 7550, + "token_acc": 0.9233138281490607, + "train_speed(iter/s)": 0.13861 + }, + { + "epoch": 0.7130049075122687, + "grad_norm": 0.26119813323020935, + "learning_rate": 4.17580441677626e-05, + "loss": 0.23722286224365235, + "memory(GiB)": 91.64, + "step": 7555, + "token_acc": 0.9108138238573021, + "train_speed(iter/s)": 0.138609 + }, + { + "epoch": 0.7134767836919592, + "grad_norm": 0.32606083154678345, + "learning_rate": 4.16312641473995e-05, + "loss": 0.2353299617767334, + "memory(GiB)": 91.64, + "step": 7560, + "token_acc": 0.9210182767624021, + "train_speed(iter/s)": 0.138609 + }, + { + "epoch": 0.7139486598716497, + "grad_norm": 0.37780630588531494, + "learning_rate": 4.15046262636948e-05, + "loss": 0.2375786304473877, + "memory(GiB)": 91.64, + "step": 7565, + "token_acc": 0.9266109785202864, + "train_speed(iter/s)": 0.138611 + }, + { + "epoch": 0.7144205360513401, + "grad_norm": 0.5097290873527527, + "learning_rate": 4.1378130825030926e-05, + "loss": 0.23275210857391357, + "memory(GiB)": 91.64, + "step": 7570, + "token_acc": 0.9178931061192874, + "train_speed(iter/s)": 0.138613 + }, + { + "epoch": 0.7148924122310306, + "grad_norm": 0.2814135253429413, + "learning_rate": 4.12517781394433e-05, + "loss": 0.236267614364624, + "memory(GiB)": 91.64, + "step": 7575, + "token_acc": 0.9209017959495606, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.715364288410721, + "grad_norm": 0.6145960092544556, + "learning_rate": 4.1125568514619675e-05, + "loss": 0.2347486972808838, + "memory(GiB)": 91.64, + "step": 7580, + "token_acc": 0.925731760594146, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7158361645904114, + "grad_norm": 0.5332577228546143, + "learning_rate": 4.0999502257899515e-05, + "loss": 0.24189887046813965, + "memory(GiB)": 91.64, + "step": 7585, + "token_acc": 0.9104374784705477, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7163080407701019, + "grad_norm": 0.7018850445747375, + "learning_rate": 4.087357967627317e-05, + "loss": 0.23237390518188478, + "memory(GiB)": 91.64, + "step": 7590, + "token_acc": 0.9236180904522613, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7167799169497924, + "grad_norm": 0.37541598081588745, + "learning_rate": 4.0747801076380965e-05, + "loss": 0.2391916275024414, + "memory(GiB)": 91.64, + "step": 7595, + "token_acc": 0.9177265500794912, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7172517931294828, + "grad_norm": 0.26603105664253235, + "learning_rate": 4.062216676451285e-05, + "loss": 0.22683272361755372, + "memory(GiB)": 91.64, + "step": 7600, + "token_acc": 0.9210950080515298, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7177236693091733, + "grad_norm": 0.5723817348480225, + "learning_rate": 4.049667704660728e-05, + "loss": 0.23589439392089845, + "memory(GiB)": 91.64, + "step": 7605, + "token_acc": 0.9155238617663193, + "train_speed(iter/s)": 0.138613 + }, + { + "epoch": 0.7181955454888638, + "grad_norm": 0.26500552892684937, + "learning_rate": 4.037133222825052e-05, + "loss": 0.22756319046020507, + "memory(GiB)": 91.64, + "step": 7610, + "token_acc": 0.9262192580241767, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7186674216685541, + "grad_norm": 0.7966447472572327, + "learning_rate": 4.0246132614676145e-05, + "loss": 0.2435863971710205, + "memory(GiB)": 91.64, + "step": 7615, + "token_acc": 0.9128586609989373, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7191392978482446, + "grad_norm": 1.105246663093567, + "learning_rate": 4.012107851076406e-05, + "loss": 0.22101092338562012, + "memory(GiB)": 91.64, + "step": 7620, + "token_acc": 0.9139150943396226, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7196111740279351, + "grad_norm": 0.37064602971076965, + "learning_rate": 3.999617022103975e-05, + "loss": 0.2291620969772339, + "memory(GiB)": 91.64, + "step": 7625, + "token_acc": 0.9221767115272089, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7200830502076255, + "grad_norm": 0.43661198019981384, + "learning_rate": 3.987140804967384e-05, + "loss": 0.2348928689956665, + "memory(GiB)": 91.64, + "step": 7630, + "token_acc": 0.9234449760765551, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.720554926387316, + "grad_norm": 0.7201928496360779, + "learning_rate": 3.9746792300480894e-05, + "loss": 0.2301774024963379, + "memory(GiB)": 91.64, + "step": 7635, + "token_acc": 0.915521978021978, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7210268025670065, + "grad_norm": 0.3067832291126251, + "learning_rate": 3.962232327691905e-05, + "loss": 0.23211939334869386, + "memory(GiB)": 91.64, + "step": 7640, + "token_acc": 0.9235436893203883, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7214986787466968, + "grad_norm": 0.3120873272418976, + "learning_rate": 3.949800128208915e-05, + "loss": 0.22577478885650634, + "memory(GiB)": 91.64, + "step": 7645, + "token_acc": 0.9192324854975458, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7219705549263873, + "grad_norm": 0.22580601274967194, + "learning_rate": 3.93738266187339e-05, + "loss": 0.23252127170562745, + "memory(GiB)": 91.64, + "step": 7650, + "token_acc": 0.9091406677613574, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7224424311060778, + "grad_norm": 0.39276060461997986, + "learning_rate": 3.92497995892373e-05, + "loss": 0.2360863208770752, + "memory(GiB)": 91.64, + "step": 7655, + "token_acc": 0.9083969465648855, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7229143072857682, + "grad_norm": 0.36733052134513855, + "learning_rate": 3.912592049562395e-05, + "loss": 0.23193964958190919, + "memory(GiB)": 91.64, + "step": 7660, + "token_acc": 0.914519906323185, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7233861834654587, + "grad_norm": 0.36261504888534546, + "learning_rate": 3.9002189639557974e-05, + "loss": 0.23163235187530518, + "memory(GiB)": 91.64, + "step": 7665, + "token_acc": 0.9202302631578947, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7238580596451492, + "grad_norm": 0.4824898838996887, + "learning_rate": 3.8878607322342674e-05, + "loss": 0.23579974174499513, + "memory(GiB)": 91.64, + "step": 7670, + "token_acc": 0.9020848845867461, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7243299358248395, + "grad_norm": 0.4362608790397644, + "learning_rate": 3.8755173844919624e-05, + "loss": 0.23200278282165526, + "memory(GiB)": 91.64, + "step": 7675, + "token_acc": 0.9163013152650459, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.72480181200453, + "grad_norm": 0.3005385100841522, + "learning_rate": 3.863188950786786e-05, + "loss": 0.22539281845092773, + "memory(GiB)": 91.64, + "step": 7680, + "token_acc": 0.91974479516454, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7252736881842204, + "grad_norm": 0.28257620334625244, + "learning_rate": 3.8508754611403296e-05, + "loss": 0.23311495780944824, + "memory(GiB)": 91.64, + "step": 7685, + "token_acc": 0.9152061855670103, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7257455643639109, + "grad_norm": 0.36101335287094116, + "learning_rate": 3.838576945537806e-05, + "loss": 0.23008966445922852, + "memory(GiB)": 91.64, + "step": 7690, + "token_acc": 0.9269406392694064, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7262174405436014, + "grad_norm": 0.25905391573905945, + "learning_rate": 3.82629343392794e-05, + "loss": 0.23752379417419434, + "memory(GiB)": 91.64, + "step": 7695, + "token_acc": 0.930064308681672, + "train_speed(iter/s)": 0.138618 + }, + { + "epoch": 0.7266893167232918, + "grad_norm": 0.3700575828552246, + "learning_rate": 3.814024956222936e-05, + "loss": 0.2280106782913208, + "memory(GiB)": 91.64, + "step": 7700, + "token_acc": 0.928319209039548, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7271611929029823, + "grad_norm": 0.21974368393421173, + "learning_rate": 3.801771542298387e-05, + "loss": 0.22454090118408204, + "memory(GiB)": 91.64, + "step": 7705, + "token_acc": 0.9206798866855525, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7276330690826727, + "grad_norm": 0.23629778623580933, + "learning_rate": 3.78953322199319e-05, + "loss": 0.23039534091949462, + "memory(GiB)": 91.64, + "step": 7710, + "token_acc": 0.9159847244953628, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7281049452623631, + "grad_norm": 0.2492135465145111, + "learning_rate": 3.777310025109512e-05, + "loss": 0.22857446670532228, + "memory(GiB)": 91.64, + "step": 7715, + "token_acc": 0.9081383164512749, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7285768214420536, + "grad_norm": 0.2390630841255188, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.22294931411743163, + "memory(GiB)": 91.64, + "step": 7720, + "token_acc": 0.9189723320158103, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7290486976217441, + "grad_norm": 0.2751132547855377, + "learning_rate": 3.752909120631079e-05, + "loss": 0.23300666809082032, + "memory(GiB)": 91.64, + "step": 7725, + "token_acc": 0.9150349650349651, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7295205738014345, + "grad_norm": 0.35565805435180664, + "learning_rate": 3.740731472456208e-05, + "loss": 0.2383554220199585, + "memory(GiB)": 91.64, + "step": 7730, + "token_acc": 0.9274079320113314, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.729992449981125, + "grad_norm": 0.4752665162086487, + "learning_rate": 3.7285690665424523e-05, + "loss": 0.2266439437866211, + "memory(GiB)": 91.64, + "step": 7735, + "token_acc": 0.9231578947368421, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7304643261608154, + "grad_norm": 0.49981334805488586, + "learning_rate": 3.7164219325070995e-05, + "loss": 0.234313440322876, + "memory(GiB)": 91.64, + "step": 7740, + "token_acc": 0.9175717070453913, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7309362023405058, + "grad_norm": 0.35681086778640747, + "learning_rate": 3.704290099930261e-05, + "loss": 0.23388543128967285, + "memory(GiB)": 91.64, + "step": 7745, + "token_acc": 0.9287226534932957, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7314080785201963, + "grad_norm": 0.23404952883720398, + "learning_rate": 3.692173598354765e-05, + "loss": 0.23158960342407225, + "memory(GiB)": 91.64, + "step": 7750, + "token_acc": 0.9207547169811321, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7318799546998868, + "grad_norm": 0.26125267148017883, + "learning_rate": 3.680072457286121e-05, + "loss": 0.22669458389282227, + "memory(GiB)": 91.64, + "step": 7755, + "token_acc": 0.919360568383659, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7323518308795772, + "grad_norm": 0.2665402293205261, + "learning_rate": 3.667986706192431e-05, + "loss": 0.23177189826965333, + "memory(GiB)": 91.64, + "step": 7760, + "token_acc": 0.9216549295774648, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7328237070592677, + "grad_norm": 0.42883795499801636, + "learning_rate": 3.6559163745043126e-05, + "loss": 0.22768354415893555, + "memory(GiB)": 91.64, + "step": 7765, + "token_acc": 0.9188311688311688, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.733295583238958, + "grad_norm": 0.25692644715309143, + "learning_rate": 3.643861491614841e-05, + "loss": 0.23015880584716797, + "memory(GiB)": 91.64, + "step": 7770, + "token_acc": 0.9187208527648234, + "train_speed(iter/s)": 0.138618 + }, + { + "epoch": 0.7337674594186485, + "grad_norm": 0.21508167684078217, + "learning_rate": 3.6318220868794784e-05, + "loss": 0.2296093225479126, + "memory(GiB)": 91.64, + "step": 7775, + "token_acc": 0.9230500174886324, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.734239335598339, + "grad_norm": 0.29474809765815735, + "learning_rate": 3.6197981896159804e-05, + "loss": 0.23098914623260497, + "memory(GiB)": 91.64, + "step": 7780, + "token_acc": 0.9146655231560892, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7347112117780294, + "grad_norm": 0.3672167658805847, + "learning_rate": 3.6077898291043485e-05, + "loss": 0.23493998050689696, + "memory(GiB)": 91.64, + "step": 7785, + "token_acc": 0.9073152049023363, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.7351830879577199, + "grad_norm": 0.5175244808197021, + "learning_rate": 3.595797034586753e-05, + "loss": 0.22836103439331054, + "memory(GiB)": 91.64, + "step": 7790, + "token_acc": 0.922854387656702, + "train_speed(iter/s)": 0.138613 + }, + { + "epoch": 0.7356549641374104, + "grad_norm": 0.2874203026294708, + "learning_rate": 3.583819835267446e-05, + "loss": 0.22799487113952638, + "memory(GiB)": 91.64, + "step": 7795, + "token_acc": 0.9126323751891074, + "train_speed(iter/s)": 0.138613 + }, + { + "epoch": 0.7361268403171007, + "grad_norm": 0.40861421823501587, + "learning_rate": 3.571858260312715e-05, + "loss": 0.22677245140075683, + "memory(GiB)": 91.64, + "step": 7800, + "token_acc": 0.9164278892072588, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7365987164967912, + "grad_norm": 0.3449516296386719, + "learning_rate": 3.559912338850795e-05, + "loss": 0.2347282886505127, + "memory(GiB)": 91.64, + "step": 7805, + "token_acc": 0.9178125, + "train_speed(iter/s)": 0.138613 + }, + { + "epoch": 0.7370705926764817, + "grad_norm": 0.31171733140945435, + "learning_rate": 3.5479820999718036e-05, + "loss": 0.23100967407226564, + "memory(GiB)": 91.64, + "step": 7810, + "token_acc": 0.9182941410920685, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7375424688561721, + "grad_norm": 0.5504468679428101, + "learning_rate": 3.536067572727671e-05, + "loss": 0.23290205001831055, + "memory(GiB)": 91.64, + "step": 7815, + "token_acc": 0.9189412737799835, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.7380143450358626, + "grad_norm": 0.8677312731742859, + "learning_rate": 3.5241687861320593e-05, + "loss": 0.22933337688446045, + "memory(GiB)": 91.64, + "step": 7820, + "token_acc": 0.9203821656050956, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7384862212155531, + "grad_norm": 0.2175053507089615, + "learning_rate": 3.512285769160307e-05, + "loss": 0.2278818130493164, + "memory(GiB)": 91.64, + "step": 7825, + "token_acc": 0.9262792714657415, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7389580973952435, + "grad_norm": 0.37307730317115784, + "learning_rate": 3.50041855074935e-05, + "loss": 0.22902565002441405, + "memory(GiB)": 91.64, + "step": 7830, + "token_acc": 0.9135725429017161, + "train_speed(iter/s)": 0.138616 + }, + { + "epoch": 0.7394299735749339, + "grad_norm": 0.37443387508392334, + "learning_rate": 3.488567159797652e-05, + "loss": 0.22925407886505128, + "memory(GiB)": 91.64, + "step": 7835, + "token_acc": 0.9244406922752216, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.7399018497546244, + "grad_norm": 0.3315834701061249, + "learning_rate": 3.4767316251651326e-05, + "loss": 0.22882957458496095, + "memory(GiB)": 91.64, + "step": 7840, + "token_acc": 0.914161008729389, + "train_speed(iter/s)": 0.138619 + }, + { + "epoch": 0.7403737259343148, + "grad_norm": 0.2746642529964447, + "learning_rate": 3.4649119756731055e-05, + "loss": 0.22931833267211915, + "memory(GiB)": 91.64, + "step": 7845, + "token_acc": 0.9125051588939331, + "train_speed(iter/s)": 0.138618 + }, + { + "epoch": 0.7408456021140053, + "grad_norm": 0.6551487445831299, + "learning_rate": 3.453108240104188e-05, + "loss": 0.23314218521118163, + "memory(GiB)": 91.64, + "step": 7850, + "token_acc": 0.9189952904238619, + "train_speed(iter/s)": 0.138618 + }, + { + "epoch": 0.7413174782936958, + "grad_norm": 0.4284408986568451, + "learning_rate": 3.4413204472022576e-05, + "loss": 0.22884924411773683, + "memory(GiB)": 91.64, + "step": 7855, + "token_acc": 0.9177083333333333, + "train_speed(iter/s)": 0.138619 + }, + { + "epoch": 0.7417893544733862, + "grad_norm": 0.2609077990055084, + "learning_rate": 3.429548625672365e-05, + "loss": 0.22607100009918213, + "memory(GiB)": 91.64, + "step": 7860, + "token_acc": 0.9041233657391887, + "train_speed(iter/s)": 0.138619 + }, + { + "epoch": 0.7422612306530766, + "grad_norm": 0.7349159717559814, + "learning_rate": 3.417792804180666e-05, + "loss": 0.23114871978759766, + "memory(GiB)": 91.64, + "step": 7865, + "token_acc": 0.8985565356856455, + "train_speed(iter/s)": 0.13862 + }, + { + "epoch": 0.742733106832767, + "grad_norm": 0.2900051772594452, + "learning_rate": 3.406053011354357e-05, + "loss": 0.22995190620422362, + "memory(GiB)": 91.64, + "step": 7870, + "token_acc": 0.9049773755656109, + "train_speed(iter/s)": 0.138622 + }, + { + "epoch": 0.7432049830124575, + "grad_norm": 0.44393131136894226, + "learning_rate": 3.394329275781604e-05, + "loss": 0.2241837501525879, + "memory(GiB)": 91.64, + "step": 7875, + "token_acc": 0.9156214367160775, + "train_speed(iter/s)": 0.138623 + }, + { + "epoch": 0.743676859192148, + "grad_norm": 0.7309471368789673, + "learning_rate": 3.3826216260114604e-05, + "loss": 0.22879295349121093, + "memory(GiB)": 91.64, + "step": 7880, + "token_acc": 0.9144460028050491, + "train_speed(iter/s)": 0.138624 + }, + { + "epoch": 0.7441487353718385, + "grad_norm": 0.24232062697410583, + "learning_rate": 3.370930090553821e-05, + "loss": 0.2347412347793579, + "memory(GiB)": 91.64, + "step": 7885, + "token_acc": 0.9253786479497599, + "train_speed(iter/s)": 0.138625 + }, + { + "epoch": 0.7446206115515289, + "grad_norm": 0.8616470694541931, + "learning_rate": 3.3592546978793327e-05, + "loss": 0.23368425369262696, + "memory(GiB)": 91.64, + "step": 7890, + "token_acc": 0.9122157588577472, + "train_speed(iter/s)": 0.138626 + }, + { + "epoch": 0.7450924877312193, + "grad_norm": 0.5295524597167969, + "learning_rate": 3.347595476419335e-05, + "loss": 0.23362338542938232, + "memory(GiB)": 91.64, + "step": 7895, + "token_acc": 0.910377358490566, + "train_speed(iter/s)": 0.138626 + }, + { + "epoch": 0.7455643639109097, + "grad_norm": 0.2784343659877777, + "learning_rate": 3.335952454565787e-05, + "loss": 0.23343799114227295, + "memory(GiB)": 91.64, + "step": 7900, + "token_acc": 0.8851380973257343, + "train_speed(iter/s)": 0.138627 + }, + { + "epoch": 0.7460362400906002, + "grad_norm": 0.29195478558540344, + "learning_rate": 3.324325660671205e-05, + "loss": 0.22837295532226562, + "memory(GiB)": 91.64, + "step": 7905, + "token_acc": 0.9181434599156119, + "train_speed(iter/s)": 0.138627 + }, + { + "epoch": 0.7465081162702907, + "grad_norm": 0.3233549892902374, + "learning_rate": 3.312715123048572e-05, + "loss": 0.2293907642364502, + "memory(GiB)": 91.64, + "step": 7910, + "token_acc": 0.9086402266288952, + "train_speed(iter/s)": 0.138627 + }, + { + "epoch": 0.7469799924499811, + "grad_norm": 0.25969669222831726, + "learning_rate": 3.3011208699713015e-05, + "loss": 0.22522926330566406, + "memory(GiB)": 91.64, + "step": 7915, + "token_acc": 0.922077922077922, + "train_speed(iter/s)": 0.138627 + }, + { + "epoch": 0.7474518686296716, + "grad_norm": 0.2135898917913437, + "learning_rate": 3.2895429296731426e-05, + "loss": 0.22411694526672363, + "memory(GiB)": 91.64, + "step": 7920, + "token_acc": 0.9312614259597807, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7479237448093621, + "grad_norm": 0.218247190117836, + "learning_rate": 3.2779813303481256e-05, + "loss": 0.22187356948852538, + "memory(GiB)": 91.64, + "step": 7925, + "token_acc": 0.9370489174017642, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7483956209890524, + "grad_norm": 0.21325546503067017, + "learning_rate": 3.2664361001504864e-05, + "loss": 0.22855515480041505, + "memory(GiB)": 91.64, + "step": 7930, + "token_acc": 0.9257692307692308, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7488674971687429, + "grad_norm": 0.24248625338077545, + "learning_rate": 3.2549072671945924e-05, + "loss": 0.22276439666748046, + "memory(GiB)": 91.64, + "step": 7935, + "token_acc": 0.9157955865272939, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7493393733484334, + "grad_norm": 0.2705487906932831, + "learning_rate": 3.243394859554891e-05, + "loss": 0.2265183687210083, + "memory(GiB)": 91.64, + "step": 7940, + "token_acc": 0.9164098613251156, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7498112495281238, + "grad_norm": 0.2557106614112854, + "learning_rate": 3.231898905265829e-05, + "loss": 0.22840962409973145, + "memory(GiB)": 91.64, + "step": 7945, + "token_acc": 0.9238232123607618, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7502831257078143, + "grad_norm": 0.36380380392074585, + "learning_rate": 3.220419432321783e-05, + "loss": 0.23431000709533692, + "memory(GiB)": 91.64, + "step": 7950, + "token_acc": 0.9212121212121213, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7507550018875048, + "grad_norm": 0.2633918821811676, + "learning_rate": 3.2089564686770004e-05, + "loss": 0.22273101806640624, + "memory(GiB)": 91.64, + "step": 7955, + "token_acc": 0.9136276391554703, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7512268780671951, + "grad_norm": 0.3842732906341553, + "learning_rate": 3.197510042245524e-05, + "loss": 0.2252732992172241, + "memory(GiB)": 91.64, + "step": 7960, + "token_acc": 0.9360613810741688, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7516987542468856, + "grad_norm": 0.27877315878868103, + "learning_rate": 3.186080180901121e-05, + "loss": 0.2262340545654297, + "memory(GiB)": 91.64, + "step": 7965, + "token_acc": 0.920619554695063, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7521706304265761, + "grad_norm": 0.23689651489257812, + "learning_rate": 3.1746669124772264e-05, + "loss": 0.22646026611328124, + "memory(GiB)": 91.64, + "step": 7970, + "token_acc": 0.8999530295913575, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7526425066062665, + "grad_norm": 0.5697574615478516, + "learning_rate": 3.1632702647668664e-05, + "loss": 0.22735657691955566, + "memory(GiB)": 91.64, + "step": 7975, + "token_acc": 0.9176543980037429, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.753114382785957, + "grad_norm": 0.7894372940063477, + "learning_rate": 3.1518902655225954e-05, + "loss": 0.22902073860168456, + "memory(GiB)": 91.64, + "step": 7980, + "token_acc": 0.9288161400837457, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7535862589656475, + "grad_norm": 0.26597410440444946, + "learning_rate": 3.1405269424564244e-05, + "loss": 0.2152813196182251, + "memory(GiB)": 91.64, + "step": 7985, + "token_acc": 0.9042145593869731, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7540581351453378, + "grad_norm": 0.5665609240531921, + "learning_rate": 3.1291803232397576e-05, + "loss": 0.22755122184753418, + "memory(GiB)": 91.64, + "step": 7990, + "token_acc": 0.9204496325118893, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7545300113250283, + "grad_norm": 0.2860982120037079, + "learning_rate": 3.117850435503315e-05, + "loss": 0.2274768829345703, + "memory(GiB)": 91.64, + "step": 7995, + "token_acc": 0.916327716443928, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7550018875047187, + "grad_norm": 0.3887801170349121, + "learning_rate": 3.106537306837084e-05, + "loss": 0.2311309814453125, + "memory(GiB)": 91.64, + "step": 8000, + "token_acc": 0.9066347469220246, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7554737636844092, + "grad_norm": 0.3887653648853302, + "learning_rate": 3.095240964790233e-05, + "loss": 0.2242722988128662, + "memory(GiB)": 91.64, + "step": 8005, + "token_acc": 0.9280346820809249, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7559456398640997, + "grad_norm": 0.3202626705169678, + "learning_rate": 3.083961436871057e-05, + "loss": 0.22988688945770264, + "memory(GiB)": 91.64, + "step": 8010, + "token_acc": 0.924455205811138, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7564175160437902, + "grad_norm": 0.316771537065506, + "learning_rate": 3.072698750546906e-05, + "loss": 0.22806041240692138, + "memory(GiB)": 91.64, + "step": 8015, + "token_acc": 0.9110389610389611, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7568893922234805, + "grad_norm": 0.3148418366909027, + "learning_rate": 3.061452933244112e-05, + "loss": 0.2297208547592163, + "memory(GiB)": 91.64, + "step": 8020, + "token_acc": 0.9350031705770451, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.757361268403171, + "grad_norm": 0.2092905342578888, + "learning_rate": 3.0502240123479366e-05, + "loss": 0.2229100227355957, + "memory(GiB)": 91.64, + "step": 8025, + "token_acc": 0.9143859649122807, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7578331445828614, + "grad_norm": 0.43824535608291626, + "learning_rate": 3.0390120152024915e-05, + "loss": 0.2252873659133911, + "memory(GiB)": 91.64, + "step": 8030, + "token_acc": 0.9152276295133438, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7583050207625519, + "grad_norm": 0.35322973132133484, + "learning_rate": 3.0278169691106785e-05, + "loss": 0.22799272537231446, + "memory(GiB)": 91.64, + "step": 8035, + "token_acc": 0.9206668582926129, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7587768969422424, + "grad_norm": 0.521074652671814, + "learning_rate": 3.016638901334118e-05, + "loss": 0.22861776351928711, + "memory(GiB)": 91.64, + "step": 8040, + "token_acc": 0.9213483146067416, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7592487731219328, + "grad_norm": 0.38683828711509705, + "learning_rate": 3.0054778390930925e-05, + "loss": 0.22503888607025146, + "memory(GiB)": 91.64, + "step": 8045, + "token_acc": 0.9049630411826821, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7597206493016233, + "grad_norm": 0.20460493862628937, + "learning_rate": 2.9943338095664632e-05, + "loss": 0.22331924438476564, + "memory(GiB)": 91.64, + "step": 8050, + "token_acc": 0.9197936210131332, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7601925254813137, + "grad_norm": 0.31732818484306335, + "learning_rate": 2.9832068398916212e-05, + "loss": 0.23599157333374024, + "memory(GiB)": 91.64, + "step": 8055, + "token_acc": 0.9237121510027526, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7606644016610041, + "grad_norm": 0.7717975974082947, + "learning_rate": 2.972096957164413e-05, + "loss": 0.23142099380493164, + "memory(GiB)": 91.64, + "step": 8060, + "token_acc": 0.928115552569701, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7611362778406946, + "grad_norm": 0.44531887769699097, + "learning_rate": 2.961004188439077e-05, + "loss": 0.2313159227371216, + "memory(GiB)": 91.64, + "step": 8065, + "token_acc": 0.9150915963943007, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7616081540203851, + "grad_norm": 0.3463146984577179, + "learning_rate": 2.9499285607281725e-05, + "loss": 0.22837791442871094, + "memory(GiB)": 91.64, + "step": 8070, + "token_acc": 0.9326069410815173, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7620800302000755, + "grad_norm": 0.2808953523635864, + "learning_rate": 2.9388701010025243e-05, + "loss": 0.2212012529373169, + "memory(GiB)": 91.64, + "step": 8075, + "token_acc": 0.9138222372423116, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.762551906379766, + "grad_norm": 0.3945287764072418, + "learning_rate": 2.9278288361911423e-05, + "loss": 0.2277526378631592, + "memory(GiB)": 91.64, + "step": 8080, + "token_acc": 0.9124365482233503, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7630237825594564, + "grad_norm": 0.8451266288757324, + "learning_rate": 2.9168047931811683e-05, + "loss": 0.2179340124130249, + "memory(GiB)": 91.64, + "step": 8085, + "token_acc": 0.9067501739735561, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7634956587391468, + "grad_norm": 0.43402817845344543, + "learning_rate": 2.9057979988178087e-05, + "loss": 0.2228691577911377, + "memory(GiB)": 91.64, + "step": 8090, + "token_acc": 0.9226860254083484, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7639675349188373, + "grad_norm": 0.25339168310165405, + "learning_rate": 2.894808479904263e-05, + "loss": 0.22564988136291503, + "memory(GiB)": 91.64, + "step": 8095, + "token_acc": 0.9321125502456454, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7644394110985278, + "grad_norm": 0.902864396572113, + "learning_rate": 2.883836263201669e-05, + "loss": 0.22950098514556885, + "memory(GiB)": 91.64, + "step": 8100, + "token_acc": 0.9148484848484848, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7649112872782182, + "grad_norm": 0.28512996435165405, + "learning_rate": 2.8728813754290196e-05, + "loss": 0.22164077758789064, + "memory(GiB)": 91.64, + "step": 8105, + "token_acc": 0.9126436781609195, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7653831634579087, + "grad_norm": 0.21365024149417877, + "learning_rate": 2.8619438432631185e-05, + "loss": 0.22118642330169677, + "memory(GiB)": 91.64, + "step": 8110, + "token_acc": 0.9141651031894934, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.765855039637599, + "grad_norm": 0.26842251420021057, + "learning_rate": 2.8510236933385048e-05, + "loss": 0.22353811264038087, + "memory(GiB)": 91.64, + "step": 8115, + "token_acc": 0.9207392197125257, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7663269158172895, + "grad_norm": 0.4451298415660858, + "learning_rate": 2.8401209522473804e-05, + "loss": 0.22560317516326905, + "memory(GiB)": 91.64, + "step": 8120, + "token_acc": 0.9227367325702394, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.76679879199698, + "grad_norm": 0.3383225202560425, + "learning_rate": 2.8292356465395687e-05, + "loss": 0.2258366107940674, + "memory(GiB)": 91.64, + "step": 8125, + "token_acc": 0.9120879120879121, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7672706681766704, + "grad_norm": 0.2160254716873169, + "learning_rate": 2.8183678027224292e-05, + "loss": 0.223410964012146, + "memory(GiB)": 91.64, + "step": 8130, + "token_acc": 0.8991111111111111, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7677425443563609, + "grad_norm": 0.34852781891822815, + "learning_rate": 2.8075174472607902e-05, + "loss": 0.21970810890197753, + "memory(GiB)": 91.64, + "step": 8135, + "token_acc": 0.9127594158339739, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7682144205360514, + "grad_norm": 0.6557506918907166, + "learning_rate": 2.7966846065769036e-05, + "loss": 0.23110716342926024, + "memory(GiB)": 91.64, + "step": 8140, + "token_acc": 0.9102831594634874, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7686862967157417, + "grad_norm": 0.6803750991821289, + "learning_rate": 2.7858693070503718e-05, + "loss": 0.22070074081420898, + "memory(GiB)": 91.64, + "step": 8145, + "token_acc": 0.9156667809393212, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7691581728954322, + "grad_norm": 0.44910258054733276, + "learning_rate": 2.7750715750180655e-05, + "loss": 0.22234528064727782, + "memory(GiB)": 91.64, + "step": 8150, + "token_acc": 0.9155890804597702, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7696300490751227, + "grad_norm": 0.35992804169654846, + "learning_rate": 2.7642914367741e-05, + "loss": 0.2249147415161133, + "memory(GiB)": 91.64, + "step": 8155, + "token_acc": 0.9267461669505963, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7701019252548131, + "grad_norm": 0.5502942204475403, + "learning_rate": 2.753528918569732e-05, + "loss": 0.22030355930328369, + "memory(GiB)": 91.64, + "step": 8160, + "token_acc": 0.9201444622792937, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7705738014345036, + "grad_norm": 0.6990233659744263, + "learning_rate": 2.742784046613309e-05, + "loss": 0.22877583503723145, + "memory(GiB)": 91.64, + "step": 8165, + "token_acc": 0.9318885448916409, + "train_speed(iter/s)": 0.138628 + }, + { + "epoch": 0.7710456776141941, + "grad_norm": 0.24115338921546936, + "learning_rate": 2.7320568470702145e-05, + "loss": 0.2234677791595459, + "memory(GiB)": 91.64, + "step": 8170, + "token_acc": 0.9176334106728539, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.7715175537938845, + "grad_norm": 0.4765714108943939, + "learning_rate": 2.721347346062797e-05, + "loss": 0.22813882827758789, + "memory(GiB)": 91.64, + "step": 8175, + "token_acc": 0.9207409656847859, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7719894299735749, + "grad_norm": 0.41965359449386597, + "learning_rate": 2.7106555696702952e-05, + "loss": 0.22575123310089112, + "memory(GiB)": 91.64, + "step": 8180, + "token_acc": 0.9118468288791384, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7724613061532654, + "grad_norm": 0.32254064083099365, + "learning_rate": 2.6999815439288044e-05, + "loss": 0.22551360130310058, + "memory(GiB)": 91.64, + "step": 8185, + "token_acc": 0.9281748785565579, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7729331823329558, + "grad_norm": 0.2386188954114914, + "learning_rate": 2.6893252948311766e-05, + "loss": 0.23226511478424072, + "memory(GiB)": 91.64, + "step": 8190, + "token_acc": 0.9106433677521842, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.7734050585126463, + "grad_norm": 0.47129666805267334, + "learning_rate": 2.6786868483269856e-05, + "loss": 0.21903567314147948, + "memory(GiB)": 91.64, + "step": 8195, + "token_acc": 0.9255419415645617, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7738769346923368, + "grad_norm": 0.24712376296520233, + "learning_rate": 2.668066230322449e-05, + "loss": 0.2212503433227539, + "memory(GiB)": 91.64, + "step": 8200, + "token_acc": 0.9146115906288532, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7743488108720272, + "grad_norm": 0.24980393052101135, + "learning_rate": 2.657463466680372e-05, + "loss": 0.22688717842102052, + "memory(GiB)": 91.64, + "step": 8205, + "token_acc": 0.9009870450339297, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7748206870517176, + "grad_norm": 0.3085843026638031, + "learning_rate": 2.6468785832200793e-05, + "loss": 0.2235793113708496, + "memory(GiB)": 91.64, + "step": 8210, + "token_acc": 0.9282822440717178, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.775292563231408, + "grad_norm": 0.22891342639923096, + "learning_rate": 2.6363116057173588e-05, + "loss": 0.21712026596069336, + "memory(GiB)": 91.64, + "step": 8215, + "token_acc": 0.9137645107794361, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7757644394110985, + "grad_norm": 0.35751280188560486, + "learning_rate": 2.6257625599043844e-05, + "loss": 0.22840476036071777, + "memory(GiB)": 91.64, + "step": 8220, + "token_acc": 0.9042871385842473, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.776236315590789, + "grad_norm": 0.36552250385284424, + "learning_rate": 2.6152314714696757e-05, + "loss": 0.21927356719970703, + "memory(GiB)": 91.64, + "step": 8225, + "token_acc": 0.9125596184419714, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7767081917704795, + "grad_norm": 0.2750298082828522, + "learning_rate": 2.60471836605802e-05, + "loss": 0.21922688484191893, + "memory(GiB)": 91.64, + "step": 8230, + "token_acc": 0.9250343878954608, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7771800679501699, + "grad_norm": 0.4435797333717346, + "learning_rate": 2.5942232692704017e-05, + "loss": 0.22108829021453857, + "memory(GiB)": 91.64, + "step": 8235, + "token_acc": 0.9235737351991389, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7776519441298603, + "grad_norm": 0.48604586720466614, + "learning_rate": 2.5837462066639718e-05, + "loss": 0.23142728805541993, + "memory(GiB)": 91.64, + "step": 8240, + "token_acc": 0.9114088159031979, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7781238203095507, + "grad_norm": 0.243556946516037, + "learning_rate": 2.573287203751955e-05, + "loss": 0.22401127815246583, + "memory(GiB)": 91.64, + "step": 8245, + "token_acc": 0.9081429990069514, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7785956964892412, + "grad_norm": 0.3457433581352234, + "learning_rate": 2.562846286003592e-05, + "loss": 0.22924907207489015, + "memory(GiB)": 91.64, + "step": 8250, + "token_acc": 0.914388705316912, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7790675726689317, + "grad_norm": 0.2969830334186554, + "learning_rate": 2.5524234788440905e-05, + "loss": 0.226761794090271, + "memory(GiB)": 91.64, + "step": 8255, + "token_acc": 0.9296254256526674, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7795394488486221, + "grad_norm": 0.5631513595581055, + "learning_rate": 2.5420188076545603e-05, + "loss": 0.21923332214355468, + "memory(GiB)": 91.64, + "step": 8260, + "token_acc": 0.9157566302652106, + "train_speed(iter/s)": 0.138631 + }, + { + "epoch": 0.7800113250283126, + "grad_norm": 0.24374043941497803, + "learning_rate": 2.531632297771931e-05, + "loss": 0.22521307468414306, + "memory(GiB)": 91.64, + "step": 8265, + "token_acc": 0.9289855072463769, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7804832012080031, + "grad_norm": 0.6146330833435059, + "learning_rate": 2.5212639744889312e-05, + "loss": 0.22694334983825684, + "memory(GiB)": 91.64, + "step": 8270, + "token_acc": 0.9116894197952219, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7809550773876934, + "grad_norm": 0.21018651127815247, + "learning_rate": 2.5109138630539797e-05, + "loss": 0.22090816497802734, + "memory(GiB)": 91.64, + "step": 8275, + "token_acc": 0.9197469197469198, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7814269535673839, + "grad_norm": 0.2725118100643158, + "learning_rate": 2.5005819886711578e-05, + "loss": 0.21987648010253907, + "memory(GiB)": 91.64, + "step": 8280, + "token_acc": 0.9179578359156718, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7818988297470744, + "grad_norm": 0.6226215362548828, + "learning_rate": 2.4902683765001355e-05, + "loss": 0.21619763374328613, + "memory(GiB)": 91.64, + "step": 8285, + "token_acc": 0.9423529411764706, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7823707059267648, + "grad_norm": 0.2744337320327759, + "learning_rate": 2.4799730516561147e-05, + "loss": 0.22009749412536622, + "memory(GiB)": 91.64, + "step": 8290, + "token_acc": 0.9216516675489677, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7828425821064553, + "grad_norm": 0.284661203622818, + "learning_rate": 2.4696960392097523e-05, + "loss": 0.22542271614074708, + "memory(GiB)": 91.64, + "step": 8295, + "token_acc": 0.9180474697716077, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7833144582861458, + "grad_norm": 0.25613096356391907, + "learning_rate": 2.4594373641871314e-05, + "loss": 0.22844905853271485, + "memory(GiB)": 91.64, + "step": 8300, + "token_acc": 0.9212598425196851, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7837863344658361, + "grad_norm": 0.5812061429023743, + "learning_rate": 2.4491970515696626e-05, + "loss": 0.22294883728027343, + "memory(GiB)": 91.64, + "step": 8305, + "token_acc": 0.9196940726577438, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7842582106455266, + "grad_norm": 0.28404125571250916, + "learning_rate": 2.4389751262940498e-05, + "loss": 0.22034523487091065, + "memory(GiB)": 91.64, + "step": 8310, + "token_acc": 0.915719325754606, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.784730086825217, + "grad_norm": 0.3765624463558197, + "learning_rate": 2.4287716132522243e-05, + "loss": 0.22550301551818847, + "memory(GiB)": 91.64, + "step": 8315, + "token_acc": 0.923728813559322, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7852019630049075, + "grad_norm": 0.4880436956882477, + "learning_rate": 2.41858653729127e-05, + "loss": 0.22433314323425294, + "memory(GiB)": 91.64, + "step": 8320, + "token_acc": 0.9113504556752279, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.785673839184598, + "grad_norm": 0.5990275144577026, + "learning_rate": 2.4084199232133797e-05, + "loss": 0.22993867397308348, + "memory(GiB)": 91.64, + "step": 8325, + "token_acc": 0.9129451667608819, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7861457153642885, + "grad_norm": 0.559020459651947, + "learning_rate": 2.3982717957757995e-05, + "loss": 0.22906913757324218, + "memory(GiB)": 91.64, + "step": 8330, + "token_acc": 0.9164596273291925, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7866175915439788, + "grad_norm": 0.3202402591705322, + "learning_rate": 2.3881421796907366e-05, + "loss": 0.22067337036132811, + "memory(GiB)": 91.64, + "step": 8335, + "token_acc": 0.9253401986024273, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7870894677236693, + "grad_norm": 0.34719038009643555, + "learning_rate": 2.378031099625334e-05, + "loss": 0.223028564453125, + "memory(GiB)": 91.64, + "step": 8340, + "token_acc": 0.9278810408921933, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7875613439033597, + "grad_norm": 0.39138731360435486, + "learning_rate": 2.3679385802015987e-05, + "loss": 0.2211979389190674, + "memory(GiB)": 91.64, + "step": 8345, + "token_acc": 0.9202751922298664, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7880332200830502, + "grad_norm": 0.24618585407733917, + "learning_rate": 2.3578646459963272e-05, + "loss": 0.2210986375808716, + "memory(GiB)": 91.64, + "step": 8350, + "token_acc": 0.9225880993645291, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7885050962627407, + "grad_norm": 0.4435372054576874, + "learning_rate": 2.347809321541069e-05, + "loss": 0.2195359230041504, + "memory(GiB)": 91.64, + "step": 8355, + "token_acc": 0.9158878504672897, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7889769724424311, + "grad_norm": 0.21401335299015045, + "learning_rate": 2.337772631322054e-05, + "loss": 0.2184643030166626, + "memory(GiB)": 91.64, + "step": 8360, + "token_acc": 0.935247582771755, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.7894488486221215, + "grad_norm": 0.2693486511707306, + "learning_rate": 2.327754599780132e-05, + "loss": 0.22365641593933105, + "memory(GiB)": 91.64, + "step": 8365, + "token_acc": 0.9265658747300216, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.789920724801812, + "grad_norm": 0.3792926073074341, + "learning_rate": 2.317755251310719e-05, + "loss": 0.21845946311950684, + "memory(GiB)": 91.64, + "step": 8370, + "token_acc": 0.9219151670951157, + "train_speed(iter/s)": 0.138633 + }, + { + "epoch": 0.7903926009815024, + "grad_norm": 0.23595775663852692, + "learning_rate": 2.3077746102637364e-05, + "loss": 0.22609164714813232, + "memory(GiB)": 91.64, + "step": 8375, + "token_acc": 0.9150650960942344, + "train_speed(iter/s)": 0.138635 + }, + { + "epoch": 0.7908644771611929, + "grad_norm": 0.32832691073417664, + "learning_rate": 2.297812700943539e-05, + "loss": 0.2214564323425293, + "memory(GiB)": 91.64, + "step": 8380, + "token_acc": 0.9153902261123268, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7913363533408834, + "grad_norm": 0.36022377014160156, + "learning_rate": 2.2878695476088873e-05, + "loss": 0.2199798345565796, + "memory(GiB)": 91.64, + "step": 8385, + "token_acc": 0.9154995331465919, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7918082295205738, + "grad_norm": 0.449059396982193, + "learning_rate": 2.2779451744728474e-05, + "loss": 0.2180879831314087, + "memory(GiB)": 91.64, + "step": 8390, + "token_acc": 0.9329475833900612, + "train_speed(iter/s)": 0.138635 + }, + { + "epoch": 0.7922801057002643, + "grad_norm": 0.28043511509895325, + "learning_rate": 2.268039605702765e-05, + "loss": 0.22389302253723145, + "memory(GiB)": 91.64, + "step": 8395, + "token_acc": 0.932570977917981, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.7927519818799547, + "grad_norm": 0.27153950929641724, + "learning_rate": 2.2581528654201943e-05, + "loss": 0.2266615390777588, + "memory(GiB)": 91.64, + "step": 8400, + "token_acc": 0.929364278506559, + "train_speed(iter/s)": 0.138635 + }, + { + "epoch": 0.7932238580596451, + "grad_norm": 0.22254140675067902, + "learning_rate": 2.2482849777008308e-05, + "loss": 0.22057442665100097, + "memory(GiB)": 91.64, + "step": 8405, + "token_acc": 0.9215481171548117, + "train_speed(iter/s)": 0.138636 + }, + { + "epoch": 0.7936957342393356, + "grad_norm": 0.24786394834518433, + "learning_rate": 2.2384359665744657e-05, + "loss": 0.21846373081207277, + "memory(GiB)": 91.64, + "step": 8410, + "token_acc": 0.916236062007071, + "train_speed(iter/s)": 0.138636 + }, + { + "epoch": 0.7941676104190261, + "grad_norm": 0.3908824622631073, + "learning_rate": 2.2286058560249325e-05, + "loss": 0.22614388465881347, + "memory(GiB)": 91.64, + "step": 8415, + "token_acc": 0.9074074074074074, + "train_speed(iter/s)": 0.138637 + }, + { + "epoch": 0.7946394865987165, + "grad_norm": 0.8846114277839661, + "learning_rate": 2.2187946699900218e-05, + "loss": 0.2331876277923584, + "memory(GiB)": 91.64, + "step": 8420, + "token_acc": 0.9200567644276254, + "train_speed(iter/s)": 0.138637 + }, + { + "epoch": 0.795111362778407, + "grad_norm": 0.59001225233078, + "learning_rate": 2.2090024323614524e-05, + "loss": 0.22399048805236815, + "memory(GiB)": 91.64, + "step": 8425, + "token_acc": 0.923191278493558, + "train_speed(iter/s)": 0.138636 + }, + { + "epoch": 0.7955832389580973, + "grad_norm": 0.5200862288475037, + "learning_rate": 2.1992291669847974e-05, + "loss": 0.22100448608398438, + "memory(GiB)": 91.64, + "step": 8430, + "token_acc": 0.9250924784217016, + "train_speed(iter/s)": 0.138637 + }, + { + "epoch": 0.7960551151377878, + "grad_norm": 0.3765367865562439, + "learning_rate": 2.189474897659426e-05, + "loss": 0.21650943756103516, + "memory(GiB)": 91.64, + "step": 8435, + "token_acc": 0.9229422066549913, + "train_speed(iter/s)": 0.138638 + }, + { + "epoch": 0.7965269913174783, + "grad_norm": 0.5452553033828735, + "learning_rate": 2.1797396481384546e-05, + "loss": 0.22537777423858643, + "memory(GiB)": 91.64, + "step": 8440, + "token_acc": 0.9138381201044387, + "train_speed(iter/s)": 0.138638 + }, + { + "epoch": 0.7969988674971688, + "grad_norm": 0.2730084955692291, + "learning_rate": 2.1700234421286804e-05, + "loss": 0.2295698881149292, + "memory(GiB)": 91.64, + "step": 8445, + "token_acc": 0.9063561377971858, + "train_speed(iter/s)": 0.138639 + }, + { + "epoch": 0.7974707436768592, + "grad_norm": 0.2253396362066269, + "learning_rate": 2.1603263032905284e-05, + "loss": 0.22352323532104493, + "memory(GiB)": 91.64, + "step": 8450, + "token_acc": 0.939419795221843, + "train_speed(iter/s)": 0.138639 + }, + { + "epoch": 0.7979426198565497, + "grad_norm": 0.9259458184242249, + "learning_rate": 2.1506482552379915e-05, + "loss": 0.22939915657043458, + "memory(GiB)": 91.64, + "step": 8455, + "token_acc": 0.9181561618062088, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.79841449603624, + "grad_norm": 0.34216970205307007, + "learning_rate": 2.1409893215385758e-05, + "loss": 0.21892695426940917, + "memory(GiB)": 91.64, + "step": 8460, + "token_acc": 0.9082747853239657, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.7988863722159305, + "grad_norm": 0.30453988909721375, + "learning_rate": 2.1313495257132333e-05, + "loss": 0.22130227088928223, + "memory(GiB)": 91.64, + "step": 8465, + "token_acc": 0.9300937766410913, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.799358248395621, + "grad_norm": 0.36281198263168335, + "learning_rate": 2.121728891236322e-05, + "loss": 0.2198082685470581, + "memory(GiB)": 91.64, + "step": 8470, + "token_acc": 0.9177984274481773, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.7998301245753114, + "grad_norm": 0.23792099952697754, + "learning_rate": 2.112127441535534e-05, + "loss": 0.2212691307067871, + "memory(GiB)": 91.64, + "step": 8475, + "token_acc": 0.9101941747572816, + "train_speed(iter/s)": 0.138639 + }, + { + "epoch": 0.8003020007550019, + "grad_norm": 0.3405323028564453, + "learning_rate": 2.1025451999918454e-05, + "loss": 0.2203970432281494, + "memory(GiB)": 91.64, + "step": 8480, + "token_acc": 0.925, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.8007738769346924, + "grad_norm": 0.24904760718345642, + "learning_rate": 2.0929821899394588e-05, + "loss": 0.22575736045837402, + "memory(GiB)": 91.64, + "step": 8485, + "token_acc": 0.886896551724138, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8012457531143827, + "grad_norm": 0.2277214378118515, + "learning_rate": 2.0834384346657386e-05, + "loss": 0.22250080108642578, + "memory(GiB)": 91.64, + "step": 8490, + "token_acc": 0.9332630839480155, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8017176292940732, + "grad_norm": 0.354336142539978, + "learning_rate": 2.0739139574111677e-05, + "loss": 0.21643447875976562, + "memory(GiB)": 91.64, + "step": 8495, + "token_acc": 0.9221705426356589, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.8021895054737637, + "grad_norm": 0.2935597002506256, + "learning_rate": 2.0644087813692815e-05, + "loss": 0.22014429569244384, + "memory(GiB)": 91.64, + "step": 8500, + "token_acc": 0.920774647887324, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8026613816534541, + "grad_norm": 0.27267611026763916, + "learning_rate": 2.0549229296866158e-05, + "loss": 0.2266261339187622, + "memory(GiB)": 91.64, + "step": 8505, + "token_acc": 0.9265078560567663, + "train_speed(iter/s)": 0.138641 + }, + { + "epoch": 0.8031332578331446, + "grad_norm": 0.250590980052948, + "learning_rate": 2.0454564254626473e-05, + "loss": 0.21946268081665038, + "memory(GiB)": 91.64, + "step": 8510, + "token_acc": 0.9360582760016187, + "train_speed(iter/s)": 0.138643 + }, + { + "epoch": 0.8036051340128351, + "grad_norm": 0.2068144679069519, + "learning_rate": 2.0360092917497408e-05, + "loss": 0.2195216417312622, + "memory(GiB)": 91.64, + "step": 8515, + "token_acc": 0.9289740698985344, + "train_speed(iter/s)": 0.138643 + }, + { + "epoch": 0.8040770101925255, + "grad_norm": 0.3886139392852783, + "learning_rate": 2.0265815515530838e-05, + "loss": 0.2155540943145752, + "memory(GiB)": 91.64, + "step": 8520, + "token_acc": 0.9433656957928802, + "train_speed(iter/s)": 0.138643 + }, + { + "epoch": 0.8045488863722159, + "grad_norm": 0.31128185987472534, + "learning_rate": 2.0171732278306464e-05, + "loss": 0.2209153413772583, + "memory(GiB)": 91.64, + "step": 8525, + "token_acc": 0.9294670846394985, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8050207625519064, + "grad_norm": 0.5342454314231873, + "learning_rate": 2.007784343493112e-05, + "loss": 0.22390148639678956, + "memory(GiB)": 91.64, + "step": 8530, + "token_acc": 0.9230158730158731, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8054926387315968, + "grad_norm": 0.4349924325942993, + "learning_rate": 1.998414921403827e-05, + "loss": 0.21895236968994142, + "memory(GiB)": 91.64, + "step": 8535, + "token_acc": 0.9248291571753986, + "train_speed(iter/s)": 0.138642 + }, + { + "epoch": 0.8059645149112873, + "grad_norm": 0.5242533683776855, + "learning_rate": 1.989064984378747e-05, + "loss": 0.21892497539520264, + "memory(GiB)": 91.64, + "step": 8540, + "token_acc": 0.930921052631579, + "train_speed(iter/s)": 0.138643 + }, + { + "epoch": 0.8064363910909778, + "grad_norm": 0.34894755482673645, + "learning_rate": 1.9797345551863765e-05, + "loss": 0.21972031593322755, + "memory(GiB)": 91.64, + "step": 8545, + "token_acc": 0.9258809234507898, + "train_speed(iter/s)": 0.138644 + }, + { + "epoch": 0.8069082672706682, + "grad_norm": 0.46887141466140747, + "learning_rate": 1.9704236565477117e-05, + "loss": 0.21499874591827392, + "memory(GiB)": 91.64, + "step": 8550, + "token_acc": 0.9317157712305026, + "train_speed(iter/s)": 0.138644 + }, + { + "epoch": 0.8073801434503586, + "grad_norm": 0.414568692445755, + "learning_rate": 1.9611323111361935e-05, + "loss": 0.22043170928955078, + "memory(GiB)": 91.64, + "step": 8555, + "token_acc": 0.9253034547152195, + "train_speed(iter/s)": 0.138644 + }, + { + "epoch": 0.807852019630049, + "grad_norm": 0.23252899944782257, + "learning_rate": 1.951860541577647e-05, + "loss": 0.21883907318115234, + "memory(GiB)": 91.64, + "step": 8560, + "token_acc": 0.9171251719394773, + "train_speed(iter/s)": 0.138645 + }, + { + "epoch": 0.8083238958097395, + "grad_norm": 0.2740820348262787, + "learning_rate": 1.9426083704502273e-05, + "loss": 0.22658867835998536, + "memory(GiB)": 91.64, + "step": 8565, + "token_acc": 0.9211643420254699, + "train_speed(iter/s)": 0.138644 + }, + { + "epoch": 0.80879577198943, + "grad_norm": 0.44111067056655884, + "learning_rate": 1.9333758202843655e-05, + "loss": 0.22524876594543458, + "memory(GiB)": 91.64, + "step": 8570, + "token_acc": 0.9118528027385537, + "train_speed(iter/s)": 0.138644 + }, + { + "epoch": 0.8092676481691204, + "grad_norm": 0.20151256024837494, + "learning_rate": 1.924162913562707e-05, + "loss": 0.21728076934814453, + "memory(GiB)": 91.64, + "step": 8575, + "token_acc": 0.9278195488721804, + "train_speed(iter/s)": 0.138645 + }, + { + "epoch": 0.8097395243488109, + "grad_norm": 0.5388370752334595, + "learning_rate": 1.9149696727200695e-05, + "loss": 0.22715296745300292, + "memory(GiB)": 91.64, + "step": 8580, + "token_acc": 0.9294187425860023, + "train_speed(iter/s)": 0.138645 + }, + { + "epoch": 0.8102114005285013, + "grad_norm": 0.27115631103515625, + "learning_rate": 1.9057961201433772e-05, + "loss": 0.2197357416152954, + "memory(GiB)": 91.64, + "step": 8585, + "token_acc": 0.9310043668122271, + "train_speed(iter/s)": 0.138645 + }, + { + "epoch": 0.8106832767081917, + "grad_norm": 0.21140803396701813, + "learning_rate": 1.896642278171612e-05, + "loss": 0.2216893196105957, + "memory(GiB)": 91.64, + "step": 8590, + "token_acc": 0.9260104302477183, + "train_speed(iter/s)": 0.138646 + }, + { + "epoch": 0.8111551528878822, + "grad_norm": 0.33303895592689514, + "learning_rate": 1.8875081690957575e-05, + "loss": 0.22209582328796387, + "memory(GiB)": 91.64, + "step": 8595, + "token_acc": 0.9165911151405258, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8116270290675727, + "grad_norm": 0.2280055433511734, + "learning_rate": 1.8783938151587465e-05, + "loss": 0.2175816535949707, + "memory(GiB)": 91.64, + "step": 8600, + "token_acc": 0.9249368459040058, + "train_speed(iter/s)": 0.138646 + }, + { + "epoch": 0.8120989052472631, + "grad_norm": 0.3590617775917053, + "learning_rate": 1.8692992385553975e-05, + "loss": 0.2196737289428711, + "memory(GiB)": 91.64, + "step": 8605, + "token_acc": 0.9189448441247002, + "train_speed(iter/s)": 0.138646 + }, + { + "epoch": 0.8125707814269536, + "grad_norm": 0.6457542777061462, + "learning_rate": 1.860224461432377e-05, + "loss": 0.21764006614685058, + "memory(GiB)": 91.64, + "step": 8610, + "token_acc": 0.9244288224956063, + "train_speed(iter/s)": 0.138646 + }, + { + "epoch": 0.8130426576066441, + "grad_norm": 0.46899905800819397, + "learning_rate": 1.8511695058881316e-05, + "loss": 0.2232901096343994, + "memory(GiB)": 91.64, + "step": 8615, + "token_acc": 0.9269063611220861, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8135145337863344, + "grad_norm": 0.28184252977371216, + "learning_rate": 1.8421343939728442e-05, + "loss": 0.21828012466430663, + "memory(GiB)": 91.64, + "step": 8620, + "token_acc": 0.9019536903039074, + "train_speed(iter/s)": 0.138648 + }, + { + "epoch": 0.8139864099660249, + "grad_norm": 0.3113933205604553, + "learning_rate": 1.833119147688369e-05, + "loss": 0.21824917793273926, + "memory(GiB)": 91.64, + "step": 8625, + "token_acc": 0.9256303862112991, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8144582861457154, + "grad_norm": 0.30904653668403625, + "learning_rate": 1.8241237889881934e-05, + "loss": 0.2253275156021118, + "memory(GiB)": 91.64, + "step": 8630, + "token_acc": 0.9330543933054394, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8149301623254058, + "grad_norm": 0.6179171204566956, + "learning_rate": 1.815148339777363e-05, + "loss": 0.22154507637023926, + "memory(GiB)": 91.64, + "step": 8635, + "token_acc": 0.9194214876033058, + "train_speed(iter/s)": 0.138648 + }, + { + "epoch": 0.8154020385050963, + "grad_norm": 0.32482269406318665, + "learning_rate": 1.8061928219124503e-05, + "loss": 0.21969285011291503, + "memory(GiB)": 91.64, + "step": 8640, + "token_acc": 0.9162815982603969, + "train_speed(iter/s)": 0.138648 + }, + { + "epoch": 0.8158739146847868, + "grad_norm": 0.3604651093482971, + "learning_rate": 1.79725725720149e-05, + "loss": 0.21565718650817872, + "memory(GiB)": 91.64, + "step": 8645, + "token_acc": 0.924759080800593, + "train_speed(iter/s)": 0.138649 + }, + { + "epoch": 0.8163457908644771, + "grad_norm": 0.6190539002418518, + "learning_rate": 1.7883416674039278e-05, + "loss": 0.2202209711074829, + "memory(GiB)": 91.64, + "step": 8650, + "token_acc": 0.9248197734294542, + "train_speed(iter/s)": 0.138649 + }, + { + "epoch": 0.8168176670441676, + "grad_norm": 0.2979640066623688, + "learning_rate": 1.7794460742305696e-05, + "loss": 0.22345561981201173, + "memory(GiB)": 91.64, + "step": 8655, + "token_acc": 0.9106302916274694, + "train_speed(iter/s)": 0.13865 + }, + { + "epoch": 0.817289543223858, + "grad_norm": 0.49348747730255127, + "learning_rate": 1.770570499343517e-05, + "loss": 0.21452643871307372, + "memory(GiB)": 91.64, + "step": 8660, + "token_acc": 0.9195678271308524, + "train_speed(iter/s)": 0.13865 + }, + { + "epoch": 0.8177614194035485, + "grad_norm": 0.3675593137741089, + "learning_rate": 1.7617149643561358e-05, + "loss": 0.21637289524078368, + "memory(GiB)": 91.64, + "step": 8665, + "token_acc": 0.9341085271317829, + "train_speed(iter/s)": 0.138651 + }, + { + "epoch": 0.818233295583239, + "grad_norm": 0.5269376039505005, + "learning_rate": 1.752879490832985e-05, + "loss": 0.21624937057495117, + "memory(GiB)": 91.64, + "step": 8670, + "token_acc": 0.9225589225589226, + "train_speed(iter/s)": 0.138652 + }, + { + "epoch": 0.8187051717629295, + "grad_norm": 0.6057656407356262, + "learning_rate": 1.744064100289773e-05, + "loss": 0.21797895431518555, + "memory(GiB)": 91.64, + "step": 8675, + "token_acc": 0.9184810126582279, + "train_speed(iter/s)": 0.138653 + }, + { + "epoch": 0.8191770479426198, + "grad_norm": 0.44021740555763245, + "learning_rate": 1.7352688141933036e-05, + "loss": 0.2198997974395752, + "memory(GiB)": 91.64, + "step": 8680, + "token_acc": 0.9260405916752666, + "train_speed(iter/s)": 0.138652 + }, + { + "epoch": 0.8196489241223103, + "grad_norm": 0.24512818455696106, + "learning_rate": 1.726493653961425e-05, + "loss": 0.2200550079345703, + "memory(GiB)": 91.64, + "step": 8685, + "token_acc": 0.9201101928374655, + "train_speed(iter/s)": 0.138654 + }, + { + "epoch": 0.8201208003020007, + "grad_norm": 0.25874602794647217, + "learning_rate": 1.717738640962968e-05, + "loss": 0.21884992122650146, + "memory(GiB)": 91.64, + "step": 8690, + "token_acc": 0.9136848713119899, + "train_speed(iter/s)": 0.138653 + }, + { + "epoch": 0.8205926764816912, + "grad_norm": 0.36624372005462646, + "learning_rate": 1.7090037965177098e-05, + "loss": 0.22657575607299804, + "memory(GiB)": 91.64, + "step": 8695, + "token_acc": 0.917910447761194, + "train_speed(iter/s)": 0.138655 + }, + { + "epoch": 0.8210645526613817, + "grad_norm": 0.4566001296043396, + "learning_rate": 1.7002891418963107e-05, + "loss": 0.21981406211853027, + "memory(GiB)": 91.64, + "step": 8700, + "token_acc": 0.9316739873108834, + "train_speed(iter/s)": 0.138655 + }, + { + "epoch": 0.8215364288410721, + "grad_norm": 0.5014268755912781, + "learning_rate": 1.691594698320267e-05, + "loss": 0.21487929821014404, + "memory(GiB)": 91.64, + "step": 8705, + "token_acc": 0.9231936854887675, + "train_speed(iter/s)": 0.138654 + }, + { + "epoch": 0.8220083050207625, + "grad_norm": 0.5701547861099243, + "learning_rate": 1.6829204869618585e-05, + "loss": 0.2149423837661743, + "memory(GiB)": 91.64, + "step": 8710, + "token_acc": 0.9118796992481203, + "train_speed(iter/s)": 0.138653 + }, + { + "epoch": 0.822480181200453, + "grad_norm": 0.5961725115776062, + "learning_rate": 1.6742665289440973e-05, + "loss": 0.21888768672943115, + "memory(GiB)": 91.64, + "step": 8715, + "token_acc": 0.9206291148500366, + "train_speed(iter/s)": 0.138653 + }, + { + "epoch": 0.8229520573801434, + "grad_norm": 0.5988127589225769, + "learning_rate": 1.665632845340669e-05, + "loss": 0.22203760147094725, + "memory(GiB)": 91.64, + "step": 8720, + "token_acc": 0.9163961038961039, + "train_speed(iter/s)": 0.138654 + }, + { + "epoch": 0.8234239335598339, + "grad_norm": 0.2811583876609802, + "learning_rate": 1.6570194571758955e-05, + "loss": 0.21281654834747316, + "memory(GiB)": 91.64, + "step": 8725, + "token_acc": 0.9121813031161473, + "train_speed(iter/s)": 0.138655 + }, + { + "epoch": 0.8238958097395244, + "grad_norm": 0.39711859822273254, + "learning_rate": 1.648426385424675e-05, + "loss": 0.22025790214538574, + "memory(GiB)": 91.64, + "step": 8730, + "token_acc": 0.9323593073593074, + "train_speed(iter/s)": 0.138656 + }, + { + "epoch": 0.8243676859192148, + "grad_norm": 0.36098819971084595, + "learning_rate": 1.6398536510124285e-05, + "loss": 0.21392159461975097, + "memory(GiB)": 91.64, + "step": 8735, + "token_acc": 0.9101847872797594, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8248395620989053, + "grad_norm": 0.46026375889778137, + "learning_rate": 1.631301274815058e-05, + "loss": 0.21499810218811036, + "memory(GiB)": 91.64, + "step": 8740, + "token_acc": 0.9302019315188762, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8253114382785957, + "grad_norm": 0.34468331933021545, + "learning_rate": 1.622769277658882e-05, + "loss": 0.21223993301391603, + "memory(GiB)": 91.64, + "step": 8745, + "token_acc": 0.9186937687437521, + "train_speed(iter/s)": 0.138656 + }, + { + "epoch": 0.8257833144582861, + "grad_norm": 0.4011945426464081, + "learning_rate": 1.614257680320601e-05, + "loss": 0.22296814918518065, + "memory(GiB)": 91.64, + "step": 8750, + "token_acc": 0.9173285198555957, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8262551906379766, + "grad_norm": 0.23414309322834015, + "learning_rate": 1.605766503527236e-05, + "loss": 0.2172119140625, + "memory(GiB)": 91.64, + "step": 8755, + "token_acc": 0.9090909090909091, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8267270668176671, + "grad_norm": 0.49239152669906616, + "learning_rate": 1.597295767956081e-05, + "loss": 0.2250286340713501, + "memory(GiB)": 91.64, + "step": 8760, + "token_acc": 0.9303750919342976, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8271989429973575, + "grad_norm": 0.29153236746788025, + "learning_rate": 1.5888454942346498e-05, + "loss": 0.21357007026672364, + "memory(GiB)": 91.64, + "step": 8765, + "token_acc": 0.9249068501003153, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.827670819177048, + "grad_norm": 0.39535972476005554, + "learning_rate": 1.5804157029406364e-05, + "loss": 0.21768288612365722, + "memory(GiB)": 91.64, + "step": 8770, + "token_acc": 0.918443696221474, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8281426953567383, + "grad_norm": 0.3631436228752136, + "learning_rate": 1.5720064146018455e-05, + "loss": 0.21306240558624268, + "memory(GiB)": 91.64, + "step": 8775, + "token_acc": 0.9182437547312642, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8286145715364288, + "grad_norm": 0.2444126009941101, + "learning_rate": 1.563617649696162e-05, + "loss": 0.21740403175354003, + "memory(GiB)": 91.64, + "step": 8780, + "token_acc": 0.9132075471698113, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8290864477161193, + "grad_norm": 0.30875638127326965, + "learning_rate": 1.555249428651494e-05, + "loss": 0.21742725372314453, + "memory(GiB)": 91.64, + "step": 8785, + "token_acc": 0.9158807996982271, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8295583238958097, + "grad_norm": 0.7618734240531921, + "learning_rate": 1.5469017718457124e-05, + "loss": 0.21721193790435792, + "memory(GiB)": 91.64, + "step": 8790, + "token_acc": 0.9266528925619835, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8300302000755002, + "grad_norm": 0.25342825055122375, + "learning_rate": 1.5385746996066263e-05, + "loss": 0.21593549251556396, + "memory(GiB)": 91.64, + "step": 8795, + "token_acc": 0.9261596718207636, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8305020762551907, + "grad_norm": 0.3047173321247101, + "learning_rate": 1.5302682322119087e-05, + "loss": 0.21290826797485352, + "memory(GiB)": 91.64, + "step": 8800, + "token_acc": 0.9214285714285714, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.830973952434881, + "grad_norm": 0.18959245085716248, + "learning_rate": 1.5219823898890551e-05, + "loss": 0.21209537982940674, + "memory(GiB)": 91.64, + "step": 8805, + "token_acc": 0.9263112267013437, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8314458286145715, + "grad_norm": 0.35109010338783264, + "learning_rate": 1.5137171928153393e-05, + "loss": 0.22048661708831788, + "memory(GiB)": 91.64, + "step": 8810, + "token_acc": 0.9231553893233594, + "train_speed(iter/s)": 0.13866 + }, + { + "epoch": 0.831917704794262, + "grad_norm": 0.4367806315422058, + "learning_rate": 1.5054726611177627e-05, + "loss": 0.21188702583312988, + "memory(GiB)": 91.64, + "step": 8815, + "token_acc": 0.9250555731978406, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8323895809739524, + "grad_norm": 0.2685738801956177, + "learning_rate": 1.4972488148729958e-05, + "loss": 0.21199412345886232, + "memory(GiB)": 91.64, + "step": 8820, + "token_acc": 0.9164345403899722, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8328614571536429, + "grad_norm": 0.7678811550140381, + "learning_rate": 1.4890456741073488e-05, + "loss": 0.21964569091796876, + "memory(GiB)": 91.64, + "step": 8825, + "token_acc": 0.9182865370770338, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.3738093972206116, + "learning_rate": 1.4808632587967031e-05, + "loss": 0.2381913185119629, + "memory(GiB)": 91.64, + "step": 8830, + "token_acc": 0.9197786998616874, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8338052095130238, + "grad_norm": 0.4026806652545929, + "learning_rate": 1.4727015888664685e-05, + "loss": 0.22088391780853273, + "memory(GiB)": 91.64, + "step": 8835, + "token_acc": 0.9287822878228782, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8342770856927142, + "grad_norm": 0.44328606128692627, + "learning_rate": 1.4645606841915415e-05, + "loss": 0.2210165023803711, + "memory(GiB)": 91.64, + "step": 8840, + "token_acc": 0.9258569299552906, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8347489618724047, + "grad_norm": 0.29257553815841675, + "learning_rate": 1.456440564596252e-05, + "loss": 0.21825973987579345, + "memory(GiB)": 91.64, + "step": 8845, + "token_acc": 0.9290515309932785, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8352208380520951, + "grad_norm": 0.3918415904045105, + "learning_rate": 1.4483412498543081e-05, + "loss": 0.22037510871887206, + "memory(GiB)": 91.64, + "step": 8850, + "token_acc": 0.9130610594419734, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8356927142317856, + "grad_norm": 0.34599733352661133, + "learning_rate": 1.4402627596887696e-05, + "loss": 0.2142866611480713, + "memory(GiB)": 91.64, + "step": 8855, + "token_acc": 0.9203539823008849, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8361645904114761, + "grad_norm": 0.33332359790802, + "learning_rate": 1.4322051137719684e-05, + "loss": 0.21822817325592042, + "memory(GiB)": 91.64, + "step": 8860, + "token_acc": 0.9312896405919662, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8366364665911665, + "grad_norm": 0.31388983130455017, + "learning_rate": 1.4241683317254884e-05, + "loss": 0.2175750255584717, + "memory(GiB)": 91.64, + "step": 8865, + "token_acc": 0.921377183967112, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8371083427708569, + "grad_norm": 0.6400064826011658, + "learning_rate": 1.4161524331201059e-05, + "loss": 0.20982787609100342, + "memory(GiB)": 91.64, + "step": 8870, + "token_acc": 0.9286498353457738, + "train_speed(iter/s)": 0.13866 + }, + { + "epoch": 0.8375802189505474, + "grad_norm": 0.21114827692508698, + "learning_rate": 1.4081574374757323e-05, + "loss": 0.21725311279296874, + "memory(GiB)": 91.64, + "step": 8875, + "token_acc": 0.9271402550091075, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8380520951302378, + "grad_norm": 0.26979145407676697, + "learning_rate": 1.4001833642613948e-05, + "loss": 0.21486730575561525, + "memory(GiB)": 91.64, + "step": 8880, + "token_acc": 0.9132149901380671, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8385239713099283, + "grad_norm": 0.2545173466205597, + "learning_rate": 1.3922302328951597e-05, + "loss": 0.2135646104812622, + "memory(GiB)": 91.64, + "step": 8885, + "token_acc": 0.9220452640402347, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8389958474896188, + "grad_norm": 0.5725402235984802, + "learning_rate": 1.3842980627440972e-05, + "loss": 0.2180727243423462, + "memory(GiB)": 91.64, + "step": 8890, + "token_acc": 0.9074926747593135, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8394677236693092, + "grad_norm": 0.4995214343070984, + "learning_rate": 1.3763868731242357e-05, + "loss": 0.2170236587524414, + "memory(GiB)": 91.64, + "step": 8895, + "token_acc": 0.9211840228245364, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.8399395998489996, + "grad_norm": 0.5268911123275757, + "learning_rate": 1.3684966833005164e-05, + "loss": 0.21324462890625, + "memory(GiB)": 91.64, + "step": 8900, + "token_acc": 0.9303507880020335, + "train_speed(iter/s)": 0.138663 + }, + { + "epoch": 0.84041147602869, + "grad_norm": 0.22460459172725677, + "learning_rate": 1.3606275124867317e-05, + "loss": 0.2095392942428589, + "memory(GiB)": 91.64, + "step": 8905, + "token_acc": 0.9259259259259259, + "train_speed(iter/s)": 0.138663 + }, + { + "epoch": 0.8408833522083805, + "grad_norm": 0.4552537202835083, + "learning_rate": 1.3527793798455046e-05, + "loss": 0.21751093864440918, + "memory(GiB)": 91.64, + "step": 8910, + "token_acc": 0.9306561334211104, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.841355228388071, + "grad_norm": 0.5160773396492004, + "learning_rate": 1.3449523044882184e-05, + "loss": 0.2187131404876709, + "memory(GiB)": 91.64, + "step": 8915, + "token_acc": 0.9276982186517639, + "train_speed(iter/s)": 0.138663 + }, + { + "epoch": 0.8418271045677614, + "grad_norm": 0.8382816910743713, + "learning_rate": 1.3371463054749766e-05, + "loss": 0.21896607875823976, + "memory(GiB)": 91.64, + "step": 8920, + "token_acc": 0.9211409395973155, + "train_speed(iter/s)": 0.138663 + }, + { + "epoch": 0.8422989807474519, + "grad_norm": 0.3334463834762573, + "learning_rate": 1.3293614018145639e-05, + "loss": 0.2141813278198242, + "memory(GiB)": 91.64, + "step": 8925, + "token_acc": 0.9138248847926267, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.8427708569271423, + "grad_norm": 0.26530754566192627, + "learning_rate": 1.3215976124643947e-05, + "loss": 0.2144141674041748, + "memory(GiB)": 91.64, + "step": 8930, + "token_acc": 0.9184426229508197, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.8432427331068327, + "grad_norm": 0.21236403286457062, + "learning_rate": 1.3138549563304581e-05, + "loss": 0.21677255630493164, + "memory(GiB)": 91.64, + "step": 8935, + "token_acc": 0.9428851174934726, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.8437146092865232, + "grad_norm": 0.3409329950809479, + "learning_rate": 1.3061334522672964e-05, + "loss": 0.22018632888793946, + "memory(GiB)": 91.64, + "step": 8940, + "token_acc": 0.918456817185445, + "train_speed(iter/s)": 0.138662 + }, + { + "epoch": 0.8441864854662137, + "grad_norm": 0.47499603033065796, + "learning_rate": 1.2984331190779276e-05, + "loss": 0.2202282428741455, + "memory(GiB)": 91.64, + "step": 8945, + "token_acc": 0.9200565970993987, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8446583616459041, + "grad_norm": 0.4795122444629669, + "learning_rate": 1.2907539755138232e-05, + "loss": 0.2215047597885132, + "memory(GiB)": 91.64, + "step": 8950, + "token_acc": 0.9223080417991822, + "train_speed(iter/s)": 0.13866 + }, + { + "epoch": 0.8451302378255946, + "grad_norm": 0.443649560213089, + "learning_rate": 1.2830960402748581e-05, + "loss": 0.22158918380737305, + "memory(GiB)": 91.64, + "step": 8955, + "token_acc": 0.8966992665036675, + "train_speed(iter/s)": 0.138661 + }, + { + "epoch": 0.8456021140052851, + "grad_norm": 0.4760352671146393, + "learning_rate": 1.2754593320092523e-05, + "loss": 0.21788718700408935, + "memory(GiB)": 91.64, + "step": 8960, + "token_acc": 0.9285157265401611, + "train_speed(iter/s)": 0.13866 + }, + { + "epoch": 0.8460739901849754, + "grad_norm": 0.4041005074977875, + "learning_rate": 1.2678438693135386e-05, + "loss": 0.217704439163208, + "memory(GiB)": 91.64, + "step": 8965, + "token_acc": 0.9372652141247183, + "train_speed(iter/s)": 0.138659 + }, + { + "epoch": 0.8465458663646659, + "grad_norm": 0.22946274280548096, + "learning_rate": 1.260249670732524e-05, + "loss": 0.21619582176208496, + "memory(GiB)": 91.64, + "step": 8970, + "token_acc": 0.9303710490151168, + "train_speed(iter/s)": 0.138658 + }, + { + "epoch": 0.8470177425443564, + "grad_norm": 0.49980804324150085, + "learning_rate": 1.2526767547592177e-05, + "loss": 0.21017656326293946, + "memory(GiB)": 91.64, + "step": 8975, + "token_acc": 0.9205999117776797, + "train_speed(iter/s)": 0.138657 + }, + { + "epoch": 0.8474896187240468, + "grad_norm": 0.41217175126075745, + "learning_rate": 1.2451251398348107e-05, + "loss": 0.20879015922546387, + "memory(GiB)": 91.64, + "step": 8980, + "token_acc": 0.9199461823074335, + "train_speed(iter/s)": 0.138656 + }, + { + "epoch": 0.8479614949037373, + "grad_norm": 0.21734531223773956, + "learning_rate": 1.2375948443486274e-05, + "loss": 0.22095990180969238, + "memory(GiB)": 91.64, + "step": 8985, + "token_acc": 0.925754775107825, + "train_speed(iter/s)": 0.138654 + }, + { + "epoch": 0.8484333710834278, + "grad_norm": 0.8602133989334106, + "learning_rate": 1.2300858866380638e-05, + "loss": 0.22048001289367675, + "memory(GiB)": 91.64, + "step": 8990, + "token_acc": 0.920877998979071, + "train_speed(iter/s)": 0.138653 + }, + { + "epoch": 0.8489052472631181, + "grad_norm": 0.3685482442378998, + "learning_rate": 1.222598284988563e-05, + "loss": 0.21594226360321045, + "memory(GiB)": 91.64, + "step": 8995, + "token_acc": 0.9338983050847458, + "train_speed(iter/s)": 0.138652 + }, + { + "epoch": 0.8493771234428086, + "grad_norm": 0.24085231125354767, + "learning_rate": 1.2151320576335701e-05, + "loss": 0.21290385723114014, + "memory(GiB)": 91.64, + "step": 9000, + "token_acc": 0.9308624376336422, + "train_speed(iter/s)": 0.138651 + }, + { + "epoch": 0.849848999622499, + "grad_norm": 0.3610924482345581, + "learning_rate": 1.2076872227544645e-05, + "loss": 0.21317293643951415, + "memory(GiB)": 91.64, + "step": 9005, + "token_acc": 0.9221343873517787, + "train_speed(iter/s)": 0.13865 + }, + { + "epoch": 0.8503208758021895, + "grad_norm": 0.4309537410736084, + "learning_rate": 1.2002637984805432e-05, + "loss": 0.21564769744873047, + "memory(GiB)": 91.64, + "step": 9010, + "token_acc": 0.9249384741591469, + "train_speed(iter/s)": 0.13865 + }, + { + "epoch": 0.85079275198188, + "grad_norm": 0.2729756534099579, + "learning_rate": 1.1928618028889626e-05, + "loss": 0.21591358184814452, + "memory(GiB)": 91.64, + "step": 9015, + "token_acc": 0.928030303030303, + "train_speed(iter/s)": 0.138648 + }, + { + "epoch": 0.8512646281615704, + "grad_norm": 0.7843363881111145, + "learning_rate": 1.1854812540046933e-05, + "loss": 0.21337780952453614, + "memory(GiB)": 91.64, + "step": 9020, + "token_acc": 0.9305054151624549, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8517365043412608, + "grad_norm": 0.3829854428768158, + "learning_rate": 1.1781221698004851e-05, + "loss": 0.2175013542175293, + "memory(GiB)": 91.64, + "step": 9025, + "token_acc": 0.9183318853171155, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8522083805209513, + "grad_norm": 0.2974933981895447, + "learning_rate": 1.1707845681968143e-05, + "loss": 0.21187739372253417, + "memory(GiB)": 91.64, + "step": 9030, + "token_acc": 0.9268082663605052, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8526802567006417, + "grad_norm": 0.5384204387664795, + "learning_rate": 1.1634684670618468e-05, + "loss": 0.2191821575164795, + "memory(GiB)": 91.64, + "step": 9035, + "token_acc": 0.9259259259259259, + "train_speed(iter/s)": 0.138647 + }, + { + "epoch": 0.8531521328803322, + "grad_norm": 0.4722369909286499, + "learning_rate": 1.1561738842113912e-05, + "loss": 0.21865737438201904, + "memory(GiB)": 91.64, + "step": 9040, + "token_acc": 0.93042071197411, + "train_speed(iter/s)": 0.138645 + }, + { + "epoch": 0.8536240090600227, + "grad_norm": 0.29009294509887695, + "learning_rate": 1.1489008374088516e-05, + "loss": 0.2190547466278076, + "memory(GiB)": 91.64, + "step": 9045, + "token_acc": 0.9174520636984076, + "train_speed(iter/s)": 0.138643 + }, + { + "epoch": 0.8540958852397131, + "grad_norm": 0.43121811747550964, + "learning_rate": 1.1416493443651921e-05, + "loss": 0.21074953079223632, + "memory(GiB)": 91.64, + "step": 9050, + "token_acc": 0.9237262586674706, + "train_speed(iter/s)": 0.138641 + }, + { + "epoch": 0.8545677614194035, + "grad_norm": 0.30366286635398865, + "learning_rate": 1.1344194227388948e-05, + "loss": 0.21621460914611818, + "memory(GiB)": 91.64, + "step": 9055, + "token_acc": 0.9195775792038993, + "train_speed(iter/s)": 0.13864 + }, + { + "epoch": 0.855039637599094, + "grad_norm": 0.35432592034339905, + "learning_rate": 1.1272110901359024e-05, + "loss": 0.2122971534729004, + "memory(GiB)": 91.64, + "step": 9060, + "token_acc": 0.9211165048543689, + "train_speed(iter/s)": 0.138639 + }, + { + "epoch": 0.8555115137787844, + "grad_norm": 0.32513248920440674, + "learning_rate": 1.1200243641095908e-05, + "loss": 0.2123556613922119, + "memory(GiB)": 91.64, + "step": 9065, + "token_acc": 0.9351747463359639, + "train_speed(iter/s)": 0.138637 + }, + { + "epoch": 0.8559833899584749, + "grad_norm": 0.3691011369228363, + "learning_rate": 1.1128592621607226e-05, + "loss": 0.21590576171875, + "memory(GiB)": 91.64, + "step": 9070, + "token_acc": 0.9240752757949383, + "train_speed(iter/s)": 0.138636 + }, + { + "epoch": 0.8564552661381654, + "grad_norm": 0.2052968144416809, + "learning_rate": 1.1057158017373947e-05, + "loss": 0.21484103202819824, + "memory(GiB)": 91.64, + "step": 9075, + "token_acc": 0.9237958303378864, + "train_speed(iter/s)": 0.138635 + }, + { + "epoch": 0.8569271423178558, + "grad_norm": 0.38909971714019775, + "learning_rate": 1.0985940002350103e-05, + "loss": 0.2229299545288086, + "memory(GiB)": 91.64, + "step": 9080, + "token_acc": 0.9117383512544803, + "train_speed(iter/s)": 0.138634 + }, + { + "epoch": 0.8573990184975463, + "grad_norm": 0.8026570081710815, + "learning_rate": 1.0914938749962323e-05, + "loss": 0.22169604301452636, + "memory(GiB)": 91.64, + "step": 9085, + "token_acc": 0.9359975961538461, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.8578708946772367, + "grad_norm": 0.3203129768371582, + "learning_rate": 1.0844154433109299e-05, + "loss": 0.21136178970336914, + "memory(GiB)": 91.64, + "step": 9090, + "token_acc": 0.9140328697850821, + "train_speed(iter/s)": 0.138632 + }, + { + "epoch": 0.8583427708569271, + "grad_norm": 0.21245694160461426, + "learning_rate": 1.0773587224161507e-05, + "loss": 0.21133153438568114, + "memory(GiB)": 91.64, + "step": 9095, + "token_acc": 0.9113320079522863, + "train_speed(iter/s)": 0.13863 + }, + { + "epoch": 0.8588146470366176, + "grad_norm": 0.25271087884902954, + "learning_rate": 1.0703237294960744e-05, + "loss": 0.21964166164398194, + "memory(GiB)": 91.64, + "step": 9100, + "token_acc": 0.9271875, + "train_speed(iter/s)": 0.138629 + }, + { + "epoch": 0.859286523216308, + "grad_norm": 0.5589426755905151, + "learning_rate": 1.063310481681965e-05, + "loss": 0.2167053699493408, + "memory(GiB)": 91.64, + "step": 9105, + "token_acc": 0.9252544529262087, + "train_speed(iter/s)": 0.138628 + }, + { + "epoch": 0.8597583993959985, + "grad_norm": 0.37324556708335876, + "learning_rate": 1.056318996052138e-05, + "loss": 0.22221214771270753, + "memory(GiB)": 91.64, + "step": 9110, + "token_acc": 0.9235668789808917, + "train_speed(iter/s)": 0.138628 + }, + { + "epoch": 0.860230275575689, + "grad_norm": 0.3813531994819641, + "learning_rate": 1.0493492896319135e-05, + "loss": 0.21518683433532715, + "memory(GiB)": 91.64, + "step": 9115, + "token_acc": 0.9208173690932312, + "train_speed(iter/s)": 0.138627 + }, + { + "epoch": 0.8607021517553793, + "grad_norm": 0.3796219229698181, + "learning_rate": 1.042401379393575e-05, + "loss": 0.2043860912322998, + "memory(GiB)": 91.64, + "step": 9120, + "token_acc": 0.9289118347895154, + "train_speed(iter/s)": 0.138626 + }, + { + "epoch": 0.8611740279350698, + "grad_norm": 0.3149944245815277, + "learning_rate": 1.0354752822563307e-05, + "loss": 0.21240837574005128, + "memory(GiB)": 91.64, + "step": 9125, + "token_acc": 0.939908256880734, + "train_speed(iter/s)": 0.138625 + }, + { + "epoch": 0.8616459041147603, + "grad_norm": 0.21904854476451874, + "learning_rate": 1.0285710150862715e-05, + "loss": 0.2140800952911377, + "memory(GiB)": 91.64, + "step": 9130, + "token_acc": 0.9252788104089219, + "train_speed(iter/s)": 0.138625 + }, + { + "epoch": 0.8621177802944507, + "grad_norm": 0.4739190638065338, + "learning_rate": 1.0216885946963239e-05, + "loss": 0.2191236734390259, + "memory(GiB)": 91.64, + "step": 9135, + "token_acc": 0.9306414848583523, + "train_speed(iter/s)": 0.138626 + }, + { + "epoch": 0.8625896564741412, + "grad_norm": 0.28348055481910706, + "learning_rate": 1.0148280378462182e-05, + "loss": 0.2117250919342041, + "memory(GiB)": 91.64, + "step": 9140, + "token_acc": 0.9304769603880356, + "train_speed(iter/s)": 0.138624 + }, + { + "epoch": 0.8630615326538317, + "grad_norm": 0.2477513998746872, + "learning_rate": 1.007989361242445e-05, + "loss": 0.2174776792526245, + "memory(GiB)": 91.64, + "step": 9145, + "token_acc": 0.9298039215686275, + "train_speed(iter/s)": 0.138623 + }, + { + "epoch": 0.863533408833522, + "grad_norm": 0.29344943165779114, + "learning_rate": 1.00117258153821e-05, + "loss": 0.20976610183715821, + "memory(GiB)": 91.64, + "step": 9150, + "token_acc": 0.9243150684931507, + "train_speed(iter/s)": 0.138622 + }, + { + "epoch": 0.8640052850132125, + "grad_norm": 0.3104238510131836, + "learning_rate": 9.943777153334e-06, + "loss": 0.21416120529174804, + "memory(GiB)": 91.64, + "step": 9155, + "token_acc": 0.9261862917398945, + "train_speed(iter/s)": 0.13862 + }, + { + "epoch": 0.864477161192903, + "grad_norm": 0.2621361017227173, + "learning_rate": 9.876047791745335e-06, + "loss": 0.21070308685302735, + "memory(GiB)": 91.64, + "step": 9160, + "token_acc": 0.9147621988882026, + "train_speed(iter/s)": 0.13862 + }, + { + "epoch": 0.8649490373725934, + "grad_norm": 0.30771052837371826, + "learning_rate": 9.808537895547309e-06, + "loss": 0.20619337558746337, + "memory(GiB)": 91.64, + "step": 9165, + "token_acc": 0.9170305676855895, + "train_speed(iter/s)": 0.138618 + }, + { + "epoch": 0.8654209135522839, + "grad_norm": 0.3718501925468445, + "learning_rate": 9.741247629136696e-06, + "loss": 0.2108161449432373, + "memory(GiB)": 91.64, + "step": 9170, + "token_acc": 0.9279187817258884, + "train_speed(iter/s)": 0.138617 + }, + { + "epoch": 0.8658927897319744, + "grad_norm": 0.6464650630950928, + "learning_rate": 9.67417715637542e-06, + "loss": 0.21423704624176027, + "memory(GiB)": 91.64, + "step": 9175, + "token_acc": 0.9234731420161884, + "train_speed(iter/s)": 0.138615 + }, + { + "epoch": 0.8663646659116648, + "grad_norm": 0.5365249514579773, + "learning_rate": 9.607326640590164e-06, + "loss": 0.21575627326965333, + "memory(GiB)": 91.64, + "step": 9180, + "token_acc": 0.9094117647058824, + "train_speed(iter/s)": 0.138614 + }, + { + "epoch": 0.8668365420913552, + "grad_norm": 0.19745229184627533, + "learning_rate": 9.540696244572033e-06, + "loss": 0.2120530366897583, + "memory(GiB)": 91.64, + "step": 9185, + "token_acc": 0.9127725856697819, + "train_speed(iter/s)": 0.138612 + }, + { + "epoch": 0.8673084182710457, + "grad_norm": 0.3388816714286804, + "learning_rate": 9.474286130576026e-06, + "loss": 0.2139723300933838, + "memory(GiB)": 91.64, + "step": 9190, + "token_acc": 0.9162839985870717, + "train_speed(iter/s)": 0.138611 + }, + { + "epoch": 0.8677802944507361, + "grad_norm": 0.4545825719833374, + "learning_rate": 9.408096460320792e-06, + "loss": 0.21589879989624022, + "memory(GiB)": 91.64, + "step": 9195, + "token_acc": 0.9250776397515528, + "train_speed(iter/s)": 0.138608 + }, + { + "epoch": 0.8682521706304266, + "grad_norm": 0.24916082620620728, + "learning_rate": 9.342127394988132e-06, + "loss": 0.21355302333831788, + "memory(GiB)": 91.64, + "step": 9200, + "token_acc": 0.925, + "train_speed(iter/s)": 0.138607 + }, + { + "epoch": 0.8687240468101171, + "grad_norm": 0.7581355571746826, + "learning_rate": 9.276379095222665e-06, + "loss": 0.216577410697937, + "memory(GiB)": 91.64, + "step": 9205, + "token_acc": 0.91288056206089, + "train_speed(iter/s)": 0.138606 + }, + { + "epoch": 0.8691959229898075, + "grad_norm": 0.608823835849762, + "learning_rate": 9.210851721131398e-06, + "loss": 0.21935479640960692, + "memory(GiB)": 91.64, + "step": 9210, + "token_acc": 0.9302580999450851, + "train_speed(iter/s)": 0.138604 + }, + { + "epoch": 0.8696677991694979, + "grad_norm": 0.22969260811805725, + "learning_rate": 9.145545432283353e-06, + "loss": 0.21476612091064454, + "memory(GiB)": 91.64, + "step": 9215, + "token_acc": 0.9197501201345507, + "train_speed(iter/s)": 0.138602 + }, + { + "epoch": 0.8701396753491883, + "grad_norm": 1.0408731698989868, + "learning_rate": 9.080460387709145e-06, + "loss": 0.2159780502319336, + "memory(GiB)": 91.64, + "step": 9220, + "token_acc": 0.9308084486525856, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.8706115515288788, + "grad_norm": 0.20538538694381714, + "learning_rate": 9.015596745900679e-06, + "loss": 0.21535878181457518, + "memory(GiB)": 91.64, + "step": 9225, + "token_acc": 0.929406574012795, + "train_speed(iter/s)": 0.1386 + }, + { + "epoch": 0.8710834277085693, + "grad_norm": 0.3050077557563782, + "learning_rate": 8.950954664810695e-06, + "loss": 0.22024221420288087, + "memory(GiB)": 91.64, + "step": 9230, + "token_acc": 0.9159280830590023, + "train_speed(iter/s)": 0.138597 + }, + { + "epoch": 0.8715553038882597, + "grad_norm": 0.25116536021232605, + "learning_rate": 8.886534301852368e-06, + "loss": 0.21032414436340333, + "memory(GiB)": 91.64, + "step": 9235, + "token_acc": 0.9265141318977119, + "train_speed(iter/s)": 0.138595 + }, + { + "epoch": 0.8720271800679502, + "grad_norm": 0.3281894028186798, + "learning_rate": 8.822335813899018e-06, + "loss": 0.21384072303771973, + "memory(GiB)": 91.64, + "step": 9240, + "token_acc": 0.9211908931698775, + "train_speed(iter/s)": 0.138594 + }, + { + "epoch": 0.8724990562476406, + "grad_norm": 0.20488016307353973, + "learning_rate": 8.758359357283574e-06, + "loss": 0.21094036102294922, + "memory(GiB)": 91.64, + "step": 9245, + "token_acc": 0.9269041769041769, + "train_speed(iter/s)": 0.138593 + }, + { + "epoch": 0.872970932427331, + "grad_norm": 0.4258732497692108, + "learning_rate": 8.69460508779838e-06, + "loss": 0.21204228401184083, + "memory(GiB)": 91.64, + "step": 9250, + "token_acc": 0.9133466135458167, + "train_speed(iter/s)": 0.138591 + }, + { + "epoch": 0.8734428086070215, + "grad_norm": 0.39470452070236206, + "learning_rate": 8.631073160694658e-06, + "loss": 0.2152315616607666, + "memory(GiB)": 91.64, + "step": 9255, + "token_acc": 0.9243814289685849, + "train_speed(iter/s)": 0.138589 + }, + { + "epoch": 0.873914684786712, + "grad_norm": 0.26770398020744324, + "learning_rate": 8.567763730682221e-06, + "loss": 0.21648941040039063, + "memory(GiB)": 91.64, + "step": 9260, + "token_acc": 0.9256169621133125, + "train_speed(iter/s)": 0.138587 + }, + { + "epoch": 0.8743865609664024, + "grad_norm": 0.31089380383491516, + "learning_rate": 8.50467695192907e-06, + "loss": 0.20960922241210939, + "memory(GiB)": 91.64, + "step": 9265, + "token_acc": 0.9156517816869644, + "train_speed(iter/s)": 0.138585 + }, + { + "epoch": 0.8748584371460929, + "grad_norm": 0.23713769018650055, + "learning_rate": 8.441812978061015e-06, + "loss": 0.211074161529541, + "memory(GiB)": 91.64, + "step": 9270, + "token_acc": 0.9206798866855525, + "train_speed(iter/s)": 0.138583 + }, + { + "epoch": 0.8753303133257833, + "grad_norm": 0.38536888360977173, + "learning_rate": 8.379171962161259e-06, + "loss": 0.21424272060394287, + "memory(GiB)": 91.64, + "step": 9275, + "token_acc": 0.9308885754583921, + "train_speed(iter/s)": 0.138582 + }, + { + "epoch": 0.8758021895054737, + "grad_norm": 0.5602840781211853, + "learning_rate": 8.316754056770138e-06, + "loss": 0.21376981735229492, + "memory(GiB)": 91.64, + "step": 9280, + "token_acc": 0.9224137931034483, + "train_speed(iter/s)": 0.13858 + }, + { + "epoch": 0.8762740656851642, + "grad_norm": 0.4108741879463196, + "learning_rate": 8.254559413884633e-06, + "loss": 0.21517577171325683, + "memory(GiB)": 91.64, + "step": 9285, + "token_acc": 0.9342483242898181, + "train_speed(iter/s)": 0.138578 + }, + { + "epoch": 0.8767459418648547, + "grad_norm": 0.2960684895515442, + "learning_rate": 8.192588184958073e-06, + "loss": 0.21525194644927978, + "memory(GiB)": 91.64, + "step": 9290, + "token_acc": 0.9293150684931507, + "train_speed(iter/s)": 0.138577 + }, + { + "epoch": 0.8772178180445451, + "grad_norm": 0.21378456056118011, + "learning_rate": 8.130840520899719e-06, + "loss": 0.20404720306396484, + "memory(GiB)": 91.64, + "step": 9295, + "token_acc": 0.9324137931034483, + "train_speed(iter/s)": 0.138576 + }, + { + "epoch": 0.8776896942242356, + "grad_norm": 0.3066118061542511, + "learning_rate": 8.069316572074448e-06, + "loss": 0.2152477979660034, + "memory(GiB)": 91.64, + "step": 9300, + "token_acc": 0.9345850999394306, + "train_speed(iter/s)": 0.138575 + }, + { + "epoch": 0.8781615704039261, + "grad_norm": 0.33726122975349426, + "learning_rate": 8.008016488302306e-06, + "loss": 0.21046628952026367, + "memory(GiB)": 91.64, + "step": 9305, + "token_acc": 0.9202271498107085, + "train_speed(iter/s)": 0.138571 + }, + { + "epoch": 0.8786334465836164, + "grad_norm": 0.38846316933631897, + "learning_rate": 7.946940418858251e-06, + "loss": 0.2146662950515747, + "memory(GiB)": 91.64, + "step": 9310, + "token_acc": 0.9275923718712753, + "train_speed(iter/s)": 0.138568 + }, + { + "epoch": 0.8791053227633069, + "grad_norm": 0.5795027017593384, + "learning_rate": 7.886088512471678e-06, + "loss": 0.2132624387741089, + "memory(GiB)": 91.64, + "step": 9315, + "token_acc": 0.923013923013923, + "train_speed(iter/s)": 0.138566 + }, + { + "epoch": 0.8795771989429974, + "grad_norm": 0.28636613488197327, + "learning_rate": 7.825460917326177e-06, + "loss": 0.21249852180480958, + "memory(GiB)": 91.64, + "step": 9320, + "token_acc": 0.9045996592844975, + "train_speed(iter/s)": 0.138564 + }, + { + "epoch": 0.8800490751226878, + "grad_norm": 0.24777673184871674, + "learning_rate": 7.765057781059059e-06, + "loss": 0.20986220836639405, + "memory(GiB)": 91.64, + "step": 9325, + "token_acc": 0.9265103697024346, + "train_speed(iter/s)": 0.138563 + }, + { + "epoch": 0.8805209513023783, + "grad_norm": 0.4328806698322296, + "learning_rate": 7.704879250761021e-06, + "loss": 0.21023178100585938, + "memory(GiB)": 91.64, + "step": 9330, + "token_acc": 0.9189572116746954, + "train_speed(iter/s)": 0.138562 + }, + { + "epoch": 0.8809928274820688, + "grad_norm": 0.7614713311195374, + "learning_rate": 7.644925472975873e-06, + "loss": 0.220062255859375, + "memory(GiB)": 91.64, + "step": 9335, + "token_acc": 0.9254629629629629, + "train_speed(iter/s)": 0.138561 + }, + { + "epoch": 0.8814647036617591, + "grad_norm": 0.6232017874717712, + "learning_rate": 7.585196593700105e-06, + "loss": 0.2095489978790283, + "memory(GiB)": 91.64, + "step": 9340, + "token_acc": 0.9332292073408824, + "train_speed(iter/s)": 0.13856 + }, + { + "epoch": 0.8819365798414496, + "grad_norm": 0.396986186504364, + "learning_rate": 7.525692758382463e-06, + "loss": 0.21920385360717773, + "memory(GiB)": 91.64, + "step": 9345, + "token_acc": 0.9284452296819788, + "train_speed(iter/s)": 0.138557 + }, + { + "epoch": 0.88240845602114, + "grad_norm": 0.8557764887809753, + "learning_rate": 7.466414111923814e-06, + "loss": 0.2085169553756714, + "memory(GiB)": 91.64, + "step": 9350, + "token_acc": 0.9175862068965517, + "train_speed(iter/s)": 0.138553 + }, + { + "epoch": 0.8828803322008305, + "grad_norm": 0.29277458786964417, + "learning_rate": 7.407360798676577e-06, + "loss": 0.21147971153259276, + "memory(GiB)": 91.64, + "step": 9355, + "token_acc": 0.9102040816326531, + "train_speed(iter/s)": 0.13855 + }, + { + "epoch": 0.883352208380521, + "grad_norm": 0.5351800322532654, + "learning_rate": 7.348532962444421e-06, + "loss": 0.21797070503234864, + "memory(GiB)": 91.64, + "step": 9360, + "token_acc": 0.9135326261652202, + "train_speed(iter/s)": 0.138548 + }, + { + "epoch": 0.8838240845602114, + "grad_norm": 0.2421572506427765, + "learning_rate": 7.28993074648201e-06, + "loss": 0.21367790699005126, + "memory(GiB)": 91.64, + "step": 9365, + "token_acc": 0.9238447319778188, + "train_speed(iter/s)": 0.138546 + }, + { + "epoch": 0.8842959607399018, + "grad_norm": 0.43830910325050354, + "learning_rate": 7.231554293494547e-06, + "loss": 0.214955472946167, + "memory(GiB)": 91.64, + "step": 9370, + "token_acc": 0.9115107913669065, + "train_speed(iter/s)": 0.138544 + }, + { + "epoch": 0.8847678369195923, + "grad_norm": 0.368368536233902, + "learning_rate": 7.173403745637497e-06, + "loss": 0.20797204971313477, + "memory(GiB)": 91.64, + "step": 9375, + "token_acc": 0.9261330194231902, + "train_speed(iter/s)": 0.138543 + }, + { + "epoch": 0.8852397130992827, + "grad_norm": 0.682052493095398, + "learning_rate": 7.115479244516199e-06, + "loss": 0.2070131778717041, + "memory(GiB)": 91.64, + "step": 9380, + "token_acc": 0.9177962396152164, + "train_speed(iter/s)": 0.138541 + }, + { + "epoch": 0.8857115892789732, + "grad_norm": 0.325766384601593, + "learning_rate": 7.0577809311855425e-06, + "loss": 0.21118898391723634, + "memory(GiB)": 91.64, + "step": 9385, + "token_acc": 0.9205384281372123, + "train_speed(iter/s)": 0.138539 + }, + { + "epoch": 0.8861834654586637, + "grad_norm": 0.2540743947029114, + "learning_rate": 7.00030894614957e-06, + "loss": 0.21211137771606445, + "memory(GiB)": 91.64, + "step": 9390, + "token_acc": 0.9358490566037736, + "train_speed(iter/s)": 0.138538 + }, + { + "epoch": 0.8866553416383541, + "grad_norm": 0.27310407161712646, + "learning_rate": 6.94306342936123e-06, + "loss": 0.21425786018371581, + "memory(GiB)": 91.64, + "step": 9395, + "token_acc": 0.9352612553401248, + "train_speed(iter/s)": 0.138535 + }, + { + "epoch": 0.8871272178180446, + "grad_norm": 0.2509388327598572, + "learning_rate": 6.88604452022199e-06, + "loss": 0.20922818183898925, + "memory(GiB)": 91.64, + "step": 9400, + "token_acc": 0.9203892083149049, + "train_speed(iter/s)": 0.138534 + }, + { + "epoch": 0.887599093997735, + "grad_norm": 0.3544827103614807, + "learning_rate": 6.829252357581462e-06, + "loss": 0.22144675254821777, + "memory(GiB)": 91.64, + "step": 9405, + "token_acc": 0.9165378670788253, + "train_speed(iter/s)": 0.138532 + }, + { + "epoch": 0.8880709701774254, + "grad_norm": 0.5197407603263855, + "learning_rate": 6.772687079737139e-06, + "loss": 0.21385698318481444, + "memory(GiB)": 91.64, + "step": 9410, + "token_acc": 0.923288424525708, + "train_speed(iter/s)": 0.138529 + }, + { + "epoch": 0.8885428463571159, + "grad_norm": 0.23393069207668304, + "learning_rate": 6.716348824433949e-06, + "loss": 0.2115368127822876, + "memory(GiB)": 91.64, + "step": 9415, + "token_acc": 0.9165676959619953, + "train_speed(iter/s)": 0.138527 + }, + { + "epoch": 0.8890147225368064, + "grad_norm": 0.5718517303466797, + "learning_rate": 6.660237728864039e-06, + "loss": 0.22244482040405272, + "memory(GiB)": 91.64, + "step": 9420, + "token_acc": 0.9183126883160362, + "train_speed(iter/s)": 0.138526 + }, + { + "epoch": 0.8894865987164968, + "grad_norm": 0.37451472878456116, + "learning_rate": 6.604353929666384e-06, + "loss": 0.2113950252532959, + "memory(GiB)": 91.64, + "step": 9425, + "token_acc": 0.9068877551020408, + "train_speed(iter/s)": 0.138526 + }, + { + "epoch": 0.8899584748961873, + "grad_norm": 0.23787033557891846, + "learning_rate": 6.548697562926431e-06, + "loss": 0.21541337966918944, + "memory(GiB)": 91.64, + "step": 9430, + "token_acc": 0.9162337662337663, + "train_speed(iter/s)": 0.138524 + }, + { + "epoch": 0.8904303510758776, + "grad_norm": 0.5333724617958069, + "learning_rate": 6.49326876417583e-06, + "loss": 0.20914788246154786, + "memory(GiB)": 91.64, + "step": 9435, + "token_acc": 0.9208791208791208, + "train_speed(iter/s)": 0.138522 + }, + { + "epoch": 0.8909022272555681, + "grad_norm": 0.24550431966781616, + "learning_rate": 6.438067668392045e-06, + "loss": 0.20611090660095216, + "memory(GiB)": 91.64, + "step": 9440, + "token_acc": 0.9329292929292929, + "train_speed(iter/s)": 0.138521 + }, + { + "epoch": 0.8913741034352586, + "grad_norm": 0.39244842529296875, + "learning_rate": 6.383094409998036e-06, + "loss": 0.21288225650787354, + "memory(GiB)": 91.64, + "step": 9445, + "token_acc": 0.9141078838174274, + "train_speed(iter/s)": 0.13852 + }, + { + "epoch": 0.891845979614949, + "grad_norm": 0.2563187777996063, + "learning_rate": 6.3283491228619875e-06, + "loss": 0.21205806732177734, + "memory(GiB)": 91.64, + "step": 9450, + "token_acc": 0.9324095978371071, + "train_speed(iter/s)": 0.138519 + }, + { + "epoch": 0.8923178557946395, + "grad_norm": 0.2973138093948364, + "learning_rate": 6.273831940296904e-06, + "loss": 0.20941767692565919, + "memory(GiB)": 91.64, + "step": 9455, + "token_acc": 0.9111757105943152, + "train_speed(iter/s)": 0.138516 + }, + { + "epoch": 0.89278973197433, + "grad_norm": 0.4202001392841339, + "learning_rate": 6.219542995060313e-06, + "loss": 0.21974601745605468, + "memory(GiB)": 91.64, + "step": 9460, + "token_acc": 0.9231898238747553, + "train_speed(iter/s)": 0.138515 + }, + { + "epoch": 0.8932616081540203, + "grad_norm": 0.7754381895065308, + "learning_rate": 6.165482419353996e-06, + "loss": 0.20998201370239258, + "memory(GiB)": 91.64, + "step": 9465, + "token_acc": 0.9214920071047957, + "train_speed(iter/s)": 0.138514 + }, + { + "epoch": 0.8937334843337108, + "grad_norm": 0.2756913900375366, + "learning_rate": 6.1116503448236054e-06, + "loss": 0.2183704376220703, + "memory(GiB)": 91.64, + "step": 9470, + "token_acc": 0.9101675332177932, + "train_speed(iter/s)": 0.138512 + }, + { + "epoch": 0.8942053605134013, + "grad_norm": 0.2889973223209381, + "learning_rate": 6.058046902558301e-06, + "loss": 0.2072589874267578, + "memory(GiB)": 91.64, + "step": 9475, + "token_acc": 0.9353671147880042, + "train_speed(iter/s)": 0.138509 + }, + { + "epoch": 0.8946772366930917, + "grad_norm": 0.3864864110946655, + "learning_rate": 6.004672223090568e-06, + "loss": 0.21283507347106934, + "memory(GiB)": 91.64, + "step": 9480, + "token_acc": 0.9110769230769231, + "train_speed(iter/s)": 0.138507 + }, + { + "epoch": 0.8951491128727822, + "grad_norm": 0.25138232111930847, + "learning_rate": 5.951526436395782e-06, + "loss": 0.21158528327941895, + "memory(GiB)": 91.64, + "step": 9485, + "token_acc": 0.9071403447062961, + "train_speed(iter/s)": 0.138506 + }, + { + "epoch": 0.8956209890524727, + "grad_norm": 0.4256957173347473, + "learning_rate": 5.898609671891897e-06, + "loss": 0.21031410694122316, + "memory(GiB)": 91.64, + "step": 9490, + "token_acc": 0.9060980172091283, + "train_speed(iter/s)": 0.138504 + }, + { + "epoch": 0.896092865232163, + "grad_norm": 0.361007422208786, + "learning_rate": 5.845922058439268e-06, + "loss": 0.21310253143310548, + "memory(GiB)": 91.64, + "step": 9495, + "token_acc": 0.9223516361619523, + "train_speed(iter/s)": 0.138501 + }, + { + "epoch": 0.8965647414118535, + "grad_norm": 0.5840691328048706, + "learning_rate": 5.793463724340109e-06, + "loss": 0.22230072021484376, + "memory(GiB)": 91.64, + "step": 9500, + "token_acc": 0.9187408491947291, + "train_speed(iter/s)": 0.1385 + }, + { + "epoch": 0.897036617591544, + "grad_norm": 0.2761625647544861, + "learning_rate": 5.741234797338391e-06, + "loss": 0.2137458324432373, + "memory(GiB)": 91.64, + "step": 9505, + "token_acc": 0.9283000949667616, + "train_speed(iter/s)": 0.138499 + }, + { + "epoch": 0.8975084937712344, + "grad_norm": 0.6072384715080261, + "learning_rate": 5.689235404619387e-06, + "loss": 0.21013424396514893, + "memory(GiB)": 91.64, + "step": 9510, + "token_acc": 0.9139955569660425, + "train_speed(iter/s)": 0.138498 + }, + { + "epoch": 0.8979803699509249, + "grad_norm": 0.25448301434516907, + "learning_rate": 5.637465672809483e-06, + "loss": 0.2110156536102295, + "memory(GiB)": 91.64, + "step": 9515, + "token_acc": 0.9319535904672311, + "train_speed(iter/s)": 0.138496 + }, + { + "epoch": 0.8984522461306154, + "grad_norm": 0.3504413962364197, + "learning_rate": 5.585925727975727e-06, + "loss": 0.20918526649475097, + "memory(GiB)": 91.64, + "step": 9520, + "token_acc": 0.9220079410096427, + "train_speed(iter/s)": 0.138495 + }, + { + "epoch": 0.8989241223103058, + "grad_norm": 0.3817376494407654, + "learning_rate": 5.534615695625689e-06, + "loss": 0.21109294891357422, + "memory(GiB)": 91.64, + "step": 9525, + "token_acc": 0.9241635687732342, + "train_speed(iter/s)": 0.138493 + }, + { + "epoch": 0.8993959984899962, + "grad_norm": 0.7123754620552063, + "learning_rate": 5.4835357007069765e-06, + "loss": 0.21664328575134278, + "memory(GiB)": 91.64, + "step": 9530, + "token_acc": 0.9258387286639199, + "train_speed(iter/s)": 0.138492 + }, + { + "epoch": 0.8998678746696867, + "grad_norm": 0.4179804027080536, + "learning_rate": 5.43268586760709e-06, + "loss": 0.2076176404953003, + "memory(GiB)": 91.64, + "step": 9535, + "token_acc": 0.9145885286783042, + "train_speed(iter/s)": 0.13849 + }, + { + "epoch": 0.9003397508493771, + "grad_norm": 0.6187090873718262, + "learning_rate": 5.382066320153046e-06, + "loss": 0.21339232921600343, + "memory(GiB)": 91.64, + "step": 9540, + "token_acc": 0.9352548036758563, + "train_speed(iter/s)": 0.138487 + }, + { + "epoch": 0.9008116270290676, + "grad_norm": 0.23456265032291412, + "learning_rate": 5.331677181611006e-06, + "loss": 0.21850805282592772, + "memory(GiB)": 91.64, + "step": 9545, + "token_acc": 0.9417892156862745, + "train_speed(iter/s)": 0.138486 + }, + { + "epoch": 0.901283503208758, + "grad_norm": 0.3491338789463043, + "learning_rate": 5.281518574686162e-06, + "loss": 0.21774368286132811, + "memory(GiB)": 91.64, + "step": 9550, + "token_acc": 0.9337481698389458, + "train_speed(iter/s)": 0.138484 + }, + { + "epoch": 0.9017553793884485, + "grad_norm": 0.35306277871131897, + "learning_rate": 5.231590621522275e-06, + "loss": 0.21966407299041749, + "memory(GiB)": 91.64, + "step": 9555, + "token_acc": 0.9345238095238095, + "train_speed(iter/s)": 0.138482 + }, + { + "epoch": 0.9022272555681389, + "grad_norm": 0.3036733567714691, + "learning_rate": 5.18189344370138e-06, + "loss": 0.21493420600891114, + "memory(GiB)": 91.64, + "step": 9560, + "token_acc": 0.9334840167904424, + "train_speed(iter/s)": 0.138479 + }, + { + "epoch": 0.9026991317478293, + "grad_norm": 0.38445791602134705, + "learning_rate": 5.132427162243625e-06, + "loss": 0.20835418701171876, + "memory(GiB)": 91.64, + "step": 9565, + "token_acc": 0.9253386190948133, + "train_speed(iter/s)": 0.138477 + }, + { + "epoch": 0.9031710079275198, + "grad_norm": 0.5530850291252136, + "learning_rate": 5.083191897606843e-06, + "loss": 0.21398649215698243, + "memory(GiB)": 91.64, + "step": 9570, + "token_acc": 0.9331123832479663, + "train_speed(iter/s)": 0.138475 + }, + { + "epoch": 0.9036428841072103, + "grad_norm": 0.2753889858722687, + "learning_rate": 5.034187769686283e-06, + "loss": 0.2125246524810791, + "memory(GiB)": 91.64, + "step": 9575, + "token_acc": 0.9422946367956552, + "train_speed(iter/s)": 0.138474 + }, + { + "epoch": 0.9041147602869007, + "grad_norm": 0.332717627286911, + "learning_rate": 4.985414897814444e-06, + "loss": 0.2124195098876953, + "memory(GiB)": 91.64, + "step": 9580, + "token_acc": 0.9092827004219409, + "train_speed(iter/s)": 0.138473 + }, + { + "epoch": 0.9045866364665912, + "grad_norm": 0.2855324149131775, + "learning_rate": 4.936873400760544e-06, + "loss": 0.21741337776184083, + "memory(GiB)": 91.64, + "step": 9585, + "token_acc": 0.9338592233009708, + "train_speed(iter/s)": 0.138472 + }, + { + "epoch": 0.9050585126462816, + "grad_norm": 0.2555350065231323, + "learning_rate": 4.888563396730461e-06, + "loss": 0.21048321723937988, + "memory(GiB)": 91.64, + "step": 9590, + "token_acc": 0.921410365335599, + "train_speed(iter/s)": 0.138469 + }, + { + "epoch": 0.905530388825972, + "grad_norm": 0.24384169280529022, + "learning_rate": 4.840485003366324e-06, + "loss": 0.21017694473266602, + "memory(GiB)": 91.64, + "step": 9595, + "token_acc": 0.9220325833979829, + "train_speed(iter/s)": 0.138468 + }, + { + "epoch": 0.9060022650056625, + "grad_norm": 0.5031404495239258, + "learning_rate": 4.79263833774628e-06, + "loss": 0.21278533935546876, + "memory(GiB)": 91.64, + "step": 9600, + "token_acc": 0.9458646616541353, + "train_speed(iter/s)": 0.138465 + }, + { + "epoch": 0.906474141185353, + "grad_norm": 0.26602670550346375, + "learning_rate": 4.745023516384117e-06, + "loss": 0.2099222183227539, + "memory(GiB)": 91.64, + "step": 9605, + "token_acc": 0.9246897732135216, + "train_speed(iter/s)": 0.138464 + }, + { + "epoch": 0.9069460173650434, + "grad_norm": 0.3199900984764099, + "learning_rate": 4.6976406552291515e-06, + "loss": 0.2130331039428711, + "memory(GiB)": 91.64, + "step": 9610, + "token_acc": 0.9132743362831859, + "train_speed(iter/s)": 0.138463 + }, + { + "epoch": 0.9074178935447339, + "grad_norm": 0.4659852385520935, + "learning_rate": 4.650489869665731e-06, + "loss": 0.21802825927734376, + "memory(GiB)": 91.64, + "step": 9615, + "token_acc": 0.9254470426409904, + "train_speed(iter/s)": 0.13846 + }, + { + "epoch": 0.9078897697244243, + "grad_norm": 0.23172196745872498, + "learning_rate": 4.603571274513141e-06, + "loss": 0.21299545764923095, + "memory(GiB)": 91.64, + "step": 9620, + "token_acc": 0.9347454731807311, + "train_speed(iter/s)": 0.138459 + }, + { + "epoch": 0.9083616459041147, + "grad_norm": 0.24915564060211182, + "learning_rate": 4.556884984025234e-06, + "loss": 0.2165134906768799, + "memory(GiB)": 91.64, + "step": 9625, + "token_acc": 0.9073196419167984, + "train_speed(iter/s)": 0.138457 + }, + { + "epoch": 0.9088335220838052, + "grad_norm": 0.26748067140579224, + "learning_rate": 4.510431111890134e-06, + "loss": 0.2091744899749756, + "memory(GiB)": 91.64, + "step": 9630, + "token_acc": 0.9328651685393259, + "train_speed(iter/s)": 0.138456 + }, + { + "epoch": 0.9093053982634957, + "grad_norm": 0.4371824860572815, + "learning_rate": 4.4642097712299995e-06, + "loss": 0.21007180213928223, + "memory(GiB)": 91.64, + "step": 9635, + "token_acc": 0.9227008860372746, + "train_speed(iter/s)": 0.138454 + }, + { + "epoch": 0.9097772744431861, + "grad_norm": 0.4615022540092468, + "learning_rate": 4.418221074600792e-06, + "loss": 0.20864152908325195, + "memory(GiB)": 91.64, + "step": 9640, + "token_acc": 0.92157712305026, + "train_speed(iter/s)": 0.138453 + }, + { + "epoch": 0.9102491506228766, + "grad_norm": 0.5096688866615295, + "learning_rate": 4.372465133991888e-06, + "loss": 0.21734046936035156, + "memory(GiB)": 91.64, + "step": 9645, + "token_acc": 0.927038626609442, + "train_speed(iter/s)": 0.138452 + }, + { + "epoch": 0.9107210268025671, + "grad_norm": 0.21919603645801544, + "learning_rate": 4.326942060825889e-06, + "loss": 0.21668176651000975, + "memory(GiB)": 91.64, + "step": 9650, + "token_acc": 0.9174714661984197, + "train_speed(iter/s)": 0.13845 + }, + { + "epoch": 0.9111929029822574, + "grad_norm": 0.761786937713623, + "learning_rate": 4.281651965958355e-06, + "loss": 0.21059024333953857, + "memory(GiB)": 91.64, + "step": 9655, + "token_acc": 0.9291750503018109, + "train_speed(iter/s)": 0.138448 + }, + { + "epoch": 0.9116647791619479, + "grad_norm": 0.22924518585205078, + "learning_rate": 4.236594959677454e-06, + "loss": 0.21478679180145263, + "memory(GiB)": 91.64, + "step": 9660, + "token_acc": 0.9087285771223595, + "train_speed(iter/s)": 0.138446 + }, + { + "epoch": 0.9121366553416383, + "grad_norm": 0.568040132522583, + "learning_rate": 4.191771151703794e-06, + "loss": 0.2100062847137451, + "memory(GiB)": 91.64, + "step": 9665, + "token_acc": 0.9274411424160556, + "train_speed(iter/s)": 0.138444 + }, + { + "epoch": 0.9126085315213288, + "grad_norm": 0.5014088153839111, + "learning_rate": 4.147180651190085e-06, + "loss": 0.21394610404968262, + "memory(GiB)": 91.64, + "step": 9670, + "token_acc": 0.9418457648546145, + "train_speed(iter/s)": 0.138442 + }, + { + "epoch": 0.9130804077010193, + "grad_norm": 0.4113982617855072, + "learning_rate": 4.102823566720926e-06, + "loss": 0.21270480155944824, + "memory(GiB)": 91.64, + "step": 9675, + "token_acc": 0.9344614558152028, + "train_speed(iter/s)": 0.13844 + }, + { + "epoch": 0.9135522838807097, + "grad_norm": 0.36646685004234314, + "learning_rate": 4.058700006312488e-06, + "loss": 0.20875248908996583, + "memory(GiB)": 91.64, + "step": 9680, + "token_acc": 0.9254823685961411, + "train_speed(iter/s)": 0.138438 + }, + { + "epoch": 0.9140241600604001, + "grad_norm": 0.37394624948501587, + "learning_rate": 4.014810077412279e-06, + "loss": 0.20997467041015624, + "memory(GiB)": 91.64, + "step": 9685, + "token_acc": 0.9180238870792616, + "train_speed(iter/s)": 0.138438 + }, + { + "epoch": 0.9144960362400906, + "grad_norm": 0.28262215852737427, + "learning_rate": 3.9711538868988815e-06, + "loss": 0.2138798475265503, + "memory(GiB)": 91.64, + "step": 9690, + "token_acc": 0.9245947850599013, + "train_speed(iter/s)": 0.138438 + }, + { + "epoch": 0.914967912419781, + "grad_norm": 0.24222835898399353, + "learning_rate": 3.927731541081692e-06, + "loss": 0.21181824207305908, + "memory(GiB)": 91.64, + "step": 9695, + "token_acc": 0.9092514124293786, + "train_speed(iter/s)": 0.138437 + }, + { + "epoch": 0.9154397885994715, + "grad_norm": 0.2124110907316208, + "learning_rate": 3.884543145700659e-06, + "loss": 0.209627103805542, + "memory(GiB)": 91.64, + "step": 9700, + "token_acc": 0.9199381761978361, + "train_speed(iter/s)": 0.138436 + }, + { + "epoch": 0.915911664779162, + "grad_norm": 0.38374340534210205, + "learning_rate": 3.841588805926033e-06, + "loss": 0.21204769611358643, + "memory(GiB)": 91.64, + "step": 9705, + "token_acc": 0.9301503094606542, + "train_speed(iter/s)": 0.138434 + }, + { + "epoch": 0.9163835409588524, + "grad_norm": 0.22321577370166779, + "learning_rate": 3.7988686263580985e-06, + "loss": 0.21288986206054689, + "memory(GiB)": 91.64, + "step": 9710, + "token_acc": 0.915719696969697, + "train_speed(iter/s)": 0.138432 + }, + { + "epoch": 0.9168554171385428, + "grad_norm": 0.3166010081768036, + "learning_rate": 3.7563827110269177e-06, + "loss": 0.20771732330322265, + "memory(GiB)": 91.64, + "step": 9715, + "token_acc": 0.9292553191489362, + "train_speed(iter/s)": 0.138432 + }, + { + "epoch": 0.9173272933182333, + "grad_norm": 0.4344588816165924, + "learning_rate": 3.714131163392065e-06, + "loss": 0.212508225440979, + "memory(GiB)": 91.64, + "step": 9720, + "token_acc": 0.9188432835820896, + "train_speed(iter/s)": 0.13843 + }, + { + "epoch": 0.9177991694979237, + "grad_norm": 0.3643015921115875, + "learning_rate": 3.6721140863424817e-06, + "loss": 0.21447608470916749, + "memory(GiB)": 91.64, + "step": 9725, + "token_acc": 0.9209919261822376, + "train_speed(iter/s)": 0.138429 + }, + { + "epoch": 0.9182710456776142, + "grad_norm": 0.45395082235336304, + "learning_rate": 3.6303315821960227e-06, + "loss": 0.21562774181365968, + "memory(GiB)": 91.64, + "step": 9730, + "token_acc": 0.9170403587443946, + "train_speed(iter/s)": 0.138427 + }, + { + "epoch": 0.9187429218573047, + "grad_norm": 0.6889760494232178, + "learning_rate": 3.5887837526993983e-06, + "loss": 0.22067337036132811, + "memory(GiB)": 91.64, + "step": 9735, + "token_acc": 0.9127798507462687, + "train_speed(iter/s)": 0.138426 + }, + { + "epoch": 0.9192147980369951, + "grad_norm": 0.3173013925552368, + "learning_rate": 3.5474706990278217e-06, + "loss": 0.21371016502380372, + "memory(GiB)": 91.64, + "step": 9740, + "token_acc": 0.923474329996771, + "train_speed(iter/s)": 0.138422 + }, + { + "epoch": 0.9196866742166856, + "grad_norm": 0.25628921389579773, + "learning_rate": 3.506392521784796e-06, + "loss": 0.21721768379211426, + "memory(GiB)": 91.64, + "step": 9745, + "token_acc": 0.9235836627140975, + "train_speed(iter/s)": 0.13842 + }, + { + "epoch": 0.920158550396376, + "grad_norm": 0.26340246200561523, + "learning_rate": 3.4655493210018484e-06, + "loss": 0.20722723007202148, + "memory(GiB)": 91.64, + "step": 9750, + "token_acc": 0.9288334556126192, + "train_speed(iter/s)": 0.138418 + }, + { + "epoch": 0.9206304265760664, + "grad_norm": 0.5831912159919739, + "learning_rate": 3.424941196138376e-06, + "loss": 0.21827468872070313, + "memory(GiB)": 91.64, + "step": 9755, + "token_acc": 0.9169483341380975, + "train_speed(iter/s)": 0.138417 + }, + { + "epoch": 0.9211023027557569, + "grad_norm": 0.22615396976470947, + "learning_rate": 3.384568246081221e-06, + "loss": 0.2103797435760498, + "memory(GiB)": 91.64, + "step": 9760, + "token_acc": 0.9311565696302124, + "train_speed(iter/s)": 0.138415 + }, + { + "epoch": 0.9215741789354474, + "grad_norm": 0.4777851700782776, + "learning_rate": 3.3444305691446075e-06, + "loss": 0.21615819931030272, + "memory(GiB)": 91.64, + "step": 9765, + "token_acc": 0.9326971371170266, + "train_speed(iter/s)": 0.138413 + }, + { + "epoch": 0.9220460551151378, + "grad_norm": 0.2671146094799042, + "learning_rate": 3.3045282630698506e-06, + "loss": 0.21225442886352539, + "memory(GiB)": 91.64, + "step": 9770, + "token_acc": 0.9126576366184026, + "train_speed(iter/s)": 0.138412 + }, + { + "epoch": 0.9225179312948283, + "grad_norm": 0.24419555068016052, + "learning_rate": 3.264861425025034e-06, + "loss": 0.20959196090698243, + "memory(GiB)": 91.64, + "step": 9775, + "token_acc": 0.9362338093656593, + "train_speed(iter/s)": 0.13841 + }, + { + "epoch": 0.9229898074745186, + "grad_norm": 0.3162088394165039, + "learning_rate": 3.2254301516049025e-06, + "loss": 0.21189465522766113, + "memory(GiB)": 91.64, + "step": 9780, + "token_acc": 0.9261028378758078, + "train_speed(iter/s)": 0.13841 + }, + { + "epoch": 0.9234616836542091, + "grad_norm": 0.2638489305973053, + "learning_rate": 3.1862345388305237e-06, + "loss": 0.2071277379989624, + "memory(GiB)": 91.64, + "step": 9785, + "token_acc": 0.9252124645892351, + "train_speed(iter/s)": 0.138408 + }, + { + "epoch": 0.9239335598338996, + "grad_norm": 0.4801633656024933, + "learning_rate": 3.1472746821491373e-06, + "loss": 0.2107628345489502, + "memory(GiB)": 91.64, + "step": 9790, + "token_acc": 0.9388661202185792, + "train_speed(iter/s)": 0.138406 + }, + { + "epoch": 0.92440543601359, + "grad_norm": 0.3137304484844208, + "learning_rate": 3.1085506764338524e-06, + "loss": 0.20806145668029785, + "memory(GiB)": 91.64, + "step": 9795, + "token_acc": 0.9370592844084618, + "train_speed(iter/s)": 0.138404 + }, + { + "epoch": 0.9248773121932805, + "grad_norm": 0.4165758788585663, + "learning_rate": 3.070062615983449e-06, + "loss": 0.21570463180541993, + "memory(GiB)": 91.64, + "step": 9800, + "token_acc": 0.9288702928870293, + "train_speed(iter/s)": 0.138403 + }, + { + "epoch": 0.925349188372971, + "grad_norm": 0.26783013343811035, + "learning_rate": 3.031810594522133e-06, + "loss": 0.21047744750976563, + "memory(GiB)": 91.64, + "step": 9805, + "token_acc": 0.9246785058175138, + "train_speed(iter/s)": 0.138401 + }, + { + "epoch": 0.9258210645526613, + "grad_norm": 0.3021276593208313, + "learning_rate": 2.99379470519936e-06, + "loss": 0.20863080024719238, + "memory(GiB)": 91.64, + "step": 9810, + "token_acc": 0.9242424242424242, + "train_speed(iter/s)": 0.138398 + }, + { + "epoch": 0.9262929407323518, + "grad_norm": 0.7376922965049744, + "learning_rate": 2.9560150405895325e-06, + "loss": 0.21465864181518554, + "memory(GiB)": 91.64, + "step": 9815, + "token_acc": 0.9263358778625954, + "train_speed(iter/s)": 0.138396 + }, + { + "epoch": 0.9267648169120423, + "grad_norm": 0.33614635467529297, + "learning_rate": 2.9184716926918263e-06, + "loss": 0.20645856857299805, + "memory(GiB)": 91.64, + "step": 9820, + "token_acc": 0.9207180254300673, + "train_speed(iter/s)": 0.138394 + }, + { + "epoch": 0.9272366930917327, + "grad_norm": 0.24106520414352417, + "learning_rate": 2.8811647529299436e-06, + "loss": 0.20869898796081543, + "memory(GiB)": 91.64, + "step": 9825, + "token_acc": 0.9237107110161733, + "train_speed(iter/s)": 0.138393 + }, + { + "epoch": 0.9277085692714232, + "grad_norm": 0.2242364138364792, + "learning_rate": 2.8440943121518905e-06, + "loss": 0.20895557403564452, + "memory(GiB)": 91.64, + "step": 9830, + "token_acc": 0.9298651252408477, + "train_speed(iter/s)": 0.138391 + }, + { + "epoch": 0.9281804454511137, + "grad_norm": 0.401591420173645, + "learning_rate": 2.807260460629768e-06, + "loss": 0.20607337951660157, + "memory(GiB)": 91.64, + "step": 9835, + "token_acc": 0.9263485477178424, + "train_speed(iter/s)": 0.138391 + }, + { + "epoch": 0.928652321630804, + "grad_norm": 0.37716448307037354, + "learning_rate": 2.7706632880595716e-06, + "loss": 0.21465234756469725, + "memory(GiB)": 91.64, + "step": 9840, + "token_acc": 0.9250949257852952, + "train_speed(iter/s)": 0.138389 + }, + { + "epoch": 0.9291241978104945, + "grad_norm": 0.5281177759170532, + "learning_rate": 2.734302883560902e-06, + "loss": 0.21172585487365722, + "memory(GiB)": 91.64, + "step": 9845, + "token_acc": 0.9257907542579076, + "train_speed(iter/s)": 0.138387 + }, + { + "epoch": 0.929596073990185, + "grad_norm": 0.30686765909194946, + "learning_rate": 2.6981793356768314e-06, + "loss": 0.21002044677734374, + "memory(GiB)": 91.64, + "step": 9850, + "token_acc": 0.9194915254237288, + "train_speed(iter/s)": 0.138385 + }, + { + "epoch": 0.9300679501698754, + "grad_norm": 0.42280927300453186, + "learning_rate": 2.662292732373639e-06, + "loss": 0.2111431121826172, + "memory(GiB)": 91.64, + "step": 9855, + "token_acc": 0.9319787985865724, + "train_speed(iter/s)": 0.138384 + }, + { + "epoch": 0.9305398263495659, + "grad_norm": 0.19301030039787292, + "learning_rate": 2.6266431610405984e-06, + "loss": 0.20500037670135499, + "memory(GiB)": 91.64, + "step": 9860, + "token_acc": 0.9368821292775665, + "train_speed(iter/s)": 0.138383 + }, + { + "epoch": 0.9310117025292564, + "grad_norm": 0.27901899814605713, + "learning_rate": 2.591230708489778e-06, + "loss": 0.2096851348876953, + "memory(GiB)": 91.64, + "step": 9865, + "token_acc": 0.928513731825525, + "train_speed(iter/s)": 0.138382 + }, + { + "epoch": 0.9314835787089468, + "grad_norm": 0.26552677154541016, + "learning_rate": 2.5560554609558417e-06, + "loss": 0.2152176856994629, + "memory(GiB)": 91.64, + "step": 9870, + "token_acc": 0.9420463629096723, + "train_speed(iter/s)": 0.13838 + }, + { + "epoch": 0.9319554548886372, + "grad_norm": 0.3852667808532715, + "learning_rate": 2.5211175040958048e-06, + "loss": 0.20902786254882813, + "memory(GiB)": 91.64, + "step": 9875, + "token_acc": 0.9132697311361665, + "train_speed(iter/s)": 0.138379 + }, + { + "epoch": 0.9324273310683276, + "grad_norm": 0.3729971647262573, + "learning_rate": 2.4864169229888654e-06, + "loss": 0.2123638153076172, + "memory(GiB)": 91.64, + "step": 9880, + "token_acc": 0.9168326693227091, + "train_speed(iter/s)": 0.138378 + }, + { + "epoch": 0.9328992072480181, + "grad_norm": 0.3718613386154175, + "learning_rate": 2.4519538021361422e-06, + "loss": 0.21329360008239745, + "memory(GiB)": 91.64, + "step": 9885, + "token_acc": 0.9229032258064516, + "train_speed(iter/s)": 0.138376 + }, + { + "epoch": 0.9333710834277086, + "grad_norm": 0.23476502299308777, + "learning_rate": 2.417728225460525e-06, + "loss": 0.20669341087341309, + "memory(GiB)": 91.64, + "step": 9890, + "token_acc": 0.9244951712028094, + "train_speed(iter/s)": 0.138376 + }, + { + "epoch": 0.933842959607399, + "grad_norm": 0.3839211165904999, + "learning_rate": 2.3837402763064567e-06, + "loss": 0.20854783058166504, + "memory(GiB)": 91.64, + "step": 9895, + "token_acc": 0.9435273675065161, + "train_speed(iter/s)": 0.138374 + }, + { + "epoch": 0.9343148357870895, + "grad_norm": 0.4082486033439636, + "learning_rate": 2.349990037439709e-06, + "loss": 0.21600706577301027, + "memory(GiB)": 91.64, + "step": 9900, + "token_acc": 0.931433659839715, + "train_speed(iter/s)": 0.138373 + }, + { + "epoch": 0.9347867119667799, + "grad_norm": 0.23630106449127197, + "learning_rate": 2.3164775910471834e-06, + "loss": 0.2045605182647705, + "memory(GiB)": 91.64, + "step": 9905, + "token_acc": 0.9155844155844156, + "train_speed(iter/s)": 0.13837 + }, + { + "epoch": 0.9352585881464703, + "grad_norm": 0.26476147770881653, + "learning_rate": 2.283203018736757e-06, + "loss": 0.21763415336608888, + "memory(GiB)": 91.64, + "step": 9910, + "token_acc": 0.9334708612686952, + "train_speed(iter/s)": 0.13837 + }, + { + "epoch": 0.9357304643261608, + "grad_norm": 0.23421037197113037, + "learning_rate": 2.2501664015369906e-06, + "loss": 0.21492888927459716, + "memory(GiB)": 91.64, + "step": 9915, + "token_acc": 0.9112426035502958, + "train_speed(iter/s)": 0.138369 + }, + { + "epoch": 0.9362023405058513, + "grad_norm": 0.5269423127174377, + "learning_rate": 2.2173678198970316e-06, + "loss": 0.2087617874145508, + "memory(GiB)": 91.64, + "step": 9920, + "token_acc": 0.9160095989029825, + "train_speed(iter/s)": 0.138366 + }, + { + "epoch": 0.9366742166855417, + "grad_norm": 0.3009715676307678, + "learning_rate": 2.1848073536863577e-06, + "loss": 0.20663909912109374, + "memory(GiB)": 91.64, + "step": 9925, + "token_acc": 0.920963260265514, + "train_speed(iter/s)": 0.138365 + }, + { + "epoch": 0.9371460928652322, + "grad_norm": 0.31081992387771606, + "learning_rate": 2.152485082194633e-06, + "loss": 0.21198580265045167, + "memory(GiB)": 91.64, + "step": 9930, + "token_acc": 0.9265060240963855, + "train_speed(iter/s)": 0.138362 + }, + { + "epoch": 0.9376179690449226, + "grad_norm": 0.21676135063171387, + "learning_rate": 2.120401084131418e-06, + "loss": 0.2091569185256958, + "memory(GiB)": 91.64, + "step": 9935, + "token_acc": 0.9232542819499341, + "train_speed(iter/s)": 0.13836 + }, + { + "epoch": 0.938089845224613, + "grad_norm": 0.6019723415374756, + "learning_rate": 2.0885554376261164e-06, + "loss": 0.2074450969696045, + "memory(GiB)": 91.64, + "step": 9940, + "token_acc": 0.9200394866732478, + "train_speed(iter/s)": 0.13836 + }, + { + "epoch": 0.9385617214043035, + "grad_norm": 0.37886330485343933, + "learning_rate": 2.056948220227639e-06, + "loss": 0.2108628273010254, + "memory(GiB)": 91.64, + "step": 9945, + "token_acc": 0.9312602291325696, + "train_speed(iter/s)": 0.138359 + }, + { + "epoch": 0.939033597583994, + "grad_norm": 0.2426460087299347, + "learning_rate": 2.0255795089043296e-06, + "loss": 0.21312596797943115, + "memory(GiB)": 91.64, + "step": 9950, + "token_acc": 0.929093567251462, + "train_speed(iter/s)": 0.138357 + }, + { + "epoch": 0.9395054737636844, + "grad_norm": 0.38751596212387085, + "learning_rate": 1.994449380043717e-06, + "loss": 0.2076961040496826, + "memory(GiB)": 91.64, + "step": 9955, + "token_acc": 0.9129967776584318, + "train_speed(iter/s)": 0.138358 + }, + { + "epoch": 0.9399773499433749, + "grad_norm": 0.2836040258407593, + "learning_rate": 1.9635579094523514e-06, + "loss": 0.20892977714538574, + "memory(GiB)": 91.64, + "step": 9960, + "token_acc": 0.9155970439517698, + "train_speed(iter/s)": 0.138356 + }, + { + "epoch": 0.9404492261230654, + "grad_norm": 0.34263989329338074, + "learning_rate": 1.932905172355637e-06, + "loss": 0.21328263282775878, + "memory(GiB)": 91.64, + "step": 9965, + "token_acc": 0.9321761491481839, + "train_speed(iter/s)": 0.138354 + }, + { + "epoch": 0.9409211023027557, + "grad_norm": 0.3382331430912018, + "learning_rate": 1.902491243397575e-06, + "loss": 0.20347208976745607, + "memory(GiB)": 91.64, + "step": 9970, + "token_acc": 0.9281078382426361, + "train_speed(iter/s)": 0.138352 + }, + { + "epoch": 0.9413929784824462, + "grad_norm": 0.4317174255847931, + "learning_rate": 1.8723161966406777e-06, + "loss": 0.21226215362548828, + "memory(GiB)": 91.64, + "step": 9975, + "token_acc": 0.9255702280912365, + "train_speed(iter/s)": 0.138351 + }, + { + "epoch": 0.9418648546621367, + "grad_norm": 0.252453088760376, + "learning_rate": 1.842380105565711e-06, + "loss": 0.20707168579101562, + "memory(GiB)": 91.64, + "step": 9980, + "token_acc": 0.930406852248394, + "train_speed(iter/s)": 0.138351 + }, + { + "epoch": 0.9423367308418271, + "grad_norm": 0.2883821725845337, + "learning_rate": 1.8126830430715724e-06, + "loss": 0.20754437446594237, + "memory(GiB)": 91.64, + "step": 9985, + "token_acc": 0.9166115155526141, + "train_speed(iter/s)": 0.13835 + }, + { + "epoch": 0.9428086070215176, + "grad_norm": 0.2774185538291931, + "learning_rate": 1.7832250814750817e-06, + "loss": 0.21275861263275148, + "memory(GiB)": 91.64, + "step": 9990, + "token_acc": 0.9190020505809979, + "train_speed(iter/s)": 0.138351 + }, + { + "epoch": 0.943280483201208, + "grad_norm": 0.54234379529953, + "learning_rate": 1.7540062925108126e-06, + "loss": 0.21790072917938233, + "memory(GiB)": 91.64, + "step": 9995, + "token_acc": 0.9157351676698194, + "train_speed(iter/s)": 0.13835 + }, + { + "epoch": 0.9437523593808984, + "grad_norm": 0.3133432865142822, + "learning_rate": 1.7250267473309046e-06, + "loss": 0.21021485328674316, + "memory(GiB)": 91.64, + "step": 10000, + "token_acc": 0.9144818976279651, + "train_speed(iter/s)": 0.138348 + }, + { + "epoch": 0.9442242355605889, + "grad_norm": 0.8445714712142944, + "learning_rate": 1.696286516504908e-06, + "loss": 0.2085047721862793, + "memory(GiB)": 91.64, + "step": 10005, + "token_acc": 0.9371900826446281, + "train_speed(iter/s)": 0.138347 + }, + { + "epoch": 0.9446961117402793, + "grad_norm": 0.2897244691848755, + "learning_rate": 1.6677856700196394e-06, + "loss": 0.21313183307647704, + "memory(GiB)": 91.64, + "step": 10010, + "token_acc": 0.9148644009612084, + "train_speed(iter/s)": 0.138345 + }, + { + "epoch": 0.9451679879199698, + "grad_norm": 0.3938073515892029, + "learning_rate": 1.6395242772789144e-06, + "loss": 0.2124699354171753, + "memory(GiB)": 91.64, + "step": 10015, + "token_acc": 0.927461139896373, + "train_speed(iter/s)": 0.138344 + }, + { + "epoch": 0.9456398640996603, + "grad_norm": 0.2608243525028229, + "learning_rate": 1.6115024071034933e-06, + "loss": 0.20247375965118408, + "memory(GiB)": 91.64, + "step": 10020, + "token_acc": 0.9147230320699709, + "train_speed(iter/s)": 0.138342 + }, + { + "epoch": 0.9461117402793507, + "grad_norm": 0.44675490260124207, + "learning_rate": 1.58372012773087e-06, + "loss": 0.214943265914917, + "memory(GiB)": 91.64, + "step": 10025, + "token_acc": 0.9279670706608736, + "train_speed(iter/s)": 0.13834 + }, + { + "epoch": 0.9465836164590411, + "grad_norm": 0.4210216701030731, + "learning_rate": 1.5561775068150485e-06, + "loss": 0.2119581937789917, + "memory(GiB)": 91.64, + "step": 10030, + "token_acc": 0.9224646226415094, + "train_speed(iter/s)": 0.138339 + }, + { + "epoch": 0.9470554926387316, + "grad_norm": 0.33085545897483826, + "learning_rate": 1.5288746114264673e-06, + "loss": 0.21336703300476073, + "memory(GiB)": 91.64, + "step": 10035, + "token_acc": 0.9265560165975104, + "train_speed(iter/s)": 0.138337 + }, + { + "epoch": 0.947527368818422, + "grad_norm": 0.3594004213809967, + "learning_rate": 1.501811508051787e-06, + "loss": 0.2055363416671753, + "memory(GiB)": 91.64, + "step": 10040, + "token_acc": 0.9253539253539254, + "train_speed(iter/s)": 0.138335 + }, + { + "epoch": 0.9479992449981125, + "grad_norm": 0.29170048236846924, + "learning_rate": 1.474988262593735e-06, + "loss": 0.20636603832244874, + "memory(GiB)": 91.64, + "step": 10045, + "token_acc": 0.9267399267399268, + "train_speed(iter/s)": 0.138333 + }, + { + "epoch": 0.948471121177803, + "grad_norm": 0.29537680745124817, + "learning_rate": 1.448404940370951e-06, + "loss": 0.22197353839874268, + "memory(GiB)": 91.64, + "step": 10050, + "token_acc": 0.9287945034353529, + "train_speed(iter/s)": 0.138332 + }, + { + "epoch": 0.9489429973574934, + "grad_norm": 0.29160556197166443, + "learning_rate": 1.4220616061178415e-06, + "loss": 0.20441300868988038, + "memory(GiB)": 91.64, + "step": 10055, + "token_acc": 0.929786066922655, + "train_speed(iter/s)": 0.138332 + }, + { + "epoch": 0.9494148735371838, + "grad_norm": 0.36549004912376404, + "learning_rate": 1.3959583239843698e-06, + "loss": 0.21138055324554444, + "memory(GiB)": 91.64, + "step": 10060, + "token_acc": 0.9201907790143085, + "train_speed(iter/s)": 0.138329 + }, + { + "epoch": 0.9498867497168743, + "grad_norm": 0.24625816941261292, + "learning_rate": 1.3700951575359666e-06, + "loss": 0.21180267333984376, + "memory(GiB)": 91.64, + "step": 10065, + "token_acc": 0.9221022993899578, + "train_speed(iter/s)": 0.138328 + }, + { + "epoch": 0.9503586258965647, + "grad_norm": 0.24190473556518555, + "learning_rate": 1.3444721697533413e-06, + "loss": 0.20847978591918945, + "memory(GiB)": 91.64, + "step": 10070, + "token_acc": 0.9261583011583011, + "train_speed(iter/s)": 0.138326 + }, + { + "epoch": 0.9508305020762552, + "grad_norm": 0.44247832894325256, + "learning_rate": 1.3190894230323159e-06, + "loss": 0.2141636848449707, + "memory(GiB)": 91.64, + "step": 10075, + "token_acc": 0.9151750972762646, + "train_speed(iter/s)": 0.138325 + }, + { + "epoch": 0.9513023782559457, + "grad_norm": 0.4810398817062378, + "learning_rate": 1.2939469791837133e-06, + "loss": 0.211592960357666, + "memory(GiB)": 91.64, + "step": 10080, + "token_acc": 0.9310824921684651, + "train_speed(iter/s)": 0.138323 + }, + { + "epoch": 0.9517742544356361, + "grad_norm": 0.26484912633895874, + "learning_rate": 1.2690448994331472e-06, + "loss": 0.21042494773864745, + "memory(GiB)": 91.64, + "step": 10085, + "token_acc": 0.9333983105912931, + "train_speed(iter/s)": 0.13832 + }, + { + "epoch": 0.9522461306153266, + "grad_norm": 0.24709878861904144, + "learning_rate": 1.2443832444209547e-06, + "loss": 0.2084174394607544, + "memory(GiB)": 91.64, + "step": 10090, + "token_acc": 0.9254068716094033, + "train_speed(iter/s)": 0.138319 + }, + { + "epoch": 0.952718006795017, + "grad_norm": 0.29317525029182434, + "learning_rate": 1.2199620742019636e-06, + "loss": 0.20613834857940674, + "memory(GiB)": 91.64, + "step": 10095, + "token_acc": 0.9246799642750819, + "train_speed(iter/s)": 0.138318 + }, + { + "epoch": 0.9531898829747074, + "grad_norm": 0.374560683965683, + "learning_rate": 1.195781448245392e-06, + "loss": 0.21635422706604004, + "memory(GiB)": 91.64, + "step": 10100, + "token_acc": 0.926530612244898, + "train_speed(iter/s)": 0.138317 + }, + { + "epoch": 0.9536617591543979, + "grad_norm": 0.33670246601104736, + "learning_rate": 1.1718414254347276e-06, + "loss": 0.20765471458435059, + "memory(GiB)": 91.64, + "step": 10105, + "token_acc": 0.928082191780822, + "train_speed(iter/s)": 0.138315 + }, + { + "epoch": 0.9541336353340883, + "grad_norm": 0.21765998005867004, + "learning_rate": 1.1481420640675257e-06, + "loss": 0.20929303169250488, + "memory(GiB)": 91.64, + "step": 10110, + "token_acc": 0.9243661366566395, + "train_speed(iter/s)": 0.138314 + }, + { + "epoch": 0.9546055115137788, + "grad_norm": 0.7290151715278625, + "learning_rate": 1.124683421855277e-06, + "loss": 0.20725011825561523, + "memory(GiB)": 91.64, + "step": 10115, + "token_acc": 0.927360774818402, + "train_speed(iter/s)": 0.138314 + }, + { + "epoch": 0.9550773876934693, + "grad_norm": 0.26937779784202576, + "learning_rate": 1.1014655559233312e-06, + "loss": 0.21137325763702391, + "memory(GiB)": 91.64, + "step": 10120, + "token_acc": 0.9266569200779727, + "train_speed(iter/s)": 0.138314 + }, + { + "epoch": 0.9555492638731596, + "grad_norm": 0.26850059628486633, + "learning_rate": 1.0784885228106722e-06, + "loss": 0.21040570735931396, + "memory(GiB)": 91.64, + "step": 10125, + "token_acc": 0.9151398264223722, + "train_speed(iter/s)": 0.138313 + }, + { + "epoch": 0.9560211400528501, + "grad_norm": 0.34640467166900635, + "learning_rate": 1.055752378469832e-06, + "loss": 0.2133777141571045, + "memory(GiB)": 91.64, + "step": 10130, + "token_acc": 0.9248380129589633, + "train_speed(iter/s)": 0.138312 + }, + { + "epoch": 0.9564930162325406, + "grad_norm": 0.24151375889778137, + "learning_rate": 1.0332571782667555e-06, + "loss": 0.21562702655792237, + "memory(GiB)": 91.64, + "step": 10135, + "token_acc": 0.9167294649585531, + "train_speed(iter/s)": 0.13831 + }, + { + "epoch": 0.956964892412231, + "grad_norm": 0.8157694935798645, + "learning_rate": 1.0110029769806462e-06, + "loss": 0.21337783336639404, + "memory(GiB)": 91.64, + "step": 10140, + "token_acc": 0.9196428571428571, + "train_speed(iter/s)": 0.138309 + }, + { + "epoch": 0.9574367685919215, + "grad_norm": 0.24254101514816284, + "learning_rate": 9.889898288038103e-07, + "loss": 0.2101999282836914, + "memory(GiB)": 91.64, + "step": 10145, + "token_acc": 0.9198137609932747, + "train_speed(iter/s)": 0.138309 + }, + { + "epoch": 0.957908644771612, + "grad_norm": 0.6977924108505249, + "learning_rate": 9.67217787341601e-07, + "loss": 0.2110898494720459, + "memory(GiB)": 91.64, + "step": 10150, + "token_acc": 0.9282487377092745, + "train_speed(iter/s)": 0.138307 + }, + { + "epoch": 0.9583805209513023, + "grad_norm": 0.5975959300994873, + "learning_rate": 9.456869056122197e-07, + "loss": 0.20567688941955567, + "memory(GiB)": 91.64, + "step": 10155, + "token_acc": 0.9243664717348928, + "train_speed(iter/s)": 0.138306 + }, + { + "epoch": 0.9588523971309928, + "grad_norm": 0.32901281118392944, + "learning_rate": 9.243972360465702e-07, + "loss": 0.20593414306640626, + "memory(GiB)": 91.64, + "step": 10160, + "token_acc": 0.9356405585913783, + "train_speed(iter/s)": 0.138304 + }, + { + "epoch": 0.9593242733106833, + "grad_norm": 0.2798580527305603, + "learning_rate": 9.033488304882487e-07, + "loss": 0.20811738967895507, + "memory(GiB)": 91.64, + "step": 10165, + "token_acc": 0.9180633147113594, + "train_speed(iter/s)": 0.138303 + }, + { + "epoch": 0.9597961494903737, + "grad_norm": 0.1956169605255127, + "learning_rate": 8.825417401932545e-07, + "loss": 0.21179871559143065, + "memory(GiB)": 91.64, + "step": 10170, + "token_acc": 0.9301929625425652, + "train_speed(iter/s)": 0.138302 + }, + { + "epoch": 0.9602680256700642, + "grad_norm": 0.272049218416214, + "learning_rate": 8.619760158300016e-07, + "loss": 0.207623291015625, + "memory(GiB)": 91.64, + "step": 10175, + "token_acc": 0.9189681335356601, + "train_speed(iter/s)": 0.138302 + }, + { + "epoch": 0.9607399018497547, + "grad_norm": 0.3947182893753052, + "learning_rate": 8.416517074791297e-07, + "loss": 0.2093808650970459, + "memory(GiB)": 91.64, + "step": 10180, + "token_acc": 0.9274025587622732, + "train_speed(iter/s)": 0.138301 + }, + { + "epoch": 0.961211778029445, + "grad_norm": 0.2339448779821396, + "learning_rate": 8.215688646333819e-07, + "loss": 0.20636515617370604, + "memory(GiB)": 91.64, + "step": 10185, + "token_acc": 0.918429003021148, + "train_speed(iter/s)": 0.1383 + }, + { + "epoch": 0.9616836542091355, + "grad_norm": 0.2749667763710022, + "learning_rate": 8.01727536197483e-07, + "loss": 0.21485419273376466, + "memory(GiB)": 91.64, + "step": 10190, + "token_acc": 0.9255688391824142, + "train_speed(iter/s)": 0.138297 + }, + { + "epoch": 0.962155530388826, + "grad_norm": 0.2768957316875458, + "learning_rate": 7.821277704880947e-07, + "loss": 0.20581846237182616, + "memory(GiB)": 91.64, + "step": 10195, + "token_acc": 0.9279426149331594, + "train_speed(iter/s)": 0.138295 + }, + { + "epoch": 0.9626274065685164, + "grad_norm": 0.5955285429954529, + "learning_rate": 7.627696152335717e-07, + "loss": 0.20913143157958985, + "memory(GiB)": 91.64, + "step": 10200, + "token_acc": 0.921988855550793, + "train_speed(iter/s)": 0.138294 + }, + { + "epoch": 0.9630992827482069, + "grad_norm": 0.2856890857219696, + "learning_rate": 7.436531175739392e-07, + "loss": 0.21088147163391113, + "memory(GiB)": 91.64, + "step": 10205, + "token_acc": 0.9269616026711185, + "train_speed(iter/s)": 0.138292 + }, + { + "epoch": 0.9635711589278974, + "grad_norm": 0.272771418094635, + "learning_rate": 7.247783240607598e-07, + "loss": 0.2031481981277466, + "memory(GiB)": 91.64, + "step": 10210, + "token_acc": 0.9186991869918699, + "train_speed(iter/s)": 0.13829 + }, + { + "epoch": 0.9640430351075878, + "grad_norm": 0.42939260601997375, + "learning_rate": 7.061452806569668e-07, + "loss": 0.2058316707611084, + "memory(GiB)": 91.64, + "step": 10215, + "token_acc": 0.9244166940519224, + "train_speed(iter/s)": 0.138288 + }, + { + "epoch": 0.9645149112872782, + "grad_norm": 0.3867746889591217, + "learning_rate": 6.877540327368648e-07, + "loss": 0.2134486675262451, + "memory(GiB)": 91.64, + "step": 10220, + "token_acc": 0.9220824598183088, + "train_speed(iter/s)": 0.138287 + }, + { + "epoch": 0.9649867874669686, + "grad_norm": 0.23862211406230927, + "learning_rate": 6.696046250858845e-07, + "loss": 0.2061309814453125, + "memory(GiB)": 91.64, + "step": 10225, + "token_acc": 0.9267634154573068, + "train_speed(iter/s)": 0.138285 + }, + { + "epoch": 0.9654586636466591, + "grad_norm": 0.3633194863796234, + "learning_rate": 6.516971019005724e-07, + "loss": 0.21052196025848388, + "memory(GiB)": 91.64, + "step": 10230, + "token_acc": 0.9288199936728884, + "train_speed(iter/s)": 0.138284 + }, + { + "epoch": 0.9659305398263496, + "grad_norm": 0.24097710847854614, + "learning_rate": 6.340315067884461e-07, + "loss": 0.21546194553375245, + "memory(GiB)": 91.64, + "step": 10235, + "token_acc": 0.9337719298245614, + "train_speed(iter/s)": 0.138283 + }, + { + "epoch": 0.96640241600604, + "grad_norm": 0.23538899421691895, + "learning_rate": 6.166078827678945e-07, + "loss": 0.21139774322509766, + "memory(GiB)": 91.64, + "step": 10240, + "token_acc": 0.942993907745866, + "train_speed(iter/s)": 0.138282 + }, + { + "epoch": 0.9668742921857305, + "grad_norm": 0.19995267689228058, + "learning_rate": 5.994262722680332e-07, + "loss": 0.21091582775115966, + "memory(GiB)": 91.64, + "step": 10245, + "token_acc": 0.9313725490196079, + "train_speed(iter/s)": 0.138281 + }, + { + "epoch": 0.9673461683654209, + "grad_norm": 0.4463655948638916, + "learning_rate": 5.824867171287163e-07, + "loss": 0.21558599472045897, + "memory(GiB)": 91.64, + "step": 10250, + "token_acc": 0.9165417291354323, + "train_speed(iter/s)": 0.13828 + }, + { + "epoch": 0.9678180445451113, + "grad_norm": 0.22608503699302673, + "learning_rate": 5.6578925860028e-07, + "loss": 0.20667457580566406, + "memory(GiB)": 91.64, + "step": 10255, + "token_acc": 0.9164754474529601, + "train_speed(iter/s)": 0.138279 + }, + { + "epoch": 0.9682899207248018, + "grad_norm": 0.21197392046451569, + "learning_rate": 5.493339373435657e-07, + "loss": 0.20172109603881835, + "memory(GiB)": 91.64, + "step": 10260, + "token_acc": 0.919234360410831, + "train_speed(iter/s)": 0.138277 + }, + { + "epoch": 0.9687617969044923, + "grad_norm": 0.4548327624797821, + "learning_rate": 5.331207934297422e-07, + "loss": 0.20822181701660156, + "memory(GiB)": 91.64, + "step": 10265, + "token_acc": 0.93, + "train_speed(iter/s)": 0.138276 + }, + { + "epoch": 0.9692336730841827, + "grad_norm": 0.2541370689868927, + "learning_rate": 5.171498663402718e-07, + "loss": 0.2125007390975952, + "memory(GiB)": 91.64, + "step": 10270, + "token_acc": 0.9267723880597015, + "train_speed(iter/s)": 0.138276 + }, + { + "epoch": 0.9697055492638732, + "grad_norm": 0.2773514986038208, + "learning_rate": 5.014211949667446e-07, + "loss": 0.20577967166900635, + "memory(GiB)": 91.64, + "step": 10275, + "token_acc": 0.9172289698605488, + "train_speed(iter/s)": 0.138275 + }, + { + "epoch": 0.9701774254435636, + "grad_norm": 0.2945961654186249, + "learning_rate": 4.859348176108669e-07, + "loss": 0.2093435525894165, + "memory(GiB)": 91.64, + "step": 10280, + "token_acc": 0.9211669770328988, + "train_speed(iter/s)": 0.138274 + }, + { + "epoch": 0.970649301623254, + "grad_norm": 0.43366917967796326, + "learning_rate": 4.7069077198428345e-07, + "loss": 0.2111149787902832, + "memory(GiB)": 91.64, + "step": 10285, + "token_acc": 0.9272727272727272, + "train_speed(iter/s)": 0.138274 + }, + { + "epoch": 0.9711211778029445, + "grad_norm": 0.42656323313713074, + "learning_rate": 4.556890952085446e-07, + "loss": 0.21514317989349366, + "memory(GiB)": 91.64, + "step": 10290, + "token_acc": 0.9248013620885358, + "train_speed(iter/s)": 0.138273 + }, + { + "epoch": 0.971593053982635, + "grad_norm": 0.3890469968318939, + "learning_rate": 4.4092982381499505e-07, + "loss": 0.20530645847320556, + "memory(GiB)": 91.64, + "step": 10295, + "token_acc": 0.9163179916317992, + "train_speed(iter/s)": 0.138271 + }, + { + "epoch": 0.9720649301623254, + "grad_norm": 0.3857133090496063, + "learning_rate": 4.264129937446848e-07, + "loss": 0.2022775650024414, + "memory(GiB)": 91.64, + "step": 10300, + "token_acc": 0.9249260355029586, + "train_speed(iter/s)": 0.13827 + }, + { + "epoch": 0.9725368063420159, + "grad_norm": 0.47042417526245117, + "learning_rate": 4.121386403482586e-07, + "loss": 0.21275124549865723, + "memory(GiB)": 91.64, + "step": 10305, + "token_acc": 0.9202175883952856, + "train_speed(iter/s)": 0.138269 + }, + { + "epoch": 0.9730086825217064, + "grad_norm": 0.3687644898891449, + "learning_rate": 3.981067983859554e-07, + "loss": 0.21237845420837403, + "memory(GiB)": 91.64, + "step": 10310, + "token_acc": 0.920115149334293, + "train_speed(iter/s)": 0.138267 + }, + { + "epoch": 0.9734805587013967, + "grad_norm": 0.1944044530391693, + "learning_rate": 3.8431750202738704e-07, + "loss": 0.2046431064605713, + "memory(GiB)": 91.64, + "step": 10315, + "token_acc": 0.9339069221744232, + "train_speed(iter/s)": 0.138265 + }, + { + "epoch": 0.9739524348810872, + "grad_norm": 0.30766624212265015, + "learning_rate": 3.707707848515707e-07, + "loss": 0.2141507625579834, + "memory(GiB)": 91.64, + "step": 10320, + "token_acc": 0.9316535433070866, + "train_speed(iter/s)": 0.138265 + }, + { + "epoch": 0.9744243110607776, + "grad_norm": 0.3184703290462494, + "learning_rate": 3.5746667984682956e-07, + "loss": 0.21402463912963868, + "memory(GiB)": 91.64, + "step": 10325, + "token_acc": 0.9198218262806236, + "train_speed(iter/s)": 0.138265 + }, + { + "epoch": 0.9748961872404681, + "grad_norm": 0.21421895921230316, + "learning_rate": 3.444052194106262e-07, + "loss": 0.2053246021270752, + "memory(GiB)": 91.64, + "step": 10330, + "token_acc": 0.9339525283797729, + "train_speed(iter/s)": 0.138263 + }, + { + "epoch": 0.9753680634201586, + "grad_norm": 0.3974616825580597, + "learning_rate": 3.3158643534960677e-07, + "loss": 0.21539621353149413, + "memory(GiB)": 91.64, + "step": 10335, + "token_acc": 0.9283607248209018, + "train_speed(iter/s)": 0.138262 + }, + { + "epoch": 0.975839939599849, + "grad_norm": 0.7123221755027771, + "learning_rate": 3.1901035887942356e-07, + "loss": 0.20816359519958497, + "memory(GiB)": 91.64, + "step": 10340, + "token_acc": 0.9127652408218255, + "train_speed(iter/s)": 0.138261 + }, + { + "epoch": 0.9763118157795394, + "grad_norm": 0.26935598254203796, + "learning_rate": 3.066770206247349e-07, + "loss": 0.21007375717163085, + "memory(GiB)": 91.64, + "step": 10345, + "token_acc": 0.9361033519553073, + "train_speed(iter/s)": 0.13826 + }, + { + "epoch": 0.9767836919592299, + "grad_norm": 0.6770565509796143, + "learning_rate": 2.94586450619061e-07, + "loss": 0.21003360748291017, + "memory(GiB)": 91.64, + "step": 10350, + "token_acc": 0.9277614447711046, + "train_speed(iter/s)": 0.138259 + }, + { + "epoch": 0.9772555681389203, + "grad_norm": 0.2321297973394394, + "learning_rate": 2.8273867830477254e-07, + "loss": 0.20995206832885743, + "memory(GiB)": 91.64, + "step": 10355, + "token_acc": 0.9150858175248419, + "train_speed(iter/s)": 0.138257 + }, + { + "epoch": 0.9777274443186108, + "grad_norm": 0.4965348541736603, + "learning_rate": 2.711337325329577e-07, + "loss": 0.21208806037902833, + "memory(GiB)": 91.64, + "step": 10360, + "token_acc": 0.9319755600814664, + "train_speed(iter/s)": 0.138255 + }, + { + "epoch": 0.9781993204983013, + "grad_norm": 0.37109440565109253, + "learning_rate": 2.5977164156343327e-07, + "loss": 0.20955781936645507, + "memory(GiB)": 91.64, + "step": 10365, + "token_acc": 0.933184023889511, + "train_speed(iter/s)": 0.138254 + }, + { + "epoch": 0.9786711966779917, + "grad_norm": 0.401061087846756, + "learning_rate": 2.486524330645779e-07, + "loss": 0.21407065391540528, + "memory(GiB)": 91.64, + "step": 10370, + "token_acc": 0.9242843951985226, + "train_speed(iter/s)": 0.138251 + }, + { + "epoch": 0.9791430728576821, + "grad_norm": 0.7057275176048279, + "learning_rate": 2.3777613411335443e-07, + "loss": 0.20971500873565674, + "memory(GiB)": 91.64, + "step": 10375, + "token_acc": 0.9272334293948127, + "train_speed(iter/s)": 0.13825 + }, + { + "epoch": 0.9796149490373726, + "grad_norm": 0.4812077581882477, + "learning_rate": 2.271427711951768e-07, + "loss": 0.21029391288757324, + "memory(GiB)": 91.64, + "step": 10380, + "token_acc": 0.9272134709397066, + "train_speed(iter/s)": 0.138248 + }, + { + "epoch": 0.980086825217063, + "grad_norm": 0.5195487141609192, + "learning_rate": 2.167523702038876e-07, + "loss": 0.21241250038146972, + "memory(GiB)": 91.64, + "step": 10385, + "token_acc": 0.9222193414570911, + "train_speed(iter/s)": 0.138246 + }, + { + "epoch": 0.9805587013967535, + "grad_norm": 0.2308487743139267, + "learning_rate": 2.0660495644168055e-07, + "loss": 0.20739927291870117, + "memory(GiB)": 91.64, + "step": 10390, + "token_acc": 0.9294685990338164, + "train_speed(iter/s)": 0.138245 + }, + { + "epoch": 0.981030577576444, + "grad_norm": 0.5138281583786011, + "learning_rate": 1.967005546190448e-07, + "loss": 0.21623821258544923, + "memory(GiB)": 91.64, + "step": 10395, + "token_acc": 0.9200546634779638, + "train_speed(iter/s)": 0.138244 + }, + { + "epoch": 0.9815024537561344, + "grad_norm": 0.27799543738365173, + "learning_rate": 1.870391888546652e-07, + "loss": 0.21060950756073, + "memory(GiB)": 91.64, + "step": 10400, + "token_acc": 0.9342560553633218, + "train_speed(iter/s)": 0.138243 + }, + { + "epoch": 0.9819743299358248, + "grad_norm": 0.480560302734375, + "learning_rate": 1.7762088267544442e-07, + "loss": 0.21494081020355224, + "memory(GiB)": 91.64, + "step": 10405, + "token_acc": 0.9232977850697293, + "train_speed(iter/s)": 0.138244 + }, + { + "epoch": 0.9824462061155153, + "grad_norm": 0.24424020946025848, + "learning_rate": 1.6844565901636966e-07, + "loss": 0.2057812452316284, + "memory(GiB)": 91.64, + "step": 10410, + "token_acc": 0.9184561810795124, + "train_speed(iter/s)": 0.138242 + }, + { + "epoch": 0.9829180822952057, + "grad_norm": 0.2741668224334717, + "learning_rate": 1.5951354022047948e-07, + "loss": 0.21181230545043944, + "memory(GiB)": 91.64, + "step": 10415, + "token_acc": 0.925390625, + "train_speed(iter/s)": 0.138242 + }, + { + "epoch": 0.9833899584748962, + "grad_norm": 0.267378032207489, + "learning_rate": 1.508245480388415e-07, + "loss": 0.20557844638824463, + "memory(GiB)": 91.64, + "step": 10420, + "token_acc": 0.9227083998722453, + "train_speed(iter/s)": 0.138241 + }, + { + "epoch": 0.9838618346545867, + "grad_norm": 0.24558934569358826, + "learning_rate": 1.4237870363046358e-07, + "loss": 0.20829353332519532, + "memory(GiB)": 91.64, + "step": 10425, + "token_acc": 0.9241808827511508, + "train_speed(iter/s)": 0.13824 + }, + { + "epoch": 0.9843337108342771, + "grad_norm": 0.26023128628730774, + "learning_rate": 1.3417602756222724e-07, + "loss": 0.20859556198120116, + "memory(GiB)": 91.64, + "step": 10430, + "token_acc": 0.9226679555340744, + "train_speed(iter/s)": 0.138238 + }, + { + "epoch": 0.9848055870139676, + "grad_norm": 0.4792412221431732, + "learning_rate": 1.262165398089099e-07, + "loss": 0.21122725009918214, + "memory(GiB)": 91.64, + "step": 10435, + "token_acc": 0.9246612466124661, + "train_speed(iter/s)": 0.138237 + }, + { + "epoch": 0.9852774631936579, + "grad_norm": 0.2937432825565338, + "learning_rate": 1.1850025975304046e-07, + "loss": 0.20611369609832764, + "memory(GiB)": 91.64, + "step": 10440, + "token_acc": 0.9364161849710982, + "train_speed(iter/s)": 0.138236 + }, + { + "epoch": 0.9857493393733484, + "grad_norm": 0.5766665935516357, + "learning_rate": 1.1102720618493268e-07, + "loss": 0.20731678009033203, + "memory(GiB)": 91.64, + "step": 10445, + "token_acc": 0.9293218720152817, + "train_speed(iter/s)": 0.138236 + }, + { + "epoch": 0.9862212155530389, + "grad_norm": 0.34089088439941406, + "learning_rate": 1.037973973025963e-07, + "loss": 0.2109663963317871, + "memory(GiB)": 91.64, + "step": 10450, + "token_acc": 0.934560327198364, + "train_speed(iter/s)": 0.138234 + }, + { + "epoch": 0.9866930917327293, + "grad_norm": 0.36588117480278015, + "learning_rate": 9.681085071170382e-08, + "loss": 0.20971145629882812, + "memory(GiB)": 91.64, + "step": 10455, + "token_acc": 0.9096446700507614, + "train_speed(iter/s)": 0.138232 + }, + { + "epoch": 0.9871649679124198, + "grad_norm": 0.21309404075145721, + "learning_rate": 9.0067583425546e-08, + "loss": 0.21097216606140137, + "memory(GiB)": 91.64, + "step": 10460, + "token_acc": 0.9385533707865169, + "train_speed(iter/s)": 0.138232 + }, + { + "epoch": 0.9876368440921103, + "grad_norm": 0.24648167192935944, + "learning_rate": 8.356761186499862e-08, + "loss": 0.21074647903442384, + "memory(GiB)": 91.64, + "step": 10465, + "token_acc": 0.9346246973365617, + "train_speed(iter/s)": 0.138231 + }, + { + "epoch": 0.9881087202718006, + "grad_norm": 0.43389177322387695, + "learning_rate": 7.731095185846693e-08, + "loss": 0.20950679779052733, + "memory(GiB)": 91.64, + "step": 10470, + "token_acc": 0.9093137254901961, + "train_speed(iter/s)": 0.13823 + }, + { + "epoch": 0.9885805964514911, + "grad_norm": 0.4236377477645874, + "learning_rate": 7.129761864185236e-08, + "loss": 0.21015970706939696, + "memory(GiB)": 91.64, + "step": 10475, + "token_acc": 0.923942153186931, + "train_speed(iter/s)": 0.138229 + }, + { + "epoch": 0.9890524726311816, + "grad_norm": 0.39386889338493347, + "learning_rate": 6.552762685854141e-08, + "loss": 0.21105661392211914, + "memory(GiB)": 91.64, + "step": 10480, + "token_acc": 0.9331564986737401, + "train_speed(iter/s)": 0.138228 + }, + { + "epoch": 0.989524348810872, + "grad_norm": 0.24592046439647675, + "learning_rate": 6.000099055932795e-08, + "loss": 0.21296677589416504, + "memory(GiB)": 91.64, + "step": 10485, + "token_acc": 0.9173738276016079, + "train_speed(iter/s)": 0.138227 + }, + { + "epoch": 0.9899962249905625, + "grad_norm": 0.3638093173503876, + "learning_rate": 5.471772320240209e-08, + "loss": 0.20483295917510985, + "memory(GiB)": 91.64, + "step": 10490, + "token_acc": 0.9202294056308655, + "train_speed(iter/s)": 0.138229 + }, + { + "epoch": 0.990468101170253, + "grad_norm": 0.20603786408901215, + "learning_rate": 4.9677837653316904e-08, + "loss": 0.20644736289978027, + "memory(GiB)": 91.64, + "step": 10495, + "token_acc": 0.9233983286908078, + "train_speed(iter/s)": 0.138228 + }, + { + "epoch": 0.9909399773499433, + "grad_norm": 0.6620780229568481, + "learning_rate": 4.4881346184943994e-08, + "loss": 0.2090602159500122, + "memory(GiB)": 91.64, + "step": 10500, + "token_acc": 0.9377893518518519, + "train_speed(iter/s)": 0.138227 + }, + { + "epoch": 0.9914118535296338, + "grad_norm": 0.48382601141929626, + "learning_rate": 4.032826047747351e-08, + "loss": 0.20607643127441405, + "memory(GiB)": 91.64, + "step": 10505, + "token_acc": 0.9327878433664524, + "train_speed(iter/s)": 0.138226 + }, + { + "epoch": 0.9918837297093243, + "grad_norm": 0.304166704416275, + "learning_rate": 3.601859161834753e-08, + "loss": 0.20815153121948243, + "memory(GiB)": 91.64, + "step": 10510, + "token_acc": 0.9228538283062645, + "train_speed(iter/s)": 0.138224 + }, + { + "epoch": 0.9923556058890147, + "grad_norm": 0.25076785683631897, + "learning_rate": 3.195235010226005e-08, + "loss": 0.20654242038726806, + "memory(GiB)": 91.64, + "step": 10515, + "token_acc": 0.9157088122605364, + "train_speed(iter/s)": 0.138223 + }, + { + "epoch": 0.9928274820687052, + "grad_norm": 0.27873122692108154, + "learning_rate": 2.8129545831112604e-08, + "loss": 0.21101999282836914, + "memory(GiB)": 91.64, + "step": 10520, + "token_acc": 0.9243013795542978, + "train_speed(iter/s)": 0.138222 + }, + { + "epoch": 0.9932993582483957, + "grad_norm": 0.3841204047203064, + "learning_rate": 2.455018811403642e-08, + "loss": 0.20729808807373046, + "memory(GiB)": 91.64, + "step": 10525, + "token_acc": 0.9227557411273486, + "train_speed(iter/s)": 0.138221 + }, + { + "epoch": 0.9937712344280861, + "grad_norm": 0.5184294581413269, + "learning_rate": 2.121428566727035e-08, + "loss": 0.21497209072113038, + "memory(GiB)": 91.64, + "step": 10530, + "token_acc": 0.9206409767264403, + "train_speed(iter/s)": 0.13822 + }, + { + "epoch": 0.9942431106077765, + "grad_norm": 0.2996355891227722, + "learning_rate": 1.8121846614260752e-08, + "loss": 0.20966591835021972, + "memory(GiB)": 91.64, + "step": 10535, + "token_acc": 0.9370962257735465, + "train_speed(iter/s)": 0.138218 + }, + { + "epoch": 0.994714986787467, + "grad_norm": 0.5140137076377869, + "learning_rate": 1.5272878485561582e-08, + "loss": 0.21541638374328614, + "memory(GiB)": 91.64, + "step": 10540, + "token_acc": 0.9206145966709347, + "train_speed(iter/s)": 0.138217 + }, + { + "epoch": 0.9951868629671574, + "grad_norm": 0.3749140501022339, + "learning_rate": 1.2667388218834398e-08, + "loss": 0.2071479320526123, + "memory(GiB)": 91.64, + "step": 10545, + "token_acc": 0.9281074058033781, + "train_speed(iter/s)": 0.138215 + }, + { + "epoch": 0.9956587391468479, + "grad_norm": 0.23596055805683136, + "learning_rate": 1.0305382158848353e-08, + "loss": 0.2132049322128296, + "memory(GiB)": 91.64, + "step": 10550, + "token_acc": 0.9148795776971297, + "train_speed(iter/s)": 0.138214 + }, + { + "epoch": 0.9961306153265383, + "grad_norm": 0.5079225301742554, + "learning_rate": 8.186866057435793e-09, + "loss": 0.2057969093322754, + "memory(GiB)": 91.64, + "step": 10555, + "token_acc": 0.9209541627689429, + "train_speed(iter/s)": 0.138213 + }, + { + "epoch": 0.9966024915062288, + "grad_norm": 0.2810031771659851, + "learning_rate": 6.311845073492251e-09, + "loss": 0.2084169864654541, + "memory(GiB)": 91.64, + "step": 10560, + "token_acc": 0.9232763089683774, + "train_speed(iter/s)": 0.138213 + }, + { + "epoch": 0.9970743676859192, + "grad_norm": 0.6900485157966614, + "learning_rate": 4.680323772998651e-09, + "loss": 0.2122826337814331, + "memory(GiB)": 91.64, + "step": 10565, + "token_acc": 0.9240121580547113, + "train_speed(iter/s)": 0.138211 + }, + { + "epoch": 0.9975462438656096, + "grad_norm": 0.22658671438694, + "learning_rate": 3.2923061289324987e-09, + "loss": 0.2080448627471924, + "memory(GiB)": 91.64, + "step": 10570, + "token_acc": 0.9372056514913658, + "train_speed(iter/s)": 0.138209 + }, + { + "epoch": 0.9980181200453001, + "grad_norm": 0.20944856107234955, + "learning_rate": 2.1477955213455857e-09, + "loss": 0.2094266891479492, + "memory(GiB)": 91.64, + "step": 10575, + "token_acc": 0.9141078838174274, + "train_speed(iter/s)": 0.138208 + }, + { + "epoch": 0.9984899962249906, + "grad_norm": 0.336431086063385, + "learning_rate": 1.2467947372751808e-09, + "loss": 0.21478500366210937, + "memory(GiB)": 91.64, + "step": 10580, + "token_acc": 0.9218197879858657, + "train_speed(iter/s)": 0.138207 + }, + { + "epoch": 0.998961872404681, + "grad_norm": 0.6330317854881287, + "learning_rate": 5.893059708106385e-10, + "loss": 0.21838765144348143, + "memory(GiB)": 91.64, + "step": 10585, + "token_acc": 0.9174628450106157, + "train_speed(iter/s)": 0.138207 + }, + { + "epoch": 0.9994337485843715, + "grad_norm": 0.22374173998832703, + "learning_rate": 1.7533082302678695e-10, + "loss": 0.20774707794189454, + "memory(GiB)": 91.64, + "step": 10590, + "token_acc": 0.9141304347826087, + "train_speed(iter/s)": 0.138207 + }, + { + "epoch": 0.9999056247640619, + "grad_norm": 0.23590397834777832, + "learning_rate": 4.870302028336937e-12, + "loss": 0.21752536296844482, + "memory(GiB)": 91.64, + "step": 10595, + "token_acc": 0.9202557200538358, + "train_speed(iter/s)": 0.138204 + }, + { + "epoch": 1.0, + "eval_loss": 0.23324698209762573, + "eval_runtime": 3.6087, + "eval_samples_per_second": 27.711, + "eval_steps_per_second": 0.554, + "eval_token_acc": 0.9133481698944079, + "step": 10596 + } + ], + "logging_steps": 5, + "max_steps": 10596, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1111111111, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.6446246652105405e+20, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}