{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 2562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00117096018735363, "grad_norm": 4.268844127655029, "learning_rate": 1.5503875968992249e-07, "loss": 0.6057141423225403, "memory(GiB)": 127.52, "step": 1, "token_acc": 0.8403535163595337, "train_speed(iter/s)": 0.023362 }, { "epoch": 0.00585480093676815, "grad_norm": 4.00634765625, "learning_rate": 7.751937984496125e-07, "loss": 0.5714304447174072, "memory(GiB)": 127.52, "step": 5, "token_acc": 0.8414377559265664, "train_speed(iter/s)": 0.058165 }, { "epoch": 0.0117096018735363, "grad_norm": 3.714452028274536, "learning_rate": 1.550387596899225e-06, "loss": 0.5679570198059082, "memory(GiB)": 127.52, "step": 10, "token_acc": 0.8314816958517272, "train_speed(iter/s)": 0.072408 }, { "epoch": 0.01756440281030445, "grad_norm": 2.5877742767333984, "learning_rate": 2.3255813953488376e-06, "loss": 0.5387242317199707, "memory(GiB)": 127.52, "step": 15, "token_acc": 0.8362197181678389, "train_speed(iter/s)": 0.080114 }, { "epoch": 0.0234192037470726, "grad_norm": 1.0245263576507568, "learning_rate": 3.10077519379845e-06, "loss": 0.4778164863586426, "memory(GiB)": 127.52, "step": 20, "token_acc": 0.8409289456094262, "train_speed(iter/s)": 0.083991 }, { "epoch": 0.02927400468384075, "grad_norm": 1.0654064416885376, "learning_rate": 3.875968992248063e-06, "loss": 0.45667400360107424, "memory(GiB)": 127.52, "step": 25, "token_acc": 0.8518281248542258, "train_speed(iter/s)": 0.086554 }, { "epoch": 0.0351288056206089, "grad_norm": 0.5691505670547485, "learning_rate": 4.651162790697675e-06, "loss": 0.44004316329956056, "memory(GiB)": 127.52, "step": 30, "token_acc": 0.8520554823322664, "train_speed(iter/s)": 0.088726 }, { "epoch": 0.040983606557377046, "grad_norm": 0.5251653790473938, "learning_rate": 5.4263565891472865e-06, "loss": 0.43890109062194826, "memory(GiB)": 127.52, "step": 35, "token_acc": 0.8498815333197345, "train_speed(iter/s)": 0.090273 }, { "epoch": 0.0468384074941452, "grad_norm": 0.4052143096923828, "learning_rate": 6.2015503875969e-06, "loss": 0.41214742660522463, "memory(GiB)": 127.52, "step": 40, "token_acc": 0.8529612170691973, "train_speed(iter/s)": 0.091488 }, { "epoch": 0.05269320843091335, "grad_norm": 0.3396666944026947, "learning_rate": 6.976744186046513e-06, "loss": 0.423629093170166, "memory(GiB)": 127.52, "step": 45, "token_acc": 0.845838888731289, "train_speed(iter/s)": 0.092386 }, { "epoch": 0.0585480093676815, "grad_norm": 0.3074694573879242, "learning_rate": 7.751937984496126e-06, "loss": 0.41414508819580076, "memory(GiB)": 127.52, "step": 50, "token_acc": 0.8609199657045593, "train_speed(iter/s)": 0.0932 }, { "epoch": 0.06440281030444965, "grad_norm": 0.31701743602752686, "learning_rate": 8.527131782945736e-06, "loss": 0.4058098793029785, "memory(GiB)": 127.52, "step": 55, "token_acc": 0.8598621225118498, "train_speed(iter/s)": 0.093922 }, { "epoch": 0.0702576112412178, "grad_norm": 0.29167020320892334, "learning_rate": 9.30232558139535e-06, "loss": 0.42685737609863283, "memory(GiB)": 127.52, "step": 60, "token_acc": 0.8583035383662712, "train_speed(iter/s)": 0.09448 }, { "epoch": 0.07611241217798595, "grad_norm": 0.2796083092689514, "learning_rate": 1.0077519379844963e-05, "loss": 0.4080663681030273, "memory(GiB)": 127.52, "step": 65, "token_acc": 0.86975264356343, "train_speed(iter/s)": 0.095012 }, { "epoch": 0.08196721311475409, "grad_norm": 0.32925185561180115, "learning_rate": 1.0852713178294573e-05, "loss": 0.4072235584259033, "memory(GiB)": 127.52, "step": 70, "token_acc": 0.8576062541566801, "train_speed(iter/s)": 0.095364 }, { "epoch": 0.08782201405152225, "grad_norm": 0.29692476987838745, "learning_rate": 1.1627906976744187e-05, "loss": 0.4062563419342041, "memory(GiB)": 127.52, "step": 75, "token_acc": 0.8451938495195714, "train_speed(iter/s)": 0.0958 }, { "epoch": 0.0936768149882904, "grad_norm": 0.32430365681648254, "learning_rate": 1.24031007751938e-05, "loss": 0.4052871227264404, "memory(GiB)": 127.52, "step": 80, "token_acc": 0.8536078219242759, "train_speed(iter/s)": 0.096071 }, { "epoch": 0.09953161592505855, "grad_norm": 0.2918962240219116, "learning_rate": 1.3178294573643412e-05, "loss": 0.39542815685272215, "memory(GiB)": 127.52, "step": 85, "token_acc": 0.8758206774505389, "train_speed(iter/s)": 0.096477 }, { "epoch": 0.1053864168618267, "grad_norm": 0.30198103189468384, "learning_rate": 1.3953488372093025e-05, "loss": 0.4015383243560791, "memory(GiB)": 127.52, "step": 90, "token_acc": 0.8578144099246164, "train_speed(iter/s)": 0.096833 }, { "epoch": 0.11124121779859485, "grad_norm": 0.32643797993659973, "learning_rate": 1.4728682170542636e-05, "loss": 0.401915454864502, "memory(GiB)": 127.52, "step": 95, "token_acc": 0.8631059302340187, "train_speed(iter/s)": 0.097122 }, { "epoch": 0.117096018735363, "grad_norm": 0.3097076416015625, "learning_rate": 1.550387596899225e-05, "loss": 0.4027417182922363, "memory(GiB)": 127.52, "step": 100, "token_acc": 0.8636492034198335, "train_speed(iter/s)": 0.097418 }, { "epoch": 0.12295081967213115, "grad_norm": 0.28134772181510925, "learning_rate": 1.6279069767441862e-05, "loss": 0.39868090152740476, "memory(GiB)": 127.52, "step": 105, "token_acc": 0.8655913809126278, "train_speed(iter/s)": 0.0977 }, { "epoch": 0.1288056206088993, "grad_norm": 0.2668236196041107, "learning_rate": 1.7054263565891473e-05, "loss": 0.38587536811828616, "memory(GiB)": 127.52, "step": 110, "token_acc": 0.864132983946116, "train_speed(iter/s)": 0.09794 }, { "epoch": 0.13466042154566746, "grad_norm": 0.3235706686973572, "learning_rate": 1.7829457364341087e-05, "loss": 0.40470218658447266, "memory(GiB)": 127.52, "step": 115, "token_acc": 0.8720630828529737, "train_speed(iter/s)": 0.098141 }, { "epoch": 0.1405152224824356, "grad_norm": 0.2895485460758209, "learning_rate": 1.86046511627907e-05, "loss": 0.39603259563446047, "memory(GiB)": 127.52, "step": 120, "token_acc": 0.8639678736880146, "train_speed(iter/s)": 0.09831 }, { "epoch": 0.14637002341920374, "grad_norm": 0.3098626434803009, "learning_rate": 1.937984496124031e-05, "loss": 0.4097726821899414, "memory(GiB)": 127.52, "step": 125, "token_acc": 0.8581558732162836, "train_speed(iter/s)": 0.098474 }, { "epoch": 0.1522248243559719, "grad_norm": 0.35938969254493713, "learning_rate": 1.9999991663467044e-05, "loss": 0.4081538200378418, "memory(GiB)": 127.52, "step": 130, "token_acc": 0.8630013632327376, "train_speed(iter/s)": 0.098587 }, { "epoch": 0.15807962529274006, "grad_norm": 0.3397412896156311, "learning_rate": 1.9999699886272926e-05, "loss": 0.40991506576538084, "memory(GiB)": 127.52, "step": 135, "token_acc": 0.8502879675585575, "train_speed(iter/s)": 0.098664 }, { "epoch": 0.16393442622950818, "grad_norm": 0.32449835538864136, "learning_rate": 1.9998991296330317e-05, "loss": 0.40630359649658204, "memory(GiB)": 127.52, "step": 140, "token_acc": 0.8630894085796805, "train_speed(iter/s)": 0.098721 }, { "epoch": 0.16978922716627634, "grad_norm": 0.32687216997146606, "learning_rate": 1.9997865923175027e-05, "loss": 0.396761417388916, "memory(GiB)": 127.52, "step": 145, "token_acc": 0.8691767868585987, "train_speed(iter/s)": 0.09874 }, { "epoch": 0.1756440281030445, "grad_norm": 0.32365313172340393, "learning_rate": 1.999632381371545e-05, "loss": 0.40283679962158203, "memory(GiB)": 127.52, "step": 150, "token_acc": 0.8533993606842608, "train_speed(iter/s)": 0.09881 }, { "epoch": 0.18149882903981265, "grad_norm": 0.3086594343185425, "learning_rate": 1.999436503223061e-05, "loss": 0.4014937400817871, "memory(GiB)": 127.52, "step": 155, "token_acc": 0.8624249503342012, "train_speed(iter/s)": 0.098838 }, { "epoch": 0.1873536299765808, "grad_norm": 0.32935866713523865, "learning_rate": 1.9991989660367463e-05, "loss": 0.4079470634460449, "memory(GiB)": 127.52, "step": 160, "token_acc": 0.8470391967320465, "train_speed(iter/s)": 0.098876 }, { "epoch": 0.19320843091334894, "grad_norm": 0.27776622772216797, "learning_rate": 1.998919779713751e-05, "loss": 0.4115422248840332, "memory(GiB)": 127.52, "step": 165, "token_acc": 0.8531520964716057, "train_speed(iter/s)": 0.098931 }, { "epoch": 0.1990632318501171, "grad_norm": 0.28459489345550537, "learning_rate": 1.998598955891266e-05, "loss": 0.4005699634552002, "memory(GiB)": 127.52, "step": 170, "token_acc": 0.867363933744935, "train_speed(iter/s)": 0.099005 }, { "epoch": 0.20491803278688525, "grad_norm": 0.3174498379230499, "learning_rate": 1.9982365079420382e-05, "loss": 0.38856048583984376, "memory(GiB)": 127.52, "step": 175, "token_acc": 0.8610733940638768, "train_speed(iter/s)": 0.099065 }, { "epoch": 0.2107728337236534, "grad_norm": 0.30468112230300903, "learning_rate": 1.9978324509738147e-05, "loss": 0.392287540435791, "memory(GiB)": 127.52, "step": 180, "token_acc": 0.8659642567171478, "train_speed(iter/s)": 0.099132 }, { "epoch": 0.21662763466042154, "grad_norm": 0.31203576922416687, "learning_rate": 1.9973868018287093e-05, "loss": 0.3912659168243408, "memory(GiB)": 127.52, "step": 185, "token_acc": 0.8592000200480526, "train_speed(iter/s)": 0.099222 }, { "epoch": 0.2224824355971897, "grad_norm": 0.2872975766658783, "learning_rate": 1.9968995790825048e-05, "loss": 0.3968376159667969, "memory(GiB)": 127.52, "step": 190, "token_acc": 0.8492010693857249, "train_speed(iter/s)": 0.099228 }, { "epoch": 0.22833723653395785, "grad_norm": 0.3107975721359253, "learning_rate": 1.9963708030438754e-05, "loss": 0.39564805030822753, "memory(GiB)": 127.52, "step": 195, "token_acc": 0.8623048224402377, "train_speed(iter/s)": 0.099285 }, { "epoch": 0.234192037470726, "grad_norm": 0.33172452449798584, "learning_rate": 1.995800495753542e-05, "loss": 0.3955163240432739, "memory(GiB)": 127.52, "step": 200, "token_acc": 0.8543361827625122, "train_speed(iter/s)": 0.099318 }, { "epoch": 0.24004683840749413, "grad_norm": 0.4809193015098572, "learning_rate": 1.9951886809833537e-05, "loss": 0.40662593841552735, "memory(GiB)": 127.52, "step": 205, "token_acc": 0.8530674732086181, "train_speed(iter/s)": 0.099376 }, { "epoch": 0.2459016393442623, "grad_norm": 0.3544229567050934, "learning_rate": 1.9945353842352943e-05, "loss": 0.4021385669708252, "memory(GiB)": 127.52, "step": 210, "token_acc": 0.8561705450570045, "train_speed(iter/s)": 0.099425 }, { "epoch": 0.25175644028103045, "grad_norm": 0.336126446723938, "learning_rate": 1.9938406327404233e-05, "loss": 0.3979261159896851, "memory(GiB)": 127.52, "step": 215, "token_acc": 0.8645368893679286, "train_speed(iter/s)": 0.099503 }, { "epoch": 0.2576112412177986, "grad_norm": 0.33789604902267456, "learning_rate": 1.9931044554577373e-05, "loss": 0.3947408676147461, "memory(GiB)": 127.52, "step": 220, "token_acc": 0.8581383757515342, "train_speed(iter/s)": 0.099556 }, { "epoch": 0.26346604215456676, "grad_norm": 0.3256719708442688, "learning_rate": 1.992326883072965e-05, "loss": 0.39812633991241453, "memory(GiB)": 127.52, "step": 225, "token_acc": 0.8538002738372856, "train_speed(iter/s)": 0.099561 }, { "epoch": 0.2693208430913349, "grad_norm": 0.29769811034202576, "learning_rate": 1.991507947997287e-05, "loss": 0.40686187744140623, "memory(GiB)": 127.52, "step": 230, "token_acc": 0.8601537153116829, "train_speed(iter/s)": 0.099609 }, { "epoch": 0.275175644028103, "grad_norm": 0.30855706334114075, "learning_rate": 1.9906476843659866e-05, "loss": 0.40198640823364257, "memory(GiB)": 127.52, "step": 235, "token_acc": 0.8681018040834193, "train_speed(iter/s)": 0.099643 }, { "epoch": 0.2810304449648712, "grad_norm": 0.38956841826438904, "learning_rate": 1.989746128037024e-05, "loss": 0.3874382972717285, "memory(GiB)": 127.52, "step": 240, "token_acc": 0.8601923167422234, "train_speed(iter/s)": 0.099684 }, { "epoch": 0.28688524590163933, "grad_norm": 0.317061185836792, "learning_rate": 1.988803316589545e-05, "loss": 0.396057653427124, "memory(GiB)": 127.52, "step": 245, "token_acc": 0.8594824803587602, "train_speed(iter/s)": 0.099766 }, { "epoch": 0.2927400468384075, "grad_norm": 0.31615447998046875, "learning_rate": 1.987819289322311e-05, "loss": 0.39992465972900393, "memory(GiB)": 127.52, "step": 250, "token_acc": 0.858279346005983, "train_speed(iter/s)": 0.099854 }, { "epoch": 0.29859484777517564, "grad_norm": 0.32358142733573914, "learning_rate": 1.9867940872520646e-05, "loss": 0.40424213409423826, "memory(GiB)": 127.52, "step": 255, "token_acc": 0.8581384084126314, "train_speed(iter/s)": 0.09983 }, { "epoch": 0.3044496487119438, "grad_norm": 0.289928138256073, "learning_rate": 1.9857277531118173e-05, "loss": 0.3975801706314087, "memory(GiB)": 127.52, "step": 260, "token_acc": 0.8720583892069197, "train_speed(iter/s)": 0.099856 }, { "epoch": 0.31030444964871196, "grad_norm": 0.2990163266658783, "learning_rate": 1.9846203313490697e-05, "loss": 0.38855001926422117, "memory(GiB)": 127.52, "step": 265, "token_acc": 0.8751479791620219, "train_speed(iter/s)": 0.099904 }, { "epoch": 0.3161592505854801, "grad_norm": 0.3375948369503021, "learning_rate": 1.983471868123958e-05, "loss": 0.3869392156600952, "memory(GiB)": 127.52, "step": 270, "token_acc": 0.8583391727600954, "train_speed(iter/s)": 0.099986 }, { "epoch": 0.32201405152224827, "grad_norm": 0.31450051069259644, "learning_rate": 1.98228241130733e-05, "loss": 0.4127011775970459, "memory(GiB)": 127.52, "step": 275, "token_acc": 0.8624973560772896, "train_speed(iter/s)": 0.100024 }, { "epoch": 0.32786885245901637, "grad_norm": 0.30610159039497375, "learning_rate": 1.98105201047875e-05, "loss": 0.38500449657440183, "memory(GiB)": 127.52, "step": 280, "token_acc": 0.8676562826677817, "train_speed(iter/s)": 0.1001 }, { "epoch": 0.3337236533957845, "grad_norm": 0.29564493894577026, "learning_rate": 1.9797807169244326e-05, "loss": 0.39098482131958007, "memory(GiB)": 127.52, "step": 285, "token_acc": 0.8600835808177637, "train_speed(iter/s)": 0.100123 }, { "epoch": 0.3395784543325527, "grad_norm": 0.2966287136077881, "learning_rate": 1.9784685836351045e-05, "loss": 0.40611705780029295, "memory(GiB)": 127.52, "step": 290, "token_acc": 0.85560257646949, "train_speed(iter/s)": 0.100118 }, { "epoch": 0.34543325526932084, "grad_norm": 0.3238191604614258, "learning_rate": 1.9771156653037944e-05, "loss": 0.3969024419784546, "memory(GiB)": 127.52, "step": 295, "token_acc": 0.8581954258818798, "train_speed(iter/s)": 0.100158 }, { "epoch": 0.351288056206089, "grad_norm": 0.27766069769859314, "learning_rate": 1.975722018323556e-05, "loss": 0.38973977565765383, "memory(GiB)": 127.52, "step": 300, "token_acc": 0.8660634024604128, "train_speed(iter/s)": 0.100143 }, { "epoch": 0.35714285714285715, "grad_norm": 0.30145326256752014, "learning_rate": 1.974287700785116e-05, "loss": 0.3852071285247803, "memory(GiB)": 127.52, "step": 305, "token_acc": 0.8624855074734434, "train_speed(iter/s)": 0.100192 }, { "epoch": 0.3629976580796253, "grad_norm": 0.3129558563232422, "learning_rate": 1.9728127724744516e-05, "loss": 0.3764306306838989, "memory(GiB)": 127.52, "step": 310, "token_acc": 0.8621159494397087, "train_speed(iter/s)": 0.100188 }, { "epoch": 0.36885245901639346, "grad_norm": 0.28354689478874207, "learning_rate": 1.9712972948703006e-05, "loss": 0.4006787300109863, "memory(GiB)": 127.52, "step": 315, "token_acc": 0.8685145789802604, "train_speed(iter/s)": 0.100213 }, { "epoch": 0.3747072599531616, "grad_norm": 0.32204070687294006, "learning_rate": 1.9697413311415967e-05, "loss": 0.3947436332702637, "memory(GiB)": 127.52, "step": 320, "token_acc": 0.840712523808037, "train_speed(iter/s)": 0.100233 }, { "epoch": 0.3805620608899297, "grad_norm": 0.2838529944419861, "learning_rate": 1.9681449461448386e-05, "loss": 0.3909641981124878, "memory(GiB)": 127.52, "step": 325, "token_acc": 0.8644274332135604, "train_speed(iter/s)": 0.10024 }, { "epoch": 0.3864168618266979, "grad_norm": 0.2927788197994232, "learning_rate": 1.9665082064213856e-05, "loss": 0.3943678140640259, "memory(GiB)": 127.52, "step": 330, "token_acc": 0.8593159978638758, "train_speed(iter/s)": 0.100274 }, { "epoch": 0.39227166276346603, "grad_norm": 0.28758853673934937, "learning_rate": 1.9648311801946823e-05, "loss": 0.39302983283996584, "memory(GiB)": 127.52, "step": 335, "token_acc": 0.8576617952773522, "train_speed(iter/s)": 0.10031 }, { "epoch": 0.3981264637002342, "grad_norm": 0.32002732157707214, "learning_rate": 1.9631139373674188e-05, "loss": 0.3899127721786499, "memory(GiB)": 127.52, "step": 340, "token_acc": 0.859130068814327, "train_speed(iter/s)": 0.100326 }, { "epoch": 0.40398126463700235, "grad_norm": 0.29767319560050964, "learning_rate": 1.9613565495186126e-05, "loss": 0.38013973236083987, "memory(GiB)": 127.52, "step": 345, "token_acc": 0.8582271352459535, "train_speed(iter/s)": 0.100383 }, { "epoch": 0.4098360655737705, "grad_norm": 0.30334916710853577, "learning_rate": 1.9595590899006288e-05, "loss": 0.3990506649017334, "memory(GiB)": 127.52, "step": 350, "token_acc": 0.8646594498490017, "train_speed(iter/s)": 0.100383 }, { "epoch": 0.41569086651053866, "grad_norm": 0.27606984972953796, "learning_rate": 1.957721633436124e-05, "loss": 0.39636931419372556, "memory(GiB)": 127.52, "step": 355, "token_acc": 0.8610379971059329, "train_speed(iter/s)": 0.100374 }, { "epoch": 0.4215456674473068, "grad_norm": 0.2963041067123413, "learning_rate": 1.9558442567149244e-05, "loss": 0.3938555955886841, "memory(GiB)": 127.52, "step": 360, "token_acc": 0.8734277076877441, "train_speed(iter/s)": 0.100411 }, { "epoch": 0.4274004683840749, "grad_norm": 0.3044081926345825, "learning_rate": 1.953927037990834e-05, "loss": 0.4011641502380371, "memory(GiB)": 127.52, "step": 365, "token_acc": 0.8496909477706446, "train_speed(iter/s)": 0.100429 }, { "epoch": 0.4332552693208431, "grad_norm": 0.3151879906654358, "learning_rate": 1.9519700571783718e-05, "loss": 0.40146493911743164, "memory(GiB)": 127.52, "step": 370, "token_acc": 0.8655695668198701, "train_speed(iter/s)": 0.100402 }, { "epoch": 0.43911007025761123, "grad_norm": 0.30802202224731445, "learning_rate": 1.9499733958494405e-05, "loss": 0.3972816467285156, "memory(GiB)": 127.52, "step": 375, "token_acc": 0.8541561335505496, "train_speed(iter/s)": 0.100424 }, { "epoch": 0.4449648711943794, "grad_norm": 0.2896055579185486, "learning_rate": 1.947937137229928e-05, "loss": 0.39000208377838136, "memory(GiB)": 127.52, "step": 380, "token_acc": 0.8715701816495711, "train_speed(iter/s)": 0.10044 }, { "epoch": 0.45081967213114754, "grad_norm": 0.3016491234302521, "learning_rate": 1.9458613661962366e-05, "loss": 0.3910162687301636, "memory(GiB)": 127.52, "step": 385, "token_acc": 0.8739838931744026, "train_speed(iter/s)": 0.100469 }, { "epoch": 0.4566744730679157, "grad_norm": 0.29643046855926514, "learning_rate": 1.943746169271746e-05, "loss": 0.39229693412780764, "memory(GiB)": 127.52, "step": 390, "token_acc": 0.8722126097825781, "train_speed(iter/s)": 0.100449 }, { "epoch": 0.46252927400468385, "grad_norm": 0.27366167306900024, "learning_rate": 1.941591634623206e-05, "loss": 0.39676542282104493, "memory(GiB)": 127.52, "step": 395, "token_acc": 0.8644101402067695, "train_speed(iter/s)": 0.100471 }, { "epoch": 0.468384074941452, "grad_norm": 0.2772040069103241, "learning_rate": 1.9393978520570638e-05, "loss": 0.38228650093078614, "memory(GiB)": 127.52, "step": 400, "token_acc": 0.8660634050880627, "train_speed(iter/s)": 0.100525 }, { "epoch": 0.47423887587822017, "grad_norm": 0.27195385098457336, "learning_rate": 1.9371649130157166e-05, "loss": 0.3779789209365845, "memory(GiB)": 127.52, "step": 405, "token_acc": 0.8644070452060074, "train_speed(iter/s)": 0.100537 }, { "epoch": 0.48009367681498827, "grad_norm": 0.3120705783367157, "learning_rate": 1.9348929105737044e-05, "loss": 0.3843944549560547, "memory(GiB)": 127.52, "step": 410, "token_acc": 0.8640640315662635, "train_speed(iter/s)": 0.100541 }, { "epoch": 0.4859484777517564, "grad_norm": 0.30002740025520325, "learning_rate": 1.932581939433827e-05, "loss": 0.3987558841705322, "memory(GiB)": 127.52, "step": 415, "token_acc": 0.8650914968394279, "train_speed(iter/s)": 0.100546 }, { "epoch": 0.4918032786885246, "grad_norm": 0.2787948250770569, "learning_rate": 1.9302320959231997e-05, "loss": 0.3887160778045654, "memory(GiB)": 127.52, "step": 420, "token_acc": 0.8633874480548741, "train_speed(iter/s)": 0.100566 }, { "epoch": 0.49765807962529274, "grad_norm": 0.30231156945228577, "learning_rate": 1.927843477989234e-05, "loss": 0.38535680770874026, "memory(GiB)": 127.52, "step": 425, "token_acc": 0.8781958006354674, "train_speed(iter/s)": 0.100582 }, { "epoch": 0.5035128805620609, "grad_norm": 0.43067944049835205, "learning_rate": 1.9254161851955587e-05, "loss": 0.3992464065551758, "memory(GiB)": 127.52, "step": 430, "token_acc": 0.8681868917427511, "train_speed(iter/s)": 0.100614 }, { "epoch": 0.509367681498829, "grad_norm": 0.31797730922698975, "learning_rate": 1.9229503187178694e-05, "loss": 0.3914906978607178, "memory(GiB)": 127.52, "step": 435, "token_acc": 0.8623976908030916, "train_speed(iter/s)": 0.100629 }, { "epoch": 0.5152224824355972, "grad_norm": 0.3029649555683136, "learning_rate": 1.920445981339708e-05, "loss": 0.3909397840499878, "memory(GiB)": 127.52, "step": 440, "token_acc": 0.8603624171988666, "train_speed(iter/s)": 0.10065 }, { "epoch": 0.5210772833723654, "grad_norm": 0.30808401107788086, "learning_rate": 1.9179032774481822e-05, "loss": 0.38848447799682617, "memory(GiB)": 127.52, "step": 445, "token_acc": 0.8688334300638422, "train_speed(iter/s)": 0.10068 }, { "epoch": 0.5269320843091335, "grad_norm": 0.30352672934532166, "learning_rate": 1.9153223130296125e-05, "loss": 0.38553576469421386, "memory(GiB)": 127.52, "step": 450, "token_acc": 0.871061226654355, "train_speed(iter/s)": 0.100707 }, { "epoch": 0.5327868852459017, "grad_norm": 0.30111393332481384, "learning_rate": 1.9127031956651153e-05, "loss": 0.38896827697753905, "memory(GiB)": 127.52, "step": 455, "token_acc": 0.868666861524493, "train_speed(iter/s)": 0.10072 }, { "epoch": 0.5386416861826698, "grad_norm": 0.3043946325778961, "learning_rate": 1.9100460345261175e-05, "loss": 0.4031389236450195, "memory(GiB)": 127.52, "step": 460, "token_acc": 0.8602805306930444, "train_speed(iter/s)": 0.10069 }, { "epoch": 0.544496487119438, "grad_norm": 0.3046748638153076, "learning_rate": 1.9073509403698062e-05, "loss": 0.3981820821762085, "memory(GiB)": 127.52, "step": 465, "token_acc": 0.8679260633787171, "train_speed(iter/s)": 0.100702 }, { "epoch": 0.550351288056206, "grad_norm": 0.31403180956840515, "learning_rate": 1.9046180255345142e-05, "loss": 0.3932758569717407, "memory(GiB)": 127.52, "step": 470, "token_acc": 0.8679127068807225, "train_speed(iter/s)": 0.10072 }, { "epoch": 0.5562060889929742, "grad_norm": 0.29715070128440857, "learning_rate": 1.9018474039350342e-05, "loss": 0.3857383966445923, "memory(GiB)": 127.52, "step": 475, "token_acc": 0.8670612150699786, "train_speed(iter/s)": 0.100729 }, { "epoch": 0.5620608899297423, "grad_norm": 0.3304217755794525, "learning_rate": 1.899039191057872e-05, "loss": 0.3876671075820923, "memory(GiB)": 127.52, "step": 480, "token_acc": 0.8610883356974732, "train_speed(iter/s)": 0.100734 }, { "epoch": 0.5679156908665105, "grad_norm": 0.28700098395347595, "learning_rate": 1.8961935039564338e-05, "loss": 0.3859807252883911, "memory(GiB)": 127.52, "step": 485, "token_acc": 0.861040389753261, "train_speed(iter/s)": 0.100756 }, { "epoch": 0.5737704918032787, "grad_norm": 0.30889761447906494, "learning_rate": 1.8933104612461454e-05, "loss": 0.3886594772338867, "memory(GiB)": 127.52, "step": 490, "token_acc": 0.862199389425299, "train_speed(iter/s)": 0.100764 }, { "epoch": 0.5796252927400468, "grad_norm": 0.30002301931381226, "learning_rate": 1.8903901830995093e-05, "loss": 0.3925405740737915, "memory(GiB)": 127.52, "step": 495, "token_acc": 0.8591249033461787, "train_speed(iter/s)": 0.10076 }, { "epoch": 0.585480093676815, "grad_norm": 0.28031232953071594, "learning_rate": 1.8874327912410945e-05, "loss": 0.40421361923217775, "memory(GiB)": 127.52, "step": 500, "token_acc": 0.8617515420490447, "train_speed(iter/s)": 0.100788 }, { "epoch": 0.5913348946135831, "grad_norm": 0.27785587310791016, "learning_rate": 1.884438408942463e-05, "loss": 0.39117045402526857, "memory(GiB)": 127.52, "step": 505, "token_acc": 0.8509159982582465, "train_speed(iter/s)": 0.100792 }, { "epoch": 0.5971896955503513, "grad_norm": 0.26203179359436035, "learning_rate": 1.881407161017033e-05, "loss": 0.3850869655609131, "memory(GiB)": 127.52, "step": 510, "token_acc": 0.871426780341023, "train_speed(iter/s)": 0.100813 }, { "epoch": 0.6030444964871194, "grad_norm": 0.2775160074234009, "learning_rate": 1.8783391738148738e-05, "loss": 0.38030352592468264, "memory(GiB)": 127.52, "step": 515, "token_acc": 0.865779336694748, "train_speed(iter/s)": 0.100836 }, { "epoch": 0.6088992974238876, "grad_norm": 0.283777117729187, "learning_rate": 1.875234575217441e-05, "loss": 0.38051447868347166, "memory(GiB)": 127.52, "step": 520, "token_acc": 0.8643710911880905, "train_speed(iter/s)": 0.100855 }, { "epoch": 0.6147540983606558, "grad_norm": 0.2693696618080139, "learning_rate": 1.8720934946322466e-05, "loss": 0.3941120862960815, "memory(GiB)": 127.52, "step": 525, "token_acc": 0.8575597963261037, "train_speed(iter/s)": 0.10087 }, { "epoch": 0.6206088992974239, "grad_norm": 0.2502153515815735, "learning_rate": 1.8689160629874622e-05, "loss": 0.36350240707397463, "memory(GiB)": 127.52, "step": 530, "token_acc": 0.8788319745551232, "train_speed(iter/s)": 0.100858 }, { "epoch": 0.6264637002341921, "grad_norm": 0.2630903124809265, "learning_rate": 1.865702412726465e-05, "loss": 0.3757188081741333, "memory(GiB)": 127.52, "step": 535, "token_acc": 0.8759880681391801, "train_speed(iter/s)": 0.100883 }, { "epoch": 0.6323185011709602, "grad_norm": 0.2726694941520691, "learning_rate": 1.8624526778023142e-05, "loss": 0.3769080638885498, "memory(GiB)": 127.52, "step": 540, "token_acc": 0.8733085553248108, "train_speed(iter/s)": 0.100896 }, { "epoch": 0.6381733021077284, "grad_norm": 0.2886805832386017, "learning_rate": 1.85916699367217e-05, "loss": 0.3801791429519653, "memory(GiB)": 127.52, "step": 545, "token_acc": 0.8658838767809878, "train_speed(iter/s)": 0.100897 }, { "epoch": 0.6440281030444965, "grad_norm": 0.28697773814201355, "learning_rate": 1.855845497291646e-05, "loss": 0.3925698041915894, "memory(GiB)": 127.52, "step": 550, "token_acc": 0.8631926701668678, "train_speed(iter/s)": 0.100906 }, { "epoch": 0.6498829039812647, "grad_norm": 0.26602187752723694, "learning_rate": 1.8524883271091004e-05, "loss": 0.38099260330200196, "memory(GiB)": 127.52, "step": 555, "token_acc": 0.8710958004218123, "train_speed(iter/s)": 0.100905 }, { "epoch": 0.6557377049180327, "grad_norm": 0.2533867359161377, "learning_rate": 1.8490956230598668e-05, "loss": 0.3997593879699707, "memory(GiB)": 127.52, "step": 560, "token_acc": 0.8649844205573561, "train_speed(iter/s)": 0.100903 }, { "epoch": 0.6615925058548009, "grad_norm": 0.287895530462265, "learning_rate": 1.8456675265604183e-05, "loss": 0.3792722702026367, "memory(GiB)": 127.52, "step": 565, "token_acc": 0.8638586429067867, "train_speed(iter/s)": 0.100923 }, { "epoch": 0.667447306791569, "grad_norm": 0.30773329734802246, "learning_rate": 1.842204180502476e-05, "loss": 0.3829328536987305, "memory(GiB)": 127.52, "step": 570, "token_acc": 0.8727389815600163, "train_speed(iter/s)": 0.100938 }, { "epoch": 0.6733021077283372, "grad_norm": 0.30301594734191895, "learning_rate": 1.8387057292470517e-05, "loss": 0.39844498634338377, "memory(GiB)": 127.52, "step": 575, "token_acc": 0.8632732480308832, "train_speed(iter/s)": 0.100939 }, { "epoch": 0.6791569086651054, "grad_norm": 0.27384889125823975, "learning_rate": 1.8351723186184295e-05, "loss": 0.3866116523742676, "memory(GiB)": 127.52, "step": 580, "token_acc": 0.8537265892945595, "train_speed(iter/s)": 0.100945 }, { "epoch": 0.6850117096018735, "grad_norm": 0.300459086894989, "learning_rate": 1.8316040958980896e-05, "loss": 0.3856982707977295, "memory(GiB)": 127.52, "step": 585, "token_acc": 0.8774584957729205, "train_speed(iter/s)": 0.100955 }, { "epoch": 0.6908665105386417, "grad_norm": 0.32351046800613403, "learning_rate": 1.828001209818567e-05, "loss": 0.403375244140625, "memory(GiB)": 127.52, "step": 590, "token_acc": 0.8606907256499806, "train_speed(iter/s)": 0.100969 }, { "epoch": 0.6967213114754098, "grad_norm": 0.3171491324901581, "learning_rate": 1.8243638105572547e-05, "loss": 0.3851677656173706, "memory(GiB)": 127.52, "step": 595, "token_acc": 0.8713710233181722, "train_speed(iter/s)": 0.100978 }, { "epoch": 0.702576112412178, "grad_norm": 0.3137357532978058, "learning_rate": 1.82069204973014e-05, "loss": 0.3799635648727417, "memory(GiB)": 127.52, "step": 600, "token_acc": 0.8784900280426953, "train_speed(iter/s)": 0.101006 }, { "epoch": 0.7084309133489461, "grad_norm": 0.28434112668037415, "learning_rate": 1.816986080385489e-05, "loss": 0.40052270889282227, "memory(GiB)": 127.52, "step": 605, "token_acc": 0.8462195284773476, "train_speed(iter/s)": 0.101006 }, { "epoch": 0.7142857142857143, "grad_norm": 0.30604925751686096, "learning_rate": 1.813246056997465e-05, "loss": 0.3835596084594727, "memory(GiB)": 127.52, "step": 610, "token_acc": 0.8614169593452318, "train_speed(iter/s)": 0.101011 }, { "epoch": 0.7201405152224825, "grad_norm": 0.3114904463291168, "learning_rate": 1.809472135459688e-05, "loss": 0.38530282974243163, "memory(GiB)": 127.52, "step": 615, "token_acc": 0.8642289288270977, "train_speed(iter/s)": 0.101016 }, { "epoch": 0.7259953161592506, "grad_norm": 0.29733744263648987, "learning_rate": 1.8056644730787412e-05, "loss": 0.39410853385925293, "memory(GiB)": 127.52, "step": 620, "token_acc": 0.8700788764122717, "train_speed(iter/s)": 0.101043 }, { "epoch": 0.7318501170960188, "grad_norm": 0.28432950377464294, "learning_rate": 1.8018232285676092e-05, "loss": 0.3745533227920532, "memory(GiB)": 127.52, "step": 625, "token_acc": 0.8656255611667859, "train_speed(iter/s)": 0.101068 }, { "epoch": 0.7377049180327869, "grad_norm": 0.2615796625614166, "learning_rate": 1.797948562039066e-05, "loss": 0.3919194459915161, "memory(GiB)": 127.52, "step": 630, "token_acc": 0.8600643002591344, "train_speed(iter/s)": 0.101046 }, { "epoch": 0.7435597189695551, "grad_norm": 0.27267464995384216, "learning_rate": 1.7940406349989987e-05, "loss": 0.388127875328064, "memory(GiB)": 127.52, "step": 635, "token_acc": 0.8630637748223948, "train_speed(iter/s)": 0.10107 }, { "epoch": 0.7494145199063232, "grad_norm": 0.274472177028656, "learning_rate": 1.7900996103396772e-05, "loss": 0.38143386840820315, "memory(GiB)": 127.52, "step": 640, "token_acc": 0.8701312848988129, "train_speed(iter/s)": 0.101069 }, { "epoch": 0.7552693208430913, "grad_norm": 0.27030906081199646, "learning_rate": 1.7861256523329634e-05, "loss": 0.3786378145217896, "memory(GiB)": 127.52, "step": 645, "token_acc": 0.8602489884842826, "train_speed(iter/s)": 0.101063 }, { "epoch": 0.7611241217798594, "grad_norm": 0.2663189172744751, "learning_rate": 1.7821189266234647e-05, "loss": 0.38404848575592043, "memory(GiB)": 127.52, "step": 650, "token_acc": 0.8616431608743905, "train_speed(iter/s)": 0.10106 }, { "epoch": 0.7669789227166276, "grad_norm": 0.26061564683914185, "learning_rate": 1.7780796002216285e-05, "loss": 0.3781083822250366, "memory(GiB)": 127.52, "step": 655, "token_acc": 0.8578937981658266, "train_speed(iter/s)": 0.101068 }, { "epoch": 0.7728337236533958, "grad_norm": 0.2600330412387848, "learning_rate": 1.7740078414967817e-05, "loss": 0.3852128505706787, "memory(GiB)": 127.52, "step": 660, "token_acc": 0.872952104972653, "train_speed(iter/s)": 0.101073 }, { "epoch": 0.7786885245901639, "grad_norm": 0.27133384346961975, "learning_rate": 1.7699038201701132e-05, "loss": 0.37737174034118653, "memory(GiB)": 127.52, "step": 665, "token_acc": 0.8593767976691324, "train_speed(iter/s)": 0.101088 }, { "epoch": 0.7845433255269321, "grad_norm": 0.270047664642334, "learning_rate": 1.7657677073075968e-05, "loss": 0.38488593101501467, "memory(GiB)": 127.52, "step": 670, "token_acc": 0.8627122177041754, "train_speed(iter/s)": 0.101091 }, { "epoch": 0.7903981264637002, "grad_norm": 0.29772108793258667, "learning_rate": 1.761599675312864e-05, "loss": 0.3877399444580078, "memory(GiB)": 127.52, "step": 675, "token_acc": 0.8765810968128602, "train_speed(iter/s)": 0.101091 }, { "epoch": 0.7962529274004684, "grad_norm": 0.30914777517318726, "learning_rate": 1.7573998979200163e-05, "loss": 0.38101863861083984, "memory(GiB)": 127.52, "step": 680, "token_acc": 0.8670370510587819, "train_speed(iter/s)": 0.101106 }, { "epoch": 0.8021077283372365, "grad_norm": 0.24654199182987213, "learning_rate": 1.753168550186383e-05, "loss": 0.3897979259490967, "memory(GiB)": 127.52, "step": 685, "token_acc": 0.8695668499228697, "train_speed(iter/s)": 0.101113 }, { "epoch": 0.8079625292740047, "grad_norm": 0.268245667219162, "learning_rate": 1.7489058084852247e-05, "loss": 0.3852191686630249, "memory(GiB)": 127.52, "step": 690, "token_acc": 0.8590092968475919, "train_speed(iter/s)": 0.101108 }, { "epoch": 0.8138173302107728, "grad_norm": 0.2539999186992645, "learning_rate": 1.744611850498383e-05, "loss": 0.38076086044311525, "memory(GiB)": 127.52, "step": 695, "token_acc": 0.8692958838741554, "train_speed(iter/s)": 0.101093 }, { "epoch": 0.819672131147541, "grad_norm": 0.30060875415802, "learning_rate": 1.7402868552088724e-05, "loss": 0.37528285980224607, "memory(GiB)": 127.52, "step": 700, "token_acc": 0.863746098668577, "train_speed(iter/s)": 0.101099 }, { "epoch": 0.8255269320843092, "grad_norm": 0.2880835235118866, "learning_rate": 1.73593100289342e-05, "loss": 0.3839045286178589, "memory(GiB)": 127.52, "step": 705, "token_acc": 0.8606477737869129, "train_speed(iter/s)": 0.101117 }, { "epoch": 0.8313817330210773, "grad_norm": 0.27465176582336426, "learning_rate": 1.7315444751149533e-05, "loss": 0.38219666481018066, "memory(GiB)": 127.52, "step": 710, "token_acc": 0.866171235481518, "train_speed(iter/s)": 0.101137 }, { "epoch": 0.8372365339578455, "grad_norm": 0.2839786410331726, "learning_rate": 1.727127454715029e-05, "loss": 0.3815479755401611, "memory(GiB)": 127.52, "step": 715, "token_acc": 0.8742821134330966, "train_speed(iter/s)": 0.101149 }, { "epoch": 0.8430913348946136, "grad_norm": 0.31399768590927124, "learning_rate": 1.722680125806214e-05, "loss": 0.38201520442962644, "memory(GiB)": 127.52, "step": 720, "token_acc": 0.8587188600974719, "train_speed(iter/s)": 0.101155 }, { "epoch": 0.8489461358313818, "grad_norm": 0.3099398910999298, "learning_rate": 1.71820267376441e-05, "loss": 0.386704421043396, "memory(GiB)": 127.52, "step": 725, "token_acc": 0.8638798635493387, "train_speed(iter/s)": 0.101166 }, { "epoch": 0.8548009367681498, "grad_norm": 0.2707797884941101, "learning_rate": 1.7136952852211274e-05, "loss": 0.3908542156219482, "memory(GiB)": 127.52, "step": 730, "token_acc": 0.8531080479659894, "train_speed(iter/s)": 0.10118 }, { "epoch": 0.860655737704918, "grad_norm": 0.24912209808826447, "learning_rate": 1.7091581480557057e-05, "loss": 0.3775820732116699, "memory(GiB)": 127.52, "step": 735, "token_acc": 0.8631545113262953, "train_speed(iter/s)": 0.101187 }, { "epoch": 0.8665105386416861, "grad_norm": 0.2668187916278839, "learning_rate": 1.7045914513874815e-05, "loss": 0.39071335792541506, "memory(GiB)": 127.52, "step": 740, "token_acc": 0.863421279036421, "train_speed(iter/s)": 0.101213 }, { "epoch": 0.8723653395784543, "grad_norm": 0.24733468890190125, "learning_rate": 1.699995385567907e-05, "loss": 0.39272005558013917, "memory(GiB)": 127.52, "step": 745, "token_acc": 0.8545664531712299, "train_speed(iter/s)": 0.101244 }, { "epoch": 0.8782201405152225, "grad_norm": 0.2632930278778076, "learning_rate": 1.695370142172614e-05, "loss": 0.3845970630645752, "memory(GiB)": 127.52, "step": 750, "token_acc": 0.8612419217474074, "train_speed(iter/s)": 0.101242 }, { "epoch": 0.8840749414519906, "grad_norm": 0.26514074206352234, "learning_rate": 1.690715913993429e-05, "loss": 0.38790068626403806, "memory(GiB)": 127.52, "step": 755, "token_acc": 0.8648871034856036, "train_speed(iter/s)": 0.101244 }, { "epoch": 0.8899297423887588, "grad_norm": 0.26957836747169495, "learning_rate": 1.6860328950303392e-05, "loss": 0.36716523170471194, "memory(GiB)": 127.52, "step": 760, "token_acc": 0.8711639836976192, "train_speed(iter/s)": 0.101257 }, { "epoch": 0.8957845433255269, "grad_norm": 0.2675636410713196, "learning_rate": 1.6813212804834033e-05, "loss": 0.38340959548950193, "memory(GiB)": 127.52, "step": 765, "token_acc": 0.8579816582165225, "train_speed(iter/s)": 0.101264 }, { "epoch": 0.9016393442622951, "grad_norm": 0.26134225726127625, "learning_rate": 1.676581266744615e-05, "loss": 0.3752238988876343, "memory(GiB)": 127.52, "step": 770, "token_acc": 0.8638096187142661, "train_speed(iter/s)": 0.101274 }, { "epoch": 0.9074941451990632, "grad_norm": 0.2766994535923004, "learning_rate": 1.6718130513897207e-05, "loss": 0.37386231422424315, "memory(GiB)": 127.52, "step": 775, "token_acc": 0.8692816207520612, "train_speed(iter/s)": 0.10128 }, { "epoch": 0.9133489461358314, "grad_norm": 0.2736496329307556, "learning_rate": 1.667016833169979e-05, "loss": 0.3910179138183594, "memory(GiB)": 127.52, "step": 780, "token_acc": 0.8679116603442695, "train_speed(iter/s)": 0.101285 }, { "epoch": 0.9192037470725996, "grad_norm": 0.25334644317626953, "learning_rate": 1.6621928120038806e-05, "loss": 0.3837088346481323, "memory(GiB)": 127.52, "step": 785, "token_acc": 0.8568342264714894, "train_speed(iter/s)": 0.101285 }, { "epoch": 0.9250585480093677, "grad_norm": 0.2526282072067261, "learning_rate": 1.657341188968811e-05, "loss": 0.3741894721984863, "memory(GiB)": 127.52, "step": 790, "token_acc": 0.8600209680781232, "train_speed(iter/s)": 0.101298 }, { "epoch": 0.9309133489461359, "grad_norm": 0.2629476487636566, "learning_rate": 1.6524621662926733e-05, "loss": 0.3736875057220459, "memory(GiB)": 127.52, "step": 795, "token_acc": 0.8765449927636102, "train_speed(iter/s)": 0.101311 }, { "epoch": 0.936768149882904, "grad_norm": 0.26536864042282104, "learning_rate": 1.6475559473454558e-05, "loss": 0.3841824769973755, "memory(GiB)": 127.52, "step": 800, "token_acc": 0.8732290436835891, "train_speed(iter/s)": 0.101317 }, { "epoch": 0.9426229508196722, "grad_norm": 0.9267993569374084, "learning_rate": 1.6426227366307563e-05, "loss": 0.3876027584075928, "memory(GiB)": 127.52, "step": 805, "token_acc": 0.873662949476559, "train_speed(iter/s)": 0.10131 }, { "epoch": 0.9484777517564403, "grad_norm": 0.31513214111328125, "learning_rate": 1.6376627397772576e-05, "loss": 0.38577656745910643, "memory(GiB)": 127.52, "step": 810, "token_acc": 0.8582883611177872, "train_speed(iter/s)": 0.101308 }, { "epoch": 0.9543325526932084, "grad_norm": 0.43881845474243164, "learning_rate": 1.6326761635301572e-05, "loss": 0.3793084383010864, "memory(GiB)": 127.52, "step": 815, "token_acc": 0.8658072630089608, "train_speed(iter/s)": 0.101317 }, { "epoch": 0.9601873536299765, "grad_norm": 0.2627616822719574, "learning_rate": 1.6276632157425475e-05, "loss": 0.3868673801422119, "memory(GiB)": 127.52, "step": 820, "token_acc": 0.8609059346385673, "train_speed(iter/s)": 0.101319 }, { "epoch": 0.9660421545667447, "grad_norm": 0.28743499517440796, "learning_rate": 1.6226241053667536e-05, "loss": 0.39165661334991453, "memory(GiB)": 127.52, "step": 825, "token_acc": 0.8566733687511922, "train_speed(iter/s)": 0.101328 }, { "epoch": 0.9718969555035128, "grad_norm": 0.2647303640842438, "learning_rate": 1.617559042445625e-05, "loss": 0.3914238929748535, "memory(GiB)": 127.52, "step": 830, "token_acc": 0.8686447332876824, "train_speed(iter/s)": 0.101331 }, { "epoch": 0.977751756440281, "grad_norm": 0.28214219212532043, "learning_rate": 1.6124682381037767e-05, "loss": 0.3775761127471924, "memory(GiB)": 127.52, "step": 835, "token_acc": 0.8658163872414246, "train_speed(iter/s)": 0.101335 }, { "epoch": 0.9836065573770492, "grad_norm": 0.2978610694408417, "learning_rate": 1.607351904538792e-05, "loss": 0.39282917976379395, "memory(GiB)": 127.52, "step": 840, "token_acc": 0.866700342369647, "train_speed(iter/s)": 0.101325 }, { "epoch": 0.9894613583138173, "grad_norm": 0.2674395740032196, "learning_rate": 1.6022102550123775e-05, "loss": 0.3796736240386963, "memory(GiB)": 127.52, "step": 845, "token_acc": 0.8609281823639752, "train_speed(iter/s)": 0.101326 }, { "epoch": 0.9953161592505855, "grad_norm": 0.2766255736351013, "learning_rate": 1.597043503841471e-05, "loss": 0.3800792217254639, "memory(GiB)": 127.52, "step": 850, "token_acc": 0.8745568192822268, "train_speed(iter/s)": 0.101325 }, { "epoch": 1.0011709601873535, "grad_norm": 0.36053553223609924, "learning_rate": 1.5918518663893124e-05, "loss": 0.3734774589538574, "memory(GiB)": 127.52, "step": 855, "token_acc": 0.8709147849019284, "train_speed(iter/s)": 0.100996 }, { "epoch": 1.0070257611241218, "grad_norm": 0.29508745670318604, "learning_rate": 1.5866355590564637e-05, "loss": 0.3578346252441406, "memory(GiB)": 127.52, "step": 860, "token_acc": 0.8851065028386151, "train_speed(iter/s)": 0.100988 }, { "epoch": 1.0128805620608898, "grad_norm": 0.30008167028427124, "learning_rate": 1.5813947992717894e-05, "loss": 0.34525480270385744, "memory(GiB)": 127.52, "step": 865, "token_acc": 0.8753548176879359, "train_speed(iter/s)": 0.10098 }, { "epoch": 1.018735362997658, "grad_norm": 0.2938152253627777, "learning_rate": 1.5761298054833947e-05, "loss": 0.3546164035797119, "memory(GiB)": 127.52, "step": 870, "token_acc": 0.8762193571592467, "train_speed(iter/s)": 0.100965 }, { "epoch": 1.0245901639344261, "grad_norm": 0.27178069949150085, "learning_rate": 1.5708407971495195e-05, "loss": 0.3612537384033203, "memory(GiB)": 127.52, "step": 875, "token_acc": 0.8722169198754557, "train_speed(iter/s)": 0.100976 }, { "epoch": 1.0304449648711944, "grad_norm": 0.2759335935115814, "learning_rate": 1.565527994729389e-05, "loss": 0.3513669967651367, "memory(GiB)": 127.52, "step": 880, "token_acc": 0.8818436745370559, "train_speed(iter/s)": 0.100984 }, { "epoch": 1.0362997658079625, "grad_norm": 0.2735261917114258, "learning_rate": 1.5601916196740283e-05, "loss": 0.3473806858062744, "memory(GiB)": 127.52, "step": 885, "token_acc": 0.8784491835740441, "train_speed(iter/s)": 0.100979 }, { "epoch": 1.0421545667447307, "grad_norm": 0.28892189264297485, "learning_rate": 1.5548318944170276e-05, "loss": 0.3433929443359375, "memory(GiB)": 127.52, "step": 890, "token_acc": 0.8839334112478968, "train_speed(iter/s)": 0.100971 }, { "epoch": 1.0480093676814988, "grad_norm": 0.2602222263813019, "learning_rate": 1.5494490423652732e-05, "loss": 0.3427423000335693, "memory(GiB)": 127.52, "step": 895, "token_acc": 0.876471048390882, "train_speed(iter/s)": 0.100951 }, { "epoch": 1.053864168618267, "grad_norm": 0.2913144528865814, "learning_rate": 1.544043287889635e-05, "loss": 0.3336780071258545, "memory(GiB)": 127.52, "step": 900, "token_acc": 0.8869567959634185, "train_speed(iter/s)": 0.10095 }, { "epoch": 1.059718969555035, "grad_norm": 0.2634846270084381, "learning_rate": 1.538614856315614e-05, "loss": 0.3489675998687744, "memory(GiB)": 127.52, "step": 905, "token_acc": 0.8832413903915163, "train_speed(iter/s)": 0.100958 }, { "epoch": 1.0655737704918034, "grad_norm": 0.2699672281742096, "learning_rate": 1.5331639739139477e-05, "loss": 0.3432894229888916, "memory(GiB)": 127.52, "step": 910, "token_acc": 0.8669136816431162, "train_speed(iter/s)": 0.100951 }, { "epoch": 1.0714285714285714, "grad_norm": 0.2946908175945282, "learning_rate": 1.5276908678911837e-05, "loss": 0.3399630546569824, "memory(GiB)": 127.52, "step": 915, "token_acc": 0.8821736748390632, "train_speed(iter/s)": 0.100953 }, { "epoch": 1.0772833723653397, "grad_norm": 0.31119436025619507, "learning_rate": 1.5221957663802043e-05, "loss": 0.3506146430969238, "memory(GiB)": 127.52, "step": 920, "token_acc": 0.8818868935608091, "train_speed(iter/s)": 0.100935 }, { "epoch": 1.0831381733021077, "grad_norm": 0.27400681376457214, "learning_rate": 1.5166788984307204e-05, "loss": 0.35775036811828614, "memory(GiB)": 127.52, "step": 925, "token_acc": 0.8750959445346218, "train_speed(iter/s)": 0.100931 }, { "epoch": 1.088992974238876, "grad_norm": 0.3916493058204651, "learning_rate": 1.5111404939997227e-05, "loss": 0.3546015739440918, "memory(GiB)": 127.52, "step": 930, "token_acc": 0.8738711676022755, "train_speed(iter/s)": 0.100933 }, { "epoch": 1.094847775175644, "grad_norm": 0.3681865930557251, "learning_rate": 1.5055807839418966e-05, "loss": 0.33371834754943847, "memory(GiB)": 127.52, "step": 935, "token_acc": 0.8814006570111667, "train_speed(iter/s)": 0.100931 }, { "epoch": 1.100702576112412, "grad_norm": 0.27416518330574036, "learning_rate": 1.5000000000000002e-05, "loss": 0.3561122417449951, "memory(GiB)": 127.52, "step": 940, "token_acc": 0.8838524966358717, "train_speed(iter/s)": 0.100932 }, { "epoch": 1.1065573770491803, "grad_norm": 0.2653830349445343, "learning_rate": 1.494398374795204e-05, "loss": 0.3430471897125244, "memory(GiB)": 127.52, "step": 945, "token_acc": 0.8739330062998951, "train_speed(iter/s)": 0.100924 }, { "epoch": 1.1124121779859484, "grad_norm": 0.29074740409851074, "learning_rate": 1.4887761418173947e-05, "loss": 0.36190090179443357, "memory(GiB)": 127.52, "step": 950, "token_acc": 0.8833006769910948, "train_speed(iter/s)": 0.100919 }, { "epoch": 1.1182669789227166, "grad_norm": 0.2751435339450836, "learning_rate": 1.4831335354154444e-05, "loss": 0.34648761749267576, "memory(GiB)": 127.52, "step": 955, "token_acc": 0.8776634838921327, "train_speed(iter/s)": 0.100926 }, { "epoch": 1.1241217798594847, "grad_norm": 0.2628922164440155, "learning_rate": 1.4774707907874392e-05, "loss": 0.34562859535217283, "memory(GiB)": 127.52, "step": 960, "token_acc": 0.8836736799002247, "train_speed(iter/s)": 0.100911 }, { "epoch": 1.129976580796253, "grad_norm": 0.2639271020889282, "learning_rate": 1.4717881439708786e-05, "loss": 0.34596388339996337, "memory(GiB)": 127.52, "step": 965, "token_acc": 0.8673695686030214, "train_speed(iter/s)": 0.100909 }, { "epoch": 1.135831381733021, "grad_norm": 0.28422874212265015, "learning_rate": 1.4660858318328348e-05, "loss": 0.3498117446899414, "memory(GiB)": 127.52, "step": 970, "token_acc": 0.866499586445358, "train_speed(iter/s)": 0.100888 }, { "epoch": 1.1416861826697893, "grad_norm": 0.2625197470188141, "learning_rate": 1.4603640920600813e-05, "loss": 0.35533895492553713, "memory(GiB)": 127.52, "step": 975, "token_acc": 0.8624783775908141, "train_speed(iter/s)": 0.100863 }, { "epoch": 1.1475409836065573, "grad_norm": 0.2902534008026123, "learning_rate": 1.4546231631491827e-05, "loss": 0.35151519775390627, "memory(GiB)": 127.52, "step": 980, "token_acc": 0.871260222085633, "train_speed(iter/s)": 0.100833 }, { "epoch": 1.1533957845433256, "grad_norm": 0.2525332570075989, "learning_rate": 1.4488632843965573e-05, "loss": 0.3441092729568481, "memory(GiB)": 127.52, "step": 985, "token_acc": 0.8626160602258469, "train_speed(iter/s)": 0.100824 }, { "epoch": 1.1592505854800936, "grad_norm": 0.26731306314468384, "learning_rate": 1.4430846958884995e-05, "loss": 0.3539264678955078, "memory(GiB)": 127.52, "step": 990, "token_acc": 0.8706765643432645, "train_speed(iter/s)": 0.100815 }, { "epoch": 1.165105386416862, "grad_norm": 0.2605798542499542, "learning_rate": 1.4372876384911741e-05, "loss": 0.35328848361968995, "memory(GiB)": 127.52, "step": 995, "token_acc": 0.8729384617783252, "train_speed(iter/s)": 0.100809 }, { "epoch": 1.17096018735363, "grad_norm": 0.2707096338272095, "learning_rate": 1.4314723538405752e-05, "loss": 0.36124861240386963, "memory(GiB)": 127.52, "step": 1000, "token_acc": 0.8623729975690332, "train_speed(iter/s)": 0.100795 }, { "epoch": 1.1768149882903982, "grad_norm": 0.26851606369018555, "learning_rate": 1.4256390843324556e-05, "loss": 0.35548346042633056, "memory(GiB)": 127.52, "step": 1005, "token_acc": 0.868687436031853, "train_speed(iter/s)": 0.100786 }, { "epoch": 1.1826697892271663, "grad_norm": 0.27084365487098694, "learning_rate": 1.4197880731122221e-05, "loss": 0.351657772064209, "memory(GiB)": 127.52, "step": 1010, "token_acc": 0.8682709314201729, "train_speed(iter/s)": 0.100787 }, { "epoch": 1.1885245901639343, "grad_norm": 0.27497202157974243, "learning_rate": 1.4139195640648008e-05, "loss": 0.355600380897522, "memory(GiB)": 127.52, "step": 1015, "token_acc": 0.8803992028496556, "train_speed(iter/s)": 0.10078 }, { "epoch": 1.1943793911007026, "grad_norm": 0.2708893418312073, "learning_rate": 1.4080338018044712e-05, "loss": 0.3596624851226807, "memory(GiB)": 127.52, "step": 1020, "token_acc": 0.8694279635903098, "train_speed(iter/s)": 0.100784 }, { "epoch": 1.2002341920374708, "grad_norm": 0.32129156589508057, "learning_rate": 1.4021310316646708e-05, "loss": 0.3490485668182373, "memory(GiB)": 127.52, "step": 1025, "token_acc": 0.8754893595663521, "train_speed(iter/s)": 0.100766 }, { "epoch": 1.2060889929742389, "grad_norm": 0.25467485189437866, "learning_rate": 1.3962114996877685e-05, "loss": 0.3347738981246948, "memory(GiB)": 127.52, "step": 1030, "token_acc": 0.8824631079656678, "train_speed(iter/s)": 0.100756 }, { "epoch": 1.211943793911007, "grad_norm": 0.2674933671951294, "learning_rate": 1.390275452614808e-05, "loss": 0.338185977935791, "memory(GiB)": 127.52, "step": 1035, "token_acc": 0.8744415325096718, "train_speed(iter/s)": 0.100755 }, { "epoch": 1.2177985948477752, "grad_norm": 0.2707443833351135, "learning_rate": 1.3843231378752252e-05, "loss": 0.3448366165161133, "memory(GiB)": 127.52, "step": 1040, "token_acc": 0.8736029828057016, "train_speed(iter/s)": 0.100747 }, { "epoch": 1.2236533957845432, "grad_norm": 0.24581725895404816, "learning_rate": 1.3783548035765327e-05, "loss": 0.34962687492370603, "memory(GiB)": 127.52, "step": 1045, "token_acc": 0.8796080775037746, "train_speed(iter/s)": 0.100757 }, { "epoch": 1.2295081967213115, "grad_norm": 0.2379993051290512, "learning_rate": 1.3723706984939783e-05, "loss": 0.33640050888061523, "memory(GiB)": 127.52, "step": 1050, "token_acc": 0.8721236366123021, "train_speed(iter/s)": 0.100739 }, { "epoch": 1.2353629976580796, "grad_norm": 0.26605796813964844, "learning_rate": 1.366371072060177e-05, "loss": 0.3490384340286255, "memory(GiB)": 127.52, "step": 1055, "token_acc": 0.862867230488973, "train_speed(iter/s)": 0.100731 }, { "epoch": 1.2412177985948478, "grad_norm": 0.25522705912590027, "learning_rate": 1.3603561743547125e-05, "loss": 0.34296507835388185, "memory(GiB)": 127.52, "step": 1060, "token_acc": 0.8687898169167538, "train_speed(iter/s)": 0.100739 }, { "epoch": 1.2470725995316159, "grad_norm": 0.2729935348033905, "learning_rate": 1.3543262560937135e-05, "loss": 0.34846017360687254, "memory(GiB)": 127.52, "step": 1065, "token_acc": 0.8741769020279135, "train_speed(iter/s)": 0.100744 }, { "epoch": 1.2529274004683841, "grad_norm": 0.2534308433532715, "learning_rate": 1.3482815686194033e-05, "loss": 0.33311474323272705, "memory(GiB)": 127.52, "step": 1070, "token_acc": 0.8795399856245507, "train_speed(iter/s)": 0.100751 }, { "epoch": 1.2587822014051522, "grad_norm": 0.2755572497844696, "learning_rate": 1.3422223638896235e-05, "loss": 0.3432854413986206, "memory(GiB)": 127.52, "step": 1075, "token_acc": 0.8758250682788096, "train_speed(iter/s)": 0.100756 }, { "epoch": 1.2646370023419204, "grad_norm": 0.2861506938934326, "learning_rate": 1.3361488944673315e-05, "loss": 0.3542114496231079, "memory(GiB)": 127.52, "step": 1080, "token_acc": 0.8687981877806241, "train_speed(iter/s)": 0.100759 }, { "epoch": 1.2704918032786885, "grad_norm": 0.3049258589744568, "learning_rate": 1.3300614135100736e-05, "loss": 0.3503614664077759, "memory(GiB)": 127.52, "step": 1085, "token_acc": 0.875489517451949, "train_speed(iter/s)": 0.100754 }, { "epoch": 1.2763466042154565, "grad_norm": 0.25668370723724365, "learning_rate": 1.3239601747594319e-05, "loss": 0.3487658739089966, "memory(GiB)": 127.52, "step": 1090, "token_acc": 0.8770075135561131, "train_speed(iter/s)": 0.100751 }, { "epoch": 1.2822014051522248, "grad_norm": 0.2401314228773117, "learning_rate": 1.3178454325304472e-05, "loss": 0.3507190465927124, "memory(GiB)": 127.52, "step": 1095, "token_acc": 0.8644839657435769, "train_speed(iter/s)": 0.100741 }, { "epoch": 1.288056206088993, "grad_norm": 0.2501038908958435, "learning_rate": 1.3117174417010213e-05, "loss": 0.3356021404266357, "memory(GiB)": 127.52, "step": 1100, "token_acc": 0.8694938440492476, "train_speed(iter/s)": 0.100738 }, { "epoch": 1.2939110070257611, "grad_norm": 0.25629186630249023, "learning_rate": 1.3055764577012892e-05, "loss": 0.3668931007385254, "memory(GiB)": 127.52, "step": 1105, "token_acc": 0.8810234328372201, "train_speed(iter/s)": 0.100745 }, { "epoch": 1.2997658079625292, "grad_norm": 0.2689758539199829, "learning_rate": 1.2994227365029752e-05, "loss": 0.34679102897644043, "memory(GiB)": 127.52, "step": 1110, "token_acc": 0.8783292769097903, "train_speed(iter/s)": 0.100746 }, { "epoch": 1.3056206088992974, "grad_norm": 0.2619406282901764, "learning_rate": 1.2932565346087218e-05, "loss": 0.35414924621582033, "memory(GiB)": 127.52, "step": 1115, "token_acc": 0.8748901150285233, "train_speed(iter/s)": 0.100729 }, { "epoch": 1.3114754098360657, "grad_norm": 0.3210083842277527, "learning_rate": 1.2870781090413991e-05, "loss": 0.35202646255493164, "memory(GiB)": 127.52, "step": 1120, "token_acc": 0.8757856423662141, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.3173302107728337, "grad_norm": 0.27284613251686096, "learning_rate": 1.2808877173333896e-05, "loss": 0.3467656850814819, "memory(GiB)": 127.52, "step": 1125, "token_acc": 0.883265632074048, "train_speed(iter/s)": 0.100724 }, { "epoch": 1.3231850117096018, "grad_norm": 0.2710505425930023, "learning_rate": 1.2746856175158556e-05, "loss": 0.35611112117767335, "memory(GiB)": 127.52, "step": 1130, "token_acc": 0.8756308252586658, "train_speed(iter/s)": 0.100737 }, { "epoch": 1.32903981264637, "grad_norm": 0.26133865118026733, "learning_rate": 1.2684720681079825e-05, "loss": 0.3506006240844727, "memory(GiB)": 127.52, "step": 1135, "token_acc": 0.8604187872166245, "train_speed(iter/s)": 0.100742 }, { "epoch": 1.334894613583138, "grad_norm": 0.27019548416137695, "learning_rate": 1.2622473281062042e-05, "loss": 0.35390684604644773, "memory(GiB)": 127.52, "step": 1140, "token_acc": 0.8757172258949731, "train_speed(iter/s)": 0.100736 }, { "epoch": 1.3407494145199064, "grad_norm": 0.26330387592315674, "learning_rate": 1.256011656973406e-05, "loss": 0.36088995933532714, "memory(GiB)": 127.52, "step": 1145, "token_acc": 0.8777154145240186, "train_speed(iter/s)": 0.100733 }, { "epoch": 1.3466042154566744, "grad_norm": 0.24824829399585724, "learning_rate": 1.2497653146281113e-05, "loss": 0.3501885175704956, "memory(GiB)": 127.52, "step": 1150, "token_acc": 0.8752751123830188, "train_speed(iter/s)": 0.100712 }, { "epoch": 1.3524590163934427, "grad_norm": 0.2536720037460327, "learning_rate": 1.2435085614336459e-05, "loss": 0.3565546989440918, "memory(GiB)": 127.52, "step": 1155, "token_acc": 0.8831354083065811, "train_speed(iter/s)": 0.100705 }, { "epoch": 1.3583138173302107, "grad_norm": 0.24884596467018127, "learning_rate": 1.2372416581872857e-05, "loss": 0.34425859451293944, "memory(GiB)": 127.52, "step": 1160, "token_acc": 0.8804687524440259, "train_speed(iter/s)": 0.100705 }, { "epoch": 1.364168618266979, "grad_norm": 0.2567351162433624, "learning_rate": 1.2309648661093878e-05, "loss": 0.3500640630722046, "memory(GiB)": 127.52, "step": 1165, "token_acc": 0.8808626074837297, "train_speed(iter/s)": 0.100708 }, { "epoch": 1.370023419203747, "grad_norm": 0.27127236127853394, "learning_rate": 1.2246784468324993e-05, "loss": 0.35610170364379884, "memory(GiB)": 127.52, "step": 1170, "token_acc": 0.8642630631304163, "train_speed(iter/s)": 0.100707 }, { "epoch": 1.3758782201405153, "grad_norm": 0.25630801916122437, "learning_rate": 1.218382662390454e-05, "loss": 0.3440692901611328, "memory(GiB)": 127.52, "step": 1175, "token_acc": 0.863847903863763, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.3817330210772834, "grad_norm": 0.2579875886440277, "learning_rate": 1.2120777752074492e-05, "loss": 0.35255093574523927, "memory(GiB)": 127.52, "step": 1180, "token_acc": 0.8730105052212985, "train_speed(iter/s)": 0.100715 }, { "epoch": 1.3875878220140514, "grad_norm": 0.2638234496116638, "learning_rate": 1.2057640480871084e-05, "loss": 0.3546736717224121, "memory(GiB)": 127.52, "step": 1185, "token_acc": 0.8738721335992023, "train_speed(iter/s)": 0.100725 }, { "epoch": 1.3934426229508197, "grad_norm": 0.25871458649635315, "learning_rate": 1.1994417442015243e-05, "loss": 0.35408906936645507, "memory(GiB)": 127.52, "step": 1190, "token_acc": 0.8796952149117578, "train_speed(iter/s)": 0.100732 }, { "epoch": 1.399297423887588, "grad_norm": 0.2632989287376404, "learning_rate": 1.193111127080292e-05, "loss": 0.3432591676712036, "memory(GiB)": 127.52, "step": 1195, "token_acc": 0.8828218086199104, "train_speed(iter/s)": 0.10074 }, { "epoch": 1.405152224824356, "grad_norm": 0.24726183712482452, "learning_rate": 1.186772460599523e-05, "loss": 0.34243590831756593, "memory(GiB)": 127.52, "step": 1200, "token_acc": 0.8815012144480138, "train_speed(iter/s)": 0.100741 }, { "epoch": 1.411007025761124, "grad_norm": 0.3329097032546997, "learning_rate": 1.1804260089708464e-05, "loss": 0.3537503480911255, "memory(GiB)": 127.52, "step": 1205, "token_acc": 0.8658939159898351, "train_speed(iter/s)": 0.100735 }, { "epoch": 1.4168618266978923, "grad_norm": 0.25181666016578674, "learning_rate": 1.1740720367303958e-05, "loss": 0.347446870803833, "memory(GiB)": 127.52, "step": 1210, "token_acc": 0.8740943022953225, "train_speed(iter/s)": 0.10074 }, { "epoch": 1.4227166276346606, "grad_norm": 0.2532757818698883, "learning_rate": 1.1677108087277835e-05, "loss": 0.3539264678955078, "memory(GiB)": 127.52, "step": 1215, "token_acc": 0.8749382353125137, "train_speed(iter/s)": 0.100743 }, { "epoch": 1.4285714285714286, "grad_norm": 0.2551215887069702, "learning_rate": 1.1613425901150595e-05, "loss": 0.35313239097595217, "memory(GiB)": 127.52, "step": 1220, "token_acc": 0.8776082867215627, "train_speed(iter/s)": 0.100745 }, { "epoch": 1.4344262295081966, "grad_norm": 0.2713333070278168, "learning_rate": 1.15496764633566e-05, "loss": 0.3634988307952881, "memory(GiB)": 127.52, "step": 1225, "token_acc": 0.8660714848651069, "train_speed(iter/s)": 0.10073 }, { "epoch": 1.440281030444965, "grad_norm": 0.26022830605506897, "learning_rate": 1.1485862431133445e-05, "loss": 0.3524580478668213, "memory(GiB)": 127.52, "step": 1230, "token_acc": 0.8803166548004755, "train_speed(iter/s)": 0.100717 }, { "epoch": 1.446135831381733, "grad_norm": 0.25171470642089844, "learning_rate": 1.1421986464411169e-05, "loss": 0.3533075571060181, "memory(GiB)": 127.52, "step": 1235, "token_acc": 0.8648047662981438, "train_speed(iter/s)": 0.100703 }, { "epoch": 1.4519906323185012, "grad_norm": 0.2464302033185959, "learning_rate": 1.1358051225701404e-05, "loss": 0.3423281192779541, "memory(GiB)": 127.52, "step": 1240, "token_acc": 0.8691011183611862, "train_speed(iter/s)": 0.100701 }, { "epoch": 1.4578454332552693, "grad_norm": 0.25466638803482056, "learning_rate": 1.1294059379986384e-05, "loss": 0.35201549530029297, "memory(GiB)": 127.52, "step": 1245, "token_acc": 0.8681012341038652, "train_speed(iter/s)": 0.100689 }, { "epoch": 1.4637002341920375, "grad_norm": 0.2576982080936432, "learning_rate": 1.1230013594607874e-05, "loss": 0.3531355857849121, "memory(GiB)": 127.52, "step": 1250, "token_acc": 0.873457880243676, "train_speed(iter/s)": 0.100693 }, { "epoch": 1.4695550351288056, "grad_norm": 0.25660985708236694, "learning_rate": 1.1165916539155968e-05, "loss": 0.35094761848449707, "memory(GiB)": 127.52, "step": 1255, "token_acc": 0.8773934266901257, "train_speed(iter/s)": 0.100701 }, { "epoch": 1.4754098360655736, "grad_norm": 0.24054618179798126, "learning_rate": 1.1101770885357843e-05, "loss": 0.34633212089538573, "memory(GiB)": 127.52, "step": 1260, "token_acc": 0.8775079994840057, "train_speed(iter/s)": 0.100711 }, { "epoch": 1.481264637002342, "grad_norm": 0.2445182204246521, "learning_rate": 1.1037579306966365e-05, "loss": 0.34541456699371337, "memory(GiB)": 127.52, "step": 1265, "token_acc": 0.8862320037137543, "train_speed(iter/s)": 0.100709 }, { "epoch": 1.4871194379391102, "grad_norm": 0.2729050815105438, "learning_rate": 1.0973344479648652e-05, "loss": 0.3409654855728149, "memory(GiB)": 127.52, "step": 1270, "token_acc": 0.8771963474914158, "train_speed(iter/s)": 0.100709 }, { "epoch": 1.4929742388758782, "grad_norm": 0.24874503910541534, "learning_rate": 1.0909069080874556e-05, "loss": 0.3430008411407471, "memory(GiB)": 127.52, "step": 1275, "token_acc": 0.8704117168634027, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.4988290398126463, "grad_norm": 0.2715279459953308, "learning_rate": 1.0844755789805042e-05, "loss": 0.35068159103393554, "memory(GiB)": 127.52, "step": 1280, "token_acc": 0.8675264981305526, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.5046838407494145, "grad_norm": 0.23995369672775269, "learning_rate": 1.0780407287180526e-05, "loss": 0.35523912906646726, "memory(GiB)": 127.52, "step": 1285, "token_acc": 0.8685361997709505, "train_speed(iter/s)": 0.100706 }, { "epoch": 1.5105386416861828, "grad_norm": 0.26195716857910156, "learning_rate": 1.0716026255209124e-05, "loss": 0.349694561958313, "memory(GiB)": 127.52, "step": 1290, "token_acc": 0.8676919971870162, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.5163934426229508, "grad_norm": 0.24379870295524597, "learning_rate": 1.0651615377454872e-05, "loss": 0.3513511657714844, "memory(GiB)": 127.52, "step": 1295, "token_acc": 0.8762717457922776, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.5222482435597189, "grad_norm": 0.2554638683795929, "learning_rate": 1.0587177338725834e-05, "loss": 0.3569997787475586, "memory(GiB)": 127.52, "step": 1300, "token_acc": 0.8766220533416101, "train_speed(iter/s)": 0.100716 }, { "epoch": 1.5281030444964872, "grad_norm": 0.2548043727874756, "learning_rate": 1.0522714824962228e-05, "loss": 0.3422648191452026, "memory(GiB)": 127.52, "step": 1305, "token_acc": 0.8870603034829783, "train_speed(iter/s)": 0.100709 }, { "epoch": 1.5339578454332554, "grad_norm": 0.24967636168003082, "learning_rate": 1.0458230523124443e-05, "loss": 0.3560429573059082, "memory(GiB)": 127.52, "step": 1310, "token_acc": 0.8787232780765522, "train_speed(iter/s)": 0.100701 }, { "epoch": 1.5398126463700235, "grad_norm": 0.2598780691623688, "learning_rate": 1.0393727121081057e-05, "loss": 0.3518627166748047, "memory(GiB)": 127.52, "step": 1315, "token_acc": 0.8750810752945474, "train_speed(iter/s)": 0.10071 }, { "epoch": 1.5456674473067915, "grad_norm": 0.23825575411319733, "learning_rate": 1.0329207307496785e-05, "loss": 0.3401672840118408, "memory(GiB)": 127.52, "step": 1320, "token_acc": 0.8770460187011242, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.5515222482435598, "grad_norm": 0.2550235986709595, "learning_rate": 1.0264673771720429e-05, "loss": 0.350058913230896, "memory(GiB)": 127.52, "step": 1325, "token_acc": 0.881267240867612, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.5573770491803278, "grad_norm": 0.269613653421402, "learning_rate": 1.0200129203672754e-05, "loss": 0.3502191543579102, "memory(GiB)": 127.52, "step": 1330, "token_acc": 0.8661129276756743, "train_speed(iter/s)": 0.10071 }, { "epoch": 1.5632318501170959, "grad_norm": 0.24150115251541138, "learning_rate": 1.0135576293734381e-05, "loss": 0.34059958457946776, "memory(GiB)": 127.52, "step": 1335, "token_acc": 0.8847581210563216, "train_speed(iter/s)": 0.100716 }, { "epoch": 1.5690866510538641, "grad_norm": 0.2703973650932312, "learning_rate": 1.007101773263365e-05, "loss": 0.35358033180236814, "memory(GiB)": 127.52, "step": 1340, "token_acc": 0.8770655404348506, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.5749414519906324, "grad_norm": 0.23871327936649323, "learning_rate": 1.0006456211334445e-05, "loss": 0.3467454671859741, "memory(GiB)": 127.52, "step": 1345, "token_acc": 0.8759395313396612, "train_speed(iter/s)": 0.100731 }, { "epoch": 1.5807962529274004, "grad_norm": 0.25692564249038696, "learning_rate": 9.941894420924044e-06, "loss": 0.3450988054275513, "memory(GiB)": 127.52, "step": 1350, "token_acc": 0.8868195745646664, "train_speed(iter/s)": 0.100727 }, { "epoch": 1.5866510538641685, "grad_norm": 0.2428205907344818, "learning_rate": 9.87733505250094e-06, "loss": 0.3494907855987549, "memory(GiB)": 127.52, "step": 1355, "token_acc": 0.8756121235576668, "train_speed(iter/s)": 0.100725 }, { "epoch": 1.5925058548009368, "grad_norm": 0.24155238270759583, "learning_rate": 9.812780797062678e-06, "loss": 0.3456254005432129, "memory(GiB)": 127.52, "step": 1360, "token_acc": 0.8809245943605768, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.598360655737705, "grad_norm": 0.464139848947525, "learning_rate": 9.748234345393672e-06, "loss": 0.34203310012817384, "memory(GiB)": 127.52, "step": 1365, "token_acc": 0.8774237555421359, "train_speed(iter/s)": 0.100715 }, { "epoch": 1.604215456674473, "grad_norm": 0.2672084867954254, "learning_rate": 9.68369838795306e-06, "loss": 0.350542688369751, "memory(GiB)": 127.52, "step": 1370, "token_acc": 0.8734205080790737, "train_speed(iter/s)": 0.100705 }, { "epoch": 1.6100702576112411, "grad_norm": 0.2600000500679016, "learning_rate": 9.61917561476255e-06, "loss": 0.3421807050704956, "memory(GiB)": 127.52, "step": 1375, "token_acc": 0.8668853013058622, "train_speed(iter/s)": 0.100709 }, { "epoch": 1.6159250585480094, "grad_norm": 0.2540619373321533, "learning_rate": 9.554668715294305e-06, "loss": 0.3543410778045654, "memory(GiB)": 127.52, "step": 1380, "token_acc": 0.8761743728864414, "train_speed(iter/s)": 0.100714 }, { "epoch": 1.6217798594847777, "grad_norm": 0.2585217356681824, "learning_rate": 9.490180378358826e-06, "loss": 0.35744295120239256, "memory(GiB)": 127.52, "step": 1385, "token_acc": 0.8715506016593595, "train_speed(iter/s)": 0.100718 }, { "epoch": 1.6276346604215457, "grad_norm": 0.26017606258392334, "learning_rate": 9.425713291992878e-06, "loss": 0.34558424949645994, "memory(GiB)": 127.52, "step": 1390, "token_acc": 0.8794015410099387, "train_speed(iter/s)": 0.100719 }, { "epoch": 1.6334894613583137, "grad_norm": 0.25051021575927734, "learning_rate": 9.361270143347452e-06, "loss": 0.35907368659973143, "memory(GiB)": 127.52, "step": 1395, "token_acc": 0.8715231746371632, "train_speed(iter/s)": 0.100723 }, { "epoch": 1.639344262295082, "grad_norm": 0.24877934157848358, "learning_rate": 9.296853618575753e-06, "loss": 0.34605088233947756, "memory(GiB)": 127.52, "step": 1400, "token_acc": 0.8828522126980963, "train_speed(iter/s)": 0.100731 }, { "epoch": 1.6451990632318503, "grad_norm": 0.23893095552921295, "learning_rate": 9.232466402721241e-06, "loss": 0.3570685625076294, "memory(GiB)": 127.52, "step": 1405, "token_acc": 0.8760022299616647, "train_speed(iter/s)": 0.10074 }, { "epoch": 1.651053864168618, "grad_norm": 0.24638938903808594, "learning_rate": 9.1681111796057e-06, "loss": 0.3466794967651367, "memory(GiB)": 127.52, "step": 1410, "token_acc": 0.8773031091974165, "train_speed(iter/s)": 0.100749 }, { "epoch": 1.6569086651053864, "grad_norm": 0.256526380777359, "learning_rate": 9.103790631717375e-06, "loss": 0.3623323917388916, "memory(GiB)": 127.52, "step": 1415, "token_acc": 0.8679865616745452, "train_speed(iter/s)": 0.100748 }, { "epoch": 1.6627634660421546, "grad_norm": 0.25238198041915894, "learning_rate": 9.039507440099164e-06, "loss": 0.3467939138412476, "memory(GiB)": 127.52, "step": 1420, "token_acc": 0.8828419526341228, "train_speed(iter/s)": 0.100746 }, { "epoch": 1.6686182669789227, "grad_norm": 0.23841890692710876, "learning_rate": 8.975264284236866e-06, "loss": 0.34966843128204345, "memory(GiB)": 127.52, "step": 1425, "token_acc": 0.8775815971188294, "train_speed(iter/s)": 0.100755 }, { "epoch": 1.6744730679156907, "grad_norm": 0.26001548767089844, "learning_rate": 8.911063841947476e-06, "loss": 0.35109724998474123, "memory(GiB)": 127.52, "step": 1430, "token_acc": 0.8745225380796411, "train_speed(iter/s)": 0.100754 }, { "epoch": 1.680327868852459, "grad_norm": 0.2468952238559723, "learning_rate": 8.846908789267589e-06, "loss": 0.35158143043518064, "memory(GiB)": 127.52, "step": 1435, "token_acc": 0.8772585276576946, "train_speed(iter/s)": 0.100766 }, { "epoch": 1.6861826697892273, "grad_norm": 0.24095061421394348, "learning_rate": 8.78280180034184e-06, "loss": 0.3411277770996094, "memory(GiB)": 127.52, "step": 1440, "token_acc": 0.8712463039204312, "train_speed(iter/s)": 0.10077 }, { "epoch": 1.6920374707259953, "grad_norm": 0.25439053773880005, "learning_rate": 8.718745547311458e-06, "loss": 0.3543074131011963, "memory(GiB)": 127.52, "step": 1445, "token_acc": 0.871980767417743, "train_speed(iter/s)": 0.100763 }, { "epoch": 1.6978922716627634, "grad_norm": 1.5297069549560547, "learning_rate": 8.654742700202849e-06, "loss": 0.3533529043197632, "memory(GiB)": 127.52, "step": 1450, "token_acc": 0.8742467882207196, "train_speed(iter/s)": 0.100766 }, { "epoch": 1.7037470725995316, "grad_norm": 0.25103631615638733, "learning_rate": 8.590795926816348e-06, "loss": 0.3418538570404053, "memory(GiB)": 127.52, "step": 1455, "token_acc": 0.8745452901882429, "train_speed(iter/s)": 0.100768 }, { "epoch": 1.7096018735362999, "grad_norm": 0.3538268208503723, "learning_rate": 8.526907892614986e-06, "loss": 0.34701027870178225, "memory(GiB)": 127.52, "step": 1460, "token_acc": 0.8781468525993731, "train_speed(iter/s)": 0.100762 }, { "epoch": 1.715456674473068, "grad_norm": 0.2575690448284149, "learning_rate": 8.463081260613391e-06, "loss": 0.3492567539215088, "memory(GiB)": 127.52, "step": 1465, "token_acc": 0.8833869870635476, "train_speed(iter/s)": 0.10076 }, { "epoch": 1.721311475409836, "grad_norm": 0.25249573588371277, "learning_rate": 8.399318691266806e-06, "loss": 0.35265603065490725, "memory(GiB)": 127.52, "step": 1470, "token_acc": 0.8733317460118548, "train_speed(iter/s)": 0.10076 }, { "epoch": 1.7271662763466042, "grad_norm": 0.26620882749557495, "learning_rate": 8.335622842360168e-06, "loss": 0.3444960594177246, "memory(GiB)": 127.52, "step": 1475, "token_acc": 0.8786412367096045, "train_speed(iter/s)": 0.100754 }, { "epoch": 1.7330210772833725, "grad_norm": 0.25925421714782715, "learning_rate": 8.271996368897345e-06, "loss": 0.35317885875701904, "memory(GiB)": 127.52, "step": 1480, "token_acc": 0.8806173955625871, "train_speed(iter/s)": 0.10074 }, { "epoch": 1.7388758782201406, "grad_norm": 0.24599948525428772, "learning_rate": 8.208441922990454e-06, "loss": 0.34299373626708984, "memory(GiB)": 127.52, "step": 1485, "token_acc": 0.8748146671484283, "train_speed(iter/s)": 0.100736 }, { "epoch": 1.7447306791569086, "grad_norm": 0.2374086081981659, "learning_rate": 8.144962153749331e-06, "loss": 0.3454796314239502, "memory(GiB)": 127.52, "step": 1490, "token_acc": 0.8697578355578018, "train_speed(iter/s)": 0.100735 }, { "epoch": 1.7505854800936769, "grad_norm": 0.2567986845970154, "learning_rate": 8.081559707171094e-06, "loss": 0.35629470348358155, "memory(GiB)": 127.52, "step": 1495, "token_acc": 0.8722708482627621, "train_speed(iter/s)": 0.100742 }, { "epoch": 1.756440281030445, "grad_norm": 0.2612420320510864, "learning_rate": 8.01823722602986e-06, "loss": 0.34243695735931395, "memory(GiB)": 127.52, "step": 1500, "token_acc": 0.8835913661147516, "train_speed(iter/s)": 0.100749 }, { "epoch": 1.762295081967213, "grad_norm": 0.25001969933509827, "learning_rate": 7.954997349766576e-06, "loss": 0.3504654407501221, "memory(GiB)": 127.52, "step": 1505, "token_acc": 0.8767294491512118, "train_speed(iter/s)": 0.100752 }, { "epoch": 1.7681498829039812, "grad_norm": 0.24179641902446747, "learning_rate": 7.891842714379027e-06, "loss": 0.3378228425979614, "memory(GiB)": 127.52, "step": 1510, "token_acc": 0.8821447808495446, "train_speed(iter/s)": 0.10075 }, { "epoch": 1.7740046838407495, "grad_norm": 0.2632296085357666, "learning_rate": 7.828775952311921e-06, "loss": 0.34106738567352296, "memory(GiB)": 127.52, "step": 1515, "token_acc": 0.872465283102722, "train_speed(iter/s)": 0.100743 }, { "epoch": 1.7798594847775175, "grad_norm": 0.2476883977651596, "learning_rate": 7.765799692347201e-06, "loss": 0.34442992210388185, "memory(GiB)": 127.52, "step": 1520, "token_acc": 0.8729373501693029, "train_speed(iter/s)": 0.100743 }, { "epoch": 1.7857142857142856, "grad_norm": 0.2630121111869812, "learning_rate": 7.702916559494444e-06, "loss": 0.3511634588241577, "memory(GiB)": 127.52, "step": 1525, "token_acc": 0.8770366431554089, "train_speed(iter/s)": 0.100748 }, { "epoch": 1.7915690866510539, "grad_norm": 0.24981631338596344, "learning_rate": 7.64012917488146e-06, "loss": 0.33224847316741946, "memory(GiB)": 127.52, "step": 1530, "token_acc": 0.8896432981333869, "train_speed(iter/s)": 0.100745 }, { "epoch": 1.7974238875878221, "grad_norm": 0.25589603185653687, "learning_rate": 7.577440155645028e-06, "loss": 0.3430049896240234, "memory(GiB)": 127.52, "step": 1535, "token_acc": 0.8784328165618647, "train_speed(iter/s)": 0.100733 }, { "epoch": 1.8032786885245902, "grad_norm": 0.24135656654834747, "learning_rate": 7.514852114821811e-06, "loss": 0.35404491424560547, "memory(GiB)": 127.52, "step": 1540, "token_acc": 0.8758359005184462, "train_speed(iter/s)": 0.100732 }, { "epoch": 1.8091334894613582, "grad_norm": 0.3086133599281311, "learning_rate": 7.452367661239433e-06, "loss": 0.3292539596557617, "memory(GiB)": 127.52, "step": 1545, "token_acc": 0.8771563599039064, "train_speed(iter/s)": 0.10073 }, { "epoch": 1.8149882903981265, "grad_norm": 0.26186585426330566, "learning_rate": 7.389989399407741e-06, "loss": 0.3564730644226074, "memory(GiB)": 127.52, "step": 1550, "token_acc": 0.8767442953125245, "train_speed(iter/s)": 0.100726 }, { "epoch": 1.8208430913348947, "grad_norm": 0.2449086308479309, "learning_rate": 7.3277199294102485e-06, "loss": 0.3377220630645752, "memory(GiB)": 127.52, "step": 1555, "token_acc": 0.8731188520277088, "train_speed(iter/s)": 0.100729 }, { "epoch": 1.8266978922716628, "grad_norm": 0.2617018222808838, "learning_rate": 7.265561846795741e-06, "loss": 0.35269980430603026, "memory(GiB)": 127.52, "step": 1560, "token_acc": 0.8755254872982656, "train_speed(iter/s)": 0.100718 }, { "epoch": 1.8325526932084308, "grad_norm": 0.2533339262008667, "learning_rate": 7.203517742470101e-06, "loss": 0.3477527856826782, "memory(GiB)": 127.52, "step": 1565, "token_acc": 0.8841913617578873, "train_speed(iter/s)": 0.100718 }, { "epoch": 1.838407494145199, "grad_norm": 0.24031810462474823, "learning_rate": 7.141590202588312e-06, "loss": 0.35293850898742674, "memory(GiB)": 127.52, "step": 1570, "token_acc": 0.8790135675181339, "train_speed(iter/s)": 0.100724 }, { "epoch": 1.8442622950819674, "grad_norm": 0.2540515661239624, "learning_rate": 7.079781808446648e-06, "loss": 0.35478663444519043, "memory(GiB)": 127.52, "step": 1575, "token_acc": 0.8638225043564849, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.8501170960187352, "grad_norm": 0.24163876473903656, "learning_rate": 7.018095136375089e-06, "loss": 0.33953070640563965, "memory(GiB)": 127.52, "step": 1580, "token_acc": 0.8760248415939393, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.8559718969555035, "grad_norm": 0.24985362589359283, "learning_rate": 6.956532757629945e-06, "loss": 0.34739911556243896, "memory(GiB)": 127.52, "step": 1585, "token_acc": 0.8751094324520373, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.8618266978922717, "grad_norm": 0.24738718569278717, "learning_rate": 6.89509723828665e-06, "loss": 0.35140252113342285, "memory(GiB)": 127.52, "step": 1590, "token_acc": 0.8747874666018945, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.8676814988290398, "grad_norm": 0.2528833746910095, "learning_rate": 6.833791139132824e-06, "loss": 0.3366274356842041, "memory(GiB)": 127.52, "step": 1595, "token_acc": 0.877359708131215, "train_speed(iter/s)": 0.100705 }, { "epoch": 1.8735362997658078, "grad_norm": 0.22930973768234253, "learning_rate": 6.772617015561529e-06, "loss": 0.34548795223236084, "memory(GiB)": 127.52, "step": 1600, "token_acc": 0.8674766998186026, "train_speed(iter/s)": 0.100705 }, { "epoch": 1.879391100702576, "grad_norm": 0.23658259212970734, "learning_rate": 6.7115774174647475e-06, "loss": 0.3390948295593262, "memory(GiB)": 127.52, "step": 1605, "token_acc": 0.883574050014699, "train_speed(iter/s)": 0.100706 }, { "epoch": 1.8852459016393444, "grad_norm": 0.25393053889274597, "learning_rate": 6.6506748891271045e-06, "loss": 0.3500185012817383, "memory(GiB)": 127.52, "step": 1610, "token_acc": 0.8819961495087196, "train_speed(iter/s)": 0.100708 }, { "epoch": 1.8911007025761124, "grad_norm": 0.23870056867599487, "learning_rate": 6.5899119691198025e-06, "loss": 0.343201732635498, "memory(GiB)": 127.52, "step": 1615, "token_acc": 0.8769540112004077, "train_speed(iter/s)": 0.100712 }, { "epoch": 1.8969555035128804, "grad_norm": 0.23795676231384277, "learning_rate": 6.529291190194829e-06, "loss": 0.3476824998855591, "memory(GiB)": 127.52, "step": 1620, "token_acc": 0.8771016372387611, "train_speed(iter/s)": 0.100717 }, { "epoch": 1.9028103044496487, "grad_norm": 0.23620595037937164, "learning_rate": 6.468815079179364e-06, "loss": 0.3438570022583008, "memory(GiB)": 127.52, "step": 1625, "token_acc": 0.8808678958099098, "train_speed(iter/s)": 0.100717 }, { "epoch": 1.908665105386417, "grad_norm": 0.27084144949913025, "learning_rate": 6.408486156870466e-06, "loss": 0.3575857162475586, "memory(GiB)": 127.52, "step": 1630, "token_acc": 0.8567800504203767, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.914519906323185, "grad_norm": 0.24774354696273804, "learning_rate": 6.348306937929991e-06, "loss": 0.3539011001586914, "memory(GiB)": 127.52, "step": 1635, "token_acc": 0.8722537158121981, "train_speed(iter/s)": 0.100726 }, { "epoch": 1.920374707259953, "grad_norm": 0.23919358849525452, "learning_rate": 6.288279930779789e-06, "loss": 0.33454456329345705, "memory(GiB)": 127.52, "step": 1640, "token_acc": 0.8859452149573859, "train_speed(iter/s)": 0.100729 }, { "epoch": 1.9262295081967213, "grad_norm": 0.2600441575050354, "learning_rate": 6.228407637497131e-06, "loss": 0.34556894302368163, "memory(GiB)": 127.52, "step": 1645, "token_acc": 0.8641004272904045, "train_speed(iter/s)": 0.100727 }, { "epoch": 1.9320843091334896, "grad_norm": 0.2533404231071472, "learning_rate": 6.1686925537104306e-06, "loss": 0.3354111433029175, "memory(GiB)": 127.52, "step": 1650, "token_acc": 0.8690573840794189, "train_speed(iter/s)": 0.100726 }, { "epoch": 1.9379391100702577, "grad_norm": 0.24305778741836548, "learning_rate": 6.109137168495205e-06, "loss": 0.342392110824585, "memory(GiB)": 127.52, "step": 1655, "token_acc": 0.8907634917938944, "train_speed(iter/s)": 0.100732 }, { "epoch": 1.9437939110070257, "grad_norm": 0.23065665364265442, "learning_rate": 6.049743964270336e-06, "loss": 0.35349397659301757, "memory(GiB)": 127.52, "step": 1660, "token_acc": 0.8749648996911172, "train_speed(iter/s)": 0.100731 }, { "epoch": 1.949648711943794, "grad_norm": 0.26187312602996826, "learning_rate": 5.990515416694591e-06, "loss": 0.3514526844024658, "memory(GiB)": 127.52, "step": 1665, "token_acc": 0.8773919272455463, "train_speed(iter/s)": 0.100729 }, { "epoch": 1.955503512880562, "grad_norm": 0.2436314970254898, "learning_rate": 5.931453994563434e-06, "loss": 0.34615340232849123, "memory(GiB)": 127.52, "step": 1670, "token_acc": 0.8825784399814935, "train_speed(iter/s)": 0.100722 }, { "epoch": 1.96135831381733, "grad_norm": 1.0637788772583008, "learning_rate": 5.872562159706116e-06, "loss": 0.34925112724304197, "memory(GiB)": 127.52, "step": 1675, "token_acc": 0.8725762818496382, "train_speed(iter/s)": 0.100718 }, { "epoch": 1.9672131147540983, "grad_norm": 0.2608899176120758, "learning_rate": 5.8138423668830605e-06, "loss": 0.34130330085754396, "memory(GiB)": 127.52, "step": 1680, "token_acc": 0.876563876375788, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.9730679156908666, "grad_norm": 0.24455122649669647, "learning_rate": 5.755297063683551e-06, "loss": 0.3456611633300781, "memory(GiB)": 127.52, "step": 1685, "token_acc": 0.8803155448934612, "train_speed(iter/s)": 0.100717 }, { "epoch": 1.9789227166276346, "grad_norm": 0.23744545876979828, "learning_rate": 5.696928690423693e-06, "loss": 0.3404732942581177, "memory(GiB)": 127.52, "step": 1690, "token_acc": 0.873919857146425, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.9847775175644027, "grad_norm": 0.2499692440032959, "learning_rate": 5.638739680044718e-06, "loss": 0.3554127931594849, "memory(GiB)": 127.52, "step": 1695, "token_acc": 0.8678405344492528, "train_speed(iter/s)": 0.10072 }, { "epoch": 1.990632318501171, "grad_norm": 0.23933644592761993, "learning_rate": 5.580732458011544e-06, "loss": 0.34451732635498045, "memory(GiB)": 127.52, "step": 1700, "token_acc": 0.8813060735041081, "train_speed(iter/s)": 0.100721 }, { "epoch": 1.9964871194379392, "grad_norm": 0.2454347014427185, "learning_rate": 5.522909442211708e-06, "loss": 0.3448106527328491, "memory(GiB)": 127.52, "step": 1705, "token_acc": 0.8718723798596708, "train_speed(iter/s)": 0.100717 }, { "epoch": 2.002341920374707, "grad_norm": 0.30603164434432983, "learning_rate": 5.465273042854551e-06, "loss": 0.3320322036743164, "memory(GiB)": 127.52, "step": 1710, "token_acc": 0.8845191075650899, "train_speed(iter/s)": 0.10054 }, { "epoch": 2.0081967213114753, "grad_norm": 0.26624929904937744, "learning_rate": 5.407825662370778e-06, "loss": 0.3192149639129639, "memory(GiB)": 127.52, "step": 1715, "token_acc": 0.8862581577460744, "train_speed(iter/s)": 0.100533 }, { "epoch": 2.0140515222482436, "grad_norm": 0.28559088706970215, "learning_rate": 5.350569695312313e-06, "loss": 0.3315494775772095, "memory(GiB)": 127.52, "step": 1720, "token_acc": 0.8817901407312053, "train_speed(iter/s)": 0.100527 }, { "epoch": 2.019906323185012, "grad_norm": 0.24132603406906128, "learning_rate": 5.293507528252474e-06, "loss": 0.3354511737823486, "memory(GiB)": 127.52, "step": 1725, "token_acc": 0.8808201997328972, "train_speed(iter/s)": 0.100523 }, { "epoch": 2.0257611241217797, "grad_norm": 0.25403663516044617, "learning_rate": 5.236641539686518e-06, "loss": 0.3226620197296143, "memory(GiB)": 127.52, "step": 1730, "token_acc": 0.8806968959125817, "train_speed(iter/s)": 0.10053 }, { "epoch": 2.031615925058548, "grad_norm": 0.24015206098556519, "learning_rate": 5.179974099932472e-06, "loss": 0.3161166667938232, "memory(GiB)": 127.52, "step": 1735, "token_acc": 0.8794680331257753, "train_speed(iter/s)": 0.100526 }, { "epoch": 2.037470725995316, "grad_norm": 0.2842601537704468, "learning_rate": 5.12350757103236e-06, "loss": 0.31528186798095703, "memory(GiB)": 127.52, "step": 1740, "token_acc": 0.8833886035950154, "train_speed(iter/s)": 0.10053 }, { "epoch": 2.0433255269320845, "grad_norm": 0.23931631445884705, "learning_rate": 5.067244306653736e-06, "loss": 0.32300970554351804, "memory(GiB)": 127.52, "step": 1745, "token_acc": 0.8907401132070736, "train_speed(iter/s)": 0.100533 }, { "epoch": 2.0491803278688523, "grad_norm": 0.25491324067115784, "learning_rate": 5.0111866519915575e-06, "loss": 0.31856546401977537, "memory(GiB)": 127.52, "step": 1750, "token_acc": 0.8788062223735568, "train_speed(iter/s)": 0.100534 }, { "epoch": 2.0550351288056206, "grad_norm": 0.2541966140270233, "learning_rate": 4.95533694367047e-06, "loss": 0.31543042659759524, "memory(GiB)": 127.52, "step": 1755, "token_acc": 0.8854616459729288, "train_speed(iter/s)": 0.100541 }, { "epoch": 2.060889929742389, "grad_norm": 0.250337690114975, "learning_rate": 4.899697509647379e-06, "loss": 0.32208833694458006, "memory(GiB)": 127.52, "step": 1760, "token_acc": 0.8763743304143462, "train_speed(iter/s)": 0.100545 }, { "epoch": 2.066744730679157, "grad_norm": 0.23674513399600983, "learning_rate": 4.844270669114424e-06, "loss": 0.32359483242034914, "memory(GiB)": 127.52, "step": 1765, "token_acc": 0.8885440198244088, "train_speed(iter/s)": 0.100551 }, { "epoch": 2.072599531615925, "grad_norm": 0.2509515881538391, "learning_rate": 4.789058732402319e-06, "loss": 0.3145972728729248, "memory(GiB)": 127.52, "step": 1770, "token_acc": 0.8812067213755373, "train_speed(iter/s)": 0.100554 }, { "epoch": 2.078454332552693, "grad_norm": 0.27846959233283997, "learning_rate": 4.734064000884044e-06, "loss": 0.3361539840698242, "memory(GiB)": 127.52, "step": 1775, "token_acc": 0.8687031468980935, "train_speed(iter/s)": 0.100561 }, { "epoch": 2.0843091334894615, "grad_norm": 0.2520703971385956, "learning_rate": 4.679288766878908e-06, "loss": 0.3277717590332031, "memory(GiB)": 127.52, "step": 1780, "token_acc": 0.8835239754091976, "train_speed(iter/s)": 0.100561 }, { "epoch": 2.0901639344262297, "grad_norm": 0.26310279965400696, "learning_rate": 4.624735313557019e-06, "loss": 0.32394185066223147, "memory(GiB)": 127.52, "step": 1785, "token_acc": 0.8875730035291546, "train_speed(iter/s)": 0.100566 }, { "epoch": 2.0960187353629975, "grad_norm": 0.2666696310043335, "learning_rate": 4.570405914844105e-06, "loss": 0.31819107532501223, "memory(GiB)": 127.52, "step": 1790, "token_acc": 0.8859368071299645, "train_speed(iter/s)": 0.100562 }, { "epoch": 2.101873536299766, "grad_norm": 0.25196680426597595, "learning_rate": 4.516302835326723e-06, "loss": 0.322560453414917, "memory(GiB)": 127.52, "step": 1795, "token_acc": 0.8921213689835521, "train_speed(iter/s)": 0.100564 }, { "epoch": 2.107728337236534, "grad_norm": 0.24787664413452148, "learning_rate": 4.462428330157886e-06, "loss": 0.3134110927581787, "memory(GiB)": 127.52, "step": 1800, "token_acc": 0.8915973959679097, "train_speed(iter/s)": 0.100565 }, { "epoch": 2.113583138173302, "grad_norm": 0.23812943696975708, "learning_rate": 4.4087846449630475e-06, "loss": 0.31724915504455564, "memory(GiB)": 127.52, "step": 1805, "token_acc": 0.8883239519028294, "train_speed(iter/s)": 0.100568 }, { "epoch": 2.11943793911007, "grad_norm": 0.2460552453994751, "learning_rate": 4.355374015746493e-06, "loss": 0.31520168781280516, "memory(GiB)": 127.52, "step": 1810, "token_acc": 0.8825987185966718, "train_speed(iter/s)": 0.100568 }, { "epoch": 2.1252927400468384, "grad_norm": 0.2627100646495819, "learning_rate": 4.302198668798159e-06, "loss": 0.3187079906463623, "memory(GiB)": 127.52, "step": 1815, "token_acc": 0.8795669142641319, "train_speed(iter/s)": 0.100574 }, { "epoch": 2.1311475409836067, "grad_norm": 0.23737181723117828, "learning_rate": 4.249260820600813e-06, "loss": 0.30634393692016604, "memory(GiB)": 127.52, "step": 1820, "token_acc": 0.8882761935077175, "train_speed(iter/s)": 0.100574 }, { "epoch": 2.1370023419203745, "grad_norm": 0.44100987911224365, "learning_rate": 4.1965626777376766e-06, "loss": 0.3143752574920654, "memory(GiB)": 127.52, "step": 1825, "token_acc": 0.8907455736843094, "train_speed(iter/s)": 0.100576 }, { "epoch": 2.142857142857143, "grad_norm": 0.243091881275177, "learning_rate": 4.144106436800453e-06, "loss": 0.32144436836242674, "memory(GiB)": 127.52, "step": 1830, "token_acc": 0.8904153173473116, "train_speed(iter/s)": 0.100586 }, { "epoch": 2.148711943793911, "grad_norm": 0.22646024823188782, "learning_rate": 4.091894284297758e-06, "loss": 0.3123732089996338, "memory(GiB)": 127.52, "step": 1835, "token_acc": 0.8785402692433979, "train_speed(iter/s)": 0.100589 }, { "epoch": 2.1545667447306793, "grad_norm": 0.2700958549976349, "learning_rate": 4.039928396563983e-06, "loss": 0.33238074779510496, "memory(GiB)": 127.52, "step": 1840, "token_acc": 0.8842443529070076, "train_speed(iter/s)": 0.10059 }, { "epoch": 2.160421545667447, "grad_norm": 0.2499818056821823, "learning_rate": 3.9882109396685845e-06, "loss": 0.30622167587280275, "memory(GiB)": 127.52, "step": 1845, "token_acc": 0.8795685480484824, "train_speed(iter/s)": 0.100591 }, { "epoch": 2.1662763466042154, "grad_norm": 0.22730578482151031, "learning_rate": 3.936744069325797e-06, "loss": 0.3057937860488892, "memory(GiB)": 127.52, "step": 1850, "token_acc": 0.8902019848511362, "train_speed(iter/s)": 0.100589 }, { "epoch": 2.1721311475409837, "grad_norm": 0.23967498540878296, "learning_rate": 3.885529930804768e-06, "loss": 0.3023227214813232, "memory(GiB)": 127.52, "step": 1855, "token_acc": 0.8807274179657759, "train_speed(iter/s)": 0.100589 }, { "epoch": 2.177985948477752, "grad_norm": 0.2622321844100952, "learning_rate": 3.834570658840152e-06, "loss": 0.32261273860931394, "memory(GiB)": 127.52, "step": 1860, "token_acc": 0.8792452360659205, "train_speed(iter/s)": 0.100591 }, { "epoch": 2.1838407494145198, "grad_norm": 0.23954476416110992, "learning_rate": 3.7838683775431106e-06, "loss": 0.31424174308776853, "memory(GiB)": 127.52, "step": 1865, "token_acc": 0.8843662495044312, "train_speed(iter/s)": 0.100597 }, { "epoch": 2.189695550351288, "grad_norm": 0.23363274335861206, "learning_rate": 3.733425200312797e-06, "loss": 0.316208815574646, "memory(GiB)": 127.52, "step": 1870, "token_acc": 0.876293130342547, "train_speed(iter/s)": 0.100602 }, { "epoch": 2.1955503512880563, "grad_norm": 0.24841627478599548, "learning_rate": 3.683243229748249e-06, "loss": 0.3097521781921387, "memory(GiB)": 127.52, "step": 1875, "token_acc": 0.8804246009543149, "train_speed(iter/s)": 0.100606 }, { "epoch": 2.201405152224824, "grad_norm": 0.25356635451316833, "learning_rate": 3.633324557560747e-06, "loss": 0.31675851345062256, "memory(GiB)": 127.52, "step": 1880, "token_acc": 0.8871838137645497, "train_speed(iter/s)": 0.10061 }, { "epoch": 2.2072599531615924, "grad_norm": 0.2366763949394226, "learning_rate": 3.5836712644866277e-06, "loss": 0.30890917778015137, "memory(GiB)": 127.52, "step": 1885, "token_acc": 0.8819356314491541, "train_speed(iter/s)": 0.100613 }, { "epoch": 2.2131147540983607, "grad_norm": 0.24897019565105438, "learning_rate": 3.5342854202005696e-06, "loss": 0.31049222946166993, "memory(GiB)": 127.52, "step": 1890, "token_acc": 0.8878919948532936, "train_speed(iter/s)": 0.100619 }, { "epoch": 2.218969555035129, "grad_norm": 0.239404559135437, "learning_rate": 3.485169083229293e-06, "loss": 0.31925191879272463, "memory(GiB)": 127.52, "step": 1895, "token_acc": 0.8928798404593369, "train_speed(iter/s)": 0.100627 }, { "epoch": 2.2248243559718968, "grad_norm": 0.2341826856136322, "learning_rate": 3.4363243008657842e-06, "loss": 0.31410508155822753, "memory(GiB)": 127.52, "step": 1900, "token_acc": 0.8741590609526956, "train_speed(iter/s)": 0.100624 }, { "epoch": 2.230679156908665, "grad_norm": 0.24927052855491638, "learning_rate": 3.3877531090839478e-06, "loss": 0.3199175834655762, "memory(GiB)": 127.52, "step": 1905, "token_acc": 0.8767657620459692, "train_speed(iter/s)": 0.100628 }, { "epoch": 2.2365339578454333, "grad_norm": 0.2401537299156189, "learning_rate": 3.3394575324537327e-06, "loss": 0.3235038757324219, "memory(GiB)": 127.52, "step": 1910, "token_acc": 0.8763058505839384, "train_speed(iter/s)": 0.100623 }, { "epoch": 2.2423887587822016, "grad_norm": 0.23076413571834564, "learning_rate": 3.2914395840567605e-06, "loss": 0.31050064563751223, "memory(GiB)": 127.52, "step": 1915, "token_acc": 0.8874926079243052, "train_speed(iter/s)": 0.100622 }, { "epoch": 2.2482435597189694, "grad_norm": 0.2379971295595169, "learning_rate": 3.2437012654024057e-06, "loss": 0.3159012317657471, "memory(GiB)": 127.52, "step": 1920, "token_acc": 0.8895969009656411, "train_speed(iter/s)": 0.100622 }, { "epoch": 2.2540983606557377, "grad_norm": 0.23007337749004364, "learning_rate": 3.1962445663443643e-06, "loss": 0.31895716190338136, "memory(GiB)": 127.52, "step": 1925, "token_acc": 0.8823520222942871, "train_speed(iter/s)": 0.100616 }, { "epoch": 2.259953161592506, "grad_norm": 0.2437550276517868, "learning_rate": 3.1490714649977196e-06, "loss": 0.3226035118103027, "memory(GiB)": 127.52, "step": 1930, "token_acc": 0.8907227393284292, "train_speed(iter/s)": 0.100614 }, { "epoch": 2.265807962529274, "grad_norm": 0.2513379454612732, "learning_rate": 3.102183927656488e-06, "loss": 0.31055560111999514, "memory(GiB)": 127.52, "step": 1935, "token_acc": 0.8758090614886731, "train_speed(iter/s)": 0.100617 }, { "epoch": 2.271662763466042, "grad_norm": 0.23778940737247467, "learning_rate": 3.0555839087116547e-06, "loss": 0.32387375831604004, "memory(GiB)": 127.52, "step": 1940, "token_acc": 0.887034375, "train_speed(iter/s)": 0.10062 }, { "epoch": 2.2775175644028103, "grad_norm": 0.26385143399238586, "learning_rate": 3.009273350569705e-06, "loss": 0.32143163681030273, "memory(GiB)": 127.52, "step": 1945, "token_acc": 0.8916146423189599, "train_speed(iter/s)": 0.100632 }, { "epoch": 2.2833723653395785, "grad_norm": 0.23078720271587372, "learning_rate": 2.963254183571682e-06, "loss": 0.31597721576690674, "memory(GiB)": 127.52, "step": 1950, "token_acc": 0.8873806150822559, "train_speed(iter/s)": 0.10063 }, { "epoch": 2.289227166276347, "grad_norm": 0.23988991975784302, "learning_rate": 2.9175283259126943e-06, "loss": 0.31755337715148924, "memory(GiB)": 127.52, "step": 1955, "token_acc": 0.8924940331886264, "train_speed(iter/s)": 0.100631 }, { "epoch": 2.2950819672131146, "grad_norm": 0.23374050855636597, "learning_rate": 2.872097683561986e-06, "loss": 0.3156282424926758, "memory(GiB)": 127.52, "step": 1960, "token_acc": 0.8946095897383691, "train_speed(iter/s)": 0.100632 }, { "epoch": 2.300936768149883, "grad_norm": 0.22969146072864532, "learning_rate": 2.8269641501834834e-06, "loss": 0.32587299346923826, "memory(GiB)": 127.52, "step": 1965, "token_acc": 0.8774885813450646, "train_speed(iter/s)": 0.100637 }, { "epoch": 2.306791569086651, "grad_norm": 0.23242172598838806, "learning_rate": 2.782129607056848e-06, "loss": 0.31759541034698485, "memory(GiB)": 127.52, "step": 1970, "token_acc": 0.8783747102265459, "train_speed(iter/s)": 0.10064 }, { "epoch": 2.312646370023419, "grad_norm": 0.22935490310192108, "learning_rate": 2.7375959229990856e-06, "loss": 0.307840371131897, "memory(GiB)": 127.52, "step": 1975, "token_acc": 0.8862128010598808, "train_speed(iter/s)": 0.100639 }, { "epoch": 2.3185011709601873, "grad_norm": 0.2637212574481964, "learning_rate": 2.6933649542866326e-06, "loss": 0.3114126682281494, "memory(GiB)": 127.52, "step": 1980, "token_acc": 0.8820059272541622, "train_speed(iter/s)": 0.100646 }, { "epoch": 2.3243559718969555, "grad_norm": 0.22703419625759125, "learning_rate": 2.649438544577977e-06, "loss": 0.30065155029296875, "memory(GiB)": 127.52, "step": 1985, "token_acc": 0.8849238586641156, "train_speed(iter/s)": 0.100647 }, { "epoch": 2.330210772833724, "grad_norm": 0.22714027762413025, "learning_rate": 2.6058185248368317e-06, "loss": 0.3135934352874756, "memory(GiB)": 127.52, "step": 1990, "token_acc": 0.8923622270535968, "train_speed(iter/s)": 0.100647 }, { "epoch": 2.3360655737704916, "grad_norm": 0.23052531480789185, "learning_rate": 2.562506713255789e-06, "loss": 0.3088988304138184, "memory(GiB)": 127.52, "step": 1995, "token_acc": 0.8901272198016593, "train_speed(iter/s)": 0.100652 }, { "epoch": 2.34192037470726, "grad_norm": 0.2511214017868042, "learning_rate": 2.519504915180555e-06, "loss": 0.3128695487976074, "memory(GiB)": 127.52, "step": 2000, "token_acc": 0.8865565346454385, "train_speed(iter/s)": 0.100653 }, { "epoch": 2.347775175644028, "grad_norm": 0.23098479211330414, "learning_rate": 2.4768149230346917e-06, "loss": 0.3291048526763916, "memory(GiB)": 127.52, "step": 2005, "token_acc": 0.8865806253889527, "train_speed(iter/s)": 0.100648 }, { "epoch": 2.3536299765807964, "grad_norm": 0.2332172840833664, "learning_rate": 2.4344385162448924e-06, "loss": 0.31312854290008546, "memory(GiB)": 127.52, "step": 2010, "token_acc": 0.8905434652297092, "train_speed(iter/s)": 0.100649 }, { "epoch": 2.3594847775175642, "grad_norm": 0.229131281375885, "learning_rate": 2.392377461166826e-06, "loss": 0.3113706588745117, "memory(GiB)": 127.52, "step": 2015, "token_acc": 0.889476325707392, "train_speed(iter/s)": 0.100651 }, { "epoch": 2.3653395784543325, "grad_norm": 0.24932575225830078, "learning_rate": 2.350633511011511e-06, "loss": 0.3204165458679199, "memory(GiB)": 127.52, "step": 2020, "token_acc": 0.8841538567415554, "train_speed(iter/s)": 0.100647 }, { "epoch": 2.371194379391101, "grad_norm": 0.23387765884399414, "learning_rate": 2.309208405772221e-06, "loss": 0.32724220752716066, "memory(GiB)": 127.52, "step": 2025, "token_acc": 0.8882853658229917, "train_speed(iter/s)": 0.100652 }, { "epoch": 2.3770491803278686, "grad_norm": 0.24220742285251617, "learning_rate": 2.2681038721519768e-06, "loss": 0.33083477020263674, "memory(GiB)": 127.52, "step": 2030, "token_acc": 0.8838624553173172, "train_speed(iter/s)": 0.100651 }, { "epoch": 2.382903981264637, "grad_norm": 0.2579573690891266, "learning_rate": 2.227321623491563e-06, "loss": 0.3199321746826172, "memory(GiB)": 127.52, "step": 2035, "token_acc": 0.8799424487730837, "train_speed(iter/s)": 0.100653 }, { "epoch": 2.388758782201405, "grad_norm": 0.22851942479610443, "learning_rate": 2.186863359698108e-06, "loss": 0.3142981052398682, "memory(GiB)": 127.52, "step": 2040, "token_acc": 0.9041223969400765, "train_speed(iter/s)": 0.100653 }, { "epoch": 2.3946135831381734, "grad_norm": 0.24671818315982819, "learning_rate": 2.1467307671742377e-06, "loss": 0.31820495128631593, "memory(GiB)": 127.52, "step": 2045, "token_acc": 0.8822625886964798, "train_speed(iter/s)": 0.100657 }, { "epoch": 2.4004683840749417, "grad_norm": 0.2494201809167862, "learning_rate": 2.106925518747779e-06, "loss": 0.31292271614074707, "memory(GiB)": 127.52, "step": 2050, "token_acc": 0.8868852561536922, "train_speed(iter/s)": 0.100659 }, { "epoch": 2.4063231850117095, "grad_norm": 0.25766271352767944, "learning_rate": 2.06744927360202e-06, "loss": 0.315954852104187, "memory(GiB)": 127.52, "step": 2055, "token_acc": 0.8844018739071213, "train_speed(iter/s)": 0.100653 }, { "epoch": 2.4121779859484778, "grad_norm": 0.23304541409015656, "learning_rate": 2.0283036772065712e-06, "loss": 0.31738996505737305, "memory(GiB)": 127.52, "step": 2060, "token_acc": 0.8888605233133514, "train_speed(iter/s)": 0.100656 }, { "epoch": 2.418032786885246, "grad_norm": 0.23033016920089722, "learning_rate": 1.9894903612487683e-06, "loss": 0.32506499290466306, "memory(GiB)": 127.52, "step": 2065, "token_acc": 0.8765848323481849, "train_speed(iter/s)": 0.100657 }, { "epoch": 2.423887587822014, "grad_norm": 0.2522413730621338, "learning_rate": 1.9510109435656457e-06, "loss": 0.3240881681442261, "memory(GiB)": 127.52, "step": 2070, "token_acc": 0.8874444430454654, "train_speed(iter/s)": 0.10066 }, { "epoch": 2.429742388758782, "grad_norm": 0.23793016374111176, "learning_rate": 1.9128670280765283e-06, "loss": 0.326206374168396, "memory(GiB)": 127.52, "step": 2075, "token_acc": 0.8811696876529852, "train_speed(iter/s)": 0.100656 }, { "epoch": 2.4355971896955504, "grad_norm": 0.2260826826095581, "learning_rate": 1.8750602047161603e-06, "loss": 0.3155853748321533, "memory(GiB)": 127.52, "step": 2080, "token_acc": 0.8918628516614084, "train_speed(iter/s)": 0.100657 }, { "epoch": 2.4414519906323187, "grad_norm": 0.22915047407150269, "learning_rate": 1.8375920493684264e-06, "loss": 0.32075018882751466, "memory(GiB)": 127.52, "step": 2085, "token_acc": 0.8806146127312637, "train_speed(iter/s)": 0.100664 }, { "epoch": 2.4473067915690865, "grad_norm": 0.23555633425712585, "learning_rate": 1.8004641238006815e-06, "loss": 0.3198583126068115, "memory(GiB)": 127.52, "step": 2090, "token_acc": 0.8878798889856471, "train_speed(iter/s)": 0.100663 }, { "epoch": 2.4531615925058547, "grad_norm": 0.23224787414073944, "learning_rate": 1.7636779755986443e-06, "loss": 0.32527942657470704, "memory(GiB)": 127.52, "step": 2095, "token_acc": 0.8808102158192161, "train_speed(iter/s)": 0.100659 }, { "epoch": 2.459016393442623, "grad_norm": 0.2313682585954666, "learning_rate": 1.7272351381018792e-06, "loss": 0.3221132278442383, "memory(GiB)": 127.52, "step": 2100, "token_acc": 0.8723955898759107, "train_speed(iter/s)": 0.10066 }, { "epoch": 2.4648711943793913, "grad_norm": 0.23031777143478394, "learning_rate": 1.6911371303399048e-06, "loss": 0.3093102931976318, "memory(GiB)": 127.52, "step": 2105, "token_acc": 0.887525459211663, "train_speed(iter/s)": 0.100655 }, { "epoch": 2.470725995316159, "grad_norm": 0.23843398690223694, "learning_rate": 1.6553854569688632e-06, "loss": 0.3248276710510254, "memory(GiB)": 127.52, "step": 2110, "token_acc": 0.882843537798315, "train_speed(iter/s)": 0.100654 }, { "epoch": 2.4765807962529274, "grad_norm": 0.23203721642494202, "learning_rate": 1.619981608208796e-06, "loss": 0.32454729080200195, "memory(GiB)": 127.52, "step": 2115, "token_acc": 0.869970732560573, "train_speed(iter/s)": 0.100657 }, { "epoch": 2.4824355971896956, "grad_norm": 0.23711416125297546, "learning_rate": 1.584927059781548e-06, "loss": 0.3233715295791626, "memory(GiB)": 127.52, "step": 2120, "token_acc": 0.8797791727772037, "train_speed(iter/s)": 0.100658 }, { "epoch": 2.4882903981264635, "grad_norm": 0.23975679278373718, "learning_rate": 1.5502232728492362e-06, "loss": 0.31569533348083495, "memory(GiB)": 127.52, "step": 2125, "token_acc": 0.8874189972049156, "train_speed(iter/s)": 0.100661 }, { "epoch": 2.4941451990632317, "grad_norm": 0.23424658179283142, "learning_rate": 1.5158716939533524e-06, "loss": 0.32528119087219237, "memory(GiB)": 127.52, "step": 2130, "token_acc": 0.8848355062483098, "train_speed(iter/s)": 0.100663 }, { "epoch": 2.5, "grad_norm": 0.2467930018901825, "learning_rate": 1.4818737549544725e-06, "loss": 0.3232418060302734, "memory(GiB)": 127.52, "step": 2135, "token_acc": 0.8760404837079283, "train_speed(iter/s)": 0.100669 }, { "epoch": 2.5058548009367683, "grad_norm": 0.23344840109348297, "learning_rate": 1.448230872972568e-06, "loss": 0.3205883979797363, "memory(GiB)": 127.52, "step": 2140, "token_acc": 0.8896608528350288, "train_speed(iter/s)": 0.100665 }, { "epoch": 2.5117096018735365, "grad_norm": 0.2276953160762787, "learning_rate": 1.4149444503279297e-06, "loss": 0.32780184745788576, "memory(GiB)": 127.52, "step": 2145, "token_acc": 0.8763619018928553, "train_speed(iter/s)": 0.100666 }, { "epoch": 2.5175644028103044, "grad_norm": 0.23720286786556244, "learning_rate": 1.382015874482735e-06, "loss": 0.3210983037948608, "memory(GiB)": 127.52, "step": 2150, "token_acc": 0.8830952351167766, "train_speed(iter/s)": 0.100669 }, { "epoch": 2.5234192037470726, "grad_norm": 0.2429177612066269, "learning_rate": 1.3494465179831895e-06, "loss": 0.31808924674987793, "memory(GiB)": 127.52, "step": 2155, "token_acc": 0.8801182829610709, "train_speed(iter/s)": 0.100671 }, { "epoch": 2.529274004683841, "grad_norm": 0.2192358821630478, "learning_rate": 1.3172377384023393e-06, "loss": 0.3137265682220459, "memory(GiB)": 127.52, "step": 2160, "token_acc": 0.8851310631053786, "train_speed(iter/s)": 0.100675 }, { "epoch": 2.5351288056206087, "grad_norm": 0.22843384742736816, "learning_rate": 1.2853908782834722e-06, "loss": 0.31639652252197265, "memory(GiB)": 127.52, "step": 2165, "token_acc": 0.8930099545248551, "train_speed(iter/s)": 0.100673 }, { "epoch": 2.540983606557377, "grad_norm": 0.23414385318756104, "learning_rate": 1.2539072650841523e-06, "loss": 0.32384276390075684, "memory(GiB)": 127.52, "step": 2170, "token_acc": 0.8826712369541582, "train_speed(iter/s)": 0.100679 }, { "epoch": 2.5468384074941453, "grad_norm": 0.2386016696691513, "learning_rate": 1.2227882111209011e-06, "loss": 0.3276023864746094, "memory(GiB)": 127.52, "step": 2175, "token_acc": 0.876178791079083, "train_speed(iter/s)": 0.10068 }, { "epoch": 2.552693208430913, "grad_norm": 0.23498761653900146, "learning_rate": 1.1920350135144898e-06, "loss": 0.3207254409790039, "memory(GiB)": 127.52, "step": 2180, "token_acc": 0.8885690220875708, "train_speed(iter/s)": 0.100681 }, { "epoch": 2.5585480093676813, "grad_norm": 0.23011547327041626, "learning_rate": 1.1616489541358678e-06, "loss": 0.3184302806854248, "memory(GiB)": 127.52, "step": 2185, "token_acc": 0.8778273150286384, "train_speed(iter/s)": 0.100682 }, { "epoch": 2.5644028103044496, "grad_norm": 0.22844338417053223, "learning_rate": 1.1316312995527424e-06, "loss": 0.3216708183288574, "memory(GiB)": 127.52, "step": 2190, "token_acc": 0.8842230056468974, "train_speed(iter/s)": 0.100685 }, { "epoch": 2.570257611241218, "grad_norm": 0.23386669158935547, "learning_rate": 1.1019833009767744e-06, "loss": 0.3198892831802368, "memory(GiB)": 127.52, "step": 2195, "token_acc": 0.881730841074942, "train_speed(iter/s)": 0.100684 }, { "epoch": 2.576112412177986, "grad_norm": 0.23416638374328613, "learning_rate": 1.072706194211426e-06, "loss": 0.32181246280670167, "memory(GiB)": 127.52, "step": 2200, "token_acc": 0.8872248114887651, "train_speed(iter/s)": 0.100687 }, { "epoch": 2.581967213114754, "grad_norm": 0.232351616024971, "learning_rate": 1.0438011996004581e-06, "loss": 0.32013840675354005, "memory(GiB)": 127.52, "step": 2205, "token_acc": 0.8815920274367514, "train_speed(iter/s)": 0.100688 }, { "epoch": 2.5878220140515222, "grad_norm": 0.24018974602222443, "learning_rate": 1.0152695219770558e-06, "loss": 0.3074916124343872, "memory(GiB)": 127.52, "step": 2210, "token_acc": 0.8911461159004883, "train_speed(iter/s)": 0.100686 }, { "epoch": 2.5936768149882905, "grad_norm": 0.2339586764574051, "learning_rate": 9.871123506136037e-07, "loss": 0.3152151107788086, "memory(GiB)": 127.52, "step": 2215, "token_acc": 0.8945800996908322, "train_speed(iter/s)": 0.100689 }, { "epoch": 2.5995316159250583, "grad_norm": 0.23918944597244263, "learning_rate": 9.593308591721274e-07, "loss": 0.3115771532058716, "memory(GiB)": 127.52, "step": 2220, "token_acc": 0.8863534338516209, "train_speed(iter/s)": 0.100692 }, { "epoch": 2.6053864168618266, "grad_norm": 0.228268101811409, "learning_rate": 9.319262056553602e-07, "loss": 0.3226304531097412, "memory(GiB)": 127.52, "step": 2225, "token_acc": 0.8902835788085294, "train_speed(iter/s)": 0.10069 }, { "epoch": 2.611241217798595, "grad_norm": 0.23581595718860626, "learning_rate": 9.048995323584764e-07, "loss": 0.3258847713470459, "memory(GiB)": 127.52, "step": 2230, "token_acc": 0.8929581827894788, "train_speed(iter/s)": 0.10069 }, { "epoch": 2.617096018735363, "grad_norm": 0.4460615813732147, "learning_rate": 8.78251965821485e-07, "loss": 0.3083215236663818, "memory(GiB)": 127.52, "step": 2235, "token_acc": 0.8851051496528254, "train_speed(iter/s)": 0.10069 }, { "epoch": 2.6229508196721314, "grad_norm": 0.23269429802894592, "learning_rate": 8.519846167822665e-07, "loss": 0.31586997509002684, "memory(GiB)": 127.52, "step": 2240, "token_acc": 0.8981023709170914, "train_speed(iter/s)": 0.100691 }, { "epoch": 2.628805620608899, "grad_norm": 0.608095645904541, "learning_rate": 8.260985801302734e-07, "loss": 0.30504627227783204, "memory(GiB)": 127.52, "step": 2245, "token_acc": 0.8836382464618571, "train_speed(iter/s)": 0.100692 }, { "epoch": 2.6346604215456675, "grad_norm": 0.22992344200611115, "learning_rate": 8.005949348608977e-07, "loss": 0.31817898750305174, "memory(GiB)": 127.52, "step": 2250, "token_acc": 0.8803807403423412, "train_speed(iter/s)": 0.100694 }, { "epoch": 2.6405152224824358, "grad_norm": 0.2216484099626541, "learning_rate": 7.754747440304911e-07, "loss": 0.3218961000442505, "memory(GiB)": 127.52, "step": 2255, "token_acc": 0.8802025202800865, "train_speed(iter/s)": 0.1007 }, { "epoch": 2.6463700234192036, "grad_norm": 0.22643844783306122, "learning_rate": 7.507390547120541e-07, "loss": 0.31406736373901367, "memory(GiB)": 127.52, "step": 2260, "token_acc": 0.8841787048704839, "train_speed(iter/s)": 0.100704 }, { "epoch": 2.652224824355972, "grad_norm": 0.22945396602153778, "learning_rate": 7.263888979515954e-07, "loss": 0.32517061233520506, "memory(GiB)": 127.52, "step": 2265, "token_acc": 0.8788511831616095, "train_speed(iter/s)": 0.10071 }, { "epoch": 2.65807962529274, "grad_norm": 0.22719787061214447, "learning_rate": 7.024252887251548e-07, "loss": 0.31670680046081545, "memory(GiB)": 127.52, "step": 2270, "token_acc": 0.8838603030141137, "train_speed(iter/s)": 0.100707 }, { "epoch": 2.663934426229508, "grad_norm": 0.2364586889743805, "learning_rate": 6.788492258964896e-07, "loss": 0.3206209659576416, "memory(GiB)": 127.52, "step": 2275, "token_acc": 0.8808837716472833, "train_speed(iter/s)": 0.100707 }, { "epoch": 2.669789227166276, "grad_norm": 0.23205353319644928, "learning_rate": 6.556616921754489e-07, "loss": 0.3177974224090576, "memory(GiB)": 127.52, "step": 2280, "token_acc": 0.8846845210507196, "train_speed(iter/s)": 0.100709 }, { "epoch": 2.6756440281030445, "grad_norm": 0.23928001523017883, "learning_rate": 6.328636540770028e-07, "loss": 0.3218786001205444, "memory(GiB)": 127.52, "step": 2285, "token_acc": 0.8839321457165733, "train_speed(iter/s)": 0.10071 }, { "epoch": 2.6814988290398127, "grad_norm": 0.22948609292507172, "learning_rate": 6.10456061880963e-07, "loss": 0.32559771537780763, "memory(GiB)": 127.52, "step": 2290, "token_acc": 0.888954265344254, "train_speed(iter/s)": 0.10071 }, { "epoch": 2.687353629976581, "grad_norm": 0.22480416297912598, "learning_rate": 5.884398495923727e-07, "loss": 0.31432313919067384, "memory(GiB)": 127.52, "step": 2295, "token_acc": 0.8786473253733409, "train_speed(iter/s)": 0.100714 }, { "epoch": 2.693208430913349, "grad_norm": 0.49891427159309387, "learning_rate": 5.668159349025649e-07, "loss": 0.33366761207580564, "memory(GiB)": 127.52, "step": 2300, "token_acc": 0.8706380208333333, "train_speed(iter/s)": 0.100713 }, { "epoch": 2.699063231850117, "grad_norm": 0.23788191378116608, "learning_rate": 5.455852191509214e-07, "loss": 0.326168417930603, "memory(GiB)": 127.52, "step": 2305, "token_acc": 0.8757156059468948, "train_speed(iter/s)": 0.100714 }, { "epoch": 2.7049180327868854, "grad_norm": 0.23934431374073029, "learning_rate": 5.247485872873026e-07, "loss": 0.3131624460220337, "memory(GiB)": 127.52, "step": 2310, "token_acc": 0.8873159330925727, "train_speed(iter/s)": 0.100715 }, { "epoch": 2.710772833723653, "grad_norm": 0.22434021532535553, "learning_rate": 5.043069078351526e-07, "loss": 0.3083023548126221, "memory(GiB)": 127.52, "step": 2315, "token_acc": 0.8900379146919432, "train_speed(iter/s)": 0.10072 }, { "epoch": 2.7166276346604215, "grad_norm": 0.2241913378238678, "learning_rate": 4.842610328552999e-07, "loss": 0.31645286083221436, "memory(GiB)": 127.52, "step": 2320, "token_acc": 0.8860757524370778, "train_speed(iter/s)": 0.100719 }, { "epoch": 2.7224824355971897, "grad_norm": 0.22683191299438477, "learning_rate": 4.6461179791044806e-07, "loss": 0.3162517547607422, "memory(GiB)": 127.52, "step": 2325, "token_acc": 0.8806341851421645, "train_speed(iter/s)": 0.100722 }, { "epoch": 2.728337236533958, "grad_norm": 0.22332416474819183, "learning_rate": 4.453600220303378e-07, "loss": 0.3006160736083984, "memory(GiB)": 127.52, "step": 2330, "token_acc": 0.8811269139759368, "train_speed(iter/s)": 0.100726 }, { "epoch": 2.7341920374707263, "grad_norm": 0.2320730835199356, "learning_rate": 4.2650650767761535e-07, "loss": 0.3053130149841309, "memory(GiB)": 127.52, "step": 2335, "token_acc": 0.8909103410770822, "train_speed(iter/s)": 0.100726 }, { "epoch": 2.740046838407494, "grad_norm": 0.2575525939464569, "learning_rate": 4.0805204071437953e-07, "loss": 0.32894713878631593, "memory(GiB)": 127.52, "step": 2340, "token_acc": 0.880288983757294, "train_speed(iter/s)": 0.100724 }, { "epoch": 2.7459016393442623, "grad_norm": 0.2190413624048233, "learning_rate": 3.899973903694243e-07, "loss": 0.32172608375549316, "memory(GiB)": 127.52, "step": 2345, "token_acc": 0.8842697990204148, "train_speed(iter/s)": 0.100724 }, { "epoch": 2.7517564402810306, "grad_norm": 0.22509151697158813, "learning_rate": 3.72343309206179e-07, "loss": 0.31258511543273926, "memory(GiB)": 127.52, "step": 2350, "token_acc": 0.8854250593299245, "train_speed(iter/s)": 0.100723 }, { "epoch": 2.7576112412177984, "grad_norm": 0.22671233117580414, "learning_rate": 3.55090533091339e-07, "loss": 0.3143455028533936, "memory(GiB)": 127.52, "step": 2355, "token_acc": 0.896848520654861, "train_speed(iter/s)": 0.10072 }, { "epoch": 2.7634660421545667, "grad_norm": 0.21764405071735382, "learning_rate": 3.382397811641858e-07, "loss": 0.3072871208190918, "memory(GiB)": 127.52, "step": 2360, "token_acc": 0.8893455142073456, "train_speed(iter/s)": 0.100725 }, { "epoch": 2.769320843091335, "grad_norm": 0.22008980810642242, "learning_rate": 3.217917558066241e-07, "loss": 0.31331815719604494, "memory(GiB)": 127.52, "step": 2365, "token_acc": 0.8801702516246458, "train_speed(iter/s)": 0.100727 }, { "epoch": 2.775175644028103, "grad_norm": 0.2225882112979889, "learning_rate": 3.057471426138958e-07, "loss": 0.3275087833404541, "memory(GiB)": 127.52, "step": 2370, "token_acc": 0.8743533027834035, "train_speed(iter/s)": 0.100726 }, { "epoch": 2.781030444964871, "grad_norm": 0.22171831130981445, "learning_rate": 2.901066103660033e-07, "loss": 0.3129570484161377, "memory(GiB)": 127.52, "step": 2375, "token_acc": 0.8872727501597082, "train_speed(iter/s)": 0.100728 }, { "epoch": 2.7868852459016393, "grad_norm": 0.2355940192937851, "learning_rate": 2.7487081099983435e-07, "loss": 0.32728214263916017, "memory(GiB)": 127.52, "step": 2380, "token_acc": 0.882063511039243, "train_speed(iter/s)": 0.100731 }, { "epoch": 2.7927400468384076, "grad_norm": 0.21898697316646576, "learning_rate": 2.6004037958199167e-07, "loss": 0.31028578281402586, "memory(GiB)": 127.52, "step": 2385, "token_acc": 0.8959504867399893, "train_speed(iter/s)": 0.100732 }, { "epoch": 2.798594847775176, "grad_norm": 0.22940264642238617, "learning_rate": 2.4561593428231165e-07, "loss": 0.3168987274169922, "memory(GiB)": 127.52, "step": 2390, "token_acc": 0.9043824201593208, "train_speed(iter/s)": 0.100729 }, { "epoch": 2.8044496487119437, "grad_norm": 0.22128568589687347, "learning_rate": 2.3159807634811182e-07, "loss": 0.30646657943725586, "memory(GiB)": 127.52, "step": 2395, "token_acc": 0.890519620223563, "train_speed(iter/s)": 0.10073 }, { "epoch": 2.810304449648712, "grad_norm": 0.23035509884357452, "learning_rate": 2.1798739007911517e-07, "loss": 0.321412467956543, "memory(GiB)": 127.52, "step": 2400, "token_acc": 0.8813866834368367, "train_speed(iter/s)": 0.100729 }, { "epoch": 2.8161592505854802, "grad_norm": 0.22361230850219727, "learning_rate": 2.0478444280310206e-07, "loss": 0.314456582069397, "memory(GiB)": 127.52, "step": 2405, "token_acc": 0.8847936237191627, "train_speed(iter/s)": 0.100733 }, { "epoch": 2.822014051522248, "grad_norm": 0.248680979013443, "learning_rate": 1.919897848522656e-07, "loss": 0.31545486450195315, "memory(GiB)": 127.52, "step": 2410, "token_acc": 0.8842675175238047, "train_speed(iter/s)": 0.100732 }, { "epoch": 2.8278688524590163, "grad_norm": 0.2220403105020523, "learning_rate": 1.796039495402646e-07, "loss": 0.3194711923599243, "memory(GiB)": 127.52, "step": 2415, "token_acc": 0.889650254732648, "train_speed(iter/s)": 0.100731 }, { "epoch": 2.8337236533957846, "grad_norm": 0.23251083493232727, "learning_rate": 1.6762745313999795e-07, "loss": 0.32554826736450193, "memory(GiB)": 127.52, "step": 2420, "token_acc": 0.8688351785435834, "train_speed(iter/s)": 0.100728 }, { "epoch": 2.839578454332553, "grad_norm": 0.2339450716972351, "learning_rate": 1.5606079486208846e-07, "loss": 0.3137704372406006, "memory(GiB)": 127.52, "step": 2425, "token_acc": 0.8856111133651886, "train_speed(iter/s)": 0.100732 }, { "epoch": 2.845433255269321, "grad_norm": 0.22966544330120087, "learning_rate": 1.449044568340663e-07, "loss": 0.32210094928741456, "memory(GiB)": 127.52, "step": 2430, "token_acc": 0.8884470889772489, "train_speed(iter/s)": 0.100732 }, { "epoch": 2.851288056206089, "grad_norm": 0.24191494286060333, "learning_rate": 1.3415890408027932e-07, "loss": 0.31206402778625486, "memory(GiB)": 127.52, "step": 2435, "token_acc": 0.8830502196115786, "train_speed(iter/s)": 0.100731 }, { "epoch": 2.857142857142857, "grad_norm": 0.23956511914730072, "learning_rate": 1.2382458450250657e-07, "loss": 0.32455346584320066, "memory(GiB)": 127.52, "step": 2440, "token_acc": 0.8758227950966726, "train_speed(iter/s)": 0.100735 }, { "epoch": 2.8629976580796255, "grad_norm": 0.22552776336669922, "learning_rate": 1.1390192886129304e-07, "loss": 0.3120935678482056, "memory(GiB)": 127.52, "step": 2445, "token_acc": 0.897060631760815, "train_speed(iter/s)": 0.100735 }, { "epoch": 2.8688524590163933, "grad_norm": 0.2666381001472473, "learning_rate": 1.0439135075798634e-07, "loss": 0.3291801452636719, "memory(GiB)": 127.52, "step": 2450, "token_acc": 0.8820067150139295, "train_speed(iter/s)": 0.100741 }, { "epoch": 2.8747072599531616, "grad_norm": 0.22115741670131683, "learning_rate": 9.529324661750494e-08, "loss": 0.32175321578979493, "memory(GiB)": 127.52, "step": 2455, "token_acc": 0.8775227487104135, "train_speed(iter/s)": 0.100739 }, { "epoch": 2.88056206088993, "grad_norm": 0.22983959317207336, "learning_rate": 8.6607995671808e-08, "loss": 0.31844320297241213, "memory(GiB)": 127.52, "step": 2460, "token_acc": 0.8813101879265747, "train_speed(iter/s)": 0.10074 }, { "epoch": 2.8864168618266977, "grad_norm": 0.23733210563659668, "learning_rate": 7.833595994409248e-08, "loss": 0.3080190658569336, "memory(GiB)": 127.52, "step": 2465, "token_acc": 0.88289333750391, "train_speed(iter/s)": 0.100738 }, { "epoch": 2.892271662763466, "grad_norm": 0.24082650244235992, "learning_rate": 7.047748423370193e-08, "loss": 0.3234051465988159, "memory(GiB)": 127.52, "step": 2470, "token_acc": 0.8791906373996674, "train_speed(iter/s)": 0.100744 }, { "epoch": 2.898126463700234, "grad_norm": 0.24151204526424408, "learning_rate": 6.303289610175233e-08, "loss": 0.31094648838043215, "memory(GiB)": 127.52, "step": 2475, "token_acc": 0.8864608150470219, "train_speed(iter/s)": 0.100743 }, { "epoch": 2.9039812646370025, "grad_norm": 0.23166167736053467, "learning_rate": 5.6002505857480906e-08, "loss": 0.3175530910491943, "memory(GiB)": 127.52, "step": 2480, "token_acc": 0.8859342832291451, "train_speed(iter/s)": 0.100739 }, { "epoch": 2.9098360655737707, "grad_norm": 0.22753314673900604, "learning_rate": 4.938660654530969e-08, "loss": 0.3289816379547119, "memory(GiB)": 127.52, "step": 2485, "token_acc": 0.8799638876393262, "train_speed(iter/s)": 0.100739 }, { "epoch": 2.9156908665105385, "grad_norm": 0.22824768722057343, "learning_rate": 4.318547393263317e-08, "loss": 0.33161611557006837, "memory(GiB)": 127.52, "step": 2490, "token_acc": 0.8840203211591419, "train_speed(iter/s)": 0.100737 }, { "epoch": 2.921545667447307, "grad_norm": 0.2232208400964737, "learning_rate": 3.739936649832188e-08, "loss": 0.31346931457519533, "memory(GiB)": 127.52, "step": 2495, "token_acc": 0.8866209251707488, "train_speed(iter/s)": 0.100742 }, { "epoch": 2.927400468384075, "grad_norm": 0.22846031188964844, "learning_rate": 3.2028525421946563e-08, "loss": 0.31502933502197267, "memory(GiB)": 127.52, "step": 2500, "token_acc": 0.8958872772065662, "train_speed(iter/s)": 0.100746 }, { "epoch": 2.933255269320843, "grad_norm": 0.22012905776500702, "learning_rate": 2.70731745737296e-08, "loss": 0.317963695526123, "memory(GiB)": 127.52, "step": 2505, "token_acc": 0.8870393801646438, "train_speed(iter/s)": 0.100749 }, { "epoch": 2.939110070257611, "grad_norm": 0.22778548300266266, "learning_rate": 2.2533520505211294e-08, "loss": 0.3122371196746826, "memory(GiB)": 127.52, "step": 2510, "token_acc": 0.888907967032967, "train_speed(iter/s)": 0.100751 }, { "epoch": 2.9449648711943794, "grad_norm": 0.22804217040538788, "learning_rate": 1.8409752440639027e-08, "loss": 0.3041959524154663, "memory(GiB)": 127.52, "step": 2515, "token_acc": 0.8861121607989981, "train_speed(iter/s)": 0.100754 }, { "epoch": 2.9508196721311473, "grad_norm": 0.2233329713344574, "learning_rate": 1.470204226908134e-08, "loss": 0.32151806354522705, "memory(GiB)": 127.52, "step": 2520, "token_acc": 0.8879425846286458, "train_speed(iter/s)": 0.100749 }, { "epoch": 2.9566744730679155, "grad_norm": 0.24781863391399384, "learning_rate": 1.1410544537263645e-08, "loss": 0.32978765964508056, "memory(GiB)": 127.52, "step": 2525, "token_acc": 0.8869459116971757, "train_speed(iter/s)": 0.100749 }, { "epoch": 2.962529274004684, "grad_norm": 0.22210603952407837, "learning_rate": 8.535396443124511e-09, "loss": 0.30834412574768066, "memory(GiB)": 127.52, "step": 2530, "token_acc": 0.8843790902885199, "train_speed(iter/s)": 0.100751 }, { "epoch": 2.968384074941452, "grad_norm": 0.22260542213916779, "learning_rate": 6.076717830098e-09, "loss": 0.31018791198730467, "memory(GiB)": 127.52, "step": 2535, "token_acc": 0.8947010997127103, "train_speed(iter/s)": 0.10075 }, { "epoch": 2.9742388758782203, "grad_norm": 0.24026013910770416, "learning_rate": 4.034611182121007e-09, "loss": 0.3117814064025879, "memory(GiB)": 127.52, "step": 2540, "token_acc": 0.8939134081534292, "train_speed(iter/s)": 0.100749 }, { "epoch": 2.980093676814988, "grad_norm": 0.22812722623348236, "learning_rate": 2.40916161935445e-09, "loss": 0.31728358268737794, "memory(GiB)": 127.52, "step": 2545, "token_acc": 0.883892058363205, "train_speed(iter/s)": 0.10075 }, { "epoch": 2.9859484777517564, "grad_norm": 0.2219596952199936, "learning_rate": 1.2004368946427758e-09, "loss": 0.31175081729888915, "memory(GiB)": 127.52, "step": 2550, "token_acc": 0.8867498701584854, "train_speed(iter/s)": 0.100752 }, { "epoch": 2.9918032786885247, "grad_norm": 0.22541016340255737, "learning_rate": 4.084873906851083e-10, "loss": 0.31843390464782717, "memory(GiB)": 127.52, "step": 2555, "token_acc": 0.893655570084918, "train_speed(iter/s)": 0.10075 }, { "epoch": 2.9976580796252925, "grad_norm": 0.22078001499176025, "learning_rate": 3.334611793692766e-11, "loss": 0.31821532249450685, "memory(GiB)": 127.52, "step": 2560, "token_acc": 0.8979642133800124, "train_speed(iter/s)": 0.100751 } ], "logging_steps": 5, "max_steps": 2562, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1575512474484736.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }