{ "best_global_step": 5161, "best_metric": 0.37820467, "best_model_checkpoint": "/home/work/newrag/qwen3/ms-swift-finetuning/output/qwen2.5-bnk-phase2/v1-20250804-040453/checkpoint-5161", "epoch": 1.0, "eval_steps": 500, "global_step": 5161, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00019377028532674515, "grad_norm": 0.8763577342033386, "learning_rate": 1.1583011583011583e-06, "loss": 0.9684686660766602, "memory(GiB)": 55.14, "step": 1, "token_acc": 0.7912246865959498, "train_speed(iter/s)": 0.010196 }, { "epoch": 0.0003875405706534903, "grad_norm": 0.848241925239563, "learning_rate": 2.3166023166023166e-06, "loss": 0.9568785429000854, "memory(GiB)": 55.14, "step": 2, "token_acc": 0.7863654591632134, "train_speed(iter/s)": 0.015869 }, { "epoch": 0.0005813108559802354, "grad_norm": 0.8443148136138916, "learning_rate": 3.4749034749034742e-06, "loss": 0.8965896964073181, "memory(GiB)": 66.69, "step": 3, "token_acc": 0.7995744899005435, "train_speed(iter/s)": 0.019428 }, { "epoch": 0.0007750811413069806, "grad_norm": 0.8933870196342468, "learning_rate": 4.633204633204633e-06, "loss": 0.9800578355789185, "memory(GiB)": 78.26, "step": 4, "token_acc": 0.7830852365415987, "train_speed(iter/s)": 0.021886 }, { "epoch": 0.0009688514266337257, "grad_norm": 0.9806432723999023, "learning_rate": 5.791505791505791e-06, "loss": 1.0483475923538208, "memory(GiB)": 78.26, "step": 5, "token_acc": 0.7753844505170658, "train_speed(iter/s)": 0.023671 }, { "epoch": 0.0011626217119604708, "grad_norm": 0.897155225276947, "learning_rate": 6.9498069498069484e-06, "loss": 1.0103771686553955, "memory(GiB)": 78.26, "step": 6, "token_acc": 0.7809635036496351, "train_speed(iter/s)": 0.025066 }, { "epoch": 0.0013563919972872161, "grad_norm": 0.9799181818962097, "learning_rate": 8.108108108108107e-06, "loss": 1.0178213119506836, "memory(GiB)": 78.26, "step": 7, "token_acc": 0.7772091722595078, "train_speed(iter/s)": 0.02613 }, { "epoch": 0.0015501622826139612, "grad_norm": 1.0471524000167847, "learning_rate": 9.266409266409266e-06, "loss": 1.1097385883331299, "memory(GiB)": 78.26, "step": 8, "token_acc": 0.7599808682440987, "train_speed(iter/s)": 0.026976 }, { "epoch": 0.0017439325679407063, "grad_norm": 0.8407416939735413, "learning_rate": 1.0424710424710423e-05, "loss": 0.9527825117111206, "memory(GiB)": 78.26, "step": 9, "token_acc": 0.7823414284532899, "train_speed(iter/s)": 0.027706 }, { "epoch": 0.0019377028532674514, "grad_norm": 0.8665539622306824, "learning_rate": 1.1583011583011582e-05, "loss": 1.0066951513290405, "memory(GiB)": 78.26, "step": 10, "token_acc": 0.7773281507906927, "train_speed(iter/s)": 0.028311 }, { "epoch": 0.0021314731385941965, "grad_norm": 0.8096230030059814, "learning_rate": 1.274131274131274e-05, "loss": 0.9802660346031189, "memory(GiB)": 78.26, "step": 11, "token_acc": 0.7735733563339918, "train_speed(iter/s)": 0.028814 }, { "epoch": 0.0023252434239209416, "grad_norm": 0.6567773818969727, "learning_rate": 1.3899613899613897e-05, "loss": 0.9128895998001099, "memory(GiB)": 78.26, "step": 12, "token_acc": 0.7855364418288818, "train_speed(iter/s)": 0.029273 }, { "epoch": 0.0025190137092476867, "grad_norm": 0.5237937569618225, "learning_rate": 1.5057915057915056e-05, "loss": 0.9058799743652344, "memory(GiB)": 78.26, "step": 13, "token_acc": 0.7805104547360853, "train_speed(iter/s)": 0.029674 }, { "epoch": 0.0027127839945744322, "grad_norm": 0.4156467914581299, "learning_rate": 1.6216216216216215e-05, "loss": 0.8571181893348694, "memory(GiB)": 78.26, "step": 14, "token_acc": 0.7879932696276072, "train_speed(iter/s)": 0.030038 }, { "epoch": 0.0029065542799011773, "grad_norm": 0.2674636244773865, "learning_rate": 1.7374517374517374e-05, "loss": 0.7994478344917297, "memory(GiB)": 78.26, "step": 15, "token_acc": 0.7901431703158408, "train_speed(iter/s)": 0.030338 }, { "epoch": 0.0031003245652279224, "grad_norm": 0.24086351692676544, "learning_rate": 1.8532818532818533e-05, "loss": 0.8486983180046082, "memory(GiB)": 78.26, "step": 16, "token_acc": 0.7774675145147912, "train_speed(iter/s)": 0.0306 }, { "epoch": 0.0032940948505546675, "grad_norm": 0.21650397777557373, "learning_rate": 1.9691119691119688e-05, "loss": 0.8528725504875183, "memory(GiB)": 78.26, "step": 17, "token_acc": 0.775322540195614, "train_speed(iter/s)": 0.030832 }, { "epoch": 0.0034878651358814126, "grad_norm": 0.15441368520259857, "learning_rate": 2.0849420849420847e-05, "loss": 0.803363025188446, "memory(GiB)": 78.26, "step": 18, "token_acc": 0.7852022395479044, "train_speed(iter/s)": 0.031049 }, { "epoch": 0.0036816354212081577, "grad_norm": 0.14429929852485657, "learning_rate": 2.200772200772201e-05, "loss": 0.8013445734977722, "memory(GiB)": 78.26, "step": 19, "token_acc": 0.7839805825242718, "train_speed(iter/s)": 0.031259 }, { "epoch": 0.003875405706534903, "grad_norm": 0.1348273605108261, "learning_rate": 2.3166023166023165e-05, "loss": 0.7239266633987427, "memory(GiB)": 78.26, "step": 20, "token_acc": 0.8037726258718549, "train_speed(iter/s)": 0.031439 }, { "epoch": 0.004069175991861648, "grad_norm": 0.1491738110780716, "learning_rate": 2.4324324324324324e-05, "loss": 0.8465297222137451, "memory(GiB)": 78.26, "step": 21, "token_acc": 0.7757449962935508, "train_speed(iter/s)": 0.031603 }, { "epoch": 0.004262946277188393, "grad_norm": 0.167159304022789, "learning_rate": 2.548262548262548e-05, "loss": 0.736566424369812, "memory(GiB)": 78.26, "step": 22, "token_acc": 0.8000630083629282, "train_speed(iter/s)": 0.03176 }, { "epoch": 0.004456716562515138, "grad_norm": 0.15523645281791687, "learning_rate": 2.6640926640926638e-05, "loss": 0.6989044547080994, "memory(GiB)": 78.26, "step": 23, "token_acc": 0.8113520593800029, "train_speed(iter/s)": 0.031902 }, { "epoch": 0.004650486847841883, "grad_norm": 0.16765183210372925, "learning_rate": 2.7799227799227794e-05, "loss": 0.6959363222122192, "memory(GiB)": 78.26, "step": 24, "token_acc": 0.8058972412047583, "train_speed(iter/s)": 0.03202 }, { "epoch": 0.004844257133168628, "grad_norm": 0.16556301712989807, "learning_rate": 2.8957528957528956e-05, "loss": 0.7282753586769104, "memory(GiB)": 78.26, "step": 25, "token_acc": 0.8006341844340851, "train_speed(iter/s)": 0.03212 }, { "epoch": 0.005038027418495373, "grad_norm": 0.15321362018585205, "learning_rate": 3.011583011583011e-05, "loss": 0.7563999891281128, "memory(GiB)": 78.26, "step": 26, "token_acc": 0.7961533964690751, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.0052317977038221185, "grad_norm": 0.14468924701213837, "learning_rate": 3.1274131274131274e-05, "loss": 0.7149499654769897, "memory(GiB)": 78.26, "step": 27, "token_acc": 0.8046370478271425, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.0054255679891488644, "grad_norm": 0.13254722952842712, "learning_rate": 3.243243243243243e-05, "loss": 0.7311335802078247, "memory(GiB)": 78.26, "step": 28, "token_acc": 0.8027618690788417, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.0056193382744756095, "grad_norm": 0.12081703543663025, "learning_rate": 3.3590733590733585e-05, "loss": 0.7195248603820801, "memory(GiB)": 78.26, "step": 29, "token_acc": 0.8041594893099605, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.005813108559802355, "grad_norm": 0.11528925597667694, "learning_rate": 3.474903474903475e-05, "loss": 0.6729803681373596, "memory(GiB)": 78.26, "step": 30, "token_acc": 0.8157344552564334, "train_speed(iter/s)": 0.03258 }, { "epoch": 0.0060068788451291, "grad_norm": 0.13018707931041718, "learning_rate": 3.59073359073359e-05, "loss": 0.7402101159095764, "memory(GiB)": 78.26, "step": 31, "token_acc": 0.7976209655764996, "train_speed(iter/s)": 0.032663 }, { "epoch": 0.006200649130455845, "grad_norm": 0.12700200080871582, "learning_rate": 3.7065637065637065e-05, "loss": 0.7580655813217163, "memory(GiB)": 78.26, "step": 32, "token_acc": 0.7951273532668881, "train_speed(iter/s)": 0.032732 }, { "epoch": 0.00639441941578259, "grad_norm": 0.12156156450510025, "learning_rate": 3.822393822393822e-05, "loss": 0.6386986374855042, "memory(GiB)": 78.26, "step": 33, "token_acc": 0.8224443651076996, "train_speed(iter/s)": 0.032803 }, { "epoch": 0.006588189701109335, "grad_norm": 0.13328500092029572, "learning_rate": 3.9382239382239376e-05, "loss": 0.7081706523895264, "memory(GiB)": 78.26, "step": 34, "token_acc": 0.8066472273114339, "train_speed(iter/s)": 0.03287 }, { "epoch": 0.00678195998643608, "grad_norm": 0.12701700627803802, "learning_rate": 4.054054054054054e-05, "loss": 0.6880778074264526, "memory(GiB)": 78.26, "step": 35, "token_acc": 0.8144979985743269, "train_speed(iter/s)": 0.032933 }, { "epoch": 0.006975730271762825, "grad_norm": 0.13268497586250305, "learning_rate": 4.1698841698841694e-05, "loss": 0.7258732914924622, "memory(GiB)": 78.26, "step": 36, "token_acc": 0.8062436274215798, "train_speed(iter/s)": 0.033003 }, { "epoch": 0.00716950055708957, "grad_norm": 0.1224246695637703, "learning_rate": 4.285714285714285e-05, "loss": 0.7171946167945862, "memory(GiB)": 78.26, "step": 37, "token_acc": 0.8046310103255212, "train_speed(iter/s)": 0.033068 }, { "epoch": 0.007363270842416315, "grad_norm": 0.12160563468933105, "learning_rate": 4.401544401544402e-05, "loss": 0.7064481377601624, "memory(GiB)": 78.26, "step": 38, "token_acc": 0.8088185898806071, "train_speed(iter/s)": 0.033127 }, { "epoch": 0.0075570411277430605, "grad_norm": 0.11494564265012741, "learning_rate": 4.5173745173745174e-05, "loss": 0.7347713708877563, "memory(GiB)": 78.26, "step": 39, "token_acc": 0.798022857441745, "train_speed(iter/s)": 0.033179 }, { "epoch": 0.007750811413069806, "grad_norm": 0.11666765064001083, "learning_rate": 4.633204633204633e-05, "loss": 0.759428083896637, "memory(GiB)": 78.26, "step": 40, "token_acc": 0.7921686007998233, "train_speed(iter/s)": 0.033222 }, { "epoch": 0.007944581698396552, "grad_norm": 0.11888981610536575, "learning_rate": 4.7490347490347485e-05, "loss": 0.7375974655151367, "memory(GiB)": 78.26, "step": 41, "token_acc": 0.8020120724346076, "train_speed(iter/s)": 0.033273 }, { "epoch": 0.008138351983723296, "grad_norm": 0.11488386988639832, "learning_rate": 4.864864864864865e-05, "loss": 0.696284294128418, "memory(GiB)": 78.26, "step": 42, "token_acc": 0.8061018470051596, "train_speed(iter/s)": 0.033319 }, { "epoch": 0.008332122269050042, "grad_norm": 0.1082736924290657, "learning_rate": 4.98069498069498e-05, "loss": 0.6931319832801819, "memory(GiB)": 78.26, "step": 43, "token_acc": 0.8120705042391789, "train_speed(iter/s)": 0.033355 }, { "epoch": 0.008525892554376786, "grad_norm": 0.12314610928297043, "learning_rate": 5.096525096525096e-05, "loss": 0.7261099815368652, "memory(GiB)": 78.26, "step": 44, "token_acc": 0.8016441410059914, "train_speed(iter/s)": 0.033397 }, { "epoch": 0.008719662839703532, "grad_norm": 0.12343846261501312, "learning_rate": 5.212355212355212e-05, "loss": 0.6880357265472412, "memory(GiB)": 78.26, "step": 45, "token_acc": 0.8112455396966993, "train_speed(iter/s)": 0.033441 }, { "epoch": 0.008913433125030276, "grad_norm": 0.11987273395061493, "learning_rate": 5.3281853281853276e-05, "loss": 0.6877344250679016, "memory(GiB)": 78.26, "step": 46, "token_acc": 0.8103002813343285, "train_speed(iter/s)": 0.033486 }, { "epoch": 0.009107203410357022, "grad_norm": 0.1239103302359581, "learning_rate": 5.444015444015443e-05, "loss": 0.7504675388336182, "memory(GiB)": 78.26, "step": 47, "token_acc": 0.7963265423078025, "train_speed(iter/s)": 0.033526 }, { "epoch": 0.009300973695683766, "grad_norm": 0.11254343390464783, "learning_rate": 5.559845559845559e-05, "loss": 0.652977466583252, "memory(GiB)": 78.26, "step": 48, "token_acc": 0.8192933347417345, "train_speed(iter/s)": 0.033559 }, { "epoch": 0.009494743981010512, "grad_norm": 0.13146735727787018, "learning_rate": 5.6756756756756757e-05, "loss": 0.7184028029441833, "memory(GiB)": 78.26, "step": 49, "token_acc": 0.8021164329650468, "train_speed(iter/s)": 0.033598 }, { "epoch": 0.009688514266337257, "grad_norm": 0.11965050548315048, "learning_rate": 5.791505791505791e-05, "loss": 0.6229060888290405, "memory(GiB)": 78.26, "step": 50, "token_acc": 0.8271026669398988, "train_speed(iter/s)": 0.03363 }, { "epoch": 0.009882284551664003, "grad_norm": 0.12212494015693665, "learning_rate": 5.907335907335907e-05, "loss": 0.698147177696228, "memory(GiB)": 78.26, "step": 51, "token_acc": 0.8134602899805237, "train_speed(iter/s)": 0.033662 }, { "epoch": 0.010076054836990747, "grad_norm": 0.1269523799419403, "learning_rate": 6.023166023166022e-05, "loss": 0.6502314805984497, "memory(GiB)": 78.26, "step": 52, "token_acc": 0.8226499256566991, "train_speed(iter/s)": 0.033693 }, { "epoch": 0.010269825122317493, "grad_norm": 0.13087137043476105, "learning_rate": 6.138996138996139e-05, "loss": 0.6692061424255371, "memory(GiB)": 78.26, "step": 53, "token_acc": 0.8201304240156289, "train_speed(iter/s)": 0.033727 }, { "epoch": 0.010463595407644237, "grad_norm": 0.14609168469905853, "learning_rate": 6.254826254826255e-05, "loss": 0.7044385671615601, "memory(GiB)": 78.26, "step": 54, "token_acc": 0.8073001694132143, "train_speed(iter/s)": 0.033764 }, { "epoch": 0.010657365692970983, "grad_norm": 0.12461866438388824, "learning_rate": 6.37065637065637e-05, "loss": 0.6916706562042236, "memory(GiB)": 78.26, "step": 55, "token_acc": 0.8121318182938552, "train_speed(iter/s)": 0.033788 }, { "epoch": 0.010851135978297729, "grad_norm": 0.12224052846431732, "learning_rate": 6.486486486486486e-05, "loss": 0.6395901441574097, "memory(GiB)": 78.26, "step": 56, "token_acc": 0.824472191901174, "train_speed(iter/s)": 0.033815 }, { "epoch": 0.011044906263624473, "grad_norm": 0.1317582130432129, "learning_rate": 6.602316602316601e-05, "loss": 0.6729621291160583, "memory(GiB)": 78.26, "step": 57, "token_acc": 0.8167814800579487, "train_speed(iter/s)": 0.033843 }, { "epoch": 0.011238676548951219, "grad_norm": 0.12963414192199707, "learning_rate": 6.718146718146717e-05, "loss": 0.667634129524231, "memory(GiB)": 78.26, "step": 58, "token_acc": 0.8161623128430612, "train_speed(iter/s)": 0.033872 }, { "epoch": 0.011432446834277963, "grad_norm": 0.14076951146125793, "learning_rate": 6.833976833976833e-05, "loss": 0.659672200679779, "memory(GiB)": 78.26, "step": 59, "token_acc": 0.8184602051133901, "train_speed(iter/s)": 0.033893 }, { "epoch": 0.01162621711960471, "grad_norm": 0.1304904669523239, "learning_rate": 6.94980694980695e-05, "loss": 0.6900058388710022, "memory(GiB)": 78.26, "step": 60, "token_acc": 0.8142272123233802, "train_speed(iter/s)": 0.033915 }, { "epoch": 0.011819987404931454, "grad_norm": 0.12284844368696213, "learning_rate": 7.065637065637065e-05, "loss": 0.6068597435951233, "memory(GiB)": 78.26, "step": 61, "token_acc": 0.8333287809674688, "train_speed(iter/s)": 0.03394 }, { "epoch": 0.0120137576902582, "grad_norm": 0.16575318574905396, "learning_rate": 7.18146718146718e-05, "loss": 0.6391609311103821, "memory(GiB)": 78.26, "step": 62, "token_acc": 0.8243709681258777, "train_speed(iter/s)": 0.033959 }, { "epoch": 0.012207527975584944, "grad_norm": 0.1536901593208313, "learning_rate": 7.297297297297297e-05, "loss": 0.6720226407051086, "memory(GiB)": 78.26, "step": 63, "token_acc": 0.8132890573603584, "train_speed(iter/s)": 0.03398 }, { "epoch": 0.01240129826091169, "grad_norm": 0.12783144414424896, "learning_rate": 7.413127413127413e-05, "loss": 0.6299335956573486, "memory(GiB)": 78.26, "step": 64, "token_acc": 0.8247123041659774, "train_speed(iter/s)": 0.033999 }, { "epoch": 0.012595068546238434, "grad_norm": 0.12650437653064728, "learning_rate": 7.528957528957529e-05, "loss": 0.5913993120193481, "memory(GiB)": 78.26, "step": 65, "token_acc": 0.8381598287733052, "train_speed(iter/s)": 0.034015 }, { "epoch": 0.01278883883156518, "grad_norm": 0.13705813884735107, "learning_rate": 7.644787644787644e-05, "loss": 0.6303204298019409, "memory(GiB)": 78.26, "step": 66, "token_acc": 0.8255334138486312, "train_speed(iter/s)": 0.034034 }, { "epoch": 0.012982609116891924, "grad_norm": 0.15545400977134705, "learning_rate": 7.76061776061776e-05, "loss": 0.6292211413383484, "memory(GiB)": 78.26, "step": 67, "token_acc": 0.8286270117314154, "train_speed(iter/s)": 0.034055 }, { "epoch": 0.01317637940221867, "grad_norm": 0.14907345175743103, "learning_rate": 7.876447876447875e-05, "loss": 0.6275659799575806, "memory(GiB)": 78.26, "step": 68, "token_acc": 0.8254116669263935, "train_speed(iter/s)": 0.03407 }, { "epoch": 0.013370149687545414, "grad_norm": 0.15840691328048706, "learning_rate": 7.992277992277992e-05, "loss": 0.6730965375900269, "memory(GiB)": 78.26, "step": 69, "token_acc": 0.8142665820821879, "train_speed(iter/s)": 0.034091 }, { "epoch": 0.01356391997287216, "grad_norm": 0.14479568600654602, "learning_rate": 8.108108108108108e-05, "loss": 0.6207985877990723, "memory(GiB)": 78.26, "step": 70, "token_acc": 0.8265187594377976, "train_speed(iter/s)": 0.034111 }, { "epoch": 0.013757690258198904, "grad_norm": 0.13668787479400635, "learning_rate": 8.223938223938223e-05, "loss": 0.6429966688156128, "memory(GiB)": 78.26, "step": 71, "token_acc": 0.8240099449476115, "train_speed(iter/s)": 0.034127 }, { "epoch": 0.01395146054352565, "grad_norm": 0.14420422911643982, "learning_rate": 8.339768339768339e-05, "loss": 0.6110091805458069, "memory(GiB)": 78.26, "step": 72, "token_acc": 0.8311519082643192, "train_speed(iter/s)": 0.034142 }, { "epoch": 0.014145230828852395, "grad_norm": 0.16625623404979706, "learning_rate": 8.455598455598454e-05, "loss": 0.6166950464248657, "memory(GiB)": 78.26, "step": 73, "token_acc": 0.8307741268199539, "train_speed(iter/s)": 0.034155 }, { "epoch": 0.01433900111417914, "grad_norm": 0.14389079809188843, "learning_rate": 8.57142857142857e-05, "loss": 0.5988722443580627, "memory(GiB)": 78.26, "step": 74, "token_acc": 0.8348723657246945, "train_speed(iter/s)": 0.034166 }, { "epoch": 0.014532771399505887, "grad_norm": 0.18358756601810455, "learning_rate": 8.687258687258685e-05, "loss": 0.7113780379295349, "memory(GiB)": 78.26, "step": 75, "token_acc": 0.8083578854685225, "train_speed(iter/s)": 0.034184 }, { "epoch": 0.01472654168483263, "grad_norm": 0.1523071825504303, "learning_rate": 8.803088803088804e-05, "loss": 0.6291996240615845, "memory(GiB)": 78.26, "step": 76, "token_acc": 0.8278446372484995, "train_speed(iter/s)": 0.0342 }, { "epoch": 0.014920311970159377, "grad_norm": 0.16895808279514313, "learning_rate": 8.918918918918919e-05, "loss": 0.6497290730476379, "memory(GiB)": 78.26, "step": 77, "token_acc": 0.819365872911459, "train_speed(iter/s)": 0.034215 }, { "epoch": 0.015114082255486121, "grad_norm": 0.17319287359714508, "learning_rate": 9.034749034749035e-05, "loss": 0.5960345268249512, "memory(GiB)": 78.26, "step": 78, "token_acc": 0.8366524602867106, "train_speed(iter/s)": 0.034224 }, { "epoch": 0.015307852540812867, "grad_norm": 0.14983628690242767, "learning_rate": 9.15057915057915e-05, "loss": 0.6000748872756958, "memory(GiB)": 78.26, "step": 79, "token_acc": 0.8339163897148855, "train_speed(iter/s)": 0.034241 }, { "epoch": 0.015501622826139611, "grad_norm": 0.14565478265285492, "learning_rate": 9.266409266409266e-05, "loss": 0.6301745772361755, "memory(GiB)": 78.26, "step": 80, "token_acc": 0.8267408675799087, "train_speed(iter/s)": 0.034254 }, { "epoch": 0.015695393111466355, "grad_norm": 0.16253505647182465, "learning_rate": 9.382239382239381e-05, "loss": 0.6245601177215576, "memory(GiB)": 78.26, "step": 81, "token_acc": 0.8304850012464338, "train_speed(iter/s)": 0.034267 }, { "epoch": 0.015889163396793103, "grad_norm": 0.15641474723815918, "learning_rate": 9.498069498069497e-05, "loss": 0.6229044795036316, "memory(GiB)": 78.26, "step": 82, "token_acc": 0.8290063166096224, "train_speed(iter/s)": 0.034278 }, { "epoch": 0.016082933682119847, "grad_norm": 0.19741681218147278, "learning_rate": 9.613899613899614e-05, "loss": 0.5958088636398315, "memory(GiB)": 78.26, "step": 83, "token_acc": 0.8380999073922755, "train_speed(iter/s)": 0.034294 }, { "epoch": 0.01627670396744659, "grad_norm": 0.16509681940078735, "learning_rate": 9.72972972972973e-05, "loss": 0.652334988117218, "memory(GiB)": 78.26, "step": 84, "token_acc": 0.8223709473915191, "train_speed(iter/s)": 0.034307 }, { "epoch": 0.016470474252773336, "grad_norm": 0.17223341763019562, "learning_rate": 9.845559845559845e-05, "loss": 0.655316174030304, "memory(GiB)": 78.26, "step": 85, "token_acc": 0.82221954379727, "train_speed(iter/s)": 0.034319 }, { "epoch": 0.016664244538100084, "grad_norm": 0.1611337959766388, "learning_rate": 9.96138996138996e-05, "loss": 0.6308309435844421, "memory(GiB)": 78.26, "step": 86, "token_acc": 0.8256332148699723, "train_speed(iter/s)": 0.034331 }, { "epoch": 0.016858014823426828, "grad_norm": 0.1462010145187378, "learning_rate": 0.00010077220077220076, "loss": 0.5816379189491272, "memory(GiB)": 78.26, "step": 87, "token_acc": 0.8376213592233009, "train_speed(iter/s)": 0.03434 }, { "epoch": 0.017051785108753572, "grad_norm": 0.15553230047225952, "learning_rate": 0.00010193050193050192, "loss": 0.5978987216949463, "memory(GiB)": 78.26, "step": 88, "token_acc": 0.8393552427369511, "train_speed(iter/s)": 0.03435 }, { "epoch": 0.017245555394080316, "grad_norm": 0.172433540225029, "learning_rate": 0.00010308880308880307, "loss": 0.6419100165367126, "memory(GiB)": 78.26, "step": 89, "token_acc": 0.8243931496649293, "train_speed(iter/s)": 0.034363 }, { "epoch": 0.017439325679407064, "grad_norm": 0.17001327872276306, "learning_rate": 0.00010424710424710424, "loss": 0.6268438100814819, "memory(GiB)": 78.26, "step": 90, "token_acc": 0.8299691153761306, "train_speed(iter/s)": 0.034376 }, { "epoch": 0.017633095964733808, "grad_norm": 0.19090093672275543, "learning_rate": 0.0001054054054054054, "loss": 0.6875048875808716, "memory(GiB)": 78.26, "step": 91, "token_acc": 0.8123634272570442, "train_speed(iter/s)": 0.034391 }, { "epoch": 0.017826866250060552, "grad_norm": 0.16166290640830994, "learning_rate": 0.00010656370656370655, "loss": 0.5984062552452087, "memory(GiB)": 78.26, "step": 92, "token_acc": 0.8348241568976055, "train_speed(iter/s)": 0.034401 }, { "epoch": 0.0180206365353873, "grad_norm": 0.14463870227336884, "learning_rate": 0.00010772200772200771, "loss": 0.5481261014938354, "memory(GiB)": 78.26, "step": 93, "token_acc": 0.8472176412382793, "train_speed(iter/s)": 0.034409 }, { "epoch": 0.018214406820714044, "grad_norm": 0.17527909576892853, "learning_rate": 0.00010888030888030886, "loss": 0.6553927063941956, "memory(GiB)": 78.26, "step": 94, "token_acc": 0.8222487233587132, "train_speed(iter/s)": 0.034415 }, { "epoch": 0.01840817710604079, "grad_norm": 0.16232283413410187, "learning_rate": 0.00011003861003861002, "loss": 0.6176364421844482, "memory(GiB)": 78.26, "step": 95, "token_acc": 0.8296051451559177, "train_speed(iter/s)": 0.034422 }, { "epoch": 0.018601947391367533, "grad_norm": 0.1550573855638504, "learning_rate": 0.00011119691119691117, "loss": 0.5948160886764526, "memory(GiB)": 78.26, "step": 96, "token_acc": 0.8375553097345133, "train_speed(iter/s)": 0.034429 }, { "epoch": 0.01879571767669428, "grad_norm": 0.16793379187583923, "learning_rate": 0.00011235521235521234, "loss": 0.6064925193786621, "memory(GiB)": 78.26, "step": 97, "token_acc": 0.8337825430204662, "train_speed(iter/s)": 0.034438 }, { "epoch": 0.018989487962021025, "grad_norm": 0.17881280183792114, "learning_rate": 0.00011351351351351351, "loss": 0.5967330932617188, "memory(GiB)": 78.26, "step": 98, "token_acc": 0.829712168876606, "train_speed(iter/s)": 0.03445 }, { "epoch": 0.01918325824734777, "grad_norm": 0.1565878838300705, "learning_rate": 0.00011467181467181467, "loss": 0.5600182414054871, "memory(GiB)": 78.26, "step": 99, "token_acc": 0.8435590667538151, "train_speed(iter/s)": 0.034461 }, { "epoch": 0.019377028532674513, "grad_norm": 0.15280094742774963, "learning_rate": 0.00011583011583011582, "loss": 0.5975373983383179, "memory(GiB)": 78.26, "step": 100, "token_acc": 0.8333461057041408, "train_speed(iter/s)": 0.034468 }, { "epoch": 0.01957079881800126, "grad_norm": 0.15495073795318604, "learning_rate": 0.00011698841698841698, "loss": 0.6017282009124756, "memory(GiB)": 78.26, "step": 101, "token_acc": 0.8339026241596184, "train_speed(iter/s)": 0.034475 }, { "epoch": 0.019764569103328005, "grad_norm": 0.14694905281066895, "learning_rate": 0.00011814671814671814, "loss": 0.5602938532829285, "memory(GiB)": 78.26, "step": 102, "token_acc": 0.8411804083454201, "train_speed(iter/s)": 0.034482 }, { "epoch": 0.01995833938865475, "grad_norm": 0.1619928628206253, "learning_rate": 0.00011930501930501929, "loss": 0.5849668979644775, "memory(GiB)": 78.26, "step": 103, "token_acc": 0.8391262944887513, "train_speed(iter/s)": 0.03449 }, { "epoch": 0.020152109673981494, "grad_norm": 0.1454261839389801, "learning_rate": 0.00012046332046332045, "loss": 0.5901338458061218, "memory(GiB)": 78.26, "step": 104, "token_acc": 0.8340299917345614, "train_speed(iter/s)": 0.034498 }, { "epoch": 0.02034587995930824, "grad_norm": 0.2236863374710083, "learning_rate": 0.00012162162162162162, "loss": 0.5838300585746765, "memory(GiB)": 78.26, "step": 105, "token_acc": 0.8396284829721362, "train_speed(iter/s)": 0.034506 }, { "epoch": 0.020539650244634985, "grad_norm": 0.15272359549999237, "learning_rate": 0.00012277992277992278, "loss": 0.5919955372810364, "memory(GiB)": 78.26, "step": 106, "token_acc": 0.8308620948755198, "train_speed(iter/s)": 0.034513 }, { "epoch": 0.02073342052996173, "grad_norm": 0.13382184505462646, "learning_rate": 0.00012393822393822393, "loss": 0.5290847420692444, "memory(GiB)": 78.26, "step": 107, "token_acc": 0.8508332939898966, "train_speed(iter/s)": 0.034521 }, { "epoch": 0.020927190815288474, "grad_norm": 0.1610032171010971, "learning_rate": 0.0001250965250965251, "loss": 0.6414130330085754, "memory(GiB)": 78.26, "step": 108, "token_acc": 0.8247312177217657, "train_speed(iter/s)": 0.034529 }, { "epoch": 0.02112096110061522, "grad_norm": 0.1506713628768921, "learning_rate": 0.00012625482625482624, "loss": 0.6079325675964355, "memory(GiB)": 78.26, "step": 109, "token_acc": 0.8329596412556054, "train_speed(iter/s)": 0.034536 }, { "epoch": 0.021314731385941966, "grad_norm": 0.16280919313430786, "learning_rate": 0.0001274131274131274, "loss": 0.6136084198951721, "memory(GiB)": 78.26, "step": 110, "token_acc": 0.8316319235938349, "train_speed(iter/s)": 0.034545 }, { "epoch": 0.02150850167126871, "grad_norm": 0.14740246534347534, "learning_rate": 0.00012857142857142855, "loss": 0.5931621789932251, "memory(GiB)": 78.26, "step": 111, "token_acc": 0.8302635446262369, "train_speed(iter/s)": 0.034549 }, { "epoch": 0.021702271956595458, "grad_norm": 0.15233321487903595, "learning_rate": 0.00012972972972972972, "loss": 0.5877476930618286, "memory(GiB)": 78.26, "step": 112, "token_acc": 0.8364121451149842, "train_speed(iter/s)": 0.034559 }, { "epoch": 0.021896042241922202, "grad_norm": 0.1440098136663437, "learning_rate": 0.0001308880308880309, "loss": 0.553906261920929, "memory(GiB)": 78.26, "step": 113, "token_acc": 0.8466698357365446, "train_speed(iter/s)": 0.034565 }, { "epoch": 0.022089812527248946, "grad_norm": 0.16069145500659943, "learning_rate": 0.00013204633204633203, "loss": 0.6046357750892639, "memory(GiB)": 78.26, "step": 114, "token_acc": 0.832103537128133, "train_speed(iter/s)": 0.034573 }, { "epoch": 0.02228358281257569, "grad_norm": 0.1648361086845398, "learning_rate": 0.0001332046332046332, "loss": 0.6004931330680847, "memory(GiB)": 78.26, "step": 115, "token_acc": 0.8346290569636716, "train_speed(iter/s)": 0.034581 }, { "epoch": 0.022477353097902438, "grad_norm": 0.15239156782627106, "learning_rate": 0.00013436293436293434, "loss": 0.6015850901603699, "memory(GiB)": 78.26, "step": 116, "token_acc": 0.8332898444838743, "train_speed(iter/s)": 0.034589 }, { "epoch": 0.022671123383229182, "grad_norm": 0.15159296989440918, "learning_rate": 0.0001355212355212355, "loss": 0.5867636203765869, "memory(GiB)": 78.26, "step": 117, "token_acc": 0.8370697910212358, "train_speed(iter/s)": 0.034594 }, { "epoch": 0.022864893668555927, "grad_norm": 0.1747017502784729, "learning_rate": 0.00013667953667953665, "loss": 0.636696457862854, "memory(GiB)": 78.26, "step": 118, "token_acc": 0.8233567399562565, "train_speed(iter/s)": 0.034599 }, { "epoch": 0.02305866395388267, "grad_norm": 0.15044157207012177, "learning_rate": 0.00013783783783783782, "loss": 0.5734996199607849, "memory(GiB)": 78.26, "step": 119, "token_acc": 0.8417291220556745, "train_speed(iter/s)": 0.034603 }, { "epoch": 0.02325243423920942, "grad_norm": 0.16326741874217987, "learning_rate": 0.000138996138996139, "loss": 0.5967447757720947, "memory(GiB)": 78.26, "step": 120, "token_acc": 0.8323221786037829, "train_speed(iter/s)": 0.034609 }, { "epoch": 0.023446204524536163, "grad_norm": 0.13857780396938324, "learning_rate": 0.00014015444015444016, "loss": 0.5407213568687439, "memory(GiB)": 78.26, "step": 121, "token_acc": 0.8461235837180026, "train_speed(iter/s)": 0.034612 }, { "epoch": 0.023639974809862907, "grad_norm": 0.13604214787483215, "learning_rate": 0.0001413127413127413, "loss": 0.5105344653129578, "memory(GiB)": 78.26, "step": 122, "token_acc": 0.8567164915396379, "train_speed(iter/s)": 0.034616 }, { "epoch": 0.02383374509518965, "grad_norm": 0.14465397596359253, "learning_rate": 0.00014247104247104247, "loss": 0.5738057494163513, "memory(GiB)": 78.26, "step": 123, "token_acc": 0.8395190358188781, "train_speed(iter/s)": 0.034624 }, { "epoch": 0.0240275153805164, "grad_norm": 0.14596134424209595, "learning_rate": 0.0001436293436293436, "loss": 0.5468869209289551, "memory(GiB)": 78.26, "step": 124, "token_acc": 0.8466944373600839, "train_speed(iter/s)": 0.034632 }, { "epoch": 0.024221285665843143, "grad_norm": 0.14446629583835602, "learning_rate": 0.00014478764478764478, "loss": 0.5509823560714722, "memory(GiB)": 78.26, "step": 125, "token_acc": 0.8461442816999478, "train_speed(iter/s)": 0.034635 }, { "epoch": 0.024415055951169887, "grad_norm": 0.13912013173103333, "learning_rate": 0.00014594594594594595, "loss": 0.5554008483886719, "memory(GiB)": 78.26, "step": 126, "token_acc": 0.8426597276608645, "train_speed(iter/s)": 0.03464 }, { "epoch": 0.02460882623649663, "grad_norm": 0.14441823959350586, "learning_rate": 0.0001471042471042471, "loss": 0.5533645749092102, "memory(GiB)": 78.26, "step": 127, "token_acc": 0.8452828346917859, "train_speed(iter/s)": 0.034642 }, { "epoch": 0.02480259652182338, "grad_norm": 0.1425672024488449, "learning_rate": 0.00014826254826254826, "loss": 0.5448683500289917, "memory(GiB)": 78.26, "step": 128, "token_acc": 0.8480569849679332, "train_speed(iter/s)": 0.034646 }, { "epoch": 0.024996366807150124, "grad_norm": 0.15022799372673035, "learning_rate": 0.0001494208494208494, "loss": 0.5586391687393188, "memory(GiB)": 78.26, "step": 129, "token_acc": 0.843522056269538, "train_speed(iter/s)": 0.034653 }, { "epoch": 0.025190137092476868, "grad_norm": 0.14275088906288147, "learning_rate": 0.00015057915057915057, "loss": 0.5584322810173035, "memory(GiB)": 78.26, "step": 130, "token_acc": 0.8456992938407218, "train_speed(iter/s)": 0.034657 }, { "epoch": 0.025383907377803615, "grad_norm": 0.15598376095294952, "learning_rate": 0.0001517374517374517, "loss": 0.5684648156166077, "memory(GiB)": 78.26, "step": 131, "token_acc": 0.841918682337542, "train_speed(iter/s)": 0.03466 }, { "epoch": 0.02557767766313036, "grad_norm": 0.13116468489170074, "learning_rate": 0.00015289575289575288, "loss": 0.518610954284668, "memory(GiB)": 78.26, "step": 132, "token_acc": 0.8523458494613052, "train_speed(iter/s)": 0.034665 }, { "epoch": 0.025771447948457104, "grad_norm": 0.140156552195549, "learning_rate": 0.00015405405405405402, "loss": 0.5425719022750854, "memory(GiB)": 78.26, "step": 133, "token_acc": 0.8470962689771895, "train_speed(iter/s)": 0.03467 }, { "epoch": 0.025965218233783848, "grad_norm": 0.15615837275981903, "learning_rate": 0.0001552123552123552, "loss": 0.58672034740448, "memory(GiB)": 78.26, "step": 134, "token_acc": 0.8424782073907695, "train_speed(iter/s)": 0.034673 }, { "epoch": 0.026158988519110596, "grad_norm": 0.1514168381690979, "learning_rate": 0.00015637065637065634, "loss": 0.6196274161338806, "memory(GiB)": 78.26, "step": 135, "token_acc": 0.8292035111586176, "train_speed(iter/s)": 0.034681 }, { "epoch": 0.02635275880443734, "grad_norm": 0.15724033117294312, "learning_rate": 0.0001575289575289575, "loss": 0.5608164668083191, "memory(GiB)": 78.26, "step": 136, "token_acc": 0.8445253505933118, "train_speed(iter/s)": 0.034686 }, { "epoch": 0.026546529089764084, "grad_norm": 0.17307862639427185, "learning_rate": 0.0001586872586872587, "loss": 0.6565619111061096, "memory(GiB)": 78.26, "step": 137, "token_acc": 0.8214262132717068, "train_speed(iter/s)": 0.034691 }, { "epoch": 0.02674029937509083, "grad_norm": 0.1405402421951294, "learning_rate": 0.00015984555984555984, "loss": 0.5477691888809204, "memory(GiB)": 78.26, "step": 138, "token_acc": 0.8483550808604967, "train_speed(iter/s)": 0.034696 }, { "epoch": 0.026934069660417576, "grad_norm": 0.15360456705093384, "learning_rate": 0.000161003861003861, "loss": 0.6261464357376099, "memory(GiB)": 78.26, "step": 139, "token_acc": 0.8274440827470335, "train_speed(iter/s)": 0.0347 }, { "epoch": 0.02712783994574432, "grad_norm": 0.12648895382881165, "learning_rate": 0.00016216216216216215, "loss": 0.4741262197494507, "memory(GiB)": 78.26, "step": 140, "token_acc": 0.8678006997225238, "train_speed(iter/s)": 0.034702 }, { "epoch": 0.027321610231071065, "grad_norm": 0.1415882259607315, "learning_rate": 0.00016332046332046332, "loss": 0.5620299577713013, "memory(GiB)": 78.26, "step": 141, "token_acc": 0.8392420713649041, "train_speed(iter/s)": 0.034706 }, { "epoch": 0.02751538051639781, "grad_norm": 0.143955260515213, "learning_rate": 0.00016447876447876446, "loss": 0.5819345712661743, "memory(GiB)": 78.26, "step": 142, "token_acc": 0.8385089572540735, "train_speed(iter/s)": 0.03471 }, { "epoch": 0.027709150801724557, "grad_norm": 0.14960965514183044, "learning_rate": 0.00016563706563706563, "loss": 0.5373987555503845, "memory(GiB)": 78.26, "step": 143, "token_acc": 0.8491564847739486, "train_speed(iter/s)": 0.034715 }, { "epoch": 0.0279029210870513, "grad_norm": 0.13836169242858887, "learning_rate": 0.00016679536679536678, "loss": 0.5637241005897522, "memory(GiB)": 78.26, "step": 144, "token_acc": 0.8398485764610005, "train_speed(iter/s)": 0.034719 }, { "epoch": 0.028096691372378045, "grad_norm": 0.15270289778709412, "learning_rate": 0.00016795366795366795, "loss": 0.6161123514175415, "memory(GiB)": 78.26, "step": 145, "token_acc": 0.828073843783531, "train_speed(iter/s)": 0.034725 }, { "epoch": 0.02829046165770479, "grad_norm": 0.14659975469112396, "learning_rate": 0.0001691119691119691, "loss": 0.5577420592308044, "memory(GiB)": 78.26, "step": 146, "token_acc": 0.8441303927792653, "train_speed(iter/s)": 0.034731 }, { "epoch": 0.028484231943031537, "grad_norm": 0.13934315741062164, "learning_rate": 0.00017027027027027026, "loss": 0.5665932893753052, "memory(GiB)": 78.26, "step": 147, "token_acc": 0.8396104567059309, "train_speed(iter/s)": 0.034731 }, { "epoch": 0.02867800222835828, "grad_norm": 0.15382340550422668, "learning_rate": 0.0001714285714285714, "loss": 0.6315574645996094, "memory(GiB)": 78.26, "step": 148, "token_acc": 0.8281886687133687, "train_speed(iter/s)": 0.034735 }, { "epoch": 0.028871772513685025, "grad_norm": 0.15263354778289795, "learning_rate": 0.00017258687258687257, "loss": 0.5572026968002319, "memory(GiB)": 78.26, "step": 149, "token_acc": 0.8416983617968031, "train_speed(iter/s)": 0.034739 }, { "epoch": 0.029065542799011773, "grad_norm": 0.14366725087165833, "learning_rate": 0.0001737451737451737, "loss": 0.5455084443092346, "memory(GiB)": 78.26, "step": 150, "token_acc": 0.8478349198978408, "train_speed(iter/s)": 0.034744 }, { "epoch": 0.029259313084338517, "grad_norm": 0.14723645150661469, "learning_rate": 0.00017490347490347488, "loss": 0.5214795470237732, "memory(GiB)": 78.26, "step": 151, "token_acc": 0.8530382721575649, "train_speed(iter/s)": 0.034748 }, { "epoch": 0.02945308336966526, "grad_norm": 0.1369503289461136, "learning_rate": 0.00017606177606177607, "loss": 0.5099643468856812, "memory(GiB)": 78.26, "step": 152, "token_acc": 0.8562671739589939, "train_speed(iter/s)": 0.034751 }, { "epoch": 0.029646853654992006, "grad_norm": 0.16232426464557648, "learning_rate": 0.00017722007722007722, "loss": 0.6399052143096924, "memory(GiB)": 78.26, "step": 153, "token_acc": 0.8246472248353716, "train_speed(iter/s)": 0.034755 }, { "epoch": 0.029840623940318754, "grad_norm": 0.15433335304260254, "learning_rate": 0.00017837837837837839, "loss": 0.5725199580192566, "memory(GiB)": 78.26, "step": 154, "token_acc": 0.8409897602206277, "train_speed(iter/s)": 0.034758 }, { "epoch": 0.030034394225645498, "grad_norm": 0.13948160409927368, "learning_rate": 0.00017953667953667953, "loss": 0.5367989540100098, "memory(GiB)": 78.26, "step": 155, "token_acc": 0.8473491551719958, "train_speed(iter/s)": 0.034762 }, { "epoch": 0.030228164510972242, "grad_norm": 0.14237907528877258, "learning_rate": 0.0001806949806949807, "loss": 0.5689682364463806, "memory(GiB)": 78.26, "step": 156, "token_acc": 0.8421181996941228, "train_speed(iter/s)": 0.034765 }, { "epoch": 0.030421934796298986, "grad_norm": 0.13025622069835663, "learning_rate": 0.00018185328185328184, "loss": 0.5721440315246582, "memory(GiB)": 78.26, "step": 157, "token_acc": 0.8421661117277099, "train_speed(iter/s)": 0.034767 }, { "epoch": 0.030615705081625734, "grad_norm": 0.15070052444934845, "learning_rate": 0.000183011583011583, "loss": 0.5769027471542358, "memory(GiB)": 78.26, "step": 158, "token_acc": 0.8369094922737307, "train_speed(iter/s)": 0.034772 }, { "epoch": 0.030809475366952478, "grad_norm": 0.15805287659168243, "learning_rate": 0.00018416988416988415, "loss": 0.6211342215538025, "memory(GiB)": 78.26, "step": 159, "token_acc": 0.8280907315607631, "train_speed(iter/s)": 0.034775 }, { "epoch": 0.031003245652279222, "grad_norm": 0.1476234793663025, "learning_rate": 0.00018532818532818532, "loss": 0.5813598036766052, "memory(GiB)": 78.26, "step": 160, "token_acc": 0.8377501760032183, "train_speed(iter/s)": 0.034778 }, { "epoch": 0.031197015937605967, "grad_norm": 0.1391780972480774, "learning_rate": 0.00018648648648648646, "loss": 0.5335437059402466, "memory(GiB)": 78.26, "step": 161, "token_acc": 0.848728354978355, "train_speed(iter/s)": 0.034779 }, { "epoch": 0.03139078622293271, "grad_norm": 0.1483832597732544, "learning_rate": 0.00018764478764478763, "loss": 0.5789586305618286, "memory(GiB)": 78.26, "step": 162, "token_acc": 0.8385598141695703, "train_speed(iter/s)": 0.034781 }, { "epoch": 0.031584556508259455, "grad_norm": 0.14238472282886505, "learning_rate": 0.00018880308880308877, "loss": 0.5635126233100891, "memory(GiB)": 78.26, "step": 163, "token_acc": 0.8438286425038327, "train_speed(iter/s)": 0.034786 }, { "epoch": 0.031778326793586206, "grad_norm": 0.12377775460481644, "learning_rate": 0.00018996138996138994, "loss": 0.493597149848938, "memory(GiB)": 78.26, "step": 164, "token_acc": 0.8575653587393326, "train_speed(iter/s)": 0.034785 }, { "epoch": 0.03197209707891295, "grad_norm": 0.13860811293125153, "learning_rate": 0.00019111969111969108, "loss": 0.5463928580284119, "memory(GiB)": 78.26, "step": 165, "token_acc": 0.844146549588483, "train_speed(iter/s)": 0.034789 }, { "epoch": 0.032165867364239695, "grad_norm": 0.13424870371818542, "learning_rate": 0.00019227799227799228, "loss": 0.5533477067947388, "memory(GiB)": 78.26, "step": 166, "token_acc": 0.8481163938685373, "train_speed(iter/s)": 0.03479 }, { "epoch": 0.03235963764956644, "grad_norm": 0.12482903897762299, "learning_rate": 0.00019343629343629342, "loss": 0.5704807043075562, "memory(GiB)": 78.26, "step": 167, "token_acc": 0.8384424192212097, "train_speed(iter/s)": 0.034792 }, { "epoch": 0.03255340793489318, "grad_norm": 0.13756157457828522, "learning_rate": 0.0001945945945945946, "loss": 0.5534645318984985, "memory(GiB)": 78.26, "step": 168, "token_acc": 0.8454033863318016, "train_speed(iter/s)": 0.034794 }, { "epoch": 0.03274717822021993, "grad_norm": 0.13826580345630646, "learning_rate": 0.00019575289575289573, "loss": 0.534266471862793, "memory(GiB)": 78.26, "step": 169, "token_acc": 0.8489137497528039, "train_speed(iter/s)": 0.034796 }, { "epoch": 0.03294094850554667, "grad_norm": 0.14315061271190643, "learning_rate": 0.0001969111969111969, "loss": 0.5491966009140015, "memory(GiB)": 78.26, "step": 170, "token_acc": 0.8464807519310109, "train_speed(iter/s)": 0.034798 }, { "epoch": 0.03313471879087342, "grad_norm": 0.12808647751808167, "learning_rate": 0.00019806949806949804, "loss": 0.5022028684616089, "memory(GiB)": 78.26, "step": 171, "token_acc": 0.8551667255802993, "train_speed(iter/s)": 0.034798 }, { "epoch": 0.03332848907620017, "grad_norm": 0.14019440114498138, "learning_rate": 0.0001992277992277992, "loss": 0.5900301933288574, "memory(GiB)": 78.26, "step": 172, "token_acc": 0.8387483645878762, "train_speed(iter/s)": 0.034802 }, { "epoch": 0.03352225936152691, "grad_norm": 0.13789018988609314, "learning_rate": 0.00020038610038610038, "loss": 0.541344940662384, "memory(GiB)": 78.26, "step": 173, "token_acc": 0.8496686986482905, "train_speed(iter/s)": 0.034806 }, { "epoch": 0.033716029646853655, "grad_norm": 0.1350238174200058, "learning_rate": 0.00020154440154440152, "loss": 0.5568801164627075, "memory(GiB)": 78.26, "step": 174, "token_acc": 0.8458727972794469, "train_speed(iter/s)": 0.034809 }, { "epoch": 0.0339097999321804, "grad_norm": 0.14095835387706757, "learning_rate": 0.0002027027027027027, "loss": 0.5708956122398376, "memory(GiB)": 78.26, "step": 175, "token_acc": 0.845259973423054, "train_speed(iter/s)": 0.034811 }, { "epoch": 0.034103570217507144, "grad_norm": 0.13087715208530426, "learning_rate": 0.00020386100386100383, "loss": 0.5266304016113281, "memory(GiB)": 78.26, "step": 176, "token_acc": 0.8541874507744814, "train_speed(iter/s)": 0.034813 }, { "epoch": 0.03429734050283389, "grad_norm": 0.13467884063720703, "learning_rate": 0.000205019305019305, "loss": 0.4999612867832184, "memory(GiB)": 78.26, "step": 177, "token_acc": 0.859081674116213, "train_speed(iter/s)": 0.034816 }, { "epoch": 0.03449111078816063, "grad_norm": 0.1308935433626175, "learning_rate": 0.00020617760617760615, "loss": 0.5238428115844727, "memory(GiB)": 78.26, "step": 178, "token_acc": 0.8521208527878653, "train_speed(iter/s)": 0.034818 }, { "epoch": 0.034684881073487384, "grad_norm": 0.1268441081047058, "learning_rate": 0.00020733590733590731, "loss": 0.5368191599845886, "memory(GiB)": 78.26, "step": 179, "token_acc": 0.8482607365092318, "train_speed(iter/s)": 0.034819 }, { "epoch": 0.03487865135881413, "grad_norm": 0.12708643078804016, "learning_rate": 0.00020849420849420848, "loss": 0.49307650327682495, "memory(GiB)": 78.26, "step": 180, "token_acc": 0.8607868020304569, "train_speed(iter/s)": 0.034821 }, { "epoch": 0.03507242164414087, "grad_norm": 0.12540721893310547, "learning_rate": 0.00020965250965250965, "loss": 0.507483720779419, "memory(GiB)": 78.26, "step": 181, "token_acc": 0.8567486597287922, "train_speed(iter/s)": 0.034824 }, { "epoch": 0.035266191929467616, "grad_norm": 0.1270364373922348, "learning_rate": 0.0002108108108108108, "loss": 0.5022760629653931, "memory(GiB)": 78.26, "step": 182, "token_acc": 0.8590840060746543, "train_speed(iter/s)": 0.034825 }, { "epoch": 0.03545996221479436, "grad_norm": 0.14542265236377716, "learning_rate": 0.00021196911196911196, "loss": 0.516878604888916, "memory(GiB)": 78.26, "step": 183, "token_acc": 0.8556428363617128, "train_speed(iter/s)": 0.034829 }, { "epoch": 0.035653732500121105, "grad_norm": 0.14277629554271698, "learning_rate": 0.0002131274131274131, "loss": 0.5679965019226074, "memory(GiB)": 78.26, "step": 184, "token_acc": 0.841894944113048, "train_speed(iter/s)": 0.034833 }, { "epoch": 0.03584750278544785, "grad_norm": 0.13046088814735413, "learning_rate": 0.00021428571428571427, "loss": 0.5434874296188354, "memory(GiB)": 78.26, "step": 185, "token_acc": 0.8491733876369872, "train_speed(iter/s)": 0.034835 }, { "epoch": 0.0360412730707746, "grad_norm": 0.12494053691625595, "learning_rate": 0.00021544401544401542, "loss": 0.537643313407898, "memory(GiB)": 78.26, "step": 186, "token_acc": 0.8506308711310349, "train_speed(iter/s)": 0.034836 }, { "epoch": 0.036235043356101344, "grad_norm": 0.12971843779087067, "learning_rate": 0.00021660231660231659, "loss": 0.5533092617988586, "memory(GiB)": 78.26, "step": 187, "token_acc": 0.845800933125972, "train_speed(iter/s)": 0.034839 }, { "epoch": 0.03642881364142809, "grad_norm": 0.12924546003341675, "learning_rate": 0.00021776061776061773, "loss": 0.5168436765670776, "memory(GiB)": 78.26, "step": 188, "token_acc": 0.8556058890147226, "train_speed(iter/s)": 0.034839 }, { "epoch": 0.03662258392675483, "grad_norm": 0.1508742719888687, "learning_rate": 0.0002189189189189189, "loss": 0.6136130690574646, "memory(GiB)": 78.26, "step": 189, "token_acc": 0.8350558426404401, "train_speed(iter/s)": 0.034842 }, { "epoch": 0.03681635421208158, "grad_norm": 0.1334015280008316, "learning_rate": 0.00022007722007722004, "loss": 0.545418918132782, "memory(GiB)": 78.26, "step": 190, "token_acc": 0.8462031558185404, "train_speed(iter/s)": 0.034844 }, { "epoch": 0.03701012449740832, "grad_norm": 0.13890019059181213, "learning_rate": 0.0002212355212355212, "loss": 0.5522346496582031, "memory(GiB)": 78.26, "step": 191, "token_acc": 0.8448294130112312, "train_speed(iter/s)": 0.034845 }, { "epoch": 0.037203894782735066, "grad_norm": 0.13158449530601501, "learning_rate": 0.00022239382239382235, "loss": 0.5597431063652039, "memory(GiB)": 78.26, "step": 192, "token_acc": 0.8428118697781619, "train_speed(iter/s)": 0.034848 }, { "epoch": 0.03739766506806181, "grad_norm": 0.13762398064136505, "learning_rate": 0.00022355212355212352, "loss": 0.5332627892494202, "memory(GiB)": 78.26, "step": 193, "token_acc": 0.8513293253173013, "train_speed(iter/s)": 0.034851 }, { "epoch": 0.03759143535338856, "grad_norm": 0.13980168104171753, "learning_rate": 0.0002247104247104247, "loss": 0.5322573184967041, "memory(GiB)": 78.26, "step": 194, "token_acc": 0.85326330790953, "train_speed(iter/s)": 0.034851 }, { "epoch": 0.037785205638715305, "grad_norm": 0.12236251682043076, "learning_rate": 0.00022586872586872586, "loss": 0.5012400150299072, "memory(GiB)": 78.26, "step": 195, "token_acc": 0.8592677644111584, "train_speed(iter/s)": 0.034853 }, { "epoch": 0.03797897592404205, "grad_norm": 0.14247579872608185, "learning_rate": 0.00022702702702702703, "loss": 0.5467385053634644, "memory(GiB)": 78.26, "step": 196, "token_acc": 0.8513190123595822, "train_speed(iter/s)": 0.034856 }, { "epoch": 0.038172746209368794, "grad_norm": 0.12214695662260056, "learning_rate": 0.00022818532818532817, "loss": 0.4757518768310547, "memory(GiB)": 78.26, "step": 197, "token_acc": 0.8650738047495954, "train_speed(iter/s)": 0.034857 }, { "epoch": 0.03836651649469554, "grad_norm": 0.12631294131278992, "learning_rate": 0.00022934362934362934, "loss": 0.5603289604187012, "memory(GiB)": 78.26, "step": 198, "token_acc": 0.8452558741537236, "train_speed(iter/s)": 0.034857 }, { "epoch": 0.03856028678002228, "grad_norm": 0.12949825823307037, "learning_rate": 0.00023050193050193048, "loss": 0.5440013408660889, "memory(GiB)": 78.26, "step": 199, "token_acc": 0.8481728066281994, "train_speed(iter/s)": 0.03486 }, { "epoch": 0.038754057065349026, "grad_norm": 0.13039319217205048, "learning_rate": 0.00023166023166023165, "loss": 0.5734332799911499, "memory(GiB)": 78.26, "step": 200, "token_acc": 0.8401181070071356, "train_speed(iter/s)": 0.034862 }, { "epoch": 0.03894782735067577, "grad_norm": 0.13796895742416382, "learning_rate": 0.0002328185328185328, "loss": 0.5982975959777832, "memory(GiB)": 78.26, "step": 201, "token_acc": 0.832787772216962, "train_speed(iter/s)": 0.03479 }, { "epoch": 0.03914159763600252, "grad_norm": 0.12662553787231445, "learning_rate": 0.00023397683397683396, "loss": 0.5020790696144104, "memory(GiB)": 78.26, "step": 202, "token_acc": 0.858308341381589, "train_speed(iter/s)": 0.034793 }, { "epoch": 0.039335367921329266, "grad_norm": 0.13133689761161804, "learning_rate": 0.0002351351351351351, "loss": 0.48208650946617126, "memory(GiB)": 78.26, "step": 203, "token_acc": 0.8654059206966738, "train_speed(iter/s)": 0.034794 }, { "epoch": 0.03952913820665601, "grad_norm": 0.12844805419445038, "learning_rate": 0.00023629343629343627, "loss": 0.5480844974517822, "memory(GiB)": 78.26, "step": 204, "token_acc": 0.8480197137937178, "train_speed(iter/s)": 0.034797 }, { "epoch": 0.039722908491982754, "grad_norm": 0.13410721719264984, "learning_rate": 0.0002374517374517374, "loss": 0.524722695350647, "memory(GiB)": 78.26, "step": 205, "token_acc": 0.8535315555041361, "train_speed(iter/s)": 0.034801 }, { "epoch": 0.0399166787773095, "grad_norm": 0.15366047620773315, "learning_rate": 0.00023861003861003858, "loss": 0.593670129776001, "memory(GiB)": 78.26, "step": 206, "token_acc": 0.834993270524899, "train_speed(iter/s)": 0.034805 }, { "epoch": 0.04011044906263624, "grad_norm": 0.1396535038948059, "learning_rate": 0.00023976833976833972, "loss": 0.5373449325561523, "memory(GiB)": 78.26, "step": 207, "token_acc": 0.8462800580988756, "train_speed(iter/s)": 0.034806 }, { "epoch": 0.04030421934796299, "grad_norm": 0.14069020748138428, "learning_rate": 0.0002409266409266409, "loss": 0.583406925201416, "memory(GiB)": 78.26, "step": 208, "token_acc": 0.8375456332490874, "train_speed(iter/s)": 0.034809 }, { "epoch": 0.04049798963328974, "grad_norm": 0.13054059445858002, "learning_rate": 0.0002420849420849421, "loss": 0.5496135950088501, "memory(GiB)": 78.26, "step": 209, "token_acc": 0.843966505507651, "train_speed(iter/s)": 0.034811 }, { "epoch": 0.04069175991861648, "grad_norm": 0.14763560891151428, "learning_rate": 0.00024324324324324323, "loss": 0.5195255279541016, "memory(GiB)": 78.26, "step": 210, "token_acc": 0.8529995042141795, "train_speed(iter/s)": 0.034814 }, { "epoch": 0.04088553020394323, "grad_norm": 0.12177236378192902, "learning_rate": 0.0002444015444015444, "loss": 0.49852877855300903, "memory(GiB)": 78.26, "step": 211, "token_acc": 0.8585345707205675, "train_speed(iter/s)": 0.034815 }, { "epoch": 0.04107930048926997, "grad_norm": 0.1217300221323967, "learning_rate": 0.00024555984555984557, "loss": 0.48390355706214905, "memory(GiB)": 78.26, "step": 212, "token_acc": 0.8644299537231804, "train_speed(iter/s)": 0.034818 }, { "epoch": 0.041273070774596715, "grad_norm": 0.14275750517845154, "learning_rate": 0.0002467181467181467, "loss": 0.5250924825668335, "memory(GiB)": 78.26, "step": 213, "token_acc": 0.8536855100046294, "train_speed(iter/s)": 0.03482 }, { "epoch": 0.04146684105992346, "grad_norm": 0.1468067467212677, "learning_rate": 0.00024787644787644785, "loss": 0.5309884548187256, "memory(GiB)": 78.26, "step": 214, "token_acc": 0.8531648971912378, "train_speed(iter/s)": 0.034823 }, { "epoch": 0.041660611345250204, "grad_norm": 0.12400522828102112, "learning_rate": 0.000249034749034749, "loss": 0.5384760499000549, "memory(GiB)": 78.26, "step": 215, "token_acc": 0.8493547014607857, "train_speed(iter/s)": 0.034824 }, { "epoch": 0.04185438163057695, "grad_norm": 0.14342345297336578, "learning_rate": 0.0002501930501930502, "loss": 0.5606729388237, "memory(GiB)": 78.26, "step": 216, "token_acc": 0.8429284181681875, "train_speed(iter/s)": 0.034825 }, { "epoch": 0.0420481519159037, "grad_norm": 0.1313794106245041, "learning_rate": 0.0002513513513513513, "loss": 0.5184580087661743, "memory(GiB)": 78.26, "step": 217, "token_acc": 0.8552152612420286, "train_speed(iter/s)": 0.034827 }, { "epoch": 0.04224192220123044, "grad_norm": 0.13364368677139282, "learning_rate": 0.0002525096525096525, "loss": 0.5389662981033325, "memory(GiB)": 78.26, "step": 218, "token_acc": 0.8503248639797159, "train_speed(iter/s)": 0.034829 }, { "epoch": 0.04243569248655719, "grad_norm": 0.1303595006465912, "learning_rate": 0.00025366795366795364, "loss": 0.5534095764160156, "memory(GiB)": 78.26, "step": 219, "token_acc": 0.8464527027027027, "train_speed(iter/s)": 0.034832 }, { "epoch": 0.04262946277188393, "grad_norm": 0.13916410505771637, "learning_rate": 0.0002548262548262548, "loss": 0.5153782963752747, "memory(GiB)": 78.26, "step": 220, "token_acc": 0.8532017429948345, "train_speed(iter/s)": 0.034834 }, { "epoch": 0.042823233057210676, "grad_norm": 0.12516328692436218, "learning_rate": 0.00025598455598455593, "loss": 0.5454630851745605, "memory(GiB)": 78.26, "step": 221, "token_acc": 0.8455304060358847, "train_speed(iter/s)": 0.034837 }, { "epoch": 0.04301700334253742, "grad_norm": 0.12558779120445251, "learning_rate": 0.0002571428571428571, "loss": 0.5476839542388916, "memory(GiB)": 78.26, "step": 222, "token_acc": 0.8491302437385303, "train_speed(iter/s)": 0.034838 }, { "epoch": 0.043210773627864164, "grad_norm": 0.1301163285970688, "learning_rate": 0.00025830115830115827, "loss": 0.5624793171882629, "memory(GiB)": 78.26, "step": 223, "token_acc": 0.8417110837775045, "train_speed(iter/s)": 0.03484 }, { "epoch": 0.043404543913190916, "grad_norm": 0.1233832985162735, "learning_rate": 0.00025945945945945944, "loss": 0.5634196996688843, "memory(GiB)": 78.26, "step": 224, "token_acc": 0.8440744270023572, "train_speed(iter/s)": 0.034841 }, { "epoch": 0.04359831419851766, "grad_norm": 0.11998777091503143, "learning_rate": 0.0002606177606177606, "loss": 0.5352566838264465, "memory(GiB)": 78.26, "step": 225, "token_acc": 0.850525063369717, "train_speed(iter/s)": 0.034843 }, { "epoch": 0.043792084483844404, "grad_norm": 0.11935053765773773, "learning_rate": 0.0002617760617760618, "loss": 0.5280268788337708, "memory(GiB)": 78.26, "step": 226, "token_acc": 0.8505288461538462, "train_speed(iter/s)": 0.034845 }, { "epoch": 0.04398585476917115, "grad_norm": 0.11708512902259827, "learning_rate": 0.00026293436293436294, "loss": 0.5135525465011597, "memory(GiB)": 78.26, "step": 227, "token_acc": 0.8566448341432495, "train_speed(iter/s)": 0.034846 }, { "epoch": 0.04417962505449789, "grad_norm": 0.1259176880121231, "learning_rate": 0.00026409266409266406, "loss": 0.5575259327888489, "memory(GiB)": 78.26, "step": 228, "token_acc": 0.8424169123390531, "train_speed(iter/s)": 0.034847 }, { "epoch": 0.04437339533982464, "grad_norm": 0.12446990609169006, "learning_rate": 0.0002652509652509652, "loss": 0.5132482647895813, "memory(GiB)": 78.26, "step": 229, "token_acc": 0.8576641341938295, "train_speed(iter/s)": 0.034848 }, { "epoch": 0.04456716562515138, "grad_norm": 0.1376199871301651, "learning_rate": 0.0002664092664092664, "loss": 0.5118637681007385, "memory(GiB)": 78.26, "step": 230, "token_acc": 0.8575808249721293, "train_speed(iter/s)": 0.034849 }, { "epoch": 0.044760935910478125, "grad_norm": 0.13398443162441254, "learning_rate": 0.00026756756756756756, "loss": 0.543663501739502, "memory(GiB)": 78.26, "step": 231, "token_acc": 0.8458760878036109, "train_speed(iter/s)": 0.034849 }, { "epoch": 0.044954706195804876, "grad_norm": 0.13235385715961456, "learning_rate": 0.0002687258687258687, "loss": 0.5783892869949341, "memory(GiB)": 78.26, "step": 232, "token_acc": 0.8393900968051217, "train_speed(iter/s)": 0.034852 }, { "epoch": 0.04514847648113162, "grad_norm": 0.13555991649627686, "learning_rate": 0.00026988416988416985, "loss": 0.5653671026229858, "memory(GiB)": 78.26, "step": 233, "token_acc": 0.8414533928152148, "train_speed(iter/s)": 0.034855 }, { "epoch": 0.045342246766458365, "grad_norm": 0.1324978917837143, "learning_rate": 0.000271042471042471, "loss": 0.561633825302124, "memory(GiB)": 78.26, "step": 234, "token_acc": 0.845039593124316, "train_speed(iter/s)": 0.034858 }, { "epoch": 0.04553601705178511, "grad_norm": 0.1257573366165161, "learning_rate": 0.0002722007722007722, "loss": 0.5691174864768982, "memory(GiB)": 78.26, "step": 235, "token_acc": 0.8414384744097102, "train_speed(iter/s)": 0.03486 }, { "epoch": 0.04572978733711185, "grad_norm": 0.12331625819206238, "learning_rate": 0.0002733590733590733, "loss": 0.5002002120018005, "memory(GiB)": 78.26, "step": 236, "token_acc": 0.8585492089747352, "train_speed(iter/s)": 0.034862 }, { "epoch": 0.0459235576224386, "grad_norm": 0.12218355387449265, "learning_rate": 0.00027451737451737447, "loss": 0.48562729358673096, "memory(GiB)": 78.26, "step": 237, "token_acc": 0.8611403640740184, "train_speed(iter/s)": 0.034864 }, { "epoch": 0.04611732790776534, "grad_norm": 0.12444531172513962, "learning_rate": 0.00027567567567567564, "loss": 0.5203258991241455, "memory(GiB)": 78.26, "step": 238, "token_acc": 0.8543315991857046, "train_speed(iter/s)": 0.034866 }, { "epoch": 0.046311098193092086, "grad_norm": 0.12778066098690033, "learning_rate": 0.0002768339768339768, "loss": 0.5473844408988953, "memory(GiB)": 78.26, "step": 239, "token_acc": 0.8458585640138409, "train_speed(iter/s)": 0.034867 }, { "epoch": 0.04650486847841884, "grad_norm": 0.12521140277385712, "learning_rate": 0.000277992277992278, "loss": 0.5302430391311646, "memory(GiB)": 78.26, "step": 240, "token_acc": 0.8497068457705965, "train_speed(iter/s)": 0.034868 }, { "epoch": 0.04669863876374558, "grad_norm": 0.13087455928325653, "learning_rate": 0.00027915057915057915, "loss": 0.52344810962677, "memory(GiB)": 78.26, "step": 241, "token_acc": 0.8509464638253397, "train_speed(iter/s)": 0.034871 }, { "epoch": 0.046892409049072326, "grad_norm": 0.12300854921340942, "learning_rate": 0.0002803088803088803, "loss": 0.5085083842277527, "memory(GiB)": 78.26, "step": 242, "token_acc": 0.8563601071279393, "train_speed(iter/s)": 0.034873 }, { "epoch": 0.04708617933439907, "grad_norm": 0.1252821385860443, "learning_rate": 0.00028146718146718143, "loss": 0.5789982080459595, "memory(GiB)": 78.26, "step": 243, "token_acc": 0.8396179117080508, "train_speed(iter/s)": 0.034874 }, { "epoch": 0.047279949619725814, "grad_norm": 0.12332039326429367, "learning_rate": 0.0002826254826254826, "loss": 0.5306107997894287, "memory(GiB)": 78.26, "step": 244, "token_acc": 0.8511578885733525, "train_speed(iter/s)": 0.034877 }, { "epoch": 0.04747371990505256, "grad_norm": 0.11339928209781647, "learning_rate": 0.00028378378378378377, "loss": 0.533706784248352, "memory(GiB)": 78.26, "step": 245, "token_acc": 0.8514316174230886, "train_speed(iter/s)": 0.034877 }, { "epoch": 0.0476674901903793, "grad_norm": 0.12364498525857925, "learning_rate": 0.00028494208494208494, "loss": 0.5375621914863586, "memory(GiB)": 78.26, "step": 246, "token_acc": 0.8459392614747498, "train_speed(iter/s)": 0.03488 }, { "epoch": 0.047861260475706054, "grad_norm": 0.12653161585330963, "learning_rate": 0.00028610038610038605, "loss": 0.4905088245868683, "memory(GiB)": 78.26, "step": 247, "token_acc": 0.8620850743557136, "train_speed(iter/s)": 0.034882 }, { "epoch": 0.0480550307610328, "grad_norm": 0.1326380968093872, "learning_rate": 0.0002872586872586872, "loss": 0.5300474762916565, "memory(GiB)": 78.26, "step": 248, "token_acc": 0.8534217764115839, "train_speed(iter/s)": 0.034883 }, { "epoch": 0.04824880104635954, "grad_norm": 0.11917278915643692, "learning_rate": 0.0002884169884169884, "loss": 0.5086590647697449, "memory(GiB)": 78.26, "step": 249, "token_acc": 0.8568408610460755, "train_speed(iter/s)": 0.034884 }, { "epoch": 0.048442571331686286, "grad_norm": 0.12199165672063828, "learning_rate": 0.00028957528957528956, "loss": 0.5237119197845459, "memory(GiB)": 78.26, "step": 250, "token_acc": 0.8521546095586315, "train_speed(iter/s)": 0.034885 }, { "epoch": 0.04863634161701303, "grad_norm": 0.11789362877607346, "learning_rate": 0.0002907335907335907, "loss": 0.4931395649909973, "memory(GiB)": 78.26, "step": 251, "token_acc": 0.8595119082622757, "train_speed(iter/s)": 0.034887 }, { "epoch": 0.048830111902339775, "grad_norm": 0.11933058500289917, "learning_rate": 0.0002918918918918919, "loss": 0.5037907361984253, "memory(GiB)": 78.26, "step": 252, "token_acc": 0.8580700162252839, "train_speed(iter/s)": 0.034889 }, { "epoch": 0.04902388218766652, "grad_norm": 0.11169978976249695, "learning_rate": 0.000293050193050193, "loss": 0.4690629243850708, "memory(GiB)": 78.26, "step": 253, "token_acc": 0.8666485000123864, "train_speed(iter/s)": 0.03489 }, { "epoch": 0.04921765247299326, "grad_norm": 0.11621616035699844, "learning_rate": 0.0002942084942084942, "loss": 0.49911874532699585, "memory(GiB)": 78.26, "step": 254, "token_acc": 0.8591189560995347, "train_speed(iter/s)": 0.034891 }, { "epoch": 0.049411422758320014, "grad_norm": 0.1331455260515213, "learning_rate": 0.00029536679536679535, "loss": 0.5178690552711487, "memory(GiB)": 78.26, "step": 255, "token_acc": 0.8545419436705194, "train_speed(iter/s)": 0.034894 }, { "epoch": 0.04960519304364676, "grad_norm": 0.13746346533298492, "learning_rate": 0.0002965250965250965, "loss": 0.5907042026519775, "memory(GiB)": 78.26, "step": 256, "token_acc": 0.8399606359750285, "train_speed(iter/s)": 0.034896 }, { "epoch": 0.0497989633289735, "grad_norm": 0.13077257573604584, "learning_rate": 0.00029768339768339764, "loss": 0.5277935266494751, "memory(GiB)": 78.26, "step": 257, "token_acc": 0.8542142655941911, "train_speed(iter/s)": 0.034897 }, { "epoch": 0.04999273361430025, "grad_norm": 0.11512145400047302, "learning_rate": 0.0002988416988416988, "loss": 0.5082737803459167, "memory(GiB)": 78.26, "step": 258, "token_acc": 0.8569431737318288, "train_speed(iter/s)": 0.034898 }, { "epoch": 0.05018650389962699, "grad_norm": 0.12492549419403076, "learning_rate": 0.0003, "loss": 0.5232591032981873, "memory(GiB)": 78.26, "step": 259, "token_acc": 0.8529347048792563, "train_speed(iter/s)": 0.034899 }, { "epoch": 0.050380274184953736, "grad_norm": 0.1214490681886673, "learning_rate": 0.0002999999691954846, "loss": 0.538935124874115, "memory(GiB)": 78.26, "step": 260, "token_acc": 0.848226576457839, "train_speed(iter/s)": 0.034901 }, { "epoch": 0.05057404447028048, "grad_norm": 0.12162759155035019, "learning_rate": 0.0002999998767819513, "loss": 0.5226523876190186, "memory(GiB)": 78.26, "step": 261, "token_acc": 0.8527364343343139, "train_speed(iter/s)": 0.034904 }, { "epoch": 0.05076781475560723, "grad_norm": 0.14032401144504547, "learning_rate": 0.0002999997227594379, "loss": 0.5270801782608032, "memory(GiB)": 78.26, "step": 262, "token_acc": 0.8529345116700657, "train_speed(iter/s)": 0.034905 }, { "epoch": 0.050961585040933975, "grad_norm": 0.12755590677261353, "learning_rate": 0.00029999950712800773, "loss": 0.5621036887168884, "memory(GiB)": 78.26, "step": 263, "token_acc": 0.8452567559094614, "train_speed(iter/s)": 0.034908 }, { "epoch": 0.05115535532626072, "grad_norm": 0.12125645577907562, "learning_rate": 0.0002999992298877494, "loss": 0.48772531747817993, "memory(GiB)": 78.26, "step": 264, "token_acc": 0.860484942704546, "train_speed(iter/s)": 0.03491 }, { "epoch": 0.051349125611587464, "grad_norm": 0.14849497377872467, "learning_rate": 0.00029999889103877667, "loss": 0.5987675189971924, "memory(GiB)": 78.26, "step": 265, "token_acc": 0.8323770133690618, "train_speed(iter/s)": 0.034912 }, { "epoch": 0.05154289589691421, "grad_norm": 0.11173395067453384, "learning_rate": 0.00029999849058122874, "loss": 0.4839743971824646, "memory(GiB)": 78.26, "step": 266, "token_acc": 0.8618188850746991, "train_speed(iter/s)": 0.034913 }, { "epoch": 0.05173666618224095, "grad_norm": 0.13394448161125183, "learning_rate": 0.0002999980285152701, "loss": 0.5341227054595947, "memory(GiB)": 78.26, "step": 267, "token_acc": 0.8524495008126306, "train_speed(iter/s)": 0.034913 }, { "epoch": 0.051930436467567696, "grad_norm": 0.13987241685390472, "learning_rate": 0.0002999975048410906, "loss": 0.5620037913322449, "memory(GiB)": 78.26, "step": 268, "token_acc": 0.8440742478752803, "train_speed(iter/s)": 0.034913 }, { "epoch": 0.05212420675289444, "grad_norm": 0.12064801156520844, "learning_rate": 0.0002999969195589052, "loss": 0.5743050575256348, "memory(GiB)": 78.26, "step": 269, "token_acc": 0.8411526254595733, "train_speed(iter/s)": 0.034916 }, { "epoch": 0.05231797703822119, "grad_norm": 0.1192815750837326, "learning_rate": 0.00029999627266895444, "loss": 0.497215211391449, "memory(GiB)": 78.26, "step": 270, "token_acc": 0.8576876267748479, "train_speed(iter/s)": 0.034917 }, { "epoch": 0.052511747323547936, "grad_norm": 0.11295323818922043, "learning_rate": 0.0002999955641715039, "loss": 0.46060121059417725, "memory(GiB)": 78.26, "step": 271, "token_acc": 0.8695854936493567, "train_speed(iter/s)": 0.034918 }, { "epoch": 0.05270551760887468, "grad_norm": 0.11337928473949432, "learning_rate": 0.00029999479406684466, "loss": 0.47304582595825195, "memory(GiB)": 78.26, "step": 272, "token_acc": 0.8662467580585402, "train_speed(iter/s)": 0.034918 }, { "epoch": 0.052899287894201424, "grad_norm": 0.13473428785800934, "learning_rate": 0.000299993962355293, "loss": 0.5001351833343506, "memory(GiB)": 78.26, "step": 273, "token_acc": 0.8586831727649379, "train_speed(iter/s)": 0.03492 }, { "epoch": 0.05309305817952817, "grad_norm": 0.12432650476694107, "learning_rate": 0.00029999306903719043, "loss": 0.49999624490737915, "memory(GiB)": 78.26, "step": 274, "token_acc": 0.8596558122982282, "train_speed(iter/s)": 0.034921 }, { "epoch": 0.05328682846485491, "grad_norm": 0.12385527044534683, "learning_rate": 0.0002999921141129039, "loss": 0.5260789394378662, "memory(GiB)": 78.26, "step": 275, "token_acc": 0.8528883832638099, "train_speed(iter/s)": 0.034922 }, { "epoch": 0.05348059875018166, "grad_norm": 0.1163427084684372, "learning_rate": 0.00029999109758282577, "loss": 0.5076729655265808, "memory(GiB)": 78.26, "step": 276, "token_acc": 0.8572664593754225, "train_speed(iter/s)": 0.034923 }, { "epoch": 0.0536743690355084, "grad_norm": 0.10747300833463669, "learning_rate": 0.0002999900194473734, "loss": 0.4819478392601013, "memory(GiB)": 78.26, "step": 277, "token_acc": 0.8656782763309278, "train_speed(iter/s)": 0.034925 }, { "epoch": 0.05386813932083515, "grad_norm": 0.12642782926559448, "learning_rate": 0.00029998887970698966, "loss": 0.5688496232032776, "memory(GiB)": 78.26, "step": 278, "token_acc": 0.8424845950704225, "train_speed(iter/s)": 0.034926 }, { "epoch": 0.0540619096061619, "grad_norm": 0.11862245947122574, "learning_rate": 0.00029998767836214265, "loss": 0.5431630611419678, "memory(GiB)": 78.26, "step": 279, "token_acc": 0.8460705036731674, "train_speed(iter/s)": 0.034927 }, { "epoch": 0.05425567989148864, "grad_norm": 0.10874070972204208, "learning_rate": 0.00029998641541332583, "loss": 0.446528822183609, "memory(GiB)": 78.26, "step": 280, "token_acc": 0.875025387583779, "train_speed(iter/s)": 0.034929 }, { "epoch": 0.054449450176815385, "grad_norm": 0.1398598551750183, "learning_rate": 0.0002999850908610579, "loss": 0.5694330334663391, "memory(GiB)": 78.26, "step": 281, "token_acc": 0.841722914998284, "train_speed(iter/s)": 0.03493 }, { "epoch": 0.05464322046214213, "grad_norm": 0.11889787018299103, "learning_rate": 0.00029998370470588287, "loss": 0.4838942885398865, "memory(GiB)": 78.26, "step": 282, "token_acc": 0.8644304682040531, "train_speed(iter/s)": 0.034932 }, { "epoch": 0.054836990747468874, "grad_norm": 0.10652502626180649, "learning_rate": 0.00029998225694837015, "loss": 0.4832991063594818, "memory(GiB)": 78.26, "step": 283, "token_acc": 0.8605131303975627, "train_speed(iter/s)": 0.034933 }, { "epoch": 0.05503076103279562, "grad_norm": 0.1357640027999878, "learning_rate": 0.0002999807475891143, "loss": 0.5402747392654419, "memory(GiB)": 78.26, "step": 284, "token_acc": 0.8523422441967653, "train_speed(iter/s)": 0.034934 }, { "epoch": 0.05522453131812237, "grad_norm": 0.12577500939369202, "learning_rate": 0.00029997917662873526, "loss": 0.5225556492805481, "memory(GiB)": 78.26, "step": 285, "token_acc": 0.8524319637512936, "train_speed(iter/s)": 0.034935 }, { "epoch": 0.05541830160344911, "grad_norm": 0.11081529408693314, "learning_rate": 0.0002999775440678783, "loss": 0.47770658135414124, "memory(GiB)": 78.26, "step": 286, "token_acc": 0.8655869779677737, "train_speed(iter/s)": 0.034937 }, { "epoch": 0.05561207188877586, "grad_norm": 0.11565393954515457, "learning_rate": 0.00029997584990721396, "loss": 0.5191536545753479, "memory(GiB)": 78.26, "step": 287, "token_acc": 0.8520145631067961, "train_speed(iter/s)": 0.034937 }, { "epoch": 0.0558058421741026, "grad_norm": 0.1177186667919159, "learning_rate": 0.000299974094147438, "loss": 0.5068661570549011, "memory(GiB)": 78.26, "step": 288, "token_acc": 0.8555767793372644, "train_speed(iter/s)": 0.034939 }, { "epoch": 0.055999612459429346, "grad_norm": 0.11559902131557465, "learning_rate": 0.00029997227678927164, "loss": 0.5000442862510681, "memory(GiB)": 78.26, "step": 289, "token_acc": 0.8562957392033785, "train_speed(iter/s)": 0.03494 }, { "epoch": 0.05619338274475609, "grad_norm": 0.1195712685585022, "learning_rate": 0.0002999703978334613, "loss": 0.5059924125671387, "memory(GiB)": 78.26, "step": 290, "token_acc": 0.8566373608427013, "train_speed(iter/s)": 0.034941 }, { "epoch": 0.056387153030082834, "grad_norm": 0.11978676170110703, "learning_rate": 0.00029996845728077874, "loss": 0.5218163728713989, "memory(GiB)": 78.26, "step": 291, "token_acc": 0.8552988421821216, "train_speed(iter/s)": 0.034943 }, { "epoch": 0.05658092331540958, "grad_norm": 0.11734765022993088, "learning_rate": 0.00029996645513202086, "loss": 0.5239380598068237, "memory(GiB)": 78.26, "step": 292, "token_acc": 0.85505396631614, "train_speed(iter/s)": 0.034945 }, { "epoch": 0.05677469360073633, "grad_norm": 0.11253336071968079, "learning_rate": 0.0002999643913880102, "loss": 0.512554943561554, "memory(GiB)": 78.26, "step": 293, "token_acc": 0.8555791147627883, "train_speed(iter/s)": 0.034947 }, { "epoch": 0.056968463886063074, "grad_norm": 0.1775335669517517, "learning_rate": 0.0002999622660495943, "loss": 0.48465481400489807, "memory(GiB)": 78.26, "step": 294, "token_acc": 0.8632988755461122, "train_speed(iter/s)": 0.034948 }, { "epoch": 0.05716223417138982, "grad_norm": 0.11287941783666611, "learning_rate": 0.0002999600791176461, "loss": 0.5061824321746826, "memory(GiB)": 78.26, "step": 295, "token_acc": 0.8573063443244403, "train_speed(iter/s)": 0.034949 }, { "epoch": 0.05735600445671656, "grad_norm": 0.26354172825813293, "learning_rate": 0.00029995783059306373, "loss": 0.5662236213684082, "memory(GiB)": 78.26, "step": 296, "token_acc": 0.8466373350094281, "train_speed(iter/s)": 0.034951 }, { "epoch": 0.05754977474204331, "grad_norm": 0.4357684552669525, "learning_rate": 0.0002999555204767709, "loss": 0.5678244233131409, "memory(GiB)": 78.26, "step": 297, "token_acc": 0.8449365109151186, "train_speed(iter/s)": 0.034952 }, { "epoch": 0.05774354502737005, "grad_norm": 0.12695299088954926, "learning_rate": 0.00029995314876971627, "loss": 0.5329843759536743, "memory(GiB)": 78.26, "step": 298, "token_acc": 0.8508813455487176, "train_speed(iter/s)": 0.034953 }, { "epoch": 0.057937315312696795, "grad_norm": 0.13667218387126923, "learning_rate": 0.00029995071547287414, "loss": 0.5417138934135437, "memory(GiB)": 78.26, "step": 299, "token_acc": 0.8479577535288818, "train_speed(iter/s)": 0.034954 }, { "epoch": 0.058131085598023546, "grad_norm": 0.10648724436759949, "learning_rate": 0.00029994822058724375, "loss": 0.4887983202934265, "memory(GiB)": 78.26, "step": 300, "token_acc": 0.8605226687784597, "train_speed(iter/s)": 0.034954 }, { "epoch": 0.05832485588335029, "grad_norm": 0.11893422901630402, "learning_rate": 0.00029994566411384993, "loss": 0.505358099937439, "memory(GiB)": 78.26, "step": 301, "token_acc": 0.8587887578336663, "train_speed(iter/s)": 0.034955 }, { "epoch": 0.058518626168677035, "grad_norm": 0.12744036316871643, "learning_rate": 0.0002999430460537427, "loss": 0.5472061634063721, "memory(GiB)": 78.26, "step": 302, "token_acc": 0.8475548152800153, "train_speed(iter/s)": 0.034956 }, { "epoch": 0.05871239645400378, "grad_norm": 0.12299606949090958, "learning_rate": 0.00029994036640799726, "loss": 0.5060437917709351, "memory(GiB)": 78.26, "step": 303, "token_acc": 0.8581589526852902, "train_speed(iter/s)": 0.034956 }, { "epoch": 0.05890616673933052, "grad_norm": 0.11920775473117828, "learning_rate": 0.00029993762517771435, "loss": 0.5124378204345703, "memory(GiB)": 78.26, "step": 304, "token_acc": 0.853845315310405, "train_speed(iter/s)": 0.034957 }, { "epoch": 0.05909993702465727, "grad_norm": 0.11296442151069641, "learning_rate": 0.0002999348223640198, "loss": 0.5146512985229492, "memory(GiB)": 78.26, "step": 305, "token_acc": 0.856233997982776, "train_speed(iter/s)": 0.034958 }, { "epoch": 0.05929370730998401, "grad_norm": 0.11093028634786606, "learning_rate": 0.0002999319579680647, "loss": 0.485245943069458, "memory(GiB)": 78.26, "step": 306, "token_acc": 0.8640687244248123, "train_speed(iter/s)": 0.034959 }, { "epoch": 0.059487477595310756, "grad_norm": 0.11484235525131226, "learning_rate": 0.00029992903199102576, "loss": 0.4985674023628235, "memory(GiB)": 78.26, "step": 307, "token_acc": 0.8578893469527086, "train_speed(iter/s)": 0.03496 }, { "epoch": 0.05968124788063751, "grad_norm": 0.11024551838636398, "learning_rate": 0.00029992604443410456, "loss": 0.4915925860404968, "memory(GiB)": 78.26, "step": 308, "token_acc": 0.8619262990183482, "train_speed(iter/s)": 0.03496 }, { "epoch": 0.05987501816596425, "grad_norm": 0.11654435843229294, "learning_rate": 0.00029992299529852827, "loss": 0.5016142129898071, "memory(GiB)": 78.26, "step": 309, "token_acc": 0.8607442107550769, "train_speed(iter/s)": 0.03496 }, { "epoch": 0.060068788451290996, "grad_norm": 0.10757559537887573, "learning_rate": 0.0002999198845855492, "loss": 0.4506911039352417, "memory(GiB)": 78.26, "step": 310, "token_acc": 0.8732513679968862, "train_speed(iter/s)": 0.03496 }, { "epoch": 0.06026255873661774, "grad_norm": 0.11542216688394547, "learning_rate": 0.00029991671229644503, "loss": 0.47681623697280884, "memory(GiB)": 78.26, "step": 311, "token_acc": 0.8678860712584043, "train_speed(iter/s)": 0.034962 }, { "epoch": 0.060456329021944484, "grad_norm": 0.12022379785776138, "learning_rate": 0.0002999134784325187, "loss": 0.5177062153816223, "memory(GiB)": 78.26, "step": 312, "token_acc": 0.8545514413146945, "train_speed(iter/s)": 0.034964 }, { "epoch": 0.06065009930727123, "grad_norm": 0.11384547501802444, "learning_rate": 0.0002999101829950985, "loss": 0.5101380348205566, "memory(GiB)": 78.26, "step": 313, "token_acc": 0.8565856442943443, "train_speed(iter/s)": 0.034964 }, { "epoch": 0.06084386959259797, "grad_norm": 0.11187402158975601, "learning_rate": 0.0002999068259855378, "loss": 0.5241718888282776, "memory(GiB)": 78.26, "step": 314, "token_acc": 0.8541580041580041, "train_speed(iter/s)": 0.034966 }, { "epoch": 0.061037639877924724, "grad_norm": 0.10513240844011307, "learning_rate": 0.0002999034074052156, "loss": 0.4202113449573517, "memory(GiB)": 78.26, "step": 315, "token_acc": 0.8803593092589247, "train_speed(iter/s)": 0.034966 }, { "epoch": 0.06123141016325147, "grad_norm": 0.11992194503545761, "learning_rate": 0.0002998999272555359, "loss": 0.506824254989624, "memory(GiB)": 78.26, "step": 316, "token_acc": 0.8580272713024398, "train_speed(iter/s)": 0.034967 }, { "epoch": 0.06142518044857821, "grad_norm": 0.11586200445890427, "learning_rate": 0.0002998963855379281, "loss": 0.5369887351989746, "memory(GiB)": 78.26, "step": 317, "token_acc": 0.8491955856933918, "train_speed(iter/s)": 0.034968 }, { "epoch": 0.061618950733904956, "grad_norm": 0.11802522093057632, "learning_rate": 0.0002998927822538469, "loss": 0.5725922584533691, "memory(GiB)": 78.26, "step": 318, "token_acc": 0.8380715205103811, "train_speed(iter/s)": 0.03497 }, { "epoch": 0.0618127210192317, "grad_norm": 0.12824736535549164, "learning_rate": 0.0002998891174047722, "loss": 0.567888617515564, "memory(GiB)": 78.26, "step": 319, "token_acc": 0.841992673992674, "train_speed(iter/s)": 0.034972 }, { "epoch": 0.062006491304558445, "grad_norm": 0.10866405814886093, "learning_rate": 0.00029988539099220937, "loss": 0.4857517182826996, "memory(GiB)": 78.26, "step": 320, "token_acc": 0.8629794688168165, "train_speed(iter/s)": 0.034973 }, { "epoch": 0.06220026158988519, "grad_norm": 0.10862304270267487, "learning_rate": 0.00029988160301768884, "loss": 0.45832955837249756, "memory(GiB)": 78.26, "step": 321, "token_acc": 0.8700757980143056, "train_speed(iter/s)": 0.034974 }, { "epoch": 0.06239403187521193, "grad_norm": 0.11096280068159103, "learning_rate": 0.00029987775348276646, "loss": 0.5003219842910767, "memory(GiB)": 78.26, "step": 322, "token_acc": 0.8605376800379422, "train_speed(iter/s)": 0.034975 }, { "epoch": 0.06258780216053868, "grad_norm": 0.11027340590953827, "learning_rate": 0.0002998738423890234, "loss": 0.4482397139072418, "memory(GiB)": 78.26, "step": 323, "token_acc": 0.8740220554858348, "train_speed(iter/s)": 0.034975 }, { "epoch": 0.06278157244586542, "grad_norm": 0.11176367849111557, "learning_rate": 0.000299869869738066, "loss": 0.5309886932373047, "memory(GiB)": 78.26, "step": 324, "token_acc": 0.8513593064326234, "train_speed(iter/s)": 0.034976 }, { "epoch": 0.06297534273119217, "grad_norm": 0.10626234114170074, "learning_rate": 0.000299865835531526, "loss": 0.4504719376564026, "memory(GiB)": 78.26, "step": 325, "token_acc": 0.8722543040638998, "train_speed(iter/s)": 0.034977 }, { "epoch": 0.06316911301651891, "grad_norm": 0.10870497673749924, "learning_rate": 0.00029986173977106017, "loss": 0.5229367017745972, "memory(GiB)": 78.26, "step": 326, "token_acc": 0.8503923012467308, "train_speed(iter/s)": 0.034978 }, { "epoch": 0.06336288330184567, "grad_norm": 0.11313097178936005, "learning_rate": 0.0002998575824583509, "loss": 0.4801797866821289, "memory(GiB)": 78.26, "step": 327, "token_acc": 0.8636890035268726, "train_speed(iter/s)": 0.034979 }, { "epoch": 0.06355665358717241, "grad_norm": 0.11869515478610992, "learning_rate": 0.0002998533635951058, "loss": 0.5341205596923828, "memory(GiB)": 78.26, "step": 328, "token_acc": 0.8493568134624201, "train_speed(iter/s)": 0.03498 }, { "epoch": 0.06375042387249916, "grad_norm": 0.11751175671815872, "learning_rate": 0.00029984908318305743, "loss": 0.47566699981689453, "memory(GiB)": 78.26, "step": 329, "token_acc": 0.864566263149548, "train_speed(iter/s)": 0.034981 }, { "epoch": 0.0639441941578259, "grad_norm": 0.11967992782592773, "learning_rate": 0.000299844741223964, "loss": 0.4633430540561676, "memory(GiB)": 78.26, "step": 330, "token_acc": 0.8679728375820995, "train_speed(iter/s)": 0.034981 }, { "epoch": 0.06413796444315265, "grad_norm": 0.12265921384096146, "learning_rate": 0.00029984033771960895, "loss": 0.5029769539833069, "memory(GiB)": 78.26, "step": 331, "token_acc": 0.8578765113276207, "train_speed(iter/s)": 0.034982 }, { "epoch": 0.06433173472847939, "grad_norm": 0.11440466344356537, "learning_rate": 0.0002998358726718008, "loss": 0.5016182661056519, "memory(GiB)": 78.26, "step": 332, "token_acc": 0.8598369870713884, "train_speed(iter/s)": 0.034982 }, { "epoch": 0.06452550501380613, "grad_norm": 0.1108684316277504, "learning_rate": 0.0002998313460823735, "loss": 0.5170926451683044, "memory(GiB)": 78.26, "step": 333, "token_acc": 0.8539208882720333, "train_speed(iter/s)": 0.034983 }, { "epoch": 0.06471927529913288, "grad_norm": 0.12245868891477585, "learning_rate": 0.00029982675795318616, "loss": 0.49607276916503906, "memory(GiB)": 78.26, "step": 334, "token_acc": 0.8618406713164778, "train_speed(iter/s)": 0.034983 }, { "epoch": 0.06491304558445962, "grad_norm": 0.11894813925027847, "learning_rate": 0.0002998221082861234, "loss": 0.49732956290245056, "memory(GiB)": 78.26, "step": 335, "token_acc": 0.8582034755649832, "train_speed(iter/s)": 0.034985 }, { "epoch": 0.06510681586978637, "grad_norm": 0.12267972528934479, "learning_rate": 0.0002998173970830949, "loss": 0.5078924298286438, "memory(GiB)": 78.26, "step": 336, "token_acc": 0.8583969800719866, "train_speed(iter/s)": 0.034986 }, { "epoch": 0.06530058615511311, "grad_norm": 0.10648109018802643, "learning_rate": 0.0002998126243460357, "loss": 0.5070585608482361, "memory(GiB)": 78.26, "step": 337, "token_acc": 0.8565583698958306, "train_speed(iter/s)": 0.034987 }, { "epoch": 0.06549435644043985, "grad_norm": 0.11792565882205963, "learning_rate": 0.000299807790076906, "loss": 0.5286574959754944, "memory(GiB)": 78.26, "step": 338, "token_acc": 0.8529562054765698, "train_speed(iter/s)": 0.034989 }, { "epoch": 0.0656881267257666, "grad_norm": 0.12195685505867004, "learning_rate": 0.0002998028942776914, "loss": 0.4869121313095093, "memory(GiB)": 78.26, "step": 339, "token_acc": 0.8612365934096389, "train_speed(iter/s)": 0.03499 }, { "epoch": 0.06588189701109334, "grad_norm": 0.1249171569943428, "learning_rate": 0.0002997979369504028, "loss": 0.5445664525032043, "memory(GiB)": 78.26, "step": 340, "token_acc": 0.8522178660532028, "train_speed(iter/s)": 0.034991 }, { "epoch": 0.06607566729642009, "grad_norm": 0.13457630574703217, "learning_rate": 0.0002997929180970763, "loss": 0.5670903325080872, "memory(GiB)": 78.26, "step": 341, "token_acc": 0.8372440096177763, "train_speed(iter/s)": 0.034991 }, { "epoch": 0.06626943758174685, "grad_norm": 0.11432075500488281, "learning_rate": 0.0002997878377197732, "loss": 0.5261147022247314, "memory(GiB)": 78.26, "step": 342, "token_acc": 0.8552226935312831, "train_speed(iter/s)": 0.034992 }, { "epoch": 0.06646320786707359, "grad_norm": 0.10323546081781387, "learning_rate": 0.00029978269582058015, "loss": 0.4720154404640198, "memory(GiB)": 78.26, "step": 343, "token_acc": 0.8659866148531952, "train_speed(iter/s)": 0.034992 }, { "epoch": 0.06665697815240033, "grad_norm": 0.11725510656833649, "learning_rate": 0.0002997774924016092, "loss": 0.5160101652145386, "memory(GiB)": 78.26, "step": 344, "token_acc": 0.8551109929549382, "train_speed(iter/s)": 0.034992 }, { "epoch": 0.06685074843772708, "grad_norm": 0.12052306532859802, "learning_rate": 0.0002997722274649974, "loss": 0.537044107913971, "memory(GiB)": 78.26, "step": 345, "token_acc": 0.8476571428571429, "train_speed(iter/s)": 0.034993 }, { "epoch": 0.06704451872305382, "grad_norm": 0.11849239468574524, "learning_rate": 0.00029976690101290727, "loss": 0.5134192705154419, "memory(GiB)": 78.26, "step": 346, "token_acc": 0.8558187985790219, "train_speed(iter/s)": 0.034995 }, { "epoch": 0.06723828900838057, "grad_norm": 0.12307219952344894, "learning_rate": 0.00029976151304752645, "loss": 0.4876058101654053, "memory(GiB)": 78.26, "step": 347, "token_acc": 0.8650527622594661, "train_speed(iter/s)": 0.034996 }, { "epoch": 0.06743205929370731, "grad_norm": 0.11402394622564316, "learning_rate": 0.00029975606357106804, "loss": 0.47068169713020325, "memory(GiB)": 78.26, "step": 348, "token_acc": 0.8668408661682147, "train_speed(iter/s)": 0.034998 }, { "epoch": 0.06762582957903406, "grad_norm": 0.12601493299007416, "learning_rate": 0.00029975055258577016, "loss": 0.5161094665527344, "memory(GiB)": 78.26, "step": 349, "token_acc": 0.8581284381363401, "train_speed(iter/s)": 0.034999 }, { "epoch": 0.0678195998643608, "grad_norm": 0.12433503568172455, "learning_rate": 0.0002997449800938964, "loss": 0.5517579913139343, "memory(GiB)": 78.26, "step": 350, "token_acc": 0.8489635649712355, "train_speed(iter/s)": 0.035001 }, { "epoch": 0.06801337014968754, "grad_norm": 0.11672255396842957, "learning_rate": 0.0002997393460977355, "loss": 0.49452394247055054, "memory(GiB)": 78.26, "step": 351, "token_acc": 0.8589091777061701, "train_speed(iter/s)": 0.035002 }, { "epoch": 0.06820714043501429, "grad_norm": 0.11498520523309708, "learning_rate": 0.00029973365059960153, "loss": 0.5128313302993774, "memory(GiB)": 78.26, "step": 352, "token_acc": 0.8535805294087055, "train_speed(iter/s)": 0.035002 }, { "epoch": 0.06840091072034103, "grad_norm": 0.14143286645412445, "learning_rate": 0.00029972789360183376, "loss": 0.5255135893821716, "memory(GiB)": 78.26, "step": 353, "token_acc": 0.8556857047731469, "train_speed(iter/s)": 0.035003 }, { "epoch": 0.06859468100566778, "grad_norm": 0.12254343181848526, "learning_rate": 0.00029972207510679675, "loss": 0.5137450695037842, "memory(GiB)": 78.26, "step": 354, "token_acc": 0.8559991402933749, "train_speed(iter/s)": 0.035004 }, { "epoch": 0.06878845129099452, "grad_norm": 0.11112511903047562, "learning_rate": 0.0002997161951168803, "loss": 0.46960967779159546, "memory(GiB)": 78.26, "step": 355, "token_acc": 0.8666562937606971, "train_speed(iter/s)": 0.035004 }, { "epoch": 0.06898222157632126, "grad_norm": 0.11723387986421585, "learning_rate": 0.0002997102536344995, "loss": 0.5213668346405029, "memory(GiB)": 78.26, "step": 356, "token_acc": 0.8504984318996416, "train_speed(iter/s)": 0.035005 }, { "epoch": 0.06917599186164802, "grad_norm": 0.11225343495607376, "learning_rate": 0.0002997042506620946, "loss": 0.4935987591743469, "memory(GiB)": 78.26, "step": 357, "token_acc": 0.8581589163069739, "train_speed(iter/s)": 0.035007 }, { "epoch": 0.06936976214697477, "grad_norm": 0.11436620354652405, "learning_rate": 0.0002996981862021313, "loss": 0.47615599632263184, "memory(GiB)": 78.26, "step": 358, "token_acc": 0.8642991737933033, "train_speed(iter/s)": 0.035007 }, { "epoch": 0.06956353243230151, "grad_norm": 0.12642012536525726, "learning_rate": 0.00029969206025710037, "loss": 0.5081407427787781, "memory(GiB)": 78.26, "step": 359, "token_acc": 0.8585974082543831, "train_speed(iter/s)": 0.035008 }, { "epoch": 0.06975730271762826, "grad_norm": 0.1168803945183754, "learning_rate": 0.0002996858728295179, "loss": 0.4714093804359436, "memory(GiB)": 78.26, "step": 360, "token_acc": 0.8668951045236009, "train_speed(iter/s)": 0.035008 }, { "epoch": 0.069951073002955, "grad_norm": 0.11019770056009293, "learning_rate": 0.00029967962392192526, "loss": 0.5050376057624817, "memory(GiB)": 78.26, "step": 361, "token_acc": 0.8604877186782117, "train_speed(iter/s)": 0.03501 }, { "epoch": 0.07014484328828174, "grad_norm": 0.12403653562068939, "learning_rate": 0.000299673313536889, "loss": 0.49756452441215515, "memory(GiB)": 78.26, "step": 362, "token_acc": 0.8619797028974849, "train_speed(iter/s)": 0.035011 }, { "epoch": 0.07033861357360849, "grad_norm": 0.10632438957691193, "learning_rate": 0.00029966694167700105, "loss": 0.50919508934021, "memory(GiB)": 78.26, "step": 363, "token_acc": 0.8557894184337602, "train_speed(iter/s)": 0.035011 }, { "epoch": 0.07053238385893523, "grad_norm": 0.12035337090492249, "learning_rate": 0.0002996605083448784, "loss": 0.5421941876411438, "memory(GiB)": 78.26, "step": 364, "token_acc": 0.8479168897932955, "train_speed(iter/s)": 0.035012 }, { "epoch": 0.07072615414426198, "grad_norm": 0.11031734943389893, "learning_rate": 0.00029965401354316345, "loss": 0.48488667607307434, "memory(GiB)": 78.26, "step": 365, "token_acc": 0.8627623778240666, "train_speed(iter/s)": 0.035013 }, { "epoch": 0.07091992442958872, "grad_norm": 0.10722577571868896, "learning_rate": 0.00029964745727452375, "loss": 0.45194217562675476, "memory(GiB)": 78.26, "step": 366, "token_acc": 0.8705380798689725, "train_speed(iter/s)": 0.035013 }, { "epoch": 0.07111369471491547, "grad_norm": 0.10588467866182327, "learning_rate": 0.0002996408395416521, "loss": 0.45176932215690613, "memory(GiB)": 78.26, "step": 367, "token_acc": 0.8708429432333904, "train_speed(iter/s)": 0.035014 }, { "epoch": 0.07130746500024221, "grad_norm": 0.10778294503688812, "learning_rate": 0.0002996341603472668, "loss": 0.501011312007904, "memory(GiB)": 78.26, "step": 368, "token_acc": 0.8594550505951623, "train_speed(iter/s)": 0.035014 }, { "epoch": 0.07150123528556895, "grad_norm": 0.12546683847904205, "learning_rate": 0.00029962741969411096, "loss": 0.4865407943725586, "memory(GiB)": 78.26, "step": 369, "token_acc": 0.8665692482545868, "train_speed(iter/s)": 0.035016 }, { "epoch": 0.0716950055708957, "grad_norm": 0.11863457411527634, "learning_rate": 0.0002996206175849532, "loss": 0.5059066414833069, "memory(GiB)": 78.26, "step": 370, "token_acc": 0.8579709417580488, "train_speed(iter/s)": 0.035017 }, { "epoch": 0.07188877585622244, "grad_norm": 0.1237197294831276, "learning_rate": 0.0002996137540225873, "loss": 0.5302804708480835, "memory(GiB)": 78.26, "step": 371, "token_acc": 0.8513494809688581, "train_speed(iter/s)": 0.035018 }, { "epoch": 0.0720825461415492, "grad_norm": 0.11564111709594727, "learning_rate": 0.0002996068290098324, "loss": 0.4804614186286926, "memory(GiB)": 78.26, "step": 372, "token_acc": 0.8635471113692303, "train_speed(iter/s)": 0.035019 }, { "epoch": 0.07227631642687594, "grad_norm": 0.115287646651268, "learning_rate": 0.0002995998425495327, "loss": 0.5029768943786621, "memory(GiB)": 78.26, "step": 373, "token_acc": 0.8573006711038692, "train_speed(iter/s)": 0.035019 }, { "epoch": 0.07247008671220269, "grad_norm": 0.11475943773984909, "learning_rate": 0.0002995927946445578, "loss": 0.4446421265602112, "memory(GiB)": 78.26, "step": 374, "token_acc": 0.8731128990987405, "train_speed(iter/s)": 0.035021 }, { "epoch": 0.07266385699752943, "grad_norm": 0.11748611181974411, "learning_rate": 0.00029958568529780245, "loss": 0.4998936653137207, "memory(GiB)": 78.26, "step": 375, "token_acc": 0.8600776778413737, "train_speed(iter/s)": 0.035021 }, { "epoch": 0.07285762728285618, "grad_norm": 0.11014379560947418, "learning_rate": 0.00029957851451218654, "loss": 0.48282113671302795, "memory(GiB)": 78.26, "step": 376, "token_acc": 0.8650632477795992, "train_speed(iter/s)": 0.035022 }, { "epoch": 0.07305139756818292, "grad_norm": 0.1131962388753891, "learning_rate": 0.0002995712822906554, "loss": 0.47199296951293945, "memory(GiB)": 78.26, "step": 377, "token_acc": 0.8649652360874542, "train_speed(iter/s)": 0.035023 }, { "epoch": 0.07324516785350967, "grad_norm": 0.11221049726009369, "learning_rate": 0.0002995639886361795, "loss": 0.5101888179779053, "memory(GiB)": 78.26, "step": 378, "token_acc": 0.8550319599324829, "train_speed(iter/s)": 0.035024 }, { "epoch": 0.07343893813883641, "grad_norm": 0.1063636839389801, "learning_rate": 0.0002995566335517546, "loss": 0.5004944205284119, "memory(GiB)": 78.26, "step": 379, "token_acc": 0.8585596162973695, "train_speed(iter/s)": 0.035024 }, { "epoch": 0.07363270842416315, "grad_norm": 0.11803896725177765, "learning_rate": 0.00029954921704040147, "loss": 0.510295033454895, "memory(GiB)": 78.26, "step": 380, "token_acc": 0.8577513030528667, "train_speed(iter/s)": 0.035026 }, { "epoch": 0.0738264787094899, "grad_norm": 0.11037638783454895, "learning_rate": 0.00029954173910516635, "loss": 0.44948601722717285, "memory(GiB)": 78.26, "step": 381, "token_acc": 0.8719704952581665, "train_speed(iter/s)": 0.035026 }, { "epoch": 0.07402024899481664, "grad_norm": 0.1230226382613182, "learning_rate": 0.0002995341997491207, "loss": 0.5153728723526001, "memory(GiB)": 78.26, "step": 382, "token_acc": 0.8567678516574045, "train_speed(iter/s)": 0.035026 }, { "epoch": 0.07421401928014339, "grad_norm": 0.11615514755249023, "learning_rate": 0.00029952659897536106, "loss": 0.46452564001083374, "memory(GiB)": 78.26, "step": 383, "token_acc": 0.8670107503877628, "train_speed(iter/s)": 0.035027 }, { "epoch": 0.07440778956547013, "grad_norm": 0.13006049394607544, "learning_rate": 0.00029951893678700927, "loss": 0.506874144077301, "memory(GiB)": 78.26, "step": 384, "token_acc": 0.8588143291124011, "train_speed(iter/s)": 0.035028 }, { "epoch": 0.07460155985079688, "grad_norm": 0.11657480150461197, "learning_rate": 0.00029951121318721243, "loss": 0.49863314628601074, "memory(GiB)": 78.26, "step": 385, "token_acc": 0.8593393170109936, "train_speed(iter/s)": 0.035029 }, { "epoch": 0.07479533013612362, "grad_norm": 0.11655829846858978, "learning_rate": 0.0002995034281791428, "loss": 0.5014276504516602, "memory(GiB)": 78.26, "step": 386, "token_acc": 0.861993529502388, "train_speed(iter/s)": 0.03503 }, { "epoch": 0.07498910042145036, "grad_norm": 0.10777109861373901, "learning_rate": 0.0002994955817659979, "loss": 0.4714622497558594, "memory(GiB)": 78.26, "step": 387, "token_acc": 0.8677407562147482, "train_speed(iter/s)": 0.035031 }, { "epoch": 0.07518287070677712, "grad_norm": 0.12219083309173584, "learning_rate": 0.00029948767395100045, "loss": 0.5111258625984192, "memory(GiB)": 78.26, "step": 388, "token_acc": 0.8558166152672241, "train_speed(iter/s)": 0.035032 }, { "epoch": 0.07537664099210387, "grad_norm": 0.10944923013448715, "learning_rate": 0.00029947970473739844, "loss": 0.4479862451553345, "memory(GiB)": 78.26, "step": 389, "token_acc": 0.8729919678714859, "train_speed(iter/s)": 0.035033 }, { "epoch": 0.07557041127743061, "grad_norm": 0.10940490663051605, "learning_rate": 0.000299471674128465, "loss": 0.49675190448760986, "memory(GiB)": 78.26, "step": 390, "token_acc": 0.8615333689812339, "train_speed(iter/s)": 0.035032 }, { "epoch": 0.07576418156275735, "grad_norm": 0.11179753392934799, "learning_rate": 0.0002994635821274986, "loss": 0.4398466646671295, "memory(GiB)": 78.26, "step": 391, "token_acc": 0.8771611786033172, "train_speed(iter/s)": 0.035033 }, { "epoch": 0.0759579518480841, "grad_norm": 0.12613913416862488, "learning_rate": 0.0002994554287378227, "loss": 0.5353314876556396, "memory(GiB)": 78.26, "step": 392, "token_acc": 0.8493086152908249, "train_speed(iter/s)": 0.035033 }, { "epoch": 0.07615172213341084, "grad_norm": 0.11761578917503357, "learning_rate": 0.00029944721396278623, "loss": 0.5115870237350464, "memory(GiB)": 78.26, "step": 393, "token_acc": 0.8544060286040279, "train_speed(iter/s)": 0.035034 }, { "epoch": 0.07634549241873759, "grad_norm": 0.11017254739999771, "learning_rate": 0.0002994389378057632, "loss": 0.48874709010124207, "memory(GiB)": 78.26, "step": 394, "token_acc": 0.8645016225055574, "train_speed(iter/s)": 0.035034 }, { "epoch": 0.07653926270406433, "grad_norm": 0.10908011347055435, "learning_rate": 0.00029943060027015276, "loss": 0.47319239377975464, "memory(GiB)": 78.26, "step": 395, "token_acc": 0.8659232780237955, "train_speed(iter/s)": 0.035034 }, { "epoch": 0.07673303298939108, "grad_norm": 0.11207929253578186, "learning_rate": 0.0002994222013593795, "loss": 0.45064985752105713, "memory(GiB)": 78.26, "step": 396, "token_acc": 0.8735906331309627, "train_speed(iter/s)": 0.035035 }, { "epoch": 0.07692680327471782, "grad_norm": 0.11263241618871689, "learning_rate": 0.000299413741076893, "loss": 0.49612677097320557, "memory(GiB)": 78.26, "step": 397, "token_acc": 0.8607561516527616, "train_speed(iter/s)": 0.035036 }, { "epoch": 0.07712057356004456, "grad_norm": 0.12468399852514267, "learning_rate": 0.0002994052194261681, "loss": 0.530525267124176, "memory(GiB)": 78.26, "step": 398, "token_acc": 0.8516811113159998, "train_speed(iter/s)": 0.035037 }, { "epoch": 0.07731434384537131, "grad_norm": 0.10838499665260315, "learning_rate": 0.00029939663641070496, "loss": 0.46940740942955017, "memory(GiB)": 78.26, "step": 399, "token_acc": 0.8676767140734452, "train_speed(iter/s)": 0.035037 }, { "epoch": 0.07750811413069805, "grad_norm": 0.107694610953331, "learning_rate": 0.0002993879920340288, "loss": 0.4813489019870758, "memory(GiB)": 78.26, "step": 400, "token_acc": 0.8637204826412022, "train_speed(iter/s)": 0.035037 }, { "epoch": 0.0777018844160248, "grad_norm": 0.10585100203752518, "learning_rate": 0.00029937928629969007, "loss": 0.4711493253707886, "memory(GiB)": 78.26, "step": 401, "token_acc": 0.868919624217119, "train_speed(iter/s)": 0.034999 }, { "epoch": 0.07789565470135154, "grad_norm": 0.12802836298942566, "learning_rate": 0.0002993705192112645, "loss": 0.537087619304657, "memory(GiB)": 78.26, "step": 402, "token_acc": 0.8504514311327399, "train_speed(iter/s)": 0.035001 }, { "epoch": 0.0780894249866783, "grad_norm": 0.10905808210372925, "learning_rate": 0.00029936169077235294, "loss": 0.46871036291122437, "memory(GiB)": 78.26, "step": 403, "token_acc": 0.8674802147324147, "train_speed(iter/s)": 0.035002 }, { "epoch": 0.07828319527200504, "grad_norm": 0.1310214251279831, "learning_rate": 0.0002993528009865815, "loss": 0.5204190611839294, "memory(GiB)": 78.26, "step": 404, "token_acc": 0.8552753875213625, "train_speed(iter/s)": 0.035003 }, { "epoch": 0.07847696555733179, "grad_norm": 0.10810894519090652, "learning_rate": 0.0002993438498576014, "loss": 0.45181161165237427, "memory(GiB)": 78.26, "step": 405, "token_acc": 0.8717410764238559, "train_speed(iter/s)": 0.035004 }, { "epoch": 0.07867073584265853, "grad_norm": 0.10277310758829117, "learning_rate": 0.0002993348373890891, "loss": 0.4377118945121765, "memory(GiB)": 78.26, "step": 406, "token_acc": 0.8732325819672131, "train_speed(iter/s)": 0.035005 }, { "epoch": 0.07886450612798528, "grad_norm": 0.1221415251493454, "learning_rate": 0.0002993257635847464, "loss": 0.5307734608650208, "memory(GiB)": 78.26, "step": 407, "token_acc": 0.8525828880078148, "train_speed(iter/s)": 0.035005 }, { "epoch": 0.07905827641331202, "grad_norm": 0.11799792945384979, "learning_rate": 0.0002993166284483, "loss": 0.5113755464553833, "memory(GiB)": 78.26, "step": 408, "token_acc": 0.8569156381218984, "train_speed(iter/s)": 0.035006 }, { "epoch": 0.07925204669863876, "grad_norm": 0.11527646332979202, "learning_rate": 0.000299307431983502, "loss": 0.45992469787597656, "memory(GiB)": 78.26, "step": 409, "token_acc": 0.8694049499736703, "train_speed(iter/s)": 0.035007 }, { "epoch": 0.07944581698396551, "grad_norm": 0.11302848160266876, "learning_rate": 0.00029929817419412964, "loss": 0.492914617061615, "memory(GiB)": 78.26, "step": 410, "token_acc": 0.8602246439421388, "train_speed(iter/s)": 0.035007 }, { "epoch": 0.07963958726929225, "grad_norm": 0.11444272100925446, "learning_rate": 0.0002992888550839853, "loss": 0.5189880728721619, "memory(GiB)": 78.26, "step": 411, "token_acc": 0.8538719731479262, "train_speed(iter/s)": 0.035009 }, { "epoch": 0.079833357554619, "grad_norm": 0.1215919554233551, "learning_rate": 0.0002992794746568967, "loss": 0.5156800150871277, "memory(GiB)": 78.26, "step": 412, "token_acc": 0.8556100806786991, "train_speed(iter/s)": 0.03501 }, { "epoch": 0.08002712783994574, "grad_norm": 0.10883322358131409, "learning_rate": 0.0002992700329167166, "loss": 0.4452913999557495, "memory(GiB)": 78.26, "step": 413, "token_acc": 0.8729308276689324, "train_speed(iter/s)": 0.03501 }, { "epoch": 0.08022089812527249, "grad_norm": 0.10672541707754135, "learning_rate": 0.00029926052986732285, "loss": 0.4543689787387848, "memory(GiB)": 78.26, "step": 414, "token_acc": 0.8705642256902761, "train_speed(iter/s)": 0.03501 }, { "epoch": 0.08041466841059923, "grad_norm": 0.10857294499874115, "learning_rate": 0.00029925096551261873, "loss": 0.48616546392440796, "memory(GiB)": 78.26, "step": 415, "token_acc": 0.8620314125989207, "train_speed(iter/s)": 0.035011 }, { "epoch": 0.08060843869592597, "grad_norm": 0.11831134557723999, "learning_rate": 0.0002992413398565325, "loss": 0.48336830735206604, "memory(GiB)": 78.26, "step": 416, "token_acc": 0.8604266578468662, "train_speed(iter/s)": 0.035011 }, { "epoch": 0.08080220898125272, "grad_norm": 0.12000903487205505, "learning_rate": 0.0002992316529030178, "loss": 0.49910253286361694, "memory(GiB)": 78.26, "step": 417, "token_acc": 0.857624620965228, "train_speed(iter/s)": 0.035012 }, { "epoch": 0.08099597926657948, "grad_norm": 0.11360272020101547, "learning_rate": 0.0002992219046560532, "loss": 0.4726894199848175, "memory(GiB)": 78.26, "step": 418, "token_acc": 0.8649808638600328, "train_speed(iter/s)": 0.035013 }, { "epoch": 0.08118974955190622, "grad_norm": 0.11910063773393631, "learning_rate": 0.0002992120951196426, "loss": 0.49129027128219604, "memory(GiB)": 78.26, "step": 419, "token_acc": 0.8614566125740939, "train_speed(iter/s)": 0.035013 }, { "epoch": 0.08138351983723296, "grad_norm": 0.11072386801242828, "learning_rate": 0.0002992022242978151, "loss": 0.4478093385696411, "memory(GiB)": 78.26, "step": 420, "token_acc": 0.8705150236942144, "train_speed(iter/s)": 0.035014 }, { "epoch": 0.08157729012255971, "grad_norm": 0.11215101927518845, "learning_rate": 0.0002991922921946248, "loss": 0.43505847454071045, "memory(GiB)": 78.26, "step": 421, "token_acc": 0.8739453295629583, "train_speed(iter/s)": 0.035015 }, { "epoch": 0.08177106040788645, "grad_norm": 0.12848572432994843, "learning_rate": 0.0002991822988141512, "loss": 0.4977684020996094, "memory(GiB)": 78.26, "step": 422, "token_acc": 0.8609223368850855, "train_speed(iter/s)": 0.035016 }, { "epoch": 0.0819648306932132, "grad_norm": 0.10522928833961487, "learning_rate": 0.0002991722441604988, "loss": 0.415419340133667, "memory(GiB)": 78.26, "step": 423, "token_acc": 0.880544936757575, "train_speed(iter/s)": 0.035017 }, { "epoch": 0.08215860097853994, "grad_norm": 0.12996944785118103, "learning_rate": 0.00029916212823779723, "loss": 0.4946865737438202, "memory(GiB)": 78.26, "step": 424, "token_acc": 0.8606087064986903, "train_speed(iter/s)": 0.035018 }, { "epoch": 0.08235237126386669, "grad_norm": 0.1072884202003479, "learning_rate": 0.0002991519510502015, "loss": 0.44100552797317505, "memory(GiB)": 78.26, "step": 425, "token_acc": 0.8742977528089888, "train_speed(iter/s)": 0.035018 }, { "epoch": 0.08254614154919343, "grad_norm": 0.11559458076953888, "learning_rate": 0.0002991417126018916, "loss": 0.4932180643081665, "memory(GiB)": 78.26, "step": 426, "token_acc": 0.8621398554887082, "train_speed(iter/s)": 0.035018 }, { "epoch": 0.08273991183452017, "grad_norm": 0.10428472608327866, "learning_rate": 0.00029913141289707277, "loss": 0.44228169322013855, "memory(GiB)": 78.26, "step": 427, "token_acc": 0.8711459857697936, "train_speed(iter/s)": 0.035018 }, { "epoch": 0.08293368211984692, "grad_norm": 0.11177484691143036, "learning_rate": 0.0002991210519399753, "loss": 0.46969670057296753, "memory(GiB)": 78.26, "step": 428, "token_acc": 0.8665406640525826, "train_speed(iter/s)": 0.035019 }, { "epoch": 0.08312745240517366, "grad_norm": 0.12332018464803696, "learning_rate": 0.00029911062973485476, "loss": 0.5035005807876587, "memory(GiB)": 78.26, "step": 429, "token_acc": 0.856846298426235, "train_speed(iter/s)": 0.03502 }, { "epoch": 0.08332122269050041, "grad_norm": 0.11600656807422638, "learning_rate": 0.00029910014628599184, "loss": 0.45422035455703735, "memory(GiB)": 78.26, "step": 430, "token_acc": 0.8697181133128663, "train_speed(iter/s)": 0.035021 }, { "epoch": 0.08351499297582715, "grad_norm": 0.12605808675289154, "learning_rate": 0.0002990896015976924, "loss": 0.48837852478027344, "memory(GiB)": 78.26, "step": 431, "token_acc": 0.8621029303127309, "train_speed(iter/s)": 0.035023 }, { "epoch": 0.0837087632611539, "grad_norm": 0.11456278711557388, "learning_rate": 0.00029907899567428736, "loss": 0.48890623450279236, "memory(GiB)": 78.26, "step": 432, "token_acc": 0.8624089155593656, "train_speed(iter/s)": 0.035022 }, { "epoch": 0.08390253354648065, "grad_norm": 0.11591736972332001, "learning_rate": 0.0002990683285201329, "loss": 0.4833714962005615, "memory(GiB)": 78.26, "step": 433, "token_acc": 0.8619619484549993, "train_speed(iter/s)": 0.035023 }, { "epoch": 0.0840963038318074, "grad_norm": 0.11160556972026825, "learning_rate": 0.00029905760013961024, "loss": 0.5017392635345459, "memory(GiB)": 78.26, "step": 434, "token_acc": 0.857612434705058, "train_speed(iter/s)": 0.035023 }, { "epoch": 0.08429007411713414, "grad_norm": 0.11990434676408768, "learning_rate": 0.000299046810537126, "loss": 0.5158663988113403, "memory(GiB)": 78.26, "step": 435, "token_acc": 0.8545139761525788, "train_speed(iter/s)": 0.035025 }, { "epoch": 0.08448384440246089, "grad_norm": 0.1072223111987114, "learning_rate": 0.0002990359597171115, "loss": 0.4850725531578064, "memory(GiB)": 78.26, "step": 436, "token_acc": 0.8633244854009116, "train_speed(iter/s)": 0.035025 }, { "epoch": 0.08467761468778763, "grad_norm": 0.1028514951467514, "learning_rate": 0.00029902504768402363, "loss": 0.4222199618816376, "memory(GiB)": 78.26, "step": 437, "token_acc": 0.8768839397139292, "train_speed(iter/s)": 0.035025 }, { "epoch": 0.08487138497311437, "grad_norm": 0.12057259678840637, "learning_rate": 0.0002990140744423443, "loss": 0.4803910255432129, "memory(GiB)": 78.26, "step": 438, "token_acc": 0.8642994549535108, "train_speed(iter/s)": 0.035027 }, { "epoch": 0.08506515525844112, "grad_norm": 0.11526720970869064, "learning_rate": 0.0002990030399965803, "loss": 0.505075216293335, "memory(GiB)": 78.26, "step": 439, "token_acc": 0.8569081317921075, "train_speed(iter/s)": 0.035027 }, { "epoch": 0.08525892554376786, "grad_norm": 0.115642249584198, "learning_rate": 0.000298991944351264, "loss": 0.4736385643482208, "memory(GiB)": 78.26, "step": 440, "token_acc": 0.8655344655344656, "train_speed(iter/s)": 0.035027 }, { "epoch": 0.08545269582909461, "grad_norm": 0.10979454219341278, "learning_rate": 0.0002989807875109525, "loss": 0.4606488347053528, "memory(GiB)": 78.26, "step": 441, "token_acc": 0.8691641871787936, "train_speed(iter/s)": 0.035028 }, { "epoch": 0.08564646611442135, "grad_norm": 0.1096881628036499, "learning_rate": 0.0002989695694802284, "loss": 0.46464937925338745, "memory(GiB)": 78.26, "step": 442, "token_acc": 0.8675828444373007, "train_speed(iter/s)": 0.035028 }, { "epoch": 0.0858402363997481, "grad_norm": 0.11683948338031769, "learning_rate": 0.0002989582902636991, "loss": 0.5202597379684448, "memory(GiB)": 78.26, "step": 443, "token_acc": 0.8540174192205939, "train_speed(iter/s)": 0.035029 }, { "epoch": 0.08603400668507484, "grad_norm": 0.11013835668563843, "learning_rate": 0.00029894694986599735, "loss": 0.45811498165130615, "memory(GiB)": 78.26, "step": 444, "token_acc": 0.8691748066748066, "train_speed(iter/s)": 0.035029 }, { "epoch": 0.08622777697040158, "grad_norm": 0.12029381096363068, "learning_rate": 0.0002989355482917809, "loss": 0.5269613862037659, "memory(GiB)": 78.26, "step": 445, "token_acc": 0.8544889657602255, "train_speed(iter/s)": 0.03503 }, { "epoch": 0.08642154725572833, "grad_norm": 0.11967018991708755, "learning_rate": 0.00029892408554573266, "loss": 0.5077260732650757, "memory(GiB)": 78.26, "step": 446, "token_acc": 0.8567639257294429, "train_speed(iter/s)": 0.035031 }, { "epoch": 0.08661531754105507, "grad_norm": 0.1161133274435997, "learning_rate": 0.00029891256163256085, "loss": 0.49124279618263245, "memory(GiB)": 78.26, "step": 447, "token_acc": 0.8613902094308108, "train_speed(iter/s)": 0.035031 }, { "epoch": 0.08680908782638183, "grad_norm": 0.11311367154121399, "learning_rate": 0.0002989009765569985, "loss": 0.4679524898529053, "memory(GiB)": 78.26, "step": 448, "token_acc": 0.8683576233183856, "train_speed(iter/s)": 0.035033 }, { "epoch": 0.08700285811170858, "grad_norm": 0.1288428157567978, "learning_rate": 0.00029888933032380394, "loss": 0.5291860103607178, "memory(GiB)": 78.26, "step": 449, "token_acc": 0.850703275825085, "train_speed(iter/s)": 0.035034 }, { "epoch": 0.08719662839703532, "grad_norm": 0.1069454550743103, "learning_rate": 0.0002988776229377606, "loss": 0.44400641322135925, "memory(GiB)": 78.26, "step": 450, "token_acc": 0.8715773445359871, "train_speed(iter/s)": 0.035035 }, { "epoch": 0.08739039868236206, "grad_norm": 0.12414740771055222, "learning_rate": 0.00029886585440367703, "loss": 0.49137061834335327, "memory(GiB)": 78.26, "step": 451, "token_acc": 0.8623676960997535, "train_speed(iter/s)": 0.035036 }, { "epoch": 0.08758416896768881, "grad_norm": 0.11944427341222763, "learning_rate": 0.0002988540247263869, "loss": 0.4897725582122803, "memory(GiB)": 78.26, "step": 452, "token_acc": 0.8620128910176807, "train_speed(iter/s)": 0.035037 }, { "epoch": 0.08777793925301555, "grad_norm": 0.11256846785545349, "learning_rate": 0.0002988421339107489, "loss": 0.45858433842658997, "memory(GiB)": 78.26, "step": 453, "token_acc": 0.8707719882217386, "train_speed(iter/s)": 0.035037 }, { "epoch": 0.0879717095383423, "grad_norm": 0.11249680817127228, "learning_rate": 0.000298830181961647, "loss": 0.49964413046836853, "memory(GiB)": 78.26, "step": 454, "token_acc": 0.8624331586368352, "train_speed(iter/s)": 0.035038 }, { "epoch": 0.08816547982366904, "grad_norm": 0.11055975407361984, "learning_rate": 0.00029881816888399014, "loss": 0.458358496427536, "memory(GiB)": 78.26, "step": 455, "token_acc": 0.8681250838813582, "train_speed(iter/s)": 0.035039 }, { "epoch": 0.08835925010899578, "grad_norm": 0.11990530788898468, "learning_rate": 0.0002988060946827124, "loss": 0.51871657371521, "memory(GiB)": 78.26, "step": 456, "token_acc": 0.8554156908665106, "train_speed(iter/s)": 0.035039 }, { "epoch": 0.08855302039432253, "grad_norm": 0.13425587117671967, "learning_rate": 0.00029879395936277303, "loss": 0.5187729597091675, "memory(GiB)": 78.26, "step": 457, "token_acc": 0.8552357434987268, "train_speed(iter/s)": 0.03504 }, { "epoch": 0.08874679067964927, "grad_norm": 0.12039537727832794, "learning_rate": 0.0002987817629291563, "loss": 0.5168625116348267, "memory(GiB)": 78.26, "step": 458, "token_acc": 0.852051777849393, "train_speed(iter/s)": 0.035041 }, { "epoch": 0.08894056096497602, "grad_norm": 0.10525672137737274, "learning_rate": 0.0002987695053868716, "loss": 0.47165447473526, "memory(GiB)": 78.26, "step": 459, "token_acc": 0.8652151769798828, "train_speed(iter/s)": 0.03504 }, { "epoch": 0.08913433125030276, "grad_norm": 0.12413229048252106, "learning_rate": 0.00029875718674095346, "loss": 0.4889269471168518, "memory(GiB)": 78.26, "step": 460, "token_acc": 0.8654352837852674, "train_speed(iter/s)": 0.03504 }, { "epoch": 0.0893281015356295, "grad_norm": 0.11871378868818283, "learning_rate": 0.00029874480699646145, "loss": 0.47212713956832886, "memory(GiB)": 78.26, "step": 461, "token_acc": 0.8666805704006882, "train_speed(iter/s)": 0.03504 }, { "epoch": 0.08952187182095625, "grad_norm": 0.11364120990037918, "learning_rate": 0.0002987323661584803, "loss": 0.4883894622325897, "memory(GiB)": 78.26, "step": 462, "token_acc": 0.8616672536642194, "train_speed(iter/s)": 0.035041 }, { "epoch": 0.08971564210628301, "grad_norm": 0.11412378400564194, "learning_rate": 0.00029871986423211976, "loss": 0.48642444610595703, "memory(GiB)": 78.26, "step": 463, "token_acc": 0.8636845270483118, "train_speed(iter/s)": 0.035042 }, { "epoch": 0.08990941239160975, "grad_norm": 0.12467852979898453, "learning_rate": 0.0002987073012225147, "loss": 0.537085235118866, "memory(GiB)": 78.26, "step": 464, "token_acc": 0.851391779396462, "train_speed(iter/s)": 0.035042 }, { "epoch": 0.0901031826769365, "grad_norm": 0.11268845200538635, "learning_rate": 0.00029869467713482516, "loss": 0.5060177445411682, "memory(GiB)": 78.26, "step": 465, "token_acc": 0.8563837385366968, "train_speed(iter/s)": 0.035042 }, { "epoch": 0.09029695296226324, "grad_norm": 0.1154741421341896, "learning_rate": 0.00029868199197423607, "loss": 0.5034860372543335, "memory(GiB)": 78.26, "step": 466, "token_acc": 0.8582380491378634, "train_speed(iter/s)": 0.035043 }, { "epoch": 0.09049072324758999, "grad_norm": 0.11442988365888596, "learning_rate": 0.0002986692457459577, "loss": 0.4907042980194092, "memory(GiB)": 78.26, "step": 467, "token_acc": 0.8586531036135595, "train_speed(iter/s)": 0.035045 }, { "epoch": 0.09068449353291673, "grad_norm": 0.1217266321182251, "learning_rate": 0.00029865643845522515, "loss": 0.5039050579071045, "memory(GiB)": 78.26, "step": 468, "token_acc": 0.859063377160585, "train_speed(iter/s)": 0.035045 }, { "epoch": 0.09087826381824347, "grad_norm": 0.11309139430522919, "learning_rate": 0.00029864357010729885, "loss": 0.459963858127594, "memory(GiB)": 78.26, "step": 469, "token_acc": 0.8677241875771288, "train_speed(iter/s)": 0.035045 }, { "epoch": 0.09107203410357022, "grad_norm": 0.10612141340970993, "learning_rate": 0.00029863064070746406, "loss": 0.4284835755825043, "memory(GiB)": 78.26, "step": 470, "token_acc": 0.8769310129503435, "train_speed(iter/s)": 0.035045 }, { "epoch": 0.09126580438889696, "grad_norm": 0.13069190084934235, "learning_rate": 0.00029861765026103126, "loss": 0.570850670337677, "memory(GiB)": 78.26, "step": 471, "token_acc": 0.8367923900751679, "train_speed(iter/s)": 0.035047 }, { "epoch": 0.0914595746742237, "grad_norm": 0.11362801492214203, "learning_rate": 0.000298604598773336, "loss": 0.47750890254974365, "memory(GiB)": 78.26, "step": 472, "token_acc": 0.8643800970372665, "train_speed(iter/s)": 0.035048 }, { "epoch": 0.09165334495955045, "grad_norm": 0.11324970424175262, "learning_rate": 0.0002985914862497388, "loss": 0.47157877683639526, "memory(GiB)": 78.26, "step": 473, "token_acc": 0.8665956123989698, "train_speed(iter/s)": 0.035049 }, { "epoch": 0.0918471152448772, "grad_norm": 0.11150199174880981, "learning_rate": 0.00029857831269562544, "loss": 0.47126272320747375, "memory(GiB)": 78.26, "step": 474, "token_acc": 0.8654503990877993, "train_speed(iter/s)": 0.035049 }, { "epoch": 0.09204088553020394, "grad_norm": 0.10614330321550369, "learning_rate": 0.00029856507811640667, "loss": 0.4606248736381531, "memory(GiB)": 78.26, "step": 475, "token_acc": 0.8700272670069217, "train_speed(iter/s)": 0.03505 }, { "epoch": 0.09223465581553068, "grad_norm": 0.11242176592350006, "learning_rate": 0.0002985517825175181, "loss": 0.46226733922958374, "memory(GiB)": 78.26, "step": 476, "token_acc": 0.8658786323456982, "train_speed(iter/s)": 0.03505 }, { "epoch": 0.09242842610085743, "grad_norm": 0.11099898815155029, "learning_rate": 0.0002985384259044208, "loss": 0.4800049662590027, "memory(GiB)": 78.26, "step": 477, "token_acc": 0.8640604175921812, "train_speed(iter/s)": 0.03505 }, { "epoch": 0.09262219638618417, "grad_norm": 0.11142181605100632, "learning_rate": 0.0002985250082826005, "loss": 0.48013511300086975, "memory(GiB)": 78.26, "step": 478, "token_acc": 0.8613092239573668, "train_speed(iter/s)": 0.03505 }, { "epoch": 0.09281596667151093, "grad_norm": 0.11062056571245193, "learning_rate": 0.0002985115296575684, "loss": 0.5107330083847046, "memory(GiB)": 78.26, "step": 479, "token_acc": 0.8574547485466223, "train_speed(iter/s)": 0.03505 }, { "epoch": 0.09300973695683767, "grad_norm": 0.10769648849964142, "learning_rate": 0.00029849799003486035, "loss": 0.4654473066329956, "memory(GiB)": 78.26, "step": 480, "token_acc": 0.8700014251104461, "train_speed(iter/s)": 0.035052 }, { "epoch": 0.09320350724216442, "grad_norm": 0.11566044390201569, "learning_rate": 0.00029848438942003746, "loss": 0.5077210664749146, "memory(GiB)": 78.26, "step": 481, "token_acc": 0.857971693966654, "train_speed(iter/s)": 0.035053 }, { "epoch": 0.09339727752749116, "grad_norm": 0.10409853607416153, "learning_rate": 0.00029847072781868597, "loss": 0.4633353352546692, "memory(GiB)": 78.26, "step": 482, "token_acc": 0.8679831912361242, "train_speed(iter/s)": 0.035053 }, { "epoch": 0.0935910478128179, "grad_norm": 0.11067858338356018, "learning_rate": 0.00029845700523641695, "loss": 0.4580146074295044, "memory(GiB)": 78.26, "step": 483, "token_acc": 0.8679867986798679, "train_speed(iter/s)": 0.035054 }, { "epoch": 0.09378481809814465, "grad_norm": 0.11449804157018661, "learning_rate": 0.0002984432216788667, "loss": 0.4559538662433624, "memory(GiB)": 78.26, "step": 484, "token_acc": 0.8718022752208812, "train_speed(iter/s)": 0.035054 }, { "epoch": 0.0939785883834714, "grad_norm": 0.11818825453519821, "learning_rate": 0.0002984293771516965, "loss": 0.5102649331092834, "memory(GiB)": 78.26, "step": 485, "token_acc": 0.8580015707122576, "train_speed(iter/s)": 0.035055 }, { "epoch": 0.09417235866879814, "grad_norm": 0.11284741759300232, "learning_rate": 0.00029841547166059264, "loss": 0.48310020565986633, "memory(GiB)": 78.26, "step": 486, "token_acc": 0.8661896877956481, "train_speed(iter/s)": 0.035056 }, { "epoch": 0.09436612895412488, "grad_norm": 0.12485720217227936, "learning_rate": 0.0002984015052112665, "loss": 0.506294310092926, "memory(GiB)": 78.26, "step": 487, "token_acc": 0.8588789601949635, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09455989923945163, "grad_norm": 0.10859735310077667, "learning_rate": 0.0002983874778094545, "loss": 0.4783238172531128, "memory(GiB)": 78.26, "step": 488, "token_acc": 0.8641949593834618, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09475366952477837, "grad_norm": 0.10904448479413986, "learning_rate": 0.00029837338946091794, "loss": 0.4522710144519806, "memory(GiB)": 78.26, "step": 489, "token_acc": 0.8690276365871533, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09494743981010512, "grad_norm": 0.10352976620197296, "learning_rate": 0.0002983592401714435, "loss": 0.4140471816062927, "memory(GiB)": 78.26, "step": 490, "token_acc": 0.8802987811455075, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09514121009543186, "grad_norm": 0.10894495248794556, "learning_rate": 0.00029834502994684247, "loss": 0.4381650686264038, "memory(GiB)": 78.26, "step": 491, "token_acc": 0.875124829777576, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.0953349803807586, "grad_norm": 0.10618970543146133, "learning_rate": 0.00029833075879295146, "loss": 0.44262564182281494, "memory(GiB)": 78.26, "step": 492, "token_acc": 0.8727838336831989, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09552875066608535, "grad_norm": 0.11135976016521454, "learning_rate": 0.00029831642671563203, "loss": 0.504035472869873, "memory(GiB)": 78.26, "step": 493, "token_acc": 0.8569741773191643, "train_speed(iter/s)": 0.035057 }, { "epoch": 0.09572252095141211, "grad_norm": 0.11450373381376266, "learning_rate": 0.00029830203372077077, "loss": 0.46363234519958496, "memory(GiB)": 78.26, "step": 494, "token_acc": 0.8658075691564747, "train_speed(iter/s)": 0.035058 }, { "epoch": 0.09591629123673885, "grad_norm": 0.1127939373254776, "learning_rate": 0.0002982875798142791, "loss": 0.4617582857608795, "memory(GiB)": 78.26, "step": 495, "token_acc": 0.8662511720258121, "train_speed(iter/s)": 0.035058 }, { "epoch": 0.0961100615220656, "grad_norm": 0.11518420279026031, "learning_rate": 0.00029827306500209387, "loss": 0.45998328924179077, "memory(GiB)": 78.26, "step": 496, "token_acc": 0.8671692940370117, "train_speed(iter/s)": 0.035059 }, { "epoch": 0.09630383180739234, "grad_norm": 0.10046973824501038, "learning_rate": 0.0002982584892901766, "loss": 0.4228544235229492, "memory(GiB)": 78.26, "step": 497, "token_acc": 0.8777845109683675, "train_speed(iter/s)": 0.035059 }, { "epoch": 0.09649760209271908, "grad_norm": 0.10890112817287445, "learning_rate": 0.00029824385268451394, "loss": 0.43357470631599426, "memory(GiB)": 78.26, "step": 498, "token_acc": 0.875686566090231, "train_speed(iter/s)": 0.035059 }, { "epoch": 0.09669137237804583, "grad_norm": 0.1078747883439064, "learning_rate": 0.0002982291551911174, "loss": 0.4050006866455078, "memory(GiB)": 78.26, "step": 499, "token_acc": 0.8838355027744645, "train_speed(iter/s)": 0.035059 }, { "epoch": 0.09688514266337257, "grad_norm": 0.12394701689481735, "learning_rate": 0.0002982143968160238, "loss": 0.42825788259506226, "memory(GiB)": 78.26, "step": 500, "token_acc": 0.8775999553895054, "train_speed(iter/s)": 0.03506 }, { "epoch": 0.09688514266337257, "eval_loss": 0.5390220284461975, "eval_runtime": 1346.167, "eval_samples_per_second": 5.013, "eval_steps_per_second": 5.013, "eval_token_acc": 0.8670573123458688, "step": 500 }, { "epoch": 0.09707891294869932, "grad_norm": 0.11574912816286087, "learning_rate": 0.0002981995775652948, "loss": 0.4860585331916809, "memory(GiB)": 78.26, "step": 501, "token_acc": 0.8617111447871986, "train_speed(iter/s)": 0.03204 }, { "epoch": 0.09727268323402606, "grad_norm": 0.10734662413597107, "learning_rate": 0.000298184697445017, "loss": 0.48478803038597107, "memory(GiB)": 78.26, "step": 502, "token_acc": 0.8619134645052572, "train_speed(iter/s)": 0.032046 }, { "epoch": 0.0974664535193528, "grad_norm": 0.10531099885702133, "learning_rate": 0.00029816975646130206, "loss": 0.40947991609573364, "memory(GiB)": 78.26, "step": 503, "token_acc": 0.8847219204866391, "train_speed(iter/s)": 0.032052 }, { "epoch": 0.09766022380467955, "grad_norm": 0.11789926886558533, "learning_rate": 0.0002981547546202867, "loss": 0.48241478204727173, "memory(GiB)": 78.26, "step": 504, "token_acc": 0.8646450249428649, "train_speed(iter/s)": 0.032058 }, { "epoch": 0.0978539940900063, "grad_norm": 0.12707959115505219, "learning_rate": 0.0002981396919281325, "loss": 0.5034913420677185, "memory(GiB)": 78.26, "step": 505, "token_acc": 0.8592644506741423, "train_speed(iter/s)": 0.032064 }, { "epoch": 0.09804776437533304, "grad_norm": 0.11420562863349915, "learning_rate": 0.0002981245683910262, "loss": 0.4770240783691406, "memory(GiB)": 78.26, "step": 506, "token_acc": 0.8642017671785194, "train_speed(iter/s)": 0.03207 }, { "epoch": 0.09824153466065978, "grad_norm": 0.11006432771682739, "learning_rate": 0.00029810938401517937, "loss": 0.4580528140068054, "memory(GiB)": 78.26, "step": 507, "token_acc": 0.8687396218329851, "train_speed(iter/s)": 0.032076 }, { "epoch": 0.09843530494598653, "grad_norm": 0.11123590171337128, "learning_rate": 0.00029809413880682866, "loss": 0.4647199213504791, "memory(GiB)": 78.26, "step": 508, "token_acc": 0.8687078223879421, "train_speed(iter/s)": 0.032082 }, { "epoch": 0.09862907523131328, "grad_norm": 0.11409325152635574, "learning_rate": 0.00029807883277223573, "loss": 0.44523417949676514, "memory(GiB)": 78.26, "step": 509, "token_acc": 0.8726046297715775, "train_speed(iter/s)": 0.032087 }, { "epoch": 0.09882284551664003, "grad_norm": 0.10299310833215714, "learning_rate": 0.00029806346591768713, "loss": 0.4694310128688812, "memory(GiB)": 78.26, "step": 510, "token_acc": 0.8651645115862245, "train_speed(iter/s)": 0.032093 }, { "epoch": 0.09901661580196677, "grad_norm": 0.11479201167821884, "learning_rate": 0.0002980480382494945, "loss": 0.4353720545768738, "memory(GiB)": 78.26, "step": 511, "token_acc": 0.8747248514197666, "train_speed(iter/s)": 0.032098 }, { "epoch": 0.09921038608729352, "grad_norm": 0.12371563911437988, "learning_rate": 0.0002980325497739943, "loss": 0.4968646466732025, "memory(GiB)": 78.26, "step": 512, "token_acc": 0.8599567167456011, "train_speed(iter/s)": 0.032104 }, { "epoch": 0.09940415637262026, "grad_norm": 0.12062571942806244, "learning_rate": 0.00029801700049754816, "loss": 0.5140507817268372, "memory(GiB)": 78.26, "step": 513, "token_acc": 0.8532771431740046, "train_speed(iter/s)": 0.03211 }, { "epoch": 0.099597926657947, "grad_norm": 0.11467831581830978, "learning_rate": 0.0002980013904265425, "loss": 0.4433932900428772, "memory(GiB)": 78.26, "step": 514, "token_acc": 0.8714406689954483, "train_speed(iter/s)": 0.032115 }, { "epoch": 0.09979169694327375, "grad_norm": 0.10191385447978973, "learning_rate": 0.00029798571956738887, "loss": 0.44032424688339233, "memory(GiB)": 78.26, "step": 515, "token_acc": 0.8739991993594876, "train_speed(iter/s)": 0.032121 }, { "epoch": 0.0999854672286005, "grad_norm": 0.1040392592549324, "learning_rate": 0.00029796998792652366, "loss": 0.43922415375709534, "memory(GiB)": 78.26, "step": 516, "token_acc": 0.8740507096930729, "train_speed(iter/s)": 0.032126 }, { "epoch": 0.10017923751392724, "grad_norm": 0.1119905412197113, "learning_rate": 0.00029795419551040833, "loss": 0.4818880259990692, "memory(GiB)": 78.26, "step": 517, "token_acc": 0.8635854743792163, "train_speed(iter/s)": 0.032131 }, { "epoch": 0.10037300779925398, "grad_norm": 0.10882623493671417, "learning_rate": 0.00029793834232552923, "loss": 0.4754379093647003, "memory(GiB)": 78.26, "step": 518, "token_acc": 0.8650517086330936, "train_speed(iter/s)": 0.032137 }, { "epoch": 0.10056677808458073, "grad_norm": 0.11377903819084167, "learning_rate": 0.00029792242837839764, "loss": 0.4642985165119171, "memory(GiB)": 78.26, "step": 519, "token_acc": 0.8642178631297238, "train_speed(iter/s)": 0.032143 }, { "epoch": 0.10076054836990747, "grad_norm": 0.1146177351474762, "learning_rate": 0.0002979064536755499, "loss": 0.47592130303382874, "memory(GiB)": 78.26, "step": 520, "token_acc": 0.8636443661971831, "train_speed(iter/s)": 0.032148 }, { "epoch": 0.10095431865523422, "grad_norm": 0.1103118285536766, "learning_rate": 0.00029789041822354725, "loss": 0.4815801978111267, "memory(GiB)": 78.26, "step": 521, "token_acc": 0.864503921461087, "train_speed(iter/s)": 0.032153 }, { "epoch": 0.10114808894056096, "grad_norm": 0.10768909007310867, "learning_rate": 0.00029787432202897586, "loss": 0.45158839225769043, "memory(GiB)": 78.26, "step": 522, "token_acc": 0.8713771448091021, "train_speed(iter/s)": 0.032159 }, { "epoch": 0.1013418592258877, "grad_norm": 0.11492832750082016, "learning_rate": 0.00029785816509844687, "loss": 0.510797381401062, "memory(GiB)": 78.26, "step": 523, "token_acc": 0.8555613215706626, "train_speed(iter/s)": 0.032164 }, { "epoch": 0.10153562951121446, "grad_norm": 0.11431033164262772, "learning_rate": 0.00029784194743859635, "loss": 0.4578917324542999, "memory(GiB)": 78.26, "step": 524, "token_acc": 0.8709328500014617, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.1017293997965412, "grad_norm": 0.11284809559583664, "learning_rate": 0.00029782566905608537, "loss": 0.5204124450683594, "memory(GiB)": 78.26, "step": 525, "token_acc": 0.8546162111927511, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.10192317008186795, "grad_norm": 0.10947126150131226, "learning_rate": 0.00029780932995759993, "loss": 0.47740259766578674, "memory(GiB)": 78.26, "step": 526, "token_acc": 0.8662353223100887, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.1021169403671947, "grad_norm": 0.10436630249023438, "learning_rate": 0.0002977929301498508, "loss": 0.41452687978744507, "memory(GiB)": 78.26, "step": 527, "token_acc": 0.8821526069681326, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.10231071065252144, "grad_norm": 0.10980913788080215, "learning_rate": 0.00029777646963957395, "loss": 0.4662986397743225, "memory(GiB)": 78.26, "step": 528, "token_acc": 0.8664124246946333, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.10250448093784818, "grad_norm": 0.12164165824651718, "learning_rate": 0.00029775994843353015, "loss": 0.5069164037704468, "memory(GiB)": 78.26, "step": 529, "token_acc": 0.855151571221513, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.10269825122317493, "grad_norm": 0.10582577437162399, "learning_rate": 0.000297743366538505, "loss": 0.48306143283843994, "memory(GiB)": 78.26, "step": 530, "token_acc": 0.8629473684210527, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.10289202150850167, "grad_norm": 0.10868332535028458, "learning_rate": 0.00029772672396130914, "loss": 0.46425890922546387, "memory(GiB)": 78.26, "step": 531, "token_acc": 0.867730867643638, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.10308579179382842, "grad_norm": 0.10470977425575256, "learning_rate": 0.0002977100207087783, "loss": 0.4376215934753418, "memory(GiB)": 78.26, "step": 532, "token_acc": 0.8725883947271159, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.10327956207915516, "grad_norm": 0.10088858008384705, "learning_rate": 0.0002976932567877728, "loss": 0.44405412673950195, "memory(GiB)": 78.26, "step": 533, "token_acc": 0.8700791271440507, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.1034733323644819, "grad_norm": 0.11034450680017471, "learning_rate": 0.00029767643220517803, "loss": 0.4524500370025635, "memory(GiB)": 78.26, "step": 534, "token_acc": 0.8718685509782921, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.10366710264980865, "grad_norm": 0.1017063558101654, "learning_rate": 0.0002976595469679044, "loss": 0.45182546973228455, "memory(GiB)": 78.26, "step": 535, "token_acc": 0.8708247343881372, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.10386087293513539, "grad_norm": 0.12204176932573318, "learning_rate": 0.000297642601082887, "loss": 0.4296923875808716, "memory(GiB)": 78.26, "step": 536, "token_acc": 0.8769677967208831, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.10405464322046214, "grad_norm": 0.11646491289138794, "learning_rate": 0.00029762559455708606, "loss": 0.5057516694068909, "memory(GiB)": 78.26, "step": 537, "token_acc": 0.8552889453890566, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.10424841350578888, "grad_norm": 0.1092115193605423, "learning_rate": 0.00029760852739748656, "loss": 0.4715476334095001, "memory(GiB)": 78.26, "step": 538, "token_acc": 0.8671120470554063, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.10444218379111564, "grad_norm": 0.10526008903980255, "learning_rate": 0.00029759139961109843, "loss": 0.4452831447124481, "memory(GiB)": 78.26, "step": 539, "token_acc": 0.8757942310248571, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.10463595407644238, "grad_norm": 0.10792571306228638, "learning_rate": 0.00029757421120495657, "loss": 0.4550265669822693, "memory(GiB)": 78.26, "step": 540, "token_acc": 0.8692174444670878, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.10482972436176913, "grad_norm": 0.13169942796230316, "learning_rate": 0.00029755696218612075, "loss": 0.4817619323730469, "memory(GiB)": 78.26, "step": 541, "token_acc": 0.8636959761549925, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.10502349464709587, "grad_norm": 0.1143367663025856, "learning_rate": 0.0002975396525616755, "loss": 0.4964498281478882, "memory(GiB)": 78.26, "step": 542, "token_acc": 0.8611530156476469, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.10521726493242262, "grad_norm": 0.11364971101284027, "learning_rate": 0.0002975222823387304, "loss": 0.5033693313598633, "memory(GiB)": 78.26, "step": 543, "token_acc": 0.8565000138033846, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.10541103521774936, "grad_norm": 0.11851288378238678, "learning_rate": 0.0002975048515244199, "loss": 0.4963114261627197, "memory(GiB)": 78.26, "step": 544, "token_acc": 0.8588933277681713, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.1056048055030761, "grad_norm": 0.10574258863925934, "learning_rate": 0.00029748736012590325, "loss": 0.422842800617218, "memory(GiB)": 78.26, "step": 545, "token_acc": 0.8765757477808995, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.10579857578840285, "grad_norm": 0.11669930815696716, "learning_rate": 0.00029746980815036463, "loss": 0.504793107509613, "memory(GiB)": 78.26, "step": 546, "token_acc": 0.8590873623674248, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.10599234607372959, "grad_norm": 0.11788391321897507, "learning_rate": 0.00029745219560501317, "loss": 0.5016002058982849, "memory(GiB)": 78.26, "step": 547, "token_acc": 0.8547953818015547, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.10618611635905634, "grad_norm": 0.1270827203989029, "learning_rate": 0.0002974345224970828, "loss": 0.49133074283599854, "memory(GiB)": 78.26, "step": 548, "token_acc": 0.8635658391797085, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.10637988664438308, "grad_norm": 0.11762789636850357, "learning_rate": 0.0002974167888338323, "loss": 0.4606111943721771, "memory(GiB)": 78.26, "step": 549, "token_acc": 0.8683581749914666, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.10657365692970983, "grad_norm": 0.11224767565727234, "learning_rate": 0.00029739899462254534, "loss": 0.4845007658004761, "memory(GiB)": 78.26, "step": 550, "token_acc": 0.8617370771520969, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.10676742721503657, "grad_norm": 0.12319989502429962, "learning_rate": 0.00029738113987053057, "loss": 0.5313453078269958, "memory(GiB)": 78.26, "step": 551, "token_acc": 0.8511862414550404, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.10696119750036331, "grad_norm": 0.11050526797771454, "learning_rate": 0.00029736322458512137, "loss": 0.4900805354118347, "memory(GiB)": 78.26, "step": 552, "token_acc": 0.8623392457978925, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.10715496778569006, "grad_norm": 0.10729413479566574, "learning_rate": 0.000297345248773676, "loss": 0.42731186747550964, "memory(GiB)": 78.26, "step": 553, "token_acc": 0.8775543301978592, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.1073487380710168, "grad_norm": 0.11160185933113098, "learning_rate": 0.00029732721244357766, "loss": 0.43972256779670715, "memory(GiB)": 78.26, "step": 554, "token_acc": 0.8772162864568294, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.10754250835634356, "grad_norm": 0.1065671369433403, "learning_rate": 0.0002973091156022343, "loss": 0.4213421940803528, "memory(GiB)": 78.26, "step": 555, "token_acc": 0.8795612510860121, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.1077362786416703, "grad_norm": 0.12415304034948349, "learning_rate": 0.0002972909582570789, "loss": 0.5269736051559448, "memory(GiB)": 78.26, "step": 556, "token_acc": 0.8518212183871481, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.10793004892699705, "grad_norm": 0.11127987504005432, "learning_rate": 0.00029727274041556903, "loss": 0.43144795298576355, "memory(GiB)": 78.26, "step": 557, "token_acc": 0.8748079670107538, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.1081238192123238, "grad_norm": 0.12338504940271378, "learning_rate": 0.0002972544620851873, "loss": 0.5014538764953613, "memory(GiB)": 78.26, "step": 558, "token_acc": 0.8602012135858572, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.10831758949765054, "grad_norm": 0.11730282008647919, "learning_rate": 0.0002972361232734411, "loss": 0.5035312175750732, "memory(GiB)": 78.26, "step": 559, "token_acc": 0.857784964507907, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.10851135978297728, "grad_norm": 0.11394292116165161, "learning_rate": 0.00029721772398786267, "loss": 0.5110200047492981, "memory(GiB)": 78.26, "step": 560, "token_acc": 0.8569026399865478, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.10870513006830403, "grad_norm": 0.10496672242879868, "learning_rate": 0.0002971992642360091, "loss": 0.4619162976741791, "memory(GiB)": 78.26, "step": 561, "token_acc": 0.8678721452368306, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.10889890035363077, "grad_norm": 0.11340762674808502, "learning_rate": 0.0002971807440254623, "loss": 0.4587838649749756, "memory(GiB)": 78.26, "step": 562, "token_acc": 0.869656622284513, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.10909267063895751, "grad_norm": 0.11113093048334122, "learning_rate": 0.0002971621633638291, "loss": 0.4310606122016907, "memory(GiB)": 78.26, "step": 563, "token_acc": 0.8775456994698572, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.10928644092428426, "grad_norm": 0.11764062941074371, "learning_rate": 0.00029714352225874096, "loss": 0.4371579587459564, "memory(GiB)": 78.26, "step": 564, "token_acc": 0.8744286816643912, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.109480211209611, "grad_norm": 0.12444280833005905, "learning_rate": 0.00029712482071785436, "loss": 0.4438256621360779, "memory(GiB)": 78.26, "step": 565, "token_acc": 0.8732139048837705, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.10967398149493775, "grad_norm": 0.1286301612854004, "learning_rate": 0.0002971060587488505, "loss": 0.48562532663345337, "memory(GiB)": 78.26, "step": 566, "token_acc": 0.8602812731310141, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.10986775178026449, "grad_norm": 0.10392966866493225, "learning_rate": 0.00029708723635943536, "loss": 0.41354262828826904, "memory(GiB)": 78.26, "step": 567, "token_acc": 0.8819511093547112, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.11006152206559124, "grad_norm": 0.12217994779348373, "learning_rate": 0.00029706835355733987, "loss": 0.4606168568134308, "memory(GiB)": 78.26, "step": 568, "token_acc": 0.8694803015044706, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.11025529235091798, "grad_norm": 0.10960107296705246, "learning_rate": 0.00029704941035031977, "loss": 0.4657290577888489, "memory(GiB)": 78.26, "step": 569, "token_acc": 0.8707625272331154, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.11044906263624474, "grad_norm": 0.1072949692606926, "learning_rate": 0.0002970304067461554, "loss": 0.47186678647994995, "memory(GiB)": 78.26, "step": 570, "token_acc": 0.8641140360004161, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.11064283292157148, "grad_norm": 0.1146392896771431, "learning_rate": 0.0002970113427526521, "loss": 0.460860013961792, "memory(GiB)": 78.26, "step": 571, "token_acc": 0.8702294444126566, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.11083660320689823, "grad_norm": 0.11065780371427536, "learning_rate": 0.00029699221837764, "loss": 0.4562651515007019, "memory(GiB)": 78.26, "step": 572, "token_acc": 0.8716815289879143, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.11103037349222497, "grad_norm": 0.10410984605550766, "learning_rate": 0.0002969730336289741, "loss": 0.410195916891098, "memory(GiB)": 78.26, "step": 573, "token_acc": 0.8815111480133825, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.11122414377755171, "grad_norm": 0.1023547351360321, "learning_rate": 0.0002969537885145338, "loss": 0.44055572152137756, "memory(GiB)": 78.26, "step": 574, "token_acc": 0.8741572450990561, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.11141791406287846, "grad_norm": 0.11351530253887177, "learning_rate": 0.00029693448304222384, "loss": 0.48370641469955444, "memory(GiB)": 78.26, "step": 575, "token_acc": 0.8646675213198818, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.1116116843482052, "grad_norm": 0.10755743831396103, "learning_rate": 0.0002969151172199734, "loss": 0.4608403444290161, "memory(GiB)": 78.26, "step": 576, "token_acc": 0.8711053970511804, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.11180545463353195, "grad_norm": 0.11767973005771637, "learning_rate": 0.00029689569105573654, "loss": 0.4457208216190338, "memory(GiB)": 78.26, "step": 577, "token_acc": 0.8742761124613676, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.11199922491885869, "grad_norm": 0.10909876972436905, "learning_rate": 0.0002968762045574921, "loss": 0.4398786425590515, "memory(GiB)": 78.26, "step": 578, "token_acc": 0.8746623305095859, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.11219299520418544, "grad_norm": 0.10908223688602448, "learning_rate": 0.0002968566577332438, "loss": 0.4776487350463867, "memory(GiB)": 78.26, "step": 579, "token_acc": 0.8675198893116569, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.11238676548951218, "grad_norm": 0.11661187559366226, "learning_rate": 0.0002968370505910199, "loss": 0.4867633581161499, "memory(GiB)": 78.26, "step": 580, "token_acc": 0.8616142945163278, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.11258053577483892, "grad_norm": 0.10946566611528397, "learning_rate": 0.0002968173831388737, "loss": 0.4538075029850006, "memory(GiB)": 78.26, "step": 581, "token_acc": 0.8708697120254691, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.11277430606016567, "grad_norm": 0.10774602741003036, "learning_rate": 0.00029679765538488315, "loss": 0.4607040286064148, "memory(GiB)": 78.26, "step": 582, "token_acc": 0.8665272975798022, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.11296807634549241, "grad_norm": 0.11124128848314285, "learning_rate": 0.00029677786733715085, "loss": 0.48860326409339905, "memory(GiB)": 78.26, "step": 583, "token_acc": 0.8632434789299546, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.11316184663081916, "grad_norm": 0.11296314746141434, "learning_rate": 0.00029675801900380444, "loss": 0.44037532806396484, "memory(GiB)": 78.26, "step": 584, "token_acc": 0.8732820680628273, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.11335561691614592, "grad_norm": 0.12023019045591354, "learning_rate": 0.000296738110392996, "loss": 0.4782663881778717, "memory(GiB)": 78.26, "step": 585, "token_acc": 0.8658891296908863, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.11354938720147266, "grad_norm": 0.10668137669563293, "learning_rate": 0.0002967181415129027, "loss": 0.46215036511421204, "memory(GiB)": 78.26, "step": 586, "token_acc": 0.870391061452514, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.1137431574867994, "grad_norm": 0.11411837488412857, "learning_rate": 0.00029669811237172615, "loss": 0.48195797204971313, "memory(GiB)": 78.26, "step": 587, "token_acc": 0.8628970636393407, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.11393692777212615, "grad_norm": 0.11026424914598465, "learning_rate": 0.0002966780229776929, "loss": 0.4768543541431427, "memory(GiB)": 78.26, "step": 588, "token_acc": 0.8641793084597512, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.11413069805745289, "grad_norm": 0.09665640443563461, "learning_rate": 0.0002966578733390543, "loss": 0.40953487157821655, "memory(GiB)": 78.26, "step": 589, "token_acc": 0.8820130831304016, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.11432446834277964, "grad_norm": 0.10579517483711243, "learning_rate": 0.00029663766346408623, "loss": 0.4386206269264221, "memory(GiB)": 78.26, "step": 590, "token_acc": 0.8759878419452888, "train_speed(iter/s)": 0.032492 }, { "epoch": 0.11451823862810638, "grad_norm": 0.1087721437215805, "learning_rate": 0.00029661739336108947, "loss": 0.456497460603714, "memory(GiB)": 78.26, "step": 591, "token_acc": 0.8705190602070312, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.11471200891343313, "grad_norm": 0.11170576512813568, "learning_rate": 0.0002965970630383895, "loss": 0.5045605897903442, "memory(GiB)": 78.26, "step": 592, "token_acc": 0.8581936467002697, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.11490577919875987, "grad_norm": 0.10968808829784393, "learning_rate": 0.00029657667250433645, "loss": 0.45187485218048096, "memory(GiB)": 78.26, "step": 593, "token_acc": 0.8707526605975126, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.11509954948408661, "grad_norm": 0.12064143270254135, "learning_rate": 0.00029655622176730543, "loss": 0.48971986770629883, "memory(GiB)": 78.26, "step": 594, "token_acc": 0.8627284832344753, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.11529331976941336, "grad_norm": 0.10990637540817261, "learning_rate": 0.000296535710835696, "loss": 0.46801620721817017, "memory(GiB)": 78.26, "step": 595, "token_acc": 0.8661632053702538, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.1154870900547401, "grad_norm": 0.11462108045816422, "learning_rate": 0.00029651513971793255, "loss": 0.4742628037929535, "memory(GiB)": 78.26, "step": 596, "token_acc": 0.8639128007756265, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.11568086034006685, "grad_norm": 0.10829971730709076, "learning_rate": 0.0002964945084224642, "loss": 0.47385069727897644, "memory(GiB)": 78.26, "step": 597, "token_acc": 0.8652491420019095, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.11587463062539359, "grad_norm": 0.1086253821849823, "learning_rate": 0.00029647381695776474, "loss": 0.4455265998840332, "memory(GiB)": 78.26, "step": 598, "token_acc": 0.872541041111267, "train_speed(iter/s)": 0.032528 }, { "epoch": 0.11606840091072033, "grad_norm": 0.11207325756549835, "learning_rate": 0.0002964530653323328, "loss": 0.4860404133796692, "memory(GiB)": 78.26, "step": 599, "token_acc": 0.8606821106821106, "train_speed(iter/s)": 0.032532 }, { "epoch": 0.11626217119604709, "grad_norm": 0.11669515818357468, "learning_rate": 0.0002964322535546916, "loss": 0.5320515632629395, "memory(GiB)": 78.26, "step": 600, "token_acc": 0.8462377317339149, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.11645594148137384, "grad_norm": 0.10914132744073868, "learning_rate": 0.00029641138163338907, "loss": 0.4786490797996521, "memory(GiB)": 78.26, "step": 601, "token_acc": 0.8650941795350415, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.11664971176670058, "grad_norm": 0.10866682976484299, "learning_rate": 0.0002963904495769978, "loss": 0.4337434768676758, "memory(GiB)": 78.26, "step": 602, "token_acc": 0.875823794658342, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.11684348205202733, "grad_norm": 0.11499880999326706, "learning_rate": 0.0002963694573941153, "loss": 0.46066945791244507, "memory(GiB)": 78.26, "step": 603, "token_acc": 0.8695754925999232, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.11703725233735407, "grad_norm": 0.09975297749042511, "learning_rate": 0.0002963484050933636, "loss": 0.4278182089328766, "memory(GiB)": 78.26, "step": 604, "token_acc": 0.8778897736293872, "train_speed(iter/s)": 0.032533 }, { "epoch": 0.11723102262268081, "grad_norm": 0.11780435591936111, "learning_rate": 0.0002963272926833893, "loss": 0.4575609862804413, "memory(GiB)": 78.26, "step": 605, "token_acc": 0.8685906319290465, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.11742479290800756, "grad_norm": 0.10950589925050735, "learning_rate": 0.00029630612017286393, "loss": 0.43958431482315063, "memory(GiB)": 78.26, "step": 606, "token_acc": 0.8741350906095552, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.1176185631933343, "grad_norm": 0.11566930264234543, "learning_rate": 0.00029628488757048365, "loss": 0.4670230448246002, "memory(GiB)": 78.26, "step": 607, "token_acc": 0.8678823105807314, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.11781233347866105, "grad_norm": 0.11282102763652802, "learning_rate": 0.00029626359488496914, "loss": 0.4526304006576538, "memory(GiB)": 78.26, "step": 608, "token_acc": 0.8723770983213429, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.11800610376398779, "grad_norm": 0.10305222868919373, "learning_rate": 0.0002962422421250661, "loss": 0.437641978263855, "memory(GiB)": 78.26, "step": 609, "token_acc": 0.8741574159921818, "train_speed(iter/s)": 0.032555 }, { "epoch": 0.11819987404931454, "grad_norm": 0.10737847536802292, "learning_rate": 0.0002962208292995444, "loss": 0.46929752826690674, "memory(GiB)": 78.26, "step": 610, "token_acc": 0.8661016542219875, "train_speed(iter/s)": 0.032559 }, { "epoch": 0.11839364433464128, "grad_norm": 0.11264858394861221, "learning_rate": 0.00029619935641719906, "loss": 0.4528808295726776, "memory(GiB)": 78.26, "step": 611, "token_acc": 0.8737685426339032, "train_speed(iter/s)": 0.032563 }, { "epoch": 0.11858741461996802, "grad_norm": 0.10752102732658386, "learning_rate": 0.00029617782348684946, "loss": 0.46101704239845276, "memory(GiB)": 78.26, "step": 612, "token_acc": 0.8676157977176102, "train_speed(iter/s)": 0.032568 }, { "epoch": 0.11878118490529477, "grad_norm": 0.12059959769248962, "learning_rate": 0.00029615623051733986, "loss": 0.4816949665546417, "memory(GiB)": 78.26, "step": 613, "token_acc": 0.8623071979434447, "train_speed(iter/s)": 0.032572 }, { "epoch": 0.11897495519062151, "grad_norm": 0.11858416348695755, "learning_rate": 0.00029613457751753903, "loss": 0.4759201407432556, "memory(GiB)": 78.26, "step": 614, "token_acc": 0.8641525737563264, "train_speed(iter/s)": 0.032576 }, { "epoch": 0.11916872547594827, "grad_norm": 0.11951098591089249, "learning_rate": 0.0002961128644963404, "loss": 0.4854854643344879, "memory(GiB)": 78.26, "step": 615, "token_acc": 0.8598115112756648, "train_speed(iter/s)": 0.032581 }, { "epoch": 0.11936249576127501, "grad_norm": 0.11944432556629181, "learning_rate": 0.0002960910914626621, "loss": 0.49400413036346436, "memory(GiB)": 78.26, "step": 616, "token_acc": 0.8579318625243885, "train_speed(iter/s)": 0.032585 }, { "epoch": 0.11955626604660176, "grad_norm": 0.10993971675634384, "learning_rate": 0.00029606925842544694, "loss": 0.421941339969635, "memory(GiB)": 78.26, "step": 617, "token_acc": 0.8773255961925123, "train_speed(iter/s)": 0.032588 }, { "epoch": 0.1197500363319285, "grad_norm": 0.10016026347875595, "learning_rate": 0.00029604736539366234, "loss": 0.4218504726886749, "memory(GiB)": 78.26, "step": 618, "token_acc": 0.8786656708986806, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.11994380661725525, "grad_norm": 0.11316211521625519, "learning_rate": 0.00029602541237630026, "loss": 0.43415647745132446, "memory(GiB)": 78.26, "step": 619, "token_acc": 0.8750876058008518, "train_speed(iter/s)": 0.032595 }, { "epoch": 0.12013757690258199, "grad_norm": 0.12216460704803467, "learning_rate": 0.00029600339938237746, "loss": 0.4619022309780121, "memory(GiB)": 78.26, "step": 620, "token_acc": 0.867951649787651, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.12033134718790874, "grad_norm": 0.10427816212177277, "learning_rate": 0.0002959813264209353, "loss": 0.4056597054004669, "memory(GiB)": 78.26, "step": 621, "token_acc": 0.8820620175323826, "train_speed(iter/s)": 0.032603 }, { "epoch": 0.12052511747323548, "grad_norm": 0.113120436668396, "learning_rate": 0.0002959591935010397, "loss": 0.4769364595413208, "memory(GiB)": 78.26, "step": 622, "token_acc": 0.8646665349552651, "train_speed(iter/s)": 0.032607 }, { "epoch": 0.12071888775856222, "grad_norm": 0.10840658843517303, "learning_rate": 0.00029593700063178127, "loss": 0.4412975609302521, "memory(GiB)": 78.26, "step": 623, "token_acc": 0.8720108105499385, "train_speed(iter/s)": 0.032611 }, { "epoch": 0.12091265804388897, "grad_norm": 0.11971483379602432, "learning_rate": 0.00029591474782227523, "loss": 0.4542793333530426, "memory(GiB)": 78.26, "step": 624, "token_acc": 0.8705333207178225, "train_speed(iter/s)": 0.032616 }, { "epoch": 0.12110642832921571, "grad_norm": 0.1130179837346077, "learning_rate": 0.00029589243508166136, "loss": 0.48266369104385376, "memory(GiB)": 78.26, "step": 625, "token_acc": 0.8646267140680548, "train_speed(iter/s)": 0.032619 }, { "epoch": 0.12130019861454246, "grad_norm": 0.11181651800870895, "learning_rate": 0.0002958700624191041, "loss": 0.41792306303977966, "memory(GiB)": 78.26, "step": 626, "token_acc": 0.8797664711191335, "train_speed(iter/s)": 0.032623 }, { "epoch": 0.1214939688998692, "grad_norm": 0.11760863661766052, "learning_rate": 0.00029584762984379253, "loss": 0.49022918939590454, "memory(GiB)": 78.26, "step": 627, "token_acc": 0.8612506898254378, "train_speed(iter/s)": 0.032627 }, { "epoch": 0.12168773918519595, "grad_norm": 0.10427683591842651, "learning_rate": 0.00029582513736494027, "loss": 0.4219359755516052, "memory(GiB)": 78.26, "step": 628, "token_acc": 0.8782604333868379, "train_speed(iter/s)": 0.032631 }, { "epoch": 0.12188150947052269, "grad_norm": 0.10674095898866653, "learning_rate": 0.00029580258499178566, "loss": 0.4139460325241089, "memory(GiB)": 78.26, "step": 629, "token_acc": 0.8805075685104946, "train_speed(iter/s)": 0.032635 }, { "epoch": 0.12207527975584945, "grad_norm": 0.1084229126572609, "learning_rate": 0.00029577997273359157, "loss": 0.42610836029052734, "memory(GiB)": 78.26, "step": 630, "token_acc": 0.8765509800395612, "train_speed(iter/s)": 0.032639 }, { "epoch": 0.12226905004117619, "grad_norm": 0.12075889855623245, "learning_rate": 0.00029575730059964534, "loss": 0.47887349128723145, "memory(GiB)": 78.26, "step": 631, "token_acc": 0.8641042884486059, "train_speed(iter/s)": 0.032643 }, { "epoch": 0.12246282032650294, "grad_norm": 0.13688334822654724, "learning_rate": 0.00029573456859925917, "loss": 0.5096902847290039, "memory(GiB)": 78.26, "step": 632, "token_acc": 0.8591979075850044, "train_speed(iter/s)": 0.032647 }, { "epoch": 0.12265659061182968, "grad_norm": 0.09688639640808105, "learning_rate": 0.0002957117767417696, "loss": 0.4519810974597931, "memory(GiB)": 78.26, "step": 633, "token_acc": 0.8703838308049426, "train_speed(iter/s)": 0.032651 }, { "epoch": 0.12285036089715642, "grad_norm": 0.10635363310575485, "learning_rate": 0.0002956889250365379, "loss": 0.4540587067604065, "memory(GiB)": 78.26, "step": 634, "token_acc": 0.8705026824989194, "train_speed(iter/s)": 0.032654 }, { "epoch": 0.12304413118248317, "grad_norm": 0.1146511361002922, "learning_rate": 0.00029566601349294985, "loss": 0.44937869906425476, "memory(GiB)": 78.26, "step": 635, "token_acc": 0.8692985300475573, "train_speed(iter/s)": 0.032658 }, { "epoch": 0.12323790146780991, "grad_norm": 0.11808592826128006, "learning_rate": 0.0002956430421204159, "loss": 0.4610764980316162, "memory(GiB)": 78.26, "step": 636, "token_acc": 0.86909560882543, "train_speed(iter/s)": 0.032663 }, { "epoch": 0.12343167175313666, "grad_norm": 0.11165952682495117, "learning_rate": 0.0002956200109283709, "loss": 0.44256392121315, "memory(GiB)": 78.26, "step": 637, "token_acc": 0.872780487804878, "train_speed(iter/s)": 0.032667 }, { "epoch": 0.1236254420384634, "grad_norm": 0.11056765913963318, "learning_rate": 0.0002955969199262745, "loss": 0.44177088141441345, "memory(GiB)": 78.26, "step": 638, "token_acc": 0.8738199880373443, "train_speed(iter/s)": 0.032671 }, { "epoch": 0.12381921232379015, "grad_norm": 0.12657155096530914, "learning_rate": 0.00029557376912361076, "loss": 0.4548972249031067, "memory(GiB)": 78.26, "step": 639, "token_acc": 0.8688163152817969, "train_speed(iter/s)": 0.032675 }, { "epoch": 0.12401298260911689, "grad_norm": 0.1116451621055603, "learning_rate": 0.00029555055852988836, "loss": 0.44486871361732483, "memory(GiB)": 78.26, "step": 640, "token_acc": 0.8735002147061053, "train_speed(iter/s)": 0.032679 }, { "epoch": 0.12420675289444363, "grad_norm": 0.11732634902000427, "learning_rate": 0.0002955272881546404, "loss": 0.4493526816368103, "memory(GiB)": 78.26, "step": 641, "token_acc": 0.8715556519852462, "train_speed(iter/s)": 0.032683 }, { "epoch": 0.12440052317977038, "grad_norm": 0.11484182626008987, "learning_rate": 0.00029550395800742477, "loss": 0.4882141351699829, "memory(GiB)": 78.26, "step": 642, "token_acc": 0.8629650457276468, "train_speed(iter/s)": 0.032687 }, { "epoch": 0.12459429346509712, "grad_norm": 0.10542822629213333, "learning_rate": 0.0002954805680978237, "loss": 0.4319552779197693, "memory(GiB)": 78.26, "step": 643, "token_acc": 0.8768453037188416, "train_speed(iter/s)": 0.03269 }, { "epoch": 0.12478806375042387, "grad_norm": 0.10408841073513031, "learning_rate": 0.0002954571184354441, "loss": 0.402774840593338, "memory(GiB)": 78.26, "step": 644, "token_acc": 0.8839004707464694, "train_speed(iter/s)": 0.032694 }, { "epoch": 0.12498183403575061, "grad_norm": 0.10431966185569763, "learning_rate": 0.0002954336090299174, "loss": 0.4379886984825134, "memory(GiB)": 78.26, "step": 645, "token_acc": 0.8783049896738991, "train_speed(iter/s)": 0.032698 }, { "epoch": 0.12517560432107736, "grad_norm": 0.11031137406826019, "learning_rate": 0.0002954100398908995, "loss": 0.44032973051071167, "memory(GiB)": 78.26, "step": 646, "token_acc": 0.8727701606958573, "train_speed(iter/s)": 0.032702 }, { "epoch": 0.1253693746064041, "grad_norm": 0.11465989798307419, "learning_rate": 0.000295386411028071, "loss": 0.4586394429206848, "memory(GiB)": 78.26, "step": 647, "token_acc": 0.8680211123783254, "train_speed(iter/s)": 0.032705 }, { "epoch": 0.12556314489173084, "grad_norm": 0.10031208395957947, "learning_rate": 0.0002953627224511367, "loss": 0.4253405034542084, "memory(GiB)": 78.26, "step": 648, "token_acc": 0.8783018191380011, "train_speed(iter/s)": 0.032708 }, { "epoch": 0.1257569151770576, "grad_norm": 0.11166791617870331, "learning_rate": 0.0002953389741698262, "loss": 0.45778605341911316, "memory(GiB)": 78.26, "step": 649, "token_acc": 0.8692058831525433, "train_speed(iter/s)": 0.032712 }, { "epoch": 0.12595068546238433, "grad_norm": 0.13080237805843353, "learning_rate": 0.0002953151661938937, "loss": 0.4996272325515747, "memory(GiB)": 78.26, "step": 650, "token_acc": 0.8591764266135061, "train_speed(iter/s)": 0.032716 }, { "epoch": 0.12614445574771108, "grad_norm": 0.1140795424580574, "learning_rate": 0.00029529129853311765, "loss": 0.4572920799255371, "memory(GiB)": 78.26, "step": 651, "token_acc": 0.8702007294029808, "train_speed(iter/s)": 0.03272 }, { "epoch": 0.12633822603303782, "grad_norm": 0.10730257630348206, "learning_rate": 0.00029526737119730113, "loss": 0.4401112198829651, "memory(GiB)": 78.26, "step": 652, "token_acc": 0.8765418681736673, "train_speed(iter/s)": 0.032724 }, { "epoch": 0.1265319963183646, "grad_norm": 0.10705429315567017, "learning_rate": 0.0002952433841962718, "loss": 0.42639651894569397, "memory(GiB)": 78.26, "step": 653, "token_acc": 0.87889592899469, "train_speed(iter/s)": 0.032727 }, { "epoch": 0.12672576660369134, "grad_norm": 0.10336251556873322, "learning_rate": 0.0002952193375398817, "loss": 0.4257362484931946, "memory(GiB)": 78.26, "step": 654, "token_acc": 0.8773996726028077, "train_speed(iter/s)": 0.032731 }, { "epoch": 0.12691953688901808, "grad_norm": 0.11652205884456635, "learning_rate": 0.0002951952312380075, "loss": 0.4846917986869812, "memory(GiB)": 78.26, "step": 655, "token_acc": 0.8638244781206257, "train_speed(iter/s)": 0.032734 }, { "epoch": 0.12711330717434483, "grad_norm": 0.1111086755990982, "learning_rate": 0.00029517106530055034, "loss": 0.4347532093524933, "memory(GiB)": 78.26, "step": 656, "token_acc": 0.8746101794645484, "train_speed(iter/s)": 0.032738 }, { "epoch": 0.12730707745967157, "grad_norm": 0.11007381230592728, "learning_rate": 0.0002951468397374357, "loss": 0.44459158182144165, "memory(GiB)": 78.26, "step": 657, "token_acc": 0.8722839608558633, "train_speed(iter/s)": 0.032742 }, { "epoch": 0.1275008477449983, "grad_norm": 0.10285302996635437, "learning_rate": 0.00029512255455861375, "loss": 0.3928186297416687, "memory(GiB)": 78.26, "step": 658, "token_acc": 0.8879543480957698, "train_speed(iter/s)": 0.032745 }, { "epoch": 0.12769461803032506, "grad_norm": 0.1072237491607666, "learning_rate": 0.00029509820977405906, "loss": 0.4308614730834961, "memory(GiB)": 78.26, "step": 659, "token_acc": 0.876323788978231, "train_speed(iter/s)": 0.032749 }, { "epoch": 0.1278883883156518, "grad_norm": 0.12245503067970276, "learning_rate": 0.0002950738053937707, "loss": 0.49942547082901, "memory(GiB)": 78.26, "step": 660, "token_acc": 0.8578174506458689, "train_speed(iter/s)": 0.032753 }, { "epoch": 0.12808215860097855, "grad_norm": 0.10673464089632034, "learning_rate": 0.0002950493414277721, "loss": 0.4471551477909088, "memory(GiB)": 78.26, "step": 661, "token_acc": 0.8700834952475975, "train_speed(iter/s)": 0.032756 }, { "epoch": 0.1282759288863053, "grad_norm": 0.11559055000543594, "learning_rate": 0.0002950248178861114, "loss": 0.4593106806278229, "memory(GiB)": 78.26, "step": 662, "token_acc": 0.8703804270157487, "train_speed(iter/s)": 0.032761 }, { "epoch": 0.12846969917163203, "grad_norm": 0.11425399780273438, "learning_rate": 0.000295000234778861, "loss": 0.456471711397171, "memory(GiB)": 78.26, "step": 663, "token_acc": 0.868738057655562, "train_speed(iter/s)": 0.032764 }, { "epoch": 0.12866346945695878, "grad_norm": 0.11667713522911072, "learning_rate": 0.0002949755921161179, "loss": 0.4741016924381256, "memory(GiB)": 78.26, "step": 664, "token_acc": 0.864152462756677, "train_speed(iter/s)": 0.032768 }, { "epoch": 0.12885723974228552, "grad_norm": 0.1101280227303505, "learning_rate": 0.0002949508899080035, "loss": 0.4579119384288788, "memory(GiB)": 78.26, "step": 665, "token_acc": 0.8712681302556577, "train_speed(iter/s)": 0.032772 }, { "epoch": 0.12905101002761227, "grad_norm": 0.10604141652584076, "learning_rate": 0.0002949261281646636, "loss": 0.46181899309158325, "memory(GiB)": 78.26, "step": 666, "token_acc": 0.8664379947229551, "train_speed(iter/s)": 0.032775 }, { "epoch": 0.129244780312939, "grad_norm": 0.11616591364145279, "learning_rate": 0.0002949013068962685, "loss": 0.4584234952926636, "memory(GiB)": 78.26, "step": 667, "token_acc": 0.8690121073872609, "train_speed(iter/s)": 0.032778 }, { "epoch": 0.12943855059826576, "grad_norm": 0.11570385843515396, "learning_rate": 0.00029487642611301305, "loss": 0.4578292965888977, "memory(GiB)": 78.26, "step": 668, "token_acc": 0.8677614050350715, "train_speed(iter/s)": 0.032782 }, { "epoch": 0.1296323208835925, "grad_norm": 0.09622353315353394, "learning_rate": 0.0002948514858251164, "loss": 0.41976287961006165, "memory(GiB)": 78.26, "step": 669, "token_acc": 0.8775399043243499, "train_speed(iter/s)": 0.032785 }, { "epoch": 0.12982609116891924, "grad_norm": 0.10756651312112808, "learning_rate": 0.0002948264860428223, "loss": 0.41081976890563965, "memory(GiB)": 78.26, "step": 670, "token_acc": 0.8800941338537481, "train_speed(iter/s)": 0.032789 }, { "epoch": 0.130019861454246, "grad_norm": 0.11921778321266174, "learning_rate": 0.00029480142677639864, "loss": 0.5204399824142456, "memory(GiB)": 78.26, "step": 671, "token_acc": 0.8542876514459217, "train_speed(iter/s)": 0.032792 }, { "epoch": 0.13021363173957273, "grad_norm": 0.10356691479682922, "learning_rate": 0.00029477630803613806, "loss": 0.4329625368118286, "memory(GiB)": 78.26, "step": 672, "token_acc": 0.8725223288914362, "train_speed(iter/s)": 0.032795 }, { "epoch": 0.13040740202489948, "grad_norm": 0.11215896904468536, "learning_rate": 0.0002947511298323575, "loss": 0.4640699625015259, "memory(GiB)": 78.26, "step": 673, "token_acc": 0.8692367949865711, "train_speed(iter/s)": 0.032799 }, { "epoch": 0.13060117231022622, "grad_norm": 0.10207220911979675, "learning_rate": 0.0002947258921753983, "loss": 0.41831666231155396, "memory(GiB)": 78.26, "step": 674, "token_acc": 0.8800374444184413, "train_speed(iter/s)": 0.032802 }, { "epoch": 0.13079494259555297, "grad_norm": 0.11593617498874664, "learning_rate": 0.0002947005950756262, "loss": 0.4830693006515503, "memory(GiB)": 78.26, "step": 675, "token_acc": 0.8652564297671588, "train_speed(iter/s)": 0.032806 }, { "epoch": 0.1309887128808797, "grad_norm": 0.10835679620504379, "learning_rate": 0.00029467523854343153, "loss": 0.41629844903945923, "memory(GiB)": 78.26, "step": 676, "token_acc": 0.8807510973825394, "train_speed(iter/s)": 0.032809 }, { "epoch": 0.13118248316620645, "grad_norm": 0.1169678345322609, "learning_rate": 0.00029464982258922874, "loss": 0.43599221110343933, "memory(GiB)": 78.26, "step": 677, "token_acc": 0.8760780894853193, "train_speed(iter/s)": 0.032813 }, { "epoch": 0.1313762534515332, "grad_norm": 0.10601246356964111, "learning_rate": 0.00029462434722345697, "loss": 0.43275031447410583, "memory(GiB)": 78.26, "step": 678, "token_acc": 0.8755334681042228, "train_speed(iter/s)": 0.032817 }, { "epoch": 0.13157002373685994, "grad_norm": 0.10723179578781128, "learning_rate": 0.0002945988124565796, "loss": 0.432290643453598, "memory(GiB)": 78.26, "step": 679, "token_acc": 0.8754735278964755, "train_speed(iter/s)": 0.03282 }, { "epoch": 0.1317637940221867, "grad_norm": 0.1098715141415596, "learning_rate": 0.0002945732182990844, "loss": 0.46575891971588135, "memory(GiB)": 78.26, "step": 680, "token_acc": 0.8681277056277056, "train_speed(iter/s)": 0.032823 }, { "epoch": 0.13195756430751343, "grad_norm": 0.10466992110013962, "learning_rate": 0.0002945475647614836, "loss": 0.4427857995033264, "memory(GiB)": 78.26, "step": 681, "token_acc": 0.8734357015159308, "train_speed(iter/s)": 0.032827 }, { "epoch": 0.13215133459284018, "grad_norm": 0.10467381030321121, "learning_rate": 0.0002945218518543138, "loss": 0.4318023920059204, "memory(GiB)": 78.26, "step": 682, "token_acc": 0.8751313343570678, "train_speed(iter/s)": 0.03283 }, { "epoch": 0.13234510487816692, "grad_norm": 0.1041102334856987, "learning_rate": 0.00029449607958813604, "loss": 0.43351155519485474, "memory(GiB)": 78.26, "step": 683, "token_acc": 0.873902149545035, "train_speed(iter/s)": 0.032833 }, { "epoch": 0.1325388751634937, "grad_norm": 0.11077916622161865, "learning_rate": 0.0002944702479735356, "loss": 0.43510738015174866, "memory(GiB)": 78.26, "step": 684, "token_acc": 0.8754168786388559, "train_speed(iter/s)": 0.032837 }, { "epoch": 0.13273264544882044, "grad_norm": 0.10889974236488342, "learning_rate": 0.0002944443570211223, "loss": 0.4274645745754242, "memory(GiB)": 78.26, "step": 685, "token_acc": 0.8780943484353106, "train_speed(iter/s)": 0.032841 }, { "epoch": 0.13292641573414718, "grad_norm": 0.10870851576328278, "learning_rate": 0.00029441840674153017, "loss": 0.47596949338912964, "memory(GiB)": 78.26, "step": 686, "token_acc": 0.863644912544668, "train_speed(iter/s)": 0.032844 }, { "epoch": 0.13312018601947392, "grad_norm": 0.11301931738853455, "learning_rate": 0.0002943923971454177, "loss": 0.46415457129478455, "memory(GiB)": 78.26, "step": 687, "token_acc": 0.8705835224996662, "train_speed(iter/s)": 0.032847 }, { "epoch": 0.13331395630480067, "grad_norm": 0.10821959376335144, "learning_rate": 0.0002943663282434678, "loss": 0.43708160519599915, "memory(GiB)": 78.26, "step": 688, "token_acc": 0.8739687603123969, "train_speed(iter/s)": 0.03285 }, { "epoch": 0.1335077265901274, "grad_norm": 0.1074993684887886, "learning_rate": 0.0002943402000463875, "loss": 0.4149300754070282, "memory(GiB)": 78.26, "step": 689, "token_acc": 0.8803831398155441, "train_speed(iter/s)": 0.032854 }, { "epoch": 0.13370149687545416, "grad_norm": 0.11187649518251419, "learning_rate": 0.0002943140125649086, "loss": 0.4131820797920227, "memory(GiB)": 78.26, "step": 690, "token_acc": 0.8804750593824228, "train_speed(iter/s)": 0.032857 }, { "epoch": 0.1338952671607809, "grad_norm": 0.10465361177921295, "learning_rate": 0.0002942877658097868, "loss": 0.45171868801116943, "memory(GiB)": 78.26, "step": 691, "token_acc": 0.8700957481747215, "train_speed(iter/s)": 0.032861 }, { "epoch": 0.13408903744610765, "grad_norm": 0.12278591841459274, "learning_rate": 0.00029426145979180243, "loss": 0.4629440903663635, "memory(GiB)": 78.26, "step": 692, "token_acc": 0.8690040563923638, "train_speed(iter/s)": 0.032864 }, { "epoch": 0.1342828077314344, "grad_norm": 0.10109657794237137, "learning_rate": 0.00029423509452176005, "loss": 0.42186468839645386, "memory(GiB)": 78.26, "step": 693, "token_acc": 0.877427268729604, "train_speed(iter/s)": 0.032868 }, { "epoch": 0.13447657801676113, "grad_norm": 0.10865118354558945, "learning_rate": 0.00029420867001048867, "loss": 0.4467792510986328, "memory(GiB)": 78.26, "step": 694, "token_acc": 0.8724129751527012, "train_speed(iter/s)": 0.032871 }, { "epoch": 0.13467034830208788, "grad_norm": 0.1067957952618599, "learning_rate": 0.0002941821862688414, "loss": 0.43282240629196167, "memory(GiB)": 78.26, "step": 695, "token_acc": 0.8747978812378032, "train_speed(iter/s)": 0.032874 }, { "epoch": 0.13486411858741462, "grad_norm": 0.11119311302900314, "learning_rate": 0.00029415564330769595, "loss": 0.4267749488353729, "memory(GiB)": 78.26, "step": 696, "token_acc": 0.8750176528738879, "train_speed(iter/s)": 0.032877 }, { "epoch": 0.13505788887274137, "grad_norm": 0.1177472174167633, "learning_rate": 0.00029412904113795417, "loss": 0.4727480709552765, "memory(GiB)": 78.26, "step": 697, "token_acc": 0.8640138408304499, "train_speed(iter/s)": 0.032881 }, { "epoch": 0.1352516591580681, "grad_norm": 0.1068907380104065, "learning_rate": 0.0002941023797705423, "loss": 0.4286806583404541, "memory(GiB)": 78.26, "step": 698, "token_acc": 0.8773055332798717, "train_speed(iter/s)": 0.032884 }, { "epoch": 0.13544542944339485, "grad_norm": 0.10560762882232666, "learning_rate": 0.00029407565921641093, "loss": 0.43469613790512085, "memory(GiB)": 78.26, "step": 699, "token_acc": 0.8766754544193146, "train_speed(iter/s)": 0.032888 }, { "epoch": 0.1356391997287216, "grad_norm": 0.11222469061613083, "learning_rate": 0.0002940488794865348, "loss": 0.49300503730773926, "memory(GiB)": 78.26, "step": 700, "token_acc": 0.8590341029442915, "train_speed(iter/s)": 0.032891 }, { "epoch": 0.13583297001404834, "grad_norm": 0.10281093418598175, "learning_rate": 0.0002940220405919131, "loss": 0.40541547536849976, "memory(GiB)": 78.26, "step": 701, "token_acc": 0.8827319887407821, "train_speed(iter/s)": 0.032894 }, { "epoch": 0.1360267402993751, "grad_norm": 0.11664767563343048, "learning_rate": 0.00029399514254356936, "loss": 0.47185465693473816, "memory(GiB)": 78.26, "step": 702, "token_acc": 0.8657896737890655, "train_speed(iter/s)": 0.032897 }, { "epoch": 0.13622051058470183, "grad_norm": 0.11212712526321411, "learning_rate": 0.0002939681853525512, "loss": 0.44248533248901367, "memory(GiB)": 78.26, "step": 703, "token_acc": 0.8741593492114769, "train_speed(iter/s)": 0.032901 }, { "epoch": 0.13641428087002858, "grad_norm": 0.10552475601434708, "learning_rate": 0.0002939411690299308, "loss": 0.41650229692459106, "memory(GiB)": 78.26, "step": 704, "token_acc": 0.8810408921933085, "train_speed(iter/s)": 0.032904 }, { "epoch": 0.13660805115535532, "grad_norm": 0.11409718543291092, "learning_rate": 0.0002939140935868044, "loss": 0.42940688133239746, "memory(GiB)": 78.26, "step": 705, "token_acc": 0.8761153646882736, "train_speed(iter/s)": 0.032908 }, { "epoch": 0.13680182144068206, "grad_norm": 0.1144266277551651, "learning_rate": 0.0002938869590342927, "loss": 0.4688735604286194, "memory(GiB)": 78.26, "step": 706, "token_acc": 0.8650337555096803, "train_speed(iter/s)": 0.032911 }, { "epoch": 0.1369955917260088, "grad_norm": 0.11157376319169998, "learning_rate": 0.0002938597653835405, "loss": 0.45128777623176575, "memory(GiB)": 78.26, "step": 707, "token_acc": 0.8709400486478752, "train_speed(iter/s)": 0.032913 }, { "epoch": 0.13718936201133555, "grad_norm": 0.10916193574666977, "learning_rate": 0.000293832512645717, "loss": 0.43759334087371826, "memory(GiB)": 78.26, "step": 708, "token_acc": 0.8740179495065283, "train_speed(iter/s)": 0.032916 }, { "epoch": 0.1373831322966623, "grad_norm": 0.12670759856700897, "learning_rate": 0.00029380520083201563, "loss": 0.49803271889686584, "memory(GiB)": 78.26, "step": 709, "token_acc": 0.8560126128708614, "train_speed(iter/s)": 0.03292 }, { "epoch": 0.13757690258198904, "grad_norm": 0.11388320475816727, "learning_rate": 0.00029377782995365404, "loss": 0.43962639570236206, "memory(GiB)": 78.26, "step": 710, "token_acc": 0.8743197278911564, "train_speed(iter/s)": 0.032923 }, { "epoch": 0.13777067286731579, "grad_norm": 0.116911880671978, "learning_rate": 0.0002937504000218743, "loss": 0.46620821952819824, "memory(GiB)": 78.26, "step": 711, "token_acc": 0.8658806794868288, "train_speed(iter/s)": 0.032926 }, { "epoch": 0.13796444315264253, "grad_norm": 0.10047486424446106, "learning_rate": 0.0002937229110479425, "loss": 0.411388635635376, "memory(GiB)": 78.26, "step": 712, "token_acc": 0.8812828160820372, "train_speed(iter/s)": 0.032929 }, { "epoch": 0.13815821343796927, "grad_norm": 0.10067104548215866, "learning_rate": 0.00029369536304314916, "loss": 0.4194115698337555, "memory(GiB)": 78.26, "step": 713, "token_acc": 0.8785104275426338, "train_speed(iter/s)": 0.032932 }, { "epoch": 0.13835198372329605, "grad_norm": 0.10838618874549866, "learning_rate": 0.0002936677560188089, "loss": 0.46899646520614624, "memory(GiB)": 78.26, "step": 714, "token_acc": 0.8667539476848265, "train_speed(iter/s)": 0.032935 }, { "epoch": 0.1385457540086228, "grad_norm": 0.12074261903762817, "learning_rate": 0.00029364008998626086, "loss": 0.41880735754966736, "memory(GiB)": 78.26, "step": 715, "token_acc": 0.8823898581649303, "train_speed(iter/s)": 0.032938 }, { "epoch": 0.13873952429394953, "grad_norm": 0.11350797116756439, "learning_rate": 0.00029361236495686806, "loss": 0.4601813554763794, "memory(GiB)": 78.26, "step": 716, "token_acc": 0.8699107434095424, "train_speed(iter/s)": 0.032942 }, { "epoch": 0.13893329457927628, "grad_norm": 0.11661586165428162, "learning_rate": 0.0002935845809420179, "loss": 0.4675025939941406, "memory(GiB)": 78.26, "step": 717, "token_acc": 0.8649476228847703, "train_speed(iter/s)": 0.032945 }, { "epoch": 0.13912706486460302, "grad_norm": 0.11661852151155472, "learning_rate": 0.0002935567379531222, "loss": 0.47289931774139404, "memory(GiB)": 78.26, "step": 718, "token_acc": 0.8682559598494354, "train_speed(iter/s)": 0.032949 }, { "epoch": 0.13932083514992977, "grad_norm": 0.11107414215803146, "learning_rate": 0.0002935288360016166, "loss": 0.4379945397377014, "memory(GiB)": 78.26, "step": 719, "token_acc": 0.8734506763257683, "train_speed(iter/s)": 0.032952 }, { "epoch": 0.1395146054352565, "grad_norm": 0.1156369224190712, "learning_rate": 0.00029350087509896137, "loss": 0.4373580515384674, "memory(GiB)": 78.26, "step": 720, "token_acc": 0.8751508606999936, "train_speed(iter/s)": 0.032954 }, { "epoch": 0.13970837572058326, "grad_norm": 0.11756598204374313, "learning_rate": 0.00029347285525664065, "loss": 0.48482561111450195, "memory(GiB)": 78.26, "step": 721, "token_acc": 0.8630889444773964, "train_speed(iter/s)": 0.032958 }, { "epoch": 0.13990214600591, "grad_norm": 0.10152934491634369, "learning_rate": 0.00029344477648616304, "loss": 0.41498956084251404, "memory(GiB)": 78.26, "step": 722, "token_acc": 0.881353398445286, "train_speed(iter/s)": 0.032961 }, { "epoch": 0.14009591629123674, "grad_norm": 0.11098542809486389, "learning_rate": 0.0002934166387990612, "loss": 0.4146006107330322, "memory(GiB)": 78.26, "step": 723, "token_acc": 0.8806968679173152, "train_speed(iter/s)": 0.032963 }, { "epoch": 0.1402896865765635, "grad_norm": 0.11475761979818344, "learning_rate": 0.00029338844220689204, "loss": 0.4258284270763397, "memory(GiB)": 78.26, "step": 724, "token_acc": 0.8758714403875694, "train_speed(iter/s)": 0.032967 }, { "epoch": 0.14048345686189023, "grad_norm": 0.10699902474880219, "learning_rate": 0.0002933601867212367, "loss": 0.4143221974372864, "memory(GiB)": 78.26, "step": 725, "token_acc": 0.8792861274064282, "train_speed(iter/s)": 0.03297 }, { "epoch": 0.14067722714721698, "grad_norm": 0.11193379759788513, "learning_rate": 0.0002933318723537004, "loss": 0.4646386504173279, "memory(GiB)": 78.26, "step": 726, "token_acc": 0.8705041600375577, "train_speed(iter/s)": 0.032973 }, { "epoch": 0.14087099743254372, "grad_norm": 0.10847294330596924, "learning_rate": 0.0002933034991159127, "loss": 0.431307852268219, "memory(GiB)": 78.26, "step": 727, "token_acc": 0.8776439295122638, "train_speed(iter/s)": 0.032976 }, { "epoch": 0.14106476771787047, "grad_norm": 0.11553511023521423, "learning_rate": 0.0002932750670195272, "loss": 0.47172775864601135, "memory(GiB)": 78.26, "step": 728, "token_acc": 0.8666254148619003, "train_speed(iter/s)": 0.032979 }, { "epoch": 0.1412585380031972, "grad_norm": 0.12142791599035263, "learning_rate": 0.0002932465760762217, "loss": 0.4737962484359741, "memory(GiB)": 78.26, "step": 729, "token_acc": 0.8672086720867209, "train_speed(iter/s)": 0.032982 }, { "epoch": 0.14145230828852395, "grad_norm": 0.11232297122478485, "learning_rate": 0.0002932180262976982, "loss": 0.4628666341304779, "memory(GiB)": 78.26, "step": 730, "token_acc": 0.8675850891410049, "train_speed(iter/s)": 0.032985 }, { "epoch": 0.1416460785738507, "grad_norm": 0.11426133662462234, "learning_rate": 0.0002931894176956829, "loss": 0.48010843992233276, "memory(GiB)": 78.26, "step": 731, "token_acc": 0.8621309527436702, "train_speed(iter/s)": 0.032988 }, { "epoch": 0.14183984885917744, "grad_norm": 0.10444355756044388, "learning_rate": 0.0002931607502819261, "loss": 0.4301010072231293, "memory(GiB)": 78.26, "step": 732, "token_acc": 0.8748947870489872, "train_speed(iter/s)": 0.032991 }, { "epoch": 0.1420336191445042, "grad_norm": 0.10665851086378098, "learning_rate": 0.0002931320240682023, "loss": 0.4206262230873108, "memory(GiB)": 78.26, "step": 733, "token_acc": 0.876729803932444, "train_speed(iter/s)": 0.032994 }, { "epoch": 0.14222738942983093, "grad_norm": 0.10180269926786423, "learning_rate": 0.00029310323906631006, "loss": 0.41888511180877686, "memory(GiB)": 78.26, "step": 734, "token_acc": 0.8802506609223538, "train_speed(iter/s)": 0.032997 }, { "epoch": 0.14242115971515767, "grad_norm": 0.109284408390522, "learning_rate": 0.00029307439528807223, "loss": 0.45739686489105225, "memory(GiB)": 78.26, "step": 735, "token_acc": 0.8659565024411895, "train_speed(iter/s)": 0.033 }, { "epoch": 0.14261493000048442, "grad_norm": 0.11013077199459076, "learning_rate": 0.0002930454927453357, "loss": 0.45072418451309204, "memory(GiB)": 78.26, "step": 736, "token_acc": 0.8717199543645825, "train_speed(iter/s)": 0.033003 }, { "epoch": 0.14280870028581116, "grad_norm": 0.11075732111930847, "learning_rate": 0.00029301653144997154, "loss": 0.45079749822616577, "memory(GiB)": 78.26, "step": 737, "token_acc": 0.8688188438181866, "train_speed(iter/s)": 0.033006 }, { "epoch": 0.1430024705711379, "grad_norm": 0.12135940045118332, "learning_rate": 0.0002929875114138749, "loss": 0.4898369312286377, "memory(GiB)": 78.26, "step": 738, "token_acc": 0.8604469273743017, "train_speed(iter/s)": 0.033009 }, { "epoch": 0.14319624085646465, "grad_norm": 0.10148437321186066, "learning_rate": 0.00029295843264896506, "loss": 0.37936070561408997, "memory(GiB)": 78.26, "step": 739, "token_acc": 0.890900732946855, "train_speed(iter/s)": 0.033011 }, { "epoch": 0.1433900111417914, "grad_norm": 0.10243412852287292, "learning_rate": 0.0002929292951671855, "loss": 0.42304426431655884, "memory(GiB)": 78.26, "step": 740, "token_acc": 0.878807962695576, "train_speed(iter/s)": 0.033014 }, { "epoch": 0.14358378142711814, "grad_norm": 0.10791970789432526, "learning_rate": 0.0002929000989805038, "loss": 0.42843425273895264, "memory(GiB)": 78.26, "step": 741, "token_acc": 0.8764322681978526, "train_speed(iter/s)": 0.033017 }, { "epoch": 0.14377755171244488, "grad_norm": 0.11472160369157791, "learning_rate": 0.00029287084410091154, "loss": 0.4652520716190338, "memory(GiB)": 78.26, "step": 742, "token_acc": 0.8662527909382999, "train_speed(iter/s)": 0.03302 }, { "epoch": 0.14397132199777163, "grad_norm": 0.12771129608154297, "learning_rate": 0.00029284153054042454, "loss": 0.5056222081184387, "memory(GiB)": 78.26, "step": 743, "token_acc": 0.8580256694660118, "train_speed(iter/s)": 0.033023 }, { "epoch": 0.1441650922830984, "grad_norm": 0.10892749577760696, "learning_rate": 0.0002928121583110826, "loss": 0.4871826469898224, "memory(GiB)": 78.26, "step": 744, "token_acc": 0.8604575074297244, "train_speed(iter/s)": 0.033026 }, { "epoch": 0.14435886256842514, "grad_norm": 0.10514423251152039, "learning_rate": 0.0002927827274249498, "loss": 0.4291183352470398, "memory(GiB)": 78.26, "step": 745, "token_acc": 0.8779703110007337, "train_speed(iter/s)": 0.033029 }, { "epoch": 0.1445526328537519, "grad_norm": 0.11185097694396973, "learning_rate": 0.00029275323789411406, "loss": 0.4657382667064667, "memory(GiB)": 78.26, "step": 746, "token_acc": 0.8690815142887272, "train_speed(iter/s)": 0.033031 }, { "epoch": 0.14474640313907863, "grad_norm": 0.1169874519109726, "learning_rate": 0.00029272368973068765, "loss": 0.4796285331249237, "memory(GiB)": 78.26, "step": 747, "token_acc": 0.8630539484445522, "train_speed(iter/s)": 0.033034 }, { "epoch": 0.14494017342440538, "grad_norm": 0.10806058347225189, "learning_rate": 0.0002926940829468067, "loss": 0.44982171058654785, "memory(GiB)": 78.26, "step": 748, "token_acc": 0.8720634405109884, "train_speed(iter/s)": 0.033037 }, { "epoch": 0.14513394370973212, "grad_norm": 0.10634516179561615, "learning_rate": 0.00029266441755463154, "loss": 0.4241579473018646, "memory(GiB)": 78.26, "step": 749, "token_acc": 0.8774991483898016, "train_speed(iter/s)": 0.03304 }, { "epoch": 0.14532771399505887, "grad_norm": 0.10630025714635849, "learning_rate": 0.00029263469356634656, "loss": 0.45012885332107544, "memory(GiB)": 78.26, "step": 750, "token_acc": 0.8703159258521028, "train_speed(iter/s)": 0.033043 }, { "epoch": 0.1455214842803856, "grad_norm": 0.11359134316444397, "learning_rate": 0.0002926049109941602, "loss": 0.47736871242523193, "memory(GiB)": 78.26, "step": 751, "token_acc": 0.8675646281268301, "train_speed(iter/s)": 0.033046 }, { "epoch": 0.14571525456571235, "grad_norm": 0.10710503160953522, "learning_rate": 0.00029257506985030495, "loss": 0.4275802671909332, "memory(GiB)": 78.26, "step": 752, "token_acc": 0.8778828604612576, "train_speed(iter/s)": 0.033048 }, { "epoch": 0.1459090248510391, "grad_norm": 0.10511702299118042, "learning_rate": 0.00029254517014703737, "loss": 0.41856449842453003, "memory(GiB)": 78.26, "step": 753, "token_acc": 0.8796367161589878, "train_speed(iter/s)": 0.033051 }, { "epoch": 0.14610279513636584, "grad_norm": 0.10652240365743637, "learning_rate": 0.000292515211896638, "loss": 0.4258817732334137, "memory(GiB)": 78.26, "step": 754, "token_acc": 0.8783486686751893, "train_speed(iter/s)": 0.033054 }, { "epoch": 0.1462965654216926, "grad_norm": 0.10977134853601456, "learning_rate": 0.00029248519511141166, "loss": 0.45545950531959534, "memory(GiB)": 78.26, "step": 755, "token_acc": 0.8708563737967758, "train_speed(iter/s)": 0.033057 }, { "epoch": 0.14649033570701933, "grad_norm": 0.10239318013191223, "learning_rate": 0.00029245511980368694, "loss": 0.40798917412757874, "memory(GiB)": 78.26, "step": 756, "token_acc": 0.8818212167124625, "train_speed(iter/s)": 0.033059 }, { "epoch": 0.14668410599234608, "grad_norm": 0.10805483907461166, "learning_rate": 0.0002924249859858166, "loss": 0.44618624448776245, "memory(GiB)": 78.26, "step": 757, "token_acc": 0.873049779628478, "train_speed(iter/s)": 0.033062 }, { "epoch": 0.14687787627767282, "grad_norm": 0.09752228856086731, "learning_rate": 0.0002923947936701774, "loss": 0.4019618034362793, "memory(GiB)": 78.26, "step": 758, "token_acc": 0.8834425762325449, "train_speed(iter/s)": 0.033065 }, { "epoch": 0.14707164656299956, "grad_norm": 0.10851308703422546, "learning_rate": 0.00029236454286917017, "loss": 0.4336164891719818, "memory(GiB)": 78.26, "step": 759, "token_acc": 0.8760058721183123, "train_speed(iter/s)": 0.033067 }, { "epoch": 0.1472654168483263, "grad_norm": 0.10959117859601974, "learning_rate": 0.00029233423359521966, "loss": 0.4287548363208771, "memory(GiB)": 78.26, "step": 760, "token_acc": 0.874751174456565, "train_speed(iter/s)": 0.033069 }, { "epoch": 0.14745918713365305, "grad_norm": 0.11091278493404388, "learning_rate": 0.0002923038658607748, "loss": 0.44881102442741394, "memory(GiB)": 78.26, "step": 761, "token_acc": 0.8713587887777806, "train_speed(iter/s)": 0.033072 }, { "epoch": 0.1476529574189798, "grad_norm": 0.11895415931940079, "learning_rate": 0.0002922734396783083, "loss": 0.49504998326301575, "memory(GiB)": 78.26, "step": 762, "token_acc": 0.8594216543375924, "train_speed(iter/s)": 0.033075 }, { "epoch": 0.14784672770430654, "grad_norm": 0.09505145251750946, "learning_rate": 0.00029224295506031714, "loss": 0.38731294870376587, "memory(GiB)": 78.26, "step": 763, "token_acc": 0.8887887484287171, "train_speed(iter/s)": 0.033077 }, { "epoch": 0.14804049798963329, "grad_norm": 0.10213866084814072, "learning_rate": 0.0002922124120193221, "loss": 0.4288552403450012, "memory(GiB)": 78.26, "step": 764, "token_acc": 0.8762533946104032, "train_speed(iter/s)": 0.033079 }, { "epoch": 0.14823426827496003, "grad_norm": 0.11133123189210892, "learning_rate": 0.00029218181056786806, "loss": 0.46136656403541565, "memory(GiB)": 78.26, "step": 765, "token_acc": 0.8676832940918797, "train_speed(iter/s)": 0.033082 }, { "epoch": 0.14842803856028677, "grad_norm": 0.1136741042137146, "learning_rate": 0.00029215115071852386, "loss": 0.44833481311798096, "memory(GiB)": 78.26, "step": 766, "token_acc": 0.8728677891250345, "train_speed(iter/s)": 0.033085 }, { "epoch": 0.14862180884561352, "grad_norm": 0.1057792380452156, "learning_rate": 0.0002921204324838823, "loss": 0.4067574441432953, "memory(GiB)": 78.26, "step": 767, "token_acc": 0.883631027963543, "train_speed(iter/s)": 0.033088 }, { "epoch": 0.14881557913094026, "grad_norm": 0.12142736464738846, "learning_rate": 0.0002920896558765602, "loss": 0.4738287031650543, "memory(GiB)": 78.26, "step": 768, "token_acc": 0.8667360749609578, "train_speed(iter/s)": 0.033091 }, { "epoch": 0.149009349416267, "grad_norm": 0.12040547281503677, "learning_rate": 0.0002920588209091983, "loss": 0.43065720796585083, "memory(GiB)": 78.26, "step": 769, "token_acc": 0.8761208605841793, "train_speed(iter/s)": 0.033094 }, { "epoch": 0.14920311970159375, "grad_norm": 0.09921874105930328, "learning_rate": 0.0002920279275944614, "loss": 0.43212834000587463, "memory(GiB)": 78.26, "step": 770, "token_acc": 0.8759197564892266, "train_speed(iter/s)": 0.033096 }, { "epoch": 0.1493968899869205, "grad_norm": 0.141917884349823, "learning_rate": 0.0002919969759450382, "loss": 0.4761424660682678, "memory(GiB)": 78.26, "step": 771, "token_acc": 0.8653158180933573, "train_speed(iter/s)": 0.033099 }, { "epoch": 0.14959066027224724, "grad_norm": 0.1117415577173233, "learning_rate": 0.0002919659659736414, "loss": 0.4318593144416809, "memory(GiB)": 78.26, "step": 772, "token_acc": 0.8758288466987815, "train_speed(iter/s)": 0.033101 }, { "epoch": 0.14978443055757398, "grad_norm": 0.16481183469295502, "learning_rate": 0.00029193489769300754, "loss": 0.4566386044025421, "memory(GiB)": 78.26, "step": 773, "token_acc": 0.8704999401985408, "train_speed(iter/s)": 0.033104 }, { "epoch": 0.14997820084290073, "grad_norm": 0.10933414846658707, "learning_rate": 0.0002919037711158973, "loss": 0.430907666683197, "memory(GiB)": 78.26, "step": 774, "token_acc": 0.8769761425697039, "train_speed(iter/s)": 0.033107 }, { "epoch": 0.1501719711282275, "grad_norm": 0.1175195574760437, "learning_rate": 0.00029187258625509513, "loss": 0.45036518573760986, "memory(GiB)": 78.26, "step": 775, "token_acc": 0.869928163775235, "train_speed(iter/s)": 0.03311 }, { "epoch": 0.15036574141355424, "grad_norm": 0.1023663803935051, "learning_rate": 0.0002918413431234096, "loss": 0.39650124311447144, "memory(GiB)": 78.26, "step": 776, "token_acc": 0.8873017168666504, "train_speed(iter/s)": 0.033112 }, { "epoch": 0.150559511698881, "grad_norm": 0.12010087072849274, "learning_rate": 0.000291810041733673, "loss": 0.4465680420398712, "memory(GiB)": 78.26, "step": 777, "token_acc": 0.8728560775540641, "train_speed(iter/s)": 0.033115 }, { "epoch": 0.15075328198420773, "grad_norm": 0.10836753249168396, "learning_rate": 0.0002917786820987416, "loss": 0.43890365958213806, "memory(GiB)": 78.26, "step": 778, "token_acc": 0.8746218049034951, "train_speed(iter/s)": 0.033118 }, { "epoch": 0.15094705226953448, "grad_norm": 0.11870795488357544, "learning_rate": 0.0002917472642314958, "loss": 0.46871399879455566, "memory(GiB)": 78.26, "step": 779, "token_acc": 0.867821888084105, "train_speed(iter/s)": 0.033121 }, { "epoch": 0.15114082255486122, "grad_norm": 0.1149909496307373, "learning_rate": 0.00029171578814483966, "loss": 0.44199684262275696, "memory(GiB)": 78.26, "step": 780, "token_acc": 0.8727175418713008, "train_speed(iter/s)": 0.033124 }, { "epoch": 0.15133459284018796, "grad_norm": 0.1077098697423935, "learning_rate": 0.0002916842538517013, "loss": 0.4246280789375305, "memory(GiB)": 78.26, "step": 781, "token_acc": 0.8763708602062753, "train_speed(iter/s)": 0.033126 }, { "epoch": 0.1515283631255147, "grad_norm": 0.11270920932292938, "learning_rate": 0.0002916526613650326, "loss": 0.4364209771156311, "memory(GiB)": 78.26, "step": 782, "token_acc": 0.8724545624382823, "train_speed(iter/s)": 0.033129 }, { "epoch": 0.15172213341084145, "grad_norm": 0.10723229497671127, "learning_rate": 0.0002916210106978096, "loss": 0.47758546471595764, "memory(GiB)": 78.26, "step": 783, "token_acc": 0.8667694833461854, "train_speed(iter/s)": 0.033132 }, { "epoch": 0.1519159036961682, "grad_norm": 0.12354382872581482, "learning_rate": 0.000291589301863032, "loss": 0.4946221709251404, "memory(GiB)": 78.26, "step": 784, "token_acc": 0.8581249637407902, "train_speed(iter/s)": 0.033134 }, { "epoch": 0.15210967398149494, "grad_norm": 0.11447127163410187, "learning_rate": 0.00029155753487372345, "loss": 0.44242554903030396, "memory(GiB)": 78.26, "step": 785, "token_acc": 0.8728641926006219, "train_speed(iter/s)": 0.033137 }, { "epoch": 0.15230344426682169, "grad_norm": 0.10308409482240677, "learning_rate": 0.0002915257097429315, "loss": 0.4122265577316284, "memory(GiB)": 78.26, "step": 786, "token_acc": 0.881477327759794, "train_speed(iter/s)": 0.033139 }, { "epoch": 0.15249721455214843, "grad_norm": 0.10119541734457016, "learning_rate": 0.00029149382648372763, "loss": 0.3953108787536621, "memory(GiB)": 78.26, "step": 787, "token_acc": 0.8823500826124988, "train_speed(iter/s)": 0.033141 }, { "epoch": 0.15269098483747517, "grad_norm": 0.10768993198871613, "learning_rate": 0.0002914618851092072, "loss": 0.4354395270347595, "memory(GiB)": 78.26, "step": 788, "token_acc": 0.8756917373170133, "train_speed(iter/s)": 0.033144 }, { "epoch": 0.15288475512280192, "grad_norm": 0.10865043103694916, "learning_rate": 0.0002914298856324893, "loss": 0.40121009945869446, "memory(GiB)": 78.26, "step": 789, "token_acc": 0.886211232187762, "train_speed(iter/s)": 0.033147 }, { "epoch": 0.15307852540812866, "grad_norm": 0.1131611317396164, "learning_rate": 0.00029139782806671696, "loss": 0.41886383295059204, "memory(GiB)": 78.26, "step": 790, "token_acc": 0.8795520757465404, "train_speed(iter/s)": 0.03315 }, { "epoch": 0.1532722956934554, "grad_norm": 0.11826243251562119, "learning_rate": 0.0002913657124250571, "loss": 0.4526827037334442, "memory(GiB)": 78.26, "step": 791, "token_acc": 0.8706143245597825, "train_speed(iter/s)": 0.033152 }, { "epoch": 0.15346606597878215, "grad_norm": 0.12152021378278732, "learning_rate": 0.0002913335387207006, "loss": 0.4480191767215729, "memory(GiB)": 78.26, "step": 792, "token_acc": 0.8714323428470857, "train_speed(iter/s)": 0.033155 }, { "epoch": 0.1536598362641089, "grad_norm": 0.10104276239871979, "learning_rate": 0.0002913013069668619, "loss": 0.41667938232421875, "memory(GiB)": 78.26, "step": 793, "token_acc": 0.880495535822304, "train_speed(iter/s)": 0.033158 }, { "epoch": 0.15385360654943564, "grad_norm": 0.1120065450668335, "learning_rate": 0.0002912690171767795, "loss": 0.444894015789032, "memory(GiB)": 78.26, "step": 794, "token_acc": 0.8719184281987115, "train_speed(iter/s)": 0.033161 }, { "epoch": 0.15404737683476238, "grad_norm": 0.10677407681941986, "learning_rate": 0.00029123666936371577, "loss": 0.4287039041519165, "memory(GiB)": 78.26, "step": 795, "token_acc": 0.8766376159685128, "train_speed(iter/s)": 0.033163 }, { "epoch": 0.15424114712008913, "grad_norm": 0.10139796137809753, "learning_rate": 0.0002912042635409568, "loss": 0.3969685435295105, "memory(GiB)": 78.26, "step": 796, "token_acc": 0.8835710646968925, "train_speed(iter/s)": 0.033166 }, { "epoch": 0.15443491740541587, "grad_norm": 0.11726026982069016, "learning_rate": 0.0002911717997218123, "loss": 0.45599818229675293, "memory(GiB)": 78.26, "step": 797, "token_acc": 0.8708311582854823, "train_speed(iter/s)": 0.033169 }, { "epoch": 0.15462868769074262, "grad_norm": 0.10516592860221863, "learning_rate": 0.0002911392779196164, "loss": 0.39878836274147034, "memory(GiB)": 78.26, "step": 798, "token_acc": 0.8827302498037836, "train_speed(iter/s)": 0.033171 }, { "epoch": 0.15482245797606936, "grad_norm": 0.1172914132475853, "learning_rate": 0.00029110669814772644, "loss": 0.43762531876564026, "memory(GiB)": 78.26, "step": 799, "token_acc": 0.8754726710209694, "train_speed(iter/s)": 0.033174 }, { "epoch": 0.1550162282613961, "grad_norm": 0.1156349629163742, "learning_rate": 0.0002910740604195238, "loss": 0.46107926964759827, "memory(GiB)": 78.26, "step": 800, "token_acc": 0.869101935557004, "train_speed(iter/s)": 0.033176 }, { "epoch": 0.15520999854672285, "grad_norm": 0.1091204434633255, "learning_rate": 0.00029104136474841384, "loss": 0.4546446204185486, "memory(GiB)": 78.26, "step": 801, "token_acc": 0.8690357722725606, "train_speed(iter/s)": 0.033161 }, { "epoch": 0.1554037688320496, "grad_norm": 0.10520661622285843, "learning_rate": 0.00029100861114782537, "loss": 0.391659677028656, "memory(GiB)": 78.26, "step": 802, "token_acc": 0.883874415497662, "train_speed(iter/s)": 0.033164 }, { "epoch": 0.15559753911737634, "grad_norm": 0.10649837553501129, "learning_rate": 0.0002909757996312113, "loss": 0.4287815988063812, "memory(GiB)": 78.26, "step": 803, "token_acc": 0.8760036358127556, "train_speed(iter/s)": 0.033167 }, { "epoch": 0.15579130940270308, "grad_norm": 0.11375483870506287, "learning_rate": 0.00029094293021204816, "loss": 0.43256044387817383, "memory(GiB)": 78.26, "step": 804, "token_acc": 0.8724521560241453, "train_speed(iter/s)": 0.033169 }, { "epoch": 0.15598507968802985, "grad_norm": 0.1098484992980957, "learning_rate": 0.00029091000290383626, "loss": 0.44678372144699097, "memory(GiB)": 78.26, "step": 805, "token_acc": 0.8710053046965972, "train_speed(iter/s)": 0.033171 }, { "epoch": 0.1561788499733566, "grad_norm": 0.12109239399433136, "learning_rate": 0.0002908770177200998, "loss": 0.43375393748283386, "memory(GiB)": 78.26, "step": 806, "token_acc": 0.8734656356459201, "train_speed(iter/s)": 0.033174 }, { "epoch": 0.15637262025868334, "grad_norm": 0.1153409481048584, "learning_rate": 0.00029084397467438666, "loss": 0.4369393587112427, "memory(GiB)": 78.26, "step": 807, "token_acc": 0.8766715060747307, "train_speed(iter/s)": 0.033176 }, { "epoch": 0.1565663905440101, "grad_norm": 0.11903059482574463, "learning_rate": 0.0002908108737802685, "loss": 0.4392112195491791, "memory(GiB)": 78.26, "step": 808, "token_acc": 0.8715450323167365, "train_speed(iter/s)": 0.033178 }, { "epoch": 0.15676016082933683, "grad_norm": 0.10243628919124603, "learning_rate": 0.0002907777150513407, "loss": 0.40879738330841064, "memory(GiB)": 78.26, "step": 809, "token_acc": 0.8799474375821288, "train_speed(iter/s)": 0.033181 }, { "epoch": 0.15695393111466358, "grad_norm": 0.1134597584605217, "learning_rate": 0.0002907444985012225, "loss": 0.4965320825576782, "memory(GiB)": 78.26, "step": 810, "token_acc": 0.8585183351214547, "train_speed(iter/s)": 0.033183 }, { "epoch": 0.15714770139999032, "grad_norm": 0.11033570021390915, "learning_rate": 0.0002907112241435568, "loss": 0.46451640129089355, "memory(GiB)": 78.26, "step": 811, "token_acc": 0.8680681696762474, "train_speed(iter/s)": 0.033186 }, { "epoch": 0.15734147168531706, "grad_norm": 0.10630565881729126, "learning_rate": 0.0002906778919920103, "loss": 0.4418585002422333, "memory(GiB)": 78.26, "step": 812, "token_acc": 0.8757123405113723, "train_speed(iter/s)": 0.033188 }, { "epoch": 0.1575352419706438, "grad_norm": 0.10460490733385086, "learning_rate": 0.0002906445020602734, "loss": 0.4312971532344818, "memory(GiB)": 78.26, "step": 813, "token_acc": 0.8766326100695175, "train_speed(iter/s)": 0.03319 }, { "epoch": 0.15772901225597055, "grad_norm": 0.09658509492874146, "learning_rate": 0.0002906110543620603, "loss": 0.3654577136039734, "memory(GiB)": 78.26, "step": 814, "token_acc": 0.894104601737202, "train_speed(iter/s)": 0.033193 }, { "epoch": 0.1579227825412973, "grad_norm": 0.11015032976865768, "learning_rate": 0.0002905775489111087, "loss": 0.4334821403026581, "memory(GiB)": 78.26, "step": 815, "token_acc": 0.8754580788450861, "train_speed(iter/s)": 0.033195 }, { "epoch": 0.15811655282662404, "grad_norm": 0.10890354961156845, "learning_rate": 0.0002905439857211804, "loss": 0.4261341392993927, "memory(GiB)": 78.26, "step": 816, "token_acc": 0.8757039216681041, "train_speed(iter/s)": 0.033197 }, { "epoch": 0.15831032311195078, "grad_norm": 0.10780946165323257, "learning_rate": 0.00029051036480606053, "loss": 0.41828641295433044, "memory(GiB)": 78.26, "step": 817, "token_acc": 0.8813302092763193, "train_speed(iter/s)": 0.0332 }, { "epoch": 0.15850409339727753, "grad_norm": 0.11241378635168076, "learning_rate": 0.0002904766861795582, "loss": 0.4504837095737457, "memory(GiB)": 78.26, "step": 818, "token_acc": 0.8700797107404059, "train_speed(iter/s)": 0.033202 }, { "epoch": 0.15869786368260427, "grad_norm": 0.10670538991689682, "learning_rate": 0.00029044294985550607, "loss": 0.4240032434463501, "memory(GiB)": 78.26, "step": 819, "token_acc": 0.8797002130335464, "train_speed(iter/s)": 0.033205 }, { "epoch": 0.15889163396793102, "grad_norm": 0.09907601028680801, "learning_rate": 0.00029040915584776063, "loss": 0.4048020541667938, "memory(GiB)": 78.26, "step": 820, "token_acc": 0.8819687307138449, "train_speed(iter/s)": 0.033207 }, { "epoch": 0.15908540425325776, "grad_norm": 0.10909497737884521, "learning_rate": 0.00029037530417020194, "loss": 0.4121737480163574, "memory(GiB)": 78.26, "step": 821, "token_acc": 0.882557633753806, "train_speed(iter/s)": 0.03321 }, { "epoch": 0.1592791745385845, "grad_norm": 0.12102677673101425, "learning_rate": 0.00029034139483673373, "loss": 0.4818951487541199, "memory(GiB)": 78.26, "step": 822, "token_acc": 0.8623975497008026, "train_speed(iter/s)": 0.033212 }, { "epoch": 0.15947294482391125, "grad_norm": 0.11021539568901062, "learning_rate": 0.00029030742786128363, "loss": 0.42791998386383057, "memory(GiB)": 78.26, "step": 823, "token_acc": 0.8786863654583977, "train_speed(iter/s)": 0.033214 }, { "epoch": 0.159666715109238, "grad_norm": 0.12519389390945435, "learning_rate": 0.0002902734032578027, "loss": 0.4770694077014923, "memory(GiB)": 78.26, "step": 824, "token_acc": 0.8628657500881326, "train_speed(iter/s)": 0.033217 }, { "epoch": 0.15986048539456474, "grad_norm": 0.10490487515926361, "learning_rate": 0.0002902393210402657, "loss": 0.3933722674846649, "memory(GiB)": 78.26, "step": 825, "token_acc": 0.8858756744604317, "train_speed(iter/s)": 0.033219 }, { "epoch": 0.16005425567989148, "grad_norm": 0.10252831131219864, "learning_rate": 0.0002902051812226712, "loss": 0.3960869014263153, "memory(GiB)": 78.26, "step": 826, "token_acc": 0.8850197639086826, "train_speed(iter/s)": 0.033221 }, { "epoch": 0.16024802596521823, "grad_norm": 0.11041463166475296, "learning_rate": 0.0002901709838190413, "loss": 0.4398559033870697, "memory(GiB)": 78.26, "step": 827, "token_acc": 0.8748830196526983, "train_speed(iter/s)": 0.033224 }, { "epoch": 0.16044179625054497, "grad_norm": 0.11490801721811295, "learning_rate": 0.00029013672884342184, "loss": 0.47440311312675476, "memory(GiB)": 78.26, "step": 828, "token_acc": 0.865016464809957, "train_speed(iter/s)": 0.033226 }, { "epoch": 0.16063556653587172, "grad_norm": 0.11218751221895218, "learning_rate": 0.00029010241630988217, "loss": 0.43563181161880493, "memory(GiB)": 78.26, "step": 829, "token_acc": 0.8744653041825095, "train_speed(iter/s)": 0.033229 }, { "epoch": 0.16082933682119846, "grad_norm": 0.10691240429878235, "learning_rate": 0.00029006804623251547, "loss": 0.4449552297592163, "memory(GiB)": 78.26, "step": 830, "token_acc": 0.869553477182905, "train_speed(iter/s)": 0.033231 }, { "epoch": 0.1610231071065252, "grad_norm": 0.09648893773555756, "learning_rate": 0.00029003361862543834, "loss": 0.37794414162635803, "memory(GiB)": 78.26, "step": 831, "token_acc": 0.8909843389926163, "train_speed(iter/s)": 0.033233 }, { "epoch": 0.16121687739185195, "grad_norm": 0.10781273245811462, "learning_rate": 0.0002899991335027913, "loss": 0.4308411180973053, "memory(GiB)": 78.26, "step": 832, "token_acc": 0.877130367419212, "train_speed(iter/s)": 0.033235 }, { "epoch": 0.1614106476771787, "grad_norm": 0.10362358391284943, "learning_rate": 0.0002899645908787381, "loss": 0.40463730692863464, "memory(GiB)": 78.26, "step": 833, "token_acc": 0.8802580076079166, "train_speed(iter/s)": 0.033237 }, { "epoch": 0.16160441796250544, "grad_norm": 0.10180053114891052, "learning_rate": 0.0002899299907674665, "loss": 0.4222317337989807, "memory(GiB)": 78.26, "step": 834, "token_acc": 0.8798938964980744, "train_speed(iter/s)": 0.03324 }, { "epoch": 0.1617981882478322, "grad_norm": 0.1112997904419899, "learning_rate": 0.0002898953331831876, "loss": 0.45217838883399963, "memory(GiB)": 78.26, "step": 835, "token_acc": 0.8695664756612838, "train_speed(iter/s)": 0.033243 }, { "epoch": 0.16199195853315895, "grad_norm": 0.10161816328763962, "learning_rate": 0.0002898606181401362, "loss": 0.42238953709602356, "memory(GiB)": 78.26, "step": 836, "token_acc": 0.8775941837409121, "train_speed(iter/s)": 0.033245 }, { "epoch": 0.1621857288184857, "grad_norm": 0.11536618322134018, "learning_rate": 0.0002898258456525708, "loss": 0.4566255509853363, "memory(GiB)": 78.26, "step": 837, "token_acc": 0.8699845787186317, "train_speed(iter/s)": 0.033247 }, { "epoch": 0.16237949910381244, "grad_norm": 0.10816803574562073, "learning_rate": 0.0002897910157347733, "loss": 0.4683791697025299, "memory(GiB)": 78.26, "step": 838, "token_acc": 0.8670610211706102, "train_speed(iter/s)": 0.033249 }, { "epoch": 0.16257326938913919, "grad_norm": 0.11843861639499664, "learning_rate": 0.00028975612840104935, "loss": 0.45893388986587524, "memory(GiB)": 78.26, "step": 839, "token_acc": 0.8687269042794136, "train_speed(iter/s)": 0.033252 }, { "epoch": 0.16276703967446593, "grad_norm": 0.10663704574108124, "learning_rate": 0.000289721183665728, "loss": 0.4239065945148468, "memory(GiB)": 78.26, "step": 840, "token_acc": 0.8774621714892488, "train_speed(iter/s)": 0.033254 }, { "epoch": 0.16296080995979267, "grad_norm": 0.11534672975540161, "learning_rate": 0.00028968618154316206, "loss": 0.4733141362667084, "memory(GiB)": 78.26, "step": 841, "token_acc": 0.8653584989329564, "train_speed(iter/s)": 0.033256 }, { "epoch": 0.16315458024511942, "grad_norm": 0.10526616871356964, "learning_rate": 0.00028965112204772786, "loss": 0.4100850820541382, "memory(GiB)": 78.26, "step": 842, "token_acc": 0.8815631620593716, "train_speed(iter/s)": 0.033258 }, { "epoch": 0.16334835053044616, "grad_norm": 0.11487496644258499, "learning_rate": 0.00028961600519382527, "loss": 0.44554945826530457, "memory(GiB)": 78.26, "step": 843, "token_acc": 0.87236373596275, "train_speed(iter/s)": 0.033261 }, { "epoch": 0.1635421208157729, "grad_norm": 0.12236074358224869, "learning_rate": 0.00028958083099587774, "loss": 0.45087599754333496, "memory(GiB)": 78.26, "step": 844, "token_acc": 0.8709829274416269, "train_speed(iter/s)": 0.033263 }, { "epoch": 0.16373589110109965, "grad_norm": 0.11835192143917084, "learning_rate": 0.0002895455994683323, "loss": 0.4500479996204376, "memory(GiB)": 78.26, "step": 845, "token_acc": 0.8728866462050668, "train_speed(iter/s)": 0.033266 }, { "epoch": 0.1639296613864264, "grad_norm": 0.11468973010778427, "learning_rate": 0.0002895103106256593, "loss": 0.45714688301086426, "memory(GiB)": 78.26, "step": 846, "token_acc": 0.8693362313994315, "train_speed(iter/s)": 0.033268 }, { "epoch": 0.16412343167175314, "grad_norm": 0.11863450706005096, "learning_rate": 0.000289474964482353, "loss": 0.5020843148231506, "memory(GiB)": 78.26, "step": 847, "token_acc": 0.8581676315127108, "train_speed(iter/s)": 0.03327 }, { "epoch": 0.16431720195707988, "grad_norm": 0.09941709041595459, "learning_rate": 0.0002894395610529309, "loss": 0.3951787054538727, "memory(GiB)": 78.26, "step": 848, "token_acc": 0.8848054109751909, "train_speed(iter/s)": 0.033272 }, { "epoch": 0.16451097224240663, "grad_norm": 0.09850473701953888, "learning_rate": 0.0002894041003519343, "loss": 0.37680599093437195, "memory(GiB)": 78.26, "step": 849, "token_acc": 0.8903767059263413, "train_speed(iter/s)": 0.033274 }, { "epoch": 0.16470474252773337, "grad_norm": 0.10843576490879059, "learning_rate": 0.0002893685823939276, "loss": 0.4342295229434967, "memory(GiB)": 78.26, "step": 850, "token_acc": 0.8752759675488762, "train_speed(iter/s)": 0.033276 }, { "epoch": 0.16489851281306012, "grad_norm": 0.09915035963058472, "learning_rate": 0.00028933300719349923, "loss": 0.41352295875549316, "memory(GiB)": 78.26, "step": 851, "token_acc": 0.8786153965024339, "train_speed(iter/s)": 0.033279 }, { "epoch": 0.16509228309838686, "grad_norm": 0.11959869414567947, "learning_rate": 0.00028929737476526075, "loss": 0.506973147392273, "memory(GiB)": 78.26, "step": 852, "token_acc": 0.8568055984935781, "train_speed(iter/s)": 0.033282 }, { "epoch": 0.1652860533837136, "grad_norm": 0.10934106260538101, "learning_rate": 0.00028926168512384743, "loss": 0.43080487847328186, "memory(GiB)": 78.26, "step": 853, "token_acc": 0.875984682713348, "train_speed(iter/s)": 0.033284 }, { "epoch": 0.16547982366904035, "grad_norm": 0.10780926048755646, "learning_rate": 0.0002892259382839179, "loss": 0.4089958667755127, "memory(GiB)": 78.26, "step": 854, "token_acc": 0.8801118674476281, "train_speed(iter/s)": 0.033287 }, { "epoch": 0.1656735939543671, "grad_norm": 0.1189161166548729, "learning_rate": 0.0002891901342601543, "loss": 0.47321808338165283, "memory(GiB)": 78.26, "step": 855, "token_acc": 0.8673148412925364, "train_speed(iter/s)": 0.033289 }, { "epoch": 0.16586736423969384, "grad_norm": 0.11358017474412918, "learning_rate": 0.00028915427306726245, "loss": 0.4263777732849121, "memory(GiB)": 78.26, "step": 856, "token_acc": 0.8767594964603477, "train_speed(iter/s)": 0.033291 }, { "epoch": 0.16606113452502058, "grad_norm": 0.1009831428527832, "learning_rate": 0.00028911835471997143, "loss": 0.4132530987262726, "memory(GiB)": 78.26, "step": 857, "token_acc": 0.8810461753266181, "train_speed(iter/s)": 0.033293 }, { "epoch": 0.16625490481034733, "grad_norm": 0.10224108397960663, "learning_rate": 0.0002890823792330339, "loss": 0.4362914562225342, "memory(GiB)": 78.26, "step": 858, "token_acc": 0.8741152805026449, "train_speed(iter/s)": 0.033295 }, { "epoch": 0.16644867509567407, "grad_norm": 0.1064584031701088, "learning_rate": 0.00028904634662122586, "loss": 0.44220423698425293, "memory(GiB)": 78.26, "step": 859, "token_acc": 0.8728430508570941, "train_speed(iter/s)": 0.033297 }, { "epoch": 0.16664244538100081, "grad_norm": 0.1092744916677475, "learning_rate": 0.000289010256899347, "loss": 0.422356516122818, "memory(GiB)": 78.26, "step": 860, "token_acc": 0.8780035309255214, "train_speed(iter/s)": 0.033299 }, { "epoch": 0.16683621566632756, "grad_norm": 0.1130421906709671, "learning_rate": 0.0002889741100822202, "loss": 0.45889347791671753, "memory(GiB)": 78.26, "step": 861, "token_acc": 0.867090987216672, "train_speed(iter/s)": 0.033302 }, { "epoch": 0.1670299859516543, "grad_norm": 0.11241286247968674, "learning_rate": 0.00028893790618469213, "loss": 0.47700247168540955, "memory(GiB)": 78.26, "step": 862, "token_acc": 0.862, "train_speed(iter/s)": 0.033304 }, { "epoch": 0.16722375623698105, "grad_norm": 0.1090337261557579, "learning_rate": 0.0002889016452216325, "loss": 0.42925596237182617, "memory(GiB)": 78.26, "step": 863, "token_acc": 0.8770264194853622, "train_speed(iter/s)": 0.033306 }, { "epoch": 0.1674175265223078, "grad_norm": 0.12319561094045639, "learning_rate": 0.00028886532720793476, "loss": 0.4747333526611328, "memory(GiB)": 78.26, "step": 864, "token_acc": 0.8648867565967787, "train_speed(iter/s)": 0.033308 }, { "epoch": 0.16761129680763454, "grad_norm": 0.11913838982582092, "learning_rate": 0.0002888289521585157, "loss": 0.43152928352355957, "memory(GiB)": 78.26, "step": 865, "token_acc": 0.8771402469572392, "train_speed(iter/s)": 0.033311 }, { "epoch": 0.1678050670929613, "grad_norm": 0.12217225134372711, "learning_rate": 0.0002887925200883155, "loss": 0.468718022108078, "memory(GiB)": 78.26, "step": 866, "token_acc": 0.8652350105007839, "train_speed(iter/s)": 0.033313 }, { "epoch": 0.16799883737828805, "grad_norm": 0.10935720801353455, "learning_rate": 0.0002887560310122978, "loss": 0.427009254693985, "memory(GiB)": 78.26, "step": 867, "token_acc": 0.8757216164207825, "train_speed(iter/s)": 0.033315 }, { "epoch": 0.1681926076636148, "grad_norm": 0.11316697299480438, "learning_rate": 0.0002887194849454496, "loss": 0.4437088370323181, "memory(GiB)": 78.26, "step": 868, "token_acc": 0.873464339700426, "train_speed(iter/s)": 0.033317 }, { "epoch": 0.16838637794894154, "grad_norm": 0.11642614752054214, "learning_rate": 0.00028868288190278145, "loss": 0.4325766861438751, "memory(GiB)": 78.26, "step": 869, "token_acc": 0.8753681468056185, "train_speed(iter/s)": 0.033319 }, { "epoch": 0.16858014823426828, "grad_norm": 0.10680645704269409, "learning_rate": 0.00028864622189932713, "loss": 0.443330854177475, "memory(GiB)": 78.26, "step": 870, "token_acc": 0.8720809086981933, "train_speed(iter/s)": 0.033321 }, { "epoch": 0.16877391851959503, "grad_norm": 0.11195547878742218, "learning_rate": 0.00028860950495014393, "loss": 0.44975459575653076, "memory(GiB)": 78.26, "step": 871, "token_acc": 0.8717114605117083, "train_speed(iter/s)": 0.033323 }, { "epoch": 0.16896768880492177, "grad_norm": 0.10190173983573914, "learning_rate": 0.00028857273107031243, "loss": 0.4224821925163269, "memory(GiB)": 78.26, "step": 872, "token_acc": 0.8783074075926575, "train_speed(iter/s)": 0.033326 }, { "epoch": 0.16916145909024852, "grad_norm": 0.10820237547159195, "learning_rate": 0.0002885359002749367, "loss": 0.4363636076450348, "memory(GiB)": 78.26, "step": 873, "token_acc": 0.8747307109231629, "train_speed(iter/s)": 0.033328 }, { "epoch": 0.16935522937557526, "grad_norm": 0.10944371670484543, "learning_rate": 0.00028849901257914416, "loss": 0.4190293550491333, "memory(GiB)": 78.26, "step": 874, "token_acc": 0.87886427298192, "train_speed(iter/s)": 0.03333 }, { "epoch": 0.169548999660902, "grad_norm": 0.11039703339338303, "learning_rate": 0.0002884620679980855, "loss": 0.44972705841064453, "memory(GiB)": 78.26, "step": 875, "token_acc": 0.8701769399209686, "train_speed(iter/s)": 0.033332 }, { "epoch": 0.16974276994622875, "grad_norm": 0.10529112815856934, "learning_rate": 0.00028842506654693493, "loss": 0.45059582591056824, "memory(GiB)": 78.26, "step": 876, "token_acc": 0.8691080109913337, "train_speed(iter/s)": 0.033335 }, { "epoch": 0.1699365402315555, "grad_norm": 0.10080941766500473, "learning_rate": 0.00028838800824088984, "loss": 0.4126817286014557, "memory(GiB)": 78.26, "step": 877, "token_acc": 0.880942601909847, "train_speed(iter/s)": 0.033336 }, { "epoch": 0.17013031051688224, "grad_norm": 0.0992860496044159, "learning_rate": 0.00028835089309517116, "loss": 0.39974620938301086, "memory(GiB)": 78.26, "step": 878, "token_acc": 0.8827877507919747, "train_speed(iter/s)": 0.033338 }, { "epoch": 0.17032408080220898, "grad_norm": 0.11538361012935638, "learning_rate": 0.0002883137211250231, "loss": 0.4553724229335785, "memory(GiB)": 78.26, "step": 879, "token_acc": 0.8721838688599108, "train_speed(iter/s)": 0.03334 }, { "epoch": 0.17051785108753573, "grad_norm": 0.10386830568313599, "learning_rate": 0.0002882764923457131, "loss": 0.40255507826805115, "memory(GiB)": 78.26, "step": 880, "token_acc": 0.8842005576892724, "train_speed(iter/s)": 0.033343 }, { "epoch": 0.17071162137286247, "grad_norm": 0.10418614745140076, "learning_rate": 0.000288239206772532, "loss": 0.41422078013420105, "memory(GiB)": 78.26, "step": 881, "token_acc": 0.8813669826953899, "train_speed(iter/s)": 0.033345 }, { "epoch": 0.17090539165818922, "grad_norm": 0.10407593846321106, "learning_rate": 0.00028820186442079414, "loss": 0.40433794260025024, "memory(GiB)": 78.26, "step": 882, "token_acc": 0.8834683588753167, "train_speed(iter/s)": 0.033347 }, { "epoch": 0.17109916194351596, "grad_norm": 0.11592178791761398, "learning_rate": 0.0002881644653058369, "loss": 0.4496549963951111, "memory(GiB)": 78.26, "step": 883, "token_acc": 0.8705704799275581, "train_speed(iter/s)": 0.033349 }, { "epoch": 0.1712929322288427, "grad_norm": 0.10687348991632462, "learning_rate": 0.0002881270094430212, "loss": 0.3880995512008667, "memory(GiB)": 78.26, "step": 884, "token_acc": 0.889063373346464, "train_speed(iter/s)": 0.033351 }, { "epoch": 0.17148670251416945, "grad_norm": 0.11258858442306519, "learning_rate": 0.000288089496847731, "loss": 0.43804460763931274, "memory(GiB)": 78.26, "step": 885, "token_acc": 0.8751539547598306, "train_speed(iter/s)": 0.033353 }, { "epoch": 0.1716804727994962, "grad_norm": 0.10251414030790329, "learning_rate": 0.00028805192753537386, "loss": 0.39091038703918457, "memory(GiB)": 78.26, "step": 886, "token_acc": 0.884448902027027, "train_speed(iter/s)": 0.033355 }, { "epoch": 0.17187424308482294, "grad_norm": 0.10573960095643997, "learning_rate": 0.0002880143015213805, "loss": 0.43722304701805115, "memory(GiB)": 78.26, "step": 887, "token_acc": 0.8758301120031717, "train_speed(iter/s)": 0.033358 }, { "epoch": 0.17206801337014968, "grad_norm": 0.10792998969554901, "learning_rate": 0.00028797661882120495, "loss": 0.40696701407432556, "memory(GiB)": 78.26, "step": 888, "token_acc": 0.8834754878388297, "train_speed(iter/s)": 0.03336 }, { "epoch": 0.17226178365547642, "grad_norm": 0.10105016082525253, "learning_rate": 0.0002879388794503245, "loss": 0.3817410469055176, "memory(GiB)": 78.26, "step": 889, "token_acc": 0.8874983670803397, "train_speed(iter/s)": 0.033362 }, { "epoch": 0.17245555394080317, "grad_norm": 0.1093856692314148, "learning_rate": 0.0002879010834242396, "loss": 0.4195503294467926, "memory(GiB)": 78.26, "step": 890, "token_acc": 0.8788049002300327, "train_speed(iter/s)": 0.033364 }, { "epoch": 0.1726493242261299, "grad_norm": 0.11232632398605347, "learning_rate": 0.00028786323075847425, "loss": 0.45594799518585205, "memory(GiB)": 78.26, "step": 891, "token_acc": 0.8703043956619689, "train_speed(iter/s)": 0.033366 }, { "epoch": 0.17284309451145666, "grad_norm": 0.10968980938196182, "learning_rate": 0.00028782532146857546, "loss": 0.4293142557144165, "memory(GiB)": 78.26, "step": 892, "token_acc": 0.8755912321954648, "train_speed(iter/s)": 0.033368 }, { "epoch": 0.1730368647967834, "grad_norm": 0.12125832587480545, "learning_rate": 0.0002877873555701137, "loss": 0.49456584453582764, "memory(GiB)": 78.26, "step": 893, "token_acc": 0.8624862149246416, "train_speed(iter/s)": 0.03337 }, { "epoch": 0.17323063508211015, "grad_norm": 0.1106322631239891, "learning_rate": 0.00028774933307868243, "loss": 0.4586373269557953, "memory(GiB)": 78.26, "step": 894, "token_acc": 0.869065453654019, "train_speed(iter/s)": 0.033373 }, { "epoch": 0.1734244053674369, "grad_norm": 0.09941691905260086, "learning_rate": 0.00028771125400989863, "loss": 0.413199782371521, "memory(GiB)": 78.26, "step": 895, "token_acc": 0.8804068187829097, "train_speed(iter/s)": 0.033375 }, { "epoch": 0.17361817565276366, "grad_norm": 0.11516234278678894, "learning_rate": 0.0002876731183794024, "loss": 0.43939366936683655, "memory(GiB)": 78.26, "step": 896, "token_acc": 0.8730681167716084, "train_speed(iter/s)": 0.033377 }, { "epoch": 0.1738119459380904, "grad_norm": 0.10177889466285706, "learning_rate": 0.000287634926202857, "loss": 0.37696516513824463, "memory(GiB)": 78.26, "step": 897, "token_acc": 0.8901657575597098, "train_speed(iter/s)": 0.033379 }, { "epoch": 0.17400571622341715, "grad_norm": 0.11825387924909592, "learning_rate": 0.00028759667749594903, "loss": 0.4378824830055237, "memory(GiB)": 78.26, "step": 898, "token_acc": 0.8729655768399828, "train_speed(iter/s)": 0.033381 }, { "epoch": 0.1741994865087439, "grad_norm": 0.1011887937784195, "learning_rate": 0.0002875583722743882, "loss": 0.4053630232810974, "memory(GiB)": 78.26, "step": 899, "token_acc": 0.8832929164007658, "train_speed(iter/s)": 0.033383 }, { "epoch": 0.17439325679407064, "grad_norm": 0.11091629415750504, "learning_rate": 0.0002875200105539076, "loss": 0.43335989117622375, "memory(GiB)": 78.26, "step": 900, "token_acc": 0.8734870853616364, "train_speed(iter/s)": 0.033385 }, { "epoch": 0.17458702707939738, "grad_norm": 0.12012141942977905, "learning_rate": 0.00028748159235026337, "loss": 0.4563184380531311, "memory(GiB)": 78.26, "step": 901, "token_acc": 0.8695292291774444, "train_speed(iter/s)": 0.033387 }, { "epoch": 0.17478079736472413, "grad_norm": 0.1084456518292427, "learning_rate": 0.00028744311767923487, "loss": 0.42208635807037354, "memory(GiB)": 78.26, "step": 902, "token_acc": 0.8803865467709915, "train_speed(iter/s)": 0.033389 }, { "epoch": 0.17497456765005087, "grad_norm": 0.11079172044992447, "learning_rate": 0.00028740458655662467, "loss": 0.4311895966529846, "memory(GiB)": 78.26, "step": 903, "token_acc": 0.877756061809076, "train_speed(iter/s)": 0.033391 }, { "epoch": 0.17516833793537762, "grad_norm": 0.10750513523817062, "learning_rate": 0.00028736599899825856, "loss": 0.43391120433807373, "memory(GiB)": 78.26, "step": 904, "token_acc": 0.8769369834710744, "train_speed(iter/s)": 0.033393 }, { "epoch": 0.17536210822070436, "grad_norm": 0.11065103858709335, "learning_rate": 0.00028732735501998556, "loss": 0.4614184498786926, "memory(GiB)": 78.26, "step": 905, "token_acc": 0.8664335104252877, "train_speed(iter/s)": 0.033395 }, { "epoch": 0.1755558785060311, "grad_norm": 0.11256072670221329, "learning_rate": 0.0002872886546376777, "loss": 0.4341152012348175, "memory(GiB)": 78.26, "step": 906, "token_acc": 0.8766848053578903, "train_speed(iter/s)": 0.033397 }, { "epoch": 0.17574964879135785, "grad_norm": 0.10496652871370316, "learning_rate": 0.00028724989786723027, "loss": 0.4188949167728424, "memory(GiB)": 78.26, "step": 907, "token_acc": 0.8798962523819606, "train_speed(iter/s)": 0.033399 }, { "epoch": 0.1759434190766846, "grad_norm": 0.10520727187395096, "learning_rate": 0.00028721108472456173, "loss": 0.40807557106018066, "memory(GiB)": 78.26, "step": 908, "token_acc": 0.8816372301654051, "train_speed(iter/s)": 0.033401 }, { "epoch": 0.17613718936201134, "grad_norm": 0.10006900131702423, "learning_rate": 0.0002871722152256137, "loss": 0.41225340962409973, "memory(GiB)": 78.26, "step": 909, "token_acc": 0.8827568245850794, "train_speed(iter/s)": 0.033403 }, { "epoch": 0.17633095964733808, "grad_norm": 0.10612796992063522, "learning_rate": 0.0002871332893863509, "loss": 0.40802818536758423, "memory(GiB)": 78.26, "step": 910, "token_acc": 0.8830624153381265, "train_speed(iter/s)": 0.033405 }, { "epoch": 0.17652472993266483, "grad_norm": 0.11015652120113373, "learning_rate": 0.0002870943072227613, "loss": 0.4355056881904602, "memory(GiB)": 78.26, "step": 911, "token_acc": 0.8744773544633198, "train_speed(iter/s)": 0.033407 }, { "epoch": 0.17671850021799157, "grad_norm": 0.11280830204486847, "learning_rate": 0.00028705526875085575, "loss": 0.39818063378334045, "memory(GiB)": 78.26, "step": 912, "token_acc": 0.8824910029729307, "train_speed(iter/s)": 0.033409 }, { "epoch": 0.17691227050331831, "grad_norm": 0.10122046619653702, "learning_rate": 0.00028701617398666857, "loss": 0.412325382232666, "memory(GiB)": 78.26, "step": 913, "token_acc": 0.8825016812373907, "train_speed(iter/s)": 0.033411 }, { "epoch": 0.17710604078864506, "grad_norm": 0.11189986765384674, "learning_rate": 0.00028697702294625693, "loss": 0.4302537441253662, "memory(GiB)": 78.26, "step": 914, "token_acc": 0.8747316444826105, "train_speed(iter/s)": 0.033413 }, { "epoch": 0.1772998110739718, "grad_norm": 0.1030743420124054, "learning_rate": 0.0002869378156457013, "loss": 0.4212334454059601, "memory(GiB)": 78.26, "step": 915, "token_acc": 0.8769243019428814, "train_speed(iter/s)": 0.033415 }, { "epoch": 0.17749358135929855, "grad_norm": 0.09644519537687302, "learning_rate": 0.0002868985521011051, "loss": 0.3844713568687439, "memory(GiB)": 78.26, "step": 916, "token_acc": 0.8888474264248576, "train_speed(iter/s)": 0.033416 }, { "epoch": 0.1776873516446253, "grad_norm": 0.10586938261985779, "learning_rate": 0.000286859232328595, "loss": 0.4248504638671875, "memory(GiB)": 78.26, "step": 917, "token_acc": 0.8777870642473915, "train_speed(iter/s)": 0.033418 }, { "epoch": 0.17788112192995204, "grad_norm": 0.11170489341020584, "learning_rate": 0.00028681985634432055, "loss": 0.41771912574768066, "memory(GiB)": 78.26, "step": 918, "token_acc": 0.8776203050061971, "train_speed(iter/s)": 0.03342 }, { "epoch": 0.17807489221527878, "grad_norm": 0.11335260421037674, "learning_rate": 0.00028678042416445463, "loss": 0.4187549352645874, "memory(GiB)": 78.26, "step": 919, "token_acc": 0.8776417309627642, "train_speed(iter/s)": 0.033422 }, { "epoch": 0.17826866250060552, "grad_norm": 0.10571952164173126, "learning_rate": 0.0002867409358051931, "loss": 0.4060910642147064, "memory(GiB)": 78.26, "step": 920, "token_acc": 0.8823606660713348, "train_speed(iter/s)": 0.033423 }, { "epoch": 0.17846243278593227, "grad_norm": 0.12325315922498703, "learning_rate": 0.00028670139128275483, "loss": 0.44652315974235535, "memory(GiB)": 78.26, "step": 921, "token_acc": 0.8700105204529983, "train_speed(iter/s)": 0.033425 }, { "epoch": 0.178656203071259, "grad_norm": 0.1102391704916954, "learning_rate": 0.0002866617906133819, "loss": 0.4251522123813629, "memory(GiB)": 78.26, "step": 922, "token_acc": 0.876720608841388, "train_speed(iter/s)": 0.033427 }, { "epoch": 0.17884997335658576, "grad_norm": 0.1056000366806984, "learning_rate": 0.00028662213381333926, "loss": 0.4017042815685272, "memory(GiB)": 78.26, "step": 923, "token_acc": 0.8835219794682811, "train_speed(iter/s)": 0.033429 }, { "epoch": 0.1790437436419125, "grad_norm": 0.11068852990865707, "learning_rate": 0.00028658242089891513, "loss": 0.40017062425613403, "memory(GiB)": 78.26, "step": 924, "token_acc": 0.8838785514205683, "train_speed(iter/s)": 0.033431 }, { "epoch": 0.17923751392723924, "grad_norm": 0.10683512687683105, "learning_rate": 0.0002865426518864206, "loss": 0.4157922565937042, "memory(GiB)": 78.26, "step": 925, "token_acc": 0.8800466330435665, "train_speed(iter/s)": 0.033433 }, { "epoch": 0.17943128421256602, "grad_norm": 0.10481857508420944, "learning_rate": 0.00028650282679218994, "loss": 0.4214838743209839, "memory(GiB)": 78.26, "step": 926, "token_acc": 0.8785436944843359, "train_speed(iter/s)": 0.033435 }, { "epoch": 0.17962505449789276, "grad_norm": 0.10210458934307098, "learning_rate": 0.0002864629456325803, "loss": 0.39417165517807007, "memory(GiB)": 78.26, "step": 927, "token_acc": 0.8880719554934686, "train_speed(iter/s)": 0.033437 }, { "epoch": 0.1798188247832195, "grad_norm": 0.10829130560159683, "learning_rate": 0.000286423008423972, "loss": 0.4183374047279358, "memory(GiB)": 78.26, "step": 928, "token_acc": 0.8781808337845154, "train_speed(iter/s)": 0.033438 }, { "epoch": 0.18001259506854625, "grad_norm": 0.10995203256607056, "learning_rate": 0.00028638301518276826, "loss": 0.447214275598526, "memory(GiB)": 78.26, "step": 929, "token_acc": 0.8710934284538345, "train_speed(iter/s)": 0.03344 }, { "epoch": 0.180206365353873, "grad_norm": 0.10909594595432281, "learning_rate": 0.00028634296592539547, "loss": 0.3997655510902405, "memory(GiB)": 78.26, "step": 930, "token_acc": 0.8848353156450137, "train_speed(iter/s)": 0.033442 }, { "epoch": 0.18040013563919974, "grad_norm": 0.1183900386095047, "learning_rate": 0.0002863028606683029, "loss": 0.4467264413833618, "memory(GiB)": 78.26, "step": 931, "token_acc": 0.8734884573103701, "train_speed(iter/s)": 0.033444 }, { "epoch": 0.18059390592452648, "grad_norm": 0.10841910541057587, "learning_rate": 0.00028626269942796294, "loss": 0.4616568088531494, "memory(GiB)": 78.26, "step": 932, "token_acc": 0.8696779070709253, "train_speed(iter/s)": 0.033446 }, { "epoch": 0.18078767620985323, "grad_norm": 0.11348854750394821, "learning_rate": 0.0002862224822208707, "loss": 0.45820578932762146, "memory(GiB)": 78.26, "step": 933, "token_acc": 0.8705489207844856, "train_speed(iter/s)": 0.033448 }, { "epoch": 0.18098144649517997, "grad_norm": 0.10200490057468414, "learning_rate": 0.0002861822090635446, "loss": 0.40025630593299866, "memory(GiB)": 78.26, "step": 934, "token_acc": 0.8839878318584071, "train_speed(iter/s)": 0.03345 }, { "epoch": 0.18117521678050671, "grad_norm": 0.10408560186624527, "learning_rate": 0.00028614187997252585, "loss": 0.44248849153518677, "memory(GiB)": 78.26, "step": 935, "token_acc": 0.8738729347996039, "train_speed(iter/s)": 0.033451 }, { "epoch": 0.18136898706583346, "grad_norm": 0.1146451011300087, "learning_rate": 0.0002861014949643787, "loss": 0.4483628273010254, "memory(GiB)": 78.26, "step": 936, "token_acc": 0.8693443867125927, "train_speed(iter/s)": 0.033453 }, { "epoch": 0.1815627573511602, "grad_norm": 0.11855790764093399, "learning_rate": 0.0002860610540556905, "loss": 0.41510143876075745, "memory(GiB)": 78.26, "step": 937, "token_acc": 0.8812605670888882, "train_speed(iter/s)": 0.033455 }, { "epoch": 0.18175652763648695, "grad_norm": 0.12932439148426056, "learning_rate": 0.0002860205572630712, "loss": 0.41384294629096985, "memory(GiB)": 78.26, "step": 938, "token_acc": 0.8795511921458625, "train_speed(iter/s)": 0.033457 }, { "epoch": 0.1819502979218137, "grad_norm": 0.1010417714715004, "learning_rate": 0.00028598000460315404, "loss": 0.3942815363407135, "memory(GiB)": 78.26, "step": 939, "token_acc": 0.88588022518277, "train_speed(iter/s)": 0.033459 }, { "epoch": 0.18214406820714044, "grad_norm": 0.11028870195150375, "learning_rate": 0.00028593939609259506, "loss": 0.46208620071411133, "memory(GiB)": 78.26, "step": 940, "token_acc": 0.8662080777943533, "train_speed(iter/s)": 0.033461 }, { "epoch": 0.18233783849246718, "grad_norm": 0.10703767091035843, "learning_rate": 0.0002858987317480733, "loss": 0.4361118674278259, "memory(GiB)": 78.26, "step": 941, "token_acc": 0.8754664583875726, "train_speed(iter/s)": 0.033463 }, { "epoch": 0.18253160877779392, "grad_norm": 0.09738662093877792, "learning_rate": 0.00028585801158629063, "loss": 0.4183694124221802, "memory(GiB)": 78.26, "step": 942, "token_acc": 0.8812399810345217, "train_speed(iter/s)": 0.033465 }, { "epoch": 0.18272537906312067, "grad_norm": 0.10861563682556152, "learning_rate": 0.000285817235623972, "loss": 0.4377916157245636, "memory(GiB)": 78.26, "step": 943, "token_acc": 0.8772842397668782, "train_speed(iter/s)": 0.033466 }, { "epoch": 0.1829191493484474, "grad_norm": 0.11255238950252533, "learning_rate": 0.0002857764038778651, "loss": 0.4062233865261078, "memory(GiB)": 78.26, "step": 944, "token_acc": 0.8827611918896441, "train_speed(iter/s)": 0.033468 }, { "epoch": 0.18311291963377416, "grad_norm": 0.1005673035979271, "learning_rate": 0.0002857355163647407, "loss": 0.3768383264541626, "memory(GiB)": 78.26, "step": 945, "token_acc": 0.8898000897352809, "train_speed(iter/s)": 0.03347 }, { "epoch": 0.1833066899191009, "grad_norm": 0.1112765297293663, "learning_rate": 0.00028569457310139237, "loss": 0.4166128635406494, "memory(GiB)": 78.26, "step": 946, "token_acc": 0.87920341245126, "train_speed(iter/s)": 0.033472 }, { "epoch": 0.18350046020442765, "grad_norm": 0.12681414186954498, "learning_rate": 0.00028565357410463663, "loss": 0.4559955894947052, "memory(GiB)": 78.26, "step": 947, "token_acc": 0.8697389451251998, "train_speed(iter/s)": 0.033474 }, { "epoch": 0.1836942304897544, "grad_norm": 0.1174798235297203, "learning_rate": 0.0002856125193913128, "loss": 0.48667585849761963, "memory(GiB)": 78.26, "step": 948, "token_acc": 0.8624453490125132, "train_speed(iter/s)": 0.033476 }, { "epoch": 0.18388800077508113, "grad_norm": 0.11614065617322922, "learning_rate": 0.00028557140897828324, "loss": 0.43031829595565796, "memory(GiB)": 78.26, "step": 949, "token_acc": 0.8768826126436422, "train_speed(iter/s)": 0.033478 }, { "epoch": 0.18408177106040788, "grad_norm": 0.11647061258554459, "learning_rate": 0.00028553024288243306, "loss": 0.43689054250717163, "memory(GiB)": 78.26, "step": 950, "token_acc": 0.8749047820567076, "train_speed(iter/s)": 0.03348 }, { "epoch": 0.18427554134573462, "grad_norm": 0.12167865037918091, "learning_rate": 0.0002854890211206703, "loss": 0.4729604125022888, "memory(GiB)": 78.26, "step": 951, "token_acc": 0.8637253103406103, "train_speed(iter/s)": 0.033483 }, { "epoch": 0.18446931163106137, "grad_norm": 0.11439337581396103, "learning_rate": 0.00028544774370992587, "loss": 0.4109064042568207, "memory(GiB)": 78.26, "step": 952, "token_acc": 0.8795947901591896, "train_speed(iter/s)": 0.033484 }, { "epoch": 0.1846630819163881, "grad_norm": 0.11438382416963577, "learning_rate": 0.0002854064106671534, "loss": 0.43977493047714233, "memory(GiB)": 78.26, "step": 953, "token_acc": 0.8736031785448225, "train_speed(iter/s)": 0.033486 }, { "epoch": 0.18485685220171486, "grad_norm": 0.10898079723119736, "learning_rate": 0.0002853650220093296, "loss": 0.45643150806427, "memory(GiB)": 78.26, "step": 954, "token_acc": 0.8683694024069004, "train_speed(iter/s)": 0.033488 }, { "epoch": 0.1850506224870416, "grad_norm": 0.1164807602763176, "learning_rate": 0.0002853235777534539, "loss": 0.41862934827804565, "memory(GiB)": 78.26, "step": 955, "token_acc": 0.8782421565296183, "train_speed(iter/s)": 0.03349 }, { "epoch": 0.18524439277236834, "grad_norm": 0.11116447299718857, "learning_rate": 0.00028528207791654847, "loss": 0.4242367148399353, "memory(GiB)": 78.26, "step": 956, "token_acc": 0.8753245376993589, "train_speed(iter/s)": 0.033492 }, { "epoch": 0.18543816305769512, "grad_norm": 0.11318708211183548, "learning_rate": 0.0002852405225156585, "loss": 0.43691354990005493, "memory(GiB)": 78.26, "step": 957, "token_acc": 0.8767160750423891, "train_speed(iter/s)": 0.033493 }, { "epoch": 0.18563193334302186, "grad_norm": 0.10934760421514511, "learning_rate": 0.00028519891156785187, "loss": 0.45028555393218994, "memory(GiB)": 78.26, "step": 958, "token_acc": 0.8715856095936043, "train_speed(iter/s)": 0.033495 }, { "epoch": 0.1858257036283486, "grad_norm": 0.11843264102935791, "learning_rate": 0.0002851572450902193, "loss": 0.45888209342956543, "memory(GiB)": 78.26, "step": 959, "token_acc": 0.8671850891810159, "train_speed(iter/s)": 0.033497 }, { "epoch": 0.18601947391367535, "grad_norm": 0.10771480202674866, "learning_rate": 0.0002851155230998744, "loss": 0.44502004981040955, "memory(GiB)": 78.26, "step": 960, "token_acc": 0.8715418593237414, "train_speed(iter/s)": 0.033499 }, { "epoch": 0.1862132441990021, "grad_norm": 0.11727321892976761, "learning_rate": 0.00028507374561395345, "loss": 0.4620700478553772, "memory(GiB)": 78.26, "step": 961, "token_acc": 0.8692043503148255, "train_speed(iter/s)": 0.033501 }, { "epoch": 0.18640701448432884, "grad_norm": 0.11580432206392288, "learning_rate": 0.0002850319126496156, "loss": 0.43666505813598633, "memory(GiB)": 78.26, "step": 962, "token_acc": 0.8729514358743046, "train_speed(iter/s)": 0.033503 }, { "epoch": 0.18660078476965558, "grad_norm": 0.11996878683567047, "learning_rate": 0.00028499002422404274, "loss": 0.49721530079841614, "memory(GiB)": 78.26, "step": 963, "token_acc": 0.8572844400396432, "train_speed(iter/s)": 0.033504 }, { "epoch": 0.18679455505498233, "grad_norm": 0.09926389157772064, "learning_rate": 0.00028494808035443966, "loss": 0.3834714889526367, "memory(GiB)": 78.26, "step": 964, "token_acc": 0.8917675254643499, "train_speed(iter/s)": 0.033506 }, { "epoch": 0.18698832534030907, "grad_norm": 0.11022564768791199, "learning_rate": 0.00028490608105803374, "loss": 0.43356478214263916, "memory(GiB)": 78.26, "step": 965, "token_acc": 0.8749029825923051, "train_speed(iter/s)": 0.033508 }, { "epoch": 0.1871820956256358, "grad_norm": 0.11110399663448334, "learning_rate": 0.0002848640263520753, "loss": 0.4324399530887604, "memory(GiB)": 78.26, "step": 966, "token_acc": 0.8767509301816591, "train_speed(iter/s)": 0.03351 }, { "epoch": 0.18737586591096256, "grad_norm": 0.11281298100948334, "learning_rate": 0.00028482191625383733, "loss": 0.4322070777416229, "memory(GiB)": 78.26, "step": 967, "token_acc": 0.8765359035058055, "train_speed(iter/s)": 0.033512 }, { "epoch": 0.1875696361962893, "grad_norm": 0.1022447943687439, "learning_rate": 0.0002847797507806155, "loss": 0.39318740367889404, "memory(GiB)": 78.26, "step": 968, "token_acc": 0.8852789546220203, "train_speed(iter/s)": 0.033513 }, { "epoch": 0.18776340648161605, "grad_norm": 0.12078309804201126, "learning_rate": 0.0002847375299497284, "loss": 0.4697941243648529, "memory(GiB)": 78.26, "step": 969, "token_acc": 0.8657820386654392, "train_speed(iter/s)": 0.033515 }, { "epoch": 0.1879571767669428, "grad_norm": 0.10926394909620285, "learning_rate": 0.00028469525377851715, "loss": 0.3991783857345581, "memory(GiB)": 78.26, "step": 970, "token_acc": 0.8852660300136426, "train_speed(iter/s)": 0.033517 }, { "epoch": 0.18815094705226953, "grad_norm": 0.11045163869857788, "learning_rate": 0.0002846529222843458, "loss": 0.4475860893726349, "memory(GiB)": 78.26, "step": 971, "token_acc": 0.8725651014968219, "train_speed(iter/s)": 0.033518 }, { "epoch": 0.18834471733759628, "grad_norm": 0.09906210750341415, "learning_rate": 0.000284610535484601, "loss": 0.34970560669898987, "memory(GiB)": 78.26, "step": 972, "token_acc": 0.8972711247963943, "train_speed(iter/s)": 0.03352 }, { "epoch": 0.18853848762292302, "grad_norm": 0.11121566593647003, "learning_rate": 0.00028456809339669214, "loss": 0.4188978672027588, "memory(GiB)": 78.26, "step": 973, "token_acc": 0.8785746369216949, "train_speed(iter/s)": 0.033522 }, { "epoch": 0.18873225790824977, "grad_norm": 0.11328519135713577, "learning_rate": 0.00028452559603805137, "loss": 0.44297975301742554, "memory(GiB)": 78.26, "step": 974, "token_acc": 0.8741913622081202, "train_speed(iter/s)": 0.033524 }, { "epoch": 0.1889260281935765, "grad_norm": 0.10972630977630615, "learning_rate": 0.00028448304342613344, "loss": 0.4079618453979492, "memory(GiB)": 78.26, "step": 975, "token_acc": 0.8813834977772959, "train_speed(iter/s)": 0.033526 }, { "epoch": 0.18911979847890326, "grad_norm": 0.0967109352350235, "learning_rate": 0.00028444043557841585, "loss": 0.38039693236351013, "memory(GiB)": 78.26, "step": 976, "token_acc": 0.8867412594640688, "train_speed(iter/s)": 0.033528 }, { "epoch": 0.18931356876423, "grad_norm": 0.10372382402420044, "learning_rate": 0.00028439777251239887, "loss": 0.42939719557762146, "memory(GiB)": 78.26, "step": 977, "token_acc": 0.8766995304010031, "train_speed(iter/s)": 0.033529 }, { "epoch": 0.18950733904955674, "grad_norm": 0.10110750049352646, "learning_rate": 0.00028435505424560527, "loss": 0.4236353039741516, "memory(GiB)": 78.26, "step": 978, "token_acc": 0.8794206803637589, "train_speed(iter/s)": 0.033531 }, { "epoch": 0.1897011093348835, "grad_norm": 0.11021259427070618, "learning_rate": 0.00028431228079558063, "loss": 0.4256800711154938, "memory(GiB)": 78.26, "step": 979, "token_acc": 0.8776469925658932, "train_speed(iter/s)": 0.033533 }, { "epoch": 0.18989487962021023, "grad_norm": 0.10382870584726334, "learning_rate": 0.00028426945217989316, "loss": 0.39924290776252747, "memory(GiB)": 78.26, "step": 980, "token_acc": 0.8837884005314389, "train_speed(iter/s)": 0.033535 }, { "epoch": 0.19008864990553698, "grad_norm": 0.11175846308469772, "learning_rate": 0.00028422656841613377, "loss": 0.42587774991989136, "memory(GiB)": 78.26, "step": 981, "token_acc": 0.8782795084309802, "train_speed(iter/s)": 0.033537 }, { "epoch": 0.19028242019086372, "grad_norm": 0.10329094529151917, "learning_rate": 0.00028418362952191585, "loss": 0.40456724166870117, "memory(GiB)": 78.26, "step": 982, "token_acc": 0.881374140565317, "train_speed(iter/s)": 0.033539 }, { "epoch": 0.19047619047619047, "grad_norm": 0.11278124898672104, "learning_rate": 0.0002841406355148757, "loss": 0.4433564245700836, "memory(GiB)": 78.26, "step": 983, "token_acc": 0.871099352158647, "train_speed(iter/s)": 0.033541 }, { "epoch": 0.1906699607615172, "grad_norm": 0.11207697540521622, "learning_rate": 0.000284097586412672, "loss": 0.40271443128585815, "memory(GiB)": 78.26, "step": 984, "token_acc": 0.8841104695500599, "train_speed(iter/s)": 0.033542 }, { "epoch": 0.19086373104684395, "grad_norm": 0.11175701022148132, "learning_rate": 0.00028405448223298624, "loss": 0.451436847448349, "memory(GiB)": 78.26, "step": 985, "token_acc": 0.8711304942705661, "train_speed(iter/s)": 0.033544 }, { "epoch": 0.1910575013321707, "grad_norm": 0.10010375082492828, "learning_rate": 0.0002840113229935224, "loss": 0.3909650444984436, "memory(GiB)": 78.26, "step": 986, "token_acc": 0.8874252112061217, "train_speed(iter/s)": 0.033545 }, { "epoch": 0.19125127161749747, "grad_norm": 0.09799065440893173, "learning_rate": 0.0002839681087120073, "loss": 0.38588500022888184, "memory(GiB)": 78.26, "step": 987, "token_acc": 0.8859629435674921, "train_speed(iter/s)": 0.033547 }, { "epoch": 0.19144504190282421, "grad_norm": 0.11740557104349136, "learning_rate": 0.0002839248394061899, "loss": 0.4740091562271118, "memory(GiB)": 78.26, "step": 988, "token_acc": 0.8610855829982769, "train_speed(iter/s)": 0.033549 }, { "epoch": 0.19163881218815096, "grad_norm": 0.11015292257070541, "learning_rate": 0.0002838815150938424, "loss": 0.41003599762916565, "memory(GiB)": 78.26, "step": 989, "token_acc": 0.882699868938401, "train_speed(iter/s)": 0.033551 }, { "epoch": 0.1918325824734777, "grad_norm": 0.11227616667747498, "learning_rate": 0.0002838381357927591, "loss": 0.41413140296936035, "memory(GiB)": 78.26, "step": 990, "token_acc": 0.880181635836862, "train_speed(iter/s)": 0.033552 }, { "epoch": 0.19202635275880445, "grad_norm": 0.11800327897071838, "learning_rate": 0.000283794701520757, "loss": 0.4505453109741211, "memory(GiB)": 78.26, "step": 991, "token_acc": 0.8734527175620483, "train_speed(iter/s)": 0.033554 }, { "epoch": 0.1922201230441312, "grad_norm": 0.11021004617214203, "learning_rate": 0.00028375121229567583, "loss": 0.42506149411201477, "memory(GiB)": 78.26, "step": 992, "token_acc": 0.8773052126543056, "train_speed(iter/s)": 0.033556 }, { "epoch": 0.19241389332945794, "grad_norm": 0.11306871473789215, "learning_rate": 0.0002837076681353777, "loss": 0.4095050096511841, "memory(GiB)": 78.26, "step": 993, "token_acc": 0.8811114926437571, "train_speed(iter/s)": 0.033557 }, { "epoch": 0.19260766361478468, "grad_norm": 0.09795724600553513, "learning_rate": 0.00028366406905774746, "loss": 0.376755952835083, "memory(GiB)": 78.26, "step": 994, "token_acc": 0.8902818813383374, "train_speed(iter/s)": 0.033559 }, { "epoch": 0.19280143390011142, "grad_norm": 0.10513719916343689, "learning_rate": 0.0002836204150806923, "loss": 0.39423179626464844, "memory(GiB)": 78.26, "step": 995, "token_acc": 0.8858132362606805, "train_speed(iter/s)": 0.03356 }, { "epoch": 0.19299520418543817, "grad_norm": 0.10870835930109024, "learning_rate": 0.0002835767062221422, "loss": 0.43109869956970215, "memory(GiB)": 78.26, "step": 996, "token_acc": 0.8794217244073977, "train_speed(iter/s)": 0.033562 }, { "epoch": 0.1931889744707649, "grad_norm": 0.11085118353366852, "learning_rate": 0.0002835329425000495, "loss": 0.41626110672950745, "memory(GiB)": 78.26, "step": 997, "token_acc": 0.8794915927446048, "train_speed(iter/s)": 0.033564 }, { "epoch": 0.19338274475609166, "grad_norm": 0.10917031019926071, "learning_rate": 0.00028348912393238914, "loss": 0.442068487405777, "memory(GiB)": 78.26, "step": 998, "token_acc": 0.8718288169021695, "train_speed(iter/s)": 0.033566 }, { "epoch": 0.1935765150414184, "grad_norm": 0.11420496553182602, "learning_rate": 0.00028344525053715857, "loss": 0.45237767696380615, "memory(GiB)": 78.26, "step": 999, "token_acc": 0.8713443106414352, "train_speed(iter/s)": 0.033568 }, { "epoch": 0.19377028532674515, "grad_norm": 0.10354109853506088, "learning_rate": 0.0002834013223323778, "loss": 0.40592384338378906, "memory(GiB)": 78.26, "step": 1000, "token_acc": 0.883349086326402, "train_speed(iter/s)": 0.033569 }, { "epoch": 0.19377028532674515, "eval_loss": 0.48618289828300476, "eval_runtime": 1345.1073, "eval_samples_per_second": 5.017, "eval_steps_per_second": 5.017, "eval_token_acc": 0.8788295367774018, "step": 1000 }, { "epoch": 0.1939640556120719, "grad_norm": 0.11008410900831223, "learning_rate": 0.00028335733933608937, "loss": 0.44370290637016296, "memory(GiB)": 78.26, "step": 1001, "token_acc": 0.8703201902339214, "train_speed(iter/s)": 0.032109 }, { "epoch": 0.19415782589739863, "grad_norm": 0.10204144567251205, "learning_rate": 0.00028331330156635814, "loss": 0.37651312351226807, "memory(GiB)": 78.26, "step": 1002, "token_acc": 0.8898332699723022, "train_speed(iter/s)": 0.032111 }, { "epoch": 0.19435159618272538, "grad_norm": 0.10822474956512451, "learning_rate": 0.0002832692090412717, "loss": 0.4068623185157776, "memory(GiB)": 78.26, "step": 1003, "token_acc": 0.8804925222697431, "train_speed(iter/s)": 0.032114 }, { "epoch": 0.19454536646805212, "grad_norm": 0.10033664852380753, "learning_rate": 0.0002832250617789401, "loss": 0.4149298667907715, "memory(GiB)": 78.26, "step": 1004, "token_acc": 0.8801352320458969, "train_speed(iter/s)": 0.032117 }, { "epoch": 0.19473913675337887, "grad_norm": 0.12285728007555008, "learning_rate": 0.00028318085979749563, "loss": 0.46761053800582886, "memory(GiB)": 78.26, "step": 1005, "token_acc": 0.8641041659816526, "train_speed(iter/s)": 0.03212 }, { "epoch": 0.1949329070387056, "grad_norm": 0.09842801839113235, "learning_rate": 0.0002831366031150934, "loss": 0.39952999353408813, "memory(GiB)": 78.26, "step": 1006, "token_acc": 0.8830670627587659, "train_speed(iter/s)": 0.032123 }, { "epoch": 0.19512667732403235, "grad_norm": 0.11463743448257446, "learning_rate": 0.0002830922917499108, "loss": 0.43408212065696716, "memory(GiB)": 78.26, "step": 1007, "token_acc": 0.8743004464566434, "train_speed(iter/s)": 0.032126 }, { "epoch": 0.1953204476093591, "grad_norm": 0.11018949747085571, "learning_rate": 0.00028304792572014754, "loss": 0.43823209404945374, "memory(GiB)": 78.26, "step": 1008, "token_acc": 0.8741348099243957, "train_speed(iter/s)": 0.032128 }, { "epoch": 0.19551421789468584, "grad_norm": 0.11423125863075256, "learning_rate": 0.00028300350504402606, "loss": 0.45217055082321167, "memory(GiB)": 78.26, "step": 1009, "token_acc": 0.8701353400348177, "train_speed(iter/s)": 0.032131 }, { "epoch": 0.1957079881800126, "grad_norm": 0.1115386039018631, "learning_rate": 0.0002829590297397912, "loss": 0.4279889762401581, "memory(GiB)": 78.26, "step": 1010, "token_acc": 0.8788333144732375, "train_speed(iter/s)": 0.032134 }, { "epoch": 0.19590175846533933, "grad_norm": 0.1104380413889885, "learning_rate": 0.00028291449982570995, "loss": 0.4566521942615509, "memory(GiB)": 78.26, "step": 1011, "token_acc": 0.8691424216142343, "train_speed(iter/s)": 0.032137 }, { "epoch": 0.19609552875066608, "grad_norm": 0.11296399682760239, "learning_rate": 0.00028286991532007217, "loss": 0.43526938557624817, "memory(GiB)": 78.26, "step": 1012, "token_acc": 0.8731874876819551, "train_speed(iter/s)": 0.03214 }, { "epoch": 0.19628929903599282, "grad_norm": 0.10835513472557068, "learning_rate": 0.0002828252762411898, "loss": 0.4160099923610687, "memory(GiB)": 78.26, "step": 1013, "token_acc": 0.880335822929728, "train_speed(iter/s)": 0.032143 }, { "epoch": 0.19648306932131956, "grad_norm": 0.09724919497966766, "learning_rate": 0.00028278058260739733, "loss": 0.3947051763534546, "memory(GiB)": 78.26, "step": 1014, "token_acc": 0.8845330218523385, "train_speed(iter/s)": 0.032145 }, { "epoch": 0.1966768396066463, "grad_norm": 0.11479044705629349, "learning_rate": 0.0002827358344370516, "loss": 0.4259167015552521, "memory(GiB)": 78.26, "step": 1015, "token_acc": 0.8770737733851042, "train_speed(iter/s)": 0.032148 }, { "epoch": 0.19687060989197305, "grad_norm": 0.11837134510278702, "learning_rate": 0.000282691031748532, "loss": 0.4474408030509949, "memory(GiB)": 78.26, "step": 1016, "token_acc": 0.8693976711066432, "train_speed(iter/s)": 0.032151 }, { "epoch": 0.19706438017729982, "grad_norm": 0.10522796213626862, "learning_rate": 0.00028264617456024, "loss": 0.41682255268096924, "memory(GiB)": 78.26, "step": 1017, "token_acc": 0.8796135557400558, "train_speed(iter/s)": 0.032154 }, { "epoch": 0.19725815046262657, "grad_norm": 0.10754600167274475, "learning_rate": 0.00028260126289059986, "loss": 0.4248649477958679, "memory(GiB)": 78.26, "step": 1018, "token_acc": 0.8764376037077302, "train_speed(iter/s)": 0.032156 }, { "epoch": 0.1974519207479533, "grad_norm": 0.10504762083292007, "learning_rate": 0.00028255629675805785, "loss": 0.4158293604850769, "memory(GiB)": 78.26, "step": 1019, "token_acc": 0.8806701766216252, "train_speed(iter/s)": 0.032159 }, { "epoch": 0.19764569103328006, "grad_norm": 0.11185994744300842, "learning_rate": 0.0002825112761810828, "loss": 0.43085163831710815, "memory(GiB)": 78.26, "step": 1020, "token_acc": 0.8744727730563424, "train_speed(iter/s)": 0.032162 }, { "epoch": 0.1978394613186068, "grad_norm": 0.09813162684440613, "learning_rate": 0.000282466201178166, "loss": 0.4018121063709259, "memory(GiB)": 78.26, "step": 1021, "token_acc": 0.8820514020879172, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.19803323160393355, "grad_norm": 0.1113823875784874, "learning_rate": 0.0002824210717678209, "loss": 0.40826714038848877, "memory(GiB)": 78.26, "step": 1022, "token_acc": 0.882307549027434, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.1982270018892603, "grad_norm": 0.1155644953250885, "learning_rate": 0.00028237588796858323, "loss": 0.4487013816833496, "memory(GiB)": 78.26, "step": 1023, "token_acc": 0.8709012113617377, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.19842077217458703, "grad_norm": 0.11795882880687714, "learning_rate": 0.0002823306497990113, "loss": 0.43464547395706177, "memory(GiB)": 78.26, "step": 1024, "token_acc": 0.8770835761743793, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.19861454245991378, "grad_norm": 0.1030697301030159, "learning_rate": 0.00028228535727768575, "loss": 0.4153880476951599, "memory(GiB)": 78.26, "step": 1025, "token_acc": 0.8816005247622172, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.19880831274524052, "grad_norm": 0.10959405452013016, "learning_rate": 0.00028224001042320923, "loss": 0.4293935000896454, "memory(GiB)": 78.26, "step": 1026, "token_acc": 0.8751588677065281, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.19900208303056727, "grad_norm": 0.11512494832277298, "learning_rate": 0.00028219460925420697, "loss": 0.46233439445495605, "memory(GiB)": 78.26, "step": 1027, "token_acc": 0.8720349947631076, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.199195853315894, "grad_norm": 0.11418092250823975, "learning_rate": 0.00028214915378932653, "loss": 0.45269933342933655, "memory(GiB)": 78.26, "step": 1028, "token_acc": 0.8701842783140374, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.19938962360122076, "grad_norm": 0.10788623243570328, "learning_rate": 0.00028210364404723765, "loss": 0.4401698708534241, "memory(GiB)": 78.26, "step": 1029, "token_acc": 0.8730818757377401, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.1995833938865475, "grad_norm": 0.10503195226192474, "learning_rate": 0.00028205808004663237, "loss": 0.39039331674575806, "memory(GiB)": 78.26, "step": 1030, "token_acc": 0.8859542777970212, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.19977716417187424, "grad_norm": 0.11032052338123322, "learning_rate": 0.0002820124618062251, "loss": 0.39245864748954773, "memory(GiB)": 78.26, "step": 1031, "token_acc": 0.8865768832322036, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.199970934457201, "grad_norm": 0.10605579614639282, "learning_rate": 0.00028196678934475246, "loss": 0.42015910148620605, "memory(GiB)": 78.26, "step": 1032, "token_acc": 0.8770413805519122, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.20016470474252773, "grad_norm": 0.11556072533130646, "learning_rate": 0.00028192106268097334, "loss": 0.412725567817688, "memory(GiB)": 78.26, "step": 1033, "token_acc": 0.8829517954994298, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.20035847502785448, "grad_norm": 0.11041781306266785, "learning_rate": 0.00028187528183366893, "loss": 0.4078274369239807, "memory(GiB)": 78.26, "step": 1034, "token_acc": 0.8829038467063116, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.20055224531318122, "grad_norm": 0.11429005116224289, "learning_rate": 0.0002818294468216426, "loss": 0.4424319267272949, "memory(GiB)": 78.26, "step": 1035, "token_acc": 0.8738159769761659, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.20074601559850797, "grad_norm": 0.10818706452846527, "learning_rate": 0.00028178355766372013, "loss": 0.41155439615249634, "memory(GiB)": 78.26, "step": 1036, "token_acc": 0.8821931101407084, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.2009397858838347, "grad_norm": 0.10576290637254715, "learning_rate": 0.0002817376143787493, "loss": 0.425853431224823, "memory(GiB)": 78.26, "step": 1037, "token_acc": 0.8778621912804088, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.20113355616916145, "grad_norm": 0.11154097318649292, "learning_rate": 0.0002816916169856004, "loss": 0.39488485455513, "memory(GiB)": 78.26, "step": 1038, "token_acc": 0.8855062677979039, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.2013273264544882, "grad_norm": 0.10686661303043365, "learning_rate": 0.00028164556550316563, "loss": 0.401602178812027, "memory(GiB)": 78.26, "step": 1039, "token_acc": 0.8835590770383853, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.20152109673981494, "grad_norm": 0.11480426788330078, "learning_rate": 0.00028159945995035975, "loss": 0.4522130489349365, "memory(GiB)": 78.26, "step": 1040, "token_acc": 0.8704494740197641, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.2017148670251417, "grad_norm": 0.10808564722537994, "learning_rate": 0.0002815533003461193, "loss": 0.4125955402851105, "memory(GiB)": 78.26, "step": 1041, "token_acc": 0.8798711669505963, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.20190863731046843, "grad_norm": 0.11643750965595245, "learning_rate": 0.00028150708670940356, "loss": 0.44826722145080566, "memory(GiB)": 78.26, "step": 1042, "token_acc": 0.8713785046728972, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.20210240759579517, "grad_norm": 0.1128215417265892, "learning_rate": 0.00028146081905919355, "loss": 0.4168522357940674, "memory(GiB)": 78.26, "step": 1043, "token_acc": 0.8776015581524763, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.20229617788112192, "grad_norm": 0.10721378773450851, "learning_rate": 0.00028141449741449264, "loss": 0.4148525297641754, "memory(GiB)": 78.26, "step": 1044, "token_acc": 0.880571123565519, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.20248994816644866, "grad_norm": 0.11519747227430344, "learning_rate": 0.0002813681217943264, "loss": 0.44546476006507874, "memory(GiB)": 78.26, "step": 1045, "token_acc": 0.8722711825355682, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.2026837184517754, "grad_norm": 0.10372038185596466, "learning_rate": 0.00028132169221774256, "loss": 0.36725738644599915, "memory(GiB)": 78.26, "step": 1046, "token_acc": 0.8941524609236684, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.20287748873710215, "grad_norm": 0.11429732292890549, "learning_rate": 0.00028127520870381095, "loss": 0.45745980739593506, "memory(GiB)": 78.26, "step": 1047, "token_acc": 0.8703554240277405, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.20307125902242892, "grad_norm": 0.11484638601541519, "learning_rate": 0.00028122867127162364, "loss": 0.4351459741592407, "memory(GiB)": 78.26, "step": 1048, "token_acc": 0.8764367816091954, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.20326502930775567, "grad_norm": 0.10716410726308823, "learning_rate": 0.0002811820799402948, "loss": 0.4354075789451599, "memory(GiB)": 78.26, "step": 1049, "token_acc": 0.8755018944749194, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.2034587995930824, "grad_norm": 0.0956411212682724, "learning_rate": 0.00028113543472896074, "loss": 0.37369605898857117, "memory(GiB)": 78.26, "step": 1050, "token_acc": 0.8899866307131041, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.20365256987840916, "grad_norm": 0.10245929658412933, "learning_rate": 0.0002810887356567798, "loss": 0.3949301242828369, "memory(GiB)": 78.26, "step": 1051, "token_acc": 0.8889433444422662, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.2038463401637359, "grad_norm": 0.11300604790449142, "learning_rate": 0.0002810419827429327, "loss": 0.4184969663619995, "memory(GiB)": 78.26, "step": 1052, "token_acc": 0.8773636026580726, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.20404011044906264, "grad_norm": 0.1060163602232933, "learning_rate": 0.00028099517600662207, "loss": 0.41772571206092834, "memory(GiB)": 78.26, "step": 1053, "token_acc": 0.8804525071341215, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.2042338807343894, "grad_norm": 0.11069151014089584, "learning_rate": 0.00028094831546707265, "loss": 0.4313889741897583, "memory(GiB)": 78.26, "step": 1054, "token_acc": 0.8761178045515395, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.20442765101971613, "grad_norm": 0.11553742736577988, "learning_rate": 0.00028090140114353133, "loss": 0.4176057279109955, "memory(GiB)": 78.26, "step": 1055, "token_acc": 0.8819988121591706, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.20462142130504288, "grad_norm": 0.10594391077756882, "learning_rate": 0.00028085443305526713, "loss": 0.394021213054657, "memory(GiB)": 78.26, "step": 1056, "token_acc": 0.885325837540156, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.20481519159036962, "grad_norm": 0.11002414673566818, "learning_rate": 0.0002808074112215711, "loss": 0.41269078850746155, "memory(GiB)": 78.26, "step": 1057, "token_acc": 0.8808142873550017, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.20500896187569637, "grad_norm": 0.09443158656358719, "learning_rate": 0.0002807603356617563, "loss": 0.352535218000412, "memory(GiB)": 78.26, "step": 1058, "token_acc": 0.8964792433000526, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.2052027321610231, "grad_norm": 0.11314352601766586, "learning_rate": 0.00028071320639515805, "loss": 0.4366722106933594, "memory(GiB)": 78.26, "step": 1059, "token_acc": 0.8743527508090615, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.20539650244634985, "grad_norm": 0.10947109758853912, "learning_rate": 0.00028066602344113353, "loss": 0.4176010489463806, "memory(GiB)": 78.26, "step": 1060, "token_acc": 0.8801968582792068, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.2055902727316766, "grad_norm": 0.10332785546779633, "learning_rate": 0.000280618786819062, "loss": 0.39836370944976807, "memory(GiB)": 78.26, "step": 1061, "token_acc": 0.8847858883602182, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.20578404301700334, "grad_norm": 0.10330932587385178, "learning_rate": 0.0002805714965483449, "loss": 0.404694139957428, "memory(GiB)": 78.26, "step": 1062, "token_acc": 0.8835061262959473, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.2059778133023301, "grad_norm": 0.12757974863052368, "learning_rate": 0.0002805241526484055, "loss": 0.4440545439720154, "memory(GiB)": 78.26, "step": 1063, "token_acc": 0.8742229290154692, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.20617158358765683, "grad_norm": 0.1015448048710823, "learning_rate": 0.00028047675513868936, "loss": 0.41053593158721924, "memory(GiB)": 78.26, "step": 1064, "token_acc": 0.8818211780215641, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.20636535387298358, "grad_norm": 0.10831795632839203, "learning_rate": 0.00028042930403866383, "loss": 0.4191955626010895, "memory(GiB)": 78.26, "step": 1065, "token_acc": 0.880605738575983, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.20655912415831032, "grad_norm": 0.10870091617107391, "learning_rate": 0.0002803817993678183, "loss": 0.41718360781669617, "memory(GiB)": 78.26, "step": 1066, "token_acc": 0.8796682921131778, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.20675289444363706, "grad_norm": 0.10902893543243408, "learning_rate": 0.00028033424114566434, "loss": 0.4172331988811493, "memory(GiB)": 78.26, "step": 1067, "token_acc": 0.8794469249603918, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.2069466647289638, "grad_norm": 0.11219903081655502, "learning_rate": 0.0002802866293917353, "loss": 0.42483946681022644, "memory(GiB)": 78.26, "step": 1068, "token_acc": 0.8773620614354346, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.20714043501429055, "grad_norm": 0.10504312813282013, "learning_rate": 0.00028023896412558664, "loss": 0.3886624276638031, "memory(GiB)": 78.26, "step": 1069, "token_acc": 0.8863732842790435, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.2073342052996173, "grad_norm": 0.11080943793058395, "learning_rate": 0.00028019124536679573, "loss": 0.4068402647972107, "memory(GiB)": 78.26, "step": 1070, "token_acc": 0.8815200753561757, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.20752797558494404, "grad_norm": 0.11574803292751312, "learning_rate": 0.000280143473134962, "loss": 0.4340014159679413, "memory(GiB)": 78.26, "step": 1071, "token_acc": 0.8773786767852557, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.20772174587027079, "grad_norm": 0.10820218175649643, "learning_rate": 0.00028009564744970676, "loss": 0.43230772018432617, "memory(GiB)": 78.26, "step": 1072, "token_acc": 0.8758193236979952, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.20791551615559753, "grad_norm": 0.11416003853082657, "learning_rate": 0.0002800477683306733, "loss": 0.4142245352268219, "memory(GiB)": 78.26, "step": 1073, "token_acc": 0.881467683756135, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.20810928644092427, "grad_norm": 0.10887296497821808, "learning_rate": 0.0002799998357975269, "loss": 0.3994034230709076, "memory(GiB)": 78.26, "step": 1074, "token_acc": 0.8853917309454148, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.20830305672625102, "grad_norm": 0.10443485528230667, "learning_rate": 0.00027995184986995465, "loss": 0.41546863317489624, "memory(GiB)": 78.26, "step": 1075, "token_acc": 0.8815385377451952, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.20849682701157776, "grad_norm": 0.10981526970863342, "learning_rate": 0.0002799038105676658, "loss": 0.4088175296783447, "memory(GiB)": 78.26, "step": 1076, "token_acc": 0.8810430263475949, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.2086905972969045, "grad_norm": 0.10930271446704865, "learning_rate": 0.0002798557179103912, "loss": 0.44178467988967896, "memory(GiB)": 78.26, "step": 1077, "token_acc": 0.8711125622398171, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.20888436758223128, "grad_norm": 0.10522231459617615, "learning_rate": 0.00027980757191788395, "loss": 0.4103907644748688, "memory(GiB)": 78.26, "step": 1078, "token_acc": 0.8826355904120181, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.20907813786755802, "grad_norm": 0.10563495755195618, "learning_rate": 0.00027975937260991886, "loss": 0.4192779064178467, "memory(GiB)": 78.26, "step": 1079, "token_acc": 0.8789847870957878, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.20927190815288477, "grad_norm": 0.10958468914031982, "learning_rate": 0.00027971112000629264, "loss": 0.4349307417869568, "memory(GiB)": 78.26, "step": 1080, "token_acc": 0.8740859040094141, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.2094656784382115, "grad_norm": 0.09944422543048859, "learning_rate": 0.000279662814126824, "loss": 0.35245591402053833, "memory(GiB)": 78.26, "step": 1081, "token_acc": 0.896417537322233, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.20965944872353826, "grad_norm": 0.10272464156150818, "learning_rate": 0.0002796144549913534, "loss": 0.4116705358028412, "memory(GiB)": 78.26, "step": 1082, "token_acc": 0.8808783118162493, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.209853219008865, "grad_norm": 0.11050140857696533, "learning_rate": 0.0002795660426197432, "loss": 0.42899516224861145, "memory(GiB)": 78.26, "step": 1083, "token_acc": 0.874714182142658, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.21004698929419174, "grad_norm": 0.10963544994592667, "learning_rate": 0.0002795175770318778, "loss": 0.43252986669540405, "memory(GiB)": 78.26, "step": 1084, "token_acc": 0.8759278350515464, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.2102407595795185, "grad_norm": 0.10902027040719986, "learning_rate": 0.0002794690582476632, "loss": 0.42844176292419434, "memory(GiB)": 78.26, "step": 1085, "token_acc": 0.8774478501489996, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.21043452986484523, "grad_norm": 0.10531258583068848, "learning_rate": 0.00027942048628702747, "loss": 0.39160269498825073, "memory(GiB)": 78.26, "step": 1086, "token_acc": 0.8859537508298296, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.21062830015017198, "grad_norm": 0.10602930188179016, "learning_rate": 0.0002793718611699203, "loss": 0.41201895475387573, "memory(GiB)": 78.26, "step": 1087, "token_acc": 0.8800353219229671, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.21082207043549872, "grad_norm": 0.11210530251264572, "learning_rate": 0.0002793231829163134, "loss": 0.40734055638313293, "memory(GiB)": 78.26, "step": 1088, "token_acc": 0.8804322497961086, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.21101584072082546, "grad_norm": 0.12339694052934647, "learning_rate": 0.00027927445154620026, "loss": 0.4420923888683319, "memory(GiB)": 78.26, "step": 1089, "token_acc": 0.8733834499841585, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.2112096110061522, "grad_norm": 0.09618587791919708, "learning_rate": 0.00027922566707959607, "loss": 0.390455961227417, "memory(GiB)": 78.26, "step": 1090, "token_acc": 0.88821477324435, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.21140338129147895, "grad_norm": 0.1139514371752739, "learning_rate": 0.00027917682953653805, "loss": 0.43686941266059875, "memory(GiB)": 78.26, "step": 1091, "token_acc": 0.8737241340844442, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.2115971515768057, "grad_norm": 0.10429059714078903, "learning_rate": 0.000279127938937085, "loss": 0.41246047616004944, "memory(GiB)": 78.26, "step": 1092, "token_acc": 0.8822545491408587, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.21179092186213244, "grad_norm": 0.10676047950983047, "learning_rate": 0.0002790789953013176, "loss": 0.40249383449554443, "memory(GiB)": 78.26, "step": 1093, "token_acc": 0.8805235195023973, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.21198469214745919, "grad_norm": 0.10362397134304047, "learning_rate": 0.0002790299986493384, "loss": 0.3985450267791748, "memory(GiB)": 78.26, "step": 1094, "token_acc": 0.8856300352798694, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.21217846243278593, "grad_norm": 0.12481772899627686, "learning_rate": 0.0002789809490012715, "loss": 0.48419952392578125, "memory(GiB)": 78.26, "step": 1095, "token_acc": 0.8615846373517498, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.21237223271811267, "grad_norm": 0.09449519217014313, "learning_rate": 0.00027893184637726304, "loss": 0.3890914022922516, "memory(GiB)": 78.26, "step": 1096, "token_acc": 0.8871999258504032, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.21256600300343942, "grad_norm": 0.1078861802816391, "learning_rate": 0.00027888269079748073, "loss": 0.4059605002403259, "memory(GiB)": 78.26, "step": 1097, "token_acc": 0.8837670990726387, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.21275977328876616, "grad_norm": 0.1090501993894577, "learning_rate": 0.0002788334822821141, "loss": 0.4224609136581421, "memory(GiB)": 78.26, "step": 1098, "token_acc": 0.8791557955103201, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.2129535435740929, "grad_norm": 0.10729729384183884, "learning_rate": 0.00027878422085137437, "loss": 0.4196450710296631, "memory(GiB)": 78.26, "step": 1099, "token_acc": 0.8797552019583843, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.21314731385941965, "grad_norm": 0.11662878841161728, "learning_rate": 0.00027873490652549464, "loss": 0.44980597496032715, "memory(GiB)": 78.26, "step": 1100, "token_acc": 0.87146529562982, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.2133410841447464, "grad_norm": 0.11625700443983078, "learning_rate": 0.00027868553932472955, "loss": 0.4741382300853729, "memory(GiB)": 78.26, "step": 1101, "token_acc": 0.8651265929377421, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.21353485443007314, "grad_norm": 0.10019952803850174, "learning_rate": 0.0002786361192693555, "loss": 0.38797110319137573, "memory(GiB)": 78.26, "step": 1102, "token_acc": 0.8859078392670029, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.21372862471539988, "grad_norm": 0.10606972128152847, "learning_rate": 0.0002785866463796707, "loss": 0.41653597354888916, "memory(GiB)": 78.26, "step": 1103, "token_acc": 0.8805738034589087, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.21392239500072663, "grad_norm": 0.11103309690952301, "learning_rate": 0.000278537120675995, "loss": 0.4037442207336426, "memory(GiB)": 78.26, "step": 1104, "token_acc": 0.8811096803090532, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.21411616528605337, "grad_norm": 0.10646630078554153, "learning_rate": 0.0002784875421786699, "loss": 0.38875192403793335, "memory(GiB)": 78.26, "step": 1105, "token_acc": 0.8858713855085408, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.21430993557138012, "grad_norm": 0.10254091769456863, "learning_rate": 0.0002784379109080586, "loss": 0.3985883891582489, "memory(GiB)": 78.26, "step": 1106, "token_acc": 0.8814364477700108, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.21450370585670686, "grad_norm": 0.11805382370948792, "learning_rate": 0.00027838822688454605, "loss": 0.44648706912994385, "memory(GiB)": 78.26, "step": 1107, "token_acc": 0.8738782929855441, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.2146974761420336, "grad_norm": 0.10933182388544083, "learning_rate": 0.0002783384901285388, "loss": 0.43097054958343506, "memory(GiB)": 78.26, "step": 1108, "token_acc": 0.8753957234101638, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.21489124642736038, "grad_norm": 0.11015936732292175, "learning_rate": 0.00027828870066046505, "loss": 0.4059434235095978, "memory(GiB)": 78.26, "step": 1109, "token_acc": 0.8822940702232354, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.21508501671268712, "grad_norm": 0.09735988825559616, "learning_rate": 0.00027823885850077474, "loss": 0.36265531182289124, "memory(GiB)": 78.26, "step": 1110, "token_acc": 0.8935244370144557, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.21527878699801387, "grad_norm": 0.10875032097101212, "learning_rate": 0.00027818896366993927, "loss": 0.41812700033187866, "memory(GiB)": 78.26, "step": 1111, "token_acc": 0.8791208791208791, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.2154725572833406, "grad_norm": 0.11396840214729309, "learning_rate": 0.0002781390161884519, "loss": 0.41851508617401123, "memory(GiB)": 78.26, "step": 1112, "token_acc": 0.8805713529956355, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.21566632756866735, "grad_norm": 0.09718259423971176, "learning_rate": 0.00027808901607682734, "loss": 0.3617076277732849, "memory(GiB)": 78.26, "step": 1113, "token_acc": 0.8916775495666627, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.2158600978539941, "grad_norm": 0.11616761237382889, "learning_rate": 0.0002780389633556019, "loss": 0.40616050362586975, "memory(GiB)": 78.26, "step": 1114, "token_acc": 0.883260254376963, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.21605386813932084, "grad_norm": 0.09937416017055511, "learning_rate": 0.0002779888580453338, "loss": 0.3472048044204712, "memory(GiB)": 78.26, "step": 1115, "token_acc": 0.9003076604876031, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.2162476384246476, "grad_norm": 0.10610184073448181, "learning_rate": 0.00027793870016660247, "loss": 0.43418559432029724, "memory(GiB)": 78.26, "step": 1116, "token_acc": 0.8744638034210666, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.21644140870997433, "grad_norm": 0.10146701335906982, "learning_rate": 0.0002778884897400091, "loss": 0.40924298763275146, "memory(GiB)": 78.26, "step": 1117, "token_acc": 0.8807274502624192, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.21663517899530108, "grad_norm": 0.11153874546289444, "learning_rate": 0.0002778382267861765, "loss": 0.40396177768707275, "memory(GiB)": 78.26, "step": 1118, "token_acc": 0.8842105263157894, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.21682894928062782, "grad_norm": 0.1122811809182167, "learning_rate": 0.000277787911325749, "loss": 0.40342846512794495, "memory(GiB)": 78.26, "step": 1119, "token_acc": 0.882786297835746, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.21702271956595456, "grad_norm": 0.1134880930185318, "learning_rate": 0.0002777375433793926, "loss": 0.4314640164375305, "memory(GiB)": 78.26, "step": 1120, "token_acc": 0.8745123894291522, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.2172164898512813, "grad_norm": 0.12474801391363144, "learning_rate": 0.0002776871229677946, "loss": 0.44416388869285583, "memory(GiB)": 78.26, "step": 1121, "token_acc": 0.8747154810374123, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.21741026013660805, "grad_norm": 0.1143270805478096, "learning_rate": 0.0002776366501116642, "loss": 0.43782278895378113, "memory(GiB)": 78.26, "step": 1122, "token_acc": 0.8753460954479145, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.2176040304219348, "grad_norm": 0.10256467759609222, "learning_rate": 0.00027758612483173183, "loss": 0.3676009774208069, "memory(GiB)": 78.26, "step": 1123, "token_acc": 0.8936442031299904, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.21779780070726154, "grad_norm": 0.11031196266412735, "learning_rate": 0.00027753554714874957, "loss": 0.40533462166786194, "memory(GiB)": 78.26, "step": 1124, "token_acc": 0.8813823163138231, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.21799157099258828, "grad_norm": 0.09570778906345367, "learning_rate": 0.00027748491708349117, "loss": 0.3897078037261963, "memory(GiB)": 78.26, "step": 1125, "token_acc": 0.8868044826994286, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.21818534127791503, "grad_norm": 0.10430624336004257, "learning_rate": 0.00027743423465675167, "loss": 0.3872542083263397, "memory(GiB)": 78.26, "step": 1126, "token_acc": 0.8860606646058733, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.21837911156324177, "grad_norm": 0.10934372991323471, "learning_rate": 0.0002773834998893476, "loss": 0.3712359666824341, "memory(GiB)": 78.26, "step": 1127, "token_acc": 0.8935813061202739, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.21857288184856852, "grad_norm": 0.11239821463823318, "learning_rate": 0.0002773327128021173, "loss": 0.4694061875343323, "memory(GiB)": 78.26, "step": 1128, "token_acc": 0.867498464802099, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.21876665213389526, "grad_norm": 0.12323271483182907, "learning_rate": 0.00027728187341592025, "loss": 0.4466722011566162, "memory(GiB)": 78.26, "step": 1129, "token_acc": 0.8716039707419018, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.218960422419222, "grad_norm": 0.1258774995803833, "learning_rate": 0.0002772309817516376, "loss": 0.4656696021556854, "memory(GiB)": 78.26, "step": 1130, "token_acc": 0.8651681629371566, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.21915419270454875, "grad_norm": 0.08737502247095108, "learning_rate": 0.0002771800378301719, "loss": 0.3400501012802124, "memory(GiB)": 78.26, "step": 1131, "token_acc": 0.9012786337143631, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.2193479629898755, "grad_norm": 0.10477188974618912, "learning_rate": 0.0002771290416724472, "loss": 0.4307701587677002, "memory(GiB)": 78.26, "step": 1132, "token_acc": 0.8758152240789506, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.21954173327520224, "grad_norm": 0.10766912251710892, "learning_rate": 0.000277077993299409, "loss": 0.41765448451042175, "memory(GiB)": 78.26, "step": 1133, "token_acc": 0.8770553935860058, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.21973550356052898, "grad_norm": 0.1124100387096405, "learning_rate": 0.00027702689273202425, "loss": 0.43786174058914185, "memory(GiB)": 78.26, "step": 1134, "token_acc": 0.8744572796884457, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.21992927384585573, "grad_norm": 0.0979883000254631, "learning_rate": 0.00027697573999128136, "loss": 0.3537387549877167, "memory(GiB)": 78.26, "step": 1135, "token_acc": 0.8937490836225014, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.22012304413118247, "grad_norm": 0.10655289143323898, "learning_rate": 0.00027692453509819, "loss": 0.41893625259399414, "memory(GiB)": 78.26, "step": 1136, "token_acc": 0.8789784973184301, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.22031681441650922, "grad_norm": 0.10611773282289505, "learning_rate": 0.0002768732780737815, "loss": 0.39203619956970215, "memory(GiB)": 78.26, "step": 1137, "token_acc": 0.8859876620924084, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.22051058470183596, "grad_norm": 0.10541475564241409, "learning_rate": 0.0002768219689391085, "loss": 0.3815195560455322, "memory(GiB)": 78.26, "step": 1138, "token_acc": 0.8915424198443066, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.22070435498716273, "grad_norm": 0.11761334538459778, "learning_rate": 0.000276770607715245, "loss": 0.4491420090198517, "memory(GiB)": 78.26, "step": 1139, "token_acc": 0.8722042094850799, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.22089812527248948, "grad_norm": 0.11273287236690521, "learning_rate": 0.0002767191944232865, "loss": 0.441051721572876, "memory(GiB)": 78.26, "step": 1140, "token_acc": 0.8757705209438345, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.22109189555781622, "grad_norm": 0.10666593909263611, "learning_rate": 0.00027666772908434967, "loss": 0.41061753034591675, "memory(GiB)": 78.26, "step": 1141, "token_acc": 0.8814865515755317, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.22128566584314296, "grad_norm": 0.0970299169421196, "learning_rate": 0.0002766162117195729, "loss": 0.37377890944480896, "memory(GiB)": 78.26, "step": 1142, "token_acc": 0.8917459199248562, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.2214794361284697, "grad_norm": 0.11498536169528961, "learning_rate": 0.0002765646423501156, "loss": 0.4479539394378662, "memory(GiB)": 78.26, "step": 1143, "token_acc": 0.873067222283145, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.22167320641379645, "grad_norm": 0.1132684275507927, "learning_rate": 0.00027651302099715886, "loss": 0.4535306692123413, "memory(GiB)": 78.26, "step": 1144, "token_acc": 0.873893744313639, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.2218669766991232, "grad_norm": 0.11660971492528915, "learning_rate": 0.0002764613476819048, "loss": 0.42548397183418274, "memory(GiB)": 78.26, "step": 1145, "token_acc": 0.8776913060594131, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.22206074698444994, "grad_norm": 0.10114238411188126, "learning_rate": 0.0002764096224255771, "loss": 0.4015495181083679, "memory(GiB)": 78.26, "step": 1146, "token_acc": 0.8825136612021858, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.22225451726977669, "grad_norm": 0.10448514670133591, "learning_rate": 0.0002763578452494207, "loss": 0.3914845883846283, "memory(GiB)": 78.26, "step": 1147, "token_acc": 0.8844678055190539, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.22244828755510343, "grad_norm": 0.11461488157510757, "learning_rate": 0.0002763060161747019, "loss": 0.4321270287036896, "memory(GiB)": 78.26, "step": 1148, "token_acc": 0.8759273103686238, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.22264205784043017, "grad_norm": 0.11510949581861496, "learning_rate": 0.00027625413522270833, "loss": 0.42202824354171753, "memory(GiB)": 78.26, "step": 1149, "token_acc": 0.8777195685670262, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.22283582812575692, "grad_norm": 0.11425234377384186, "learning_rate": 0.0002762022024147488, "loss": 0.45739829540252686, "memory(GiB)": 78.26, "step": 1150, "token_acc": 0.8678213309024613, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.22302959841108366, "grad_norm": 0.11414557695388794, "learning_rate": 0.0002761502177721535, "loss": 0.4292232096195221, "memory(GiB)": 78.26, "step": 1151, "token_acc": 0.8774583963691377, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.2232233686964104, "grad_norm": 0.10301478952169418, "learning_rate": 0.00027609818131627407, "loss": 0.3989236056804657, "memory(GiB)": 78.26, "step": 1152, "token_acc": 0.8839550828699455, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.22341713898173715, "grad_norm": 0.10694558918476105, "learning_rate": 0.0002760460930684831, "loss": 0.36604946851730347, "memory(GiB)": 78.26, "step": 1153, "token_acc": 0.8924390538257302, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.2236109092670639, "grad_norm": 0.13084904849529266, "learning_rate": 0.0002759939530501748, "loss": 0.42997825145721436, "memory(GiB)": 78.26, "step": 1154, "token_acc": 0.8759590419790543, "train_speed(iter/s)": 0.032498 }, { "epoch": 0.22380467955239064, "grad_norm": 0.10484279692173004, "learning_rate": 0.00027594176128276435, "loss": 0.42240795493125916, "memory(GiB)": 78.26, "step": 1155, "token_acc": 0.8780322748568454, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.22399844983771738, "grad_norm": 0.10142414271831512, "learning_rate": 0.00027588951778768835, "loss": 0.38500741124153137, "memory(GiB)": 78.26, "step": 1156, "token_acc": 0.8893578852952039, "train_speed(iter/s)": 0.032503 }, { "epoch": 0.22419222012304413, "grad_norm": 0.11852286010980606, "learning_rate": 0.0002758372225864046, "loss": 0.41841238737106323, "memory(GiB)": 78.26, "step": 1157, "token_acc": 0.8788539120704378, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.22438599040837087, "grad_norm": 0.10002440959215164, "learning_rate": 0.0002757848757003922, "loss": 0.35776591300964355, "memory(GiB)": 78.26, "step": 1158, "token_acc": 0.8962676092945886, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.22457976069369762, "grad_norm": 0.10413894802331924, "learning_rate": 0.0002757324771511514, "loss": 0.38659295439720154, "memory(GiB)": 78.26, "step": 1159, "token_acc": 0.887719821903838, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.22477353097902436, "grad_norm": 0.09892678260803223, "learning_rate": 0.0002756800269602036, "loss": 0.37748774886131287, "memory(GiB)": 78.26, "step": 1160, "token_acc": 0.8910947249007374, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.2249673012643511, "grad_norm": 0.12362544238567352, "learning_rate": 0.0002756275251490916, "loss": 0.43426865339279175, "memory(GiB)": 78.26, "step": 1161, "token_acc": 0.8750506441941496, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.22516107154967785, "grad_norm": 0.10890624672174454, "learning_rate": 0.00027557497173937923, "loss": 0.3891877830028534, "memory(GiB)": 78.26, "step": 1162, "token_acc": 0.8879472436438945, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.2253548418350046, "grad_norm": 0.11398806422948837, "learning_rate": 0.00027552236675265174, "loss": 0.40979334712028503, "memory(GiB)": 78.26, "step": 1163, "token_acc": 0.881280651429629, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.22554861212033134, "grad_norm": 0.10806816816329956, "learning_rate": 0.00027546971021051526, "loss": 0.39391979575157166, "memory(GiB)": 78.26, "step": 1164, "token_acc": 0.8866101829123446, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.22574238240565808, "grad_norm": 0.10718252509832382, "learning_rate": 0.00027541700213459726, "loss": 0.44121599197387695, "memory(GiB)": 78.26, "step": 1165, "token_acc": 0.8726908749968325, "train_speed(iter/s)": 0.032522 }, { "epoch": 0.22593615269098483, "grad_norm": 0.1094525083899498, "learning_rate": 0.00027536424254654643, "loss": 0.4089512526988983, "memory(GiB)": 78.26, "step": 1166, "token_acc": 0.8825124501342789, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.22612992297631157, "grad_norm": 0.12114433944225311, "learning_rate": 0.00027531143146803256, "loss": 0.4136141538619995, "memory(GiB)": 78.26, "step": 1167, "token_acc": 0.8802897980960939, "train_speed(iter/s)": 0.032527 }, { "epoch": 0.22632369326163831, "grad_norm": 0.10031379014253616, "learning_rate": 0.00027525856892074646, "loss": 0.37666836380958557, "memory(GiB)": 78.26, "step": 1168, "token_acc": 0.8915674529813404, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.2265174635469651, "grad_norm": 0.11311759054660797, "learning_rate": 0.0002752056549264003, "loss": 0.40700918436050415, "memory(GiB)": 78.26, "step": 1169, "token_acc": 0.8810722623836179, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.22671123383229183, "grad_norm": 0.11526045948266983, "learning_rate": 0.0002751526895067273, "loss": 0.4090433120727539, "memory(GiB)": 78.26, "step": 1170, "token_acc": 0.8799791792230562, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.22690500411761858, "grad_norm": 0.1078508123755455, "learning_rate": 0.0002750996726834817, "loss": 0.3891124427318573, "memory(GiB)": 78.26, "step": 1171, "token_acc": 0.887758734679347, "train_speed(iter/s)": 0.032536 }, { "epoch": 0.22709877440294532, "grad_norm": 0.12240707129240036, "learning_rate": 0.0002750466044784389, "loss": 0.4698965549468994, "memory(GiB)": 78.26, "step": 1172, "token_acc": 0.8641207547169811, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.22729254468827206, "grad_norm": 0.11151791363954544, "learning_rate": 0.00027499348491339564, "loss": 0.4254417419433594, "memory(GiB)": 78.26, "step": 1173, "token_acc": 0.8755936161930712, "train_speed(iter/s)": 0.03254 }, { "epoch": 0.2274863149735988, "grad_norm": 0.1045253574848175, "learning_rate": 0.0002749403140101693, "loss": 0.406377375125885, "memory(GiB)": 78.26, "step": 1174, "token_acc": 0.8817293918478969, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.22768008525892555, "grad_norm": 0.11521026492118835, "learning_rate": 0.00027488709179059886, "loss": 0.4493963122367859, "memory(GiB)": 78.26, "step": 1175, "token_acc": 0.8708561714101475, "train_speed(iter/s)": 0.032545 }, { "epoch": 0.2278738555442523, "grad_norm": 0.1163213700056076, "learning_rate": 0.00027483381827654384, "loss": 0.42551738023757935, "memory(GiB)": 78.26, "step": 1176, "token_acc": 0.8768161718256475, "train_speed(iter/s)": 0.032547 }, { "epoch": 0.22806762582957904, "grad_norm": 0.11902674287557602, "learning_rate": 0.0002747804934898853, "loss": 0.4208133816719055, "memory(GiB)": 78.26, "step": 1177, "token_acc": 0.8792419003713664, "train_speed(iter/s)": 0.032549 }, { "epoch": 0.22826139611490578, "grad_norm": 0.11104600876569748, "learning_rate": 0.00027472711745252514, "loss": 0.4281710982322693, "memory(GiB)": 78.26, "step": 1178, "token_acc": 0.8774945561675419, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.22845516640023253, "grad_norm": 0.12366992980241776, "learning_rate": 0.00027467369018638625, "loss": 0.4080888032913208, "memory(GiB)": 78.26, "step": 1179, "token_acc": 0.8825195699186744, "train_speed(iter/s)": 0.032554 }, { "epoch": 0.22864893668555927, "grad_norm": 0.1136869415640831, "learning_rate": 0.00027462021171341264, "loss": 0.401567280292511, "memory(GiB)": 78.26, "step": 1180, "token_acc": 0.8848314606741573, "train_speed(iter/s)": 0.032556 }, { "epoch": 0.22884270697088602, "grad_norm": 0.11184633523225784, "learning_rate": 0.0002745666820555695, "loss": 0.43062543869018555, "memory(GiB)": 78.26, "step": 1181, "token_acc": 0.8777496003087271, "train_speed(iter/s)": 0.032558 }, { "epoch": 0.22903647725621276, "grad_norm": 0.1049606204032898, "learning_rate": 0.00027451310123484277, "loss": 0.41081663966178894, "memory(GiB)": 78.26, "step": 1182, "token_acc": 0.8797746098517307, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.2292302475415395, "grad_norm": 0.10288522392511368, "learning_rate": 0.0002744594692732395, "loss": 0.39744895696640015, "memory(GiB)": 78.26, "step": 1183, "token_acc": 0.8816475626653231, "train_speed(iter/s)": 0.032562 }, { "epoch": 0.22942401782686625, "grad_norm": 0.1137736588716507, "learning_rate": 0.00027440578619278793, "loss": 0.4427635967731476, "memory(GiB)": 78.26, "step": 1184, "token_acc": 0.8732105788154099, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.229617788112193, "grad_norm": 0.1281413733959198, "learning_rate": 0.000274352052015537, "loss": 0.4403104782104492, "memory(GiB)": 78.26, "step": 1185, "token_acc": 0.873717606594319, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.22981155839751974, "grad_norm": 0.1134016141295433, "learning_rate": 0.00027429826676355685, "loss": 0.361285537481308, "memory(GiB)": 78.26, "step": 1186, "token_acc": 0.8938451254627725, "train_speed(iter/s)": 0.032569 }, { "epoch": 0.23000532868284648, "grad_norm": 0.1069924458861351, "learning_rate": 0.00027424443045893855, "loss": 0.42521095275878906, "memory(GiB)": 78.26, "step": 1187, "token_acc": 0.8769493235937237, "train_speed(iter/s)": 0.032571 }, { "epoch": 0.23019909896817323, "grad_norm": 0.11626293510198593, "learning_rate": 0.0002741905431237941, "loss": 0.44659221172332764, "memory(GiB)": 78.26, "step": 1188, "token_acc": 0.8720954699852707, "train_speed(iter/s)": 0.032573 }, { "epoch": 0.23039286925349997, "grad_norm": 0.10542602837085724, "learning_rate": 0.0002741366047802564, "loss": 0.4164351522922516, "memory(GiB)": 78.26, "step": 1189, "token_acc": 0.8788257817485642, "train_speed(iter/s)": 0.032575 }, { "epoch": 0.23058663953882672, "grad_norm": 0.12464678287506104, "learning_rate": 0.00027408261545047946, "loss": 0.47106125950813293, "memory(GiB)": 78.26, "step": 1190, "token_acc": 0.85997171145686, "train_speed(iter/s)": 0.032577 }, { "epoch": 0.23078040982415346, "grad_norm": 0.10709256678819656, "learning_rate": 0.00027402857515663814, "loss": 0.4175183176994324, "memory(GiB)": 78.26, "step": 1191, "token_acc": 0.8793799278364293, "train_speed(iter/s)": 0.032579 }, { "epoch": 0.2309741801094802, "grad_norm": 0.10052433609962463, "learning_rate": 0.0002739744839209282, "loss": 0.38393762707710266, "memory(GiB)": 78.26, "step": 1192, "token_acc": 0.8859593910996107, "train_speed(iter/s)": 0.032581 }, { "epoch": 0.23116795039480695, "grad_norm": 0.11317754536867142, "learning_rate": 0.0002739203417655664, "loss": 0.4203610122203827, "memory(GiB)": 78.26, "step": 1193, "token_acc": 0.878258625139809, "train_speed(iter/s)": 0.032583 }, { "epoch": 0.2313617206801337, "grad_norm": 0.11243908107280731, "learning_rate": 0.0002738661487127904, "loss": 0.3912915885448456, "memory(GiB)": 78.26, "step": 1194, "token_acc": 0.8882206116741347, "train_speed(iter/s)": 0.032586 }, { "epoch": 0.23155549096546044, "grad_norm": 0.12153773754835129, "learning_rate": 0.00027381190478485863, "loss": 0.41862982511520386, "memory(GiB)": 78.26, "step": 1195, "token_acc": 0.8794540140538524, "train_speed(iter/s)": 0.032588 }, { "epoch": 0.23174926125078718, "grad_norm": 0.10597037523984909, "learning_rate": 0.0002737576100040507, "loss": 0.38848644495010376, "memory(GiB)": 78.26, "step": 1196, "token_acc": 0.8870572073398096, "train_speed(iter/s)": 0.03259 }, { "epoch": 0.23194303153611392, "grad_norm": 0.10839282721281052, "learning_rate": 0.0002737032643926668, "loss": 0.3961186110973358, "memory(GiB)": 78.26, "step": 1197, "token_acc": 0.8850163836759011, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.23213680182144067, "grad_norm": 0.11460833996534348, "learning_rate": 0.0002736488679730282, "loss": 0.4457128942012787, "memory(GiB)": 78.26, "step": 1198, "token_acc": 0.8719119445576845, "train_speed(iter/s)": 0.032594 }, { "epoch": 0.2323305721067674, "grad_norm": 0.1022963598370552, "learning_rate": 0.0002735944207674769, "loss": 0.39515194296836853, "memory(GiB)": 78.26, "step": 1199, "token_acc": 0.8834028679578924, "train_speed(iter/s)": 0.032596 }, { "epoch": 0.23252434239209419, "grad_norm": 0.11231876909732819, "learning_rate": 0.0002735399227983759, "loss": 0.4120155870914459, "memory(GiB)": 78.26, "step": 1200, "token_acc": 0.8804409194784245, "train_speed(iter/s)": 0.032598 }, { "epoch": 0.23271811267742093, "grad_norm": 0.10544200241565704, "learning_rate": 0.00027348537408810903, "loss": 0.4136923551559448, "memory(GiB)": 78.26, "step": 1201, "token_acc": 0.8811871376901352, "train_speed(iter/s)": 0.03259 }, { "epoch": 0.23291188296274767, "grad_norm": 0.11337179690599442, "learning_rate": 0.00027343077465908077, "loss": 0.4338820278644562, "memory(GiB)": 78.26, "step": 1202, "token_acc": 0.8770748279047591, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.23310565324807442, "grad_norm": 0.11407187581062317, "learning_rate": 0.00027337612453371665, "loss": 0.4095255136489868, "memory(GiB)": 78.26, "step": 1203, "token_acc": 0.8804728546409807, "train_speed(iter/s)": 0.032594 }, { "epoch": 0.23329942353340116, "grad_norm": 0.11431252956390381, "learning_rate": 0.00027332142373446297, "loss": 0.38796931505203247, "memory(GiB)": 78.26, "step": 1204, "token_acc": 0.8878559697561278, "train_speed(iter/s)": 0.032597 }, { "epoch": 0.2334931938187279, "grad_norm": 0.12404537945985794, "learning_rate": 0.00027326667228378673, "loss": 0.45794570446014404, "memory(GiB)": 78.26, "step": 1205, "token_acc": 0.8690552162153949, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.23368696410405465, "grad_norm": 0.10816068947315216, "learning_rate": 0.0002732118702041759, "loss": 0.42319175601005554, "memory(GiB)": 78.26, "step": 1206, "token_acc": 0.8777571014175469, "train_speed(iter/s)": 0.0326 }, { "epoch": 0.2338807343893814, "grad_norm": 0.12030370533466339, "learning_rate": 0.0002731570175181392, "loss": 0.44653579592704773, "memory(GiB)": 78.26, "step": 1207, "token_acc": 0.8731073377061239, "train_speed(iter/s)": 0.032602 }, { "epoch": 0.23407450467470814, "grad_norm": 0.11633247882127762, "learning_rate": 0.000273102114248206, "loss": 0.44103875756263733, "memory(GiB)": 78.26, "step": 1208, "token_acc": 0.8732270799824535, "train_speed(iter/s)": 0.032605 }, { "epoch": 0.23426827496003488, "grad_norm": 0.121209517121315, "learning_rate": 0.0002730471604169266, "loss": 0.44791626930236816, "memory(GiB)": 78.26, "step": 1209, "token_acc": 0.8704967327820505, "train_speed(iter/s)": 0.032607 }, { "epoch": 0.23446204524536163, "grad_norm": 0.10651442408561707, "learning_rate": 0.00027299215604687204, "loss": 0.3869745433330536, "memory(GiB)": 78.26, "step": 1210, "token_acc": 0.8861974534880438, "train_speed(iter/s)": 0.032609 }, { "epoch": 0.23465581553068837, "grad_norm": 0.11880136281251907, "learning_rate": 0.000272937101160634, "loss": 0.42030277848243713, "memory(GiB)": 78.26, "step": 1211, "token_acc": 0.8782967352155222, "train_speed(iter/s)": 0.032611 }, { "epoch": 0.23484958581601512, "grad_norm": 0.1096733883023262, "learning_rate": 0.0002728819957808252, "loss": 0.40627193450927734, "memory(GiB)": 78.26, "step": 1212, "token_acc": 0.8838417514030116, "train_speed(iter/s)": 0.032613 }, { "epoch": 0.23504335610134186, "grad_norm": 0.10282113403081894, "learning_rate": 0.0002728268399300786, "loss": 0.3663085699081421, "memory(GiB)": 78.26, "step": 1213, "token_acc": 0.8932632961260669, "train_speed(iter/s)": 0.032616 }, { "epoch": 0.2352371263866686, "grad_norm": 0.11431027203798294, "learning_rate": 0.00027277163363104845, "loss": 0.42960235476493835, "memory(GiB)": 78.26, "step": 1214, "token_acc": 0.8752838699642246, "train_speed(iter/s)": 0.032618 }, { "epoch": 0.23543089667199535, "grad_norm": 0.11704136431217194, "learning_rate": 0.0002727163769064094, "loss": 0.42778557538986206, "memory(GiB)": 78.26, "step": 1215, "token_acc": 0.8777720177409135, "train_speed(iter/s)": 0.03262 }, { "epoch": 0.2356246669573221, "grad_norm": 0.10846978425979614, "learning_rate": 0.00027266106977885674, "loss": 0.40238186717033386, "memory(GiB)": 78.26, "step": 1216, "token_acc": 0.88480611332412, "train_speed(iter/s)": 0.032622 }, { "epoch": 0.23581843724264884, "grad_norm": 0.1148129478096962, "learning_rate": 0.0002726057122711067, "loss": 0.4382418096065521, "memory(GiB)": 78.26, "step": 1217, "token_acc": 0.8733262608595923, "train_speed(iter/s)": 0.032624 }, { "epoch": 0.23601220752797558, "grad_norm": 0.11113447695970535, "learning_rate": 0.00027255030440589614, "loss": 0.41040360927581787, "memory(GiB)": 78.26, "step": 1218, "token_acc": 0.8821096468708997, "train_speed(iter/s)": 0.032626 }, { "epoch": 0.23620597781330233, "grad_norm": 0.10677137225866318, "learning_rate": 0.0002724948462059825, "loss": 0.4053942859172821, "memory(GiB)": 78.26, "step": 1219, "token_acc": 0.8811227442521177, "train_speed(iter/s)": 0.032628 }, { "epoch": 0.23639974809862907, "grad_norm": 0.11822210252285004, "learning_rate": 0.00027243933769414394, "loss": 0.4305747449398041, "memory(GiB)": 78.26, "step": 1220, "token_acc": 0.8739931621950513, "train_speed(iter/s)": 0.03263 }, { "epoch": 0.23659351838395581, "grad_norm": 0.13333040475845337, "learning_rate": 0.00027238377889317935, "loss": 0.42261943221092224, "memory(GiB)": 78.26, "step": 1221, "token_acc": 0.8800861924217832, "train_speed(iter/s)": 0.032632 }, { "epoch": 0.23678728866928256, "grad_norm": 0.10652173310518265, "learning_rate": 0.0002723281698259081, "loss": 0.40445369482040405, "memory(GiB)": 78.26, "step": 1222, "token_acc": 0.8841292322492718, "train_speed(iter/s)": 0.032634 }, { "epoch": 0.2369810589546093, "grad_norm": 0.10241284221410751, "learning_rate": 0.0002722725105151705, "loss": 0.3897974491119385, "memory(GiB)": 78.26, "step": 1223, "token_acc": 0.887528428624319, "train_speed(iter/s)": 0.032636 }, { "epoch": 0.23717482923993605, "grad_norm": 0.11149775236845016, "learning_rate": 0.00027221680098382726, "loss": 0.4146060049533844, "memory(GiB)": 78.26, "step": 1224, "token_acc": 0.8800704465401659, "train_speed(iter/s)": 0.032638 }, { "epoch": 0.2373685995252628, "grad_norm": 0.11865018308162689, "learning_rate": 0.00027216104125475974, "loss": 0.390518456697464, "memory(GiB)": 78.26, "step": 1225, "token_acc": 0.8875045692701352, "train_speed(iter/s)": 0.032641 }, { "epoch": 0.23756236981058954, "grad_norm": 0.10814682394266129, "learning_rate": 0.00027210523135086996, "loss": 0.39619240164756775, "memory(GiB)": 78.26, "step": 1226, "token_acc": 0.8837669838825017, "train_speed(iter/s)": 0.032643 }, { "epoch": 0.23775614009591628, "grad_norm": 0.10125034302473068, "learning_rate": 0.0002720493712950805, "loss": 0.3593364953994751, "memory(GiB)": 78.26, "step": 1227, "token_acc": 0.8945650484430858, "train_speed(iter/s)": 0.032644 }, { "epoch": 0.23794991038124302, "grad_norm": 0.11461735516786575, "learning_rate": 0.0002719934611103348, "loss": 0.41169360280036926, "memory(GiB)": 78.26, "step": 1228, "token_acc": 0.8817771660166704, "train_speed(iter/s)": 0.032647 }, { "epoch": 0.23814368066656977, "grad_norm": 0.1014741063117981, "learning_rate": 0.00027193750081959644, "loss": 0.39811670780181885, "memory(GiB)": 78.26, "step": 1229, "token_acc": 0.883717022349185, "train_speed(iter/s)": 0.032649 }, { "epoch": 0.23833745095189654, "grad_norm": 0.10858450084924698, "learning_rate": 0.00027188149044584997, "loss": 0.42627233266830444, "memory(GiB)": 78.26, "step": 1230, "token_acc": 0.8777344179399802, "train_speed(iter/s)": 0.032651 }, { "epoch": 0.23853122123722328, "grad_norm": 0.10094203799962997, "learning_rate": 0.0002718254300121002, "loss": 0.3662244379520416, "memory(GiB)": 78.26, "step": 1231, "token_acc": 0.8925039872408294, "train_speed(iter/s)": 0.032653 }, { "epoch": 0.23872499152255003, "grad_norm": 0.10092142224311829, "learning_rate": 0.0002717693195413728, "loss": 0.3618488907814026, "memory(GiB)": 78.26, "step": 1232, "token_acc": 0.8951329653788259, "train_speed(iter/s)": 0.032655 }, { "epoch": 0.23891876180787677, "grad_norm": 0.10050500929355621, "learning_rate": 0.0002717131590567138, "loss": 0.37352171540260315, "memory(GiB)": 78.26, "step": 1233, "token_acc": 0.8890915338461132, "train_speed(iter/s)": 0.032657 }, { "epoch": 0.23911253209320352, "grad_norm": 0.10266852378845215, "learning_rate": 0.0002716569485811898, "loss": 0.38168448209762573, "memory(GiB)": 78.26, "step": 1234, "token_acc": 0.8879904318660461, "train_speed(iter/s)": 0.032659 }, { "epoch": 0.23930630237853026, "grad_norm": 0.10961468517780304, "learning_rate": 0.00027160068813788797, "loss": 0.423623651266098, "memory(GiB)": 78.26, "step": 1235, "token_acc": 0.8771537798836959, "train_speed(iter/s)": 0.032661 }, { "epoch": 0.239500072663857, "grad_norm": 0.10763482749462128, "learning_rate": 0.000271544377749916, "loss": 0.3990626335144043, "memory(GiB)": 78.26, "step": 1236, "token_acc": 0.8839777513770386, "train_speed(iter/s)": 0.032663 }, { "epoch": 0.23969384294918375, "grad_norm": 0.11288584768772125, "learning_rate": 0.0002714880174404021, "loss": 0.43142029643058777, "memory(GiB)": 78.26, "step": 1237, "token_acc": 0.8738743873247464, "train_speed(iter/s)": 0.032665 }, { "epoch": 0.2398876132345105, "grad_norm": 0.1059926375746727, "learning_rate": 0.00027143160723249485, "loss": 0.39879652857780457, "memory(GiB)": 78.26, "step": 1238, "token_acc": 0.8833674819098997, "train_speed(iter/s)": 0.032667 }, { "epoch": 0.24008138351983724, "grad_norm": 0.10575394332408905, "learning_rate": 0.00027137514714936357, "loss": 0.3970308303833008, "memory(GiB)": 78.26, "step": 1239, "token_acc": 0.8835697867955933, "train_speed(iter/s)": 0.032668 }, { "epoch": 0.24027515380516398, "grad_norm": 0.11206567287445068, "learning_rate": 0.00027131863721419785, "loss": 0.4004877507686615, "memory(GiB)": 78.26, "step": 1240, "token_acc": 0.8835185939591457, "train_speed(iter/s)": 0.032671 }, { "epoch": 0.24046892409049073, "grad_norm": 0.11048437654972076, "learning_rate": 0.00027126207745020785, "loss": 0.41594791412353516, "memory(GiB)": 78.26, "step": 1241, "token_acc": 0.8785890073831009, "train_speed(iter/s)": 0.032672 }, { "epoch": 0.24066269437581747, "grad_norm": 0.11345624923706055, "learning_rate": 0.0002712054678806242, "loss": 0.4346695840358734, "memory(GiB)": 78.26, "step": 1242, "token_acc": 0.8738188213551137, "train_speed(iter/s)": 0.032674 }, { "epoch": 0.24085646466114422, "grad_norm": 0.11145740747451782, "learning_rate": 0.00027114880852869807, "loss": 0.4504337012767792, "memory(GiB)": 78.26, "step": 1243, "token_acc": 0.8700323658937762, "train_speed(iter/s)": 0.032676 }, { "epoch": 0.24105023494647096, "grad_norm": 0.11080954968929291, "learning_rate": 0.0002710920994177008, "loss": 0.43637946248054504, "memory(GiB)": 78.26, "step": 1244, "token_acc": 0.8750756533700138, "train_speed(iter/s)": 0.032678 }, { "epoch": 0.2412440052317977, "grad_norm": 0.10086048394441605, "learning_rate": 0.00027103534057092447, "loss": 0.3988358974456787, "memory(GiB)": 78.26, "step": 1245, "token_acc": 0.8830572217461677, "train_speed(iter/s)": 0.03268 }, { "epoch": 0.24143777551712445, "grad_norm": 0.11646781861782074, "learning_rate": 0.0002709785320116814, "loss": 0.44504404067993164, "memory(GiB)": 78.26, "step": 1246, "token_acc": 0.8725327939193331, "train_speed(iter/s)": 0.032682 }, { "epoch": 0.2416315458024512, "grad_norm": 0.11032546311616898, "learning_rate": 0.0002709216737633044, "loss": 0.44403916597366333, "memory(GiB)": 78.26, "step": 1247, "token_acc": 0.872105901587469, "train_speed(iter/s)": 0.032684 }, { "epoch": 0.24182531608777794, "grad_norm": 0.09166669845581055, "learning_rate": 0.0002708647658491467, "loss": 0.3495952785015106, "memory(GiB)": 78.26, "step": 1248, "token_acc": 0.8977107887579329, "train_speed(iter/s)": 0.032685 }, { "epoch": 0.24201908637310468, "grad_norm": 0.10630014538764954, "learning_rate": 0.0002708078082925819, "loss": 0.4074488878250122, "memory(GiB)": 78.26, "step": 1249, "token_acc": 0.8815855361990353, "train_speed(iter/s)": 0.032687 }, { "epoch": 0.24221285665843142, "grad_norm": 0.09900429099798203, "learning_rate": 0.000270750801117004, "loss": 0.39275825023651123, "memory(GiB)": 78.26, "step": 1250, "token_acc": 0.8847475750400977, "train_speed(iter/s)": 0.032689 }, { "epoch": 0.24240662694375817, "grad_norm": 0.10974457859992981, "learning_rate": 0.0002706937443458274, "loss": 0.4224117398262024, "memory(GiB)": 78.26, "step": 1251, "token_acc": 0.8794166810767191, "train_speed(iter/s)": 0.032691 }, { "epoch": 0.2426003972290849, "grad_norm": 0.10894999653100967, "learning_rate": 0.0002706366380024868, "loss": 0.37922725081443787, "memory(GiB)": 78.26, "step": 1252, "token_acc": 0.8908649728803882, "train_speed(iter/s)": 0.032693 }, { "epoch": 0.24279416751441166, "grad_norm": 0.12082328647375107, "learning_rate": 0.00027057948211043736, "loss": 0.43291229009628296, "memory(GiB)": 78.26, "step": 1253, "token_acc": 0.8723226076593552, "train_speed(iter/s)": 0.032695 }, { "epoch": 0.2429879377997384, "grad_norm": 0.10953840613365173, "learning_rate": 0.00027052227669315454, "loss": 0.38156622648239136, "memory(GiB)": 78.26, "step": 1254, "token_acc": 0.8904028611119321, "train_speed(iter/s)": 0.032697 }, { "epoch": 0.24318170808506515, "grad_norm": 0.11070944368839264, "learning_rate": 0.00027046502177413415, "loss": 0.4315045475959778, "memory(GiB)": 78.26, "step": 1255, "token_acc": 0.8770893778724934, "train_speed(iter/s)": 0.032699 }, { "epoch": 0.2433754783703919, "grad_norm": 0.10312207043170929, "learning_rate": 0.0002704077173768922, "loss": 0.38498249650001526, "memory(GiB)": 78.26, "step": 1256, "token_acc": 0.8877153677921801, "train_speed(iter/s)": 0.032701 }, { "epoch": 0.24356924865571863, "grad_norm": 0.11712785810232162, "learning_rate": 0.0002703503635249653, "loss": 0.4142095446586609, "memory(GiB)": 78.26, "step": 1257, "token_acc": 0.8779902972174204, "train_speed(iter/s)": 0.032703 }, { "epoch": 0.24376301894104538, "grad_norm": 0.11441401392221451, "learning_rate": 0.0002702929602419102, "loss": 0.40991705656051636, "memory(GiB)": 78.26, "step": 1258, "token_acc": 0.8792493116817441, "train_speed(iter/s)": 0.032705 }, { "epoch": 0.24395678922637212, "grad_norm": 0.10743826627731323, "learning_rate": 0.0002702355075513039, "loss": 0.3729992210865021, "memory(GiB)": 78.26, "step": 1259, "token_acc": 0.8912204989885367, "train_speed(iter/s)": 0.032707 }, { "epoch": 0.2441505595116989, "grad_norm": 0.12141770869493484, "learning_rate": 0.0002701780054767438, "loss": 0.456862211227417, "memory(GiB)": 78.26, "step": 1260, "token_acc": 0.868301950047494, "train_speed(iter/s)": 0.032708 }, { "epoch": 0.24434432979702564, "grad_norm": 0.11354810744524002, "learning_rate": 0.0002701204540418475, "loss": 0.43698757886886597, "memory(GiB)": 78.26, "step": 1261, "token_acc": 0.875366250678242, "train_speed(iter/s)": 0.03271 }, { "epoch": 0.24453810008235238, "grad_norm": 0.10827391594648361, "learning_rate": 0.000270062853270253, "loss": 0.39581573009490967, "memory(GiB)": 78.26, "step": 1262, "token_acc": 0.8848191899087549, "train_speed(iter/s)": 0.032712 }, { "epoch": 0.24473187036767913, "grad_norm": 0.10404475033283234, "learning_rate": 0.0002700052031856184, "loss": 0.3672696352005005, "memory(GiB)": 78.26, "step": 1263, "token_acc": 0.8917072443605286, "train_speed(iter/s)": 0.032714 }, { "epoch": 0.24492564065300587, "grad_norm": 0.12273856997489929, "learning_rate": 0.00026994750381162223, "loss": 0.4346576929092407, "memory(GiB)": 78.26, "step": 1264, "token_acc": 0.8755994537428144, "train_speed(iter/s)": 0.032716 }, { "epoch": 0.24511941093833262, "grad_norm": 0.11384225636720657, "learning_rate": 0.00026988975517196315, "loss": 0.4470018148422241, "memory(GiB)": 78.26, "step": 1265, "token_acc": 0.8724487077005063, "train_speed(iter/s)": 0.032718 }, { "epoch": 0.24531318122365936, "grad_norm": 0.11222903430461884, "learning_rate": 0.00026983195729036004, "loss": 0.4389076232910156, "memory(GiB)": 78.26, "step": 1266, "token_acc": 0.8733030464991983, "train_speed(iter/s)": 0.032719 }, { "epoch": 0.2455069515089861, "grad_norm": 0.09804686903953552, "learning_rate": 0.00026977411019055207, "loss": 0.34668290615081787, "memory(GiB)": 78.26, "step": 1267, "token_acc": 0.8978196899835492, "train_speed(iter/s)": 0.032721 }, { "epoch": 0.24570072179431285, "grad_norm": 0.10429113358259201, "learning_rate": 0.00026971621389629855, "loss": 0.4104643762111664, "memory(GiB)": 78.26, "step": 1268, "token_acc": 0.8796510205745658, "train_speed(iter/s)": 0.032723 }, { "epoch": 0.2458944920796396, "grad_norm": 0.10256537050008774, "learning_rate": 0.0002696582684313791, "loss": 0.37179747223854065, "memory(GiB)": 78.26, "step": 1269, "token_acc": 0.8922330825188111, "train_speed(iter/s)": 0.032724 }, { "epoch": 0.24608826236496634, "grad_norm": 0.11051318049430847, "learning_rate": 0.0002696002738195935, "loss": 0.42477959394454956, "memory(GiB)": 78.26, "step": 1270, "token_acc": 0.8768773913540061, "train_speed(iter/s)": 0.032726 }, { "epoch": 0.24628203265029308, "grad_norm": 0.10435964167118073, "learning_rate": 0.00026954223008476163, "loss": 0.3826453983783722, "memory(GiB)": 78.26, "step": 1271, "token_acc": 0.8870310249713184, "train_speed(iter/s)": 0.032728 }, { "epoch": 0.24647580293561983, "grad_norm": 0.11433306336402893, "learning_rate": 0.0002694841372507236, "loss": 0.4138341248035431, "memory(GiB)": 78.26, "step": 1272, "token_acc": 0.8797960325850382, "train_speed(iter/s)": 0.03273 }, { "epoch": 0.24666957322094657, "grad_norm": 0.11614301800727844, "learning_rate": 0.00026942599534133984, "loss": 0.44977250695228577, "memory(GiB)": 78.26, "step": 1273, "token_acc": 0.8709602418042316, "train_speed(iter/s)": 0.032732 }, { "epoch": 0.24686334350627331, "grad_norm": 0.18777108192443848, "learning_rate": 0.0002693678043804906, "loss": 0.3745490312576294, "memory(GiB)": 78.26, "step": 1274, "token_acc": 0.8912466843501327, "train_speed(iter/s)": 0.032734 }, { "epoch": 0.24705711379160006, "grad_norm": 0.10248465836048126, "learning_rate": 0.0002693095643920766, "loss": 0.37728846073150635, "memory(GiB)": 78.26, "step": 1275, "token_acc": 0.8897580936334095, "train_speed(iter/s)": 0.032736 }, { "epoch": 0.2472508840769268, "grad_norm": 0.17319722473621368, "learning_rate": 0.0002692512754000185, "loss": 0.42109817266464233, "memory(GiB)": 78.26, "step": 1276, "token_acc": 0.8798434724156143, "train_speed(iter/s)": 0.032737 }, { "epoch": 0.24744465436225355, "grad_norm": 0.1109052449464798, "learning_rate": 0.0002691929374282572, "loss": 0.3932892084121704, "memory(GiB)": 78.26, "step": 1277, "token_acc": 0.886698000389354, "train_speed(iter/s)": 0.032739 }, { "epoch": 0.2476384246475803, "grad_norm": 0.10127067565917969, "learning_rate": 0.00026913455050075374, "loss": 0.3878341615200043, "memory(GiB)": 78.26, "step": 1278, "token_acc": 0.8887858173572459, "train_speed(iter/s)": 0.032741 }, { "epoch": 0.24783219493290704, "grad_norm": 0.11656392365694046, "learning_rate": 0.00026907611464148905, "loss": 0.4242454171180725, "memory(GiB)": 78.26, "step": 1279, "token_acc": 0.8786354490579843, "train_speed(iter/s)": 0.032743 }, { "epoch": 0.24802596521823378, "grad_norm": 0.10642395168542862, "learning_rate": 0.00026901762987446436, "loss": 0.42408353090286255, "memory(GiB)": 78.26, "step": 1280, "token_acc": 0.8807520778430975, "train_speed(iter/s)": 0.032745 }, { "epoch": 0.24821973550356052, "grad_norm": 0.11042629927396774, "learning_rate": 0.000268959096223701, "loss": 0.3927229344844818, "memory(GiB)": 78.26, "step": 1281, "token_acc": 0.8880031570639305, "train_speed(iter/s)": 0.032747 }, { "epoch": 0.24841350578888727, "grad_norm": 0.09721720218658447, "learning_rate": 0.0002689005137132402, "loss": 0.3534315526485443, "memory(GiB)": 78.26, "step": 1282, "token_acc": 0.8957349486957596, "train_speed(iter/s)": 0.032748 }, { "epoch": 0.248607276074214, "grad_norm": 0.10490331053733826, "learning_rate": 0.0002688418823671435, "loss": 0.39838045835494995, "memory(GiB)": 78.26, "step": 1283, "token_acc": 0.8837405682220588, "train_speed(iter/s)": 0.03275 }, { "epoch": 0.24880104635954076, "grad_norm": 0.11232082545757294, "learning_rate": 0.0002687832022094923, "loss": 0.4016090929508209, "memory(GiB)": 78.26, "step": 1284, "token_acc": 0.8843071140346028, "train_speed(iter/s)": 0.032752 }, { "epoch": 0.2489948166448675, "grad_norm": 0.09907688200473785, "learning_rate": 0.0002687244732643881, "loss": 0.3738039433956146, "memory(GiB)": 78.26, "step": 1285, "token_acc": 0.8901385820445873, "train_speed(iter/s)": 0.032754 }, { "epoch": 0.24918858693019424, "grad_norm": 0.11846373975276947, "learning_rate": 0.0002686656955559525, "loss": 0.45066556334495544, "memory(GiB)": 78.26, "step": 1286, "token_acc": 0.8704005115386851, "train_speed(iter/s)": 0.032756 }, { "epoch": 0.249382357215521, "grad_norm": 0.10774929076433182, "learning_rate": 0.00026860686910832704, "loss": 0.40347960591316223, "memory(GiB)": 78.26, "step": 1287, "token_acc": 0.8835657036827049, "train_speed(iter/s)": 0.032758 }, { "epoch": 0.24957612750084773, "grad_norm": 0.10784945636987686, "learning_rate": 0.0002685479939456734, "loss": 0.41915833950042725, "memory(GiB)": 78.26, "step": 1288, "token_acc": 0.8762773629622079, "train_speed(iter/s)": 0.032759 }, { "epoch": 0.24976989778617448, "grad_norm": 0.10360375046730042, "learning_rate": 0.000268489070092173, "loss": 0.3886149823665619, "memory(GiB)": 78.26, "step": 1289, "token_acc": 0.8865310852948481, "train_speed(iter/s)": 0.032761 }, { "epoch": 0.24996366807150122, "grad_norm": 0.10862737894058228, "learning_rate": 0.00026843009757202777, "loss": 0.4151816666126251, "memory(GiB)": 78.26, "step": 1290, "token_acc": 0.8797664608766098, "train_speed(iter/s)": 0.032763 }, { "epoch": 0.25015743835682797, "grad_norm": 0.11150000244379044, "learning_rate": 0.00026837107640945905, "loss": 0.433391273021698, "memory(GiB)": 78.26, "step": 1291, "token_acc": 0.8773802907537953, "train_speed(iter/s)": 0.032765 }, { "epoch": 0.2503512086421547, "grad_norm": 0.10859289765357971, "learning_rate": 0.0002683120066287085, "loss": 0.41375094652175903, "memory(GiB)": 78.26, "step": 1292, "token_acc": 0.8798327262916498, "train_speed(iter/s)": 0.032767 }, { "epoch": 0.25054497892748145, "grad_norm": 0.11398376524448395, "learning_rate": 0.0002682528882540376, "loss": 0.4081695079803467, "memory(GiB)": 78.26, "step": 1293, "token_acc": 0.8846480067854113, "train_speed(iter/s)": 0.032769 }, { "epoch": 0.2507387492128082, "grad_norm": 0.10699091851711273, "learning_rate": 0.000268193721309728, "loss": 0.40067243576049805, "memory(GiB)": 78.26, "step": 1294, "token_acc": 0.8816360201176189, "train_speed(iter/s)": 0.03277 }, { "epoch": 0.25093251949813494, "grad_norm": 0.10412931442260742, "learning_rate": 0.00026813450582008103, "loss": 0.40070998668670654, "memory(GiB)": 78.26, "step": 1295, "token_acc": 0.8844565031409646, "train_speed(iter/s)": 0.032772 }, { "epoch": 0.2511262897834617, "grad_norm": 0.11724124103784561, "learning_rate": 0.00026807524180941814, "loss": 0.462046355009079, "memory(GiB)": 78.26, "step": 1296, "token_acc": 0.866198113456813, "train_speed(iter/s)": 0.032774 }, { "epoch": 0.25132006006878843, "grad_norm": 0.10965701937675476, "learning_rate": 0.0002680159293020806, "loss": 0.4059637784957886, "memory(GiB)": 78.26, "step": 1297, "token_acc": 0.8829141864372988, "train_speed(iter/s)": 0.032776 }, { "epoch": 0.2515138303541152, "grad_norm": 0.10186024010181427, "learning_rate": 0.0002679565683224297, "loss": 0.3853279948234558, "memory(GiB)": 78.26, "step": 1298, "token_acc": 0.8870169740948418, "train_speed(iter/s)": 0.032778 }, { "epoch": 0.2517076006394419, "grad_norm": 0.11461776494979858, "learning_rate": 0.00026789715889484657, "loss": 0.39657965302467346, "memory(GiB)": 78.26, "step": 1299, "token_acc": 0.8847986900967808, "train_speed(iter/s)": 0.03278 }, { "epoch": 0.25190137092476866, "grad_norm": 0.12204183638095856, "learning_rate": 0.0002678377010437323, "loss": 0.4581944942474365, "memory(GiB)": 78.26, "step": 1300, "token_acc": 0.8694807389051589, "train_speed(iter/s)": 0.032781 }, { "epoch": 0.2520951412100954, "grad_norm": 0.10922391712665558, "learning_rate": 0.00026777819479350775, "loss": 0.39435988664627075, "memory(GiB)": 78.26, "step": 1301, "token_acc": 0.8852810715217581, "train_speed(iter/s)": 0.032783 }, { "epoch": 0.25228891149542215, "grad_norm": 0.10757733881473541, "learning_rate": 0.00026771864016861377, "loss": 0.38533589243888855, "memory(GiB)": 78.26, "step": 1302, "token_acc": 0.8885692617484767, "train_speed(iter/s)": 0.032785 }, { "epoch": 0.2524826817807489, "grad_norm": 0.11346130073070526, "learning_rate": 0.0002676590371935111, "loss": 0.4119528830051422, "memory(GiB)": 78.26, "step": 1303, "token_acc": 0.8788996980878899, "train_speed(iter/s)": 0.032787 }, { "epoch": 0.25267645206607564, "grad_norm": 0.11655943840742111, "learning_rate": 0.0002675993858926802, "loss": 0.4352225661277771, "memory(GiB)": 78.26, "step": 1304, "token_acc": 0.8748795761078998, "train_speed(iter/s)": 0.032788 }, { "epoch": 0.2528702223514024, "grad_norm": 0.10136115550994873, "learning_rate": 0.00026753968629062146, "loss": 0.39623382687568665, "memory(GiB)": 78.26, "step": 1305, "token_acc": 0.882677549344216, "train_speed(iter/s)": 0.03279 }, { "epoch": 0.2530639926367292, "grad_norm": 0.10475181043148041, "learning_rate": 0.0002674799384118552, "loss": 0.3807075023651123, "memory(GiB)": 78.26, "step": 1306, "token_acc": 0.8890237979601748, "train_speed(iter/s)": 0.032792 }, { "epoch": 0.25325776292205593, "grad_norm": 0.11299892514944077, "learning_rate": 0.0002674201422809214, "loss": 0.4387453496456146, "memory(GiB)": 78.26, "step": 1307, "token_acc": 0.8713827248539375, "train_speed(iter/s)": 0.032794 }, { "epoch": 0.2534515332073827, "grad_norm": 0.11837562918663025, "learning_rate": 0.00026736029792238003, "loss": 0.46790987253189087, "memory(GiB)": 78.26, "step": 1308, "token_acc": 0.8648062202398891, "train_speed(iter/s)": 0.032796 }, { "epoch": 0.2536453034927094, "grad_norm": 0.10069328546524048, "learning_rate": 0.0002673004053608106, "loss": 0.3791685700416565, "memory(GiB)": 78.26, "step": 1309, "token_acc": 0.8905825121616565, "train_speed(iter/s)": 0.032797 }, { "epoch": 0.25383907377803616, "grad_norm": 0.12146264314651489, "learning_rate": 0.0002672404646208128, "loss": 0.4696671664714813, "memory(GiB)": 78.26, "step": 1310, "token_acc": 0.8655859144344838, "train_speed(iter/s)": 0.032799 }, { "epoch": 0.2540328440633629, "grad_norm": 0.1213066577911377, "learning_rate": 0.00026718047572700575, "loss": 0.40812620520591736, "memory(GiB)": 78.26, "step": 1311, "token_acc": 0.8800956738768719, "train_speed(iter/s)": 0.032801 }, { "epoch": 0.25422661434868965, "grad_norm": 0.1058630719780922, "learning_rate": 0.0002671204387040286, "loss": 0.4007977247238159, "memory(GiB)": 78.26, "step": 1312, "token_acc": 0.8834997096479827, "train_speed(iter/s)": 0.032802 }, { "epoch": 0.2544203846340164, "grad_norm": 0.09740731865167618, "learning_rate": 0.00026706035357654007, "loss": 0.34724316000938416, "memory(GiB)": 78.26, "step": 1313, "token_acc": 0.8995111614130565, "train_speed(iter/s)": 0.032804 }, { "epoch": 0.25461415491934314, "grad_norm": 0.12260702252388, "learning_rate": 0.00026700022036921884, "loss": 0.47982773184776306, "memory(GiB)": 78.26, "step": 1314, "token_acc": 0.8605628010809541, "train_speed(iter/s)": 0.032806 }, { "epoch": 0.2548079252046699, "grad_norm": 0.09923295676708221, "learning_rate": 0.00026694003910676315, "loss": 0.3682483732700348, "memory(GiB)": 78.26, "step": 1315, "token_acc": 0.8914272901666708, "train_speed(iter/s)": 0.032808 }, { "epoch": 0.2550016954899966, "grad_norm": 0.11524137109518051, "learning_rate": 0.0002668798098138911, "loss": 0.4038434326648712, "memory(GiB)": 78.26, "step": 1316, "token_acc": 0.8844845437065523, "train_speed(iter/s)": 0.03281 }, { "epoch": 0.25519546577532337, "grad_norm": 0.11129481345415115, "learning_rate": 0.00026681953251534053, "loss": 0.4236079454421997, "memory(GiB)": 78.26, "step": 1317, "token_acc": 0.8786745197919983, "train_speed(iter/s)": 0.032811 }, { "epoch": 0.2553892360606501, "grad_norm": 0.10136806964874268, "learning_rate": 0.00026675920723586886, "loss": 0.3650326430797577, "memory(GiB)": 78.26, "step": 1318, "token_acc": 0.8948705179282869, "train_speed(iter/s)": 0.032813 }, { "epoch": 0.25558300634597686, "grad_norm": 0.10744194686412811, "learning_rate": 0.0002666988340002533, "loss": 0.4144882559776306, "memory(GiB)": 78.26, "step": 1319, "token_acc": 0.8820147315987494, "train_speed(iter/s)": 0.032815 }, { "epoch": 0.2557767766313036, "grad_norm": 0.10830198973417282, "learning_rate": 0.00026663841283329086, "loss": 0.4078242778778076, "memory(GiB)": 78.26, "step": 1320, "token_acc": 0.8824059014869888, "train_speed(iter/s)": 0.032817 }, { "epoch": 0.25597054691663035, "grad_norm": 0.09699589014053345, "learning_rate": 0.000266577943759798, "loss": 0.34918874502182007, "memory(GiB)": 78.26, "step": 1321, "token_acc": 0.897431914673294, "train_speed(iter/s)": 0.032818 }, { "epoch": 0.2561643172019571, "grad_norm": 0.11712899804115295, "learning_rate": 0.00026651742680461115, "loss": 0.42532727122306824, "memory(GiB)": 78.26, "step": 1322, "token_acc": 0.8809470377019749, "train_speed(iter/s)": 0.03282 }, { "epoch": 0.25635808748728384, "grad_norm": 0.12428336590528488, "learning_rate": 0.0002664568619925862, "loss": 0.44392460584640503, "memory(GiB)": 78.26, "step": 1323, "token_acc": 0.8718965574699237, "train_speed(iter/s)": 0.032822 }, { "epoch": 0.2565518577726106, "grad_norm": 0.1330966353416443, "learning_rate": 0.00026639624934859853, "loss": 0.46010035276412964, "memory(GiB)": 78.26, "step": 1324, "token_acc": 0.86755315416462, "train_speed(iter/s)": 0.032824 }, { "epoch": 0.2567456280579373, "grad_norm": 0.1076708659529686, "learning_rate": 0.0002663355888975437, "loss": 0.3726995587348938, "memory(GiB)": 78.26, "step": 1325, "token_acc": 0.8889115964031298, "train_speed(iter/s)": 0.032826 }, { "epoch": 0.25693939834326407, "grad_norm": 0.12054485827684402, "learning_rate": 0.0002662748806643364, "loss": 0.45460277795791626, "memory(GiB)": 78.26, "step": 1326, "token_acc": 0.8696869355809753, "train_speed(iter/s)": 0.032828 }, { "epoch": 0.2571331686285908, "grad_norm": 0.1159156784415245, "learning_rate": 0.00026621412467391125, "loss": 0.413094162940979, "memory(GiB)": 78.26, "step": 1327, "token_acc": 0.8797178700263824, "train_speed(iter/s)": 0.03283 }, { "epoch": 0.25732693891391756, "grad_norm": 0.10506505519151688, "learning_rate": 0.00026615332095122223, "loss": 0.3767690360546112, "memory(GiB)": 78.26, "step": 1328, "token_acc": 0.8906741666898722, "train_speed(iter/s)": 0.032831 }, { "epoch": 0.2575207091992443, "grad_norm": 0.10869252681732178, "learning_rate": 0.00026609246952124323, "loss": 0.4055717885494232, "memory(GiB)": 78.26, "step": 1329, "token_acc": 0.8826902784786897, "train_speed(iter/s)": 0.032833 }, { "epoch": 0.25771447948457105, "grad_norm": 0.10867463052272797, "learning_rate": 0.00026603157040896736, "loss": 0.41014426946640015, "memory(GiB)": 78.26, "step": 1330, "token_acc": 0.8821821358824135, "train_speed(iter/s)": 0.032835 }, { "epoch": 0.2579082497698978, "grad_norm": 0.11635489761829376, "learning_rate": 0.0002659706236394077, "loss": 0.41088828444480896, "memory(GiB)": 78.26, "step": 1331, "token_acc": 0.8814593374412788, "train_speed(iter/s)": 0.032837 }, { "epoch": 0.25810202005522453, "grad_norm": 0.11617472767829895, "learning_rate": 0.00026590962923759664, "loss": 0.4540368616580963, "memory(GiB)": 78.26, "step": 1332, "token_acc": 0.8705997580226997, "train_speed(iter/s)": 0.032838 }, { "epoch": 0.2582957903405513, "grad_norm": 0.10581759363412857, "learning_rate": 0.0002658485872285863, "loss": 0.40073099732398987, "memory(GiB)": 78.26, "step": 1333, "token_acc": 0.8806860367272187, "train_speed(iter/s)": 0.03284 }, { "epoch": 0.258489560625878, "grad_norm": 0.10285675525665283, "learning_rate": 0.0002657874976374481, "loss": 0.39908578991889954, "memory(GiB)": 78.26, "step": 1334, "token_acc": 0.8866406011983345, "train_speed(iter/s)": 0.032842 }, { "epoch": 0.25868333091120477, "grad_norm": 0.1177554577589035, "learning_rate": 0.00026572636048927334, "loss": 0.4447701573371887, "memory(GiB)": 78.26, "step": 1335, "token_acc": 0.8721481689350358, "train_speed(iter/s)": 0.032844 }, { "epoch": 0.2588771011965315, "grad_norm": 0.10632134228944778, "learning_rate": 0.00026566517580917267, "loss": 0.405487984418869, "memory(GiB)": 78.26, "step": 1336, "token_acc": 0.881686990206509, "train_speed(iter/s)": 0.032845 }, { "epoch": 0.25907087148185826, "grad_norm": 0.10160496830940247, "learning_rate": 0.00026560394362227624, "loss": 0.3819845914840698, "memory(GiB)": 78.26, "step": 1337, "token_acc": 0.887969021231139, "train_speed(iter/s)": 0.032847 }, { "epoch": 0.259264641767185, "grad_norm": 0.10578127205371857, "learning_rate": 0.0002655426639537337, "loss": 0.40268826484680176, "memory(GiB)": 78.26, "step": 1338, "token_acc": 0.8844663787785317, "train_speed(iter/s)": 0.032849 }, { "epoch": 0.25945841205251174, "grad_norm": 0.11469519138336182, "learning_rate": 0.0002654813368287144, "loss": 0.4174302816390991, "memory(GiB)": 78.26, "step": 1339, "token_acc": 0.8773725181492171, "train_speed(iter/s)": 0.03285 }, { "epoch": 0.2596521823378385, "grad_norm": 0.1754118949174881, "learning_rate": 0.0002654199622724069, "loss": 0.4385494291782379, "memory(GiB)": 78.26, "step": 1340, "token_acc": 0.8754544961198242, "train_speed(iter/s)": 0.032852 }, { "epoch": 0.25984595262316523, "grad_norm": 0.12439852207899094, "learning_rate": 0.00026535854031001953, "loss": 0.46192917227745056, "memory(GiB)": 78.26, "step": 1341, "token_acc": 0.869569399467761, "train_speed(iter/s)": 0.032854 }, { "epoch": 0.260039722908492, "grad_norm": 0.10508039593696594, "learning_rate": 0.00026529707096677977, "loss": 0.3968399167060852, "memory(GiB)": 78.26, "step": 1342, "token_acc": 0.8842156942967024, "train_speed(iter/s)": 0.032856 }, { "epoch": 0.2602334931938187, "grad_norm": 0.10867384821176529, "learning_rate": 0.0002652355542679349, "loss": 0.41717660427093506, "memory(GiB)": 78.26, "step": 1343, "token_acc": 0.8766409013494039, "train_speed(iter/s)": 0.032858 }, { "epoch": 0.26042726347914547, "grad_norm": 0.1032068282365799, "learning_rate": 0.0002651739902387513, "loss": 0.4130229353904724, "memory(GiB)": 78.26, "step": 1344, "token_acc": 0.8800225915765693, "train_speed(iter/s)": 0.032859 }, { "epoch": 0.2606210337644722, "grad_norm": 0.11675221472978592, "learning_rate": 0.00026511237890451504, "loss": 0.45590946078300476, "memory(GiB)": 78.26, "step": 1345, "token_acc": 0.8688484994867459, "train_speed(iter/s)": 0.032861 }, { "epoch": 0.26081480404979895, "grad_norm": 0.10524659603834152, "learning_rate": 0.00026505072029053167, "loss": 0.39327630400657654, "memory(GiB)": 78.26, "step": 1346, "token_acc": 0.8866813620279599, "train_speed(iter/s)": 0.032863 }, { "epoch": 0.2610085743351257, "grad_norm": 0.11350347101688385, "learning_rate": 0.0002649890144221259, "loss": 0.40386056900024414, "memory(GiB)": 78.26, "step": 1347, "token_acc": 0.8825792875394559, "train_speed(iter/s)": 0.032865 }, { "epoch": 0.26120234462045244, "grad_norm": 0.10424939543008804, "learning_rate": 0.000264927261324642, "loss": 0.37450239062309265, "memory(GiB)": 78.26, "step": 1348, "token_acc": 0.8905393229501867, "train_speed(iter/s)": 0.032866 }, { "epoch": 0.2613961149057792, "grad_norm": 0.11633609235286713, "learning_rate": 0.00026486546102344374, "loss": 0.40495017170906067, "memory(GiB)": 78.26, "step": 1349, "token_acc": 0.8839027845893956, "train_speed(iter/s)": 0.032868 }, { "epoch": 0.26158988519110593, "grad_norm": 0.12242694199085236, "learning_rate": 0.000264803613543914, "loss": 0.42412567138671875, "memory(GiB)": 78.26, "step": 1350, "token_acc": 0.8782494429526367, "train_speed(iter/s)": 0.03287 }, { "epoch": 0.2617836554764327, "grad_norm": 0.10657955706119537, "learning_rate": 0.00026474171891145536, "loss": 0.39834100008010864, "memory(GiB)": 78.26, "step": 1351, "token_acc": 0.8844101811062164, "train_speed(iter/s)": 0.032872 }, { "epoch": 0.2619774257617594, "grad_norm": 0.12258057296276093, "learning_rate": 0.0002646797771514895, "loss": 0.44253265857696533, "memory(GiB)": 78.26, "step": 1352, "token_acc": 0.8755840383886855, "train_speed(iter/s)": 0.032873 }, { "epoch": 0.26217119604708616, "grad_norm": 0.10096532851457596, "learning_rate": 0.0002646177882894576, "loss": 0.34528648853302, "memory(GiB)": 78.26, "step": 1353, "token_acc": 0.8981727315320943, "train_speed(iter/s)": 0.032875 }, { "epoch": 0.2623649663324129, "grad_norm": 0.10675135254859924, "learning_rate": 0.0002645557523508202, "loss": 0.3865097761154175, "memory(GiB)": 78.26, "step": 1354, "token_acc": 0.8894203275279439, "train_speed(iter/s)": 0.032877 }, { "epoch": 0.26255873661773965, "grad_norm": 0.10857495665550232, "learning_rate": 0.00026449366936105696, "loss": 0.4091225862503052, "memory(GiB)": 78.26, "step": 1355, "token_acc": 0.8810944625407167, "train_speed(iter/s)": 0.032878 }, { "epoch": 0.2627525069030664, "grad_norm": 0.11212481558322906, "learning_rate": 0.0002644315393456672, "loss": 0.4342299699783325, "memory(GiB)": 78.26, "step": 1356, "token_acc": 0.8745918208676722, "train_speed(iter/s)": 0.03288 }, { "epoch": 0.26294627718839314, "grad_norm": 0.10644607990980148, "learning_rate": 0.00026436936233016937, "loss": 0.3997873067855835, "memory(GiB)": 78.26, "step": 1357, "token_acc": 0.8845487023221603, "train_speed(iter/s)": 0.032882 }, { "epoch": 0.2631400474737199, "grad_norm": 0.1112193912267685, "learning_rate": 0.0002643071383401012, "loss": 0.3876175284385681, "memory(GiB)": 78.26, "step": 1358, "token_acc": 0.8877623875481936, "train_speed(iter/s)": 0.032883 }, { "epoch": 0.26333381775904663, "grad_norm": 0.10909376293420792, "learning_rate": 0.00026424486740101973, "loss": 0.3942829370498657, "memory(GiB)": 78.26, "step": 1359, "token_acc": 0.886093114358897, "train_speed(iter/s)": 0.032885 }, { "epoch": 0.2635275880443734, "grad_norm": 0.10721178352832794, "learning_rate": 0.00026418254953850136, "loss": 0.4055170714855194, "memory(GiB)": 78.26, "step": 1360, "token_acc": 0.8812773628637651, "train_speed(iter/s)": 0.032887 }, { "epoch": 0.2637213583297001, "grad_norm": 0.11268987506628036, "learning_rate": 0.00026412018477814164, "loss": 0.4035116136074066, "memory(GiB)": 78.26, "step": 1361, "token_acc": 0.884084150849012, "train_speed(iter/s)": 0.032888 }, { "epoch": 0.26391512861502686, "grad_norm": 0.0998779758810997, "learning_rate": 0.0002640577731455556, "loss": 0.37671032547950745, "memory(GiB)": 78.26, "step": 1362, "token_acc": 0.8892367801327339, "train_speed(iter/s)": 0.03289 }, { "epoch": 0.2641088989003536, "grad_norm": 0.109964519739151, "learning_rate": 0.0002639953146663772, "loss": 0.38640278577804565, "memory(GiB)": 78.26, "step": 1363, "token_acc": 0.8855807365439093, "train_speed(iter/s)": 0.032892 }, { "epoch": 0.26430266918568035, "grad_norm": 0.10154106467962265, "learning_rate": 0.0002639328093662599, "loss": 0.3642141819000244, "memory(GiB)": 78.26, "step": 1364, "token_acc": 0.894092631464166, "train_speed(iter/s)": 0.032893 }, { "epoch": 0.2644964394710071, "grad_norm": 0.10464335232973099, "learning_rate": 0.00026387025727087635, "loss": 0.3907454311847687, "memory(GiB)": 78.26, "step": 1365, "token_acc": 0.8862689656913484, "train_speed(iter/s)": 0.032895 }, { "epoch": 0.26469020975633384, "grad_norm": 0.10311836749315262, "learning_rate": 0.00026380765840591834, "loss": 0.3700196146965027, "memory(GiB)": 78.26, "step": 1366, "token_acc": 0.8920064072753578, "train_speed(iter/s)": 0.032896 }, { "epoch": 0.26488398004166064, "grad_norm": 0.11652354896068573, "learning_rate": 0.00026374501279709684, "loss": 0.4107242822647095, "memory(GiB)": 78.26, "step": 1367, "token_acc": 0.8811259676284307, "train_speed(iter/s)": 0.032898 }, { "epoch": 0.2650777503269874, "grad_norm": 0.11389324814081192, "learning_rate": 0.0002636823204701421, "loss": 0.4210531711578369, "memory(GiB)": 78.26, "step": 1368, "token_acc": 0.8800777400635059, "train_speed(iter/s)": 0.032899 }, { "epoch": 0.2652715206123141, "grad_norm": 0.100397489964962, "learning_rate": 0.00026361958145080367, "loss": 0.36525481939315796, "memory(GiB)": 78.26, "step": 1369, "token_acc": 0.8945630471509752, "train_speed(iter/s)": 0.032901 }, { "epoch": 0.26546529089764087, "grad_norm": 0.11270000785589218, "learning_rate": 0.00026355679576485003, "loss": 0.44845372438430786, "memory(GiB)": 78.26, "step": 1370, "token_acc": 0.8699417372881356, "train_speed(iter/s)": 0.032902 }, { "epoch": 0.2656590611829676, "grad_norm": 0.10077559947967529, "learning_rate": 0.00026349396343806897, "loss": 0.3875451385974884, "memory(GiB)": 78.26, "step": 1371, "token_acc": 0.8859615802040374, "train_speed(iter/s)": 0.032904 }, { "epoch": 0.26585283146829436, "grad_norm": 0.10320444405078888, "learning_rate": 0.0002634310844962674, "loss": 0.3602311909198761, "memory(GiB)": 78.26, "step": 1372, "token_acc": 0.8927622360728608, "train_speed(iter/s)": 0.032906 }, { "epoch": 0.2660466017536211, "grad_norm": 0.10504072159528732, "learning_rate": 0.0002633681589652715, "loss": 0.38470515608787537, "memory(GiB)": 78.26, "step": 1373, "token_acc": 0.8860445526020218, "train_speed(iter/s)": 0.032907 }, { "epoch": 0.26624037203894785, "grad_norm": 0.10138935595750809, "learning_rate": 0.00026330518687092626, "loss": 0.3703993260860443, "memory(GiB)": 78.26, "step": 1374, "token_acc": 0.8889246858813908, "train_speed(iter/s)": 0.032909 }, { "epoch": 0.2664341423242746, "grad_norm": 0.09620644897222519, "learning_rate": 0.0002632421682390962, "loss": 0.37472549080848694, "memory(GiB)": 78.26, "step": 1375, "token_acc": 0.8922912631003824, "train_speed(iter/s)": 0.03291 }, { "epoch": 0.26662791260960134, "grad_norm": 0.10792719572782516, "learning_rate": 0.00026317910309566476, "loss": 0.3973167836666107, "memory(GiB)": 78.26, "step": 1376, "token_acc": 0.8853990475942648, "train_speed(iter/s)": 0.032912 }, { "epoch": 0.2668216828949281, "grad_norm": 0.10119035840034485, "learning_rate": 0.00026311599146653443, "loss": 0.3766027092933655, "memory(GiB)": 78.26, "step": 1377, "token_acc": 0.8914306730415594, "train_speed(iter/s)": 0.032914 }, { "epoch": 0.2670154531802548, "grad_norm": 0.10936938971281052, "learning_rate": 0.00026305283337762684, "loss": 0.385044664144516, "memory(GiB)": 78.26, "step": 1378, "token_acc": 0.8883062401678028, "train_speed(iter/s)": 0.032915 }, { "epoch": 0.26720922346558157, "grad_norm": 0.10729166865348816, "learning_rate": 0.0002629896288548827, "loss": 0.3800238370895386, "memory(GiB)": 78.26, "step": 1379, "token_acc": 0.8888697301491508, "train_speed(iter/s)": 0.032917 }, { "epoch": 0.2674029937509083, "grad_norm": 0.11573578417301178, "learning_rate": 0.0002629263779242619, "loss": 0.4426755905151367, "memory(GiB)": 78.26, "step": 1380, "token_acc": 0.8741497664796414, "train_speed(iter/s)": 0.032919 }, { "epoch": 0.26759676403623506, "grad_norm": 0.09716112166643143, "learning_rate": 0.00026286308061174315, "loss": 0.36622488498687744, "memory(GiB)": 78.26, "step": 1381, "token_acc": 0.8954976071882019, "train_speed(iter/s)": 0.032921 }, { "epoch": 0.2677905343215618, "grad_norm": 0.11388426274061203, "learning_rate": 0.0002627997369433246, "loss": 0.40867888927459717, "memory(GiB)": 78.26, "step": 1382, "token_acc": 0.8815565203408908, "train_speed(iter/s)": 0.032922 }, { "epoch": 0.26798430460688855, "grad_norm": 0.10194810479879379, "learning_rate": 0.0002627363469450229, "loss": 0.3862283229827881, "memory(GiB)": 78.26, "step": 1383, "token_acc": 0.8868069456729203, "train_speed(iter/s)": 0.032924 }, { "epoch": 0.2681780748922153, "grad_norm": 0.11335241794586182, "learning_rate": 0.0002626729106428742, "loss": 0.3988998532295227, "memory(GiB)": 78.26, "step": 1384, "token_acc": 0.8836738942922578, "train_speed(iter/s)": 0.032926 }, { "epoch": 0.26837184517754203, "grad_norm": 0.10034345090389252, "learning_rate": 0.0002626094280629335, "loss": 0.3520752489566803, "memory(GiB)": 78.26, "step": 1385, "token_acc": 0.8951549706812225, "train_speed(iter/s)": 0.032928 }, { "epoch": 0.2685656154628688, "grad_norm": 0.10252499580383301, "learning_rate": 0.0002625458992312747, "loss": 0.37120020389556885, "memory(GiB)": 78.26, "step": 1386, "token_acc": 0.892410527781646, "train_speed(iter/s)": 0.032929 }, { "epoch": 0.2687593857481955, "grad_norm": 0.11847876012325287, "learning_rate": 0.0002624823241739909, "loss": 0.4186937212944031, "memory(GiB)": 78.26, "step": 1387, "token_acc": 0.8775456306121251, "train_speed(iter/s)": 0.032931 }, { "epoch": 0.26895315603352227, "grad_norm": 0.11340092122554779, "learning_rate": 0.000262418702917194, "loss": 0.41182029247283936, "memory(GiB)": 78.26, "step": 1388, "token_acc": 0.8774313059586292, "train_speed(iter/s)": 0.032932 }, { "epoch": 0.269146926318849, "grad_norm": 0.11647920310497284, "learning_rate": 0.0002623550354870151, "loss": 0.41684606671333313, "memory(GiB)": 78.26, "step": 1389, "token_acc": 0.8813554314750806, "train_speed(iter/s)": 0.032934 }, { "epoch": 0.26934069660417576, "grad_norm": 0.10434102267026901, "learning_rate": 0.00026229132190960395, "loss": 0.35181209444999695, "memory(GiB)": 78.26, "step": 1390, "token_acc": 0.8950941151016876, "train_speed(iter/s)": 0.032935 }, { "epoch": 0.2695344668895025, "grad_norm": 0.11331314593553543, "learning_rate": 0.0002622275622111295, "loss": 0.3922780156135559, "memory(GiB)": 78.26, "step": 1391, "token_acc": 0.8852905742642418, "train_speed(iter/s)": 0.032937 }, { "epoch": 0.26972823717482924, "grad_norm": 0.10961979627609253, "learning_rate": 0.00026216375641777964, "loss": 0.40969350934028625, "memory(GiB)": 78.26, "step": 1392, "token_acc": 0.8799139553643452, "train_speed(iter/s)": 0.032939 }, { "epoch": 0.269922007460156, "grad_norm": 0.10636827349662781, "learning_rate": 0.000262099904555761, "loss": 0.39670512080192566, "memory(GiB)": 78.26, "step": 1393, "token_acc": 0.8852959931980019, "train_speed(iter/s)": 0.03294 }, { "epoch": 0.27011577774548273, "grad_norm": 0.11483940482139587, "learning_rate": 0.00026203600665129935, "loss": 0.40846797823905945, "memory(GiB)": 78.26, "step": 1394, "token_acc": 0.8796793825144723, "train_speed(iter/s)": 0.032942 }, { "epoch": 0.2703095480308095, "grad_norm": 0.10763109475374222, "learning_rate": 0.0002619720627306393, "loss": 0.4060547351837158, "memory(GiB)": 78.26, "step": 1395, "token_acc": 0.8830100404773169, "train_speed(iter/s)": 0.032943 }, { "epoch": 0.2705033183161362, "grad_norm": 0.11100554466247559, "learning_rate": 0.00026190807282004414, "loss": 0.4114495813846588, "memory(GiB)": 78.26, "step": 1396, "token_acc": 0.8800386130606068, "train_speed(iter/s)": 0.032945 }, { "epoch": 0.27069708860146297, "grad_norm": 0.1212446540594101, "learning_rate": 0.0002618440369457965, "loss": 0.45782333612442017, "memory(GiB)": 78.26, "step": 1397, "token_acc": 0.868626656635222, "train_speed(iter/s)": 0.032946 }, { "epoch": 0.2708908588867897, "grad_norm": 0.10512025654315948, "learning_rate": 0.0002617799551341975, "loss": 0.40602394938468933, "memory(GiB)": 78.26, "step": 1398, "token_acc": 0.8847831850941259, "train_speed(iter/s)": 0.032948 }, { "epoch": 0.27108462917211645, "grad_norm": 0.1056622564792633, "learning_rate": 0.00026171582741156725, "loss": 0.3937338590621948, "memory(GiB)": 78.26, "step": 1399, "token_acc": 0.8871733032741762, "train_speed(iter/s)": 0.032949 }, { "epoch": 0.2712783994574432, "grad_norm": 0.10223392397165298, "learning_rate": 0.0002616516538042448, "loss": 0.4050544500350952, "memory(GiB)": 78.26, "step": 1400, "token_acc": 0.883415804468436, "train_speed(iter/s)": 0.032951 }, { "epoch": 0.27147216974276994, "grad_norm": 0.10160177946090698, "learning_rate": 0.0002615874343385879, "loss": 0.38988304138183594, "memory(GiB)": 78.26, "step": 1401, "token_acc": 0.8859642291617994, "train_speed(iter/s)": 0.032943 }, { "epoch": 0.2716659400280967, "grad_norm": 0.12478124350309372, "learning_rate": 0.00026152316904097327, "loss": 0.4281242787837982, "memory(GiB)": 78.26, "step": 1402, "token_acc": 0.875607687959299, "train_speed(iter/s)": 0.032945 }, { "epoch": 0.27185971031342343, "grad_norm": 0.10745180398225784, "learning_rate": 0.00026145885793779633, "loss": 0.3870825171470642, "memory(GiB)": 78.26, "step": 1403, "token_acc": 0.8886628235560077, "train_speed(iter/s)": 0.032946 }, { "epoch": 0.2720534805987502, "grad_norm": 0.10077864676713943, "learning_rate": 0.0002613945010554715, "loss": 0.37229087948799133, "memory(GiB)": 78.26, "step": 1404, "token_acc": 0.8895456714202407, "train_speed(iter/s)": 0.032948 }, { "epoch": 0.2722472508840769, "grad_norm": 0.1082373782992363, "learning_rate": 0.00026133009842043174, "loss": 0.418215274810791, "memory(GiB)": 78.26, "step": 1405, "token_acc": 0.8777365051766379, "train_speed(iter/s)": 0.03295 }, { "epoch": 0.27244102116940366, "grad_norm": 0.1040397435426712, "learning_rate": 0.00026126565005912903, "loss": 0.397979736328125, "memory(GiB)": 78.26, "step": 1406, "token_acc": 0.8846828451006571, "train_speed(iter/s)": 0.032951 }, { "epoch": 0.2726347914547304, "grad_norm": 0.10922178626060486, "learning_rate": 0.000261201155998034, "loss": 0.41443806886672974, "memory(GiB)": 78.26, "step": 1407, "token_acc": 0.8808320476671809, "train_speed(iter/s)": 0.032953 }, { "epoch": 0.27282856174005715, "grad_norm": 0.11000542342662811, "learning_rate": 0.0002611366162636361, "loss": 0.4330993890762329, "memory(GiB)": 78.26, "step": 1408, "token_acc": 0.8738227795410791, "train_speed(iter/s)": 0.032954 }, { "epoch": 0.2730223320253839, "grad_norm": 0.1046384945511818, "learning_rate": 0.00026107203088244357, "loss": 0.37256211042404175, "memory(GiB)": 78.26, "step": 1409, "token_acc": 0.8898919399606564, "train_speed(iter/s)": 0.032956 }, { "epoch": 0.27321610231071064, "grad_norm": 0.10583927482366562, "learning_rate": 0.0002610073998809833, "loss": 0.38221731781959534, "memory(GiB)": 78.26, "step": 1410, "token_acc": 0.8893005757282106, "train_speed(iter/s)": 0.032958 }, { "epoch": 0.2734098725960374, "grad_norm": 0.11508861929178238, "learning_rate": 0.0002609427232858011, "loss": 0.4040870666503906, "memory(GiB)": 78.26, "step": 1411, "token_acc": 0.8844921965991148, "train_speed(iter/s)": 0.032959 }, { "epoch": 0.27360364288136413, "grad_norm": 0.10771467536687851, "learning_rate": 0.0002608780011234612, "loss": 0.3735302686691284, "memory(GiB)": 78.26, "step": 1412, "token_acc": 0.8907560765756076, "train_speed(iter/s)": 0.032961 }, { "epoch": 0.2737974131666909, "grad_norm": 0.11977894604206085, "learning_rate": 0.0002608132334205469, "loss": 0.43817439675331116, "memory(GiB)": 78.26, "step": 1413, "token_acc": 0.8712272477356252, "train_speed(iter/s)": 0.032963 }, { "epoch": 0.2739911834520176, "grad_norm": 0.11429792642593384, "learning_rate": 0.00026074842020365994, "loss": 0.41938716173171997, "memory(GiB)": 78.26, "step": 1414, "token_acc": 0.8812332907125421, "train_speed(iter/s)": 0.032964 }, { "epoch": 0.27418495373734436, "grad_norm": 0.12319884449243546, "learning_rate": 0.00026068356149942085, "loss": 0.4341338574886322, "memory(GiB)": 78.26, "step": 1415, "token_acc": 0.8744184650478735, "train_speed(iter/s)": 0.032966 }, { "epoch": 0.2743787240226711, "grad_norm": 0.10541116446256638, "learning_rate": 0.00026061865733446887, "loss": 0.39338019490242004, "memory(GiB)": 78.26, "step": 1416, "token_acc": 0.8870288248337029, "train_speed(iter/s)": 0.032967 }, { "epoch": 0.27457249430799785, "grad_norm": 0.10803982615470886, "learning_rate": 0.00026055370773546193, "loss": 0.3971567749977112, "memory(GiB)": 78.26, "step": 1417, "token_acc": 0.8830211291457155, "train_speed(iter/s)": 0.032969 }, { "epoch": 0.2747662645933246, "grad_norm": 0.11194406449794769, "learning_rate": 0.00026048871272907657, "loss": 0.386283278465271, "memory(GiB)": 78.26, "step": 1418, "token_acc": 0.8865788499180024, "train_speed(iter/s)": 0.032971 }, { "epoch": 0.27496003487865134, "grad_norm": 0.09532847255468369, "learning_rate": 0.00026042367234200783, "loss": 0.365975558757782, "memory(GiB)": 78.26, "step": 1419, "token_acc": 0.8936005233952846, "train_speed(iter/s)": 0.032972 }, { "epoch": 0.2751538051639781, "grad_norm": 0.10494247823953629, "learning_rate": 0.0002603585866009697, "loss": 0.4001488983631134, "memory(GiB)": 78.26, "step": 1420, "token_acc": 0.8836253515033528, "train_speed(iter/s)": 0.032974 }, { "epoch": 0.2753475754493048, "grad_norm": 0.11904102563858032, "learning_rate": 0.00026029345553269466, "loss": 0.4139590263366699, "memory(GiB)": 78.26, "step": 1421, "token_acc": 0.881683852450303, "train_speed(iter/s)": 0.032975 }, { "epoch": 0.27554134573463157, "grad_norm": 0.09683738648891449, "learning_rate": 0.00026022827916393366, "loss": 0.3371363878250122, "memory(GiB)": 78.26, "step": 1422, "token_acc": 0.8991735972207601, "train_speed(iter/s)": 0.032977 }, { "epoch": 0.2757351160199583, "grad_norm": 0.09993797540664673, "learning_rate": 0.0002601630575214565, "loss": 0.3939957916736603, "memory(GiB)": 78.26, "step": 1423, "token_acc": 0.8834102564102564, "train_speed(iter/s)": 0.032978 }, { "epoch": 0.27592888630528506, "grad_norm": 0.11129175126552582, "learning_rate": 0.0002600977906320514, "loss": 0.41668498516082764, "memory(GiB)": 78.26, "step": 1424, "token_acc": 0.8775945117501095, "train_speed(iter/s)": 0.03298 }, { "epoch": 0.2761226565906118, "grad_norm": 0.10565146058797836, "learning_rate": 0.00026003247852252525, "loss": 0.37214991450309753, "memory(GiB)": 78.26, "step": 1425, "token_acc": 0.8892408742152987, "train_speed(iter/s)": 0.032981 }, { "epoch": 0.27631642687593855, "grad_norm": 0.11600112915039062, "learning_rate": 0.0002599671212197035, "loss": 0.42271527647972107, "memory(GiB)": 78.26, "step": 1426, "token_acc": 0.8791572967215056, "train_speed(iter/s)": 0.032983 }, { "epoch": 0.27651019716126535, "grad_norm": 0.1128147765994072, "learning_rate": 0.0002599017187504301, "loss": 0.4270755648612976, "memory(GiB)": 78.26, "step": 1427, "token_acc": 0.875413155465563, "train_speed(iter/s)": 0.032985 }, { "epoch": 0.2767039674465921, "grad_norm": 0.09815947711467743, "learning_rate": 0.0002598362711415677, "loss": 0.35846811532974243, "memory(GiB)": 78.26, "step": 1428, "token_acc": 0.8941823402029246, "train_speed(iter/s)": 0.032986 }, { "epoch": 0.27689773773191884, "grad_norm": 0.11663207411766052, "learning_rate": 0.0002597707784199973, "loss": 0.41208210587501526, "memory(GiB)": 78.26, "step": 1429, "token_acc": 0.8810604466620577, "train_speed(iter/s)": 0.032988 }, { "epoch": 0.2770915080172456, "grad_norm": 0.10307564586400986, "learning_rate": 0.0002597052406126185, "loss": 0.37274155020713806, "memory(GiB)": 78.26, "step": 1430, "token_acc": 0.8910128388017119, "train_speed(iter/s)": 0.032989 }, { "epoch": 0.2772852783025723, "grad_norm": 0.11798793822526932, "learning_rate": 0.0002596396577463495, "loss": 0.4077189266681671, "memory(GiB)": 78.26, "step": 1431, "token_acc": 0.8813992194674013, "train_speed(iter/s)": 0.032991 }, { "epoch": 0.27747904858789907, "grad_norm": 0.11641109734773636, "learning_rate": 0.00025957402984812695, "loss": 0.40781551599502563, "memory(GiB)": 78.26, "step": 1432, "token_acc": 0.8810986109516703, "train_speed(iter/s)": 0.032992 }, { "epoch": 0.2776728188732258, "grad_norm": 0.11191576719284058, "learning_rate": 0.000259508356944906, "loss": 0.4027900695800781, "memory(GiB)": 78.26, "step": 1433, "token_acc": 0.8829726161088467, "train_speed(iter/s)": 0.032994 }, { "epoch": 0.27786658915855256, "grad_norm": 0.11352302134037018, "learning_rate": 0.0002594426390636602, "loss": 0.423457533121109, "memory(GiB)": 78.26, "step": 1434, "token_acc": 0.8779633306084316, "train_speed(iter/s)": 0.032996 }, { "epoch": 0.2780603594438793, "grad_norm": 0.10580118000507355, "learning_rate": 0.00025937687623138174, "loss": 0.41249287128448486, "memory(GiB)": 78.26, "step": 1435, "token_acc": 0.8809974249829208, "train_speed(iter/s)": 0.032997 }, { "epoch": 0.27825412972920605, "grad_norm": 0.10057955980300903, "learning_rate": 0.00025931106847508115, "loss": 0.37016963958740234, "memory(GiB)": 78.26, "step": 1436, "token_acc": 0.8921429696110849, "train_speed(iter/s)": 0.032999 }, { "epoch": 0.2784479000145328, "grad_norm": 0.11049242317676544, "learning_rate": 0.0002592452158217873, "loss": 0.39204567670822144, "memory(GiB)": 78.26, "step": 1437, "token_acc": 0.8855197058366565, "train_speed(iter/s)": 0.033 }, { "epoch": 0.27864167029985953, "grad_norm": 0.1143956407904625, "learning_rate": 0.00025917931829854795, "loss": 0.41014882922172546, "memory(GiB)": 78.26, "step": 1438, "token_acc": 0.8808566548416765, "train_speed(iter/s)": 0.033002 }, { "epoch": 0.2788354405851863, "grad_norm": 0.10722561925649643, "learning_rate": 0.00025911337593242874, "loss": 0.4017128646373749, "memory(GiB)": 78.26, "step": 1439, "token_acc": 0.8847192571825407, "train_speed(iter/s)": 0.033004 }, { "epoch": 0.279029210870513, "grad_norm": 0.10540410131216049, "learning_rate": 0.0002590473887505141, "loss": 0.3711482286453247, "memory(GiB)": 78.26, "step": 1440, "token_acc": 0.8929875288413797, "train_speed(iter/s)": 0.033005 }, { "epoch": 0.27922298115583977, "grad_norm": 0.10699062794446945, "learning_rate": 0.0002589813567799066, "loss": 0.37242835760116577, "memory(GiB)": 78.26, "step": 1441, "token_acc": 0.8892943052514933, "train_speed(iter/s)": 0.033007 }, { "epoch": 0.2794167514411665, "grad_norm": 0.11198758333921432, "learning_rate": 0.0002589152800477275, "loss": 0.38732579350471497, "memory(GiB)": 78.26, "step": 1442, "token_acc": 0.8864687236455637, "train_speed(iter/s)": 0.033008 }, { "epoch": 0.27961052172649326, "grad_norm": 0.1112254187464714, "learning_rate": 0.00025884915858111614, "loss": 0.41368624567985535, "memory(GiB)": 78.26, "step": 1443, "token_acc": 0.8818168540966846, "train_speed(iter/s)": 0.033009 }, { "epoch": 0.27980429201182, "grad_norm": 0.09880448877811432, "learning_rate": 0.00025878299240723055, "loss": 0.3723769783973694, "memory(GiB)": 78.26, "step": 1444, "token_acc": 0.8905043044032355, "train_speed(iter/s)": 0.033011 }, { "epoch": 0.27999806229714674, "grad_norm": 0.10194198787212372, "learning_rate": 0.0002587167815532468, "loss": 0.3568089008331299, "memory(GiB)": 78.26, "step": 1445, "token_acc": 0.8941702819956616, "train_speed(iter/s)": 0.033012 }, { "epoch": 0.2801918325824735, "grad_norm": 0.10794655978679657, "learning_rate": 0.00025865052604635955, "loss": 0.3879980146884918, "memory(GiB)": 78.26, "step": 1446, "token_acc": 0.8870534199744252, "train_speed(iter/s)": 0.033014 }, { "epoch": 0.28038560286780023, "grad_norm": 0.10190257430076599, "learning_rate": 0.0002585842259137817, "loss": 0.385148823261261, "memory(GiB)": 78.26, "step": 1447, "token_acc": 0.8866731047802994, "train_speed(iter/s)": 0.033015 }, { "epoch": 0.280579373153127, "grad_norm": 0.11218508332967758, "learning_rate": 0.0002585178811827445, "loss": 0.41854822635650635, "memory(GiB)": 78.26, "step": 1448, "token_acc": 0.8806162104733414, "train_speed(iter/s)": 0.033017 }, { "epoch": 0.2807731434384537, "grad_norm": 0.10860633105039597, "learning_rate": 0.00025845149188049747, "loss": 0.3808492124080658, "memory(GiB)": 78.26, "step": 1449, "token_acc": 0.8890830143110403, "train_speed(iter/s)": 0.033018 }, { "epoch": 0.28096691372378046, "grad_norm": 0.10633924603462219, "learning_rate": 0.0002583850580343086, "loss": 0.3995111584663391, "memory(GiB)": 78.26, "step": 1450, "token_acc": 0.8848618846379894, "train_speed(iter/s)": 0.03302 }, { "epoch": 0.2811606840091072, "grad_norm": 0.1051037460565567, "learning_rate": 0.00025831857967146394, "loss": 0.3739997148513794, "memory(GiB)": 78.26, "step": 1451, "token_acc": 0.8915456874466268, "train_speed(iter/s)": 0.033021 }, { "epoch": 0.28135445429443395, "grad_norm": 0.1118805930018425, "learning_rate": 0.0002582520568192679, "loss": 0.4328954517841339, "memory(GiB)": 78.26, "step": 1452, "token_acc": 0.874195172358973, "train_speed(iter/s)": 0.033023 }, { "epoch": 0.2815482245797607, "grad_norm": 0.10707706958055496, "learning_rate": 0.0002581854895050434, "loss": 0.4252171814441681, "memory(GiB)": 78.26, "step": 1453, "token_acc": 0.8767298393819819, "train_speed(iter/s)": 0.033024 }, { "epoch": 0.28174199486508744, "grad_norm": 0.1086982861161232, "learning_rate": 0.0002581188777561313, "loss": 0.41112756729125977, "memory(GiB)": 78.26, "step": 1454, "token_acc": 0.8803484635202252, "train_speed(iter/s)": 0.033026 }, { "epoch": 0.2819357651504142, "grad_norm": 0.10656489431858063, "learning_rate": 0.00025805222159989077, "loss": 0.3699858486652374, "memory(GiB)": 78.26, "step": 1455, "token_acc": 0.8922368486911256, "train_speed(iter/s)": 0.033027 }, { "epoch": 0.28212953543574093, "grad_norm": 0.10233739018440247, "learning_rate": 0.00025798552106369937, "loss": 0.3772289752960205, "memory(GiB)": 78.26, "step": 1456, "token_acc": 0.8884009801305224, "train_speed(iter/s)": 0.033028 }, { "epoch": 0.2823233057210677, "grad_norm": 0.10542133450508118, "learning_rate": 0.00025791877617495275, "loss": 0.37354397773742676, "memory(GiB)": 78.26, "step": 1457, "token_acc": 0.8912464826222526, "train_speed(iter/s)": 0.03303 }, { "epoch": 0.2825170760063944, "grad_norm": 0.11344679445028305, "learning_rate": 0.0002578519869610649, "loss": 0.4273817241191864, "memory(GiB)": 78.26, "step": 1458, "token_acc": 0.8761668213330422, "train_speed(iter/s)": 0.033031 }, { "epoch": 0.28271084629172116, "grad_norm": 0.10357240587472916, "learning_rate": 0.000257785153449468, "loss": 0.3900695741176605, "memory(GiB)": 78.26, "step": 1459, "token_acc": 0.884189494968947, "train_speed(iter/s)": 0.033032 }, { "epoch": 0.2829046165770479, "grad_norm": 0.11411286890506744, "learning_rate": 0.00025771827566761215, "loss": 0.4180591106414795, "memory(GiB)": 78.26, "step": 1460, "token_acc": 0.8776129660386003, "train_speed(iter/s)": 0.033034 }, { "epoch": 0.28309838686237465, "grad_norm": 0.10684863477945328, "learning_rate": 0.00025765135364296606, "loss": 0.42027878761291504, "memory(GiB)": 78.26, "step": 1461, "token_acc": 0.8778430934841382, "train_speed(iter/s)": 0.033035 }, { "epoch": 0.2832921571477014, "grad_norm": 0.10738670825958252, "learning_rate": 0.0002575843874030163, "loss": 0.36749184131622314, "memory(GiB)": 78.26, "step": 1462, "token_acc": 0.889355581127733, "train_speed(iter/s)": 0.033037 }, { "epoch": 0.28348592743302814, "grad_norm": 0.10112845152616501, "learning_rate": 0.0002575173769752677, "loss": 0.3824685513973236, "memory(GiB)": 78.26, "step": 1463, "token_acc": 0.8880091942280679, "train_speed(iter/s)": 0.033038 }, { "epoch": 0.2836796977183549, "grad_norm": 0.11604847013950348, "learning_rate": 0.00025745032238724325, "loss": 0.4152447283267975, "memory(GiB)": 78.26, "step": 1464, "token_acc": 0.8827828574982894, "train_speed(iter/s)": 0.033039 }, { "epoch": 0.28387346800368163, "grad_norm": 0.10715403407812119, "learning_rate": 0.0002573832236664842, "loss": 0.3738415837287903, "memory(GiB)": 78.26, "step": 1465, "token_acc": 0.8922089917371876, "train_speed(iter/s)": 0.033041 }, { "epoch": 0.2840672382890084, "grad_norm": 0.10670117288827896, "learning_rate": 0.0002573160808405496, "loss": 0.40999048948287964, "memory(GiB)": 78.26, "step": 1466, "token_acc": 0.8812486533074768, "train_speed(iter/s)": 0.033042 }, { "epoch": 0.2842610085743351, "grad_norm": 0.11364596337080002, "learning_rate": 0.00025724889393701687, "loss": 0.38645139336586, "memory(GiB)": 78.26, "step": 1467, "token_acc": 0.8858692377222055, "train_speed(iter/s)": 0.033044 }, { "epoch": 0.28445477885966186, "grad_norm": 0.10818950086832047, "learning_rate": 0.00025718166298348163, "loss": 0.43197697401046753, "memory(GiB)": 78.26, "step": 1468, "token_acc": 0.8740517566040781, "train_speed(iter/s)": 0.033045 }, { "epoch": 0.2846485491449886, "grad_norm": 0.10857094824314117, "learning_rate": 0.00025711438800755725, "loss": 0.3983537256717682, "memory(GiB)": 78.26, "step": 1469, "token_acc": 0.8840075154730327, "train_speed(iter/s)": 0.033047 }, { "epoch": 0.28484231943031535, "grad_norm": 0.10339067131280899, "learning_rate": 0.00025704706903687544, "loss": 0.3655926585197449, "memory(GiB)": 78.26, "step": 1470, "token_acc": 0.8920360215324245, "train_speed(iter/s)": 0.033048 }, { "epoch": 0.2850360897156421, "grad_norm": 0.09809229522943497, "learning_rate": 0.0002569797060990859, "loss": 0.36431875824928284, "memory(GiB)": 78.26, "step": 1471, "token_acc": 0.8942954390742002, "train_speed(iter/s)": 0.033049 }, { "epoch": 0.28522986000096884, "grad_norm": 0.12293403595685959, "learning_rate": 0.0002569122992218564, "loss": 0.4425593316555023, "memory(GiB)": 78.26, "step": 1472, "token_acc": 0.872041270483512, "train_speed(iter/s)": 0.033051 }, { "epoch": 0.2854236302862956, "grad_norm": 0.11606067419052124, "learning_rate": 0.00025684484843287284, "loss": 0.4095402956008911, "memory(GiB)": 78.26, "step": 1473, "token_acc": 0.8805144353865623, "train_speed(iter/s)": 0.033052 }, { "epoch": 0.2856174005716223, "grad_norm": 0.09995657950639725, "learning_rate": 0.00025677735375983894, "loss": 0.38141509890556335, "memory(GiB)": 78.26, "step": 1474, "token_acc": 0.886579869804707, "train_speed(iter/s)": 0.033053 }, { "epoch": 0.28581117085694907, "grad_norm": 0.09991477429866791, "learning_rate": 0.00025670981523047664, "loss": 0.3756733238697052, "memory(GiB)": 78.26, "step": 1475, "token_acc": 0.8895817295355402, "train_speed(iter/s)": 0.033055 }, { "epoch": 0.2860049411422758, "grad_norm": 0.10868589580059052, "learning_rate": 0.00025664223287252586, "loss": 0.40122172236442566, "memory(GiB)": 78.26, "step": 1476, "token_acc": 0.8827901370963692, "train_speed(iter/s)": 0.033056 }, { "epoch": 0.28619871142760256, "grad_norm": 0.10444658994674683, "learning_rate": 0.0002565746067137444, "loss": 0.37232351303100586, "memory(GiB)": 78.26, "step": 1477, "token_acc": 0.8921252470612712, "train_speed(iter/s)": 0.033058 }, { "epoch": 0.2863924817129293, "grad_norm": 0.10679329186677933, "learning_rate": 0.0002565069367819082, "loss": 0.36880356073379517, "memory(GiB)": 78.26, "step": 1478, "token_acc": 0.8917536644521776, "train_speed(iter/s)": 0.033059 }, { "epoch": 0.28658625199825605, "grad_norm": 0.09651319682598114, "learning_rate": 0.0002564392231048111, "loss": 0.36866456270217896, "memory(GiB)": 78.26, "step": 1479, "token_acc": 0.8893619056730125, "train_speed(iter/s)": 0.033061 }, { "epoch": 0.2867800222835828, "grad_norm": 0.10870502889156342, "learning_rate": 0.000256371465710265, "loss": 0.3834042549133301, "memory(GiB)": 78.26, "step": 1480, "token_acc": 0.8883792468895053, "train_speed(iter/s)": 0.033062 }, { "epoch": 0.28697379256890954, "grad_norm": 0.11162568628787994, "learning_rate": 0.0002563036646260996, "loss": 0.4060906171798706, "memory(GiB)": 78.26, "step": 1481, "token_acc": 0.8793504766127245, "train_speed(iter/s)": 0.033064 }, { "epoch": 0.2871675628542363, "grad_norm": 0.10246509313583374, "learning_rate": 0.00025623581988016257, "loss": 0.37621062994003296, "memory(GiB)": 78.26, "step": 1482, "token_acc": 0.8903611537529288, "train_speed(iter/s)": 0.033065 }, { "epoch": 0.287361333139563, "grad_norm": 0.10581538081169128, "learning_rate": 0.0002561679315003197, "loss": 0.40835872292518616, "memory(GiB)": 78.26, "step": 1483, "token_acc": 0.8827440663756468, "train_speed(iter/s)": 0.033066 }, { "epoch": 0.28755510342488977, "grad_norm": 0.11528300493955612, "learning_rate": 0.0002560999995144545, "loss": 0.43208250403404236, "memory(GiB)": 78.26, "step": 1484, "token_acc": 0.8761382138334439, "train_speed(iter/s)": 0.033068 }, { "epoch": 0.2877488737102165, "grad_norm": 0.11191680282354355, "learning_rate": 0.00025603202395046857, "loss": 0.40369465947151184, "memory(GiB)": 78.26, "step": 1485, "token_acc": 0.8817523721627779, "train_speed(iter/s)": 0.033069 }, { "epoch": 0.28794264399554326, "grad_norm": 0.0994502380490303, "learning_rate": 0.00025596400483628113, "loss": 0.3518386781215668, "memory(GiB)": 78.26, "step": 1486, "token_acc": 0.8963413044045166, "train_speed(iter/s)": 0.033071 }, { "epoch": 0.28813641428087, "grad_norm": 0.10107074677944183, "learning_rate": 0.00025589594219982957, "loss": 0.36090126633644104, "memory(GiB)": 78.26, "step": 1487, "token_acc": 0.8948038903451824, "train_speed(iter/s)": 0.033072 }, { "epoch": 0.2883301845661968, "grad_norm": 0.10665128380060196, "learning_rate": 0.000255827836069069, "loss": 0.381521075963974, "memory(GiB)": 78.26, "step": 1488, "token_acc": 0.8875178437492439, "train_speed(iter/s)": 0.033073 }, { "epoch": 0.28852395485152355, "grad_norm": 0.11443497240543365, "learning_rate": 0.00025575968647197246, "loss": 0.4305599629878998, "memory(GiB)": 78.26, "step": 1489, "token_acc": 0.8745530313214142, "train_speed(iter/s)": 0.033075 }, { "epoch": 0.2887177251368503, "grad_norm": 0.10215198248624802, "learning_rate": 0.0002556914934365308, "loss": 0.38611581921577454, "memory(GiB)": 78.26, "step": 1490, "token_acc": 0.8878939990051101, "train_speed(iter/s)": 0.033076 }, { "epoch": 0.28891149542217703, "grad_norm": 0.11031733453273773, "learning_rate": 0.00025562325699075275, "loss": 0.3934246003627777, "memory(GiB)": 78.26, "step": 1491, "token_acc": 0.8847335423197492, "train_speed(iter/s)": 0.033078 }, { "epoch": 0.2891052657075038, "grad_norm": 0.107778400182724, "learning_rate": 0.00025555497716266487, "loss": 0.4159546196460724, "memory(GiB)": 78.26, "step": 1492, "token_acc": 0.8804967649961618, "train_speed(iter/s)": 0.033079 }, { "epoch": 0.2892990359928305, "grad_norm": 0.11043369024991989, "learning_rate": 0.00025548665398031145, "loss": 0.42264997959136963, "memory(GiB)": 78.26, "step": 1493, "token_acc": 0.8791478474270945, "train_speed(iter/s)": 0.033081 }, { "epoch": 0.28949280627815727, "grad_norm": 0.11627716571092606, "learning_rate": 0.0002554182874717547, "loss": 0.40543925762176514, "memory(GiB)": 78.26, "step": 1494, "token_acc": 0.8830718414533444, "train_speed(iter/s)": 0.033082 }, { "epoch": 0.289686576563484, "grad_norm": 0.11875671148300171, "learning_rate": 0.00025534987766507466, "loss": 0.42699331045150757, "memory(GiB)": 78.26, "step": 1495, "token_acc": 0.8727887840821125, "train_speed(iter/s)": 0.033084 }, { "epoch": 0.28988034684881075, "grad_norm": 0.1101786196231842, "learning_rate": 0.00025528142458836896, "loss": 0.36886993050575256, "memory(GiB)": 78.26, "step": 1496, "token_acc": 0.8915269892151803, "train_speed(iter/s)": 0.033085 }, { "epoch": 0.2900741171341375, "grad_norm": 0.1435171663761139, "learning_rate": 0.0002552129282697532, "loss": 0.38396596908569336, "memory(GiB)": 78.26, "step": 1497, "token_acc": 0.8871555969652603, "train_speed(iter/s)": 0.033087 }, { "epoch": 0.29026788741946424, "grad_norm": 0.10523149371147156, "learning_rate": 0.0002551443887373605, "loss": 0.36478835344314575, "memory(GiB)": 78.26, "step": 1498, "token_acc": 0.8914113351325608, "train_speed(iter/s)": 0.033088 }, { "epoch": 0.290461657704791, "grad_norm": 0.1243639886379242, "learning_rate": 0.00025507580601934215, "loss": 0.4351167678833008, "memory(GiB)": 78.26, "step": 1499, "token_acc": 0.874244422464007, "train_speed(iter/s)": 0.033089 }, { "epoch": 0.29065542799011773, "grad_norm": 0.12400247901678085, "learning_rate": 0.0002550071801438667, "loss": 0.4268084466457367, "memory(GiB)": 78.26, "step": 1500, "token_acc": 0.8754112107334064, "train_speed(iter/s)": 0.033091 }, { "epoch": 0.29065542799011773, "eval_loss": 0.46029698848724365, "eval_runtime": 1344.1819, "eval_samples_per_second": 5.021, "eval_steps_per_second": 5.021, "eval_token_acc": 0.8845504800698284, "step": 1500 }, { "epoch": 0.2908491982754445, "grad_norm": 0.11321935057640076, "learning_rate": 0.0002549385111391207, "loss": 0.4121960699558258, "memory(GiB)": 78.26, "step": 1501, "token_acc": 0.881211333653493, "train_speed(iter/s)": 0.03214 }, { "epoch": 0.2910429685607712, "grad_norm": 0.11633153259754181, "learning_rate": 0.0002548697990333084, "loss": 0.4356468617916107, "memory(GiB)": 78.26, "step": 1502, "token_acc": 0.8736777331731473, "train_speed(iter/s)": 0.032142 }, { "epoch": 0.29123673884609796, "grad_norm": 0.10829450935125351, "learning_rate": 0.00025480104385465166, "loss": 0.4199885129928589, "memory(GiB)": 78.26, "step": 1503, "token_acc": 0.8805436914067747, "train_speed(iter/s)": 0.032144 }, { "epoch": 0.2914305091314247, "grad_norm": 0.11650849878787994, "learning_rate": 0.0002547322456313901, "loss": 0.4100668430328369, "memory(GiB)": 78.26, "step": 1504, "token_acc": 0.8807525592548188, "train_speed(iter/s)": 0.032146 }, { "epoch": 0.29162427941675145, "grad_norm": 0.10901083797216415, "learning_rate": 0.000254663404391781, "loss": 0.4044414460659027, "memory(GiB)": 78.26, "step": 1505, "token_acc": 0.883741382734262, "train_speed(iter/s)": 0.032148 }, { "epoch": 0.2918180497020782, "grad_norm": 0.10262013971805573, "learning_rate": 0.00025459452016409926, "loss": 0.40257662534713745, "memory(GiB)": 78.26, "step": 1506, "token_acc": 0.8821285962936418, "train_speed(iter/s)": 0.03215 }, { "epoch": 0.29201181998740494, "grad_norm": 0.10731082409620285, "learning_rate": 0.0002545255929766376, "loss": 0.4021913409233093, "memory(GiB)": 78.26, "step": 1507, "token_acc": 0.8832824363320539, "train_speed(iter/s)": 0.032152 }, { "epoch": 0.2922055902727317, "grad_norm": 0.11936552822589874, "learning_rate": 0.00025445662285770613, "loss": 0.4097455143928528, "memory(GiB)": 78.26, "step": 1508, "token_acc": 0.8790452832446013, "train_speed(iter/s)": 0.032154 }, { "epoch": 0.29239936055805843, "grad_norm": 0.11192671209573746, "learning_rate": 0.00025438760983563285, "loss": 0.40953508019447327, "memory(GiB)": 78.26, "step": 1509, "token_acc": 0.8821702104868513, "train_speed(iter/s)": 0.032156 }, { "epoch": 0.2925931308433852, "grad_norm": 0.10715434700250626, "learning_rate": 0.0002543185539387632, "loss": 0.3857203722000122, "memory(GiB)": 78.26, "step": 1510, "token_acc": 0.8867008985879332, "train_speed(iter/s)": 0.032158 }, { "epoch": 0.2927869011287119, "grad_norm": 0.1083342507481575, "learning_rate": 0.0002542494551954602, "loss": 0.3879879117012024, "memory(GiB)": 78.26, "step": 1511, "token_acc": 0.886165023879687, "train_speed(iter/s)": 0.032159 }, { "epoch": 0.29298067141403866, "grad_norm": 0.12331650406122208, "learning_rate": 0.0002541803136341048, "loss": 0.4193812608718872, "memory(GiB)": 78.26, "step": 1512, "token_acc": 0.8781190284970506, "train_speed(iter/s)": 0.032161 }, { "epoch": 0.2931744416993654, "grad_norm": 0.10431553423404694, "learning_rate": 0.0002541111292830951, "loss": 0.3808317184448242, "memory(GiB)": 78.26, "step": 1513, "token_acc": 0.888267724649629, "train_speed(iter/s)": 0.032163 }, { "epoch": 0.29336821198469215, "grad_norm": 0.09962674230337143, "learning_rate": 0.00025404190217084697, "loss": 0.3735466003417969, "memory(GiB)": 78.26, "step": 1514, "token_acc": 0.8892632578004508, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.2935619822700189, "grad_norm": 0.1109800636768341, "learning_rate": 0.000253972632325794, "loss": 0.42662137746810913, "memory(GiB)": 78.26, "step": 1515, "token_acc": 0.8756379486330658, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.29375575255534564, "grad_norm": 0.10075878351926804, "learning_rate": 0.000253903319776387, "loss": 0.3623042702674866, "memory(GiB)": 78.26, "step": 1516, "token_acc": 0.8929071827001877, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.2939495228406724, "grad_norm": 0.10288105905056, "learning_rate": 0.0002538339645510946, "loss": 0.38466522097587585, "memory(GiB)": 78.26, "step": 1517, "token_acc": 0.8891691358327007, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.29414329312599913, "grad_norm": 0.11288794130086899, "learning_rate": 0.00025376456667840284, "loss": 0.39476266503334045, "memory(GiB)": 78.26, "step": 1518, "token_acc": 0.8875907973596265, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.29433706341132587, "grad_norm": 0.12219920009374619, "learning_rate": 0.0002536951261868153, "loss": 0.40511706471443176, "memory(GiB)": 78.26, "step": 1519, "token_acc": 0.8838210765731614, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.2945308336966526, "grad_norm": 0.1097761020064354, "learning_rate": 0.000253625643104853, "loss": 0.4082501530647278, "memory(GiB)": 78.26, "step": 1520, "token_acc": 0.8827916295636687, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.29472460398197936, "grad_norm": 0.21266750991344452, "learning_rate": 0.0002535561174610546, "loss": 0.4297519028186798, "memory(GiB)": 78.26, "step": 1521, "token_acc": 0.8791513393088441, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.2949183742673061, "grad_norm": 0.10368236899375916, "learning_rate": 0.00025348654928397614, "loss": 0.3687269389629364, "memory(GiB)": 78.26, "step": 1522, "token_acc": 0.8922836811893236, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.29511214455263285, "grad_norm": 0.11547058075666428, "learning_rate": 0.000253416938602191, "loss": 0.38897940516471863, "memory(GiB)": 78.26, "step": 1523, "token_acc": 0.8873057637889094, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.2953059148379596, "grad_norm": 0.40387094020843506, "learning_rate": 0.0002533472854442903, "loss": 0.4147917926311493, "memory(GiB)": 78.26, "step": 1524, "token_acc": 0.8777598930968507, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.29549968512328634, "grad_norm": 0.10571064054965973, "learning_rate": 0.0002532775898388824, "loss": 0.38508111238479614, "memory(GiB)": 78.26, "step": 1525, "token_acc": 0.8872821274090757, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.2956934554086131, "grad_norm": 0.1151953935623169, "learning_rate": 0.0002532078518145931, "loss": 0.4073837697505951, "memory(GiB)": 78.26, "step": 1526, "token_acc": 0.8840981728275945, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.2958872256939398, "grad_norm": 0.11016645282506943, "learning_rate": 0.0002531380714000659, "loss": 0.39761802554130554, "memory(GiB)": 78.26, "step": 1527, "token_acc": 0.8864857603439011, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.29608099597926657, "grad_norm": 0.10478914529085159, "learning_rate": 0.00025306824862396127, "loss": 0.3668254613876343, "memory(GiB)": 78.26, "step": 1528, "token_acc": 0.8923076923076924, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.2962747662645933, "grad_norm": 0.3135347068309784, "learning_rate": 0.0002529983835149574, "loss": 0.3972938656806946, "memory(GiB)": 78.26, "step": 1529, "token_acc": 0.8850560993077107, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.29646853654992006, "grad_norm": 0.11302391439676285, "learning_rate": 0.00025292847610174974, "loss": 0.4002307057380676, "memory(GiB)": 78.26, "step": 1530, "token_acc": 0.8843288710519805, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.2966623068352468, "grad_norm": 0.1242542490363121, "learning_rate": 0.0002528585264130511, "loss": 0.40723949670791626, "memory(GiB)": 78.26, "step": 1531, "token_acc": 0.8828371991110276, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.29685607712057355, "grad_norm": 0.11407161504030228, "learning_rate": 0.00025278853447759184, "loss": 0.38927173614501953, "memory(GiB)": 78.26, "step": 1532, "token_acc": 0.888841623906628, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.2970498474059003, "grad_norm": 0.10127895325422287, "learning_rate": 0.0002527185003241194, "loss": 0.38897159695625305, "memory(GiB)": 78.26, "step": 1533, "token_acc": 0.884356180097505, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.29724361769122704, "grad_norm": 0.11514552682638168, "learning_rate": 0.0002526484239813987, "loss": 0.41849443316459656, "memory(GiB)": 78.26, "step": 1534, "token_acc": 0.8769607696674535, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.2974373879765538, "grad_norm": 0.10013052076101303, "learning_rate": 0.00025257830547821205, "loss": 0.35412314534187317, "memory(GiB)": 78.26, "step": 1535, "token_acc": 0.8960811384876806, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.2976311582618805, "grad_norm": 0.107215017080307, "learning_rate": 0.0002525081448433589, "loss": 0.4127185642719269, "memory(GiB)": 78.26, "step": 1536, "token_acc": 0.8802447552447552, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.29782492854720727, "grad_norm": 0.10350769758224487, "learning_rate": 0.00025243794210565623, "loss": 0.35154351592063904, "memory(GiB)": 78.26, "step": 1537, "token_acc": 0.8971347925653725, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.298018698832534, "grad_norm": 0.11700846254825592, "learning_rate": 0.00025236769729393806, "loss": 0.42805176973342896, "memory(GiB)": 78.26, "step": 1538, "token_acc": 0.8778269617706237, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.29821246911786076, "grad_norm": 0.11825554072856903, "learning_rate": 0.0002522974104370559, "loss": 0.42821887135505676, "memory(GiB)": 78.26, "step": 1539, "token_acc": 0.8769199262748311, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.2984062394031875, "grad_norm": 0.11428213119506836, "learning_rate": 0.0002522270815638784, "loss": 0.4251292645931244, "memory(GiB)": 78.26, "step": 1540, "token_acc": 0.8747427227286092, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.29860000968851425, "grad_norm": 0.11575201153755188, "learning_rate": 0.00025215671070329164, "loss": 0.4403133988380432, "memory(GiB)": 78.26, "step": 1541, "token_acc": 0.872397366589948, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.298793779973841, "grad_norm": 0.10235166549682617, "learning_rate": 0.0002520862978841987, "loss": 0.3714492619037628, "memory(GiB)": 78.26, "step": 1542, "token_acc": 0.8913319238900634, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.29898755025916773, "grad_norm": 0.11813291907310486, "learning_rate": 0.00025201584313552, "loss": 0.4298678934574127, "memory(GiB)": 78.26, "step": 1543, "token_acc": 0.875, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.2991813205444945, "grad_norm": 0.1107359528541565, "learning_rate": 0.0002519453464861933, "loss": 0.3796873390674591, "memory(GiB)": 78.26, "step": 1544, "token_acc": 0.8897960202731381, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.2993750908298212, "grad_norm": 0.12202285975217819, "learning_rate": 0.0002518748079651734, "loss": 0.3539576232433319, "memory(GiB)": 78.26, "step": 1545, "token_acc": 0.8958181376124483, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.29956886111514797, "grad_norm": 0.10455404222011566, "learning_rate": 0.00025180422760143244, "loss": 0.3648805618286133, "memory(GiB)": 78.26, "step": 1546, "token_acc": 0.8921280583557064, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.2997626314004747, "grad_norm": 0.10837128758430481, "learning_rate": 0.0002517336054239596, "loss": 0.4049074351787567, "memory(GiB)": 78.26, "step": 1547, "token_acc": 0.8826463706255946, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.29995640168580145, "grad_norm": 0.11739413440227509, "learning_rate": 0.00025166294146176124, "loss": 0.38583841919898987, "memory(GiB)": 78.26, "step": 1548, "token_acc": 0.8861551627309925, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.30015017197112825, "grad_norm": 0.10927794128656387, "learning_rate": 0.00025159223574386114, "loss": 0.40299391746520996, "memory(GiB)": 78.26, "step": 1549, "token_acc": 0.8808586662200263, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.300343942256455, "grad_norm": 0.10551624745130539, "learning_rate": 0.0002515214882992999, "loss": 0.3709675073623657, "memory(GiB)": 78.26, "step": 1550, "token_acc": 0.8917359439947495, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.30053771254178174, "grad_norm": 0.11021958291530609, "learning_rate": 0.00025145069915713536, "loss": 0.3905988335609436, "memory(GiB)": 78.26, "step": 1551, "token_acc": 0.8831299218774511, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.3007314828271085, "grad_norm": 0.09675493836402893, "learning_rate": 0.0002513798683464427, "loss": 0.3553582429885864, "memory(GiB)": 78.26, "step": 1552, "token_acc": 0.895078622611465, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.30092525311243523, "grad_norm": 0.11715512722730637, "learning_rate": 0.0002513089958963139, "loss": 0.38959836959838867, "memory(GiB)": 78.26, "step": 1553, "token_acc": 0.8870248193528119, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.301119023397762, "grad_norm": 0.10786168277263641, "learning_rate": 0.00025123808183585817, "loss": 0.3897908329963684, "memory(GiB)": 78.26, "step": 1554, "token_acc": 0.8833446083074009, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.3013127936830887, "grad_norm": 0.11172870546579361, "learning_rate": 0.00025116712619420185, "loss": 0.37313905358314514, "memory(GiB)": 78.26, "step": 1555, "token_acc": 0.8884811242923387, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.30150656396841546, "grad_norm": 0.11459864675998688, "learning_rate": 0.0002510961290004884, "loss": 0.41675081849098206, "memory(GiB)": 78.26, "step": 1556, "token_acc": 0.8796063237082496, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.3017003342537422, "grad_norm": 0.12527529895305634, "learning_rate": 0.00025102509028387813, "loss": 0.3714950680732727, "memory(GiB)": 78.26, "step": 1557, "token_acc": 0.8910757252156046, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.30189410453906895, "grad_norm": 0.11035227030515671, "learning_rate": 0.00025095401007354867, "loss": 0.39393147826194763, "memory(GiB)": 78.26, "step": 1558, "token_acc": 0.8863604634929498, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.3020878748243957, "grad_norm": 0.10444723814725876, "learning_rate": 0.0002508828883986945, "loss": 0.3709162771701813, "memory(GiB)": 78.26, "step": 1559, "token_acc": 0.8904844563673663, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.30228164510972244, "grad_norm": 0.10122967511415482, "learning_rate": 0.0002508117252885273, "loss": 0.34817183017730713, "memory(GiB)": 78.26, "step": 1560, "token_acc": 0.8977317721590587, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.3024754153950492, "grad_norm": 0.10534863919019699, "learning_rate": 0.00025074052077227556, "loss": 0.4088999927043915, "memory(GiB)": 78.26, "step": 1561, "token_acc": 0.8811859443631039, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.30266918568037593, "grad_norm": 0.0965069830417633, "learning_rate": 0.000250669274879185, "loss": 0.3669951558113098, "memory(GiB)": 78.26, "step": 1562, "token_acc": 0.8907134896627016, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.3028629559657027, "grad_norm": 0.10454553365707397, "learning_rate": 0.0002505979876385181, "loss": 0.40625712275505066, "memory(GiB)": 78.26, "step": 1563, "token_acc": 0.8799307565696124, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.3030567262510294, "grad_norm": 0.10287556052207947, "learning_rate": 0.0002505266590795545, "loss": 0.415050208568573, "memory(GiB)": 78.26, "step": 1564, "token_acc": 0.8815440689198144, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.30325049653635616, "grad_norm": 0.11195293813943863, "learning_rate": 0.00025045528923159073, "loss": 0.4277609586715698, "memory(GiB)": 78.26, "step": 1565, "token_acc": 0.8775790135451792, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.3034442668216829, "grad_norm": 0.10438180714845657, "learning_rate": 0.0002503838781239404, "loss": 0.3552014231681824, "memory(GiB)": 78.26, "step": 1566, "token_acc": 0.896136847274513, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.30363803710700965, "grad_norm": 0.12397732585668564, "learning_rate": 0.0002503124257859339, "loss": 0.46164411306381226, "memory(GiB)": 78.26, "step": 1567, "token_acc": 0.8641721234798877, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.3038318073923364, "grad_norm": 0.11469985544681549, "learning_rate": 0.0002502409322469186, "loss": 0.39702335000038147, "memory(GiB)": 78.26, "step": 1568, "token_acc": 0.883688332266813, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.30402557767766314, "grad_norm": 0.11066870391368866, "learning_rate": 0.00025016939753625886, "loss": 0.40544670820236206, "memory(GiB)": 78.26, "step": 1569, "token_acc": 0.8816060016671298, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.3042193479629899, "grad_norm": 0.09953310340642929, "learning_rate": 0.0002500978216833359, "loss": 0.39714691042900085, "memory(GiB)": 78.26, "step": 1570, "token_acc": 0.8836974458757173, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.3044131182483166, "grad_norm": 0.11722380667924881, "learning_rate": 0.00025002620471754785, "loss": 0.41713088750839233, "memory(GiB)": 78.26, "step": 1571, "token_acc": 0.8797300061362242, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.30460688853364337, "grad_norm": 0.11423590779304504, "learning_rate": 0.00024995454666830967, "loss": 0.40586379170417786, "memory(GiB)": 78.26, "step": 1572, "token_acc": 0.8844951044863364, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.3048006588189701, "grad_norm": 0.11461616307497025, "learning_rate": 0.00024988284756505334, "loss": 0.3949568271636963, "memory(GiB)": 78.26, "step": 1573, "token_acc": 0.8844877454711524, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.30499442910429686, "grad_norm": 0.09611064195632935, "learning_rate": 0.0002498111074372276, "loss": 0.3817123770713806, "memory(GiB)": 78.26, "step": 1574, "token_acc": 0.8891472188103176, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.3051881993896236, "grad_norm": 0.12749864161014557, "learning_rate": 0.0002497393263142979, "loss": 0.43550729751586914, "memory(GiB)": 78.26, "step": 1575, "token_acc": 0.876248012718601, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.30538196967495035, "grad_norm": 0.10742782801389694, "learning_rate": 0.00024966750422574684, "loss": 0.3822196125984192, "memory(GiB)": 78.26, "step": 1576, "token_acc": 0.8897888795986622, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.3055757399602771, "grad_norm": 0.1094922348856926, "learning_rate": 0.0002495956412010736, "loss": 0.3977155089378357, "memory(GiB)": 78.26, "step": 1577, "token_acc": 0.8843427182499494, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.30576951024560384, "grad_norm": 0.11330767720937729, "learning_rate": 0.0002495237372697943, "loss": 0.3944382071495056, "memory(GiB)": 78.26, "step": 1578, "token_acc": 0.8850218853421586, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.3059632805309306, "grad_norm": 0.10812770575284958, "learning_rate": 0.0002494517924614418, "loss": 0.39382147789001465, "memory(GiB)": 78.26, "step": 1579, "token_acc": 0.8852722927933593, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.3061570508162573, "grad_norm": 0.12108810991048813, "learning_rate": 0.00024937980680556576, "loss": 0.4625246226787567, "memory(GiB)": 78.26, "step": 1580, "token_acc": 0.8664573098687892, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.30635082110158407, "grad_norm": 0.11010700464248657, "learning_rate": 0.00024930778033173265, "loss": 0.40388697385787964, "memory(GiB)": 78.26, "step": 1581, "token_acc": 0.8823420361501623, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.3065445913869108, "grad_norm": 0.1091642677783966, "learning_rate": 0.0002492357130695256, "loss": 0.4263598322868347, "memory(GiB)": 78.26, "step": 1582, "token_acc": 0.8781599433828733, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.30673836167223756, "grad_norm": 0.10464166849851608, "learning_rate": 0.0002491636050485447, "loss": 0.37317129969596863, "memory(GiB)": 78.26, "step": 1583, "token_acc": 0.890936937421071, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.3069321319575643, "grad_norm": 0.10919937491416931, "learning_rate": 0.00024909145629840645, "loss": 0.39577916264533997, "memory(GiB)": 78.26, "step": 1584, "token_acc": 0.8861203036791901, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.30712590224289105, "grad_norm": 0.10316384583711624, "learning_rate": 0.0002490192668487445, "loss": 0.3726585805416107, "memory(GiB)": 78.26, "step": 1585, "token_acc": 0.8914358661264361, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.3073196725282178, "grad_norm": 0.11321194469928741, "learning_rate": 0.00024894703672920894, "loss": 0.4326629340648651, "memory(GiB)": 78.26, "step": 1586, "token_acc": 0.8764373258299288, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.30751344281354454, "grad_norm": 0.10429013520479202, "learning_rate": 0.0002488747659694665, "loss": 0.36875998973846436, "memory(GiB)": 78.26, "step": 1587, "token_acc": 0.8898948094499052, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.3077072130988713, "grad_norm": 0.10509074479341507, "learning_rate": 0.0002488024545992009, "loss": 0.36970990896224976, "memory(GiB)": 78.26, "step": 1588, "token_acc": 0.8925745257452574, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.307900983384198, "grad_norm": 0.09186790883541107, "learning_rate": 0.0002487301026481122, "loss": 0.3301558494567871, "memory(GiB)": 78.26, "step": 1589, "token_acc": 0.9027383654937571, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.30809475366952477, "grad_norm": 0.11293933540582657, "learning_rate": 0.00024865771014591733, "loss": 0.38940760493278503, "memory(GiB)": 78.26, "step": 1590, "token_acc": 0.8861825562725317, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.3082885239548515, "grad_norm": 0.11680306494235992, "learning_rate": 0.0002485852771223499, "loss": 0.41928166151046753, "memory(GiB)": 78.26, "step": 1591, "token_acc": 0.8788892413276046, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.30848229424017826, "grad_norm": 0.10015156865119934, "learning_rate": 0.00024851280360716014, "loss": 0.3776377737522125, "memory(GiB)": 78.26, "step": 1592, "token_acc": 0.8896725440806046, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.308676064525505, "grad_norm": 0.11279778182506561, "learning_rate": 0.00024844028963011476, "loss": 0.39112144708633423, "memory(GiB)": 78.26, "step": 1593, "token_acc": 0.8870161362751848, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.30886983481083174, "grad_norm": 0.09899583458900452, "learning_rate": 0.0002483677352209972, "loss": 0.35154473781585693, "memory(GiB)": 78.26, "step": 1594, "token_acc": 0.897304444024716, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.3090636050961585, "grad_norm": 0.11332813650369644, "learning_rate": 0.0002482951404096076, "loss": 0.40011727809906006, "memory(GiB)": 78.26, "step": 1595, "token_acc": 0.8834134615384616, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.30925737538148523, "grad_norm": 0.10168785601854324, "learning_rate": 0.00024822250522576247, "loss": 0.4107932150363922, "memory(GiB)": 78.26, "step": 1596, "token_acc": 0.8791540056914098, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.309451145666812, "grad_norm": 0.10856655985116959, "learning_rate": 0.0002481498296992951, "loss": 0.40053847432136536, "memory(GiB)": 78.26, "step": 1597, "token_acc": 0.8820065490768003, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.3096449159521387, "grad_norm": 0.10789895802736282, "learning_rate": 0.0002480771138600553, "loss": 0.3848547041416168, "memory(GiB)": 78.26, "step": 1598, "token_acc": 0.8867153284671533, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.30983868623746547, "grad_norm": 0.11236843466758728, "learning_rate": 0.00024800435773790946, "loss": 0.40924182534217834, "memory(GiB)": 78.26, "step": 1599, "token_acc": 0.8796616904126696, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.3100324565227922, "grad_norm": 0.10669440031051636, "learning_rate": 0.00024793156136274037, "loss": 0.38883164525032043, "memory(GiB)": 78.26, "step": 1600, "token_acc": 0.8854082720253724, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.31022622680811895, "grad_norm": 0.10899162292480469, "learning_rate": 0.0002478587247644475, "loss": 0.3585772216320038, "memory(GiB)": 78.26, "step": 1601, "token_acc": 0.8958569414886266, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.3104199970934457, "grad_norm": 0.11266548186540604, "learning_rate": 0.00024778584797294684, "loss": 0.3955519199371338, "memory(GiB)": 78.26, "step": 1602, "token_acc": 0.8861248676036985, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.31061376737877244, "grad_norm": 0.1098904013633728, "learning_rate": 0.0002477129310181708, "loss": 0.42345693707466125, "memory(GiB)": 78.26, "step": 1603, "token_acc": 0.8796804389928986, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.3108075376640992, "grad_norm": 0.10730142146348953, "learning_rate": 0.0002476399739300683, "loss": 0.40332454442977905, "memory(GiB)": 78.26, "step": 1604, "token_acc": 0.8845008085794536, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.31100130794942593, "grad_norm": 0.10026847571134567, "learning_rate": 0.0002475669767386049, "loss": 0.3628842830657959, "memory(GiB)": 78.26, "step": 1605, "token_acc": 0.8923505698501966, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.3111950782347527, "grad_norm": 0.12212540209293365, "learning_rate": 0.00024749393947376234, "loss": 0.38043132424354553, "memory(GiB)": 78.26, "step": 1606, "token_acc": 0.8881142442463197, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.3113888485200794, "grad_norm": 0.10906349122524261, "learning_rate": 0.00024742086216553914, "loss": 0.4101215898990631, "memory(GiB)": 78.26, "step": 1607, "token_acc": 0.8820161592380544, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.31158261880540616, "grad_norm": 0.10822609812021255, "learning_rate": 0.00024734774484395, "loss": 0.40345677733421326, "memory(GiB)": 78.26, "step": 1608, "token_acc": 0.8825836216839678, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.31177638909073296, "grad_norm": 0.13473811745643616, "learning_rate": 0.00024727458753902624, "loss": 0.492877721786499, "memory(GiB)": 78.26, "step": 1609, "token_acc": 0.8599673445153629, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.3119701593760597, "grad_norm": 0.10517790168523788, "learning_rate": 0.0002472013902808155, "loss": 0.375487744808197, "memory(GiB)": 78.26, "step": 1610, "token_acc": 0.8903864596563075, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.31216392966138645, "grad_norm": 0.11642046272754669, "learning_rate": 0.00024712815309938186, "loss": 0.39327579736709595, "memory(GiB)": 78.26, "step": 1611, "token_acc": 0.8876873924797247, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.3123576999467132, "grad_norm": 0.10896303504705429, "learning_rate": 0.00024705487602480583, "loss": 0.38862502574920654, "memory(GiB)": 78.26, "step": 1612, "token_acc": 0.88512, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.31255147023203994, "grad_norm": 0.10455754399299622, "learning_rate": 0.0002469815590871842, "loss": 0.3765313923358917, "memory(GiB)": 78.26, "step": 1613, "token_acc": 0.890468422279189, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.3127452405173667, "grad_norm": 0.10820908099412918, "learning_rate": 0.00024690820231663036, "loss": 0.40285009145736694, "memory(GiB)": 78.26, "step": 1614, "token_acc": 0.8830803366074086, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.31293901080269343, "grad_norm": 0.10197239369153976, "learning_rate": 0.0002468348057432737, "loss": 0.3643296957015991, "memory(GiB)": 78.26, "step": 1615, "token_acc": 0.8937665096266442, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.3131327810880202, "grad_norm": 0.12070825695991516, "learning_rate": 0.00024676136939726036, "loss": 0.4440290331840515, "memory(GiB)": 78.26, "step": 1616, "token_acc": 0.8726066239878395, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.3133265513733469, "grad_norm": 0.10100740194320679, "learning_rate": 0.0002466878933087525, "loss": 0.38823482394218445, "memory(GiB)": 78.26, "step": 1617, "token_acc": 0.8885969521807672, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.31352032165867366, "grad_norm": 0.10464068502187729, "learning_rate": 0.00024661437750792865, "loss": 0.3649863004684448, "memory(GiB)": 78.26, "step": 1618, "token_acc": 0.8917689623982427, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.3137140919440004, "grad_norm": 0.09842365980148315, "learning_rate": 0.00024654082202498395, "loss": 0.37648287415504456, "memory(GiB)": 78.26, "step": 1619, "token_acc": 0.8893629913904108, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.31390786222932715, "grad_norm": 0.11136578768491745, "learning_rate": 0.00024646722689012946, "loss": 0.3890235722064972, "memory(GiB)": 78.26, "step": 1620, "token_acc": 0.8880463144161774, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.3141016325146539, "grad_norm": 0.12551820278167725, "learning_rate": 0.0002463935921335927, "loss": 0.39017459750175476, "memory(GiB)": 78.26, "step": 1621, "token_acc": 0.8850291533218363, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.31429540279998064, "grad_norm": 0.12295407056808472, "learning_rate": 0.00024631991778561747, "loss": 0.4268419146537781, "memory(GiB)": 78.26, "step": 1622, "token_acc": 0.8766729419818586, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.3144891730853074, "grad_norm": 0.10500724613666534, "learning_rate": 0.00024624620387646377, "loss": 0.3578689992427826, "memory(GiB)": 78.26, "step": 1623, "token_acc": 0.8940465211599549, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.3146829433706341, "grad_norm": 0.1127677783370018, "learning_rate": 0.0002461724504364079, "loss": 0.41033974289894104, "memory(GiB)": 78.26, "step": 1624, "token_acc": 0.8811105837683911, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.31487671365596087, "grad_norm": 0.10852906107902527, "learning_rate": 0.0002460986574957424, "loss": 0.4147697687149048, "memory(GiB)": 78.26, "step": 1625, "token_acc": 0.8803296877476621, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.3150704839412876, "grad_norm": 0.10529500991106033, "learning_rate": 0.000246024825084776, "loss": 0.3946833312511444, "memory(GiB)": 78.26, "step": 1626, "token_acc": 0.8831530219494435, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.31526425422661436, "grad_norm": 0.10391051322221756, "learning_rate": 0.00024595095323383365, "loss": 0.3614901900291443, "memory(GiB)": 78.26, "step": 1627, "token_acc": 0.8932863813825308, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.3154580245119411, "grad_norm": 0.11637852340936661, "learning_rate": 0.00024587704197325655, "loss": 0.43050843477249146, "memory(GiB)": 78.26, "step": 1628, "token_acc": 0.8762489252571971, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.31565179479726785, "grad_norm": 0.11949366331100464, "learning_rate": 0.0002458030913334019, "loss": 0.4221411347389221, "memory(GiB)": 78.26, "step": 1629, "token_acc": 0.8803364945335391, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.3158455650825946, "grad_norm": 0.10639101266860962, "learning_rate": 0.0002457291013446434, "loss": 0.387704074382782, "memory(GiB)": 78.26, "step": 1630, "token_acc": 0.8859933917540583, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.31603933536792134, "grad_norm": 0.12497800588607788, "learning_rate": 0.00024565507203737054, "loss": 0.4306849539279938, "memory(GiB)": 78.26, "step": 1631, "token_acc": 0.8744147105336599, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.3162331056532481, "grad_norm": 0.10723750293254852, "learning_rate": 0.0002455810034419893, "loss": 0.41962188482284546, "memory(GiB)": 78.26, "step": 1632, "token_acc": 0.8786517987789495, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.3164268759385748, "grad_norm": 0.10330141335725784, "learning_rate": 0.0002455068955889216, "loss": 0.39232727885246277, "memory(GiB)": 78.26, "step": 1633, "token_acc": 0.8855892466395748, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.31662064622390157, "grad_norm": 0.09860360622406006, "learning_rate": 0.0002454327485086055, "loss": 0.37398630380630493, "memory(GiB)": 78.26, "step": 1634, "token_acc": 0.8935310637039957, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.3168144165092283, "grad_norm": 0.10215026140213013, "learning_rate": 0.00024535856223149524, "loss": 0.3739304840564728, "memory(GiB)": 78.26, "step": 1635, "token_acc": 0.889224391616981, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.31700818679455506, "grad_norm": 0.10608868300914764, "learning_rate": 0.000245284336788061, "loss": 0.40002018213272095, "memory(GiB)": 78.26, "step": 1636, "token_acc": 0.8821046707934721, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.3172019570798818, "grad_norm": 0.10536207258701324, "learning_rate": 0.0002452100722087893, "loss": 0.38114506006240845, "memory(GiB)": 78.26, "step": 1637, "token_acc": 0.8877112648882257, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.31739572736520855, "grad_norm": 0.10534202307462692, "learning_rate": 0.00024513576852418256, "loss": 0.3625592887401581, "memory(GiB)": 78.26, "step": 1638, "token_acc": 0.8946749986647439, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.3175894976505353, "grad_norm": 0.10395082831382751, "learning_rate": 0.0002450614257647593, "loss": 0.40679264068603516, "memory(GiB)": 78.26, "step": 1639, "token_acc": 0.882569104812343, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.31778326793586203, "grad_norm": 0.11221817880868912, "learning_rate": 0.00024498704396105404, "loss": 0.4055024981498718, "memory(GiB)": 78.26, "step": 1640, "token_acc": 0.882479675964494, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.3179770382211888, "grad_norm": 0.09974303096532822, "learning_rate": 0.00024491262314361745, "loss": 0.38338178396224976, "memory(GiB)": 78.26, "step": 1641, "token_acc": 0.8880624860078108, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.3181708085065155, "grad_norm": 0.10050232708454132, "learning_rate": 0.0002448381633430161, "loss": 0.35944464802742004, "memory(GiB)": 78.26, "step": 1642, "token_acc": 0.8940914437298231, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.31836457879184227, "grad_norm": 0.11217102408409119, "learning_rate": 0.0002447636645898327, "loss": 0.39601394534111023, "memory(GiB)": 78.26, "step": 1643, "token_acc": 0.8860522531810413, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.318558349077169, "grad_norm": 0.11227677762508392, "learning_rate": 0.00024468912691466587, "loss": 0.39023369550704956, "memory(GiB)": 78.26, "step": 1644, "token_acc": 0.8874575239640919, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.31875211936249576, "grad_norm": 0.10062558948993683, "learning_rate": 0.00024461455034813017, "loss": 0.37629178166389465, "memory(GiB)": 78.26, "step": 1645, "token_acc": 0.8884143616228832, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.3189458896478225, "grad_norm": 0.12126388400793076, "learning_rate": 0.0002445399349208563, "loss": 0.4465317726135254, "memory(GiB)": 78.26, "step": 1646, "token_acc": 0.8706858890345129, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.31913965993314924, "grad_norm": 0.10243180394172668, "learning_rate": 0.00024446528066349074, "loss": 0.3463560938835144, "memory(GiB)": 78.26, "step": 1647, "token_acc": 0.8977993900931345, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.319333430218476, "grad_norm": 0.09814903885126114, "learning_rate": 0.00024439058760669603, "loss": 0.3848741054534912, "memory(GiB)": 78.26, "step": 1648, "token_acc": 0.8868541204061775, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.31952720050380273, "grad_norm": 0.10367216914892197, "learning_rate": 0.00024431585578115064, "loss": 0.38090386986732483, "memory(GiB)": 78.26, "step": 1649, "token_acc": 0.8876089324618737, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.3197209707891295, "grad_norm": 0.11322217434644699, "learning_rate": 0.00024424108521754886, "loss": 0.3961893916130066, "memory(GiB)": 78.26, "step": 1650, "token_acc": 0.8858755383929897, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.3199147410744562, "grad_norm": 0.11258938163518906, "learning_rate": 0.00024416627594660105, "loss": 0.40742677450180054, "memory(GiB)": 78.26, "step": 1651, "token_acc": 0.8804218539370391, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.32010851135978297, "grad_norm": 0.10289296507835388, "learning_rate": 0.00024409142799903342, "loss": 0.37639862298965454, "memory(GiB)": 78.26, "step": 1652, "token_acc": 0.8907713884992987, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.3203022816451097, "grad_norm": 0.10312007367610931, "learning_rate": 0.00024401654140558795, "loss": 0.39015206694602966, "memory(GiB)": 78.26, "step": 1653, "token_acc": 0.8851129761136217, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.32049605193043645, "grad_norm": 0.09907987713813782, "learning_rate": 0.00024394161619702257, "loss": 0.35583794116973877, "memory(GiB)": 78.26, "step": 1654, "token_acc": 0.8962283436398017, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.3206898222157632, "grad_norm": 0.11065588891506195, "learning_rate": 0.00024386665240411115, "loss": 0.3847392499446869, "memory(GiB)": 78.26, "step": 1655, "token_acc": 0.886949811533553, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.32088359250108994, "grad_norm": 0.10860442370176315, "learning_rate": 0.0002437916500576433, "loss": 0.4008861184120178, "memory(GiB)": 78.33, "step": 1656, "token_acc": 0.8821946594228897, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.3210773627864167, "grad_norm": 0.11284346878528595, "learning_rate": 0.0002437166091884244, "loss": 0.3918432593345642, "memory(GiB)": 78.33, "step": 1657, "token_acc": 0.8856465073739568, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.32127113307174343, "grad_norm": 0.11324102431535721, "learning_rate": 0.00024364152982727592, "loss": 0.4203750789165497, "memory(GiB)": 78.33, "step": 1658, "token_acc": 0.8781059802334023, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.3214649033570702, "grad_norm": 0.1033095270395279, "learning_rate": 0.0002435664120050349, "loss": 0.3659226894378662, "memory(GiB)": 78.33, "step": 1659, "token_acc": 0.8937110992198721, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.3216586736423969, "grad_norm": 0.11475743353366852, "learning_rate": 0.0002434912557525542, "loss": 0.4008634686470032, "memory(GiB)": 78.33, "step": 1660, "token_acc": 0.8813870157718026, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.32185244392772366, "grad_norm": 0.11086868494749069, "learning_rate": 0.0002434160611007026, "loss": 0.4097922742366791, "memory(GiB)": 78.33, "step": 1661, "token_acc": 0.8791088493742418, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.3220462142130504, "grad_norm": 0.09878856688737869, "learning_rate": 0.0002433408280803645, "loss": 0.3229445517063141, "memory(GiB)": 78.33, "step": 1662, "token_acc": 0.9044224128327949, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.32223998449837715, "grad_norm": 0.1650267392396927, "learning_rate": 0.00024326555672244012, "loss": 0.3968316912651062, "memory(GiB)": 78.33, "step": 1663, "token_acc": 0.8846646732165742, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.3224337547837039, "grad_norm": 0.11585035920143127, "learning_rate": 0.0002431902470578455, "loss": 0.4106610417366028, "memory(GiB)": 78.33, "step": 1664, "token_acc": 0.8812659238112779, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.32262752506903064, "grad_norm": 0.10727370530366898, "learning_rate": 0.00024311489911751224, "loss": 0.3810875117778778, "memory(GiB)": 78.33, "step": 1665, "token_acc": 0.8862208987547374, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.3228212953543574, "grad_norm": 0.10232355445623398, "learning_rate": 0.00024303951293238785, "loss": 0.40213391184806824, "memory(GiB)": 78.33, "step": 1666, "token_acc": 0.8846539618856569, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.32301506563968413, "grad_norm": 0.10427332669496536, "learning_rate": 0.00024296408853343544, "loss": 0.381198525428772, "memory(GiB)": 78.33, "step": 1667, "token_acc": 0.886510858088634, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.3232088359250109, "grad_norm": 0.11577033996582031, "learning_rate": 0.0002428886259516338, "loss": 0.38930296897888184, "memory(GiB)": 78.33, "step": 1668, "token_acc": 0.8849813571961223, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.3234026062103376, "grad_norm": 0.10479265451431274, "learning_rate": 0.0002428131252179775, "loss": 0.38501253724098206, "memory(GiB)": 78.33, "step": 1669, "token_acc": 0.8847304574878965, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.3235963764956644, "grad_norm": 0.10768333077430725, "learning_rate": 0.00024273758636347663, "loss": 0.4145703613758087, "memory(GiB)": 78.33, "step": 1670, "token_acc": 0.8764765532929537, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.32379014678099116, "grad_norm": 0.1042892262339592, "learning_rate": 0.00024266200941915712, "loss": 0.3692252039909363, "memory(GiB)": 78.33, "step": 1671, "token_acc": 0.8922046134447549, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.3239839170663179, "grad_norm": 0.11078062653541565, "learning_rate": 0.00024258639441606042, "loss": 0.4250616133213043, "memory(GiB)": 78.33, "step": 1672, "token_acc": 0.876742678050362, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.32417768735164465, "grad_norm": 0.10978075116872787, "learning_rate": 0.00024251074138524365, "loss": 0.40720629692077637, "memory(GiB)": 78.33, "step": 1673, "token_acc": 0.8835114016903205, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.3243714576369714, "grad_norm": 0.09793038666248322, "learning_rate": 0.00024243505035777954, "loss": 0.34907984733581543, "memory(GiB)": 78.33, "step": 1674, "token_acc": 0.8983231559561888, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.32456522792229814, "grad_norm": 0.11205164343118668, "learning_rate": 0.0002423593213647564, "loss": 0.3938961625099182, "memory(GiB)": 78.33, "step": 1675, "token_acc": 0.8874208948595537, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.3247589982076249, "grad_norm": 0.12075439840555191, "learning_rate": 0.0002422835544372782, "loss": 0.42155468463897705, "memory(GiB)": 78.33, "step": 1676, "token_acc": 0.8779719830355995, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.3249527684929516, "grad_norm": 0.10721995681524277, "learning_rate": 0.0002422077496064644, "loss": 0.4283701479434967, "memory(GiB)": 78.33, "step": 1677, "token_acc": 0.874123831775701, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.32514653877827837, "grad_norm": 0.12848171591758728, "learning_rate": 0.00024213190690345018, "loss": 0.4088367521762848, "memory(GiB)": 78.33, "step": 1678, "token_acc": 0.8799862555836692, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.3253403090636051, "grad_norm": 0.10981763154268265, "learning_rate": 0.00024205602635938604, "loss": 0.41480377316474915, "memory(GiB)": 78.33, "step": 1679, "token_acc": 0.878927367125194, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.32553407934893186, "grad_norm": 0.10942694544792175, "learning_rate": 0.0002419801080054383, "loss": 0.39324793219566345, "memory(GiB)": 78.33, "step": 1680, "token_acc": 0.8848377444002208, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.3257278496342586, "grad_norm": 0.10424201935529709, "learning_rate": 0.00024190415187278855, "loss": 0.38931453227996826, "memory(GiB)": 78.33, "step": 1681, "token_acc": 0.8873810462300805, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.32592161991958535, "grad_norm": 0.10016170144081116, "learning_rate": 0.0002418281579926341, "loss": 0.3769393861293793, "memory(GiB)": 78.33, "step": 1682, "token_acc": 0.888585472419443, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.3261153902049121, "grad_norm": 0.10119081288576126, "learning_rate": 0.0002417521263961876, "loss": 0.34021657705307007, "memory(GiB)": 78.33, "step": 1683, "token_acc": 0.8990787809976002, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.32630916049023884, "grad_norm": 0.10256388038396835, "learning_rate": 0.00024167605711467738, "loss": 0.3520572781562805, "memory(GiB)": 78.33, "step": 1684, "token_acc": 0.8965411006861784, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.3265029307755656, "grad_norm": 0.10342499613761902, "learning_rate": 0.00024159995017934702, "loss": 0.37822040915489197, "memory(GiB)": 78.33, "step": 1685, "token_acc": 0.8892546779308175, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.3266967010608923, "grad_norm": 0.10906314849853516, "learning_rate": 0.00024152380562145575, "loss": 0.3807571232318878, "memory(GiB)": 78.33, "step": 1686, "token_acc": 0.8904570071765867, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.32689047134621907, "grad_norm": 0.10030427575111389, "learning_rate": 0.00024144762347227822, "loss": 0.35491418838500977, "memory(GiB)": 78.33, "step": 1687, "token_acc": 0.8963004898220396, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.3270842416315458, "grad_norm": 0.10785649716854095, "learning_rate": 0.0002413714037631044, "loss": 0.4118516743183136, "memory(GiB)": 78.33, "step": 1688, "token_acc": 0.8789999213774667, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.32727801191687256, "grad_norm": 0.10498232394456863, "learning_rate": 0.00024129514652523976, "loss": 0.35566800832748413, "memory(GiB)": 78.33, "step": 1689, "token_acc": 0.8963449018301123, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.3274717822021993, "grad_norm": 0.09992900490760803, "learning_rate": 0.0002412188517900053, "loss": 0.3673107326030731, "memory(GiB)": 78.33, "step": 1690, "token_acc": 0.891633487007544, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.32766555248752605, "grad_norm": 0.10761499404907227, "learning_rate": 0.00024114251958873726, "loss": 0.39024174213409424, "memory(GiB)": 78.33, "step": 1691, "token_acc": 0.886815954076521, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.3278593227728528, "grad_norm": 0.10518249124288559, "learning_rate": 0.00024106614995278731, "loss": 0.38077983260154724, "memory(GiB)": 78.33, "step": 1692, "token_acc": 0.8875674808915496, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.32805309305817953, "grad_norm": 0.10248047858476639, "learning_rate": 0.00024098974291352255, "loss": 0.36225420236587524, "memory(GiB)": 78.33, "step": 1693, "token_acc": 0.8952485416562797, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.3282468633435063, "grad_norm": 0.10516565293073654, "learning_rate": 0.00024091329850232536, "loss": 0.391397625207901, "memory(GiB)": 78.33, "step": 1694, "token_acc": 0.8858848043629254, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.328440633628833, "grad_norm": 0.1099853664636612, "learning_rate": 0.00024083681675059356, "loss": 0.38383805751800537, "memory(GiB)": 78.33, "step": 1695, "token_acc": 0.8891934663426214, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.32863440391415977, "grad_norm": 0.11349144577980042, "learning_rate": 0.00024076029768974025, "loss": 0.4004535675048828, "memory(GiB)": 78.33, "step": 1696, "token_acc": 0.883281280229955, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.3288281741994865, "grad_norm": 0.11258336901664734, "learning_rate": 0.00024068374135119384, "loss": 0.4041019678115845, "memory(GiB)": 78.33, "step": 1697, "token_acc": 0.8841508871328463, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.32902194448481326, "grad_norm": 0.10647819191217422, "learning_rate": 0.00024060714776639813, "loss": 0.36608096957206726, "memory(GiB)": 78.33, "step": 1698, "token_acc": 0.8930285460648598, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.32921571477014, "grad_norm": 0.10232997685670853, "learning_rate": 0.00024053051696681208, "loss": 0.37514373660087585, "memory(GiB)": 78.33, "step": 1699, "token_acc": 0.8901580589433514, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.32940948505546674, "grad_norm": 0.10426058620214462, "learning_rate": 0.00024045384898391007, "loss": 0.3706841468811035, "memory(GiB)": 78.33, "step": 1700, "token_acc": 0.8918139787870547, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.3296032553407935, "grad_norm": 0.11431006342172623, "learning_rate": 0.0002403771438491817, "loss": 0.40189433097839355, "memory(GiB)": 78.33, "step": 1701, "token_acc": 0.8844837336666964, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.32979702562612023, "grad_norm": 0.11498620361089706, "learning_rate": 0.00024030040159413185, "loss": 0.4026452898979187, "memory(GiB)": 78.33, "step": 1702, "token_acc": 0.8820736746475984, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.329990795911447, "grad_norm": 0.10887457430362701, "learning_rate": 0.0002402236222502805, "loss": 0.3983825445175171, "memory(GiB)": 78.33, "step": 1703, "token_acc": 0.8844022811427336, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.3301845661967737, "grad_norm": 0.10820619016885757, "learning_rate": 0.00024014680584916322, "loss": 0.37078577280044556, "memory(GiB)": 78.33, "step": 1704, "token_acc": 0.8899442436468578, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.33037833648210047, "grad_norm": 0.12457577884197235, "learning_rate": 0.00024006995242233038, "loss": 0.4524819552898407, "memory(GiB)": 78.33, "step": 1705, "token_acc": 0.867928674577761, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.3305721067674272, "grad_norm": 0.11018163710832596, "learning_rate": 0.0002399930620013478, "loss": 0.39005085825920105, "memory(GiB)": 78.33, "step": 1706, "token_acc": 0.8860519838801287, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.33076587705275395, "grad_norm": 0.10681940615177155, "learning_rate": 0.00023991613461779644, "loss": 0.38101130723953247, "memory(GiB)": 78.33, "step": 1707, "token_acc": 0.890979347101932, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.3309596473380807, "grad_norm": 0.10825340449810028, "learning_rate": 0.00023983917030327248, "loss": 0.36492252349853516, "memory(GiB)": 78.33, "step": 1708, "token_acc": 0.8953067555147058, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.33115341762340744, "grad_norm": 0.0985020250082016, "learning_rate": 0.00023976216908938719, "loss": 0.36632204055786133, "memory(GiB)": 78.33, "step": 1709, "token_acc": 0.8902666632577404, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.3313471879087342, "grad_norm": 0.10576551407575607, "learning_rate": 0.00023968513100776703, "loss": 0.3914228081703186, "memory(GiB)": 78.33, "step": 1710, "token_acc": 0.8861784230125749, "train_speed(iter/s)": 0.03249 }, { "epoch": 0.33154095819406093, "grad_norm": 0.10542413592338562, "learning_rate": 0.00023960805609005365, "loss": 0.40728819370269775, "memory(GiB)": 78.33, "step": 1711, "token_acc": 0.8833345786943635, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.3317347284793877, "grad_norm": 0.10389473289251328, "learning_rate": 0.0002395309443679038, "loss": 0.3726048171520233, "memory(GiB)": 78.33, "step": 1712, "token_acc": 0.8894928520954504, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.3319284987647144, "grad_norm": 0.11621604114770889, "learning_rate": 0.0002394537958729893, "loss": 0.39150145649909973, "memory(GiB)": 78.33, "step": 1713, "token_acc": 0.8860767814882987, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.33212226905004116, "grad_norm": 0.13983365893363953, "learning_rate": 0.00023937661063699707, "loss": 0.4037204384803772, "memory(GiB)": 78.33, "step": 1714, "token_acc": 0.882203057624461, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.3323160393353679, "grad_norm": 0.11795634776353836, "learning_rate": 0.00023929938869162928, "loss": 0.43276962637901306, "memory(GiB)": 78.33, "step": 1715, "token_acc": 0.875131748448296, "train_speed(iter/s)": 0.032498 }, { "epoch": 0.33250980962069465, "grad_norm": 0.10593996942043304, "learning_rate": 0.00023922213006860292, "loss": 0.3847067654132843, "memory(GiB)": 78.33, "step": 1716, "token_acc": 0.888629105839416, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.3327035799060214, "grad_norm": 0.11202581226825714, "learning_rate": 0.00023914483479965025, "loss": 0.4195789396762848, "memory(GiB)": 78.33, "step": 1717, "token_acc": 0.8793484873624247, "train_speed(iter/s)": 0.0325 }, { "epoch": 0.33289735019134814, "grad_norm": 0.11114699393510818, "learning_rate": 0.00023906750291651858, "loss": 0.3933086395263672, "memory(GiB)": 78.33, "step": 1718, "token_acc": 0.8864894491641545, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.3330911204766749, "grad_norm": 0.11813364177942276, "learning_rate": 0.00023899013445097007, "loss": 0.39278751611709595, "memory(GiB)": 78.33, "step": 1719, "token_acc": 0.8872432671919998, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.33328489076200163, "grad_norm": 0.1009705662727356, "learning_rate": 0.0002389127294347821, "loss": 0.38593360781669617, "memory(GiB)": 78.33, "step": 1720, "token_acc": 0.8872761447292852, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.3334786610473284, "grad_norm": 0.10921531915664673, "learning_rate": 0.000238835287899747, "loss": 0.38367462158203125, "memory(GiB)": 78.33, "step": 1721, "token_acc": 0.8886438230274386, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.3336724313326551, "grad_norm": 0.1057095155119896, "learning_rate": 0.00023875780987767204, "loss": 0.3982135057449341, "memory(GiB)": 78.33, "step": 1722, "token_acc": 0.8859919028340081, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.33386620161798186, "grad_norm": 0.10099222511053085, "learning_rate": 0.0002386802954003795, "loss": 0.37414735555648804, "memory(GiB)": 78.33, "step": 1723, "token_acc": 0.8888457355911139, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.3340599719033086, "grad_norm": 0.10116644203662872, "learning_rate": 0.0002386027444997068, "loss": 0.4056088328361511, "memory(GiB)": 78.33, "step": 1724, "token_acc": 0.8838143450628663, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.33425374218863535, "grad_norm": 0.0998440757393837, "learning_rate": 0.000238525157207506, "loss": 0.3904368281364441, "memory(GiB)": 78.33, "step": 1725, "token_acc": 0.8871362252283732, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.3344475124739621, "grad_norm": 0.1074642464518547, "learning_rate": 0.0002384475335556444, "loss": 0.3967469334602356, "memory(GiB)": 78.33, "step": 1726, "token_acc": 0.8839466666666667, "train_speed(iter/s)": 0.032514 }, { "epoch": 0.33464128275928884, "grad_norm": 0.11831879615783691, "learning_rate": 0.00023836987357600414, "loss": 0.45349621772766113, "memory(GiB)": 78.33, "step": 1727, "token_acc": 0.8706680645349176, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.3348350530446156, "grad_norm": 0.11764407902956009, "learning_rate": 0.00023829217730048219, "loss": 0.4561113119125366, "memory(GiB)": 78.33, "step": 1728, "token_acc": 0.8694065038177228, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.3350288233299423, "grad_norm": 0.1027785986661911, "learning_rate": 0.00023821444476099048, "loss": 0.38838091492652893, "memory(GiB)": 78.33, "step": 1729, "token_acc": 0.8851640132492787, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.33522259361526907, "grad_norm": 0.1072501540184021, "learning_rate": 0.0002381366759894559, "loss": 0.3992714285850525, "memory(GiB)": 78.33, "step": 1730, "token_acc": 0.8841534590542398, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.33541636390059587, "grad_norm": 0.09161798655986786, "learning_rate": 0.00023805887101782018, "loss": 0.3421012759208679, "memory(GiB)": 78.33, "step": 1731, "token_acc": 0.8976631008677287, "train_speed(iter/s)": 0.032521 }, { "epoch": 0.3356101341859226, "grad_norm": 0.12624742090702057, "learning_rate": 0.00023798102987803994, "loss": 0.47877341508865356, "memory(GiB)": 78.33, "step": 1732, "token_acc": 0.8629468940234756, "train_speed(iter/s)": 0.032523 }, { "epoch": 0.33580390447124936, "grad_norm": 0.11039040237665176, "learning_rate": 0.00023790315260208654, "loss": 0.373019278049469, "memory(GiB)": 78.33, "step": 1733, "token_acc": 0.8893133633557658, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.3359976747565761, "grad_norm": 0.11160407960414886, "learning_rate": 0.0002378252392219463, "loss": 0.3813738524913788, "memory(GiB)": 78.33, "step": 1734, "token_acc": 0.887247745813654, "train_speed(iter/s)": 0.032526 }, { "epoch": 0.33619144504190285, "grad_norm": 0.11238150298595428, "learning_rate": 0.0002377472897696204, "loss": 0.41692203283309937, "memory(GiB)": 78.33, "step": 1735, "token_acc": 0.8769641013877738, "train_speed(iter/s)": 0.032528 }, { "epoch": 0.3363852153272296, "grad_norm": 0.1071707084774971, "learning_rate": 0.00023766930427712471, "loss": 0.39206287264823914, "memory(GiB)": 78.33, "step": 1736, "token_acc": 0.8842453737762436, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.33657898561255634, "grad_norm": 0.10300777852535248, "learning_rate": 0.00023759128277649, "loss": 0.3877497613430023, "memory(GiB)": 78.33, "step": 1737, "token_acc": 0.8867694590155248, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.3367727558978831, "grad_norm": 0.1066984236240387, "learning_rate": 0.0002375132252997618, "loss": 0.3994975686073303, "memory(GiB)": 78.33, "step": 1738, "token_acc": 0.8828811405205282, "train_speed(iter/s)": 0.032532 }, { "epoch": 0.3369665261832098, "grad_norm": 0.100070521235466, "learning_rate": 0.00023743513187900037, "loss": 0.36340388655662537, "memory(GiB)": 78.33, "step": 1739, "token_acc": 0.8943034295679132, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.33716029646853657, "grad_norm": 0.10749375075101852, "learning_rate": 0.00023735700254628078, "loss": 0.3817083537578583, "memory(GiB)": 78.33, "step": 1740, "token_acc": 0.8865700752298691, "train_speed(iter/s)": 0.032535 }, { "epoch": 0.3373540667538633, "grad_norm": 0.1144825890660286, "learning_rate": 0.00023727883733369292, "loss": 0.3933083415031433, "memory(GiB)": 78.33, "step": 1741, "token_acc": 0.8860413411849537, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.33754783703919006, "grad_norm": 0.11627618223428726, "learning_rate": 0.00023720063627334124, "loss": 0.38191571831703186, "memory(GiB)": 78.33, "step": 1742, "token_acc": 0.8882073834046985, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.3377416073245168, "grad_norm": 0.10807470977306366, "learning_rate": 0.00023712239939734512, "loss": 0.3824315667152405, "memory(GiB)": 78.33, "step": 1743, "token_acc": 0.8895115896074527, "train_speed(iter/s)": 0.032539 }, { "epoch": 0.33793537760984355, "grad_norm": 0.11028466373682022, "learning_rate": 0.00023704412673783852, "loss": 0.4183183014392853, "memory(GiB)": 78.33, "step": 1744, "token_acc": 0.8795684778318642, "train_speed(iter/s)": 0.032541 }, { "epoch": 0.3381291478951703, "grad_norm": 0.10906746983528137, "learning_rate": 0.00023696581832697002, "loss": 0.38075122237205505, "memory(GiB)": 78.33, "step": 1745, "token_acc": 0.8904931905328574, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.33832291818049703, "grad_norm": 0.10776349902153015, "learning_rate": 0.00023688747419690312, "loss": 0.39934486150741577, "memory(GiB)": 78.33, "step": 1746, "token_acc": 0.8850526197309921, "train_speed(iter/s)": 0.032544 }, { "epoch": 0.3385166884658238, "grad_norm": 0.12053883075714111, "learning_rate": 0.00023680909437981583, "loss": 0.4140859544277191, "memory(GiB)": 78.33, "step": 1747, "token_acc": 0.8816291216714198, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.3387104587511505, "grad_norm": 0.12190855294466019, "learning_rate": 0.00023673067890790078, "loss": 0.46971094608306885, "memory(GiB)": 78.33, "step": 1748, "token_acc": 0.8653503654117791, "train_speed(iter/s)": 0.032547 }, { "epoch": 0.33890422903647727, "grad_norm": 0.10787336528301239, "learning_rate": 0.00023665222781336538, "loss": 0.3823373317718506, "memory(GiB)": 78.33, "step": 1749, "token_acc": 0.8879386276459724, "train_speed(iter/s)": 0.032549 }, { "epoch": 0.339097999321804, "grad_norm": 0.10566065460443497, "learning_rate": 0.0002365737411284316, "loss": 0.36630237102508545, "memory(GiB)": 78.33, "step": 1750, "token_acc": 0.8929974516882565, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.33929176960713076, "grad_norm": 0.11304379254579544, "learning_rate": 0.000236495218885336, "loss": 0.41917577385902405, "memory(GiB)": 78.33, "step": 1751, "token_acc": 0.8782786241558034, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.3394855398924575, "grad_norm": 0.1184999868273735, "learning_rate": 0.00023641666111632977, "loss": 0.436492919921875, "memory(GiB)": 78.33, "step": 1752, "token_acc": 0.8726875365530827, "train_speed(iter/s)": 0.032553 }, { "epoch": 0.33967931017778424, "grad_norm": 0.11077001690864563, "learning_rate": 0.00023633806785367873, "loss": 0.35720402002334595, "memory(GiB)": 78.33, "step": 1753, "token_acc": 0.8934273097826086, "train_speed(iter/s)": 0.032554 }, { "epoch": 0.339873080463111, "grad_norm": 0.10461857914924622, "learning_rate": 0.00023625943912966322, "loss": 0.366897851228714, "memory(GiB)": 78.33, "step": 1754, "token_acc": 0.8917701004031939, "train_speed(iter/s)": 0.032556 }, { "epoch": 0.34006685074843773, "grad_norm": 0.1258544921875, "learning_rate": 0.0002361807749765782, "loss": 0.43176522850990295, "memory(GiB)": 78.33, "step": 1755, "token_acc": 0.875682894257618, "train_speed(iter/s)": 0.032557 }, { "epoch": 0.3402606210337645, "grad_norm": 0.10822499543428421, "learning_rate": 0.0002361020754267331, "loss": 0.33935534954071045, "memory(GiB)": 78.33, "step": 1756, "token_acc": 0.9004844907278499, "train_speed(iter/s)": 0.032559 }, { "epoch": 0.3404543913190912, "grad_norm": 0.1091342493891716, "learning_rate": 0.00023602334051245195, "loss": 0.38166943192481995, "memory(GiB)": 78.33, "step": 1757, "token_acc": 0.8881519274376417, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.34064816160441796, "grad_norm": 0.0942879170179367, "learning_rate": 0.00023594457026607335, "loss": 0.3533993363380432, "memory(GiB)": 78.33, "step": 1758, "token_acc": 0.8947904639000204, "train_speed(iter/s)": 0.032562 }, { "epoch": 0.3408419318897447, "grad_norm": 0.10075850039720535, "learning_rate": 0.00023586576471995035, "loss": 0.35853201150894165, "memory(GiB)": 78.33, "step": 1759, "token_acc": 0.8928562603474264, "train_speed(iter/s)": 0.032563 }, { "epoch": 0.34103570217507145, "grad_norm": 0.10983631759881973, "learning_rate": 0.00023578692390645043, "loss": 0.38611915707588196, "memory(GiB)": 78.33, "step": 1760, "token_acc": 0.8869917407878017, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.3412294724603982, "grad_norm": 0.10747679322957993, "learning_rate": 0.00023570804785795572, "loss": 0.3835248649120331, "memory(GiB)": 78.33, "step": 1761, "token_acc": 0.8901472855851544, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.34142324274572494, "grad_norm": 0.11863560974597931, "learning_rate": 0.00023562913660686263, "loss": 0.4406958222389221, "memory(GiB)": 78.33, "step": 1762, "token_acc": 0.8729787360066339, "train_speed(iter/s)": 0.032567 }, { "epoch": 0.3416170130310517, "grad_norm": 0.11419124901294708, "learning_rate": 0.00023555019018558224, "loss": 0.39518317580223083, "memory(GiB)": 78.33, "step": 1763, "token_acc": 0.88392173041787, "train_speed(iter/s)": 0.032569 }, { "epoch": 0.34181078331637843, "grad_norm": 0.11686435341835022, "learning_rate": 0.0002354712086265399, "loss": 0.43183597922325134, "memory(GiB)": 78.33, "step": 1764, "token_acc": 0.8757511860832894, "train_speed(iter/s)": 0.03257 }, { "epoch": 0.3420045536017052, "grad_norm": 0.10107649117708206, "learning_rate": 0.0002353921919621755, "loss": 0.39429807662963867, "memory(GiB)": 78.33, "step": 1765, "token_acc": 0.8837560239497639, "train_speed(iter/s)": 0.032572 }, { "epoch": 0.3421983238870319, "grad_norm": 0.10357075929641724, "learning_rate": 0.00023531314022494324, "loss": 0.3501340448856354, "memory(GiB)": 78.33, "step": 1766, "token_acc": 0.8969428540563897, "train_speed(iter/s)": 0.032573 }, { "epoch": 0.34239209417235866, "grad_norm": 0.0919223427772522, "learning_rate": 0.0002352340534473119, "loss": 0.3332497477531433, "memory(GiB)": 78.33, "step": 1767, "token_acc": 0.9006399880469158, "train_speed(iter/s)": 0.032574 }, { "epoch": 0.3425858644576854, "grad_norm": 0.10394643247127533, "learning_rate": 0.00023515493166176442, "loss": 0.36848586797714233, "memory(GiB)": 78.33, "step": 1768, "token_acc": 0.8926898258640502, "train_speed(iter/s)": 0.032576 }, { "epoch": 0.34277963474301215, "grad_norm": 0.10993330925703049, "learning_rate": 0.00023507577490079832, "loss": 0.41358712315559387, "memory(GiB)": 78.33, "step": 1769, "token_acc": 0.8809283276450512, "train_speed(iter/s)": 0.032577 }, { "epoch": 0.3429734050283389, "grad_norm": 0.10172808170318604, "learning_rate": 0.00023499658319692542, "loss": 0.3734714686870575, "memory(GiB)": 78.33, "step": 1770, "token_acc": 0.8903681089061836, "train_speed(iter/s)": 0.032579 }, { "epoch": 0.34316717531366564, "grad_norm": 0.10930664837360382, "learning_rate": 0.00023491735658267182, "loss": 0.37374699115753174, "memory(GiB)": 78.33, "step": 1771, "token_acc": 0.8907253336311352, "train_speed(iter/s)": 0.03258 }, { "epoch": 0.3433609455989924, "grad_norm": 0.1160641685128212, "learning_rate": 0.0002348380950905781, "loss": 0.3713539242744446, "memory(GiB)": 78.33, "step": 1772, "token_acc": 0.8907073847185395, "train_speed(iter/s)": 0.032582 }, { "epoch": 0.34355471588431913, "grad_norm": 0.11093118786811829, "learning_rate": 0.000234758798753199, "loss": 0.40102115273475647, "memory(GiB)": 78.33, "step": 1773, "token_acc": 0.8830998248686515, "train_speed(iter/s)": 0.032583 }, { "epoch": 0.3437484861696459, "grad_norm": 0.10876762866973877, "learning_rate": 0.00023467946760310368, "loss": 0.3644455671310425, "memory(GiB)": 78.33, "step": 1774, "token_acc": 0.8928700486448923, "train_speed(iter/s)": 0.032585 }, { "epoch": 0.3439422564549726, "grad_norm": 0.1010998785495758, "learning_rate": 0.00023460010167287564, "loss": 0.37988942861557007, "memory(GiB)": 78.33, "step": 1775, "token_acc": 0.8903876473385219, "train_speed(iter/s)": 0.032586 }, { "epoch": 0.34413602674029936, "grad_norm": 0.10760042816400528, "learning_rate": 0.00023452070099511249, "loss": 0.39640262722969055, "memory(GiB)": 78.33, "step": 1776, "token_acc": 0.8831872960402909, "train_speed(iter/s)": 0.032587 }, { "epoch": 0.3443297970256261, "grad_norm": 0.10678626596927643, "learning_rate": 0.00023444126560242634, "loss": 0.39018261432647705, "memory(GiB)": 78.33, "step": 1777, "token_acc": 0.8859104540432472, "train_speed(iter/s)": 0.032589 }, { "epoch": 0.34452356731095285, "grad_norm": 0.1159418597817421, "learning_rate": 0.00023436179552744333, "loss": 0.39666786789894104, "memory(GiB)": 78.33, "step": 1778, "token_acc": 0.8834541062801933, "train_speed(iter/s)": 0.03259 }, { "epoch": 0.3447173375962796, "grad_norm": 0.10813209414482117, "learning_rate": 0.00023428229080280403, "loss": 0.4084324836730957, "memory(GiB)": 78.33, "step": 1779, "token_acc": 0.8810328586468665, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.34491110788160634, "grad_norm": 0.1118760034441948, "learning_rate": 0.00023420275146116318, "loss": 0.4295273721218109, "memory(GiB)": 78.33, "step": 1780, "token_acc": 0.876175413371675, "train_speed(iter/s)": 0.032593 }, { "epoch": 0.3451048781669331, "grad_norm": 0.10425037890672684, "learning_rate": 0.00023412317753518968, "loss": 0.38702112436294556, "memory(GiB)": 78.33, "step": 1781, "token_acc": 0.88680838382835, "train_speed(iter/s)": 0.032595 }, { "epoch": 0.3452986484522598, "grad_norm": 0.11203489452600479, "learning_rate": 0.0002340435690575666, "loss": 0.36375969648361206, "memory(GiB)": 78.33, "step": 1782, "token_acc": 0.8949717774449948, "train_speed(iter/s)": 0.032596 }, { "epoch": 0.34549241873758657, "grad_norm": 0.10534288734197617, "learning_rate": 0.00023396392606099144, "loss": 0.37565791606903076, "memory(GiB)": 78.33, "step": 1783, "token_acc": 0.8892402497364794, "train_speed(iter/s)": 0.032597 }, { "epoch": 0.3456861890229133, "grad_norm": 0.10718753933906555, "learning_rate": 0.00023388424857817566, "loss": 0.37750717997550964, "memory(GiB)": 78.33, "step": 1784, "token_acc": 0.8892705765022024, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.34587995930824006, "grad_norm": 0.11478148400783539, "learning_rate": 0.00023380453664184492, "loss": 0.42205026745796204, "memory(GiB)": 78.33, "step": 1785, "token_acc": 0.8783005541671194, "train_speed(iter/s)": 0.0326 }, { "epoch": 0.3460737295935668, "grad_norm": 0.10276792198419571, "learning_rate": 0.00023372479028473908, "loss": 0.3618147671222687, "memory(GiB)": 78.33, "step": 1786, "token_acc": 0.8970774091627172, "train_speed(iter/s)": 0.032601 }, { "epoch": 0.34626749987889355, "grad_norm": 0.11334466934204102, "learning_rate": 0.0002336450095396121, "loss": 0.3954538404941559, "memory(GiB)": 78.33, "step": 1787, "token_acc": 0.8844070906454381, "train_speed(iter/s)": 0.032603 }, { "epoch": 0.3464612701642203, "grad_norm": 0.12592172622680664, "learning_rate": 0.00023356519443923205, "loss": 0.45259174704551697, "memory(GiB)": 78.33, "step": 1788, "token_acc": 0.8711917422583673, "train_speed(iter/s)": 0.032604 }, { "epoch": 0.34665504044954704, "grad_norm": 0.09646034240722656, "learning_rate": 0.00023348534501638115, "loss": 0.35588333010673523, "memory(GiB)": 78.33, "step": 1789, "token_acc": 0.8962561097320675, "train_speed(iter/s)": 0.032606 }, { "epoch": 0.3468488107348738, "grad_norm": 0.10078699886798859, "learning_rate": 0.00023340546130385574, "loss": 0.3622387647628784, "memory(GiB)": 78.33, "step": 1790, "token_acc": 0.8921391399616544, "train_speed(iter/s)": 0.032607 }, { "epoch": 0.3470425810202005, "grad_norm": 0.1054060235619545, "learning_rate": 0.00023332554333446617, "loss": 0.39420005679130554, "memory(GiB)": 78.33, "step": 1791, "token_acc": 0.8858397062477569, "train_speed(iter/s)": 0.032609 }, { "epoch": 0.3472363513055273, "grad_norm": 0.10570289194583893, "learning_rate": 0.0002332455911410369, "loss": 0.37943026423454285, "memory(GiB)": 78.33, "step": 1792, "token_acc": 0.8889375085464242, "train_speed(iter/s)": 0.03261 }, { "epoch": 0.34743012159085407, "grad_norm": 0.11169130355119705, "learning_rate": 0.00023316560475640646, "loss": 0.4090282917022705, "memory(GiB)": 78.33, "step": 1793, "token_acc": 0.8791112570459148, "train_speed(iter/s)": 0.032612 }, { "epoch": 0.3476238918761808, "grad_norm": 0.11017318069934845, "learning_rate": 0.00023308558421342743, "loss": 0.38704198598861694, "memory(GiB)": 78.33, "step": 1794, "token_acc": 0.8879233394591757, "train_speed(iter/s)": 0.032613 }, { "epoch": 0.34781766216150756, "grad_norm": 0.1121833398938179, "learning_rate": 0.0002330055295449663, "loss": 0.3916083872318268, "memory(GiB)": 78.33, "step": 1795, "token_acc": 0.8842932088924959, "train_speed(iter/s)": 0.032615 }, { "epoch": 0.3480114324468343, "grad_norm": 0.10421619564294815, "learning_rate": 0.00023292544078390377, "loss": 0.3865489065647125, "memory(GiB)": 78.33, "step": 1796, "token_acc": 0.8867510896758053, "train_speed(iter/s)": 0.032616 }, { "epoch": 0.34820520273216105, "grad_norm": 0.10179581493139267, "learning_rate": 0.00023284531796313444, "loss": 0.36844325065612793, "memory(GiB)": 78.33, "step": 1797, "token_acc": 0.8937036571572201, "train_speed(iter/s)": 0.032617 }, { "epoch": 0.3483989730174878, "grad_norm": 0.12043815106153488, "learning_rate": 0.0002327651611155669, "loss": 0.3939439058303833, "memory(GiB)": 78.33, "step": 1798, "token_acc": 0.8850653819683414, "train_speed(iter/s)": 0.032619 }, { "epoch": 0.34859274330281453, "grad_norm": 0.10927697271108627, "learning_rate": 0.00023268497027412364, "loss": 0.4002224802970886, "memory(GiB)": 78.33, "step": 1799, "token_acc": 0.8828930991564099, "train_speed(iter/s)": 0.03262 }, { "epoch": 0.3487865135881413, "grad_norm": 0.09782009571790695, "learning_rate": 0.0002326047454717413, "loss": 0.3829875886440277, "memory(GiB)": 78.33, "step": 1800, "token_acc": 0.8896661604720058, "train_speed(iter/s)": 0.032621 }, { "epoch": 0.348980283873468, "grad_norm": 0.09550854563713074, "learning_rate": 0.0002325244867413703, "loss": 0.34558382630348206, "memory(GiB)": 78.33, "step": 1801, "token_acc": 0.897237290273748, "train_speed(iter/s)": 0.032615 }, { "epoch": 0.34917405415879477, "grad_norm": 0.10129474103450775, "learning_rate": 0.00023244419411597508, "loss": 0.3574253022670746, "memory(GiB)": 78.33, "step": 1802, "token_acc": 0.8967041694242224, "train_speed(iter/s)": 0.032617 }, { "epoch": 0.3493678244441215, "grad_norm": 0.11037204414606094, "learning_rate": 0.00023236386762853398, "loss": 0.37563031911849976, "memory(GiB)": 78.33, "step": 1803, "token_acc": 0.889018691588785, "train_speed(iter/s)": 0.032618 }, { "epoch": 0.34956159472944826, "grad_norm": 0.10526623576879501, "learning_rate": 0.00023228350731203923, "loss": 0.37744981050491333, "memory(GiB)": 78.33, "step": 1804, "token_acc": 0.8889050705114202, "train_speed(iter/s)": 0.032619 }, { "epoch": 0.349755365014775, "grad_norm": 0.11370985209941864, "learning_rate": 0.000232203113199497, "loss": 0.38519221544265747, "memory(GiB)": 78.33, "step": 1805, "token_acc": 0.8884900839518189, "train_speed(iter/s)": 0.032621 }, { "epoch": 0.34994913530010174, "grad_norm": 0.11202775686979294, "learning_rate": 0.00023212268532392733, "loss": 0.40250468254089355, "memory(GiB)": 78.33, "step": 1806, "token_acc": 0.882124123298167, "train_speed(iter/s)": 0.032622 }, { "epoch": 0.3501429055854285, "grad_norm": 0.10065510869026184, "learning_rate": 0.00023204222371836405, "loss": 0.3592451810836792, "memory(GiB)": 78.33, "step": 1807, "token_acc": 0.8944679954711814, "train_speed(iter/s)": 0.032623 }, { "epoch": 0.35033667587075523, "grad_norm": 0.11026205122470856, "learning_rate": 0.00023196172841585488, "loss": 0.41680392622947693, "memory(GiB)": 78.33, "step": 1808, "token_acc": 0.8787629092507141, "train_speed(iter/s)": 0.032625 }, { "epoch": 0.350530446156082, "grad_norm": 0.10042080283164978, "learning_rate": 0.00023188119944946147, "loss": 0.36419713497161865, "memory(GiB)": 78.33, "step": 1809, "token_acc": 0.8931367912134112, "train_speed(iter/s)": 0.032626 }, { "epoch": 0.3507242164414087, "grad_norm": 0.12159177660942078, "learning_rate": 0.00023180063685225924, "loss": 0.3895268142223358, "memory(GiB)": 78.33, "step": 1810, "token_acc": 0.884988553363131, "train_speed(iter/s)": 0.032628 }, { "epoch": 0.35091798672673546, "grad_norm": 0.09737487882375717, "learning_rate": 0.0002317200406573374, "loss": 0.3568708002567291, "memory(GiB)": 78.33, "step": 1811, "token_acc": 0.8948162336884952, "train_speed(iter/s)": 0.032629 }, { "epoch": 0.3511117570120622, "grad_norm": 0.105777308344841, "learning_rate": 0.00023163941089779892, "loss": 0.3754398226737976, "memory(GiB)": 78.33, "step": 1812, "token_acc": 0.8915336122076238, "train_speed(iter/s)": 0.03263 }, { "epoch": 0.35130552729738895, "grad_norm": 0.10280515253543854, "learning_rate": 0.00023155874760676069, "loss": 0.3664350211620331, "memory(GiB)": 78.33, "step": 1813, "token_acc": 0.8921140983953261, "train_speed(iter/s)": 0.032632 }, { "epoch": 0.3514992975827157, "grad_norm": 0.10777980089187622, "learning_rate": 0.00023147805081735325, "loss": 0.3817192614078522, "memory(GiB)": 78.33, "step": 1814, "token_acc": 0.889120756816917, "train_speed(iter/s)": 0.032633 }, { "epoch": 0.35169306786804244, "grad_norm": 0.10182831436395645, "learning_rate": 0.0002313973205627209, "loss": 0.37246525287628174, "memory(GiB)": 78.33, "step": 1815, "token_acc": 0.8915448019143845, "train_speed(iter/s)": 0.032635 }, { "epoch": 0.3518868381533692, "grad_norm": 0.10677202045917511, "learning_rate": 0.00023131655687602174, "loss": 0.3982056677341461, "memory(GiB)": 78.33, "step": 1816, "token_acc": 0.8849781395220943, "train_speed(iter/s)": 0.032636 }, { "epoch": 0.35208060843869593, "grad_norm": 0.10674361884593964, "learning_rate": 0.00023123575979042767, "loss": 0.38551852107048035, "memory(GiB)": 78.33, "step": 1817, "token_acc": 0.8879332677063364, "train_speed(iter/s)": 0.032637 }, { "epoch": 0.3522743787240227, "grad_norm": 0.10658242553472519, "learning_rate": 0.00023115492933912412, "loss": 0.416787713766098, "memory(GiB)": 78.33, "step": 1818, "token_acc": 0.8809510899034811, "train_speed(iter/s)": 0.032639 }, { "epoch": 0.3524681490093494, "grad_norm": 0.10621768981218338, "learning_rate": 0.00023107406555531042, "loss": 0.37745460867881775, "memory(GiB)": 78.33, "step": 1819, "token_acc": 0.8909349857578331, "train_speed(iter/s)": 0.03264 }, { "epoch": 0.35266191929467616, "grad_norm": 0.11966609209775925, "learning_rate": 0.00023099316847219944, "loss": 0.43290236592292786, "memory(GiB)": 78.33, "step": 1820, "token_acc": 0.8743744060817231, "train_speed(iter/s)": 0.032642 }, { "epoch": 0.3528556895800029, "grad_norm": 0.11253046244382858, "learning_rate": 0.00023091223812301778, "loss": 0.4067782759666443, "memory(GiB)": 78.33, "step": 1821, "token_acc": 0.8815392109108622, "train_speed(iter/s)": 0.032643 }, { "epoch": 0.35304945986532965, "grad_norm": 0.09745576977729797, "learning_rate": 0.00023083127454100573, "loss": 0.36597940325737, "memory(GiB)": 78.33, "step": 1822, "token_acc": 0.8929222319310948, "train_speed(iter/s)": 0.032645 }, { "epoch": 0.3532432301506564, "grad_norm": 0.10735499858856201, "learning_rate": 0.00023075027775941722, "loss": 0.3935870826244354, "memory(GiB)": 78.33, "step": 1823, "token_acc": 0.8852148867679449, "train_speed(iter/s)": 0.032646 }, { "epoch": 0.35343700043598314, "grad_norm": 0.10999494791030884, "learning_rate": 0.00023066924781151976, "loss": 0.3992076814174652, "memory(GiB)": 78.33, "step": 1824, "token_acc": 0.8823376247649356, "train_speed(iter/s)": 0.032648 }, { "epoch": 0.3536307707213099, "grad_norm": 0.10191714018583298, "learning_rate": 0.00023058818473059456, "loss": 0.38821935653686523, "memory(GiB)": 78.33, "step": 1825, "token_acc": 0.8841636697432286, "train_speed(iter/s)": 0.032649 }, { "epoch": 0.35382454100663663, "grad_norm": 0.10946159809827805, "learning_rate": 0.00023050708854993645, "loss": 0.3874565660953522, "memory(GiB)": 78.33, "step": 1826, "token_acc": 0.8877764842840512, "train_speed(iter/s)": 0.03265 }, { "epoch": 0.3540183112919634, "grad_norm": 0.10883501917123795, "learning_rate": 0.00023042595930285374, "loss": 0.36587873101234436, "memory(GiB)": 78.33, "step": 1827, "token_acc": 0.8928685827436592, "train_speed(iter/s)": 0.032652 }, { "epoch": 0.3542120815772901, "grad_norm": 0.1008007600903511, "learning_rate": 0.0002303447970226684, "loss": 0.3552745282649994, "memory(GiB)": 78.33, "step": 1828, "token_acc": 0.8956754006614093, "train_speed(iter/s)": 0.032653 }, { "epoch": 0.35440585186261686, "grad_norm": 0.1125805526971817, "learning_rate": 0.00023026360174271593, "loss": 0.4129788279533386, "memory(GiB)": 78.33, "step": 1829, "token_acc": 0.8825399481301766, "train_speed(iter/s)": 0.032655 }, { "epoch": 0.3545996221479436, "grad_norm": 0.11501467972993851, "learning_rate": 0.00023018237349634553, "loss": 0.41495800018310547, "memory(GiB)": 78.33, "step": 1830, "token_acc": 0.8798725839542166, "train_speed(iter/s)": 0.032656 }, { "epoch": 0.35479339243327035, "grad_norm": 0.10721118748188019, "learning_rate": 0.00023010111231691973, "loss": 0.36478832364082336, "memory(GiB)": 78.33, "step": 1831, "token_acc": 0.891846109675567, "train_speed(iter/s)": 0.032657 }, { "epoch": 0.3549871627185971, "grad_norm": 0.11395079642534256, "learning_rate": 0.00023001981823781472, "loss": 0.3958333730697632, "memory(GiB)": 78.33, "step": 1832, "token_acc": 0.8860460306674152, "train_speed(iter/s)": 0.032659 }, { "epoch": 0.35518093300392384, "grad_norm": 0.11087855696678162, "learning_rate": 0.00022993849129242014, "loss": 0.4031826853752136, "memory(GiB)": 78.33, "step": 1833, "token_acc": 0.882696344865269, "train_speed(iter/s)": 0.03266 }, { "epoch": 0.3553747032892506, "grad_norm": 0.1058662161231041, "learning_rate": 0.00022985713151413913, "loss": 0.37713879346847534, "memory(GiB)": 78.33, "step": 1834, "token_acc": 0.8884106402887314, "train_speed(iter/s)": 0.032661 }, { "epoch": 0.3555684735745773, "grad_norm": 0.10403905063867569, "learning_rate": 0.00022977573893638836, "loss": 0.3826426863670349, "memory(GiB)": 78.33, "step": 1835, "token_acc": 0.8869446715726356, "train_speed(iter/s)": 0.032663 }, { "epoch": 0.35576224385990407, "grad_norm": 0.09630994498729706, "learning_rate": 0.00022969431359259797, "loss": 0.35036543011665344, "memory(GiB)": 78.33, "step": 1836, "token_acc": 0.8947007008903202, "train_speed(iter/s)": 0.032664 }, { "epoch": 0.3559560141452308, "grad_norm": 0.12186194956302643, "learning_rate": 0.0002296128555162115, "loss": 0.4338938593864441, "memory(GiB)": 78.33, "step": 1837, "token_acc": 0.8740203761755486, "train_speed(iter/s)": 0.032665 }, { "epoch": 0.35614978443055756, "grad_norm": 0.11005796492099762, "learning_rate": 0.000229531364740686, "loss": 0.37460899353027344, "memory(GiB)": 78.33, "step": 1838, "token_acc": 0.8911450261367517, "train_speed(iter/s)": 0.032667 }, { "epoch": 0.3563435547158843, "grad_norm": 0.10339885205030441, "learning_rate": 0.00022944984129949196, "loss": 0.3677971661090851, "memory(GiB)": 78.33, "step": 1839, "token_acc": 0.8935775621157461, "train_speed(iter/s)": 0.032668 }, { "epoch": 0.35653732500121105, "grad_norm": 0.10451404750347137, "learning_rate": 0.00022936828522611316, "loss": 0.3537764847278595, "memory(GiB)": 78.33, "step": 1840, "token_acc": 0.8961053422788898, "train_speed(iter/s)": 0.032669 }, { "epoch": 0.3567310952865378, "grad_norm": 0.09972133487462997, "learning_rate": 0.00022928669655404688, "loss": 0.39150598645210266, "memory(GiB)": 78.33, "step": 1841, "token_acc": 0.885431667414245, "train_speed(iter/s)": 0.032671 }, { "epoch": 0.35692486557186454, "grad_norm": 0.10803169012069702, "learning_rate": 0.0002292050753168038, "loss": 0.3714596629142761, "memory(GiB)": 78.33, "step": 1842, "token_acc": 0.8923362209736792, "train_speed(iter/s)": 0.032672 }, { "epoch": 0.3571186358571913, "grad_norm": 0.1168799102306366, "learning_rate": 0.00022912342154790804, "loss": 0.421195387840271, "memory(GiB)": 78.33, "step": 1843, "token_acc": 0.8766641213771124, "train_speed(iter/s)": 0.032674 }, { "epoch": 0.357312406142518, "grad_norm": 0.10394692420959473, "learning_rate": 0.00022904173528089686, "loss": 0.4081256091594696, "memory(GiB)": 78.33, "step": 1844, "token_acc": 0.8812434789587736, "train_speed(iter/s)": 0.032675 }, { "epoch": 0.35750617642784477, "grad_norm": 0.0959138348698616, "learning_rate": 0.00022896001654932105, "loss": 0.3497539162635803, "memory(GiB)": 78.33, "step": 1845, "token_acc": 0.8946740342957947, "train_speed(iter/s)": 0.032677 }, { "epoch": 0.3576999467131715, "grad_norm": 0.10984820127487183, "learning_rate": 0.0002288782653867448, "loss": 0.4027239680290222, "memory(GiB)": 78.33, "step": 1846, "token_acc": 0.8789472180150316, "train_speed(iter/s)": 0.032678 }, { "epoch": 0.35789371699849826, "grad_norm": 0.11082874238491058, "learning_rate": 0.0002287964818267453, "loss": 0.3869192600250244, "memory(GiB)": 78.33, "step": 1847, "token_acc": 0.8861776989530293, "train_speed(iter/s)": 0.032679 }, { "epoch": 0.358087487283825, "grad_norm": 0.10880383849143982, "learning_rate": 0.0002287146659029134, "loss": 0.39860600233078003, "memory(GiB)": 78.33, "step": 1848, "token_acc": 0.8812581533555588, "train_speed(iter/s)": 0.032681 }, { "epoch": 0.35828125756915175, "grad_norm": 0.11083029955625534, "learning_rate": 0.00022863281764885315, "loss": 0.3671538233757019, "memory(GiB)": 78.33, "step": 1849, "token_acc": 0.8913858792713418, "train_speed(iter/s)": 0.032682 }, { "epoch": 0.3584750278544785, "grad_norm": 0.1035129502415657, "learning_rate": 0.00022855093709818168, "loss": 0.35645681619644165, "memory(GiB)": 78.33, "step": 1850, "token_acc": 0.8927785347447856, "train_speed(iter/s)": 0.032683 }, { "epoch": 0.35866879813980523, "grad_norm": 0.11510751396417618, "learning_rate": 0.00022846902428452957, "loss": 0.4069887101650238, "memory(GiB)": 78.33, "step": 1851, "token_acc": 0.8827348409542743, "train_speed(iter/s)": 0.032685 }, { "epoch": 0.35886256842513203, "grad_norm": 0.11083865165710449, "learning_rate": 0.00022838707924154072, "loss": 0.3828306794166565, "memory(GiB)": 78.33, "step": 1852, "token_acc": 0.8873923603444469, "train_speed(iter/s)": 0.032686 }, { "epoch": 0.3590563387104588, "grad_norm": 0.11098314076662064, "learning_rate": 0.00022830510200287204, "loss": 0.4004696309566498, "memory(GiB)": 78.33, "step": 1853, "token_acc": 0.8826161182080414, "train_speed(iter/s)": 0.032688 }, { "epoch": 0.3592501089957855, "grad_norm": 0.1142255887389183, "learning_rate": 0.00022822309260219382, "loss": 0.39147108793258667, "memory(GiB)": 78.33, "step": 1854, "token_acc": 0.8852366362257191, "train_speed(iter/s)": 0.032689 }, { "epoch": 0.35944387928111227, "grad_norm": 0.10014226287603378, "learning_rate": 0.00022814105107318952, "loss": 0.35710304975509644, "memory(GiB)": 78.33, "step": 1855, "token_acc": 0.8953069555241322, "train_speed(iter/s)": 0.03269 }, { "epoch": 0.359637649566439, "grad_norm": 0.10325758904218674, "learning_rate": 0.00022805897744955587, "loss": 0.3418915569782257, "memory(GiB)": 78.33, "step": 1856, "token_acc": 0.899773435287454, "train_speed(iter/s)": 0.032691 }, { "epoch": 0.35983141985176575, "grad_norm": 0.10661393404006958, "learning_rate": 0.00022797687176500257, "loss": 0.36435335874557495, "memory(GiB)": 78.33, "step": 1857, "token_acc": 0.8942655906926346, "train_speed(iter/s)": 0.032693 }, { "epoch": 0.3600251901370925, "grad_norm": 0.11469036340713501, "learning_rate": 0.0002278947340532528, "loss": 0.415936678647995, "memory(GiB)": 78.33, "step": 1858, "token_acc": 0.8799646174259177, "train_speed(iter/s)": 0.032694 }, { "epoch": 0.36021896042241924, "grad_norm": 0.11351092904806137, "learning_rate": 0.0002278125643480426, "loss": 0.4251920282840729, "memory(GiB)": 78.33, "step": 1859, "token_acc": 0.8783027287232604, "train_speed(iter/s)": 0.032696 }, { "epoch": 0.360412730707746, "grad_norm": 0.12002420425415039, "learning_rate": 0.00022773036268312135, "loss": 0.40197598934173584, "memory(GiB)": 78.33, "step": 1860, "token_acc": 0.884971045690204, "train_speed(iter/s)": 0.032697 }, { "epoch": 0.36060650099307273, "grad_norm": 0.11968737095594406, "learning_rate": 0.00022764812909225143, "loss": 0.444561243057251, "memory(GiB)": 78.33, "step": 1861, "token_acc": 0.8720333353463373, "train_speed(iter/s)": 0.032698 }, { "epoch": 0.3608002712783995, "grad_norm": 0.12199822068214417, "learning_rate": 0.00022756586360920834, "loss": 0.4252670705318451, "memory(GiB)": 78.33, "step": 1862, "token_acc": 0.8757837872600405, "train_speed(iter/s)": 0.0327 }, { "epoch": 0.3609940415637262, "grad_norm": 0.11111042648553848, "learning_rate": 0.00022748356626778085, "loss": 0.38725805282592773, "memory(GiB)": 78.33, "step": 1863, "token_acc": 0.88624693011833, "train_speed(iter/s)": 0.032701 }, { "epoch": 0.36118781184905296, "grad_norm": 0.09851629287004471, "learning_rate": 0.00022740123710177063, "loss": 0.37134605646133423, "memory(GiB)": 78.33, "step": 1864, "token_acc": 0.8892687066811259, "train_speed(iter/s)": 0.032702 }, { "epoch": 0.3613815821343797, "grad_norm": 0.10534118860960007, "learning_rate": 0.0002273188761449925, "loss": 0.36640486121177673, "memory(GiB)": 78.33, "step": 1865, "token_acc": 0.8921657754010696, "train_speed(iter/s)": 0.032703 }, { "epoch": 0.36157535241970645, "grad_norm": 0.10792740434408188, "learning_rate": 0.00022723648343127428, "loss": 0.3896613121032715, "memory(GiB)": 78.33, "step": 1866, "token_acc": 0.8864207221350079, "train_speed(iter/s)": 0.032705 }, { "epoch": 0.3617691227050332, "grad_norm": 0.11643590033054352, "learning_rate": 0.0002271540589944569, "loss": 0.3730244040489197, "memory(GiB)": 78.33, "step": 1867, "token_acc": 0.8894352346164653, "train_speed(iter/s)": 0.032706 }, { "epoch": 0.36196289299035994, "grad_norm": 0.11337530612945557, "learning_rate": 0.00022707160286839425, "loss": 0.40268588066101074, "memory(GiB)": 78.33, "step": 1868, "token_acc": 0.8809530219410419, "train_speed(iter/s)": 0.032707 }, { "epoch": 0.3621566632756867, "grad_norm": 0.11132289469242096, "learning_rate": 0.00022698911508695335, "loss": 0.4088488221168518, "memory(GiB)": 78.33, "step": 1869, "token_acc": 0.8813299380751201, "train_speed(iter/s)": 0.032709 }, { "epoch": 0.36235043356101343, "grad_norm": 0.10401839762926102, "learning_rate": 0.00022690659568401405, "loss": 0.3792577087879181, "memory(GiB)": 78.33, "step": 1870, "token_acc": 0.8880251617357506, "train_speed(iter/s)": 0.03271 }, { "epoch": 0.3625442038463402, "grad_norm": 0.10719176381826401, "learning_rate": 0.0002268240446934694, "loss": 0.3922528028488159, "memory(GiB)": 78.33, "step": 1871, "token_acc": 0.8880327346157934, "train_speed(iter/s)": 0.032711 }, { "epoch": 0.3627379741316669, "grad_norm": 0.10341445356607437, "learning_rate": 0.00022674146214922522, "loss": 0.3721281886100769, "memory(GiB)": 78.33, "step": 1872, "token_acc": 0.8919034280378818, "train_speed(iter/s)": 0.032713 }, { "epoch": 0.36293174441699366, "grad_norm": 0.09941703081130981, "learning_rate": 0.00022665884808520045, "loss": 0.35231590270996094, "memory(GiB)": 78.33, "step": 1873, "token_acc": 0.895152484700744, "train_speed(iter/s)": 0.032714 }, { "epoch": 0.3631255147023204, "grad_norm": 0.12776227295398712, "learning_rate": 0.00022657620253532681, "loss": 0.42608171701431274, "memory(GiB)": 78.33, "step": 1874, "token_acc": 0.8764504054897068, "train_speed(iter/s)": 0.032715 }, { "epoch": 0.36331928498764715, "grad_norm": 0.10132501274347305, "learning_rate": 0.00022649352553354913, "loss": 0.3703892230987549, "memory(GiB)": 78.33, "step": 1875, "token_acc": 0.8915755231679516, "train_speed(iter/s)": 0.032717 }, { "epoch": 0.3635130552729739, "grad_norm": 0.11589387059211731, "learning_rate": 0.00022641081711382508, "loss": 0.4376241862773895, "memory(GiB)": 78.33, "step": 1876, "token_acc": 0.8734218545929473, "train_speed(iter/s)": 0.032718 }, { "epoch": 0.36370682555830064, "grad_norm": 0.10825785249471664, "learning_rate": 0.00022632807731012519, "loss": 0.3789633810520172, "memory(GiB)": 78.33, "step": 1877, "token_acc": 0.8895960154952961, "train_speed(iter/s)": 0.032719 }, { "epoch": 0.3639005958436274, "grad_norm": 0.10502450913190842, "learning_rate": 0.00022624530615643291, "loss": 0.3538724482059479, "memory(GiB)": 78.33, "step": 1878, "token_acc": 0.8984881209503239, "train_speed(iter/s)": 0.03272 }, { "epoch": 0.36409436612895413, "grad_norm": 0.10411059856414795, "learning_rate": 0.00022616250368674465, "loss": 0.35804876685142517, "memory(GiB)": 78.33, "step": 1879, "token_acc": 0.8947292874777516, "train_speed(iter/s)": 0.032722 }, { "epoch": 0.36428813641428087, "grad_norm": 0.09905564039945602, "learning_rate": 0.00022607966993506954, "loss": 0.36274391412734985, "memory(GiB)": 78.33, "step": 1880, "token_acc": 0.8931178818261462, "train_speed(iter/s)": 0.032723 }, { "epoch": 0.3644819066996076, "grad_norm": 0.11155527085065842, "learning_rate": 0.0002259968049354296, "loss": 0.4275517165660858, "memory(GiB)": 78.33, "step": 1881, "token_acc": 0.8751891922192948, "train_speed(iter/s)": 0.032724 }, { "epoch": 0.36467567698493436, "grad_norm": 0.12305615842342377, "learning_rate": 0.00022591390872185978, "loss": 0.4187135696411133, "memory(GiB)": 78.33, "step": 1882, "token_acc": 0.8782666539960088, "train_speed(iter/s)": 0.032726 }, { "epoch": 0.3648694472702611, "grad_norm": 0.2306802123785019, "learning_rate": 0.00022583098132840783, "loss": 0.40811386704444885, "memory(GiB)": 78.33, "step": 1883, "token_acc": 0.8822494609011463, "train_speed(iter/s)": 0.032727 }, { "epoch": 0.36506321755558785, "grad_norm": 0.11205735057592392, "learning_rate": 0.00022574802278913409, "loss": 0.3602101504802704, "memory(GiB)": 78.33, "step": 1884, "token_acc": 0.8919279519679787, "train_speed(iter/s)": 0.032728 }, { "epoch": 0.3652569878409146, "grad_norm": 0.10754162818193436, "learning_rate": 0.00022566503313811202, "loss": 0.41668108105659485, "memory(GiB)": 78.33, "step": 1885, "token_acc": 0.8790157211209843, "train_speed(iter/s)": 0.032729 }, { "epoch": 0.36545075812624134, "grad_norm": 0.13130834698677063, "learning_rate": 0.00022558201240942765, "loss": 0.3803432881832123, "memory(GiB)": 78.33, "step": 1886, "token_acc": 0.8907192443382269, "train_speed(iter/s)": 0.03273 }, { "epoch": 0.3656445284115681, "grad_norm": 0.12778477370738983, "learning_rate": 0.00022549896063717978, "loss": 0.45700541138648987, "memory(GiB)": 78.33, "step": 1887, "token_acc": 0.8673952641165756, "train_speed(iter/s)": 0.032732 }, { "epoch": 0.3658382986968948, "grad_norm": 0.10660150647163391, "learning_rate": 0.00022541587785548006, "loss": 0.388899028301239, "memory(GiB)": 78.33, "step": 1888, "token_acc": 0.8842618950793005, "train_speed(iter/s)": 0.032733 }, { "epoch": 0.36603206898222157, "grad_norm": 0.10151253640651703, "learning_rate": 0.0002253327640984528, "loss": 0.3622681200504303, "memory(GiB)": 78.33, "step": 1889, "token_acc": 0.8935134049603374, "train_speed(iter/s)": 0.032734 }, { "epoch": 0.3662258392675483, "grad_norm": 0.12292792648077011, "learning_rate": 0.00022524961940023505, "loss": 0.42959490418434143, "memory(GiB)": 78.33, "step": 1890, "token_acc": 0.8763984746777599, "train_speed(iter/s)": 0.032736 }, { "epoch": 0.36641960955287506, "grad_norm": 0.11249762028455734, "learning_rate": 0.00022516644379497658, "loss": 0.3739752471446991, "memory(GiB)": 78.33, "step": 1891, "token_acc": 0.8892815758980301, "train_speed(iter/s)": 0.032737 }, { "epoch": 0.3666133798382018, "grad_norm": 0.14134949445724487, "learning_rate": 0.00022508323731683984, "loss": 0.37695708870887756, "memory(GiB)": 78.33, "step": 1892, "token_acc": 0.8897990726429675, "train_speed(iter/s)": 0.032738 }, { "epoch": 0.36680715012352855, "grad_norm": 0.10767961293458939, "learning_rate": 0.000225, "loss": 0.3695901930332184, "memory(GiB)": 78.33, "step": 1893, "token_acc": 0.8925884180704907, "train_speed(iter/s)": 0.032739 }, { "epoch": 0.3670009204088553, "grad_norm": 0.10858670622110367, "learning_rate": 0.00022491673187864482, "loss": 0.39885541796684265, "memory(GiB)": 78.33, "step": 1894, "token_acc": 0.8855477140227064, "train_speed(iter/s)": 0.032741 }, { "epoch": 0.36719469069418204, "grad_norm": 0.12462179362773895, "learning_rate": 0.00022483343298697472, "loss": 0.38520297408103943, "memory(GiB)": 78.33, "step": 1895, "token_acc": 0.8870938651413259, "train_speed(iter/s)": 0.032742 }, { "epoch": 0.3673884609795088, "grad_norm": 0.09840144217014313, "learning_rate": 0.00022475010335920288, "loss": 0.35938745737075806, "memory(GiB)": 78.33, "step": 1896, "token_acc": 0.893057469002165, "train_speed(iter/s)": 0.032743 }, { "epoch": 0.3675822312648355, "grad_norm": 0.11222008615732193, "learning_rate": 0.00022466674302955495, "loss": 0.4017634689807892, "memory(GiB)": 78.33, "step": 1897, "token_acc": 0.8824197671116161, "train_speed(iter/s)": 0.032745 }, { "epoch": 0.36777600155016227, "grad_norm": 0.0982629731297493, "learning_rate": 0.00022458335203226932, "loss": 0.3599035441875458, "memory(GiB)": 78.33, "step": 1898, "token_acc": 0.8942642891179495, "train_speed(iter/s)": 0.032746 }, { "epoch": 0.367969771835489, "grad_norm": 0.09826286882162094, "learning_rate": 0.00022449993040159685, "loss": 0.35435935854911804, "memory(GiB)": 78.33, "step": 1899, "token_acc": 0.8933514973118933, "train_speed(iter/s)": 0.032747 }, { "epoch": 0.36816354212081576, "grad_norm": 0.11281174421310425, "learning_rate": 0.0002244164781718011, "loss": 0.3880941569805145, "memory(GiB)": 78.33, "step": 1900, "token_acc": 0.889101803692551, "train_speed(iter/s)": 0.032748 }, { "epoch": 0.3683573124061425, "grad_norm": 0.1215512827038765, "learning_rate": 0.0002243329953771581, "loss": 0.43437880277633667, "memory(GiB)": 78.33, "step": 1901, "token_acc": 0.875211292666291, "train_speed(iter/s)": 0.03275 }, { "epoch": 0.36855108269146925, "grad_norm": 0.11155978590250015, "learning_rate": 0.0002242494820519565, "loss": 0.37402305006980896, "memory(GiB)": 78.33, "step": 1902, "token_acc": 0.8897708186595872, "train_speed(iter/s)": 0.032751 }, { "epoch": 0.368744852976796, "grad_norm": 0.10142076760530472, "learning_rate": 0.00022416593823049746, "loss": 0.3670305609703064, "memory(GiB)": 78.33, "step": 1903, "token_acc": 0.8930082952429321, "train_speed(iter/s)": 0.032752 }, { "epoch": 0.36893862326212273, "grad_norm": 0.12013176828622818, "learning_rate": 0.00022408236394709464, "loss": 0.42799264192581177, "memory(GiB)": 78.33, "step": 1904, "token_acc": 0.8759844248616682, "train_speed(iter/s)": 0.032753 }, { "epoch": 0.3691323935474495, "grad_norm": 0.11704428493976593, "learning_rate": 0.0002239987592360743, "loss": 0.3853393793106079, "memory(GiB)": 78.33, "step": 1905, "token_acc": 0.8859077310659739, "train_speed(iter/s)": 0.032755 }, { "epoch": 0.3693261638327762, "grad_norm": 0.09620506316423416, "learning_rate": 0.00022391512413177516, "loss": 0.34337371587753296, "memory(GiB)": 78.33, "step": 1906, "token_acc": 0.8966349706853229, "train_speed(iter/s)": 0.032756 }, { "epoch": 0.36951993411810297, "grad_norm": 0.1018129214644432, "learning_rate": 0.00022383145866854834, "loss": 0.3677298128604889, "memory(GiB)": 78.33, "step": 1907, "token_acc": 0.8905968102763946, "train_speed(iter/s)": 0.032757 }, { "epoch": 0.3697137044034297, "grad_norm": 0.10136358439922333, "learning_rate": 0.00022374776288075745, "loss": 0.3641367554664612, "memory(GiB)": 78.33, "step": 1908, "token_acc": 0.8920720537349888, "train_speed(iter/s)": 0.032758 }, { "epoch": 0.36990747468875645, "grad_norm": 0.10752905905246735, "learning_rate": 0.00022366403680277875, "loss": 0.373902291059494, "memory(GiB)": 78.33, "step": 1909, "token_acc": 0.888663967611336, "train_speed(iter/s)": 0.032759 }, { "epoch": 0.3701012449740832, "grad_norm": 0.09833654016256332, "learning_rate": 0.00022358028046900067, "loss": 0.3543311059474945, "memory(GiB)": 78.33, "step": 1910, "token_acc": 0.89600593545196, "train_speed(iter/s)": 0.03276 }, { "epoch": 0.37029501525940994, "grad_norm": 0.10382269322872162, "learning_rate": 0.00022349649391382423, "loss": 0.39178794622421265, "memory(GiB)": 78.33, "step": 1911, "token_acc": 0.8857472274074888, "train_speed(iter/s)": 0.032762 }, { "epoch": 0.3704887855447367, "grad_norm": 0.11607307940721512, "learning_rate": 0.0002234126771716628, "loss": 0.39549145102500916, "memory(GiB)": 78.33, "step": 1912, "token_acc": 0.8873361227336123, "train_speed(iter/s)": 0.032763 }, { "epoch": 0.3706825558300635, "grad_norm": 0.098874531686306, "learning_rate": 0.0002233288302769422, "loss": 0.37269628047943115, "memory(GiB)": 78.33, "step": 1913, "token_acc": 0.8919546544147023, "train_speed(iter/s)": 0.032764 }, { "epoch": 0.37087632611539023, "grad_norm": 0.10173005610704422, "learning_rate": 0.00022324495326410057, "loss": 0.37610965967178345, "memory(GiB)": 78.33, "step": 1914, "token_acc": 0.8908233494774407, "train_speed(iter/s)": 0.032765 }, { "epoch": 0.371070096400717, "grad_norm": 0.12058980017900467, "learning_rate": 0.00022316104616758848, "loss": 0.43350180983543396, "memory(GiB)": 78.33, "step": 1915, "token_acc": 0.8748074872645422, "train_speed(iter/s)": 0.032767 }, { "epoch": 0.3712638666860437, "grad_norm": 0.1139712780714035, "learning_rate": 0.0002230771090218688, "loss": 0.3701034486293793, "memory(GiB)": 78.33, "step": 1916, "token_acc": 0.8913546669752908, "train_speed(iter/s)": 0.032768 }, { "epoch": 0.37145763697137046, "grad_norm": 0.11131946742534637, "learning_rate": 0.00022299314186141676, "loss": 0.37333056330680847, "memory(GiB)": 78.33, "step": 1917, "token_acc": 0.8892822835185947, "train_speed(iter/s)": 0.032769 }, { "epoch": 0.3716514072566972, "grad_norm": 0.12218397855758667, "learning_rate": 0.00022290914472072, "loss": 0.4149776101112366, "memory(GiB)": 78.33, "step": 1918, "token_acc": 0.8801127922136988, "train_speed(iter/s)": 0.03277 }, { "epoch": 0.37184517754202395, "grad_norm": 0.10744242370128632, "learning_rate": 0.00022282511763427838, "loss": 0.3630349338054657, "memory(GiB)": 78.33, "step": 1919, "token_acc": 0.8921786701935794, "train_speed(iter/s)": 0.032772 }, { "epoch": 0.3720389478273507, "grad_norm": 0.10889404267072678, "learning_rate": 0.00022274106063660404, "loss": 0.3649406433105469, "memory(GiB)": 78.33, "step": 1920, "token_acc": 0.8912664400768435, "train_speed(iter/s)": 0.032773 }, { "epoch": 0.37223271811267744, "grad_norm": 0.11465780436992645, "learning_rate": 0.00022265697376222141, "loss": 0.3944970965385437, "memory(GiB)": 78.33, "step": 1921, "token_acc": 0.8845351473922902, "train_speed(iter/s)": 0.032774 }, { "epoch": 0.3724264883980042, "grad_norm": 0.11644507199525833, "learning_rate": 0.00022257285704566735, "loss": 0.41171303391456604, "memory(GiB)": 78.33, "step": 1922, "token_acc": 0.8799286944436051, "train_speed(iter/s)": 0.032775 }, { "epoch": 0.37262025868333093, "grad_norm": 0.10672671347856522, "learning_rate": 0.00022248871052149078, "loss": 0.37590137124061584, "memory(GiB)": 78.33, "step": 1923, "token_acc": 0.8882163998831992, "train_speed(iter/s)": 0.032777 }, { "epoch": 0.3728140289686577, "grad_norm": 0.11380830407142639, "learning_rate": 0.00022240453422425294, "loss": 0.41342946887016296, "memory(GiB)": 78.33, "step": 1924, "token_acc": 0.8813163889880054, "train_speed(iter/s)": 0.032778 }, { "epoch": 0.3730077992539844, "grad_norm": 0.10751762241125107, "learning_rate": 0.00022232032818852732, "loss": 0.37841910123825073, "memory(GiB)": 78.33, "step": 1925, "token_acc": 0.887600209372159, "train_speed(iter/s)": 0.032779 }, { "epoch": 0.37320156953931116, "grad_norm": 0.11165986955165863, "learning_rate": 0.0002222360924488996, "loss": 0.39904457330703735, "memory(GiB)": 78.33, "step": 1926, "token_acc": 0.8864683010230814, "train_speed(iter/s)": 0.03278 }, { "epoch": 0.3733953398246379, "grad_norm": 0.10432970523834229, "learning_rate": 0.00022215182703996765, "loss": 0.35338613390922546, "memory(GiB)": 78.33, "step": 1927, "token_acc": 0.8975270862625093, "train_speed(iter/s)": 0.032782 }, { "epoch": 0.37358911010996465, "grad_norm": 0.09983038157224655, "learning_rate": 0.00022206753199634148, "loss": 0.36258718371391296, "memory(GiB)": 78.33, "step": 1928, "token_acc": 0.8939695669934641, "train_speed(iter/s)": 0.032783 }, { "epoch": 0.3737828803952914, "grad_norm": 0.10431524366140366, "learning_rate": 0.00022198320735264344, "loss": 0.3931850492954254, "memory(GiB)": 78.33, "step": 1929, "token_acc": 0.8851347908067064, "train_speed(iter/s)": 0.032784 }, { "epoch": 0.37397665068061814, "grad_norm": 0.11230375617742538, "learning_rate": 0.00022189885314350787, "loss": 0.391851007938385, "memory(GiB)": 78.33, "step": 1930, "token_acc": 0.8859056476850633, "train_speed(iter/s)": 0.032785 }, { "epoch": 0.3741704209659449, "grad_norm": 0.1120762825012207, "learning_rate": 0.00022181446940358135, "loss": 0.41800612211227417, "memory(GiB)": 78.33, "step": 1931, "token_acc": 0.877407731234931, "train_speed(iter/s)": 0.032787 }, { "epoch": 0.3743641912512716, "grad_norm": 0.1048278734087944, "learning_rate": 0.00022173005616752252, "loss": 0.3735467791557312, "memory(GiB)": 78.33, "step": 1932, "token_acc": 0.8902271252433485, "train_speed(iter/s)": 0.032788 }, { "epoch": 0.37455796153659837, "grad_norm": 0.12189824134111404, "learning_rate": 0.00022164561347000212, "loss": 0.3999292254447937, "memory(GiB)": 78.33, "step": 1933, "token_acc": 0.8858239307268434, "train_speed(iter/s)": 0.032789 }, { "epoch": 0.3747517318219251, "grad_norm": 0.10540164262056351, "learning_rate": 0.00022156114134570305, "loss": 0.39689502120018005, "memory(GiB)": 78.33, "step": 1934, "token_acc": 0.8854811041222644, "train_speed(iter/s)": 0.03279 }, { "epoch": 0.37494550210725186, "grad_norm": 0.10333568602800369, "learning_rate": 0.00022147663982932038, "loss": 0.3748435080051422, "memory(GiB)": 78.33, "step": 1935, "token_acc": 0.8905851118145615, "train_speed(iter/s)": 0.032791 }, { "epoch": 0.3751392723925786, "grad_norm": 0.1070760041475296, "learning_rate": 0.00022139210895556104, "loss": 0.37235966324806213, "memory(GiB)": 78.33, "step": 1936, "token_acc": 0.8913205876656614, "train_speed(iter/s)": 0.032792 }, { "epoch": 0.37533304267790535, "grad_norm": 0.10744938999414444, "learning_rate": 0.00022130754875914415, "loss": 0.36817583441734314, "memory(GiB)": 78.33, "step": 1937, "token_acc": 0.8937919693024531, "train_speed(iter/s)": 0.032794 }, { "epoch": 0.3755268129632321, "grad_norm": 0.09824454039335251, "learning_rate": 0.0002212229592748009, "loss": 0.34159597754478455, "memory(GiB)": 78.33, "step": 1938, "token_acc": 0.8967411946420885, "train_speed(iter/s)": 0.032795 }, { "epoch": 0.37572058324855884, "grad_norm": 0.10819046944379807, "learning_rate": 0.00022113834053727444, "loss": 0.3992760181427002, "memory(GiB)": 78.33, "step": 1939, "token_acc": 0.8845097429519071, "train_speed(iter/s)": 0.032796 }, { "epoch": 0.3759143535338856, "grad_norm": 0.09654027968645096, "learning_rate": 0.00022105369258131998, "loss": 0.340084046125412, "memory(GiB)": 78.33, "step": 1940, "token_acc": 0.9001615206348463, "train_speed(iter/s)": 0.032797 }, { "epoch": 0.3761081238192123, "grad_norm": 0.09880183637142181, "learning_rate": 0.00022096901544170467, "loss": 0.36598512530326843, "memory(GiB)": 78.33, "step": 1941, "token_acc": 0.8919111291880625, "train_speed(iter/s)": 0.032798 }, { "epoch": 0.37630189410453907, "grad_norm": 0.10682184994220734, "learning_rate": 0.0002208843091532077, "loss": 0.39188694953918457, "memory(GiB)": 78.33, "step": 1942, "token_acc": 0.8864231527226587, "train_speed(iter/s)": 0.0328 }, { "epoch": 0.3764956643898658, "grad_norm": 0.11017350852489471, "learning_rate": 0.00022079957375062021, "loss": 0.40813326835632324, "memory(GiB)": 78.33, "step": 1943, "token_acc": 0.8828356812531744, "train_speed(iter/s)": 0.032801 }, { "epoch": 0.37668943467519256, "grad_norm": 0.11248282343149185, "learning_rate": 0.00022071480926874536, "loss": 0.41364431381225586, "memory(GiB)": 78.33, "step": 1944, "token_acc": 0.8817496143077539, "train_speed(iter/s)": 0.032802 }, { "epoch": 0.3768832049605193, "grad_norm": 0.09806734323501587, "learning_rate": 0.00022063001574239814, "loss": 0.36278462409973145, "memory(GiB)": 78.33, "step": 1945, "token_acc": 0.8927522524764797, "train_speed(iter/s)": 0.032803 }, { "epoch": 0.37707697524584605, "grad_norm": 0.10731010138988495, "learning_rate": 0.00022054519320640557, "loss": 0.39958709478378296, "memory(GiB)": 78.33, "step": 1946, "token_acc": 0.882800608828006, "train_speed(iter/s)": 0.032804 }, { "epoch": 0.3772707455311728, "grad_norm": 0.1078319326043129, "learning_rate": 0.0002204603416956065, "loss": 0.39028337597846985, "memory(GiB)": 78.33, "step": 1947, "token_acc": 0.8870954252738982, "train_speed(iter/s)": 0.032805 }, { "epoch": 0.37746451581649954, "grad_norm": 0.10992579162120819, "learning_rate": 0.00022037546124485178, "loss": 0.3915446102619171, "memory(GiB)": 78.33, "step": 1948, "token_acc": 0.8861552284559077, "train_speed(iter/s)": 0.032807 }, { "epoch": 0.3776582861018263, "grad_norm": 0.10269248485565186, "learning_rate": 0.00022029055188900405, "loss": 0.35439544916152954, "memory(GiB)": 78.33, "step": 1949, "token_acc": 0.894891822424076, "train_speed(iter/s)": 0.032808 }, { "epoch": 0.377852056387153, "grad_norm": 0.11859910935163498, "learning_rate": 0.00022020561366293789, "loss": 0.36357784271240234, "memory(GiB)": 78.33, "step": 1950, "token_acc": 0.8931177855959062, "train_speed(iter/s)": 0.032809 }, { "epoch": 0.37804582667247977, "grad_norm": 0.11494617909193039, "learning_rate": 0.0002201206466015397, "loss": 0.4254799485206604, "memory(GiB)": 78.33, "step": 1951, "token_acc": 0.8750034286968209, "train_speed(iter/s)": 0.03281 }, { "epoch": 0.3782395969578065, "grad_norm": 0.09963639825582504, "learning_rate": 0.00022003565073970774, "loss": 0.3571300506591797, "memory(GiB)": 78.33, "step": 1952, "token_acc": 0.8945331269019273, "train_speed(iter/s)": 0.032812 }, { "epoch": 0.37843336724313326, "grad_norm": 0.10854914784431458, "learning_rate": 0.0002199506261123521, "loss": 0.3844650089740753, "memory(GiB)": 78.33, "step": 1953, "token_acc": 0.8875416461849364, "train_speed(iter/s)": 0.032813 }, { "epoch": 0.37862713752846, "grad_norm": 0.11779830604791641, "learning_rate": 0.00021986557275439464, "loss": 0.40272995829582214, "memory(GiB)": 78.33, "step": 1954, "token_acc": 0.8847069242264903, "train_speed(iter/s)": 0.032814 }, { "epoch": 0.37882090781378674, "grad_norm": 0.09850434213876724, "learning_rate": 0.00021978049070076912, "loss": 0.3412163257598877, "memory(GiB)": 78.33, "step": 1955, "token_acc": 0.8973165531228873, "train_speed(iter/s)": 0.032816 }, { "epoch": 0.3790146780991135, "grad_norm": 0.10487658530473709, "learning_rate": 0.00021969537998642097, "loss": 0.37174132466316223, "memory(GiB)": 78.33, "step": 1956, "token_acc": 0.8908394592093765, "train_speed(iter/s)": 0.032817 }, { "epoch": 0.37920844838444023, "grad_norm": 0.10610850155353546, "learning_rate": 0.00021961024064630745, "loss": 0.3750998079776764, "memory(GiB)": 78.33, "step": 1957, "token_acc": 0.8877445652173913, "train_speed(iter/s)": 0.032818 }, { "epoch": 0.379402218669767, "grad_norm": 0.11280324310064316, "learning_rate": 0.00021952507271539762, "loss": 0.40227746963500977, "memory(GiB)": 78.33, "step": 1958, "token_acc": 0.8827386807356404, "train_speed(iter/s)": 0.032819 }, { "epoch": 0.3795959889550937, "grad_norm": 0.11210876703262329, "learning_rate": 0.00021943987622867223, "loss": 0.3853500187397003, "memory(GiB)": 78.33, "step": 1959, "token_acc": 0.8886800763470856, "train_speed(iter/s)": 0.03282 }, { "epoch": 0.37978975924042047, "grad_norm": 0.10147024691104889, "learning_rate": 0.00021935465122112377, "loss": 0.34372827410697937, "memory(GiB)": 78.33, "step": 1960, "token_acc": 0.8980276961812841, "train_speed(iter/s)": 0.032821 }, { "epoch": 0.3799835295257472, "grad_norm": 0.1177581325173378, "learning_rate": 0.00021926939772775637, "loss": 0.4162188172340393, "memory(GiB)": 78.33, "step": 1961, "token_acc": 0.8789121688698469, "train_speed(iter/s)": 0.032823 }, { "epoch": 0.38017729981107395, "grad_norm": 0.10330435633659363, "learning_rate": 0.00021918411578358601, "loss": 0.3811953067779541, "memory(GiB)": 78.33, "step": 1962, "token_acc": 0.8878933276780744, "train_speed(iter/s)": 0.032824 }, { "epoch": 0.3803710700964007, "grad_norm": 0.1050238385796547, "learning_rate": 0.0002190988054236402, "loss": 0.39693543314933777, "memory(GiB)": 78.33, "step": 1963, "token_acc": 0.8836019246925838, "train_speed(iter/s)": 0.032825 }, { "epoch": 0.38056484038172744, "grad_norm": 0.1056143268942833, "learning_rate": 0.0002190134666829583, "loss": 0.3753167390823364, "memory(GiB)": 78.33, "step": 1964, "token_acc": 0.8911448414921026, "train_speed(iter/s)": 0.032826 }, { "epoch": 0.3807586106670542, "grad_norm": 0.1035081148147583, "learning_rate": 0.0002189280995965912, "loss": 0.3400823771953583, "memory(GiB)": 78.33, "step": 1965, "token_acc": 0.8973672105419812, "train_speed(iter/s)": 0.032828 }, { "epoch": 0.38095238095238093, "grad_norm": 0.11154508590698242, "learning_rate": 0.00021884270419960137, "loss": 0.37742310762405396, "memory(GiB)": 78.33, "step": 1966, "token_acc": 0.8898611151892416, "train_speed(iter/s)": 0.032829 }, { "epoch": 0.3811461512377077, "grad_norm": 0.10220933705568314, "learning_rate": 0.00021875728052706304, "loss": 0.3489319086074829, "memory(GiB)": 78.33, "step": 1967, "token_acc": 0.8957410562180579, "train_speed(iter/s)": 0.03283 }, { "epoch": 0.3813399215230344, "grad_norm": 0.12063480913639069, "learning_rate": 0.00021867182861406206, "loss": 0.40946558117866516, "memory(GiB)": 78.33, "step": 1968, "token_acc": 0.8786677692548975, "train_speed(iter/s)": 0.032831 }, { "epoch": 0.38153369180836116, "grad_norm": 0.1056455671787262, "learning_rate": 0.00021858634849569576, "loss": 0.38398033380508423, "memory(GiB)": 78.33, "step": 1969, "token_acc": 0.8870220312216843, "train_speed(iter/s)": 0.032832 }, { "epoch": 0.3817274620936879, "grad_norm": 0.10487000644207001, "learning_rate": 0.00021850084020707316, "loss": 0.39127257466316223, "memory(GiB)": 78.33, "step": 1970, "token_acc": 0.8856672733165466, "train_speed(iter/s)": 0.032833 }, { "epoch": 0.38192123237901465, "grad_norm": 0.10356573760509491, "learning_rate": 0.0002184153037833148, "loss": 0.39030513167381287, "memory(GiB)": 78.33, "step": 1971, "token_acc": 0.8840334008097166, "train_speed(iter/s)": 0.032835 }, { "epoch": 0.3821150026643414, "grad_norm": 0.10742707550525665, "learning_rate": 0.0002183297392595528, "loss": 0.3961334824562073, "memory(GiB)": 78.33, "step": 1972, "token_acc": 0.8832247032291641, "train_speed(iter/s)": 0.032836 }, { "epoch": 0.38230877294966814, "grad_norm": 0.10838883370161057, "learning_rate": 0.00021824414667093075, "loss": 0.3667401075363159, "memory(GiB)": 78.33, "step": 1973, "token_acc": 0.8917944849551588, "train_speed(iter/s)": 0.032837 }, { "epoch": 0.38250254323499494, "grad_norm": 0.1165713295340538, "learning_rate": 0.00021815852605260386, "loss": 0.3943071663379669, "memory(GiB)": 78.33, "step": 1974, "token_acc": 0.8838169344539811, "train_speed(iter/s)": 0.032838 }, { "epoch": 0.3826963135203217, "grad_norm": 0.10990098863840103, "learning_rate": 0.0002180728774397389, "loss": 0.4022403061389923, "memory(GiB)": 78.33, "step": 1975, "token_acc": 0.8827039398397908, "train_speed(iter/s)": 0.032839 }, { "epoch": 0.38289008380564843, "grad_norm": 0.12068414688110352, "learning_rate": 0.00021798720086751395, "loss": 0.431316614151001, "memory(GiB)": 78.33, "step": 1976, "token_acc": 0.8732160741111668, "train_speed(iter/s)": 0.032841 }, { "epoch": 0.3830838540909752, "grad_norm": 0.09716933965682983, "learning_rate": 0.0002179014963711187, "loss": 0.35583171248435974, "memory(GiB)": 78.33, "step": 1977, "token_acc": 0.8953317742009447, "train_speed(iter/s)": 0.032842 }, { "epoch": 0.3832776243763019, "grad_norm": 0.11288797110319138, "learning_rate": 0.00021781576398575433, "loss": 0.392235666513443, "memory(GiB)": 78.33, "step": 1978, "token_acc": 0.885842526497945, "train_speed(iter/s)": 0.032843 }, { "epoch": 0.38347139466162866, "grad_norm": 0.10276081413030624, "learning_rate": 0.0002177300037466334, "loss": 0.37976735830307007, "memory(GiB)": 78.33, "step": 1979, "token_acc": 0.8875089992800576, "train_speed(iter/s)": 0.032844 }, { "epoch": 0.3836651649469554, "grad_norm": 0.11369860172271729, "learning_rate": 0.00021764421568897993, "loss": 0.3621234893798828, "memory(GiB)": 78.33, "step": 1980, "token_acc": 0.890562048175558, "train_speed(iter/s)": 0.032845 }, { "epoch": 0.38385893523228215, "grad_norm": 0.10219123214483261, "learning_rate": 0.00021755839984802944, "loss": 0.3366636335849762, "memory(GiB)": 78.33, "step": 1981, "token_acc": 0.900472891235016, "train_speed(iter/s)": 0.032847 }, { "epoch": 0.3840527055176089, "grad_norm": 0.10489702969789505, "learning_rate": 0.0002174725562590288, "loss": 0.37687474489212036, "memory(GiB)": 78.33, "step": 1982, "token_acc": 0.8895873939660143, "train_speed(iter/s)": 0.032848 }, { "epoch": 0.38424647580293564, "grad_norm": 0.09669921547174454, "learning_rate": 0.00021738668495723616, "loss": 0.3477326035499573, "memory(GiB)": 78.33, "step": 1983, "token_acc": 0.8987814906182253, "train_speed(iter/s)": 0.032849 }, { "epoch": 0.3844402460882624, "grad_norm": 0.11556685715913773, "learning_rate": 0.0002173007859779213, "loss": 0.4000082015991211, "memory(GiB)": 78.33, "step": 1984, "token_acc": 0.8822343958445688, "train_speed(iter/s)": 0.03285 }, { "epoch": 0.3846340163735891, "grad_norm": 0.10442518442869186, "learning_rate": 0.00021721485935636523, "loss": 0.3522685170173645, "memory(GiB)": 78.33, "step": 1985, "token_acc": 0.8955117718187637, "train_speed(iter/s)": 0.032851 }, { "epoch": 0.38482778665891587, "grad_norm": 0.11232437193393707, "learning_rate": 0.00021712890512786027, "loss": 0.39102703332901, "memory(GiB)": 78.33, "step": 1986, "token_acc": 0.8859773174722014, "train_speed(iter/s)": 0.032852 }, { "epoch": 0.3850215569442426, "grad_norm": 0.1146976500749588, "learning_rate": 0.00021704292332771013, "loss": 0.4080618619918823, "memory(GiB)": 78.33, "step": 1987, "token_acc": 0.8815463453091564, "train_speed(iter/s)": 0.032853 }, { "epoch": 0.38521532722956936, "grad_norm": 0.10926152765750885, "learning_rate": 0.00021695691399122987, "loss": 0.3856127858161926, "memory(GiB)": 78.33, "step": 1988, "token_acc": 0.8882630577068502, "train_speed(iter/s)": 0.032855 }, { "epoch": 0.3854090975148961, "grad_norm": 0.11085885018110275, "learning_rate": 0.00021687087715374585, "loss": 0.37670475244522095, "memory(GiB)": 78.33, "step": 1989, "token_acc": 0.8898128898128899, "train_speed(iter/s)": 0.032856 }, { "epoch": 0.38560286780022285, "grad_norm": 0.12100964039564133, "learning_rate": 0.00021678481285059567, "loss": 0.43839654326438904, "memory(GiB)": 78.33, "step": 1990, "token_acc": 0.8723092599668832, "train_speed(iter/s)": 0.032857 }, { "epoch": 0.3857966380855496, "grad_norm": 0.12123683094978333, "learning_rate": 0.00021669872111712828, "loss": 0.44955089688301086, "memory(GiB)": 78.33, "step": 1991, "token_acc": 0.871977240398293, "train_speed(iter/s)": 0.032858 }, { "epoch": 0.38599040837087634, "grad_norm": 0.11107171326875687, "learning_rate": 0.0002166126019887039, "loss": 0.4144379794597626, "memory(GiB)": 78.33, "step": 1992, "token_acc": 0.8786039878072653, "train_speed(iter/s)": 0.032859 }, { "epoch": 0.3861841786562031, "grad_norm": 0.10761483013629913, "learning_rate": 0.00021652645550069392, "loss": 0.3946702480316162, "memory(GiB)": 78.33, "step": 1993, "token_acc": 0.8829077659918314, "train_speed(iter/s)": 0.03286 }, { "epoch": 0.3863779489415298, "grad_norm": 0.10240019857883453, "learning_rate": 0.000216440281688481, "loss": 0.36723431944847107, "memory(GiB)": 78.33, "step": 1994, "token_acc": 0.8909840895698291, "train_speed(iter/s)": 0.032862 }, { "epoch": 0.38657171922685657, "grad_norm": 0.10443782061338425, "learning_rate": 0.00021635408058745908, "loss": 0.3627747893333435, "memory(GiB)": 78.33, "step": 1995, "token_acc": 0.8937532210377844, "train_speed(iter/s)": 0.032863 }, { "epoch": 0.3867654895121833, "grad_norm": 0.10117712616920471, "learning_rate": 0.00021626785223303327, "loss": 0.36782556772232056, "memory(GiB)": 78.33, "step": 1996, "token_acc": 0.8936070197430273, "train_speed(iter/s)": 0.032864 }, { "epoch": 0.38695925979751006, "grad_norm": 0.10845784842967987, "learning_rate": 0.00021618159666061983, "loss": 0.4153798818588257, "memory(GiB)": 78.33, "step": 1997, "token_acc": 0.8783169270276054, "train_speed(iter/s)": 0.032865 }, { "epoch": 0.3871530300828368, "grad_norm": 0.11158988624811172, "learning_rate": 0.00021609531390564635, "loss": 0.3699086904525757, "memory(GiB)": 78.33, "step": 1998, "token_acc": 0.8905605642615397, "train_speed(iter/s)": 0.032866 }, { "epoch": 0.38734680036816355, "grad_norm": 0.1030752882361412, "learning_rate": 0.0002160090040035513, "loss": 0.3778277039527893, "memory(GiB)": 78.33, "step": 1999, "token_acc": 0.8885486675143369, "train_speed(iter/s)": 0.032867 }, { "epoch": 0.3875405706534903, "grad_norm": 0.10194329917430878, "learning_rate": 0.00021592266698978462, "loss": 0.3767800033092499, "memory(GiB)": 78.33, "step": 2000, "token_acc": 0.8890578277836907, "train_speed(iter/s)": 0.032868 }, { "epoch": 0.3875405706534903, "eval_loss": 0.44034290313720703, "eval_runtime": 1345.0564, "eval_samples_per_second": 5.018, "eval_steps_per_second": 5.018, "eval_token_acc": 0.8890313859070251, "step": 2000 }, { "epoch": 0.38773434093881703, "grad_norm": 0.12286636233329773, "learning_rate": 0.00021583630289980724, "loss": 0.41180068254470825, "memory(GiB)": 78.33, "step": 2001, "token_acc": 0.8807242798353909, "train_speed(iter/s)": 0.032152 }, { "epoch": 0.3879281112241438, "grad_norm": 0.10262475907802582, "learning_rate": 0.00021574991176909113, "loss": 0.37820303440093994, "memory(GiB)": 78.33, "step": 2002, "token_acc": 0.8894734153940447, "train_speed(iter/s)": 0.032154 }, { "epoch": 0.3881218815094705, "grad_norm": 0.10098009556531906, "learning_rate": 0.00021566349363311949, "loss": 0.3635365962982178, "memory(GiB)": 78.33, "step": 2003, "token_acc": 0.8932263226068625, "train_speed(iter/s)": 0.032155 }, { "epoch": 0.38831565179479727, "grad_norm": 0.11192551255226135, "learning_rate": 0.00021557704852738654, "loss": 0.39867013692855835, "memory(GiB)": 78.33, "step": 2004, "token_acc": 0.8858497030607584, "train_speed(iter/s)": 0.032157 }, { "epoch": 0.388509422080124, "grad_norm": 0.0996481254696846, "learning_rate": 0.00021549057648739768, "loss": 0.3613511025905609, "memory(GiB)": 78.33, "step": 2005, "token_acc": 0.8933584214808787, "train_speed(iter/s)": 0.032158 }, { "epoch": 0.38870319236545076, "grad_norm": 0.12326376140117645, "learning_rate": 0.00021540407754866924, "loss": 0.4421766996383667, "memory(GiB)": 78.33, "step": 2006, "token_acc": 0.8737238044062332, "train_speed(iter/s)": 0.03216 }, { "epoch": 0.3888969626507775, "grad_norm": 0.09973074495792389, "learning_rate": 0.00021531755174672868, "loss": 0.34252458810806274, "memory(GiB)": 78.33, "step": 2007, "token_acc": 0.8975569907844689, "train_speed(iter/s)": 0.032161 }, { "epoch": 0.38909073293610424, "grad_norm": 0.09278019517660141, "learning_rate": 0.00021523099911711447, "loss": 0.3389296233654022, "memory(GiB)": 78.33, "step": 2008, "token_acc": 0.89790143300206, "train_speed(iter/s)": 0.032162 }, { "epoch": 0.389284503221431, "grad_norm": 0.11313237994909286, "learning_rate": 0.00021514441969537607, "loss": 0.43270501494407654, "memory(GiB)": 78.33, "step": 2009, "token_acc": 0.8762101038587737, "train_speed(iter/s)": 0.032164 }, { "epoch": 0.38947827350675773, "grad_norm": 0.10002917051315308, "learning_rate": 0.00021505781351707402, "loss": 0.3512948751449585, "memory(GiB)": 78.33, "step": 2010, "token_acc": 0.8962549078828148, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.3896720437920845, "grad_norm": 0.11693061888217926, "learning_rate": 0.0002149711806177798, "loss": 0.40658366680145264, "memory(GiB)": 78.33, "step": 2011, "token_acc": 0.8811087609929867, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.3898658140774112, "grad_norm": 0.10790550708770752, "learning_rate": 0.00021488452103307585, "loss": 0.41037797927856445, "memory(GiB)": 78.33, "step": 2012, "token_acc": 0.88296488946684, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.39005958436273797, "grad_norm": 0.11255151033401489, "learning_rate": 0.0002147978347985556, "loss": 0.37893742322921753, "memory(GiB)": 78.33, "step": 2013, "token_acc": 0.889498343046509, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.3902533546480647, "grad_norm": 0.10197418183088303, "learning_rate": 0.0002147111219498234, "loss": 0.3498515188694, "memory(GiB)": 78.33, "step": 2014, "token_acc": 0.8973286219081272, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.39044712493339145, "grad_norm": 0.09706109017133713, "learning_rate": 0.00021462438252249457, "loss": 0.340999037027359, "memory(GiB)": 78.33, "step": 2015, "token_acc": 0.896882369710006, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.3906408952187182, "grad_norm": 0.10438670963048935, "learning_rate": 0.00021453761655219528, "loss": 0.389445424079895, "memory(GiB)": 78.33, "step": 2016, "token_acc": 0.8829340326399674, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.39083466550404494, "grad_norm": 0.11134763062000275, "learning_rate": 0.00021445082407456272, "loss": 0.43515315651893616, "memory(GiB)": 78.33, "step": 2017, "token_acc": 0.8730318643677499, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.3910284357893717, "grad_norm": 0.09228318929672241, "learning_rate": 0.00021436400512524483, "loss": 0.33381155133247375, "memory(GiB)": 78.33, "step": 2018, "token_acc": 0.902964766542538, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.39122220607469843, "grad_norm": 0.10420046001672745, "learning_rate": 0.00021427715973990056, "loss": 0.3501606285572052, "memory(GiB)": 78.33, "step": 2019, "token_acc": 0.8966936364399899, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.3914159763600252, "grad_norm": 0.09859726577997208, "learning_rate": 0.00021419028795419953, "loss": 0.3734872341156006, "memory(GiB)": 78.33, "step": 2020, "token_acc": 0.8910265718019457, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.3916097466453519, "grad_norm": 0.11447544395923615, "learning_rate": 0.00021410338980382238, "loss": 0.38576456904411316, "memory(GiB)": 78.33, "step": 2021, "token_acc": 0.887767163988371, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.39180351693067866, "grad_norm": 0.10994785279035568, "learning_rate": 0.00021401646532446053, "loss": 0.40534737706184387, "memory(GiB)": 78.33, "step": 2022, "token_acc": 0.8793884919720366, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.3919972872160054, "grad_norm": 0.10715804994106293, "learning_rate": 0.00021392951455181619, "loss": 0.3926943242549896, "memory(GiB)": 78.33, "step": 2023, "token_acc": 0.8848688300385429, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.39219105750133215, "grad_norm": 0.10480938851833344, "learning_rate": 0.00021384253752160235, "loss": 0.3569415807723999, "memory(GiB)": 78.33, "step": 2024, "token_acc": 0.8943907463159563, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.3923848277866589, "grad_norm": 0.09940195083618164, "learning_rate": 0.00021375553426954285, "loss": 0.3529174029827118, "memory(GiB)": 78.33, "step": 2025, "token_acc": 0.8969288159844653, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.39257859807198564, "grad_norm": 0.11303528398275375, "learning_rate": 0.00021366850483137226, "loss": 0.40097349882125854, "memory(GiB)": 78.33, "step": 2026, "token_acc": 0.8815832710978342, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.3927723683573124, "grad_norm": 0.0959119200706482, "learning_rate": 0.00021358144924283584, "loss": 0.35317641496658325, "memory(GiB)": 78.33, "step": 2027, "token_acc": 0.8957997557997558, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.39296613864263913, "grad_norm": 0.11829999089241028, "learning_rate": 0.0002134943675396898, "loss": 0.38520050048828125, "memory(GiB)": 78.33, "step": 2028, "token_acc": 0.8890619591554171, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.3931599089279659, "grad_norm": 0.11327025294303894, "learning_rate": 0.0002134072597577008, "loss": 0.39436087012290955, "memory(GiB)": 78.33, "step": 2029, "token_acc": 0.8814638027048528, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.3933536792132926, "grad_norm": 0.09940643608570099, "learning_rate": 0.0002133201259326464, "loss": 0.3501843810081482, "memory(GiB)": 78.33, "step": 2030, "token_acc": 0.8952846160250895, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.39354744949861936, "grad_norm": 0.09761599451303482, "learning_rate": 0.0002132329661003148, "loss": 0.3546067178249359, "memory(GiB)": 78.33, "step": 2031, "token_acc": 0.8960130106943965, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.3937412197839461, "grad_norm": 0.10584171116352081, "learning_rate": 0.00021314578029650493, "loss": 0.39139533042907715, "memory(GiB)": 78.33, "step": 2032, "token_acc": 0.8860886829913964, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.39393499006927285, "grad_norm": 0.1090681329369545, "learning_rate": 0.00021305856855702624, "loss": 0.37363752722740173, "memory(GiB)": 78.33, "step": 2033, "token_acc": 0.8886917688801884, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.39412876035459965, "grad_norm": 0.10559553653001785, "learning_rate": 0.00021297133091769904, "loss": 0.36771360039711, "memory(GiB)": 78.33, "step": 2034, "token_acc": 0.8921826625386997, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.3943225306399264, "grad_norm": 0.10535501688718796, "learning_rate": 0.00021288406741435412, "loss": 0.36465299129486084, "memory(GiB)": 78.33, "step": 2035, "token_acc": 0.8928280358598207, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.39451630092525314, "grad_norm": 0.1058509349822998, "learning_rate": 0.0002127967780828329, "loss": 0.3727305829524994, "memory(GiB)": 78.33, "step": 2036, "token_acc": 0.8917116094237866, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.3947100712105799, "grad_norm": 0.10819867253303528, "learning_rate": 0.00021270946295898755, "loss": 0.36687490344047546, "memory(GiB)": 78.33, "step": 2037, "token_acc": 0.8913552022967728, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.3949038414959066, "grad_norm": 0.10262859612703323, "learning_rate": 0.0002126221220786807, "loss": 0.3651004731655121, "memory(GiB)": 78.33, "step": 2038, "token_acc": 0.892148337595908, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.39509761178123337, "grad_norm": 0.10185900330543518, "learning_rate": 0.0002125347554777856, "loss": 0.3557824492454529, "memory(GiB)": 78.33, "step": 2039, "token_acc": 0.8953365688963917, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.3952913820665601, "grad_norm": 0.09542527794837952, "learning_rate": 0.000212447363192186, "loss": 0.36669260263442993, "memory(GiB)": 78.33, "step": 2040, "token_acc": 0.8931179156718667, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.39548515235188686, "grad_norm": 0.10032487660646439, "learning_rate": 0.00021235994525777637, "loss": 0.3561350107192993, "memory(GiB)": 78.33, "step": 2041, "token_acc": 0.8934586347077768, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.3956789226372136, "grad_norm": 0.1082734614610672, "learning_rate": 0.0002122725017104615, "loss": 0.34290170669555664, "memory(GiB)": 78.33, "step": 2042, "token_acc": 0.8980645927333175, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.39587269292254035, "grad_norm": 0.10429167002439499, "learning_rate": 0.00021218503258615688, "loss": 0.38119545578956604, "memory(GiB)": 78.33, "step": 2043, "token_acc": 0.8884871605078859, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.3960664632078671, "grad_norm": 0.10330045223236084, "learning_rate": 0.00021209753792078836, "loss": 0.36754027009010315, "memory(GiB)": 78.33, "step": 2044, "token_acc": 0.8926996316089867, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.39626023349319384, "grad_norm": 0.10568582266569138, "learning_rate": 0.00021201001775029244, "loss": 0.37445148825645447, "memory(GiB)": 78.33, "step": 2045, "token_acc": 0.890682963949396, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.3964540037785206, "grad_norm": 0.11252961307764053, "learning_rate": 0.00021192247211061595, "loss": 0.40704840421676636, "memory(GiB)": 78.33, "step": 2046, "token_acc": 0.8822551520536559, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.3966477740638473, "grad_norm": 0.10527341812849045, "learning_rate": 0.0002118349010377162, "loss": 0.4128043055534363, "memory(GiB)": 78.33, "step": 2047, "token_acc": 0.8777996488801404, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.39684154434917407, "grad_norm": 0.1078762486577034, "learning_rate": 0.00021174730456756106, "loss": 0.3727009892463684, "memory(GiB)": 78.33, "step": 2048, "token_acc": 0.8923324669454176, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.3970353146345008, "grad_norm": 0.10819050669670105, "learning_rate": 0.00021165968273612875, "loss": 0.37524309754371643, "memory(GiB)": 78.33, "step": 2049, "token_acc": 0.8889318457969734, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.39722908491982756, "grad_norm": 0.10360529273748398, "learning_rate": 0.0002115720355794078, "loss": 0.37673884630203247, "memory(GiB)": 78.33, "step": 2050, "token_acc": 0.8891292318527089, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.3974228552051543, "grad_norm": 0.10623595118522644, "learning_rate": 0.00021148436313339739, "loss": 0.3622683882713318, "memory(GiB)": 78.33, "step": 2051, "token_acc": 0.8929465428046949, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.39761662549048105, "grad_norm": 0.1145068109035492, "learning_rate": 0.0002113966654341069, "loss": 0.37567758560180664, "memory(GiB)": 78.33, "step": 2052, "token_acc": 0.8903638151425762, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.3978103957758078, "grad_norm": 0.11284542083740234, "learning_rate": 0.00021130894251755608, "loss": 0.3825456202030182, "memory(GiB)": 78.33, "step": 2053, "token_acc": 0.8858395490519843, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.39800416606113453, "grad_norm": 0.11392559856176376, "learning_rate": 0.00021122119441977516, "loss": 0.38240864872932434, "memory(GiB)": 78.33, "step": 2054, "token_acc": 0.8883792048929664, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.3981979363464613, "grad_norm": 0.11121159791946411, "learning_rate": 0.00021113342117680463, "loss": 0.4033408761024475, "memory(GiB)": 78.33, "step": 2055, "token_acc": 0.8815422034050986, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.398391706631788, "grad_norm": 0.11232481151819229, "learning_rate": 0.00021104562282469523, "loss": 0.3962811529636383, "memory(GiB)": 78.33, "step": 2056, "token_acc": 0.8822988914577035, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.39858547691711477, "grad_norm": 0.1073499396443367, "learning_rate": 0.00021095779939950827, "loss": 0.36865997314453125, "memory(GiB)": 78.33, "step": 2057, "token_acc": 0.890293265087614, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.3987792472024415, "grad_norm": 0.10009787231683731, "learning_rate": 0.00021086995093731506, "loss": 0.36048176884651184, "memory(GiB)": 78.33, "step": 2058, "token_acc": 0.8924960221731766, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.39897301748776826, "grad_norm": 0.10518264770507812, "learning_rate": 0.00021078207747419737, "loss": 0.375562846660614, "memory(GiB)": 78.33, "step": 2059, "token_acc": 0.8906748031710573, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.399166787773095, "grad_norm": 0.11405156552791595, "learning_rate": 0.00021069417904624713, "loss": 0.39197370409965515, "memory(GiB)": 78.33, "step": 2060, "token_acc": 0.8833886430678466, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.39936055805842174, "grad_norm": 0.11229176819324493, "learning_rate": 0.00021060625568956672, "loss": 0.38142168521881104, "memory(GiB)": 78.33, "step": 2061, "token_acc": 0.8872789019385878, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.3995543283437485, "grad_norm": 0.10720710456371307, "learning_rate": 0.0002105183074402685, "loss": 0.3908834755420685, "memory(GiB)": 78.33, "step": 2062, "token_acc": 0.8867632578421719, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.39974809862907523, "grad_norm": 0.11731129139661789, "learning_rate": 0.00021043033433447523, "loss": 0.3812180757522583, "memory(GiB)": 78.33, "step": 2063, "token_acc": 0.8876133286235723, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.399941868914402, "grad_norm": 0.1061178669333458, "learning_rate": 0.00021034233640831985, "loss": 0.35581403970718384, "memory(GiB)": 78.33, "step": 2064, "token_acc": 0.8961584794538884, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.4001356391997287, "grad_norm": 0.12066232413053513, "learning_rate": 0.0002102543136979454, "loss": 0.407576322555542, "memory(GiB)": 78.33, "step": 2065, "token_acc": 0.8834347797630735, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.40032940948505547, "grad_norm": 0.10802853107452393, "learning_rate": 0.00021016626623950523, "loss": 0.3750949800014496, "memory(GiB)": 78.33, "step": 2066, "token_acc": 0.8900487525093204, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.4005231797703822, "grad_norm": 0.10871551930904388, "learning_rate": 0.00021007819406916283, "loss": 0.3510313928127289, "memory(GiB)": 78.33, "step": 2067, "token_acc": 0.8955925167910985, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.40071695005570895, "grad_norm": 0.10944053530693054, "learning_rate": 0.0002099900972230917, "loss": 0.3768633306026459, "memory(GiB)": 78.33, "step": 2068, "token_acc": 0.8907095472471034, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.4009107203410357, "grad_norm": 0.11056249588727951, "learning_rate": 0.0002099019757374757, "loss": 0.38695505261421204, "memory(GiB)": 78.33, "step": 2069, "token_acc": 0.8846898656182149, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.40110449062636244, "grad_norm": 0.10126353055238724, "learning_rate": 0.00020981382964850858, "loss": 0.3784712851047516, "memory(GiB)": 78.33, "step": 2070, "token_acc": 0.8864102038725541, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.4012982609116892, "grad_norm": 0.11109844595193863, "learning_rate": 0.00020972565899239441, "loss": 0.39917057752609253, "memory(GiB)": 78.33, "step": 2071, "token_acc": 0.884891448825875, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.40149203119701593, "grad_norm": 0.10675584524869919, "learning_rate": 0.0002096374638053472, "loss": 0.3992624580860138, "memory(GiB)": 78.33, "step": 2072, "token_acc": 0.8833933488667656, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.4016858014823427, "grad_norm": 0.09866248071193695, "learning_rate": 0.000209549244123591, "loss": 0.3732259273529053, "memory(GiB)": 78.33, "step": 2073, "token_acc": 0.891376350844958, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.4018795717676694, "grad_norm": 0.11382555216550827, "learning_rate": 0.00020946099998336019, "loss": 0.40181201696395874, "memory(GiB)": 78.33, "step": 2074, "token_acc": 0.8830451706345614, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.40207334205299616, "grad_norm": 0.09255445003509521, "learning_rate": 0.0002093727314208989, "loss": 0.317599356174469, "memory(GiB)": 78.33, "step": 2075, "token_acc": 0.9049172687019342, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.4022671123383229, "grad_norm": 0.12287932634353638, "learning_rate": 0.00020928443847246134, "loss": 0.41253185272216797, "memory(GiB)": 78.33, "step": 2076, "token_acc": 0.8792145844223732, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.40246088262364965, "grad_norm": 0.1004108339548111, "learning_rate": 0.0002091961211743119, "loss": 0.32616835832595825, "memory(GiB)": 78.33, "step": 2077, "token_acc": 0.9020430729022693, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.4026546529089764, "grad_norm": 0.10672204196453094, "learning_rate": 0.00020910777956272485, "loss": 0.37214604020118713, "memory(GiB)": 78.33, "step": 2078, "token_acc": 0.8916562179414111, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.40284842319430314, "grad_norm": 0.10667048394680023, "learning_rate": 0.00020901941367398446, "loss": 0.3868388533592224, "memory(GiB)": 78.33, "step": 2079, "token_acc": 0.886404833836858, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.4030421934796299, "grad_norm": 0.1087040826678276, "learning_rate": 0.0002089310235443849, "loss": 0.3963964879512787, "memory(GiB)": 78.33, "step": 2080, "token_acc": 0.8864254703328509, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.40323596376495663, "grad_norm": 0.10210565477609634, "learning_rate": 0.0002088426092102305, "loss": 0.38553428649902344, "memory(GiB)": 78.33, "step": 2081, "token_acc": 0.8849096532638776, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.4034297340502834, "grad_norm": 0.10996957868337631, "learning_rate": 0.0002087541707078353, "loss": 0.40852048993110657, "memory(GiB)": 78.33, "step": 2082, "token_acc": 0.8819868995633188, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.4036235043356101, "grad_norm": 0.1148100420832634, "learning_rate": 0.00020866570807352337, "loss": 0.40155094861984253, "memory(GiB)": 78.33, "step": 2083, "token_acc": 0.8844750224349387, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.40381727462093686, "grad_norm": 0.09997189044952393, "learning_rate": 0.0002085772213436288, "loss": 0.35030126571655273, "memory(GiB)": 78.33, "step": 2084, "token_acc": 0.8974117047048323, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.4040110449062636, "grad_norm": 0.10798052698373795, "learning_rate": 0.00020848871055449537, "loss": 0.3880666196346283, "memory(GiB)": 78.33, "step": 2085, "token_acc": 0.8854523021703774, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.40420481519159035, "grad_norm": 0.11830330640077591, "learning_rate": 0.00020840017574247683, "loss": 0.446191668510437, "memory(GiB)": 78.33, "step": 2086, "token_acc": 0.8699704224566103, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.4043985854769171, "grad_norm": 0.0969855859875679, "learning_rate": 0.00020831161694393683, "loss": 0.36765626072883606, "memory(GiB)": 78.33, "step": 2087, "token_acc": 0.892250186892599, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.40459235576224384, "grad_norm": 0.10226907581090927, "learning_rate": 0.00020822303419524893, "loss": 0.36062684655189514, "memory(GiB)": 78.33, "step": 2088, "token_acc": 0.893473640557773, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.4047861260475706, "grad_norm": 0.12443238496780396, "learning_rate": 0.0002081344275327963, "loss": 0.4231499135494232, "memory(GiB)": 78.33, "step": 2089, "token_acc": 0.8769953656024717, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.4049798963328973, "grad_norm": 0.11125290393829346, "learning_rate": 0.00020804579699297218, "loss": 0.38650333881378174, "memory(GiB)": 78.33, "step": 2090, "token_acc": 0.8851398261545724, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.40517366661822407, "grad_norm": 0.0958833247423172, "learning_rate": 0.00020795714261217949, "loss": 0.34758636355400085, "memory(GiB)": 78.33, "step": 2091, "token_acc": 0.8987275662330263, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.4053674369035508, "grad_norm": 0.10977054387331009, "learning_rate": 0.00020786846442683095, "loss": 0.3909391760826111, "memory(GiB)": 78.33, "step": 2092, "token_acc": 0.886636506003132, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.40556120718887756, "grad_norm": 0.10875298827886581, "learning_rate": 0.00020777976247334906, "loss": 0.3958970308303833, "memory(GiB)": 78.33, "step": 2093, "token_acc": 0.8842546456415968, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.4057549774742043, "grad_norm": 0.11530545353889465, "learning_rate": 0.00020769103678816616, "loss": 0.3912769556045532, "memory(GiB)": 78.33, "step": 2094, "token_acc": 0.8857514450867052, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.4059487477595311, "grad_norm": 0.11220169067382812, "learning_rate": 0.00020760228740772423, "loss": 0.39823243021965027, "memory(GiB)": 78.33, "step": 2095, "token_acc": 0.8826204060355064, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.40614251804485785, "grad_norm": 0.10903418809175491, "learning_rate": 0.00020751351436847497, "loss": 0.4057612121105194, "memory(GiB)": 78.33, "step": 2096, "token_acc": 0.883694474539545, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.4063362883301846, "grad_norm": 0.11058089882135391, "learning_rate": 0.00020742471770687998, "loss": 0.40834489464759827, "memory(GiB)": 78.33, "step": 2097, "token_acc": 0.8802559666034989, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.40653005861551134, "grad_norm": 0.10095040500164032, "learning_rate": 0.00020733589745941034, "loss": 0.3721862733364105, "memory(GiB)": 78.33, "step": 2098, "token_acc": 0.8911490351522632, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.4067238289008381, "grad_norm": 0.12033317983150482, "learning_rate": 0.00020724705366254693, "loss": 0.43703311681747437, "memory(GiB)": 78.33, "step": 2099, "token_acc": 0.874896265560166, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.4069175991861648, "grad_norm": 0.10075613856315613, "learning_rate": 0.0002071581863527803, "loss": 0.33554336428642273, "memory(GiB)": 78.33, "step": 2100, "token_acc": 0.900580013797685, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.40711136947149157, "grad_norm": 0.10525397956371307, "learning_rate": 0.00020706929556661068, "loss": 0.37453317642211914, "memory(GiB)": 78.33, "step": 2101, "token_acc": 0.8898639629661186, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.4073051397568183, "grad_norm": 0.10155075043439865, "learning_rate": 0.00020698038134054782, "loss": 0.38334396481513977, "memory(GiB)": 78.33, "step": 2102, "token_acc": 0.885774579297657, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.40749891004214506, "grad_norm": 0.11583290249109268, "learning_rate": 0.00020689144371111118, "loss": 0.40256598591804504, "memory(GiB)": 78.33, "step": 2103, "token_acc": 0.8832213494842434, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.4076926803274718, "grad_norm": 0.10212710499763489, "learning_rate": 0.00020680248271482993, "loss": 0.3900299668312073, "memory(GiB)": 78.33, "step": 2104, "token_acc": 0.8837178537411366, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.40788645061279855, "grad_norm": 0.1136767715215683, "learning_rate": 0.0002067134983882427, "loss": 0.40916168689727783, "memory(GiB)": 78.33, "step": 2105, "token_acc": 0.8802716981132076, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.4080802208981253, "grad_norm": 0.1174011304974556, "learning_rate": 0.00020662449076789768, "loss": 0.3446325957775116, "memory(GiB)": 78.33, "step": 2106, "token_acc": 0.8981224992305324, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.40827399118345203, "grad_norm": 0.10661471635103226, "learning_rate": 0.00020653545989035278, "loss": 0.3691784143447876, "memory(GiB)": 78.33, "step": 2107, "token_acc": 0.8892190390760778, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.4084677614687788, "grad_norm": 0.10399375855922699, "learning_rate": 0.00020644640579217533, "loss": 0.3859068751335144, "memory(GiB)": 78.33, "step": 2108, "token_acc": 0.8871030059500812, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.4086615317541055, "grad_norm": 0.10391335189342499, "learning_rate": 0.0002063573285099422, "loss": 0.3771337866783142, "memory(GiB)": 78.33, "step": 2109, "token_acc": 0.8897867506583363, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.40885530203943227, "grad_norm": 0.11487588286399841, "learning_rate": 0.00020626822808023993, "loss": 0.3902420699596405, "memory(GiB)": 78.33, "step": 2110, "token_acc": 0.8852734179451692, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.409049072324759, "grad_norm": 0.11222491413354874, "learning_rate": 0.00020617910453966438, "loss": 0.3725607693195343, "memory(GiB)": 78.33, "step": 2111, "token_acc": 0.8908452798762715, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.40924284261008576, "grad_norm": 0.10845697671175003, "learning_rate": 0.00020608995792482102, "loss": 0.36426377296447754, "memory(GiB)": 78.33, "step": 2112, "token_acc": 0.8920298507462686, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.4094366128954125, "grad_norm": 0.11745316535234451, "learning_rate": 0.00020600078827232469, "loss": 0.44262993335723877, "memory(GiB)": 78.33, "step": 2113, "token_acc": 0.8721979182380449, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.40963038318073924, "grad_norm": 0.10488930344581604, "learning_rate": 0.00020591159561879991, "loss": 0.3662855923175812, "memory(GiB)": 78.33, "step": 2114, "token_acc": 0.8902525438621355, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.409824153466066, "grad_norm": 0.10289178043603897, "learning_rate": 0.00020582238000088033, "loss": 0.37403732538223267, "memory(GiB)": 78.33, "step": 2115, "token_acc": 0.8886182232818493, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.41001792375139273, "grad_norm": 0.10993564128875732, "learning_rate": 0.0002057331414552093, "loss": 0.400392085313797, "memory(GiB)": 78.33, "step": 2116, "token_acc": 0.882896215297761, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.4102116940367195, "grad_norm": 0.10764973610639572, "learning_rate": 0.00020564388001843945, "loss": 0.3797903060913086, "memory(GiB)": 78.33, "step": 2117, "token_acc": 0.8884940778341793, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.4104054643220462, "grad_norm": 0.11092360317707062, "learning_rate": 0.00020555459572723294, "loss": 0.3895651698112488, "memory(GiB)": 78.33, "step": 2118, "token_acc": 0.8856181915590213, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.41059923460737296, "grad_norm": 0.11945401877164841, "learning_rate": 0.00020546528861826107, "loss": 0.40205317735671997, "memory(GiB)": 78.33, "step": 2119, "token_acc": 0.8840717802127998, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.4107930048926997, "grad_norm": 0.1141401007771492, "learning_rate": 0.0002053759587282048, "loss": 0.42232364416122437, "memory(GiB)": 78.33, "step": 2120, "token_acc": 0.8797556077277742, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.41098677517802645, "grad_norm": 0.10188201814889908, "learning_rate": 0.00020528660609375426, "loss": 0.36200839281082153, "memory(GiB)": 78.33, "step": 2121, "token_acc": 0.8925009859710407, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.4111805454633532, "grad_norm": 0.09827426820993423, "learning_rate": 0.000205197230751609, "loss": 0.3723769187927246, "memory(GiB)": 78.33, "step": 2122, "token_acc": 0.8900832517140059, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.41137431574867994, "grad_norm": 0.1049884557723999, "learning_rate": 0.00020510783273847778, "loss": 0.37664487957954407, "memory(GiB)": 78.33, "step": 2123, "token_acc": 0.8893548130258234, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.4115680860340067, "grad_norm": 0.10434847325086594, "learning_rate": 0.00020501841209107896, "loss": 0.3857555091381073, "memory(GiB)": 78.33, "step": 2124, "token_acc": 0.8891336270190896, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.41176185631933343, "grad_norm": 0.10164911299943924, "learning_rate": 0.00020492896884613987, "loss": 0.3675006628036499, "memory(GiB)": 78.33, "step": 2125, "token_acc": 0.8910501783714814, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.4119556266046602, "grad_norm": 0.10134454071521759, "learning_rate": 0.00020483950304039724, "loss": 0.3476465940475464, "memory(GiB)": 78.33, "step": 2126, "token_acc": 0.896970502258836, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.4121493968899869, "grad_norm": 0.10433205217123032, "learning_rate": 0.00020475001471059712, "loss": 0.3762246072292328, "memory(GiB)": 78.33, "step": 2127, "token_acc": 0.8882155392268263, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.41234316717531366, "grad_norm": 0.10764237493276596, "learning_rate": 0.0002046605038934948, "loss": 0.3889080286026001, "memory(GiB)": 78.33, "step": 2128, "token_acc": 0.8841791697411931, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.4125369374606404, "grad_norm": 0.10777831077575684, "learning_rate": 0.00020457097062585473, "loss": 0.38341259956359863, "memory(GiB)": 78.33, "step": 2129, "token_acc": 0.8884803921568627, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.41273070774596715, "grad_norm": 0.15417581796646118, "learning_rate": 0.00020448141494445066, "loss": 0.37628644704818726, "memory(GiB)": 78.33, "step": 2130, "token_acc": 0.8896055119537908, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.4129244780312939, "grad_norm": 0.1092265397310257, "learning_rate": 0.00020439183688606547, "loss": 0.367519348859787, "memory(GiB)": 78.33, "step": 2131, "token_acc": 0.8922348315610759, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.41311824831662064, "grad_norm": 0.10159304738044739, "learning_rate": 0.0002043022364874913, "loss": 0.33829817175865173, "memory(GiB)": 78.33, "step": 2132, "token_acc": 0.8995331695331695, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.4133120186019474, "grad_norm": 0.1057777926325798, "learning_rate": 0.00020421261378552948, "loss": 0.36473432183265686, "memory(GiB)": 78.33, "step": 2133, "token_acc": 0.8916222307734594, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.41350578888727413, "grad_norm": 0.10533451288938522, "learning_rate": 0.00020412296881699039, "loss": 0.3586021065711975, "memory(GiB)": 78.33, "step": 2134, "token_acc": 0.8953077991205739, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.4136995591726009, "grad_norm": 0.10619760304689407, "learning_rate": 0.00020403330161869373, "loss": 0.35481902956962585, "memory(GiB)": 78.33, "step": 2135, "token_acc": 0.8953304521977443, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.4138933294579276, "grad_norm": 0.11591517180204391, "learning_rate": 0.0002039436122274681, "loss": 0.40828073024749756, "memory(GiB)": 78.33, "step": 2136, "token_acc": 0.8834233806329039, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.41408709974325436, "grad_norm": 0.11356504261493683, "learning_rate": 0.00020385390068015146, "loss": 0.3991680443286896, "memory(GiB)": 78.33, "step": 2137, "token_acc": 0.8845186434119241, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.4142808700285811, "grad_norm": 0.11383282393217087, "learning_rate": 0.00020376416701359067, "loss": 0.41105103492736816, "memory(GiB)": 78.33, "step": 2138, "token_acc": 0.8808600337268128, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.41447464031390785, "grad_norm": 0.09987689554691315, "learning_rate": 0.00020367441126464177, "loss": 0.3558513820171356, "memory(GiB)": 78.33, "step": 2139, "token_acc": 0.8945576407506702, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.4146684105992346, "grad_norm": 0.11728887259960175, "learning_rate": 0.00020358463347016988, "loss": 0.41319242119789124, "memory(GiB)": 78.33, "step": 2140, "token_acc": 0.880601774015885, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.41486218088456134, "grad_norm": 0.10223786532878876, "learning_rate": 0.0002034948336670492, "loss": 0.3577488362789154, "memory(GiB)": 78.33, "step": 2141, "token_acc": 0.892794648592376, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.4150559511698881, "grad_norm": 0.10343615710735321, "learning_rate": 0.00020340501189216285, "loss": 0.36109933257102966, "memory(GiB)": 78.33, "step": 2142, "token_acc": 0.8946854716704895, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.4152497214552148, "grad_norm": 0.10884758830070496, "learning_rate": 0.000203315168182403, "loss": 0.38284680247306824, "memory(GiB)": 78.33, "step": 2143, "token_acc": 0.8858148616687751, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.41544349174054157, "grad_norm": 0.10763585567474365, "learning_rate": 0.00020322530257467104, "loss": 0.3719366788864136, "memory(GiB)": 78.33, "step": 2144, "token_acc": 0.8914604948124502, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.4156372620258683, "grad_norm": 0.10268665105104446, "learning_rate": 0.00020313541510587707, "loss": 0.3508341312408447, "memory(GiB)": 78.33, "step": 2145, "token_acc": 0.8950760245695297, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.41583103231119506, "grad_norm": 0.10579564422369003, "learning_rate": 0.00020304550581294026, "loss": 0.39762815833091736, "memory(GiB)": 78.33, "step": 2146, "token_acc": 0.8849706763321503, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.4160248025965218, "grad_norm": 0.09803339838981628, "learning_rate": 0.00020295557473278886, "loss": 0.33986738324165344, "memory(GiB)": 78.33, "step": 2147, "token_acc": 0.8999627421758569, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.41621857288184855, "grad_norm": 0.1106083020567894, "learning_rate": 0.00020286562190235998, "loss": 0.3605062961578369, "memory(GiB)": 78.33, "step": 2148, "token_acc": 0.8942120900468518, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.4164123431671753, "grad_norm": 0.11620527505874634, "learning_rate": 0.00020277564735859957, "loss": 0.42198172211647034, "memory(GiB)": 78.33, "step": 2149, "token_acc": 0.8788444418918531, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.41660611345250204, "grad_norm": 0.0997370034456253, "learning_rate": 0.0002026856511384627, "loss": 0.33660730719566345, "memory(GiB)": 78.33, "step": 2150, "token_acc": 0.9009054193173719, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.4167998837378288, "grad_norm": 0.10104697942733765, "learning_rate": 0.00020259563327891316, "loss": 0.3682084381580353, "memory(GiB)": 78.33, "step": 2151, "token_acc": 0.8905543542362886, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.4169936540231555, "grad_norm": 0.10348115861415863, "learning_rate": 0.00020250559381692373, "loss": 0.35385698080062866, "memory(GiB)": 78.33, "step": 2152, "token_acc": 0.8960956832554303, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.41718742430848227, "grad_norm": 0.10488869249820709, "learning_rate": 0.00020241553278947604, "loss": 0.36309656500816345, "memory(GiB)": 78.33, "step": 2153, "token_acc": 0.8921550221268607, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.417381194593809, "grad_norm": 0.11151447147130966, "learning_rate": 0.00020232545023356058, "loss": 0.37561148405075073, "memory(GiB)": 78.33, "step": 2154, "token_acc": 0.889549997111663, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.41757496487913576, "grad_norm": 0.11074910312891006, "learning_rate": 0.0002022353461861767, "loss": 0.3915635049343109, "memory(GiB)": 78.33, "step": 2155, "token_acc": 0.8837005402042029, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.41776873516446256, "grad_norm": 0.10543368011713028, "learning_rate": 0.00020214522068433247, "loss": 0.34513017535209656, "memory(GiB)": 78.33, "step": 2156, "token_acc": 0.8980891719745223, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.4179625054497893, "grad_norm": 0.0990806296467781, "learning_rate": 0.00020205507376504494, "loss": 0.34623822569847107, "memory(GiB)": 78.33, "step": 2157, "token_acc": 0.8984733024327651, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.41815627573511605, "grad_norm": 0.0954173356294632, "learning_rate": 0.00020196490546533987, "loss": 0.3334534168243408, "memory(GiB)": 78.33, "step": 2158, "token_acc": 0.9018397113381105, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.4183500460204428, "grad_norm": 0.10484931617975235, "learning_rate": 0.0002018747158222517, "loss": 0.33561134338378906, "memory(GiB)": 78.33, "step": 2159, "token_acc": 0.9003498385360603, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.41854381630576953, "grad_norm": 0.10665947943925858, "learning_rate": 0.00020178450487282385, "loss": 0.3735751509666443, "memory(GiB)": 78.33, "step": 2160, "token_acc": 0.892504140620826, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.4187375865910963, "grad_norm": 0.10096178948879242, "learning_rate": 0.00020169427265410837, "loss": 0.3531975746154785, "memory(GiB)": 78.33, "step": 2161, "token_acc": 0.8952592753308743, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.418931356876423, "grad_norm": 0.11227719485759735, "learning_rate": 0.00020160401920316597, "loss": 0.3715410828590393, "memory(GiB)": 78.33, "step": 2162, "token_acc": 0.8928775907477384, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.41912512716174977, "grad_norm": 0.10699176788330078, "learning_rate": 0.0002015137445570663, "loss": 0.3732505738735199, "memory(GiB)": 78.33, "step": 2163, "token_acc": 0.8906586310804994, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.4193188974470765, "grad_norm": 0.0976463332772255, "learning_rate": 0.0002014234487528875, "loss": 0.37260702252388, "memory(GiB)": 78.33, "step": 2164, "token_acc": 0.8918560748130923, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.41951266773240325, "grad_norm": 0.10294061154127121, "learning_rate": 0.00020133313182771646, "loss": 0.37537047266960144, "memory(GiB)": 78.33, "step": 2165, "token_acc": 0.8896890512075307, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.41970643801773, "grad_norm": 0.10132710635662079, "learning_rate": 0.00020124279381864883, "loss": 0.35512280464172363, "memory(GiB)": 78.33, "step": 2166, "token_acc": 0.8946123521681998, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.41990020830305674, "grad_norm": 0.1205829605460167, "learning_rate": 0.00020115243476278883, "loss": 0.4342115521430969, "memory(GiB)": 78.33, "step": 2167, "token_acc": 0.8729766390354182, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.4200939785883835, "grad_norm": 0.10426265001296997, "learning_rate": 0.00020106205469724937, "loss": 0.36752408742904663, "memory(GiB)": 78.33, "step": 2168, "token_acc": 0.8942200862982198, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.42028774887371023, "grad_norm": 0.10518850386142731, "learning_rate": 0.00020097165365915188, "loss": 0.36646583676338196, "memory(GiB)": 78.33, "step": 2169, "token_acc": 0.8950598184818482, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.420481519159037, "grad_norm": 0.097692109644413, "learning_rate": 0.00020088123168562663, "loss": 0.31673404574394226, "memory(GiB)": 78.33, "step": 2170, "token_acc": 0.9040827884466869, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.4206752894443637, "grad_norm": 0.10072191804647446, "learning_rate": 0.00020079078881381232, "loss": 0.3458814322948456, "memory(GiB)": 78.33, "step": 2171, "token_acc": 0.8972006834012354, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.42086905972969046, "grad_norm": 0.10148821771144867, "learning_rate": 0.00020070032508085617, "loss": 0.362411230802536, "memory(GiB)": 78.33, "step": 2172, "token_acc": 0.893191234333064, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.4210628300150172, "grad_norm": 0.09655621647834778, "learning_rate": 0.0002006098405239142, "loss": 0.36690768599510193, "memory(GiB)": 78.33, "step": 2173, "token_acc": 0.8898027945694182, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.42125660030034395, "grad_norm": 0.10610742121934891, "learning_rate": 0.00020051933518015077, "loss": 0.388561487197876, "memory(GiB)": 78.33, "step": 2174, "token_acc": 0.8857004153406333, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.4214503705856707, "grad_norm": 0.11180911213159561, "learning_rate": 0.00020042880908673888, "loss": 0.38997843861579895, "memory(GiB)": 78.33, "step": 2175, "token_acc": 0.8855060034305318, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.42164414087099744, "grad_norm": 0.11040447652339935, "learning_rate": 0.00020033826228085997, "loss": 0.3600209355354309, "memory(GiB)": 78.33, "step": 2176, "token_acc": 0.8954944743553415, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.4218379111563242, "grad_norm": 0.11115849018096924, "learning_rate": 0.0002002476947997042, "loss": 0.3961622714996338, "memory(GiB)": 78.33, "step": 2177, "token_acc": 0.8841036617111124, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.42203168144165093, "grad_norm": 0.10502826422452927, "learning_rate": 0.0002001571066804699, "loss": 0.38119229674339294, "memory(GiB)": 78.33, "step": 2178, "token_acc": 0.8862857463902466, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.4222254517269777, "grad_norm": 0.11460091918706894, "learning_rate": 0.00020006649796036412, "loss": 0.4165613353252411, "memory(GiB)": 78.33, "step": 2179, "token_acc": 0.8790522573127372, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.4224192220123044, "grad_norm": 0.09722710400819778, "learning_rate": 0.0001999758686766023, "loss": 0.3486678898334503, "memory(GiB)": 78.33, "step": 2180, "token_acc": 0.8953782314223028, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.42261299229763116, "grad_norm": 0.10160472989082336, "learning_rate": 0.0001998852188664083, "loss": 0.3687483072280884, "memory(GiB)": 78.33, "step": 2181, "token_acc": 0.8916645094486151, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.4228067625829579, "grad_norm": 0.11015883833169937, "learning_rate": 0.00019979454856701442, "loss": 0.36141785979270935, "memory(GiB)": 78.33, "step": 2182, "token_acc": 0.895195164429874, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.42300053286828465, "grad_norm": 0.10014893114566803, "learning_rate": 0.00019970385781566146, "loss": 0.3551277220249176, "memory(GiB)": 78.33, "step": 2183, "token_acc": 0.8938043448633031, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.4231943031536114, "grad_norm": 0.08740631490945816, "learning_rate": 0.00019961314664959849, "loss": 0.3233758211135864, "memory(GiB)": 78.33, "step": 2184, "token_acc": 0.9035294117647059, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.42338807343893814, "grad_norm": 0.10040253400802612, "learning_rate": 0.00019952241510608302, "loss": 0.34599989652633667, "memory(GiB)": 78.33, "step": 2185, "token_acc": 0.8977024740700782, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.4235818437242649, "grad_norm": 0.10159041732549667, "learning_rate": 0.00019943166322238095, "loss": 0.34493720531463623, "memory(GiB)": 78.33, "step": 2186, "token_acc": 0.8977762408895037, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.42377561400959163, "grad_norm": 0.11545497179031372, "learning_rate": 0.00019934089103576652, "loss": 0.3942737877368927, "memory(GiB)": 78.33, "step": 2187, "token_acc": 0.8845215505557061, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.42396938429491837, "grad_norm": 0.10246077924966812, "learning_rate": 0.00019925009858352233, "loss": 0.37554022669792175, "memory(GiB)": 78.33, "step": 2188, "token_acc": 0.8895408954606556, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.4241631545802451, "grad_norm": 0.12075801938772202, "learning_rate": 0.00019915928590293918, "loss": 0.40932536125183105, "memory(GiB)": 78.33, "step": 2189, "token_acc": 0.8816431763492609, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.42435692486557186, "grad_norm": 0.12044385075569153, "learning_rate": 0.00019906845303131643, "loss": 0.4146193861961365, "memory(GiB)": 78.33, "step": 2190, "token_acc": 0.8788535074955163, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.4245506951508986, "grad_norm": 0.10225894302129745, "learning_rate": 0.00019897760000596145, "loss": 0.3517759144306183, "memory(GiB)": 78.33, "step": 2191, "token_acc": 0.8963014924651073, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.42474446543622535, "grad_norm": 0.10850612074136734, "learning_rate": 0.00019888672686419005, "loss": 0.34386181831359863, "memory(GiB)": 78.33, "step": 2192, "token_acc": 0.898326711713018, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.4249382357215521, "grad_norm": 0.10484492033720016, "learning_rate": 0.0001987958336433263, "loss": 0.36306411027908325, "memory(GiB)": 78.33, "step": 2193, "token_acc": 0.8922604656216891, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.42513200600687884, "grad_norm": 0.09952452033758163, "learning_rate": 0.00019870492038070252, "loss": 0.3434096574783325, "memory(GiB)": 78.33, "step": 2194, "token_acc": 0.8980537301407457, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.4253257762922056, "grad_norm": 0.11556005477905273, "learning_rate": 0.00019861398711365917, "loss": 0.3726232945919037, "memory(GiB)": 78.33, "step": 2195, "token_acc": 0.890572481838881, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.4255195465775323, "grad_norm": 0.0965765044093132, "learning_rate": 0.00019852303387954496, "loss": 0.34968358278274536, "memory(GiB)": 78.33, "step": 2196, "token_acc": 0.8960782916458824, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.42571331686285907, "grad_norm": 0.11270653456449509, "learning_rate": 0.00019843206071571692, "loss": 0.37671494483947754, "memory(GiB)": 78.33, "step": 2197, "token_acc": 0.8884828552286984, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.4259070871481858, "grad_norm": 0.10049509257078171, "learning_rate": 0.0001983410676595401, "loss": 0.36629050970077515, "memory(GiB)": 78.33, "step": 2198, "token_acc": 0.893262368452885, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.42610085743351256, "grad_norm": 0.10621780902147293, "learning_rate": 0.0001982500547483878, "loss": 0.3855476677417755, "memory(GiB)": 78.33, "step": 2199, "token_acc": 0.8878023674729799, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.4262946277188393, "grad_norm": 0.10796981304883957, "learning_rate": 0.00019815902201964153, "loss": 0.34390783309936523, "memory(GiB)": 78.33, "step": 2200, "token_acc": 0.8990232532326988, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.42648839800416605, "grad_norm": 0.11975818127393723, "learning_rate": 0.00019806796951069087, "loss": 0.3976423442363739, "memory(GiB)": 78.33, "step": 2201, "token_acc": 0.885062679353572, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.4266821682894928, "grad_norm": 0.10373754054307938, "learning_rate": 0.00019797689725893337, "loss": 0.3727421462535858, "memory(GiB)": 78.33, "step": 2202, "token_acc": 0.8891960318280459, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.42687593857481954, "grad_norm": 0.10288981348276138, "learning_rate": 0.00019788580530177507, "loss": 0.35971593856811523, "memory(GiB)": 78.33, "step": 2203, "token_acc": 0.8920195439739413, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.4270697088601463, "grad_norm": 0.10778756439685822, "learning_rate": 0.0001977946936766298, "loss": 0.38950392603874207, "memory(GiB)": 78.33, "step": 2204, "token_acc": 0.8854846286340623, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.427263479145473, "grad_norm": 0.1137860119342804, "learning_rate": 0.0001977035624209195, "loss": 0.3468484580516815, "memory(GiB)": 78.33, "step": 2205, "token_acc": 0.8975911374468634, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.42745724943079977, "grad_norm": 0.11108643561601639, "learning_rate": 0.00019761241157207428, "loss": 0.3628512918949127, "memory(GiB)": 78.33, "step": 2206, "token_acc": 0.8934348239771646, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.4276510197161265, "grad_norm": 0.09750779718160629, "learning_rate": 0.00019752124116753224, "loss": 0.3502158522605896, "memory(GiB)": 78.33, "step": 2207, "token_acc": 0.8962943407094514, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.42784479000145326, "grad_norm": 0.09157220274209976, "learning_rate": 0.0001974300512447395, "loss": 0.33457863330841064, "memory(GiB)": 78.33, "step": 2208, "token_acc": 0.900385138196647, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.42803856028678, "grad_norm": 0.11325064301490784, "learning_rate": 0.0001973388418411502, "loss": 0.37056490778923035, "memory(GiB)": 78.33, "step": 2209, "token_acc": 0.8906734699794161, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.42823233057210675, "grad_norm": 0.11709318310022354, "learning_rate": 0.00019724761299422654, "loss": 0.40491345524787903, "memory(GiB)": 78.33, "step": 2210, "token_acc": 0.8828931933278018, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.4284261008574335, "grad_norm": 0.10216681659221649, "learning_rate": 0.00019715636474143864, "loss": 0.38466590642929077, "memory(GiB)": 78.33, "step": 2211, "token_acc": 0.8875497597803707, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.42861987114276023, "grad_norm": 0.1109510064125061, "learning_rate": 0.00019706509712026456, "loss": 0.38734912872314453, "memory(GiB)": 78.33, "step": 2212, "token_acc": 0.8852111060119321, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.428813641428087, "grad_norm": 0.10640700161457062, "learning_rate": 0.00019697381016819043, "loss": 0.3618239462375641, "memory(GiB)": 78.33, "step": 2213, "token_acc": 0.8929901199097012, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.4290074117134137, "grad_norm": 0.13030792772769928, "learning_rate": 0.00019688250392271026, "loss": 0.3900720775127411, "memory(GiB)": 78.33, "step": 2214, "token_acc": 0.8874647759294609, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.42920118199874047, "grad_norm": 0.09482412040233612, "learning_rate": 0.00019679117842132592, "loss": 0.35719773173332214, "memory(GiB)": 78.33, "step": 2215, "token_acc": 0.8954372137062723, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.4293949522840672, "grad_norm": 0.10285639762878418, "learning_rate": 0.00019669983370154722, "loss": 0.3917454481124878, "memory(GiB)": 78.33, "step": 2216, "token_acc": 0.8865480649188514, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.429588722569394, "grad_norm": 0.11368583887815475, "learning_rate": 0.000196608469800892, "loss": 0.42137694358825684, "memory(GiB)": 78.33, "step": 2217, "token_acc": 0.8776351492381549, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.42978249285472075, "grad_norm": 0.10096923261880875, "learning_rate": 0.0001965170867568858, "loss": 0.3742446005344391, "memory(GiB)": 78.33, "step": 2218, "token_acc": 0.8895205325753609, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.4299762631400475, "grad_norm": 0.10487914085388184, "learning_rate": 0.00019642568460706214, "loss": 0.36482223868370056, "memory(GiB)": 78.33, "step": 2219, "token_acc": 0.8928659320226409, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.43017003342537424, "grad_norm": 0.2587502598762512, "learning_rate": 0.00019633426338896227, "loss": 0.36105775833129883, "memory(GiB)": 78.33, "step": 2220, "token_acc": 0.8932460356156114, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.430363803710701, "grad_norm": 0.10027684271335602, "learning_rate": 0.0001962428231401354, "loss": 0.3383048176765442, "memory(GiB)": 78.33, "step": 2221, "token_acc": 0.8997930908043268, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.43055757399602773, "grad_norm": 0.11608999222517014, "learning_rate": 0.00019615136389813847, "loss": 0.38931986689567566, "memory(GiB)": 78.33, "step": 2222, "token_acc": 0.8857191859135605, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.4307513442813545, "grad_norm": 0.10588336735963821, "learning_rate": 0.00019605988570053622, "loss": 0.354093462228775, "memory(GiB)": 78.33, "step": 2223, "token_acc": 0.8957208040448884, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.4309451145666812, "grad_norm": 0.16060945391654968, "learning_rate": 0.0001959683885849013, "loss": 0.3510138690471649, "memory(GiB)": 78.33, "step": 2224, "token_acc": 0.8974735682241519, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.43113888485200796, "grad_norm": 0.1098427101969719, "learning_rate": 0.00019587687258881391, "loss": 0.3766689896583557, "memory(GiB)": 78.33, "step": 2225, "token_acc": 0.8884950490521164, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.4313326551373347, "grad_norm": 0.10366590321063995, "learning_rate": 0.00019578533774986217, "loss": 0.3672398626804352, "memory(GiB)": 78.33, "step": 2226, "token_acc": 0.8924895345973898, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.43152642542266145, "grad_norm": 0.11767855286598206, "learning_rate": 0.00019569378410564197, "loss": 0.3826836347579956, "memory(GiB)": 78.33, "step": 2227, "token_acc": 0.8886111359104841, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.4317201957079882, "grad_norm": 0.0977967232465744, "learning_rate": 0.0001956022116937568, "loss": 0.3510884940624237, "memory(GiB)": 78.33, "step": 2228, "token_acc": 0.8963877167735799, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.43191396599331494, "grad_norm": 0.09899823367595673, "learning_rate": 0.00019551062055181786, "loss": 0.3398961126804352, "memory(GiB)": 78.33, "step": 2229, "token_acc": 0.8998167084699981, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.4321077362786417, "grad_norm": 0.1056177020072937, "learning_rate": 0.0001954190107174442, "loss": 0.38561806082725525, "memory(GiB)": 78.33, "step": 2230, "token_acc": 0.8863317429132753, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.43230150656396843, "grad_norm": 0.09486104547977448, "learning_rate": 0.00019532738222826233, "loss": 0.3347775936126709, "memory(GiB)": 78.33, "step": 2231, "token_acc": 0.9010725841379837, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.4324952768492952, "grad_norm": 0.1015876904129982, "learning_rate": 0.0001952357351219066, "loss": 0.3641873002052307, "memory(GiB)": 78.33, "step": 2232, "token_acc": 0.8932900972811175, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.4326890471346219, "grad_norm": 0.10654337704181671, "learning_rate": 0.00019514406943601896, "loss": 0.37854257225990295, "memory(GiB)": 78.33, "step": 2233, "token_acc": 0.8883228206372311, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.43288281741994866, "grad_norm": 0.09729617834091187, "learning_rate": 0.00019505238520824893, "loss": 0.342031866312027, "memory(GiB)": 78.33, "step": 2234, "token_acc": 0.8982541675900618, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.4330765877052754, "grad_norm": 0.11875928193330765, "learning_rate": 0.00019496068247625361, "loss": 0.41712290048599243, "memory(GiB)": 78.33, "step": 2235, "token_acc": 0.8765791742219315, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.43327035799060215, "grad_norm": 0.1124706119298935, "learning_rate": 0.00019486896127769794, "loss": 0.3886880576610565, "memory(GiB)": 78.33, "step": 2236, "token_acc": 0.8862263618402484, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.4334641282759289, "grad_norm": 0.10713918507099152, "learning_rate": 0.00019477722165025418, "loss": 0.3783111572265625, "memory(GiB)": 78.33, "step": 2237, "token_acc": 0.8903876189123953, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.43365789856125564, "grad_norm": 0.09784865379333496, "learning_rate": 0.00019468546363160224, "loss": 0.3712804913520813, "memory(GiB)": 78.33, "step": 2238, "token_acc": 0.8909272229695255, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.4338516688465824, "grad_norm": 0.11485416442155838, "learning_rate": 0.0001945936872594297, "loss": 0.40341106057167053, "memory(GiB)": 78.33, "step": 2239, "token_acc": 0.8822438849351115, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.4340454391319091, "grad_norm": 0.10090679675340652, "learning_rate": 0.00019450189257143148, "loss": 0.3727151155471802, "memory(GiB)": 78.33, "step": 2240, "token_acc": 0.8921047498293182, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.43423920941723587, "grad_norm": 0.11711379885673523, "learning_rate": 0.0001944100796053102, "loss": 0.38696539402008057, "memory(GiB)": 78.33, "step": 2241, "token_acc": 0.8871316037592384, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.4344329797025626, "grad_norm": 0.12041328847408295, "learning_rate": 0.00019431824839877582, "loss": 0.41463562846183777, "memory(GiB)": 78.33, "step": 2242, "token_acc": 0.881887840766859, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.43462674998788936, "grad_norm": 0.1023503914475441, "learning_rate": 0.00019422639898954603, "loss": 0.3793856203556061, "memory(GiB)": 78.33, "step": 2243, "token_acc": 0.887026578933734, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.4348205202732161, "grad_norm": 0.12375747412443161, "learning_rate": 0.00019413453141534575, "loss": 0.41560643911361694, "memory(GiB)": 78.33, "step": 2244, "token_acc": 0.878722458568798, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.43501429055854285, "grad_norm": 0.09916850179433823, "learning_rate": 0.00019404264571390743, "loss": 0.360500693321228, "memory(GiB)": 78.33, "step": 2245, "token_acc": 0.8931444381820977, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.4352080608438696, "grad_norm": 0.10854557156562805, "learning_rate": 0.00019395074192297106, "loss": 0.39239490032196045, "memory(GiB)": 78.33, "step": 2246, "token_acc": 0.8853895766150005, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.43540183112919634, "grad_norm": 0.10064675658941269, "learning_rate": 0.000193858820080284, "loss": 0.3592490553855896, "memory(GiB)": 78.33, "step": 2247, "token_acc": 0.8954478346456692, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.4355956014145231, "grad_norm": 0.11303189396858215, "learning_rate": 0.00019376688022360099, "loss": 0.35404136776924133, "memory(GiB)": 78.33, "step": 2248, "token_acc": 0.8956532831584654, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.4357893716998498, "grad_norm": 0.1282435804605484, "learning_rate": 0.00019367492239068417, "loss": 0.37942934036254883, "memory(GiB)": 78.33, "step": 2249, "token_acc": 0.889295596034125, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.43598314198517657, "grad_norm": 0.1114896610379219, "learning_rate": 0.0001935829466193032, "loss": 0.3755391538143158, "memory(GiB)": 78.33, "step": 2250, "token_acc": 0.8880601168939605, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.4361769122705033, "grad_norm": 0.10226267576217651, "learning_rate": 0.00019349095294723487, "loss": 0.37153515219688416, "memory(GiB)": 78.33, "step": 2251, "token_acc": 0.8909354937328035, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.43637068255583006, "grad_norm": 0.10146588832139969, "learning_rate": 0.00019339894141226355, "loss": 0.36199501156806946, "memory(GiB)": 78.33, "step": 2252, "token_acc": 0.8924786237263836, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.4365644528411568, "grad_norm": 0.11089852452278137, "learning_rate": 0.00019330691205218082, "loss": 0.38684314489364624, "memory(GiB)": 78.33, "step": 2253, "token_acc": 0.8873638079006635, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.43675822312648355, "grad_norm": 0.10390076041221619, "learning_rate": 0.00019321486490478563, "loss": 0.37561658024787903, "memory(GiB)": 78.33, "step": 2254, "token_acc": 0.8915121559575482, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.4369519934118103, "grad_norm": 0.09438452124595642, "learning_rate": 0.00019312280000788416, "loss": 0.34584975242614746, "memory(GiB)": 78.33, "step": 2255, "token_acc": 0.8961015827777495, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.43714576369713704, "grad_norm": 0.09714444726705551, "learning_rate": 0.00019303071739928997, "loss": 0.3477434515953064, "memory(GiB)": 78.33, "step": 2256, "token_acc": 0.8965367189699747, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.4373395339824638, "grad_norm": 0.10147438943386078, "learning_rate": 0.00019293861711682393, "loss": 0.352212131023407, "memory(GiB)": 78.33, "step": 2257, "token_acc": 0.895200889829359, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.4375333042677905, "grad_norm": 0.10559714585542679, "learning_rate": 0.00019284649919831394, "loss": 0.37872010469436646, "memory(GiB)": 78.33, "step": 2258, "token_acc": 0.8881790060998526, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.43772707455311727, "grad_norm": 0.10056591033935547, "learning_rate": 0.00019275436368159548, "loss": 0.3722417652606964, "memory(GiB)": 78.33, "step": 2259, "token_acc": 0.8918299445471349, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.437920844838444, "grad_norm": 0.10316530615091324, "learning_rate": 0.00019266221060451096, "loss": 0.3741195797920227, "memory(GiB)": 78.33, "step": 2260, "token_acc": 0.8905550295555158, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.43811461512377076, "grad_norm": 0.1018475815653801, "learning_rate": 0.00019257004000491017, "loss": 0.36181291937828064, "memory(GiB)": 78.33, "step": 2261, "token_acc": 0.8936356535939978, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.4383083854090975, "grad_norm": 0.10764992982149124, "learning_rate": 0.00019247785192065003, "loss": 0.39701730012893677, "memory(GiB)": 78.33, "step": 2262, "token_acc": 0.8833886405397252, "train_speed(iter/s)": 0.032483 }, { "epoch": 0.43850215569442424, "grad_norm": 0.09933136403560638, "learning_rate": 0.00019238564638959473, "loss": 0.331234335899353, "memory(GiB)": 78.33, "step": 2263, "token_acc": 0.901470844428959, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.438695925979751, "grad_norm": 0.10954437404870987, "learning_rate": 0.00019229342344961547, "loss": 0.38815680146217346, "memory(GiB)": 78.33, "step": 2264, "token_acc": 0.8864250798765668, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.43888969626507773, "grad_norm": 0.09880734980106354, "learning_rate": 0.00019220118313859074, "loss": 0.3450104892253876, "memory(GiB)": 78.33, "step": 2265, "token_acc": 0.8972709063595393, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.4390834665504045, "grad_norm": 0.09824536740779877, "learning_rate": 0.0001921089254944061, "loss": 0.3457093834877014, "memory(GiB)": 78.33, "step": 2266, "token_acc": 0.898, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.4392772368357312, "grad_norm": 0.11076337844133377, "learning_rate": 0.00019201665055495427, "loss": 0.4086833894252777, "memory(GiB)": 78.33, "step": 2267, "token_acc": 0.8818113491168733, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.43947100712105797, "grad_norm": 0.10491563379764557, "learning_rate": 0.00019192435835813502, "loss": 0.36850589513778687, "memory(GiB)": 78.33, "step": 2268, "token_acc": 0.8915903890160183, "train_speed(iter/s)": 0.032489 }, { "epoch": 0.4396647774063847, "grad_norm": 0.10932449251413345, "learning_rate": 0.00019183204894185522, "loss": 0.3876577317714691, "memory(GiB)": 78.33, "step": 2269, "token_acc": 0.8863226177709189, "train_speed(iter/s)": 0.03249 }, { "epoch": 0.43985854769171145, "grad_norm": 0.11134737730026245, "learning_rate": 0.00019173972234402887, "loss": 0.40485280752182007, "memory(GiB)": 78.33, "step": 2270, "token_acc": 0.8811312687068596, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.4400523179770382, "grad_norm": 0.1066315695643425, "learning_rate": 0.00019164737860257692, "loss": 0.38198620080947876, "memory(GiB)": 78.33, "step": 2271, "token_acc": 0.8884394005212859, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.44024608826236494, "grad_norm": 0.10852167755365372, "learning_rate": 0.00019155501775542752, "loss": 0.3655702471733093, "memory(GiB)": 78.33, "step": 2272, "token_acc": 0.8904522077437578, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.4404398585476917, "grad_norm": 0.10043393820524216, "learning_rate": 0.00019146263984051574, "loss": 0.3396553099155426, "memory(GiB)": 78.33, "step": 2273, "token_acc": 0.8992605233219567, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.44063362883301843, "grad_norm": 0.09665820002555847, "learning_rate": 0.00019137024489578354, "loss": 0.3522893190383911, "memory(GiB)": 78.33, "step": 2274, "token_acc": 0.8970611596505162, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.4408273991183452, "grad_norm": 0.10279867798089981, "learning_rate": 0.00019127783295918015, "loss": 0.38115227222442627, "memory(GiB)": 78.33, "step": 2275, "token_acc": 0.8877062684693637, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.4410211694036719, "grad_norm": 0.10720198601484299, "learning_rate": 0.00019118540406866158, "loss": 0.3606134355068207, "memory(GiB)": 78.33, "step": 2276, "token_acc": 0.8946840521564694, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.4412149396889987, "grad_norm": 0.10232347249984741, "learning_rate": 0.00019109295826219086, "loss": 0.3502247929573059, "memory(GiB)": 78.33, "step": 2277, "token_acc": 0.8963676036452317, "train_speed(iter/s)": 0.0325 }, { "epoch": 0.44140870997432546, "grad_norm": 0.10972965508699417, "learning_rate": 0.00019100049557773798, "loss": 0.3618561327457428, "memory(GiB)": 78.33, "step": 2278, "token_acc": 0.8917501192179303, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.4416024802596522, "grad_norm": 0.10983320325613022, "learning_rate": 0.00019090801605327982, "loss": 0.37896856665611267, "memory(GiB)": 78.33, "step": 2279, "token_acc": 0.8861007817761251, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.44179625054497895, "grad_norm": 0.10652820765972137, "learning_rate": 0.00019081551972680025, "loss": 0.3781920075416565, "memory(GiB)": 78.33, "step": 2280, "token_acc": 0.8892166502785972, "train_speed(iter/s)": 0.032503 }, { "epoch": 0.4419900208303057, "grad_norm": 0.10808536410331726, "learning_rate": 0.00019072300663628997, "loss": 0.3892320990562439, "memory(GiB)": 78.33, "step": 2281, "token_acc": 0.8866169049621531, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.44218379111563244, "grad_norm": 0.09705408662557602, "learning_rate": 0.00019063047681974656, "loss": 0.33386996388435364, "memory(GiB)": 78.33, "step": 2282, "token_acc": 0.9006132756132756, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.4423775614009592, "grad_norm": 0.10078884661197662, "learning_rate": 0.0001905379303151746, "loss": 0.33445224165916443, "memory(GiB)": 78.33, "step": 2283, "token_acc": 0.8993649371524941, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.44257133168628593, "grad_norm": 0.11122056096792221, "learning_rate": 0.0001904453671605853, "loss": 0.37664756178855896, "memory(GiB)": 78.33, "step": 2284, "token_acc": 0.888987135970792, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.4427651019716127, "grad_norm": 0.1016409620642662, "learning_rate": 0.00019035278739399692, "loss": 0.35757166147232056, "memory(GiB)": 78.33, "step": 2285, "token_acc": 0.8957771559399644, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.4429588722569394, "grad_norm": 0.09880708903074265, "learning_rate": 0.00019026019105343445, "loss": 0.3487710952758789, "memory(GiB)": 78.33, "step": 2286, "token_acc": 0.8957860078103066, "train_speed(iter/s)": 0.03251 }, { "epoch": 0.44315264254226616, "grad_norm": 0.1038837805390358, "learning_rate": 0.00019016757817692966, "loss": 0.35564103722572327, "memory(GiB)": 78.33, "step": 2287, "token_acc": 0.893324717649042, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.4433464128275929, "grad_norm": 0.11149082332849503, "learning_rate": 0.0001900749488025212, "loss": 0.3836996853351593, "memory(GiB)": 78.33, "step": 2288, "token_acc": 0.8873978009585566, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.44354018311291965, "grad_norm": 0.10695455968379974, "learning_rate": 0.00018998230296825438, "loss": 0.3890923857688904, "memory(GiB)": 78.33, "step": 2289, "token_acc": 0.8848062223414241, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.4437339533982464, "grad_norm": 0.10037211328744888, "learning_rate": 0.00018988964071218136, "loss": 0.34572556614875793, "memory(GiB)": 78.33, "step": 2290, "token_acc": 0.8985781478077526, "train_speed(iter/s)": 0.032514 }, { "epoch": 0.44392772368357314, "grad_norm": 0.09722217172384262, "learning_rate": 0.000189796962072361, "loss": 0.3216269314289093, "memory(GiB)": 78.33, "step": 2291, "token_acc": 0.906508290027221, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.4441214939688999, "grad_norm": 0.1133374273777008, "learning_rate": 0.000189704267086859, "loss": 0.3567155599594116, "memory(GiB)": 78.33, "step": 2292, "token_acc": 0.8946020433729573, "train_speed(iter/s)": 0.032516 }, { "epoch": 0.4443152642542266, "grad_norm": 0.10639848560094833, "learning_rate": 0.00018961155579374757, "loss": 0.3713446855545044, "memory(GiB)": 78.33, "step": 2293, "token_acc": 0.8898174505758534, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.44450903453955337, "grad_norm": 0.1109897717833519, "learning_rate": 0.0001895188282311058, "loss": 0.3658873438835144, "memory(GiB)": 78.33, "step": 2294, "token_acc": 0.8911519867265497, "train_speed(iter/s)": 0.032519 }, { "epoch": 0.4447028048248801, "grad_norm": 0.1037331074476242, "learning_rate": 0.00018942608443701936, "loss": 0.34084218740463257, "memory(GiB)": 78.33, "step": 2295, "token_acc": 0.898319518564778, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.44489657511020686, "grad_norm": 0.11509410291910172, "learning_rate": 0.00018933332444958062, "loss": 0.38112330436706543, "memory(GiB)": 78.33, "step": 2296, "token_acc": 0.8888923143323982, "train_speed(iter/s)": 0.032521 }, { "epoch": 0.4450903453955336, "grad_norm": 0.09891083091497421, "learning_rate": 0.00018924054830688858, "loss": 0.33499619364738464, "memory(GiB)": 78.33, "step": 2297, "token_acc": 0.9014303407076423, "train_speed(iter/s)": 0.032522 }, { "epoch": 0.44528411568086035, "grad_norm": 0.11420368403196335, "learning_rate": 0.0001891477560470489, "loss": 0.3879055976867676, "memory(GiB)": 78.33, "step": 2298, "token_acc": 0.8863246853022871, "train_speed(iter/s)": 0.032523 }, { "epoch": 0.4454778859661871, "grad_norm": 0.10501628369092941, "learning_rate": 0.0001890549477081739, "loss": 0.3551484942436218, "memory(GiB)": 78.33, "step": 2299, "token_acc": 0.8968862632448985, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.44567165625151384, "grad_norm": 0.10620047897100449, "learning_rate": 0.00018896212332838243, "loss": 0.36860981583595276, "memory(GiB)": 78.33, "step": 2300, "token_acc": 0.8929374124117507, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.4458654265368406, "grad_norm": 0.10044432431459427, "learning_rate": 0.0001888692829458, "loss": 0.3472827672958374, "memory(GiB)": 78.33, "step": 2301, "token_acc": 0.8963743001866169, "train_speed(iter/s)": 0.032526 }, { "epoch": 0.4460591968221673, "grad_norm": 0.13254094123840332, "learning_rate": 0.00018877642659855852, "loss": 0.4520479440689087, "memory(GiB)": 78.33, "step": 2302, "token_acc": 0.8687972508591065, "train_speed(iter/s)": 0.032527 }, { "epoch": 0.44625296710749407, "grad_norm": 0.10516183078289032, "learning_rate": 0.00018868355432479674, "loss": 0.37986326217651367, "memory(GiB)": 78.33, "step": 2303, "token_acc": 0.8892758400680562, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.4464467373928208, "grad_norm": 0.12673194706439972, "learning_rate": 0.00018859066616265966, "loss": 0.4348051846027374, "memory(GiB)": 78.33, "step": 2304, "token_acc": 0.8738275643379285, "train_speed(iter/s)": 0.03253 }, { "epoch": 0.44664050767814756, "grad_norm": 0.09944983571767807, "learning_rate": 0.00018849776215029907, "loss": 0.3587363064289093, "memory(GiB)": 78.33, "step": 2305, "token_acc": 0.8943836362735872, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.4468342779634743, "grad_norm": 0.09945741295814514, "learning_rate": 0.0001884048423258731, "loss": 0.35953009128570557, "memory(GiB)": 78.33, "step": 2306, "token_acc": 0.8941004794663331, "train_speed(iter/s)": 0.032532 }, { "epoch": 0.44702804824880105, "grad_norm": 0.10500568151473999, "learning_rate": 0.00018831190672754638, "loss": 0.38044261932373047, "memory(GiB)": 78.33, "step": 2307, "token_acc": 0.8867819141508644, "train_speed(iter/s)": 0.032533 }, { "epoch": 0.4472218185341278, "grad_norm": 0.09857631474733353, "learning_rate": 0.00018821895539349008, "loss": 0.36684414744377136, "memory(GiB)": 78.33, "step": 2308, "token_acc": 0.8926224492510708, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.44741558881945453, "grad_norm": 0.10511371493339539, "learning_rate": 0.00018812598836188182, "loss": 0.36754751205444336, "memory(GiB)": 78.33, "step": 2309, "token_acc": 0.8913974514883942, "train_speed(iter/s)": 0.032535 }, { "epoch": 0.4476093591047813, "grad_norm": 0.10627889633178711, "learning_rate": 0.0001880330056709057, "loss": 0.3721667230129242, "memory(GiB)": 78.33, "step": 2310, "token_acc": 0.8893509074742403, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.447803129390108, "grad_norm": 0.09434963017702103, "learning_rate": 0.00018794000735875208, "loss": 0.3414641320705414, "memory(GiB)": 78.33, "step": 2311, "token_acc": 0.9010579609772407, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.44799689967543477, "grad_norm": 0.10588974505662918, "learning_rate": 0.00018784699346361802, "loss": 0.36833858489990234, "memory(GiB)": 78.33, "step": 2312, "token_acc": 0.8927507447864945, "train_speed(iter/s)": 0.032539 }, { "epoch": 0.4481906699607615, "grad_norm": 0.10945326089859009, "learning_rate": 0.00018775396402370673, "loss": 0.3979440927505493, "memory(GiB)": 78.33, "step": 2313, "token_acc": 0.8817913102618431, "train_speed(iter/s)": 0.03254 }, { "epoch": 0.44838444024608826, "grad_norm": 0.10148966312408447, "learning_rate": 0.00018766091907722795, "loss": 0.35080429911613464, "memory(GiB)": 78.33, "step": 2314, "token_acc": 0.8943991907576, "train_speed(iter/s)": 0.032541 }, { "epoch": 0.448578210531415, "grad_norm": 0.10452762246131897, "learning_rate": 0.00018756785866239767, "loss": 0.36064577102661133, "memory(GiB)": 78.33, "step": 2315, "token_acc": 0.8935257756351483, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.44877198081674174, "grad_norm": 0.10478675365447998, "learning_rate": 0.00018747478281743842, "loss": 0.36270397901535034, "memory(GiB)": 78.33, "step": 2316, "token_acc": 0.8914383656318249, "train_speed(iter/s)": 0.032543 }, { "epoch": 0.4489657511020685, "grad_norm": 0.10872726887464523, "learning_rate": 0.0001873816915805788, "loss": 0.3819129467010498, "memory(GiB)": 78.33, "step": 2317, "token_acc": 0.8880903215425461, "train_speed(iter/s)": 0.032544 }, { "epoch": 0.44915952138739523, "grad_norm": 0.09838545322418213, "learning_rate": 0.00018728858499005398, "loss": 0.34659987688064575, "memory(GiB)": 78.33, "step": 2318, "token_acc": 0.89670946219167, "train_speed(iter/s)": 0.032545 }, { "epoch": 0.449353291672722, "grad_norm": 0.11515597254037857, "learning_rate": 0.00018719546308410538, "loss": 0.389648973941803, "memory(GiB)": 78.33, "step": 2319, "token_acc": 0.8872989067638433, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.4495470619580487, "grad_norm": 0.10723573714494705, "learning_rate": 0.00018710232590098057, "loss": 0.3719337582588196, "memory(GiB)": 78.33, "step": 2320, "token_acc": 0.8903108357787197, "train_speed(iter/s)": 0.032548 }, { "epoch": 0.44974083224337547, "grad_norm": 0.10834118723869324, "learning_rate": 0.00018700917347893358, "loss": 0.3716052770614624, "memory(GiB)": 78.33, "step": 2321, "token_acc": 0.889080622347949, "train_speed(iter/s)": 0.032549 }, { "epoch": 0.4499346025287022, "grad_norm": 0.10417478531599045, "learning_rate": 0.0001869160058562245, "loss": 0.35477250814437866, "memory(GiB)": 78.33, "step": 2322, "token_acc": 0.8943120009258188, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.45012837281402895, "grad_norm": 0.10129362344741821, "learning_rate": 0.00018682282307111987, "loss": 0.3582378327846527, "memory(GiB)": 78.33, "step": 2323, "token_acc": 0.8946028116311099, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.4503221430993557, "grad_norm": 0.09624646604061127, "learning_rate": 0.0001867296251618923, "loss": 0.3303111493587494, "memory(GiB)": 78.33, "step": 2324, "token_acc": 0.9006094476429071, "train_speed(iter/s)": 0.032552 }, { "epoch": 0.45051591338468244, "grad_norm": 0.10416833311319351, "learning_rate": 0.00018663641216682075, "loss": 0.3703954517841339, "memory(GiB)": 78.33, "step": 2325, "token_acc": 0.8913434299411637, "train_speed(iter/s)": 0.032553 }, { "epoch": 0.4507096836700092, "grad_norm": 0.09914438426494598, "learning_rate": 0.0001865431841241903, "loss": 0.3545830547809601, "memory(GiB)": 78.33, "step": 2326, "token_acc": 0.8953699331461094, "train_speed(iter/s)": 0.032554 }, { "epoch": 0.45090345395533593, "grad_norm": 0.10621084272861481, "learning_rate": 0.00018644994107229216, "loss": 0.37236636877059937, "memory(GiB)": 78.33, "step": 2327, "token_acc": 0.8887367838522704, "train_speed(iter/s)": 0.032555 }, { "epoch": 0.4510972242406627, "grad_norm": 0.09897017478942871, "learning_rate": 0.0001863566830494237, "loss": 0.33905407786369324, "memory(GiB)": 78.33, "step": 2328, "token_acc": 0.8993225346373268, "train_speed(iter/s)": 0.032557 }, { "epoch": 0.4512909945259894, "grad_norm": 0.09863407909870148, "learning_rate": 0.00018626341009388866, "loss": 0.3825795352458954, "memory(GiB)": 78.33, "step": 2329, "token_acc": 0.8874984754238322, "train_speed(iter/s)": 0.032558 }, { "epoch": 0.45148476481131616, "grad_norm": 0.09864397346973419, "learning_rate": 0.00018617012224399662, "loss": 0.3443533778190613, "memory(GiB)": 78.33, "step": 2330, "token_acc": 0.9003264309386758, "train_speed(iter/s)": 0.032559 }, { "epoch": 0.4516785350966429, "grad_norm": 0.09809895604848862, "learning_rate": 0.00018607681953806341, "loss": 0.34165775775909424, "memory(GiB)": 78.33, "step": 2331, "token_acc": 0.8970808010361662, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.45187230538196965, "grad_norm": 0.10424879193305969, "learning_rate": 0.00018598350201441108, "loss": 0.33717483282089233, "memory(GiB)": 78.33, "step": 2332, "token_acc": 0.8975676890237052, "train_speed(iter/s)": 0.032561 }, { "epoch": 0.4520660756672964, "grad_norm": 0.09718424081802368, "learning_rate": 0.00018589016971136752, "loss": 0.32958295941352844, "memory(GiB)": 78.33, "step": 2333, "token_acc": 0.9014591294853342, "train_speed(iter/s)": 0.032562 }, { "epoch": 0.45225984595262314, "grad_norm": 0.1105201467871666, "learning_rate": 0.00018579682266726686, "loss": 0.38120901584625244, "memory(GiB)": 78.33, "step": 2334, "token_acc": 0.8889513793200416, "train_speed(iter/s)": 0.032563 }, { "epoch": 0.4524536162379499, "grad_norm": 0.1020718514919281, "learning_rate": 0.00018570346092044917, "loss": 0.3501797616481781, "memory(GiB)": 78.33, "step": 2335, "token_acc": 0.8964525407478428, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.45264738652327663, "grad_norm": 0.10678357630968094, "learning_rate": 0.00018561008450926076, "loss": 0.3577033579349518, "memory(GiB)": 78.33, "step": 2336, "token_acc": 0.8953428424127857, "train_speed(iter/s)": 0.032565 }, { "epoch": 0.4528411568086034, "grad_norm": 0.09743531793355942, "learning_rate": 0.0001855166934720537, "loss": 0.33748358488082886, "memory(GiB)": 78.33, "step": 2337, "token_acc": 0.8984052721601771, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.4530349270939302, "grad_norm": 0.10229934751987457, "learning_rate": 0.00018542328784718632, "loss": 0.3752382695674896, "memory(GiB)": 78.33, "step": 2338, "token_acc": 0.891382828441375, "train_speed(iter/s)": 0.032567 }, { "epoch": 0.4532286973792569, "grad_norm": 0.11242741346359253, "learning_rate": 0.00018532986767302276, "loss": 0.4054182767868042, "memory(GiB)": 78.33, "step": 2339, "token_acc": 0.8786372906672795, "train_speed(iter/s)": 0.032568 }, { "epoch": 0.45342246766458366, "grad_norm": 0.09873122721910477, "learning_rate": 0.0001852364329879332, "loss": 0.3532402813434601, "memory(GiB)": 78.33, "step": 2340, "token_acc": 0.8958922821738706, "train_speed(iter/s)": 0.032569 }, { "epoch": 0.4536162379499104, "grad_norm": 0.10772307217121124, "learning_rate": 0.00018514298383029372, "loss": 0.37972134351730347, "memory(GiB)": 78.33, "step": 2341, "token_acc": 0.8879170528266914, "train_speed(iter/s)": 0.03257 }, { "epoch": 0.45381000823523715, "grad_norm": 0.11007421463727951, "learning_rate": 0.00018504952023848647, "loss": 0.37062400579452515, "memory(GiB)": 78.33, "step": 2342, "token_acc": 0.8900929211930756, "train_speed(iter/s)": 0.032571 }, { "epoch": 0.4540037785205639, "grad_norm": 0.09810297191143036, "learning_rate": 0.00018495604225089946, "loss": 0.33862578868865967, "memory(GiB)": 78.33, "step": 2343, "token_acc": 0.9004052456801936, "train_speed(iter/s)": 0.032572 }, { "epoch": 0.45419754880589064, "grad_norm": 0.09834323078393936, "learning_rate": 0.00018486254990592656, "loss": 0.34658533334732056, "memory(GiB)": 78.33, "step": 2344, "token_acc": 0.8964155452144384, "train_speed(iter/s)": 0.032573 }, { "epoch": 0.4543913190912174, "grad_norm": 0.11022671312093735, "learning_rate": 0.00018476904324196764, "loss": 0.4045500159263611, "memory(GiB)": 78.33, "step": 2345, "token_acc": 0.8820878509132359, "train_speed(iter/s)": 0.032574 }, { "epoch": 0.4545850893765441, "grad_norm": 0.10049665719270706, "learning_rate": 0.0001846755222974284, "loss": 0.3371380865573883, "memory(GiB)": 78.33, "step": 2346, "token_acc": 0.9003629250212349, "train_speed(iter/s)": 0.032576 }, { "epoch": 0.45477885966187087, "grad_norm": 0.09852764755487442, "learning_rate": 0.0001845819871107204, "loss": 0.36543235182762146, "memory(GiB)": 78.33, "step": 2347, "token_acc": 0.8905981201226657, "train_speed(iter/s)": 0.032577 }, { "epoch": 0.4549726299471976, "grad_norm": 0.11310271173715591, "learning_rate": 0.00018448843772026098, "loss": 0.3698401153087616, "memory(GiB)": 78.33, "step": 2348, "token_acc": 0.8884804666140106, "train_speed(iter/s)": 0.032578 }, { "epoch": 0.45516640023252436, "grad_norm": 0.1051260381937027, "learning_rate": 0.0001843948741644735, "loss": 0.3575577437877655, "memory(GiB)": 78.33, "step": 2349, "token_acc": 0.8939034259509037, "train_speed(iter/s)": 0.032579 }, { "epoch": 0.4553601705178511, "grad_norm": 0.10609371960163116, "learning_rate": 0.00018430129648178693, "loss": 0.3754083216190338, "memory(GiB)": 78.33, "step": 2350, "token_acc": 0.8901177784276713, "train_speed(iter/s)": 0.03258 }, { "epoch": 0.45555394080317785, "grad_norm": 0.10359574854373932, "learning_rate": 0.0001842077047106362, "loss": 0.37402597069740295, "memory(GiB)": 78.33, "step": 2351, "token_acc": 0.8886889138857643, "train_speed(iter/s)": 0.032581 }, { "epoch": 0.4557477110885046, "grad_norm": 0.10701092332601547, "learning_rate": 0.00018411409888946197, "loss": 0.3984612822532654, "memory(GiB)": 78.33, "step": 2352, "token_acc": 0.8840575810574465, "train_speed(iter/s)": 0.032582 }, { "epoch": 0.45594148137383134, "grad_norm": 0.09848210960626602, "learning_rate": 0.00018402047905671063, "loss": 0.3315700590610504, "memory(GiB)": 78.33, "step": 2353, "token_acc": 0.9021433150310767, "train_speed(iter/s)": 0.032583 }, { "epoch": 0.4561352516591581, "grad_norm": 0.10119795799255371, "learning_rate": 0.0001839268452508344, "loss": 0.3340100347995758, "memory(GiB)": 78.33, "step": 2354, "token_acc": 0.9002782696686061, "train_speed(iter/s)": 0.032584 }, { "epoch": 0.4563290219444848, "grad_norm": 0.10151761025190353, "learning_rate": 0.00018383319751029114, "loss": 0.3325027823448181, "memory(GiB)": 78.33, "step": 2355, "token_acc": 0.902809093079392, "train_speed(iter/s)": 0.032585 }, { "epoch": 0.45652279222981157, "grad_norm": 0.11158560961484909, "learning_rate": 0.00018373953587354452, "loss": 0.38792747259140015, "memory(GiB)": 78.33, "step": 2356, "token_acc": 0.8858068315665489, "train_speed(iter/s)": 0.032586 }, { "epoch": 0.4567165625151383, "grad_norm": 0.0991615429520607, "learning_rate": 0.00018364586037906391, "loss": 0.35615038871765137, "memory(GiB)": 78.33, "step": 2357, "token_acc": 0.8954778266800505, "train_speed(iter/s)": 0.032587 }, { "epoch": 0.45691033280046506, "grad_norm": 0.10101715475320816, "learning_rate": 0.00018355217106532436, "loss": 0.36160457134246826, "memory(GiB)": 78.33, "step": 2358, "token_acc": 0.8935985113941181, "train_speed(iter/s)": 0.032588 }, { "epoch": 0.4571041030857918, "grad_norm": 0.10320735722780228, "learning_rate": 0.0001834584679708066, "loss": 0.3799397945404053, "memory(GiB)": 78.33, "step": 2359, "token_acc": 0.8880186336993524, "train_speed(iter/s)": 0.032589 }, { "epoch": 0.45729787337111855, "grad_norm": 0.1062033548951149, "learning_rate": 0.00018336475113399692, "loss": 0.3934144079685211, "memory(GiB)": 78.33, "step": 2360, "token_acc": 0.8855768736176123, "train_speed(iter/s)": 0.03259 }, { "epoch": 0.4574916436564453, "grad_norm": 0.10630171746015549, "learning_rate": 0.00018327102059338744, "loss": 0.3696288466453552, "memory(GiB)": 78.33, "step": 2361, "token_acc": 0.8901493818220828, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.45768541394177203, "grad_norm": 0.10573367774486542, "learning_rate": 0.00018317727638747576, "loss": 0.3717585504055023, "memory(GiB)": 78.33, "step": 2362, "token_acc": 0.8903601270722656, "train_speed(iter/s)": 0.032593 }, { "epoch": 0.4578791842270988, "grad_norm": 0.10207115113735199, "learning_rate": 0.0001830835185547652, "loss": 0.3541256785392761, "memory(GiB)": 78.33, "step": 2363, "token_acc": 0.896280064694527, "train_speed(iter/s)": 0.032594 }, { "epoch": 0.4580729545124255, "grad_norm": 0.10319995135068893, "learning_rate": 0.0001829897471337645, "loss": 0.3481341004371643, "memory(GiB)": 78.33, "step": 2364, "token_acc": 0.8952120676258607, "train_speed(iter/s)": 0.032595 }, { "epoch": 0.45826672479775227, "grad_norm": 0.10591937601566315, "learning_rate": 0.00018289596216298823, "loss": 0.36794811487197876, "memory(GiB)": 78.33, "step": 2365, "token_acc": 0.8902500987974934, "train_speed(iter/s)": 0.032596 }, { "epoch": 0.458460495083079, "grad_norm": 0.1113501638174057, "learning_rate": 0.00018280216368095638, "loss": 0.37933623790740967, "memory(GiB)": 78.33, "step": 2366, "token_acc": 0.8892398270522216, "train_speed(iter/s)": 0.032597 }, { "epoch": 0.45865426536840576, "grad_norm": 0.10290928184986115, "learning_rate": 0.00018270835172619443, "loss": 0.3481866717338562, "memory(GiB)": 78.33, "step": 2367, "token_acc": 0.8950770760815515, "train_speed(iter/s)": 0.032598 }, { "epoch": 0.4588480356537325, "grad_norm": 0.11763869225978851, "learning_rate": 0.00018261452633723356, "loss": 0.41069701313972473, "memory(GiB)": 78.33, "step": 2368, "token_acc": 0.8784230338208672, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.45904180593905924, "grad_norm": 0.1066950112581253, "learning_rate": 0.00018252068755261029, "loss": 0.3848106861114502, "memory(GiB)": 78.33, "step": 2369, "token_acc": 0.8873462694725335, "train_speed(iter/s)": 0.0326 }, { "epoch": 0.459235576224386, "grad_norm": 0.10410353541374207, "learning_rate": 0.00018242683541086678, "loss": 0.36915817856788635, "memory(GiB)": 78.33, "step": 2370, "token_acc": 0.8926594658498576, "train_speed(iter/s)": 0.032601 }, { "epoch": 0.45942934650971273, "grad_norm": 0.10082308948040009, "learning_rate": 0.00018233296995055065, "loss": 0.370003342628479, "memory(GiB)": 78.33, "step": 2371, "token_acc": 0.8912730226099714, "train_speed(iter/s)": 0.032602 }, { "epoch": 0.4596231167950395, "grad_norm": 0.10870834439992905, "learning_rate": 0.00018223909121021495, "loss": 0.3969360589981079, "memory(GiB)": 78.33, "step": 2372, "token_acc": 0.8850561134081512, "train_speed(iter/s)": 0.032603 }, { "epoch": 0.4598168870803662, "grad_norm": 0.09935883432626724, "learning_rate": 0.00018214519922841817, "loss": 0.3541335165500641, "memory(GiB)": 78.33, "step": 2373, "token_acc": 0.8943371776597123, "train_speed(iter/s)": 0.032604 }, { "epoch": 0.46001065736569297, "grad_norm": 0.12180610001087189, "learning_rate": 0.00018205129404372431, "loss": 0.4192396402359009, "memory(GiB)": 78.33, "step": 2374, "token_acc": 0.8759435110786462, "train_speed(iter/s)": 0.032605 }, { "epoch": 0.4602044276510197, "grad_norm": 0.11153632402420044, "learning_rate": 0.00018195737569470273, "loss": 0.37499576807022095, "memory(GiB)": 78.33, "step": 2375, "token_acc": 0.889001271529303, "train_speed(iter/s)": 0.032607 }, { "epoch": 0.46039819793634645, "grad_norm": 0.10651109367609024, "learning_rate": 0.0001818634442199282, "loss": 0.3690948486328125, "memory(GiB)": 78.33, "step": 2376, "token_acc": 0.8935253398571071, "train_speed(iter/s)": 0.032608 }, { "epoch": 0.4605919682216732, "grad_norm": 0.09841850399971008, "learning_rate": 0.00018176949965798093, "loss": 0.3518884778022766, "memory(GiB)": 78.33, "step": 2377, "token_acc": 0.8960413453472955, "train_speed(iter/s)": 0.032609 }, { "epoch": 0.46078573850699994, "grad_norm": 0.1050775870680809, "learning_rate": 0.0001816755420474465, "loss": 0.3738871216773987, "memory(GiB)": 78.33, "step": 2378, "token_acc": 0.8914807588995391, "train_speed(iter/s)": 0.03261 }, { "epoch": 0.4609795087923267, "grad_norm": 0.10667918622493744, "learning_rate": 0.0001815815714269158, "loss": 0.38406267762184143, "memory(GiB)": 78.33, "step": 2379, "token_acc": 0.8884490352348994, "train_speed(iter/s)": 0.032611 }, { "epoch": 0.46117327907765343, "grad_norm": 0.11324939876794815, "learning_rate": 0.00018148758783498504, "loss": 0.38455072045326233, "memory(GiB)": 78.33, "step": 2380, "token_acc": 0.8882405081991431, "train_speed(iter/s)": 0.032612 }, { "epoch": 0.4613670493629802, "grad_norm": 0.10635250061750412, "learning_rate": 0.00018139359131025588, "loss": 0.3805847764015198, "memory(GiB)": 78.33, "step": 2381, "token_acc": 0.8862944162436548, "train_speed(iter/s)": 0.032613 }, { "epoch": 0.4615608196483069, "grad_norm": 0.10868912190198898, "learning_rate": 0.00018129958189133522, "loss": 0.3691996932029724, "memory(GiB)": 78.33, "step": 2382, "token_acc": 0.8913490258405722, "train_speed(iter/s)": 0.032614 }, { "epoch": 0.46175458993363366, "grad_norm": 0.09865929931402206, "learning_rate": 0.00018120555961683514, "loss": 0.36495789885520935, "memory(GiB)": 78.33, "step": 2383, "token_acc": 0.8911807418711695, "train_speed(iter/s)": 0.032615 }, { "epoch": 0.4619483602189604, "grad_norm": 0.1034301221370697, "learning_rate": 0.00018111152452537327, "loss": 0.3717435598373413, "memory(GiB)": 78.33, "step": 2384, "token_acc": 0.8909826100745283, "train_speed(iter/s)": 0.032616 }, { "epoch": 0.46214213050428715, "grad_norm": 0.10854472219944, "learning_rate": 0.00018101747665557225, "loss": 0.3835892081260681, "memory(GiB)": 78.33, "step": 2385, "token_acc": 0.8888919333625602, "train_speed(iter/s)": 0.032617 }, { "epoch": 0.4623359007896139, "grad_norm": 0.11397820711135864, "learning_rate": 0.00018092341604606014, "loss": 0.39861786365509033, "memory(GiB)": 78.33, "step": 2386, "token_acc": 0.8822021941317055, "train_speed(iter/s)": 0.032618 }, { "epoch": 0.46252967107494064, "grad_norm": 0.1112833321094513, "learning_rate": 0.00018082934273547008, "loss": 0.40602025389671326, "memory(GiB)": 78.33, "step": 2387, "token_acc": 0.8811936155447606, "train_speed(iter/s)": 0.032619 }, { "epoch": 0.4627234413602674, "grad_norm": 0.11048243939876556, "learning_rate": 0.00018073525676244053, "loss": 0.3989701271057129, "memory(GiB)": 78.33, "step": 2388, "token_acc": 0.8846206164812145, "train_speed(iter/s)": 0.03262 }, { "epoch": 0.46291721164559413, "grad_norm": 0.12440559267997742, "learning_rate": 0.00018064115816561515, "loss": 0.397320419549942, "memory(GiB)": 78.33, "step": 2389, "token_acc": 0.8848505094371137, "train_speed(iter/s)": 0.032621 }, { "epoch": 0.4631109819309209, "grad_norm": 0.11104737967252731, "learning_rate": 0.00018054704698364273, "loss": 0.39772453904151917, "memory(GiB)": 78.33, "step": 2390, "token_acc": 0.8835264012326995, "train_speed(iter/s)": 0.032622 }, { "epoch": 0.4633047522162476, "grad_norm": 0.1087903156876564, "learning_rate": 0.00018045292325517736, "loss": 0.39890480041503906, "memory(GiB)": 78.33, "step": 2391, "token_acc": 0.8840932546508243, "train_speed(iter/s)": 0.032623 }, { "epoch": 0.46349852250157436, "grad_norm": 0.10867461562156677, "learning_rate": 0.00018035878701887803, "loss": 0.3494797348976135, "memory(GiB)": 78.33, "step": 2392, "token_acc": 0.897205366586873, "train_speed(iter/s)": 0.032624 }, { "epoch": 0.4636922927869011, "grad_norm": 0.11319278180599213, "learning_rate": 0.00018026463831340915, "loss": 0.36147987842559814, "memory(GiB)": 78.33, "step": 2393, "token_acc": 0.8932517509038962, "train_speed(iter/s)": 0.032625 }, { "epoch": 0.46388606307222785, "grad_norm": 0.1191987693309784, "learning_rate": 0.00018017047717744006, "loss": 0.4088186025619507, "memory(GiB)": 78.33, "step": 2394, "token_acc": 0.88241711618886, "train_speed(iter/s)": 0.032626 }, { "epoch": 0.4640798333575546, "grad_norm": 0.10697056353092194, "learning_rate": 0.00018007630364964524, "loss": 0.3530442714691162, "memory(GiB)": 78.33, "step": 2395, "token_acc": 0.8948714966856611, "train_speed(iter/s)": 0.032628 }, { "epoch": 0.46427360364288134, "grad_norm": 0.1151200458407402, "learning_rate": 0.00017998211776870435, "loss": 0.4034122824668884, "memory(GiB)": 78.33, "step": 2396, "token_acc": 0.8807363035709694, "train_speed(iter/s)": 0.032629 }, { "epoch": 0.4644673739282081, "grad_norm": 0.11261675506830215, "learning_rate": 0.00017988791957330205, "loss": 0.3917164206504822, "memory(GiB)": 78.33, "step": 2397, "token_acc": 0.8845557080623023, "train_speed(iter/s)": 0.03263 }, { "epoch": 0.4646611442135348, "grad_norm": 0.10412374138832092, "learning_rate": 0.00017979370910212807, "loss": 0.34287336468696594, "memory(GiB)": 78.33, "step": 2398, "token_acc": 0.8996082907468417, "train_speed(iter/s)": 0.032631 }, { "epoch": 0.4648549144988616, "grad_norm": 0.1128680557012558, "learning_rate": 0.00017969948639387715, "loss": 0.35632070899009705, "memory(GiB)": 78.33, "step": 2399, "token_acc": 0.8946635868277659, "train_speed(iter/s)": 0.032632 }, { "epoch": 0.46504868478418837, "grad_norm": 0.10247381776571274, "learning_rate": 0.00017960525148724916, "loss": 0.3603074848651886, "memory(GiB)": 78.33, "step": 2400, "token_acc": 0.891370611730082, "train_speed(iter/s)": 0.032633 }, { "epoch": 0.4652424550695151, "grad_norm": 0.10234736651182175, "learning_rate": 0.00017951100442094878, "loss": 0.3643084764480591, "memory(GiB)": 78.33, "step": 2401, "token_acc": 0.892455605758117, "train_speed(iter/s)": 0.032628 }, { "epoch": 0.46543622535484186, "grad_norm": 0.11007906496524811, "learning_rate": 0.00017941674523368594, "loss": 0.36536821722984314, "memory(GiB)": 78.33, "step": 2402, "token_acc": 0.8915501381254806, "train_speed(iter/s)": 0.032629 }, { "epoch": 0.4656299956401686, "grad_norm": 0.0991489514708519, "learning_rate": 0.00017932247396417538, "loss": 0.32952260971069336, "memory(GiB)": 78.33, "step": 2403, "token_acc": 0.9006100723689086, "train_speed(iter/s)": 0.032631 }, { "epoch": 0.46582376592549535, "grad_norm": 0.09857763350009918, "learning_rate": 0.00017922819065113683, "loss": 0.36439892649650574, "memory(GiB)": 78.33, "step": 2404, "token_acc": 0.8917956246540396, "train_speed(iter/s)": 0.032631 }, { "epoch": 0.4660175362108221, "grad_norm": 0.11544130742549896, "learning_rate": 0.000179133895333295, "loss": 0.39752769470214844, "memory(GiB)": 78.33, "step": 2405, "token_acc": 0.8802836879432624, "train_speed(iter/s)": 0.032633 }, { "epoch": 0.46621130649614884, "grad_norm": 0.11335242539644241, "learning_rate": 0.0001790395880493795, "loss": 0.3888709843158722, "memory(GiB)": 78.33, "step": 2406, "token_acc": 0.8840124504810413, "train_speed(iter/s)": 0.032634 }, { "epoch": 0.4664050767814756, "grad_norm": 0.1116030365228653, "learning_rate": 0.00017894526883812485, "loss": 0.3878939747810364, "memory(GiB)": 78.33, "step": 2407, "token_acc": 0.885872988604478, "train_speed(iter/s)": 0.032635 }, { "epoch": 0.4665988470668023, "grad_norm": 0.10574699193239212, "learning_rate": 0.00017885093773827048, "loss": 0.36204928159713745, "memory(GiB)": 78.33, "step": 2408, "token_acc": 0.8934056241568953, "train_speed(iter/s)": 0.032636 }, { "epoch": 0.46679261735212907, "grad_norm": 0.10200771689414978, "learning_rate": 0.00017875659478856076, "loss": 0.3507916033267975, "memory(GiB)": 78.33, "step": 2409, "token_acc": 0.8951724847051565, "train_speed(iter/s)": 0.032636 }, { "epoch": 0.4669863876374558, "grad_norm": 0.10833270847797394, "learning_rate": 0.00017866224002774478, "loss": 0.38488560914993286, "memory(GiB)": 78.33, "step": 2410, "token_acc": 0.8863565847742081, "train_speed(iter/s)": 0.032637 }, { "epoch": 0.46718015792278256, "grad_norm": 0.1008668914437294, "learning_rate": 0.00017856787349457672, "loss": 0.37106162309646606, "memory(GiB)": 78.33, "step": 2411, "token_acc": 0.8896542726679713, "train_speed(iter/s)": 0.032638 }, { "epoch": 0.4673739282081093, "grad_norm": 0.11065692454576492, "learning_rate": 0.0001784734952278153, "loss": 0.3777768313884735, "memory(GiB)": 78.33, "step": 2412, "token_acc": 0.8877977553825012, "train_speed(iter/s)": 0.03264 }, { "epoch": 0.46756769849343605, "grad_norm": 0.10646090656518936, "learning_rate": 0.00017837910526622436, "loss": 0.3846604824066162, "memory(GiB)": 78.33, "step": 2413, "token_acc": 0.8871676430824695, "train_speed(iter/s)": 0.032641 }, { "epoch": 0.4677614687787628, "grad_norm": 0.09328921884298325, "learning_rate": 0.00017828470364857226, "loss": 0.32804617285728455, "memory(GiB)": 78.33, "step": 2414, "token_acc": 0.9045698603985205, "train_speed(iter/s)": 0.032641 }, { "epoch": 0.46795523906408953, "grad_norm": 0.09627443552017212, "learning_rate": 0.00017819029041363232, "loss": 0.33626145124435425, "memory(GiB)": 78.33, "step": 2415, "token_acc": 0.8980459016393443, "train_speed(iter/s)": 0.032642 }, { "epoch": 0.4681490093494163, "grad_norm": 0.10673625767230988, "learning_rate": 0.00017809586560018262, "loss": 0.38183170557022095, "memory(GiB)": 78.33, "step": 2416, "token_acc": 0.8858918947761593, "train_speed(iter/s)": 0.032643 }, { "epoch": 0.468342779634743, "grad_norm": 0.1077309101819992, "learning_rate": 0.00017800142924700592, "loss": 0.3792920410633087, "memory(GiB)": 78.33, "step": 2417, "token_acc": 0.8886526780784098, "train_speed(iter/s)": 0.032644 }, { "epoch": 0.46853654992006977, "grad_norm": 0.11391132324934006, "learning_rate": 0.00017790698139288983, "loss": 0.3999425768852234, "memory(GiB)": 78.33, "step": 2418, "token_acc": 0.8836907644413697, "train_speed(iter/s)": 0.032645 }, { "epoch": 0.4687303202053965, "grad_norm": 0.10699284076690674, "learning_rate": 0.0001778125220766266, "loss": 0.3917970061302185, "memory(GiB)": 78.33, "step": 2419, "token_acc": 0.8875225537820958, "train_speed(iter/s)": 0.032646 }, { "epoch": 0.46892409049072326, "grad_norm": 0.10588902235031128, "learning_rate": 0.00017771805133701322, "loss": 0.3656570315361023, "memory(GiB)": 78.33, "step": 2420, "token_acc": 0.8920440225153323, "train_speed(iter/s)": 0.032648 }, { "epoch": 0.46911786077605, "grad_norm": 0.09696738421916962, "learning_rate": 0.00017762356921285127, "loss": 0.35896116495132446, "memory(GiB)": 78.33, "step": 2421, "token_acc": 0.8952547723573225, "train_speed(iter/s)": 0.032648 }, { "epoch": 0.46931163106137674, "grad_norm": 0.09918226301670074, "learning_rate": 0.00017752907574294726, "loss": 0.36196303367614746, "memory(GiB)": 78.33, "step": 2422, "token_acc": 0.8924877517691889, "train_speed(iter/s)": 0.032649 }, { "epoch": 0.4695054013467035, "grad_norm": 0.09119024872779846, "learning_rate": 0.000177434570966112, "loss": 0.34092551469802856, "memory(GiB)": 78.33, "step": 2423, "token_acc": 0.898368601754819, "train_speed(iter/s)": 0.03265 }, { "epoch": 0.46969917163203023, "grad_norm": 0.10667548328638077, "learning_rate": 0.00017734005492116135, "loss": 0.4041289687156677, "memory(GiB)": 78.33, "step": 2424, "token_acc": 0.8804930885083557, "train_speed(iter/s)": 0.032651 }, { "epoch": 0.469892941917357, "grad_norm": 0.10172632336616516, "learning_rate": 0.00017724552764691545, "loss": 0.3615379333496094, "memory(GiB)": 78.33, "step": 2425, "token_acc": 0.8944812914528845, "train_speed(iter/s)": 0.032652 }, { "epoch": 0.4700867122026837, "grad_norm": 0.10548295080661774, "learning_rate": 0.00017715098918219926, "loss": 0.36227190494537354, "memory(GiB)": 78.33, "step": 2426, "token_acc": 0.8919847828938738, "train_speed(iter/s)": 0.032653 }, { "epoch": 0.47028048248801047, "grad_norm": 0.10230764001607895, "learning_rate": 0.0001770564395658422, "loss": 0.3501635789871216, "memory(GiB)": 78.33, "step": 2427, "token_acc": 0.8956358685880147, "train_speed(iter/s)": 0.032654 }, { "epoch": 0.4704742527733372, "grad_norm": 0.10815402865409851, "learning_rate": 0.00017696187883667837, "loss": 0.3826008439064026, "memory(GiB)": 78.33, "step": 2428, "token_acc": 0.8863275727763192, "train_speed(iter/s)": 0.032655 }, { "epoch": 0.47066802305866395, "grad_norm": 0.12040967494249344, "learning_rate": 0.00017686730703354641, "loss": 0.43001601099967957, "memory(GiB)": 78.33, "step": 2429, "token_acc": 0.8732980332829047, "train_speed(iter/s)": 0.032656 }, { "epoch": 0.4708617933439907, "grad_norm": 0.10938671231269836, "learning_rate": 0.00017677272419528952, "loss": 0.39348623156547546, "memory(GiB)": 78.33, "step": 2430, "token_acc": 0.8847729835272117, "train_speed(iter/s)": 0.032657 }, { "epoch": 0.47105556362931744, "grad_norm": 0.09924609214067459, "learning_rate": 0.00017667813036075538, "loss": 0.35848769545555115, "memory(GiB)": 78.33, "step": 2431, "token_acc": 0.8955514503969244, "train_speed(iter/s)": 0.032658 }, { "epoch": 0.4712493339146442, "grad_norm": 0.09913709759712219, "learning_rate": 0.00017658352556879623, "loss": 0.3611469864845276, "memory(GiB)": 78.33, "step": 2432, "token_acc": 0.8925030260887481, "train_speed(iter/s)": 0.032659 }, { "epoch": 0.47144310419997093, "grad_norm": 0.10867290198802948, "learning_rate": 0.00017648890985826881, "loss": 0.34665271639823914, "memory(GiB)": 78.33, "step": 2433, "token_acc": 0.8978396543446951, "train_speed(iter/s)": 0.03266 }, { "epoch": 0.4716368744852977, "grad_norm": 0.10434415936470032, "learning_rate": 0.00017639428326803432, "loss": 0.3893589973449707, "memory(GiB)": 78.33, "step": 2434, "token_acc": 0.8848601637972565, "train_speed(iter/s)": 0.032661 }, { "epoch": 0.4718306447706244, "grad_norm": 0.10091494768857956, "learning_rate": 0.00017629964583695847, "loss": 0.3660696744918823, "memory(GiB)": 78.33, "step": 2435, "token_acc": 0.893289756957954, "train_speed(iter/s)": 0.032662 }, { "epoch": 0.47202441505595116, "grad_norm": 0.11700880527496338, "learning_rate": 0.00017620499760391133, "loss": 0.3919360041618347, "memory(GiB)": 78.33, "step": 2436, "token_acc": 0.8846436236304077, "train_speed(iter/s)": 0.032663 }, { "epoch": 0.4722181853412779, "grad_norm": 0.11263095587491989, "learning_rate": 0.00017611033860776752, "loss": 0.36513423919677734, "memory(GiB)": 78.33, "step": 2437, "token_acc": 0.8920323843097422, "train_speed(iter/s)": 0.032664 }, { "epoch": 0.47241195562660465, "grad_norm": 0.09740516543388367, "learning_rate": 0.0001760156688874061, "loss": 0.3375285565853119, "memory(GiB)": 78.33, "step": 2438, "token_acc": 0.899163848916149, "train_speed(iter/s)": 0.032665 }, { "epoch": 0.4726057259119314, "grad_norm": 0.10824833065271378, "learning_rate": 0.00017592098848171037, "loss": 0.36911553144454956, "memory(GiB)": 78.33, "step": 2439, "token_acc": 0.8915403549682601, "train_speed(iter/s)": 0.032666 }, { "epoch": 0.47279949619725814, "grad_norm": 0.1003868579864502, "learning_rate": 0.00017582629742956816, "loss": 0.3386095464229584, "memory(GiB)": 78.33, "step": 2440, "token_acc": 0.8985514852524404, "train_speed(iter/s)": 0.032667 }, { "epoch": 0.4729932664825849, "grad_norm": 0.13465555012226105, "learning_rate": 0.00017573159576987155, "loss": 0.4634130001068115, "memory(GiB)": 78.33, "step": 2441, "token_acc": 0.8691293051762041, "train_speed(iter/s)": 0.032668 }, { "epoch": 0.47318703676791163, "grad_norm": 0.11581727117300034, "learning_rate": 0.0001756368835415172, "loss": 0.3709140717983246, "memory(GiB)": 78.33, "step": 2442, "token_acc": 0.8906245185938318, "train_speed(iter/s)": 0.032669 }, { "epoch": 0.4733808070532384, "grad_norm": 0.09655480831861496, "learning_rate": 0.00017554216078340582, "loss": 0.3356662094593048, "memory(GiB)": 78.33, "step": 2443, "token_acc": 0.901089996601929, "train_speed(iter/s)": 0.03267 }, { "epoch": 0.4735745773385651, "grad_norm": 0.10045115649700165, "learning_rate": 0.00017544742753444268, "loss": 0.33761128783226013, "memory(GiB)": 78.33, "step": 2444, "token_acc": 0.8995756718528995, "train_speed(iter/s)": 0.032671 }, { "epoch": 0.47376834762389186, "grad_norm": 0.10537243634462357, "learning_rate": 0.0001753526838335373, "loss": 0.3550539016723633, "memory(GiB)": 78.33, "step": 2445, "token_acc": 0.8979015012132363, "train_speed(iter/s)": 0.032672 }, { "epoch": 0.4739621179092186, "grad_norm": 0.09614920616149902, "learning_rate": 0.0001752579297196034, "loss": 0.3335365056991577, "memory(GiB)": 78.33, "step": 2446, "token_acc": 0.9002796136248093, "train_speed(iter/s)": 0.032673 }, { "epoch": 0.47415588819454535, "grad_norm": 0.10471241921186447, "learning_rate": 0.00017516316523155903, "loss": 0.3631875216960907, "memory(GiB)": 78.33, "step": 2447, "token_acc": 0.8914798436857261, "train_speed(iter/s)": 0.032674 }, { "epoch": 0.4743496584798721, "grad_norm": 0.10309155285358429, "learning_rate": 0.00017506839040832653, "loss": 0.36359232664108276, "memory(GiB)": 78.33, "step": 2448, "token_acc": 0.8915206063477025, "train_speed(iter/s)": 0.032675 }, { "epoch": 0.47454342876519884, "grad_norm": 0.09676847606897354, "learning_rate": 0.00017497360528883252, "loss": 0.33657437562942505, "memory(GiB)": 78.33, "step": 2449, "token_acc": 0.8991928296794235, "train_speed(iter/s)": 0.032676 }, { "epoch": 0.4747371990505256, "grad_norm": 0.1178392842411995, "learning_rate": 0.0001748788099120077, "loss": 0.3946092426776886, "memory(GiB)": 78.33, "step": 2450, "token_acc": 0.8864816204051013, "train_speed(iter/s)": 0.032677 }, { "epoch": 0.4749309693358523, "grad_norm": 0.10294267535209656, "learning_rate": 0.00017478400431678715, "loss": 0.35818547010421753, "memory(GiB)": 78.33, "step": 2451, "token_acc": 0.8954294409377818, "train_speed(iter/s)": 0.032678 }, { "epoch": 0.47512473962117907, "grad_norm": 0.09975744038820267, "learning_rate": 0.00017468918854211007, "loss": 0.3438222408294678, "memory(GiB)": 78.33, "step": 2452, "token_acc": 0.899183906851024, "train_speed(iter/s)": 0.032679 }, { "epoch": 0.4753185099065058, "grad_norm": 0.12452986091375351, "learning_rate": 0.00017459436262691987, "loss": 0.41207653284072876, "memory(GiB)": 78.33, "step": 2453, "token_acc": 0.8813101862650361, "train_speed(iter/s)": 0.03268 }, { "epoch": 0.47551228019183256, "grad_norm": 0.09743805229663849, "learning_rate": 0.00017449952661016395, "loss": 0.3469730019569397, "memory(GiB)": 78.33, "step": 2454, "token_acc": 0.8969273247713859, "train_speed(iter/s)": 0.032681 }, { "epoch": 0.4757060504771593, "grad_norm": 0.10268343985080719, "learning_rate": 0.0001744046805307942, "loss": 0.33460330963134766, "memory(GiB)": 78.33, "step": 2455, "token_acc": 0.900157210232957, "train_speed(iter/s)": 0.032682 }, { "epoch": 0.47589982076248605, "grad_norm": 0.1134437769651413, "learning_rate": 0.00017430982442776636, "loss": 0.3615866005420685, "memory(GiB)": 78.33, "step": 2456, "token_acc": 0.8943463421872682, "train_speed(iter/s)": 0.032683 }, { "epoch": 0.4760935910478128, "grad_norm": 0.10063590854406357, "learning_rate": 0.0001742149583400404, "loss": 0.35712823271751404, "memory(GiB)": 78.33, "step": 2457, "token_acc": 0.8925671118305679, "train_speed(iter/s)": 0.032684 }, { "epoch": 0.47628736133313954, "grad_norm": 0.11840621381998062, "learning_rate": 0.0001741200823065804, "loss": 0.3843688666820526, "memory(GiB)": 78.33, "step": 2458, "token_acc": 0.8865816184002235, "train_speed(iter/s)": 0.032685 }, { "epoch": 0.47648113161846634, "grad_norm": 0.11133860051631927, "learning_rate": 0.00017402519636635445, "loss": 0.36577725410461426, "memory(GiB)": 78.33, "step": 2459, "token_acc": 0.8931690359777295, "train_speed(iter/s)": 0.032686 }, { "epoch": 0.4766749019037931, "grad_norm": 0.10411540418863297, "learning_rate": 0.00017393030055833477, "loss": 0.40943437814712524, "memory(GiB)": 78.33, "step": 2460, "token_acc": 0.8801027690550957, "train_speed(iter/s)": 0.032688 }, { "epoch": 0.4768686721891198, "grad_norm": 0.11033840477466583, "learning_rate": 0.00017383539492149755, "loss": 0.39954739809036255, "memory(GiB)": 78.33, "step": 2461, "token_acc": 0.8828463277143869, "train_speed(iter/s)": 0.032688 }, { "epoch": 0.47706244247444657, "grad_norm": 0.10192213207483292, "learning_rate": 0.00017374047949482324, "loss": 0.36716240644454956, "memory(GiB)": 78.33, "step": 2462, "token_acc": 0.8911974494758457, "train_speed(iter/s)": 0.032689 }, { "epoch": 0.4772562127597733, "grad_norm": 0.10164597630500793, "learning_rate": 0.000173645554317296, "loss": 0.37092214822769165, "memory(GiB)": 78.33, "step": 2463, "token_acc": 0.8932100329364074, "train_speed(iter/s)": 0.03269 }, { "epoch": 0.47744998304510006, "grad_norm": 0.10854342579841614, "learning_rate": 0.0001735506194279043, "loss": 0.37675389647483826, "memory(GiB)": 78.33, "step": 2464, "token_acc": 0.8900420837124658, "train_speed(iter/s)": 0.032691 }, { "epoch": 0.4776437533304268, "grad_norm": 0.1078154519200325, "learning_rate": 0.00017345567486564033, "loss": 0.3637319505214691, "memory(GiB)": 78.33, "step": 2465, "token_acc": 0.8951967543337022, "train_speed(iter/s)": 0.032692 }, { "epoch": 0.47783752361575355, "grad_norm": 0.11544948071241379, "learning_rate": 0.00017336072066950043, "loss": 0.42091140151023865, "memory(GiB)": 78.33, "step": 2466, "token_acc": 0.8770483917519161, "train_speed(iter/s)": 0.032693 }, { "epoch": 0.4780312939010803, "grad_norm": 0.10689322650432587, "learning_rate": 0.00017326575687848483, "loss": 0.36379310488700867, "memory(GiB)": 78.33, "step": 2467, "token_acc": 0.8929775576862291, "train_speed(iter/s)": 0.032694 }, { "epoch": 0.47822506418640703, "grad_norm": 0.1052493080496788, "learning_rate": 0.00017317078353159767, "loss": 0.36684519052505493, "memory(GiB)": 78.33, "step": 2468, "token_acc": 0.8944831280128549, "train_speed(iter/s)": 0.032695 }, { "epoch": 0.4784188344717338, "grad_norm": 0.11039218306541443, "learning_rate": 0.00017307580066784706, "loss": 0.36911848187446594, "memory(GiB)": 78.33, "step": 2469, "token_acc": 0.891165015793186, "train_speed(iter/s)": 0.032696 }, { "epoch": 0.4786126047570605, "grad_norm": 0.09582363814115524, "learning_rate": 0.00017298080832624512, "loss": 0.32416924834251404, "memory(GiB)": 78.33, "step": 2470, "token_acc": 0.9032258064516129, "train_speed(iter/s)": 0.032697 }, { "epoch": 0.47880637504238727, "grad_norm": 0.09783096611499786, "learning_rate": 0.00017288580654580766, "loss": 0.3526180386543274, "memory(GiB)": 78.33, "step": 2471, "token_acc": 0.8959444254195, "train_speed(iter/s)": 0.032698 }, { "epoch": 0.479000145327714, "grad_norm": 0.11712806671857834, "learning_rate": 0.00017279079536555448, "loss": 0.416570246219635, "memory(GiB)": 78.33, "step": 2472, "token_acc": 0.8787166033411736, "train_speed(iter/s)": 0.032699 }, { "epoch": 0.47919391561304076, "grad_norm": 0.10478947311639786, "learning_rate": 0.00017269577482450927, "loss": 0.3899994194507599, "memory(GiB)": 78.33, "step": 2473, "token_acc": 0.8852699947849998, "train_speed(iter/s)": 0.0327 }, { "epoch": 0.4793876858983675, "grad_norm": 0.10049055516719818, "learning_rate": 0.0001726007449616994, "loss": 0.32086169719696045, "memory(GiB)": 78.33, "step": 2474, "token_acc": 0.9046260601387818, "train_speed(iter/s)": 0.032701 }, { "epoch": 0.47958145618369424, "grad_norm": 0.09648868441581726, "learning_rate": 0.00017250570581615632, "loss": 0.3393422067165375, "memory(GiB)": 78.33, "step": 2475, "token_acc": 0.897726723095526, "train_speed(iter/s)": 0.032702 }, { "epoch": 0.479775226469021, "grad_norm": 0.1213340163230896, "learning_rate": 0.00017241065742691508, "loss": 0.3405624032020569, "memory(GiB)": 78.33, "step": 2476, "token_acc": 0.8992950060470193, "train_speed(iter/s)": 0.032703 }, { "epoch": 0.47996899675434773, "grad_norm": 0.10845254361629486, "learning_rate": 0.00017231559983301467, "loss": 0.3818724751472473, "memory(GiB)": 78.33, "step": 2477, "token_acc": 0.8866860090264346, "train_speed(iter/s)": 0.032704 }, { "epoch": 0.4801627670396745, "grad_norm": 0.1041831448674202, "learning_rate": 0.00017222053307349775, "loss": 0.36897408962249756, "memory(GiB)": 78.33, "step": 2478, "token_acc": 0.8889274835675733, "train_speed(iter/s)": 0.032705 }, { "epoch": 0.4803565373250012, "grad_norm": 0.11223351210355759, "learning_rate": 0.00017212545718741084, "loss": 0.39438849687576294, "memory(GiB)": 78.33, "step": 2479, "token_acc": 0.8818676281855405, "train_speed(iter/s)": 0.032706 }, { "epoch": 0.48055030761032796, "grad_norm": 0.11003533750772476, "learning_rate": 0.0001720303722138041, "loss": 0.3478088080883026, "memory(GiB)": 78.33, "step": 2480, "token_acc": 0.8965669378188336, "train_speed(iter/s)": 0.032707 }, { "epoch": 0.4807440778956547, "grad_norm": 0.11993599683046341, "learning_rate": 0.0001719352781917315, "loss": 0.35039135813713074, "memory(GiB)": 78.33, "step": 2481, "token_acc": 0.896633438940834, "train_speed(iter/s)": 0.032708 }, { "epoch": 0.48093784818098145, "grad_norm": 0.10672726482152939, "learning_rate": 0.00017184017516025076, "loss": 0.3407592177391052, "memory(GiB)": 78.33, "step": 2482, "token_acc": 0.8988511857298958, "train_speed(iter/s)": 0.032709 }, { "epoch": 0.4811316184663082, "grad_norm": 0.1108117550611496, "learning_rate": 0.00017174506315842316, "loss": 0.3430328965187073, "memory(GiB)": 78.33, "step": 2483, "token_acc": 0.9010159105568695, "train_speed(iter/s)": 0.03271 }, { "epoch": 0.48132538875163494, "grad_norm": 0.10908929258584976, "learning_rate": 0.00017164994222531384, "loss": 0.365764319896698, "memory(GiB)": 78.33, "step": 2484, "token_acc": 0.8944871455075537, "train_speed(iter/s)": 0.032711 }, { "epoch": 0.4815191590369617, "grad_norm": 0.10668904334306717, "learning_rate": 0.0001715548123999915, "loss": 0.36919450759887695, "memory(GiB)": 78.33, "step": 2485, "token_acc": 0.8912290460418194, "train_speed(iter/s)": 0.032712 }, { "epoch": 0.48171292932228843, "grad_norm": 0.10080403089523315, "learning_rate": 0.0001714596737215285, "loss": 0.33419883251190186, "memory(GiB)": 78.33, "step": 2486, "token_acc": 0.899380222317015, "train_speed(iter/s)": 0.032713 }, { "epoch": 0.4819066996076152, "grad_norm": 0.10038257390260696, "learning_rate": 0.00017136452622900083, "loss": 0.3448052406311035, "memory(GiB)": 78.33, "step": 2487, "token_acc": 0.898629288314102, "train_speed(iter/s)": 0.032714 }, { "epoch": 0.4821004698929419, "grad_norm": 0.10354321449995041, "learning_rate": 0.0001712693699614882, "loss": 0.35112708806991577, "memory(GiB)": 78.33, "step": 2488, "token_acc": 0.8958072674031679, "train_speed(iter/s)": 0.032715 }, { "epoch": 0.48229424017826866, "grad_norm": 0.10565353184938431, "learning_rate": 0.00017117420495807372, "loss": 0.34346991777420044, "memory(GiB)": 78.33, "step": 2489, "token_acc": 0.8978623685413809, "train_speed(iter/s)": 0.032716 }, { "epoch": 0.4824880104635954, "grad_norm": 0.10667932033538818, "learning_rate": 0.00017107903125784433, "loss": 0.39348533749580383, "memory(GiB)": 78.33, "step": 2490, "token_acc": 0.8839731259234479, "train_speed(iter/s)": 0.032717 }, { "epoch": 0.48268178074892215, "grad_norm": 0.10690039396286011, "learning_rate": 0.00017098384889989044, "loss": 0.36393576860427856, "memory(GiB)": 78.33, "step": 2491, "token_acc": 0.891016713091922, "train_speed(iter/s)": 0.032718 }, { "epoch": 0.4828755510342489, "grad_norm": 0.10752350091934204, "learning_rate": 0.0001708886579233059, "loss": 0.3595428764820099, "memory(GiB)": 78.33, "step": 2492, "token_acc": 0.8925832633641197, "train_speed(iter/s)": 0.032719 }, { "epoch": 0.48306932131957564, "grad_norm": 0.10408618301153183, "learning_rate": 0.00017079345836718828, "loss": 0.35838645696640015, "memory(GiB)": 78.33, "step": 2493, "token_acc": 0.893934211276438, "train_speed(iter/s)": 0.03272 }, { "epoch": 0.4832630916049024, "grad_norm": 0.10144350677728653, "learning_rate": 0.0001706982502706385, "loss": 0.3558150827884674, "memory(GiB)": 78.33, "step": 2494, "token_acc": 0.8957832584387083, "train_speed(iter/s)": 0.032721 }, { "epoch": 0.48345686189022913, "grad_norm": 0.10692603886127472, "learning_rate": 0.00017060303367276121, "loss": 0.4040507376194, "memory(GiB)": 78.33, "step": 2495, "token_acc": 0.8803178194638112, "train_speed(iter/s)": 0.032722 }, { "epoch": 0.4836506321755559, "grad_norm": 0.10149496793746948, "learning_rate": 0.00017050780861266432, "loss": 0.36347493529319763, "memory(GiB)": 78.33, "step": 2496, "token_acc": 0.8917304707027652, "train_speed(iter/s)": 0.032723 }, { "epoch": 0.4838444024608826, "grad_norm": 0.11008831858634949, "learning_rate": 0.00017041257512945943, "loss": 0.36779606342315674, "memory(GiB)": 78.33, "step": 2497, "token_acc": 0.8886516076638493, "train_speed(iter/s)": 0.032724 }, { "epoch": 0.48403817274620936, "grad_norm": 0.09817710518836975, "learning_rate": 0.00017031733326226142, "loss": 0.3289712071418762, "memory(GiB)": 78.33, "step": 2498, "token_acc": 0.9038888272495285, "train_speed(iter/s)": 0.032725 }, { "epoch": 0.4842319430315361, "grad_norm": 0.11836884170770645, "learning_rate": 0.00017022208305018867, "loss": 0.4116940498352051, "memory(GiB)": 78.33, "step": 2499, "token_acc": 0.88121387283237, "train_speed(iter/s)": 0.032726 }, { "epoch": 0.48442571331686285, "grad_norm": 0.11396266520023346, "learning_rate": 0.00017012682453236303, "loss": 0.3659469485282898, "memory(GiB)": 78.33, "step": 2500, "token_acc": 0.8918359785092324, "train_speed(iter/s)": 0.032727 }, { "epoch": 0.48442571331686285, "eval_loss": 0.4225236177444458, "eval_runtime": 1345.6225, "eval_samples_per_second": 5.016, "eval_steps_per_second": 5.016, "eval_token_acc": 0.8928025735859316, "step": 2500 }, { "epoch": 0.4846194836021896, "grad_norm": 0.10215523093938828, "learning_rate": 0.00017003155774790966, "loss": 0.37822431325912476, "memory(GiB)": 78.33, "step": 2501, "token_acc": 0.8875518780506904, "train_speed(iter/s)": 0.032161 }, { "epoch": 0.48481325388751634, "grad_norm": 0.09499707072973251, "learning_rate": 0.00016993628273595732, "loss": 0.3357214629650116, "memory(GiB)": 78.33, "step": 2502, "token_acc": 0.8995496714816547, "train_speed(iter/s)": 0.032162 }, { "epoch": 0.4850070241728431, "grad_norm": 0.1082739308476448, "learning_rate": 0.00016984099953563792, "loss": 0.3810504078865051, "memory(GiB)": 78.33, "step": 2503, "token_acc": 0.8882280116266552, "train_speed(iter/s)": 0.032163 }, { "epoch": 0.4852007944581698, "grad_norm": 0.1032496765255928, "learning_rate": 0.0001697457081860869, "loss": 0.36440324783325195, "memory(GiB)": 78.33, "step": 2504, "token_acc": 0.8939489607997896, "train_speed(iter/s)": 0.032164 }, { "epoch": 0.48539456474349657, "grad_norm": 0.08897780627012253, "learning_rate": 0.00016965040872644294, "loss": 0.3292064964771271, "memory(GiB)": 78.33, "step": 2505, "token_acc": 0.9021104037673323, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.4855883350288233, "grad_norm": 0.10932856798171997, "learning_rate": 0.0001695551011958481, "loss": 0.39690276980400085, "memory(GiB)": 78.33, "step": 2506, "token_acc": 0.8848341960945107, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.48578210531415006, "grad_norm": 0.11414424329996109, "learning_rate": 0.0001694597856334477, "loss": 0.39790499210357666, "memory(GiB)": 78.33, "step": 2507, "token_acc": 0.8816454951776053, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.4859758755994768, "grad_norm": 0.09871453046798706, "learning_rate": 0.00016936446207839042, "loss": 0.3503097593784332, "memory(GiB)": 78.33, "step": 2508, "token_acc": 0.8961086541229425, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.48616964588480355, "grad_norm": 0.09798671305179596, "learning_rate": 0.0001692691305698282, "loss": 0.35509318113327026, "memory(GiB)": 78.33, "step": 2509, "token_acc": 0.8953029405960135, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.4863634161701303, "grad_norm": 0.10052474588155746, "learning_rate": 0.00016917379114691635, "loss": 0.34173664450645447, "memory(GiB)": 78.33, "step": 2510, "token_acc": 0.8989416623644811, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.48655718645545704, "grad_norm": 0.11249368637800217, "learning_rate": 0.00016907844384881325, "loss": 0.41095811128616333, "memory(GiB)": 78.33, "step": 2511, "token_acc": 0.8812950699043415, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.4867509567407838, "grad_norm": 0.10713987797498703, "learning_rate": 0.00016898308871468059, "loss": 0.3698621988296509, "memory(GiB)": 78.33, "step": 2512, "token_acc": 0.8919433163888336, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.4869447270261105, "grad_norm": 0.09921532869338989, "learning_rate": 0.00016888772578368326, "loss": 0.36754289269447327, "memory(GiB)": 78.33, "step": 2513, "token_acc": 0.8912722283784497, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.48713849731143727, "grad_norm": 0.10043738037347794, "learning_rate": 0.00016879235509498943, "loss": 0.3569088280200958, "memory(GiB)": 78.33, "step": 2514, "token_acc": 0.8948838737949167, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.487332267596764, "grad_norm": 0.1124173253774643, "learning_rate": 0.00016869697668777043, "loss": 0.37260574102401733, "memory(GiB)": 78.33, "step": 2515, "token_acc": 0.8900623953736113, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.48752603788209076, "grad_norm": 0.10620686411857605, "learning_rate": 0.00016860159060120062, "loss": 0.36633095145225525, "memory(GiB)": 78.33, "step": 2516, "token_acc": 0.8924985397602425, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.4877198081674175, "grad_norm": 0.10621728748083115, "learning_rate": 0.00016850619687445778, "loss": 0.36821886897087097, "memory(GiB)": 78.33, "step": 2517, "token_acc": 0.8912531612453126, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.48791357845274425, "grad_norm": 0.1054278165102005, "learning_rate": 0.0001684107955467226, "loss": 0.3753064274787903, "memory(GiB)": 78.33, "step": 2518, "token_acc": 0.891217046851523, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.488107348738071, "grad_norm": 0.09800657629966736, "learning_rate": 0.00016831538665717895, "loss": 0.35065239667892456, "memory(GiB)": 78.33, "step": 2519, "token_acc": 0.8958839022878686, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.4883011190233978, "grad_norm": 0.12331446260213852, "learning_rate": 0.00016821997024501386, "loss": 0.4164249897003174, "memory(GiB)": 78.33, "step": 2520, "token_acc": 0.8785413567155202, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.48849488930872453, "grad_norm": 0.10607205331325531, "learning_rate": 0.00016812454634941739, "loss": 0.34385666251182556, "memory(GiB)": 78.33, "step": 2521, "token_acc": 0.8990350297422339, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.4886886595940513, "grad_norm": 0.10245993733406067, "learning_rate": 0.00016802911500958268, "loss": 0.38447538018226624, "memory(GiB)": 78.33, "step": 2522, "token_acc": 0.8873564827779333, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.488882429879378, "grad_norm": 0.10764322429895401, "learning_rate": 0.00016793367626470598, "loss": 0.36548304557800293, "memory(GiB)": 78.33, "step": 2523, "token_acc": 0.8924651924651925, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.48907620016470477, "grad_norm": 0.11456014215946198, "learning_rate": 0.0001678382301539866, "loss": 0.3953639566898346, "memory(GiB)": 78.33, "step": 2524, "token_acc": 0.8825286212045794, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.4892699704500315, "grad_norm": 0.10977276414632797, "learning_rate": 0.00016774277671662672, "loss": 0.38997790217399597, "memory(GiB)": 78.33, "step": 2525, "token_acc": 0.8871661125759487, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.48946374073535825, "grad_norm": 0.10588563233613968, "learning_rate": 0.00016764731599183173, "loss": 0.3698723614215851, "memory(GiB)": 78.33, "step": 2526, "token_acc": 0.8911557154232681, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.489657511020685, "grad_norm": 0.1090141013264656, "learning_rate": 0.00016755184801880976, "loss": 0.3647971749305725, "memory(GiB)": 78.33, "step": 2527, "token_acc": 0.8910625354128768, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.48985128130601174, "grad_norm": 0.11149538308382034, "learning_rate": 0.00016745637283677227, "loss": 0.4022625982761383, "memory(GiB)": 78.33, "step": 2528, "token_acc": 0.8828381516777448, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.4900450515913385, "grad_norm": 0.102491594851017, "learning_rate": 0.0001673608904849333, "loss": 0.3276543915271759, "memory(GiB)": 78.33, "step": 2529, "token_acc": 0.9035930180828514, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.49023882187666523, "grad_norm": 0.09906461089849472, "learning_rate": 0.00016726540100251013, "loss": 0.33269554376602173, "memory(GiB)": 78.33, "step": 2530, "token_acc": 0.9001250312578144, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.490432592161992, "grad_norm": 0.10517586022615433, "learning_rate": 0.00016716990442872286, "loss": 0.3707316517829895, "memory(GiB)": 78.33, "step": 2531, "token_acc": 0.8911390265885534, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.4906263624473187, "grad_norm": 0.10482536256313324, "learning_rate": 0.00016707440080279448, "loss": 0.37238699197769165, "memory(GiB)": 78.33, "step": 2532, "token_acc": 0.8896209510682288, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.49082013273264546, "grad_norm": 0.10107911378145218, "learning_rate": 0.00016697889016395085, "loss": 0.34942498803138733, "memory(GiB)": 78.33, "step": 2533, "token_acc": 0.8957952468007313, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.4910139030179722, "grad_norm": 0.11440771073102951, "learning_rate": 0.00016688337255142078, "loss": 0.39366450905799866, "memory(GiB)": 78.33, "step": 2534, "token_acc": 0.8840879915976193, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.49120767330329895, "grad_norm": 0.11480151861906052, "learning_rate": 0.00016678784800443593, "loss": 0.3709234595298767, "memory(GiB)": 78.33, "step": 2535, "token_acc": 0.8920728858433266, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.4914014435886257, "grad_norm": 0.10538162291049957, "learning_rate": 0.00016669231656223082, "loss": 0.3718525171279907, "memory(GiB)": 78.33, "step": 2536, "token_acc": 0.8928541556305238, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.49159521387395244, "grad_norm": 0.10045890510082245, "learning_rate": 0.00016659677826404273, "loss": 0.3458371162414551, "memory(GiB)": 78.33, "step": 2537, "token_acc": 0.8953016402557687, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.4917889841592792, "grad_norm": 0.09948378801345825, "learning_rate": 0.00016650123314911188, "loss": 0.3648657500743866, "memory(GiB)": 78.33, "step": 2538, "token_acc": 0.8927988345515284, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.49198275444460593, "grad_norm": 0.11414239555597305, "learning_rate": 0.00016640568125668117, "loss": 0.39501774311065674, "memory(GiB)": 78.33, "step": 2539, "token_acc": 0.8854694665701183, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.4921765247299327, "grad_norm": 0.11848455667495728, "learning_rate": 0.00016631012262599632, "loss": 0.4026211202144623, "memory(GiB)": 78.33, "step": 2540, "token_acc": 0.8802395209580839, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.4923702950152594, "grad_norm": 0.10458105802536011, "learning_rate": 0.0001662145572963058, "loss": 0.37393152713775635, "memory(GiB)": 78.33, "step": 2541, "token_acc": 0.8897102626590847, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.49256406530058616, "grad_norm": 0.11251001805067062, "learning_rate": 0.0001661189853068609, "loss": 0.39233535528182983, "memory(GiB)": 78.33, "step": 2542, "token_acc": 0.8865367607200936, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.4927578355859129, "grad_norm": 0.11127537488937378, "learning_rate": 0.00016602340669691563, "loss": 0.38980281352996826, "memory(GiB)": 78.33, "step": 2543, "token_acc": 0.8849459159053324, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.49295160587123965, "grad_norm": 0.11086184531450272, "learning_rate": 0.00016592782150572666, "loss": 0.3862883448600769, "memory(GiB)": 78.33, "step": 2544, "token_acc": 0.8859040848435406, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.4931453761565664, "grad_norm": 0.10376356542110443, "learning_rate": 0.00016583222977255337, "loss": 0.36207854747772217, "memory(GiB)": 78.33, "step": 2545, "token_acc": 0.8943563260789377, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.49333914644189314, "grad_norm": 0.10361557453870773, "learning_rate": 0.00016573663153665792, "loss": 0.36922216415405273, "memory(GiB)": 78.33, "step": 2546, "token_acc": 0.8913299533978594, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.4935329167272199, "grad_norm": 0.09568388015031815, "learning_rate": 0.000165641026837305, "loss": 0.32921454310417175, "memory(GiB)": 78.33, "step": 2547, "token_acc": 0.9026607486174076, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.49372668701254663, "grad_norm": 0.1072535291314125, "learning_rate": 0.00016554541571376212, "loss": 0.34089717268943787, "memory(GiB)": 78.33, "step": 2548, "token_acc": 0.899007279947055, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.49392045729787337, "grad_norm": 0.11042487621307373, "learning_rate": 0.00016544979820529924, "loss": 0.381551057100296, "memory(GiB)": 78.33, "step": 2549, "token_acc": 0.8886517557338797, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.4941142275832001, "grad_norm": 0.10860154032707214, "learning_rate": 0.0001653541743511891, "loss": 0.3577283024787903, "memory(GiB)": 78.33, "step": 2550, "token_acc": 0.8940223463687151, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.49430799786852686, "grad_norm": 0.10476289689540863, "learning_rate": 0.00016525854419070698, "loss": 0.34515032172203064, "memory(GiB)": 78.33, "step": 2551, "token_acc": 0.896808724928779, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.4945017681538536, "grad_norm": 0.11404412239789963, "learning_rate": 0.00016516290776313075, "loss": 0.3582231402397156, "memory(GiB)": 78.33, "step": 2552, "token_acc": 0.8928893430305944, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.49469553843918035, "grad_norm": 0.11500284075737, "learning_rate": 0.00016506726510774085, "loss": 0.3722653388977051, "memory(GiB)": 78.33, "step": 2553, "token_acc": 0.8909366240293843, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.4948893087245071, "grad_norm": 0.10306849330663681, "learning_rate": 0.00016497161626382028, "loss": 0.361613392829895, "memory(GiB)": 78.33, "step": 2554, "token_acc": 0.8950341710758377, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.49508307900983384, "grad_norm": 0.10094691812992096, "learning_rate": 0.0001648759612706546, "loss": 0.364359587430954, "memory(GiB)": 78.33, "step": 2555, "token_acc": 0.8946505999798327, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.4952768492951606, "grad_norm": 0.11370456963777542, "learning_rate": 0.00016478030016753195, "loss": 0.3835192620754242, "memory(GiB)": 78.33, "step": 2556, "token_acc": 0.88659125721692, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.4954706195804873, "grad_norm": 0.09802607446908951, "learning_rate": 0.00016468463299374283, "loss": 0.3576071858406067, "memory(GiB)": 78.33, "step": 2557, "token_acc": 0.8939546925566343, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.49566438986581407, "grad_norm": 0.10389211028814316, "learning_rate": 0.00016458895978858034, "loss": 0.33778145909309387, "memory(GiB)": 78.33, "step": 2558, "token_acc": 0.901243754650792, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.4958581601511408, "grad_norm": 0.10826588422060013, "learning_rate": 0.00016449328059134008, "loss": 0.39494338631629944, "memory(GiB)": 78.33, "step": 2559, "token_acc": 0.8841801579743431, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.49605193043646756, "grad_norm": 0.1128348559141159, "learning_rate": 0.00016439759544132, "loss": 0.35508641600608826, "memory(GiB)": 78.33, "step": 2560, "token_acc": 0.8955709517264593, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.4962457007217943, "grad_norm": 0.10237224400043488, "learning_rate": 0.00016430190437782057, "loss": 0.3304956555366516, "memory(GiB)": 78.33, "step": 2561, "token_acc": 0.9007643970512228, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.49643947100712105, "grad_norm": 0.10696551948785782, "learning_rate": 0.00016420620744014473, "loss": 0.34542012214660645, "memory(GiB)": 78.33, "step": 2562, "token_acc": 0.8984604105571847, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.4966332412924478, "grad_norm": 0.10525023192167282, "learning_rate": 0.00016411050466759775, "loss": 0.37965142726898193, "memory(GiB)": 78.33, "step": 2563, "token_acc": 0.8895881006864989, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.49682701157777454, "grad_norm": 0.10651940107345581, "learning_rate": 0.00016401479609948736, "loss": 0.36009615659713745, "memory(GiB)": 78.33, "step": 2564, "token_acc": 0.8930840596007872, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.4970207818631013, "grad_norm": 0.10184381902217865, "learning_rate": 0.00016391908177512362, "loss": 0.3449605405330658, "memory(GiB)": 78.33, "step": 2565, "token_acc": 0.8950802590120822, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.497214552148428, "grad_norm": 0.10290346294641495, "learning_rate": 0.00016382336173381899, "loss": 0.3701861798763275, "memory(GiB)": 78.33, "step": 2566, "token_acc": 0.8891216519527159, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.49740832243375477, "grad_norm": 0.10077276080846786, "learning_rate": 0.00016372763601488818, "loss": 0.36543571949005127, "memory(GiB)": 78.33, "step": 2567, "token_acc": 0.891035628960216, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.4976020927190815, "grad_norm": 0.10975835472345352, "learning_rate": 0.00016363190465764837, "loss": 0.3935272693634033, "memory(GiB)": 78.33, "step": 2568, "token_acc": 0.882976987106251, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.49779586300440826, "grad_norm": 0.10267551988363266, "learning_rate": 0.0001635361677014191, "loss": 0.3506276607513428, "memory(GiB)": 78.33, "step": 2569, "token_acc": 0.8953292213611682, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.497989633289735, "grad_norm": 0.10422598570585251, "learning_rate": 0.00016344042518552198, "loss": 0.3643769323825836, "memory(GiB)": 78.33, "step": 2570, "token_acc": 0.8916066426570628, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.49818340357506175, "grad_norm": 0.110415019094944, "learning_rate": 0.00016334467714928112, "loss": 0.38099807500839233, "memory(GiB)": 78.33, "step": 2571, "token_acc": 0.8894837676823347, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.4983771738603885, "grad_norm": 0.0991300493478775, "learning_rate": 0.00016324892363202273, "loss": 0.33743083477020264, "memory(GiB)": 78.33, "step": 2572, "token_acc": 0.8997278841710735, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.49857094414571523, "grad_norm": 0.11681834608316422, "learning_rate": 0.00016315316467307544, "loss": 0.41430431604385376, "memory(GiB)": 78.33, "step": 2573, "token_acc": 0.8792227548003788, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.498764714431042, "grad_norm": 0.10748863965272903, "learning_rate": 0.0001630574003117699, "loss": 0.393510639667511, "memory(GiB)": 78.33, "step": 2574, "token_acc": 0.8857627401373405, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.4989584847163687, "grad_norm": 0.10948104411363602, "learning_rate": 0.00016296163058743919, "loss": 0.35196927189826965, "memory(GiB)": 78.33, "step": 2575, "token_acc": 0.8924734374072535, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.49915225500169547, "grad_norm": 0.1006598174571991, "learning_rate": 0.00016286585553941857, "loss": 0.32736673951148987, "memory(GiB)": 78.33, "step": 2576, "token_acc": 0.9026124709933379, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.4993460252870222, "grad_norm": 0.09987809509038925, "learning_rate": 0.00016277007520704533, "loss": 0.3588752746582031, "memory(GiB)": 78.33, "step": 2577, "token_acc": 0.8945117224048975, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.49953979557234895, "grad_norm": 0.10553018003702164, "learning_rate": 0.00016267428962965906, "loss": 0.3588329255580902, "memory(GiB)": 78.33, "step": 2578, "token_acc": 0.8936772386426634, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.4997335658576757, "grad_norm": 0.10368996113538742, "learning_rate": 0.00016257849884660148, "loss": 0.36948418617248535, "memory(GiB)": 78.33, "step": 2579, "token_acc": 0.8906377490590371, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.49992733614300244, "grad_norm": 0.1034737378358841, "learning_rate": 0.00016248270289721646, "loss": 0.3439171612262726, "memory(GiB)": 78.33, "step": 2580, "token_acc": 0.8986508719973676, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.5001211064283292, "grad_norm": 0.10310948640108109, "learning_rate": 0.00016238690182084986, "loss": 0.336532324552536, "memory(GiB)": 78.33, "step": 2581, "token_acc": 0.9005147686293942, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.5003148767136559, "grad_norm": 0.1009165570139885, "learning_rate": 0.0001622910956568498, "loss": 0.33198171854019165, "memory(GiB)": 78.33, "step": 2582, "token_acc": 0.9027183088253284, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.5005086469989827, "grad_norm": 0.11531993001699448, "learning_rate": 0.00016219528444456658, "loss": 0.38337087631225586, "memory(GiB)": 78.33, "step": 2583, "token_acc": 0.8874007793332371, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.5007024172843094, "grad_norm": 0.11452654004096985, "learning_rate": 0.0001620994682233523, "loss": 0.3833463191986084, "memory(GiB)": 78.33, "step": 2584, "token_acc": 0.887452540227807, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.5008961875696362, "grad_norm": 0.09829486906528473, "learning_rate": 0.00016200364703256132, "loss": 0.3433375358581543, "memory(GiB)": 78.33, "step": 2585, "token_acc": 0.898637268412188, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.5010899578549629, "grad_norm": 0.11422152817249298, "learning_rate": 0.00016190782091154993, "loss": 0.3638113737106323, "memory(GiB)": 78.33, "step": 2586, "token_acc": 0.8920692223941321, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.5012837281402897, "grad_norm": 0.1095975786447525, "learning_rate": 0.00016181198989967648, "loss": 0.375558078289032, "memory(GiB)": 78.33, "step": 2587, "token_acc": 0.8875113896788801, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.5014774984256164, "grad_norm": 0.12371022254228592, "learning_rate": 0.0001617161540363014, "loss": 0.3656739592552185, "memory(GiB)": 78.33, "step": 2588, "token_acc": 0.890730205842746, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.5016712687109431, "grad_norm": 0.1015966534614563, "learning_rate": 0.00016162031336078707, "loss": 0.35260072350502014, "memory(GiB)": 78.33, "step": 2589, "token_acc": 0.8963511941792216, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.5018650389962699, "grad_norm": 0.09963703155517578, "learning_rate": 0.00016152446791249775, "loss": 0.3328312933444977, "memory(GiB)": 78.33, "step": 2590, "token_acc": 0.9016321007428681, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.5020588092815966, "grad_norm": 0.11252304911613464, "learning_rate": 0.00016142861773079983, "loss": 0.3886357545852661, "memory(GiB)": 78.33, "step": 2591, "token_acc": 0.8855965345045425, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.5022525795669234, "grad_norm": 0.10422030091285706, "learning_rate": 0.00016133276285506152, "loss": 0.3719256520271301, "memory(GiB)": 78.33, "step": 2592, "token_acc": 0.8892891253922772, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.5024463498522501, "grad_norm": 0.11819025874137878, "learning_rate": 0.00016123690332465294, "loss": 0.3973425626754761, "memory(GiB)": 78.33, "step": 2593, "token_acc": 0.8866967825205125, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.5026401201375769, "grad_norm": 0.11547715216875076, "learning_rate": 0.00016114103917894617, "loss": 0.3757579028606415, "memory(GiB)": 78.33, "step": 2594, "token_acc": 0.8914766343788905, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.5028338904229036, "grad_norm": 0.1119546890258789, "learning_rate": 0.0001610451704573153, "loss": 0.365306556224823, "memory(GiB)": 78.33, "step": 2595, "token_acc": 0.8929053225410841, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.5030276607082304, "grad_norm": 0.10307694226503372, "learning_rate": 0.00016094929719913612, "loss": 0.34190264344215393, "memory(GiB)": 78.33, "step": 2596, "token_acc": 0.8982385908726982, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.5032214309935571, "grad_norm": 0.10450293868780136, "learning_rate": 0.00016085341944378634, "loss": 0.37576237320899963, "memory(GiB)": 78.33, "step": 2597, "token_acc": 0.8903388098419935, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.5034152012788838, "grad_norm": 0.11997717618942261, "learning_rate": 0.00016075753723064558, "loss": 0.3967727720737457, "memory(GiB)": 78.33, "step": 2598, "token_acc": 0.8815336184366833, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.5036089715642106, "grad_norm": 0.10347646474838257, "learning_rate": 0.00016066165059909523, "loss": 0.3663371801376343, "memory(GiB)": 78.33, "step": 2599, "token_acc": 0.8920181267202313, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.5038027418495373, "grad_norm": 0.10675547271966934, "learning_rate": 0.00016056575958851843, "loss": 0.3652355670928955, "memory(GiB)": 78.33, "step": 2600, "token_acc": 0.890871055842873, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.5039965121348641, "grad_norm": 0.11141776293516159, "learning_rate": 0.0001604698642383003, "loss": 0.37232065200805664, "memory(GiB)": 78.33, "step": 2601, "token_acc": 0.8899046440325167, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.5041902824201908, "grad_norm": 0.08926242589950562, "learning_rate": 0.00016037396458782759, "loss": 0.3233528435230255, "memory(GiB)": 78.33, "step": 2602, "token_acc": 0.9036984098624263, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.5043840527055176, "grad_norm": 0.09639449417591095, "learning_rate": 0.00016027806067648884, "loss": 0.31952396035194397, "memory(GiB)": 78.33, "step": 2603, "token_acc": 0.9034606910634639, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.5045778229908443, "grad_norm": 0.09628106653690338, "learning_rate": 0.0001601821525436744, "loss": 0.3303004503250122, "memory(GiB)": 78.33, "step": 2604, "token_acc": 0.9007667907185853, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.504771593276171, "grad_norm": 0.10307995975017548, "learning_rate": 0.0001600862402287763, "loss": 0.370724618434906, "memory(GiB)": 78.33, "step": 2605, "token_acc": 0.8890643615105422, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.5049653635614978, "grad_norm": 0.11356647312641144, "learning_rate": 0.00015999032377118834, "loss": 0.3665652275085449, "memory(GiB)": 78.33, "step": 2606, "token_acc": 0.8920088410173396, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.5051591338468245, "grad_norm": 0.11644966900348663, "learning_rate": 0.0001598944032103059, "loss": 0.38838085532188416, "memory(GiB)": 78.33, "step": 2607, "token_acc": 0.8869162270049968, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.5053529041321513, "grad_norm": 0.10509419441223145, "learning_rate": 0.0001597984785855262, "loss": 0.37085598707199097, "memory(GiB)": 78.33, "step": 2608, "token_acc": 0.8929580496730619, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.505546674417478, "grad_norm": 0.10611365735530853, "learning_rate": 0.0001597025499362481, "loss": 0.3378239870071411, "memory(GiB)": 78.33, "step": 2609, "token_acc": 0.9013872771389387, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.5057404447028048, "grad_norm": 0.11018887907266617, "learning_rate": 0.000159606617301872, "loss": 0.3307594954967499, "memory(GiB)": 78.33, "step": 2610, "token_acc": 0.9017131424284106, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.5059342149881316, "grad_norm": 0.1073288694024086, "learning_rate": 0.00015951068072180002, "loss": 0.38456863164901733, "memory(GiB)": 78.33, "step": 2611, "token_acc": 0.8856738792250035, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.5061279852734584, "grad_norm": 0.11159303784370422, "learning_rate": 0.0001594147402354359, "loss": 0.3925580680370331, "memory(GiB)": 78.33, "step": 2612, "token_acc": 0.8856811904125457, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.5063217555587851, "grad_norm": 0.09800157696008682, "learning_rate": 0.00015931879588218503, "loss": 0.3453458845615387, "memory(GiB)": 78.33, "step": 2613, "token_acc": 0.8966416230180362, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.5065155258441119, "grad_norm": 0.09460814297199249, "learning_rate": 0.00015922284770145424, "loss": 0.32773369550704956, "memory(GiB)": 78.33, "step": 2614, "token_acc": 0.9022055463357145, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.5067092961294386, "grad_norm": 0.10572027415037155, "learning_rate": 0.00015912689573265208, "loss": 0.37495627999305725, "memory(GiB)": 78.33, "step": 2615, "token_acc": 0.8893242702918832, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.5069030664147653, "grad_norm": 0.10272035747766495, "learning_rate": 0.00015903094001518857, "loss": 0.3710392713546753, "memory(GiB)": 78.33, "step": 2616, "token_acc": 0.8908351579605881, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.5070968367000921, "grad_norm": 0.11143457889556885, "learning_rate": 0.0001589349805884754, "loss": 0.38444697856903076, "memory(GiB)": 78.33, "step": 2617, "token_acc": 0.8872893621420449, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.5072906069854188, "grad_norm": 0.10727585107088089, "learning_rate": 0.00015883901749192555, "loss": 0.3834106922149658, "memory(GiB)": 78.33, "step": 2618, "token_acc": 0.888004011444415, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.5074843772707456, "grad_norm": 0.10124249756336212, "learning_rate": 0.00015874305076495372, "loss": 0.37049931287765503, "memory(GiB)": 78.33, "step": 2619, "token_acc": 0.8904576168763391, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.5076781475560723, "grad_norm": 0.1076463907957077, "learning_rate": 0.00015864708044697597, "loss": 0.35899150371551514, "memory(GiB)": 78.33, "step": 2620, "token_acc": 0.8948525469168901, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.5078719178413991, "grad_norm": 0.09248903393745422, "learning_rate": 0.00015855110657740998, "loss": 0.3522607386112213, "memory(GiB)": 78.33, "step": 2621, "token_acc": 0.8958730306269461, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.5080656881267258, "grad_norm": 0.10546861588954926, "learning_rate": 0.00015845512919567467, "loss": 0.3607703447341919, "memory(GiB)": 78.33, "step": 2622, "token_acc": 0.8937468225724453, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.5082594584120526, "grad_norm": 0.08962032198905945, "learning_rate": 0.00015835914834119066, "loss": 0.3159025311470032, "memory(GiB)": 78.33, "step": 2623, "token_acc": 0.9060619623054353, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.5084532286973793, "grad_norm": 0.10019563883543015, "learning_rate": 0.0001582631640533798, "loss": 0.34771329164505005, "memory(GiB)": 78.33, "step": 2624, "token_acc": 0.8975572054242686, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.508646998982706, "grad_norm": 0.11215566098690033, "learning_rate": 0.00015816717637166545, "loss": 0.38069719076156616, "memory(GiB)": 78.33, "step": 2625, "token_acc": 0.8880416751549722, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.5088407692680328, "grad_norm": 0.10857607424259186, "learning_rate": 0.00015807118533547228, "loss": 0.36432355642318726, "memory(GiB)": 78.33, "step": 2626, "token_acc": 0.8919505825043285, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.5090345395533595, "grad_norm": 0.10065297037363052, "learning_rate": 0.00015797519098422638, "loss": 0.3655664920806885, "memory(GiB)": 78.33, "step": 2627, "token_acc": 0.8914216996258685, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.5092283098386863, "grad_norm": 0.10752709954977036, "learning_rate": 0.00015787919335735523, "loss": 0.35718318819999695, "memory(GiB)": 78.33, "step": 2628, "token_acc": 0.8937131050767414, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.509422080124013, "grad_norm": 0.10011646896600723, "learning_rate": 0.0001577831924942877, "loss": 0.3524722158908844, "memory(GiB)": 78.33, "step": 2629, "token_acc": 0.8952115870315963, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.5096158504093398, "grad_norm": 0.09924966841936111, "learning_rate": 0.00015768718843445386, "loss": 0.35439997911453247, "memory(GiB)": 78.33, "step": 2630, "token_acc": 0.8962517059855722, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.5098096206946665, "grad_norm": 0.10966690629720688, "learning_rate": 0.00015759118121728516, "loss": 0.4074428677558899, "memory(GiB)": 78.33, "step": 2631, "token_acc": 0.8822711142654365, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.5100033909799933, "grad_norm": 0.11511880904436111, "learning_rate": 0.00015749517088221434, "loss": 0.4096870720386505, "memory(GiB)": 78.33, "step": 2632, "token_acc": 0.8811336465830012, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.51019716126532, "grad_norm": 0.10656805336475372, "learning_rate": 0.00015739915746867546, "loss": 0.3948022127151489, "memory(GiB)": 78.33, "step": 2633, "token_acc": 0.8854893199651264, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.5103909315506467, "grad_norm": 0.09918279200792313, "learning_rate": 0.00015730314101610376, "loss": 0.32525914907455444, "memory(GiB)": 78.33, "step": 2634, "token_acc": 0.9005807402270954, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.5105847018359735, "grad_norm": 0.09818840026855469, "learning_rate": 0.00015720712156393579, "loss": 0.33824896812438965, "memory(GiB)": 78.33, "step": 2635, "token_acc": 0.9008060904612629, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.5107784721213002, "grad_norm": 0.09556617587804794, "learning_rate": 0.00015711109915160932, "loss": 0.3609238564968109, "memory(GiB)": 78.33, "step": 2636, "token_acc": 0.8941985496374093, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.510972242406627, "grad_norm": 0.10253620892763138, "learning_rate": 0.00015701507381856342, "loss": 0.35230863094329834, "memory(GiB)": 78.33, "step": 2637, "token_acc": 0.8980842250604073, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.5111660126919537, "grad_norm": 0.10715737193822861, "learning_rate": 0.00015691904560423818, "loss": 0.37759995460510254, "memory(GiB)": 78.33, "step": 2638, "token_acc": 0.8882215743440234, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.5113597829772805, "grad_norm": 0.1301286518573761, "learning_rate": 0.00015682301454807496, "loss": 0.3717860281467438, "memory(GiB)": 78.33, "step": 2639, "token_acc": 0.8927836337553795, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.5115535532626072, "grad_norm": 0.09518956393003464, "learning_rate": 0.00015672698068951632, "loss": 0.3172317445278168, "memory(GiB)": 78.33, "step": 2640, "token_acc": 0.9052939066263517, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.511747323547934, "grad_norm": 0.09442702680826187, "learning_rate": 0.00015663094406800592, "loss": 0.3069186210632324, "memory(GiB)": 78.33, "step": 2641, "token_acc": 0.9095811612439332, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.5119410938332607, "grad_norm": 0.11501803249120712, "learning_rate": 0.00015653490472298864, "loss": 0.3687857389450073, "memory(GiB)": 78.33, "step": 2642, "token_acc": 0.8907549189100189, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.5121348641185874, "grad_norm": 0.0969555526971817, "learning_rate": 0.00015643886269391043, "loss": 0.3279804587364197, "memory(GiB)": 78.33, "step": 2643, "token_acc": 0.9027313266443702, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.5123286344039142, "grad_norm": 0.10562612861394882, "learning_rate": 0.00015634281802021826, "loss": 0.345773845911026, "memory(GiB)": 78.33, "step": 2644, "token_acc": 0.8951558127530089, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.5125224046892409, "grad_norm": 0.10318602621555328, "learning_rate": 0.0001562467707413603, "loss": 0.38389700651168823, "memory(GiB)": 78.33, "step": 2645, "token_acc": 0.8879924999383219, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.5127161749745677, "grad_norm": 0.0975809097290039, "learning_rate": 0.00015615072089678574, "loss": 0.3352702260017395, "memory(GiB)": 78.33, "step": 2646, "token_acc": 0.8997019155590305, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.5129099452598944, "grad_norm": 0.102164626121521, "learning_rate": 0.00015605466852594481, "loss": 0.36365604400634766, "memory(GiB)": 78.33, "step": 2647, "token_acc": 0.8929846070017708, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.5131037155452212, "grad_norm": 0.09489741176366806, "learning_rate": 0.00015595861366828883, "loss": 0.32492002844810486, "memory(GiB)": 78.33, "step": 2648, "token_acc": 0.9037146465248159, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.5132974858305479, "grad_norm": 0.0985584408044815, "learning_rate": 0.00015586255636327012, "loss": 0.3407394289970398, "memory(GiB)": 78.33, "step": 2649, "token_acc": 0.9006668376506797, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.5134912561158747, "grad_norm": 0.0985412672162056, "learning_rate": 0.00015576649665034197, "loss": 0.3170143961906433, "memory(GiB)": 78.33, "step": 2650, "token_acc": 0.907279489904357, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.5136850264012014, "grad_norm": 0.10823136568069458, "learning_rate": 0.00015567043456895868, "loss": 0.37427809834480286, "memory(GiB)": 78.33, "step": 2651, "token_acc": 0.8882963136611381, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.5138787966865281, "grad_norm": 0.09839842468500137, "learning_rate": 0.0001555743701585756, "loss": 0.3554091155529022, "memory(GiB)": 78.33, "step": 2652, "token_acc": 0.8943349139006738, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.5140725669718549, "grad_norm": 0.10772431641817093, "learning_rate": 0.00015547830345864885, "loss": 0.3907697796821594, "memory(GiB)": 78.33, "step": 2653, "token_acc": 0.8854761904761905, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.5142663372571816, "grad_norm": 0.10258005559444427, "learning_rate": 0.00015538223450863565, "loss": 0.3574678897857666, "memory(GiB)": 78.33, "step": 2654, "token_acc": 0.893127167301461, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.5144601075425084, "grad_norm": 0.10088451206684113, "learning_rate": 0.0001552861633479941, "loss": 0.37425774335861206, "memory(GiB)": 78.33, "step": 2655, "token_acc": 0.8893967324057473, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.5146538778278351, "grad_norm": 0.10441171377897263, "learning_rate": 0.00015519009001618327, "loss": 0.3345174193382263, "memory(GiB)": 78.33, "step": 2656, "token_acc": 0.8989620545132345, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.5148476481131619, "grad_norm": 0.10342314839363098, "learning_rate": 0.000155094014552663, "loss": 0.3678382635116577, "memory(GiB)": 78.33, "step": 2657, "token_acc": 0.8913637207329038, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.5150414183984886, "grad_norm": 0.10229925811290741, "learning_rate": 0.00015499793699689406, "loss": 0.3740781247615814, "memory(GiB)": 78.33, "step": 2658, "token_acc": 0.8912131626660894, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.5152351886838153, "grad_norm": 0.10311522334814072, "learning_rate": 0.0001549018573883381, "loss": 0.3588406443595886, "memory(GiB)": 78.33, "step": 2659, "token_acc": 0.8927260150055872, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.5154289589691421, "grad_norm": 0.1111614927649498, "learning_rate": 0.00015480577576645758, "loss": 0.3687216639518738, "memory(GiB)": 78.33, "step": 2660, "token_acc": 0.8915793809579852, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.5156227292544688, "grad_norm": 0.10557236522436142, "learning_rate": 0.00015470969217071582, "loss": 0.39404910802841187, "memory(GiB)": 78.33, "step": 2661, "token_acc": 0.884908754848619, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.5158164995397956, "grad_norm": 0.11680735647678375, "learning_rate": 0.00015461360664057692, "loss": 0.391072154045105, "memory(GiB)": 78.33, "step": 2662, "token_acc": 0.8852517451358978, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.5160102698251223, "grad_norm": 0.1172829270362854, "learning_rate": 0.00015451751921550583, "loss": 0.38999128341674805, "memory(GiB)": 78.33, "step": 2663, "token_acc": 0.8861199122593224, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.5162040401104491, "grad_norm": 0.09996732324361801, "learning_rate": 0.0001544214299349682, "loss": 0.3332677483558655, "memory(GiB)": 78.33, "step": 2664, "token_acc": 0.900718860279702, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.5163978103957758, "grad_norm": 0.11197924613952637, "learning_rate": 0.00015432533883843048, "loss": 0.35336822271347046, "memory(GiB)": 78.33, "step": 2665, "token_acc": 0.8947896063077465, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.5165915806811026, "grad_norm": 0.10348919034004211, "learning_rate": 0.0001542292459653599, "loss": 0.3489812910556793, "memory(GiB)": 78.33, "step": 2666, "token_acc": 0.8955829903978052, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.5167853509664293, "grad_norm": 0.10220210254192352, "learning_rate": 0.00015413315135522432, "loss": 0.3711531162261963, "memory(GiB)": 78.33, "step": 2667, "token_acc": 0.8892863670783613, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.516979121251756, "grad_norm": 0.1041957437992096, "learning_rate": 0.00015403705504749238, "loss": 0.3776172697544098, "memory(GiB)": 78.33, "step": 2668, "token_acc": 0.8862548629964455, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.5171728915370828, "grad_norm": 0.10291523486375809, "learning_rate": 0.0001539409570816335, "loss": 0.36820337176322937, "memory(GiB)": 78.33, "step": 2669, "token_acc": 0.8891734392557556, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.5173666618224095, "grad_norm": 0.10148876905441284, "learning_rate": 0.00015384485749711768, "loss": 0.34953948855400085, "memory(GiB)": 78.33, "step": 2670, "token_acc": 0.8953482075165169, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.5175604321077363, "grad_norm": 0.09922674298286438, "learning_rate": 0.0001537487563334155, "loss": 0.306792676448822, "memory(GiB)": 78.33, "step": 2671, "token_acc": 0.9090061848682538, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.517754202393063, "grad_norm": 0.10890819132328033, "learning_rate": 0.00015365265362999846, "loss": 0.3788120746612549, "memory(GiB)": 78.33, "step": 2672, "token_acc": 0.8892348255357343, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.5179479726783898, "grad_norm": 0.09601421654224396, "learning_rate": 0.00015355654942633833, "loss": 0.3305688500404358, "memory(GiB)": 78.33, "step": 2673, "token_acc": 0.9015699037955296, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.5181417429637165, "grad_norm": 0.09569491446018219, "learning_rate": 0.00015346044376190782, "loss": 0.31505027413368225, "memory(GiB)": 78.33, "step": 2674, "token_acc": 0.9047790339157246, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.5183355132490433, "grad_norm": 0.11235766857862473, "learning_rate": 0.00015336433667618004, "loss": 0.368362158536911, "memory(GiB)": 78.33, "step": 2675, "token_acc": 0.8934327846364883, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.51852928353437, "grad_norm": 0.09810005128383636, "learning_rate": 0.00015326822820862883, "loss": 0.3768079876899719, "memory(GiB)": 78.33, "step": 2676, "token_acc": 0.8896894012167788, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.5187230538196967, "grad_norm": 0.1080244854092598, "learning_rate": 0.00015317211839872846, "loss": 0.3397291898727417, "memory(GiB)": 78.33, "step": 2677, "token_acc": 0.90013633265167, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.5189168241050235, "grad_norm": 0.10235600918531418, "learning_rate": 0.00015307600728595383, "loss": 0.3567368686199188, "memory(GiB)": 78.33, "step": 2678, "token_acc": 0.89522800645682, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.5191105943903502, "grad_norm": 0.10997829586267471, "learning_rate": 0.00015297989490978037, "loss": 0.3763918876647949, "memory(GiB)": 78.33, "step": 2679, "token_acc": 0.8877150980098758, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.519304364675677, "grad_norm": 0.11425752192735672, "learning_rate": 0.00015288378130968395, "loss": 0.350836843252182, "memory(GiB)": 78.33, "step": 2680, "token_acc": 0.8964464422016792, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.5194981349610037, "grad_norm": 0.10013525933027267, "learning_rate": 0.00015278766652514103, "loss": 0.332501083612442, "memory(GiB)": 78.33, "step": 2681, "token_acc": 0.8996237243171544, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.5196919052463305, "grad_norm": 0.09917152673006058, "learning_rate": 0.00015269155059562863, "loss": 0.35868263244628906, "memory(GiB)": 78.33, "step": 2682, "token_acc": 0.8944104296421408, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.5198856755316572, "grad_norm": 0.10261884331703186, "learning_rate": 0.00015259543356062406, "loss": 0.3666459321975708, "memory(GiB)": 78.33, "step": 2683, "token_acc": 0.8901406178581991, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.520079445816984, "grad_norm": 0.11286422610282898, "learning_rate": 0.00015249931545960517, "loss": 0.38164597749710083, "memory(GiB)": 78.33, "step": 2684, "token_acc": 0.8868490627692125, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.5202732161023107, "grad_norm": 0.10428116470575333, "learning_rate": 0.0001524031963320503, "loss": 0.34657108783721924, "memory(GiB)": 78.33, "step": 2685, "token_acc": 0.8960804399396162, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.5204669863876374, "grad_norm": 0.10740246623754501, "learning_rate": 0.00015230707621743809, "loss": 0.3531382381916046, "memory(GiB)": 78.33, "step": 2686, "token_acc": 0.8949211908931699, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.5206607566729642, "grad_norm": 0.10802320390939713, "learning_rate": 0.00015221095515524768, "loss": 0.38364243507385254, "memory(GiB)": 78.33, "step": 2687, "token_acc": 0.8869632473001232, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.5208545269582909, "grad_norm": 0.10873539000749588, "learning_rate": 0.00015211483318495854, "loss": 0.37823835015296936, "memory(GiB)": 78.33, "step": 2688, "token_acc": 0.8876921812052947, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.5210482972436177, "grad_norm": 0.09489670395851135, "learning_rate": 0.00015201871034605064, "loss": 0.32838284969329834, "memory(GiB)": 78.33, "step": 2689, "token_acc": 0.903539469079638, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.5212420675289444, "grad_norm": 0.09505011886358261, "learning_rate": 0.00015192258667800414, "loss": 0.3134308457374573, "memory(GiB)": 78.33, "step": 2690, "token_acc": 0.9066839378238342, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.5214358378142712, "grad_norm": 0.10631779581308365, "learning_rate": 0.00015182646222029964, "loss": 0.39203402400016785, "memory(GiB)": 78.33, "step": 2691, "token_acc": 0.88480466768138, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.5216296080995979, "grad_norm": 0.11067686975002289, "learning_rate": 0.00015173033701241804, "loss": 0.38351666927337646, "memory(GiB)": 78.33, "step": 2692, "token_acc": 0.8888694485755111, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.5218233783849247, "grad_norm": 0.11114225536584854, "learning_rate": 0.00015163421109384048, "loss": 0.3793320655822754, "memory(GiB)": 78.33, "step": 2693, "token_acc": 0.8876760563380282, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.5220171486702514, "grad_norm": 0.10015545785427094, "learning_rate": 0.0001515380845040485, "loss": 0.31924664974212646, "memory(GiB)": 78.33, "step": 2694, "token_acc": 0.9028615097223401, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.5222109189555781, "grad_norm": 0.11381295323371887, "learning_rate": 0.00015144195728252396, "loss": 0.36840251088142395, "memory(GiB)": 78.33, "step": 2695, "token_acc": 0.889958889452047, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.5224046892409049, "grad_norm": 0.09790168702602386, "learning_rate": 0.00015134582946874875, "loss": 0.3553347587585449, "memory(GiB)": 78.33, "step": 2696, "token_acc": 0.8931691146224018, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.5225984595262316, "grad_norm": 0.10989172011613846, "learning_rate": 0.00015124970110220526, "loss": 0.37042251229286194, "memory(GiB)": 78.33, "step": 2697, "token_acc": 0.8891240509008175, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.5227922298115584, "grad_norm": 0.11018986999988556, "learning_rate": 0.00015115357222237596, "loss": 0.34254854917526245, "memory(GiB)": 78.33, "step": 2698, "token_acc": 0.8972499929196522, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.5229860000968851, "grad_norm": 0.10905101150274277, "learning_rate": 0.00015105744286874354, "loss": 0.37283697724342346, "memory(GiB)": 78.33, "step": 2699, "token_acc": 0.8891810881122045, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.5231797703822119, "grad_norm": 0.10779301077127457, "learning_rate": 0.00015096131308079086, "loss": 0.35086601972579956, "memory(GiB)": 78.33, "step": 2700, "token_acc": 0.894660338178582, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.5233735406675386, "grad_norm": 0.10483751446008682, "learning_rate": 0.00015086518289800108, "loss": 0.3449743688106537, "memory(GiB)": 78.33, "step": 2701, "token_acc": 0.8989076300800705, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.5235673109528654, "grad_norm": 0.11302126944065094, "learning_rate": 0.00015076905235985748, "loss": 0.4011099338531494, "memory(GiB)": 78.33, "step": 2702, "token_acc": 0.8809768411165576, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.5237610812381921, "grad_norm": 0.09251462668180466, "learning_rate": 0.0001506729215058434, "loss": 0.2971772253513336, "memory(GiB)": 78.33, "step": 2703, "token_acc": 0.9121730788649904, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.5239548515235188, "grad_norm": 0.1044599711894989, "learning_rate": 0.0001505767903754424, "loss": 0.3731986880302429, "memory(GiB)": 78.33, "step": 2704, "token_acc": 0.8906576333350313, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.5241486218088456, "grad_norm": 0.11340048164129257, "learning_rate": 0.00015048065900813805, "loss": 0.4006243944168091, "memory(GiB)": 78.33, "step": 2705, "token_acc": 0.8819369331891173, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.5243423920941723, "grad_norm": 0.1074516549706459, "learning_rate": 0.00015038452744341416, "loss": 0.3777906894683838, "memory(GiB)": 78.33, "step": 2706, "token_acc": 0.8883070102521474, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.5245361623794991, "grad_norm": 0.10201266407966614, "learning_rate": 0.00015028839572075447, "loss": 0.36712780594825745, "memory(GiB)": 78.33, "step": 2707, "token_acc": 0.8929159371815307, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.5247299326648258, "grad_norm": 0.1035100668668747, "learning_rate": 0.0001501922638796429, "loss": 0.36527976393699646, "memory(GiB)": 78.33, "step": 2708, "token_acc": 0.8913647477217159, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.5249237029501526, "grad_norm": 0.1076916977763176, "learning_rate": 0.00015009613195956343, "loss": 0.38062483072280884, "memory(GiB)": 78.33, "step": 2709, "token_acc": 0.8895733585435028, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.5251174732354793, "grad_norm": 0.09896742552518845, "learning_rate": 0.00015, "loss": 0.3540569841861725, "memory(GiB)": 78.33, "step": 2710, "token_acc": 0.8951074201119336, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.525311243520806, "grad_norm": 0.09472720324993134, "learning_rate": 0.00014990386804043652, "loss": 0.3247393071651459, "memory(GiB)": 78.33, "step": 2711, "token_acc": 0.901641010008637, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.5255050138061328, "grad_norm": 0.09038470685482025, "learning_rate": 0.0001498077361203571, "loss": 0.3078896105289459, "memory(GiB)": 78.33, "step": 2712, "token_acc": 0.9055924034928663, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.5256987840914595, "grad_norm": 0.10283378511667252, "learning_rate": 0.00014971160427924553, "loss": 0.3739337623119354, "memory(GiB)": 78.33, "step": 2713, "token_acc": 0.8916592017517667, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.5258925543767863, "grad_norm": 0.10884794592857361, "learning_rate": 0.00014961547255658587, "loss": 0.3748420178890228, "memory(GiB)": 78.33, "step": 2714, "token_acc": 0.890541823727448, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.526086324662113, "grad_norm": 0.10610167682170868, "learning_rate": 0.00014951934099186195, "loss": 0.39279741048812866, "memory(GiB)": 78.33, "step": 2715, "token_acc": 0.8841656649546498, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.5262800949474398, "grad_norm": 0.10813671350479126, "learning_rate": 0.00014942320962455766, "loss": 0.36780744791030884, "memory(GiB)": 78.33, "step": 2716, "token_acc": 0.8913100322496296, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.5264738652327665, "grad_norm": 0.103311687707901, "learning_rate": 0.0001493270784941566, "loss": 0.37279364466667175, "memory(GiB)": 78.33, "step": 2717, "token_acc": 0.8909463356782836, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.5266676355180933, "grad_norm": 0.10846543312072754, "learning_rate": 0.00014923094764014247, "loss": 0.36826059222221375, "memory(GiB)": 78.33, "step": 2718, "token_acc": 0.8907717095394189, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.52686140580342, "grad_norm": 0.09876382350921631, "learning_rate": 0.0001491348171019989, "loss": 0.358868807554245, "memory(GiB)": 78.33, "step": 2719, "token_acc": 0.8912382783074005, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.5270551760887467, "grad_norm": 0.09218183159828186, "learning_rate": 0.00014903868691920911, "loss": 0.31816765666007996, "memory(GiB)": 78.33, "step": 2720, "token_acc": 0.9072353603603603, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.5272489463740735, "grad_norm": 0.10111497342586517, "learning_rate": 0.0001489425571312565, "loss": 0.3605380952358246, "memory(GiB)": 78.33, "step": 2721, "token_acc": 0.8904896257271978, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.5274427166594002, "grad_norm": 0.10805071890354156, "learning_rate": 0.00014884642777762404, "loss": 0.35685086250305176, "memory(GiB)": 78.33, "step": 2722, "token_acc": 0.8938120928038679, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.527636486944727, "grad_norm": 0.10611846297979355, "learning_rate": 0.00014875029889779476, "loss": 0.366268128156662, "memory(GiB)": 78.33, "step": 2723, "token_acc": 0.8927133114965365, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.5278302572300537, "grad_norm": 0.10951778292655945, "learning_rate": 0.00014865417053125122, "loss": 0.35670673847198486, "memory(GiB)": 78.33, "step": 2724, "token_acc": 0.8942021014849336, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.5280240275153805, "grad_norm": 0.10806987434625626, "learning_rate": 0.000148558042717476, "loss": 0.34922337532043457, "memory(GiB)": 78.33, "step": 2725, "token_acc": 0.8949587478767289, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.5282177978007072, "grad_norm": 0.09839235991239548, "learning_rate": 0.0001484619154959515, "loss": 0.34449273347854614, "memory(GiB)": 78.33, "step": 2726, "token_acc": 0.8982513498786348, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.528411568086034, "grad_norm": 0.10562780499458313, "learning_rate": 0.00014836578890615952, "loss": 0.34781414270401, "memory(GiB)": 78.33, "step": 2727, "token_acc": 0.8980476002925787, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.5286053383713607, "grad_norm": 0.11507020145654678, "learning_rate": 0.00014826966298758202, "loss": 0.3725394010543823, "memory(GiB)": 78.33, "step": 2728, "token_acc": 0.8897241588360109, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.5287991086566874, "grad_norm": 0.1054215356707573, "learning_rate": 0.00014817353777970036, "loss": 0.3395271897315979, "memory(GiB)": 78.33, "step": 2729, "token_acc": 0.8995790406558103, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.5289928789420142, "grad_norm": 0.11207542568445206, "learning_rate": 0.00014807741332199584, "loss": 0.3861173093318939, "memory(GiB)": 78.33, "step": 2730, "token_acc": 0.8875055334218681, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.5291866492273409, "grad_norm": 0.11059516668319702, "learning_rate": 0.00014798128965394936, "loss": 0.3895573019981384, "memory(GiB)": 78.33, "step": 2731, "token_acc": 0.8851554949115925, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.5293804195126677, "grad_norm": 0.1085188090801239, "learning_rate": 0.0001478851668150414, "loss": 0.37232252955436707, "memory(GiB)": 78.33, "step": 2732, "token_acc": 0.8892735392827885, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.5295741897979945, "grad_norm": 0.09967105835676193, "learning_rate": 0.00014778904484475235, "loss": 0.3428942561149597, "memory(GiB)": 78.33, "step": 2733, "token_acc": 0.8976353126642144, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.5297679600833213, "grad_norm": 0.11610176414251328, "learning_rate": 0.0001476929237825619, "loss": 0.40362459421157837, "memory(GiB)": 78.33, "step": 2734, "token_acc": 0.8841030058763807, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.529961730368648, "grad_norm": 0.10144146531820297, "learning_rate": 0.00014759680366794974, "loss": 0.3451736271381378, "memory(GiB)": 78.33, "step": 2735, "token_acc": 0.8984730993684548, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.5301555006539748, "grad_norm": 0.097495436668396, "learning_rate": 0.0001475006845403948, "loss": 0.3148418366909027, "memory(GiB)": 78.33, "step": 2736, "token_acc": 0.9057284734309409, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.5303492709393015, "grad_norm": 0.10190415382385254, "learning_rate": 0.00014740456643937591, "loss": 0.35173338651657104, "memory(GiB)": 78.33, "step": 2737, "token_acc": 0.895157419172667, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.5305430412246283, "grad_norm": 0.118934765458107, "learning_rate": 0.00014730844940437138, "loss": 0.3918536901473999, "memory(GiB)": 78.33, "step": 2738, "token_acc": 0.8861999064400436, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.530736811509955, "grad_norm": 0.10452469438314438, "learning_rate": 0.00014721233347485892, "loss": 0.36255908012390137, "memory(GiB)": 78.33, "step": 2739, "token_acc": 0.8927415372798595, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.5309305817952817, "grad_norm": 0.10265407711267471, "learning_rate": 0.00014711621869031608, "loss": 0.3520941436290741, "memory(GiB)": 78.33, "step": 2740, "token_acc": 0.8986634006070611, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.5311243520806085, "grad_norm": 0.10262811183929443, "learning_rate": 0.00014702010509021963, "loss": 0.33090177178382874, "memory(GiB)": 78.33, "step": 2741, "token_acc": 0.9019104647847163, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.5313181223659352, "grad_norm": 0.10777027159929276, "learning_rate": 0.0001469239927140462, "loss": 0.3813689649105072, "memory(GiB)": 78.33, "step": 2742, "token_acc": 0.8873789612676056, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.531511892651262, "grad_norm": 0.10505778342485428, "learning_rate": 0.00014682788160127154, "loss": 0.37121495604515076, "memory(GiB)": 78.33, "step": 2743, "token_acc": 0.8890379187973286, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.5317056629365887, "grad_norm": 0.11017142981290817, "learning_rate": 0.00014673177179137114, "loss": 0.3495400547981262, "memory(GiB)": 78.33, "step": 2744, "token_acc": 0.8959156433529097, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.5318994332219155, "grad_norm": 0.11223854124546051, "learning_rate": 0.00014663566332381994, "loss": 0.3930763900279999, "memory(GiB)": 78.33, "step": 2745, "token_acc": 0.8840929833038063, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.5320932035072422, "grad_norm": 0.11058208346366882, "learning_rate": 0.00014653955623809215, "loss": 0.3463561534881592, "memory(GiB)": 78.33, "step": 2746, "token_acc": 0.8972069472423368, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.532286973792569, "grad_norm": 0.10111848264932632, "learning_rate": 0.00014644345057366167, "loss": 0.33341148495674133, "memory(GiB)": 78.33, "step": 2747, "token_acc": 0.9006800088456435, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.5324807440778957, "grad_norm": 0.10276893526315689, "learning_rate": 0.00014634734637000154, "loss": 0.35466858744621277, "memory(GiB)": 78.33, "step": 2748, "token_acc": 0.8964071405257475, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.5326745143632224, "grad_norm": 0.09825938940048218, "learning_rate": 0.0001462512436665845, "loss": 0.35709887742996216, "memory(GiB)": 78.33, "step": 2749, "token_acc": 0.8944281524926686, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.5328682846485492, "grad_norm": 0.1046825423836708, "learning_rate": 0.00014615514250288232, "loss": 0.3530852496623993, "memory(GiB)": 78.33, "step": 2750, "token_acc": 0.8973124966203428, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.5330620549338759, "grad_norm": 0.09290501475334167, "learning_rate": 0.00014605904291836643, "loss": 0.3255302309989929, "memory(GiB)": 78.33, "step": 2751, "token_acc": 0.9017647611001596, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.5332558252192027, "grad_norm": 0.09813160449266434, "learning_rate": 0.0001459629449525076, "loss": 0.35900381207466125, "memory(GiB)": 78.33, "step": 2752, "token_acc": 0.8953540595838115, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.5334495955045294, "grad_norm": 0.10881511121988297, "learning_rate": 0.0001458668486447757, "loss": 0.3696974217891693, "memory(GiB)": 78.33, "step": 2753, "token_acc": 0.8900663072442213, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.5336433657898562, "grad_norm": 0.09696649760007858, "learning_rate": 0.00014577075403464013, "loss": 0.3419326841831207, "memory(GiB)": 78.33, "step": 2754, "token_acc": 0.8965491278704982, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.5338371360751829, "grad_norm": 0.10864214599132538, "learning_rate": 0.0001456746611615695, "loss": 0.3853573799133301, "memory(GiB)": 78.33, "step": 2755, "token_acc": 0.8867099400407064, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.5340309063605096, "grad_norm": 0.105714350938797, "learning_rate": 0.00014557857006503182, "loss": 0.3608066737651825, "memory(GiB)": 78.33, "step": 2756, "token_acc": 0.8907904278462654, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.5342246766458364, "grad_norm": 0.10220605880022049, "learning_rate": 0.00014548248078449417, "loss": 0.35340073704719543, "memory(GiB)": 78.33, "step": 2757, "token_acc": 0.8937097622685939, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.5344184469311631, "grad_norm": 0.12155355513095856, "learning_rate": 0.00014538639335942303, "loss": 0.3872877061367035, "memory(GiB)": 78.33, "step": 2758, "token_acc": 0.8883419182369433, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.5346122172164899, "grad_norm": 0.11298363655805588, "learning_rate": 0.0001452903078292842, "loss": 0.36656370759010315, "memory(GiB)": 78.33, "step": 2759, "token_acc": 0.8930003250623036, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.5348059875018166, "grad_norm": 0.1053992435336113, "learning_rate": 0.00014519422423354243, "loss": 0.3747759163379669, "memory(GiB)": 78.33, "step": 2760, "token_acc": 0.8891952102478419, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.5349997577871434, "grad_norm": 0.09907464683055878, "learning_rate": 0.00014509814261166193, "loss": 0.33399906754493713, "memory(GiB)": 78.33, "step": 2761, "token_acc": 0.8976413830072366, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.5351935280724701, "grad_norm": 0.0989702120423317, "learning_rate": 0.00014500206300310594, "loss": 0.3469410538673401, "memory(GiB)": 78.33, "step": 2762, "token_acc": 0.8955082378865913, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.5353872983577969, "grad_norm": 0.09886979311704636, "learning_rate": 0.00014490598544733695, "loss": 0.3301083445549011, "memory(GiB)": 78.33, "step": 2763, "token_acc": 0.903747359636999, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.5355810686431236, "grad_norm": 0.11633176356554031, "learning_rate": 0.00014480990998381674, "loss": 0.36927592754364014, "memory(GiB)": 78.33, "step": 2764, "token_acc": 0.8909205359551572, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.5357748389284503, "grad_norm": 0.10796020925045013, "learning_rate": 0.00014471383665200585, "loss": 0.3573460876941681, "memory(GiB)": 78.33, "step": 2765, "token_acc": 0.8945837063563116, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.5359686092137771, "grad_norm": 0.10915590077638626, "learning_rate": 0.00014461776549136435, "loss": 0.3306170701980591, "memory(GiB)": 78.33, "step": 2766, "token_acc": 0.9005574136008918, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.5361623794991038, "grad_norm": 0.09537974745035172, "learning_rate": 0.00014452169654135115, "loss": 0.3284014165401459, "memory(GiB)": 78.33, "step": 2767, "token_acc": 0.9025765702541481, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.5363561497844306, "grad_norm": 0.09599710255861282, "learning_rate": 0.00014442562984142446, "loss": 0.34253832697868347, "memory(GiB)": 78.33, "step": 2768, "token_acc": 0.8958434143036872, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.5365499200697573, "grad_norm": 0.10483718663454056, "learning_rate": 0.0001443295654310413, "loss": 0.3458617329597473, "memory(GiB)": 78.33, "step": 2769, "token_acc": 0.8977225020990764, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.5367436903550841, "grad_norm": 0.10079243779182434, "learning_rate": 0.000144233503349658, "loss": 0.3476545810699463, "memory(GiB)": 78.33, "step": 2770, "token_acc": 0.8959448042804844, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.5369374606404108, "grad_norm": 0.10120805352926254, "learning_rate": 0.00014413744363672988, "loss": 0.35417628288269043, "memory(GiB)": 78.33, "step": 2771, "token_acc": 0.8963836812952706, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.5371312309257376, "grad_norm": 0.10923577845096588, "learning_rate": 0.00014404138633171114, "loss": 0.388058602809906, "memory(GiB)": 78.33, "step": 2772, "token_acc": 0.8873768455115426, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.5373250012110643, "grad_norm": 0.10782677680253983, "learning_rate": 0.00014394533147405519, "loss": 0.3875938057899475, "memory(GiB)": 78.33, "step": 2773, "token_acc": 0.8839462734913853, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.537518771496391, "grad_norm": 0.10002797842025757, "learning_rate": 0.00014384927910321424, "loss": 0.35020288825035095, "memory(GiB)": 78.33, "step": 2774, "token_acc": 0.8941337890860073, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.5377125417817178, "grad_norm": 0.09954270720481873, "learning_rate": 0.0001437532292586397, "loss": 0.3441551625728607, "memory(GiB)": 78.33, "step": 2775, "token_acc": 0.9002784888463811, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.5379063120670445, "grad_norm": 0.10037088394165039, "learning_rate": 0.00014365718197978172, "loss": 0.34260034561157227, "memory(GiB)": 78.33, "step": 2776, "token_acc": 0.89792182924082, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.5381000823523713, "grad_norm": 0.1107875406742096, "learning_rate": 0.00014356113730608954, "loss": 0.3863231837749481, "memory(GiB)": 78.33, "step": 2777, "token_acc": 0.8852114077738917, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.538293852637698, "grad_norm": 0.10534848272800446, "learning_rate": 0.00014346509527701133, "loss": 0.38511624932289124, "memory(GiB)": 78.33, "step": 2778, "token_acc": 0.8850161787561467, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.5384876229230248, "grad_norm": 0.10484083741903305, "learning_rate": 0.00014336905593199405, "loss": 0.3411652445793152, "memory(GiB)": 78.33, "step": 2779, "token_acc": 0.8983436180578109, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.5386813932083515, "grad_norm": 0.09589672088623047, "learning_rate": 0.00014327301931048368, "loss": 0.32667702436447144, "memory(GiB)": 78.33, "step": 2780, "token_acc": 0.902627403397424, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.5388751634936783, "grad_norm": 0.11629176884889603, "learning_rate": 0.00014317698545192504, "loss": 0.39405229687690735, "memory(GiB)": 78.33, "step": 2781, "token_acc": 0.8830865833285327, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.539068933779005, "grad_norm": 0.11103569716215134, "learning_rate": 0.00014308095439576188, "loss": 0.37723052501678467, "memory(GiB)": 78.33, "step": 2782, "token_acc": 0.8878711653086893, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.5392627040643317, "grad_norm": 0.10809724777936935, "learning_rate": 0.00014298492618143658, "loss": 0.3773004710674286, "memory(GiB)": 78.33, "step": 2783, "token_acc": 0.8881794280195311, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.5394564743496585, "grad_norm": 0.10347483307123184, "learning_rate": 0.0001428889008483906, "loss": 0.3722970187664032, "memory(GiB)": 78.33, "step": 2784, "token_acc": 0.8895179639134346, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.5396502446349852, "grad_norm": 0.10195406526327133, "learning_rate": 0.00014279287843606422, "loss": 0.3863418996334076, "memory(GiB)": 78.33, "step": 2785, "token_acc": 0.8859166207897503, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.539844014920312, "grad_norm": 0.10171383619308472, "learning_rate": 0.00014269685898389624, "loss": 0.3722241222858429, "memory(GiB)": 78.33, "step": 2786, "token_acc": 0.889989379306938, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.5400377852056387, "grad_norm": 0.10024231672286987, "learning_rate": 0.00014260084253132457, "loss": 0.3747716546058655, "memory(GiB)": 78.33, "step": 2787, "token_acc": 0.8894973834732871, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.5402315554909655, "grad_norm": 0.09642113000154495, "learning_rate": 0.00014250482911778563, "loss": 0.34401756525039673, "memory(GiB)": 78.33, "step": 2788, "token_acc": 0.8965368793647757, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.5404253257762922, "grad_norm": 0.09063483029603958, "learning_rate": 0.00014240881878271487, "loss": 0.31945639848709106, "memory(GiB)": 78.33, "step": 2789, "token_acc": 0.9047063862187114, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.540619096061619, "grad_norm": 0.1115802749991417, "learning_rate": 0.00014231281156554615, "loss": 0.37289944291114807, "memory(GiB)": 78.33, "step": 2790, "token_acc": 0.8908885411232829, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.5408128663469457, "grad_norm": 0.1019621267914772, "learning_rate": 0.00014221680750571228, "loss": 0.3510439693927765, "memory(GiB)": 78.33, "step": 2791, "token_acc": 0.8948518823287389, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.5410066366322724, "grad_norm": 0.09841447323560715, "learning_rate": 0.00014212080664264477, "loss": 0.3427790403366089, "memory(GiB)": 78.33, "step": 2792, "token_acc": 0.8993469501181047, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.5412004069175992, "grad_norm": 0.10309161990880966, "learning_rate": 0.00014202480901577362, "loss": 0.34060966968536377, "memory(GiB)": 78.33, "step": 2793, "token_acc": 0.900046502721777, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.5413941772029259, "grad_norm": 0.10474463552236557, "learning_rate": 0.00014192881466452775, "loss": 0.3322206437587738, "memory(GiB)": 78.33, "step": 2794, "token_acc": 0.8990165235382479, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.5415879474882527, "grad_norm": 0.11492858082056046, "learning_rate": 0.00014183282362833455, "loss": 0.37814396619796753, "memory(GiB)": 78.33, "step": 2795, "token_acc": 0.8911760892116183, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.5417817177735794, "grad_norm": 0.11693891882896423, "learning_rate": 0.00014173683594662014, "loss": 0.3709845244884491, "memory(GiB)": 78.33, "step": 2796, "token_acc": 0.8902887474818534, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.5419754880589062, "grad_norm": 0.1105792298913002, "learning_rate": 0.00014164085165880932, "loss": 0.3588966131210327, "memory(GiB)": 78.33, "step": 2797, "token_acc": 0.8919163847269213, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.5421692583442329, "grad_norm": 0.10854232311248779, "learning_rate": 0.00014154487080432528, "loss": 0.3848278820514679, "memory(GiB)": 78.33, "step": 2798, "token_acc": 0.8873765907664213, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.5423630286295597, "grad_norm": 0.09642491489648819, "learning_rate": 0.00014144889342259002, "loss": 0.3380579352378845, "memory(GiB)": 78.33, "step": 2799, "token_acc": 0.8996538924558587, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.5425567989148864, "grad_norm": 0.10592179000377655, "learning_rate": 0.000141352919553024, "loss": 0.37832123041152954, "memory(GiB)": 78.33, "step": 2800, "token_acc": 0.8886095633018012, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.5427505692002131, "grad_norm": 0.10104614496231079, "learning_rate": 0.0001412569492350463, "loss": 0.36123543977737427, "memory(GiB)": 78.33, "step": 2801, "token_acc": 0.8938950807577793, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.5429443394855399, "grad_norm": 0.10246588289737701, "learning_rate": 0.00014116098250807445, "loss": 0.35974156856536865, "memory(GiB)": 78.33, "step": 2802, "token_acc": 0.893833531441162, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.5431381097708666, "grad_norm": 0.10312814265489578, "learning_rate": 0.00014106501941152459, "loss": 0.357723206281662, "memory(GiB)": 78.33, "step": 2803, "token_acc": 0.8940801971556195, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.5433318800561934, "grad_norm": 0.10023822635412216, "learning_rate": 0.0001409690599848114, "loss": 0.33914467692375183, "memory(GiB)": 78.33, "step": 2804, "token_acc": 0.8970815359216062, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.5435256503415201, "grad_norm": 0.10362284630537033, "learning_rate": 0.0001408731042673479, "loss": 0.3780994415283203, "memory(GiB)": 78.33, "step": 2805, "token_acc": 0.8886399282993502, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.5437194206268469, "grad_norm": 0.10058917850255966, "learning_rate": 0.00014077715229854576, "loss": 0.34822142124176025, "memory(GiB)": 78.33, "step": 2806, "token_acc": 0.8962744785862123, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.5439131909121736, "grad_norm": 0.10602036118507385, "learning_rate": 0.00014068120411781497, "loss": 0.3826134502887726, "memory(GiB)": 78.33, "step": 2807, "token_acc": 0.8868417471764866, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.5441069611975003, "grad_norm": 0.09917270392179489, "learning_rate": 0.0001405852597645641, "loss": 0.35776597261428833, "memory(GiB)": 78.33, "step": 2808, "token_acc": 0.8961601781909743, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.5443007314828271, "grad_norm": 0.20605452358722687, "learning_rate": 0.00014048931927819995, "loss": 0.3961770832538605, "memory(GiB)": 78.33, "step": 2809, "token_acc": 0.8819182185180362, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.5444945017681538, "grad_norm": 0.11399701237678528, "learning_rate": 0.00014039338269812796, "loss": 0.39141345024108887, "memory(GiB)": 78.33, "step": 2810, "token_acc": 0.8836162513606542, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.5446882720534806, "grad_norm": 0.10171796381473541, "learning_rate": 0.0001402974500637519, "loss": 0.3601566255092621, "memory(GiB)": 78.33, "step": 2811, "token_acc": 0.8961741354505248, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.5448820423388073, "grad_norm": 0.09617502242326736, "learning_rate": 0.00014020152141447375, "loss": 0.34712108969688416, "memory(GiB)": 78.33, "step": 2812, "token_acc": 0.8963927855711423, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.5450758126241341, "grad_norm": 0.10345807671546936, "learning_rate": 0.00014010559678969407, "loss": 0.36251404881477356, "memory(GiB)": 78.33, "step": 2813, "token_acc": 0.8929795640683466, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.5452695829094608, "grad_norm": 0.09582247585058212, "learning_rate": 0.00014000967622881166, "loss": 0.3179134130477905, "memory(GiB)": 78.33, "step": 2814, "token_acc": 0.9051344993351744, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.5454633531947876, "grad_norm": 0.125186488032341, "learning_rate": 0.0001399137597712237, "loss": 0.36605405807495117, "memory(GiB)": 78.33, "step": 2815, "token_acc": 0.8923891831614162, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.5456571234801143, "grad_norm": 0.10813166946172714, "learning_rate": 0.00013981784745632558, "loss": 0.36997631192207336, "memory(GiB)": 78.33, "step": 2816, "token_acc": 0.8912589239073655, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.545850893765441, "grad_norm": 0.10874950140714645, "learning_rate": 0.00013972193932351113, "loss": 0.36081942915916443, "memory(GiB)": 78.33, "step": 2817, "token_acc": 0.89447387926538, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.5460446640507678, "grad_norm": 0.09806004166603088, "learning_rate": 0.00013962603541217244, "loss": 0.34919509291648865, "memory(GiB)": 78.33, "step": 2818, "token_acc": 0.8967576382560315, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.5462384343360945, "grad_norm": 0.1022680252790451, "learning_rate": 0.0001395301357616997, "loss": 0.3764371871948242, "memory(GiB)": 78.33, "step": 2819, "token_acc": 0.8876494023904382, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.5464322046214213, "grad_norm": 0.10649038106203079, "learning_rate": 0.00013943424041148154, "loss": 0.38199445605278015, "memory(GiB)": 78.33, "step": 2820, "token_acc": 0.8861415110903026, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.546625974906748, "grad_norm": 0.10255075246095657, "learning_rate": 0.00013933834940090475, "loss": 0.36467188596725464, "memory(GiB)": 78.33, "step": 2821, "token_acc": 0.8910845821691643, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.5468197451920748, "grad_norm": 0.10376548022031784, "learning_rate": 0.00013924246276935442, "loss": 0.32367387413978577, "memory(GiB)": 78.33, "step": 2822, "token_acc": 0.9025662838508436, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.5470135154774015, "grad_norm": 0.10719739645719528, "learning_rate": 0.00013914658055621363, "loss": 0.3771483302116394, "memory(GiB)": 78.33, "step": 2823, "token_acc": 0.8899454274847032, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.5472072857627283, "grad_norm": 0.11074833571910858, "learning_rate": 0.00013905070280086386, "loss": 0.36725014448165894, "memory(GiB)": 78.33, "step": 2824, "token_acc": 0.8935861944063543, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.547401056048055, "grad_norm": 0.10274602472782135, "learning_rate": 0.0001389548295426847, "loss": 0.3463117480278015, "memory(GiB)": 78.33, "step": 2825, "token_acc": 0.8964165043036476, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.5475948263333817, "grad_norm": 0.10460248589515686, "learning_rate": 0.0001388589608210538, "loss": 0.35207241773605347, "memory(GiB)": 78.33, "step": 2826, "token_acc": 0.8935823860501049, "train_speed(iter/s)": 0.032483 }, { "epoch": 0.5477885966187085, "grad_norm": 0.11349425464868546, "learning_rate": 0.0001387630966753471, "loss": 0.377665638923645, "memory(GiB)": 78.33, "step": 2827, "token_acc": 0.8892741984547059, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.5479823669040352, "grad_norm": 0.11117535084486008, "learning_rate": 0.0001386672371449385, "loss": 0.38577699661254883, "memory(GiB)": 78.33, "step": 2828, "token_acc": 0.887010551652719, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.548176137189362, "grad_norm": 0.09501176327466965, "learning_rate": 0.0001385713822692001, "loss": 0.3158635199069977, "memory(GiB)": 78.33, "step": 2829, "token_acc": 0.9051193667815651, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.5483699074746887, "grad_norm": 0.0978478193283081, "learning_rate": 0.00013847553208750222, "loss": 0.33727461099624634, "memory(GiB)": 78.33, "step": 2830, "token_acc": 0.9000374953130859, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.5485636777600155, "grad_norm": 0.11235343664884567, "learning_rate": 0.0001383796866392129, "loss": 0.3979712128639221, "memory(GiB)": 78.33, "step": 2831, "token_acc": 0.8826383993927235, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.5487574480453422, "grad_norm": 0.11587338894605637, "learning_rate": 0.0001382838459636986, "loss": 0.4109783470630646, "memory(GiB)": 78.33, "step": 2832, "token_acc": 0.8784734879286655, "train_speed(iter/s)": 0.032489 }, { "epoch": 0.548951218330669, "grad_norm": 0.09932563453912735, "learning_rate": 0.0001381880101003235, "loss": 0.3608021140098572, "memory(GiB)": 78.33, "step": 2833, "token_acc": 0.892873446614253, "train_speed(iter/s)": 0.03249 }, { "epoch": 0.5491449886159957, "grad_norm": 0.1027938574552536, "learning_rate": 0.00013809217908845008, "loss": 0.3469353914260864, "memory(GiB)": 78.33, "step": 2834, "token_acc": 0.8969847914789889, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.5493387589013224, "grad_norm": 0.1079033836722374, "learning_rate": 0.00013799635296743868, "loss": 0.3682827055454254, "memory(GiB)": 78.33, "step": 2835, "token_acc": 0.8886550135749367, "train_speed(iter/s)": 0.032492 }, { "epoch": 0.5495325291866492, "grad_norm": 0.10848791897296906, "learning_rate": 0.00013790053177664766, "loss": 0.3711949288845062, "memory(GiB)": 78.33, "step": 2836, "token_acc": 0.8927511279981002, "train_speed(iter/s)": 0.032492 }, { "epoch": 0.5497262994719759, "grad_norm": 0.10043694078922272, "learning_rate": 0.00013780471555543343, "loss": 0.3552500009536743, "memory(GiB)": 78.33, "step": 2837, "token_acc": 0.8943481138318994, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.5499200697573027, "grad_norm": 0.10964163392782211, "learning_rate": 0.00013770890434315012, "loss": 0.3622366786003113, "memory(GiB)": 78.33, "step": 2838, "token_acc": 0.8923598464684952, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.5501138400426294, "grad_norm": 0.1045740619301796, "learning_rate": 0.00013761309817915014, "loss": 0.3492569923400879, "memory(GiB)": 78.33, "step": 2839, "token_acc": 0.8954375260019415, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.5503076103279562, "grad_norm": 0.11920612305402756, "learning_rate": 0.00013751729710278354, "loss": 0.32998332381248474, "memory(GiB)": 78.33, "step": 2840, "token_acc": 0.9013660163219938, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.5505013806132829, "grad_norm": 0.09879645705223083, "learning_rate": 0.00013742150115339852, "loss": 0.31799447536468506, "memory(GiB)": 78.33, "step": 2841, "token_acc": 0.9049382374886009, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.5506951508986097, "grad_norm": 0.10987204313278198, "learning_rate": 0.0001373257103703409, "loss": 0.3859724998474121, "memory(GiB)": 78.33, "step": 2842, "token_acc": 0.885055264529223, "train_speed(iter/s)": 0.032498 }, { "epoch": 0.5508889211839364, "grad_norm": 0.1112445592880249, "learning_rate": 0.00013722992479295461, "loss": 0.3733959496021271, "memory(GiB)": 78.33, "step": 2843, "token_acc": 0.8924345643599857, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.5510826914692631, "grad_norm": 0.10867438465356827, "learning_rate": 0.00013713414446058143, "loss": 0.3374147415161133, "memory(GiB)": 78.33, "step": 2844, "token_acc": 0.8994350282485876, "train_speed(iter/s)": 0.0325 }, { "epoch": 0.5512764617545899, "grad_norm": 0.10243318974971771, "learning_rate": 0.00013703836941256073, "loss": 0.36659640073776245, "memory(GiB)": 78.33, "step": 2845, "token_acc": 0.8927052150622341, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.5514702320399166, "grad_norm": 0.10131695866584778, "learning_rate": 0.00013694259968823007, "loss": 0.3265146017074585, "memory(GiB)": 78.33, "step": 2846, "token_acc": 0.9022528879828551, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.5516640023252434, "grad_norm": 0.1028154119849205, "learning_rate": 0.00013684683532692456, "loss": 0.34832775592803955, "memory(GiB)": 78.33, "step": 2847, "token_acc": 0.8964531768308241, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.5518577726105701, "grad_norm": 0.10362169146537781, "learning_rate": 0.00013675107636797727, "loss": 0.35685330629348755, "memory(GiB)": 78.33, "step": 2848, "token_acc": 0.8953478775207021, "train_speed(iter/s)": 0.032503 }, { "epoch": 0.5520515428958969, "grad_norm": 0.10931023210287094, "learning_rate": 0.00013665532285071885, "loss": 0.3599531650543213, "memory(GiB)": 78.33, "step": 2849, "token_acc": 0.894084200338737, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.5522453131812236, "grad_norm": 0.10313741117715836, "learning_rate": 0.00013655957481447796, "loss": 0.3336341977119446, "memory(GiB)": 78.33, "step": 2850, "token_acc": 0.8995257854179016, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.5524390834665504, "grad_norm": 0.10181530565023422, "learning_rate": 0.00013646383229858088, "loss": 0.3731972575187683, "memory(GiB)": 78.33, "step": 2851, "token_acc": 0.8911425098754178, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.5526328537518771, "grad_norm": 0.09568587690591812, "learning_rate": 0.00013636809534235155, "loss": 0.3068424463272095, "memory(GiB)": 78.33, "step": 2852, "token_acc": 0.9089695605517314, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.5528266240372038, "grad_norm": 0.10480746626853943, "learning_rate": 0.00013627236398511183, "loss": 0.3546777069568634, "memory(GiB)": 78.33, "step": 2853, "token_acc": 0.8944366727644464, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.5530203943225307, "grad_norm": 0.1180429756641388, "learning_rate": 0.00013617663826618102, "loss": 0.40899646282196045, "memory(GiB)": 78.33, "step": 2854, "token_acc": 0.8810652619256658, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.5532141646078574, "grad_norm": 0.09670621156692505, "learning_rate": 0.0001360809182248764, "loss": 0.33177006244659424, "memory(GiB)": 78.33, "step": 2855, "token_acc": 0.9025397480900268, "train_speed(iter/s)": 0.03251 }, { "epoch": 0.5534079348931842, "grad_norm": 0.10695379227399826, "learning_rate": 0.00013598520390051264, "loss": 0.34610581398010254, "memory(GiB)": 78.33, "step": 2856, "token_acc": 0.898416321871879, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.5536017051785109, "grad_norm": 0.10109010338783264, "learning_rate": 0.00013588949533240222, "loss": 0.3390766978263855, "memory(GiB)": 78.33, "step": 2857, "token_acc": 0.89864415955117, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.5537954754638377, "grad_norm": 0.101463183760643, "learning_rate": 0.00013579379255985528, "loss": 0.3392132818698883, "memory(GiB)": 78.33, "step": 2858, "token_acc": 0.898758677643294, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.5539892457491644, "grad_norm": 0.09950132668018341, "learning_rate": 0.00013569809562217943, "loss": 0.32112976908683777, "memory(GiB)": 78.33, "step": 2859, "token_acc": 0.902511359616902, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.5541830160344912, "grad_norm": 0.10303032398223877, "learning_rate": 0.00013560240455868003, "loss": 0.3673378825187683, "memory(GiB)": 78.33, "step": 2860, "token_acc": 0.8928259417236241, "train_speed(iter/s)": 0.032514 }, { "epoch": 0.5543767863198179, "grad_norm": 0.11683917790651321, "learning_rate": 0.00013550671940865992, "loss": 0.3810235261917114, "memory(GiB)": 78.33, "step": 2861, "token_acc": 0.8879626045791527, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.5545705566051446, "grad_norm": 0.11016968637704849, "learning_rate": 0.0001354110402114196, "loss": 0.3756512999534607, "memory(GiB)": 78.33, "step": 2862, "token_acc": 0.8894170776635569, "train_speed(iter/s)": 0.032516 }, { "epoch": 0.5547643268904714, "grad_norm": 0.09833884984254837, "learning_rate": 0.00013531536700625715, "loss": 0.3510299026966095, "memory(GiB)": 78.33, "step": 2863, "token_acc": 0.8975372743295384, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.5549580971757981, "grad_norm": 0.09793855249881744, "learning_rate": 0.00013521969983246803, "loss": 0.36319395899772644, "memory(GiB)": 78.33, "step": 2864, "token_acc": 0.892717529189427, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.5551518674611249, "grad_norm": 0.11692684143781662, "learning_rate": 0.0001351240387293454, "loss": 0.407484233379364, "memory(GiB)": 78.33, "step": 2865, "token_acc": 0.8815019139833371, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.5553456377464516, "grad_norm": 0.10228422284126282, "learning_rate": 0.0001350283837361797, "loss": 0.340445339679718, "memory(GiB)": 78.33, "step": 2866, "token_acc": 0.8981877995519129, "train_speed(iter/s)": 0.032519 }, { "epoch": 0.5555394080317784, "grad_norm": 0.09636260569095612, "learning_rate": 0.00013493273489225915, "loss": 0.32771408557891846, "memory(GiB)": 78.33, "step": 2867, "token_acc": 0.9005267118133935, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.5557331783171051, "grad_norm": 0.10914786905050278, "learning_rate": 0.00013483709223686922, "loss": 0.35096076130867004, "memory(GiB)": 78.33, "step": 2868, "token_acc": 0.8962214601291438, "train_speed(iter/s)": 0.032521 }, { "epoch": 0.5559269486024319, "grad_norm": 0.10578683018684387, "learning_rate": 0.00013474145580929297, "loss": 0.36036306619644165, "memory(GiB)": 78.33, "step": 2869, "token_acc": 0.8910474829632886, "train_speed(iter/s)": 0.032522 }, { "epoch": 0.5561207188877586, "grad_norm": 0.10106171667575836, "learning_rate": 0.00013464582564881087, "loss": 0.34388551115989685, "memory(GiB)": 78.33, "step": 2870, "token_acc": 0.8960813087667808, "train_speed(iter/s)": 0.032523 }, { "epoch": 0.5563144891730853, "grad_norm": 0.11406324058771133, "learning_rate": 0.00013455020179470073, "loss": 0.3458818197250366, "memory(GiB)": 78.33, "step": 2871, "token_acc": 0.8983081299595748, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.5565082594584121, "grad_norm": 0.09773491322994232, "learning_rate": 0.00013445458428623788, "loss": 0.33812999725341797, "memory(GiB)": 78.33, "step": 2872, "token_acc": 0.899073884568154, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.5567020297437388, "grad_norm": 0.10276000201702118, "learning_rate": 0.000134358973162695, "loss": 0.3428000807762146, "memory(GiB)": 78.33, "step": 2873, "token_acc": 0.8970639124763755, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.5568958000290656, "grad_norm": 0.10354767739772797, "learning_rate": 0.00013426336846334208, "loss": 0.359587162733078, "memory(GiB)": 78.33, "step": 2874, "token_acc": 0.8922890471140736, "train_speed(iter/s)": 0.032526 }, { "epoch": 0.5570895703143923, "grad_norm": 0.12495241314172745, "learning_rate": 0.0001341677702274466, "loss": 0.3915575444698334, "memory(GiB)": 78.33, "step": 2875, "token_acc": 0.8831632312720966, "train_speed(iter/s)": 0.032527 }, { "epoch": 0.5572833405997191, "grad_norm": 0.09519120305776596, "learning_rate": 0.00013407217849427332, "loss": 0.3396569788455963, "memory(GiB)": 78.33, "step": 2876, "token_acc": 0.8975312241823395, "train_speed(iter/s)": 0.032528 }, { "epoch": 0.5574771108850458, "grad_norm": 0.10415952652692795, "learning_rate": 0.0001339765933030844, "loss": 0.36318159103393555, "memory(GiB)": 78.33, "step": 2877, "token_acc": 0.8913055970578094, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.5576708811703726, "grad_norm": 0.09890494495630264, "learning_rate": 0.00013388101469313907, "loss": 0.34472253918647766, "memory(GiB)": 78.33, "step": 2878, "token_acc": 0.8975010936132983, "train_speed(iter/s)": 0.03253 }, { "epoch": 0.5578646514556993, "grad_norm": 0.10054649412631989, "learning_rate": 0.0001337854427036942, "loss": 0.3560715615749359, "memory(GiB)": 78.33, "step": 2879, "token_acc": 0.8977050131798505, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.558058421741026, "grad_norm": 0.10295595228672028, "learning_rate": 0.00013368987737400368, "loss": 0.3403087556362152, "memory(GiB)": 78.33, "step": 2880, "token_acc": 0.8993411629905471, "train_speed(iter/s)": 0.032532 }, { "epoch": 0.5582521920263528, "grad_norm": 0.11277662217617035, "learning_rate": 0.00013359431874331886, "loss": 0.38854244351387024, "memory(GiB)": 78.33, "step": 2881, "token_acc": 0.8866336122301415, "train_speed(iter/s)": 0.032533 }, { "epoch": 0.5584459623116795, "grad_norm": 0.09544237703084946, "learning_rate": 0.0001334987668508881, "loss": 0.35088953375816345, "memory(GiB)": 78.33, "step": 2882, "token_acc": 0.8954308027790456, "train_speed(iter/s)": 0.032533 }, { "epoch": 0.5586397325970063, "grad_norm": 0.09790968149900436, "learning_rate": 0.0001334032217359572, "loss": 0.3613382875919342, "memory(GiB)": 78.33, "step": 2883, "token_acc": 0.8920747907163267, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.558833502882333, "grad_norm": 0.10290265083312988, "learning_rate": 0.00013330768343776918, "loss": 0.35918739438056946, "memory(GiB)": 78.33, "step": 2884, "token_acc": 0.8945180422371265, "train_speed(iter/s)": 0.032535 }, { "epoch": 0.5590272731676598, "grad_norm": 0.11126357316970825, "learning_rate": 0.00013321215199556404, "loss": 0.34764158725738525, "memory(GiB)": 78.33, "step": 2885, "token_acc": 0.8981644381890688, "train_speed(iter/s)": 0.032536 }, { "epoch": 0.5592210434529865, "grad_norm": 0.10342656821012497, "learning_rate": 0.0001331166274485792, "loss": 0.3714340329170227, "memory(GiB)": 78.33, "step": 2886, "token_acc": 0.8889918774544479, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.5594148137383133, "grad_norm": 0.09674359858036041, "learning_rate": 0.00013302110983604912, "loss": 0.3426961302757263, "memory(GiB)": 78.33, "step": 2887, "token_acc": 0.8992525579932754, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.55960858402364, "grad_norm": 0.1152225211262703, "learning_rate": 0.00013292559919720554, "loss": 0.3851836621761322, "memory(GiB)": 78.33, "step": 2888, "token_acc": 0.8864522728725784, "train_speed(iter/s)": 0.032539 }, { "epoch": 0.5598023543089667, "grad_norm": 0.2307002991437912, "learning_rate": 0.00013283009557127712, "loss": 0.3549976050853729, "memory(GiB)": 78.33, "step": 2889, "token_acc": 0.8931888544891641, "train_speed(iter/s)": 0.03254 }, { "epoch": 0.5599961245942935, "grad_norm": 0.10933719575405121, "learning_rate": 0.0001327345989974898, "loss": 0.40044257044792175, "memory(GiB)": 78.33, "step": 2890, "token_acc": 0.8821779976652399, "train_speed(iter/s)": 0.03254 }, { "epoch": 0.5601898948796202, "grad_norm": 0.08904779702425003, "learning_rate": 0.00013263910951506668, "loss": 0.31354546546936035, "memory(GiB)": 78.33, "step": 2891, "token_acc": 0.9041093058828393, "train_speed(iter/s)": 0.032541 }, { "epoch": 0.560383665164947, "grad_norm": 0.10197526961565018, "learning_rate": 0.00013254362716322776, "loss": 0.3503738045692444, "memory(GiB)": 78.33, "step": 2892, "token_acc": 0.8949555782295996, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.5605774354502737, "grad_norm": 0.11662586033344269, "learning_rate": 0.00013244815198119024, "loss": 0.3930249810218811, "memory(GiB)": 78.33, "step": 2893, "token_acc": 0.8859180035650623, "train_speed(iter/s)": 0.032543 }, { "epoch": 0.5607712057356005, "grad_norm": 0.11080929636955261, "learning_rate": 0.0001323526840081683, "loss": 0.3820610046386719, "memory(GiB)": 78.33, "step": 2894, "token_acc": 0.8872899535216303, "train_speed(iter/s)": 0.032544 }, { "epoch": 0.5609649760209272, "grad_norm": 0.10336665064096451, "learning_rate": 0.00013225722328337323, "loss": 0.34890395402908325, "memory(GiB)": 78.33, "step": 2895, "token_acc": 0.8951938666153719, "train_speed(iter/s)": 0.032545 }, { "epoch": 0.561158746306254, "grad_norm": 0.09736684709787369, "learning_rate": 0.0001321617698460134, "loss": 0.33721378445625305, "memory(GiB)": 78.33, "step": 2896, "token_acc": 0.9000335025642348, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.5613525165915807, "grad_norm": 0.10685203224420547, "learning_rate": 0.00013206632373529396, "loss": 0.35752072930336, "memory(GiB)": 78.33, "step": 2897, "token_acc": 0.8925185941373779, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.5615462868769074, "grad_norm": 0.10666251182556152, "learning_rate": 0.00013197088499041732, "loss": 0.3803097903728485, "memory(GiB)": 78.33, "step": 2898, "token_acc": 0.8869661896345288, "train_speed(iter/s)": 0.032547 }, { "epoch": 0.5617400571622342, "grad_norm": 0.09380005300045013, "learning_rate": 0.00013187545365058261, "loss": 0.3471141755580902, "memory(GiB)": 78.33, "step": 2899, "token_acc": 0.8968915295993044, "train_speed(iter/s)": 0.032548 }, { "epoch": 0.5619338274475609, "grad_norm": 0.10400458425283432, "learning_rate": 0.00013178002975498614, "loss": 0.3575308322906494, "memory(GiB)": 78.33, "step": 2900, "token_acc": 0.8943859748659613, "train_speed(iter/s)": 0.032549 }, { "epoch": 0.5621275977328877, "grad_norm": 0.09898320585489273, "learning_rate": 0.00013168461334282103, "loss": 0.34250903129577637, "memory(GiB)": 78.33, "step": 2901, "token_acc": 0.895111494593793, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.5623213680182144, "grad_norm": 0.09259682148694992, "learning_rate": 0.00013158920445327738, "loss": 0.30667221546173096, "memory(GiB)": 78.33, "step": 2902, "token_acc": 0.9075316927665921, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.5625151383035412, "grad_norm": 0.10204022377729416, "learning_rate": 0.0001314938031255422, "loss": 0.35288378596305847, "memory(GiB)": 78.33, "step": 2903, "token_acc": 0.8955367449018854, "train_speed(iter/s)": 0.032552 }, { "epoch": 0.5627089085888679, "grad_norm": 0.0911250039935112, "learning_rate": 0.00013139840939879933, "loss": 0.3128979206085205, "memory(GiB)": 78.33, "step": 2904, "token_acc": 0.9082222013523666, "train_speed(iter/s)": 0.032553 }, { "epoch": 0.5629026788741947, "grad_norm": 0.10293769836425781, "learning_rate": 0.00013130302331222963, "loss": 0.34590229392051697, "memory(GiB)": 78.33, "step": 2905, "token_acc": 0.8969505783385909, "train_speed(iter/s)": 0.032553 }, { "epoch": 0.5630964491595214, "grad_norm": 0.09894620627164841, "learning_rate": 0.00013120764490501057, "loss": 0.33227023482322693, "memory(GiB)": 78.33, "step": 2906, "token_acc": 0.9007358424899319, "train_speed(iter/s)": 0.032554 }, { "epoch": 0.5632902194448481, "grad_norm": 0.10376887768507004, "learning_rate": 0.00013111227421631674, "loss": 0.36455827951431274, "memory(GiB)": 78.33, "step": 2907, "token_acc": 0.8945593638331069, "train_speed(iter/s)": 0.032555 }, { "epoch": 0.5634839897301749, "grad_norm": 0.10269268602132797, "learning_rate": 0.00013101691128531942, "loss": 0.3566417694091797, "memory(GiB)": 78.33, "step": 2908, "token_acc": 0.8924794993526112, "train_speed(iter/s)": 0.032556 }, { "epoch": 0.5636777600155016, "grad_norm": 0.11318562924861908, "learning_rate": 0.00013092155615118672, "loss": 0.3609810471534729, "memory(GiB)": 78.33, "step": 2909, "token_acc": 0.8935661992803078, "train_speed(iter/s)": 0.032557 }, { "epoch": 0.5638715303008284, "grad_norm": 0.09679584205150604, "learning_rate": 0.00013082620885308363, "loss": 0.33762603998184204, "memory(GiB)": 78.33, "step": 2910, "token_acc": 0.8995445957075723, "train_speed(iter/s)": 0.032558 }, { "epoch": 0.5640653005861551, "grad_norm": 0.11616901308298111, "learning_rate": 0.00013073086943017173, "loss": 0.351492315530777, "memory(GiB)": 78.33, "step": 2911, "token_acc": 0.8948040342142218, "train_speed(iter/s)": 0.032559 }, { "epoch": 0.5642590708714819, "grad_norm": 0.10330884903669357, "learning_rate": 0.00013063553792160958, "loss": 0.3321197032928467, "memory(GiB)": 78.33, "step": 2912, "token_acc": 0.8990494397535977, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.5644528411568086, "grad_norm": 0.10223756730556488, "learning_rate": 0.0001305402143665523, "loss": 0.32255592942237854, "memory(GiB)": 78.33, "step": 2913, "token_acc": 0.9044650149741356, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.5646466114421353, "grad_norm": 0.09594756364822388, "learning_rate": 0.00013044489880415194, "loss": 0.3452211320400238, "memory(GiB)": 78.33, "step": 2914, "token_acc": 0.8991926235148401, "train_speed(iter/s)": 0.032561 }, { "epoch": 0.5648403817274621, "grad_norm": 0.09111212939023972, "learning_rate": 0.00013034959127355703, "loss": 0.3073510229587555, "memory(GiB)": 78.33, "step": 2915, "token_acc": 0.9079986434765758, "train_speed(iter/s)": 0.032562 }, { "epoch": 0.5650341520127888, "grad_norm": 0.10147152841091156, "learning_rate": 0.00013025429181391304, "loss": 0.3540181815624237, "memory(GiB)": 78.33, "step": 2916, "token_acc": 0.8969390771546049, "train_speed(iter/s)": 0.032563 }, { "epoch": 0.5652279222981156, "grad_norm": 0.10001610219478607, "learning_rate": 0.00013015900046436205, "loss": 0.35491663217544556, "memory(GiB)": 78.33, "step": 2917, "token_acc": 0.8945139415897574, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.5654216925834423, "grad_norm": 0.09321217238903046, "learning_rate": 0.00013006371726404265, "loss": 0.3310143053531647, "memory(GiB)": 78.33, "step": 2918, "token_acc": 0.9035609732687047, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.5656154628687691, "grad_norm": 0.10879889130592346, "learning_rate": 0.0001299684422520903, "loss": 0.3868524730205536, "memory(GiB)": 78.33, "step": 2919, "token_acc": 0.8842917053674463, "train_speed(iter/s)": 0.032565 }, { "epoch": 0.5658092331540958, "grad_norm": 0.09358043968677521, "learning_rate": 0.00012987317546763697, "loss": 0.3301950991153717, "memory(GiB)": 78.33, "step": 2920, "token_acc": 0.9018064033885635, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.5660030034394226, "grad_norm": 0.09514044970273972, "learning_rate": 0.00012977791694981136, "loss": 0.34315773844718933, "memory(GiB)": 78.33, "step": 2921, "token_acc": 0.8958858102434929, "train_speed(iter/s)": 0.032567 }, { "epoch": 0.5661967737247493, "grad_norm": 0.10747494548559189, "learning_rate": 0.00012968266673773858, "loss": 0.3809034824371338, "memory(GiB)": 78.33, "step": 2922, "token_acc": 0.8869167528719987, "train_speed(iter/s)": 0.032568 }, { "epoch": 0.566390544010076, "grad_norm": 0.09282265603542328, "learning_rate": 0.00012958742487054054, "loss": 0.3388338088989258, "memory(GiB)": 78.33, "step": 2923, "token_acc": 0.9004572462527813, "train_speed(iter/s)": 0.032569 }, { "epoch": 0.5665843142954028, "grad_norm": 0.11763904243707657, "learning_rate": 0.00012949219138733565, "loss": 0.403276652097702, "memory(GiB)": 78.33, "step": 2924, "token_acc": 0.8811169562025936, "train_speed(iter/s)": 0.03257 }, { "epoch": 0.5667780845807295, "grad_norm": 0.09826403111219406, "learning_rate": 0.00012939696632723876, "loss": 0.34657537937164307, "memory(GiB)": 78.33, "step": 2925, "token_acc": 0.8960861857357153, "train_speed(iter/s)": 0.03257 }, { "epoch": 0.5669718548660563, "grad_norm": 0.09996719658374786, "learning_rate": 0.00012930174972936148, "loss": 0.34664636850357056, "memory(GiB)": 78.33, "step": 2926, "token_acc": 0.8960221082560519, "train_speed(iter/s)": 0.032571 }, { "epoch": 0.567165625151383, "grad_norm": 0.09884995222091675, "learning_rate": 0.00012920654163281172, "loss": 0.3174511790275574, "memory(GiB)": 78.33, "step": 2927, "token_acc": 0.9049272486772487, "train_speed(iter/s)": 0.032572 }, { "epoch": 0.5673593954367098, "grad_norm": 0.10128315538167953, "learning_rate": 0.00012911134207669412, "loss": 0.33141183853149414, "memory(GiB)": 78.33, "step": 2928, "token_acc": 0.9011783988470342, "train_speed(iter/s)": 0.032573 }, { "epoch": 0.5675531657220365, "grad_norm": 0.11448942869901657, "learning_rate": 0.00012901615110010956, "loss": 0.3866661489009857, "memory(GiB)": 78.33, "step": 2929, "token_acc": 0.8871186120469788, "train_speed(iter/s)": 0.032574 }, { "epoch": 0.5677469360073633, "grad_norm": 0.09554385393857956, "learning_rate": 0.00012892096874215562, "loss": 0.31373119354248047, "memory(GiB)": 78.33, "step": 2930, "token_acc": 0.9060102797220859, "train_speed(iter/s)": 0.032575 }, { "epoch": 0.56794070629269, "grad_norm": 0.10202641785144806, "learning_rate": 0.00012882579504192628, "loss": 0.3446533977985382, "memory(GiB)": 78.33, "step": 2931, "token_acc": 0.8982907151647751, "train_speed(iter/s)": 0.032576 }, { "epoch": 0.5681344765780167, "grad_norm": 0.10661419481039047, "learning_rate": 0.00012873063003851184, "loss": 0.35394954681396484, "memory(GiB)": 78.33, "step": 2932, "token_acc": 0.8950952106174265, "train_speed(iter/s)": 0.032577 }, { "epoch": 0.5683282468633435, "grad_norm": 0.10493983328342438, "learning_rate": 0.00012863547377099918, "loss": 0.34970593452453613, "memory(GiB)": 78.33, "step": 2933, "token_acc": 0.8950512907225792, "train_speed(iter/s)": 0.032578 }, { "epoch": 0.5685220171486702, "grad_norm": 0.11756689846515656, "learning_rate": 0.0001285403262784715, "loss": 0.3975431025028229, "memory(GiB)": 78.33, "step": 2934, "token_acc": 0.8840085287846482, "train_speed(iter/s)": 0.032578 }, { "epoch": 0.568715787433997, "grad_norm": 0.10590403527021408, "learning_rate": 0.00012844518760000848, "loss": 0.37834692001342773, "memory(GiB)": 78.33, "step": 2935, "token_acc": 0.8883588043448963, "train_speed(iter/s)": 0.032579 }, { "epoch": 0.5689095577193237, "grad_norm": 0.0980941578745842, "learning_rate": 0.0001283500577746862, "loss": 0.32845622301101685, "memory(GiB)": 78.33, "step": 2936, "token_acc": 0.9010894350139347, "train_speed(iter/s)": 0.03258 }, { "epoch": 0.5691033280046505, "grad_norm": 0.09987200796604156, "learning_rate": 0.00012825493684157682, "loss": 0.334673672914505, "memory(GiB)": 78.33, "step": 2937, "token_acc": 0.8996542412235318, "train_speed(iter/s)": 0.032581 }, { "epoch": 0.5692970982899772, "grad_norm": 0.1141529381275177, "learning_rate": 0.0001281598248397493, "loss": 0.39329400658607483, "memory(GiB)": 78.33, "step": 2938, "token_acc": 0.886091163126108, "train_speed(iter/s)": 0.032582 }, { "epoch": 0.569490868575304, "grad_norm": 0.09881948679685593, "learning_rate": 0.0001280647218082685, "loss": 0.3378741145133972, "memory(GiB)": 78.33, "step": 2939, "token_acc": 0.8991566664118831, "train_speed(iter/s)": 0.032583 }, { "epoch": 0.5696846388606307, "grad_norm": 0.10220920294523239, "learning_rate": 0.00012796962778619593, "loss": 0.35935983061790466, "memory(GiB)": 78.33, "step": 2940, "token_acc": 0.8916394328504156, "train_speed(iter/s)": 0.032583 }, { "epoch": 0.5698784091459574, "grad_norm": 0.10239671915769577, "learning_rate": 0.00012787454281258916, "loss": 0.34669652581214905, "memory(GiB)": 78.33, "step": 2941, "token_acc": 0.8974839774985307, "train_speed(iter/s)": 0.032584 }, { "epoch": 0.5700721794312842, "grad_norm": 0.10328203439712524, "learning_rate": 0.0001277794669265022, "loss": 0.33819228410720825, "memory(GiB)": 78.33, "step": 2942, "token_acc": 0.8988887064104669, "train_speed(iter/s)": 0.032585 }, { "epoch": 0.5702659497166109, "grad_norm": 0.10673517733812332, "learning_rate": 0.00012768440016698533, "loss": 0.36250045895576477, "memory(GiB)": 78.33, "step": 2943, "token_acc": 0.8937628879783616, "train_speed(iter/s)": 0.032586 }, { "epoch": 0.5704597200019377, "grad_norm": 0.10135802626609802, "learning_rate": 0.0001275893425730849, "loss": 0.3382996618747711, "memory(GiB)": 78.33, "step": 2944, "token_acc": 0.8998493193390834, "train_speed(iter/s)": 0.032587 }, { "epoch": 0.5706534902872644, "grad_norm": 0.09103412926197052, "learning_rate": 0.00012749429418384368, "loss": 0.3121355175971985, "memory(GiB)": 78.33, "step": 2945, "token_acc": 0.907722643769968, "train_speed(iter/s)": 0.032588 }, { "epoch": 0.5708472605725912, "grad_norm": 0.10437119007110596, "learning_rate": 0.00012739925503830058, "loss": 0.3552001118659973, "memory(GiB)": 78.33, "step": 2946, "token_acc": 0.8937973264809601, "train_speed(iter/s)": 0.032589 }, { "epoch": 0.5710410308579179, "grad_norm": 0.09624162316322327, "learning_rate": 0.00012730422517549076, "loss": 0.34696635603904724, "memory(GiB)": 78.33, "step": 2947, "token_acc": 0.89641196508939, "train_speed(iter/s)": 0.032589 }, { "epoch": 0.5712348011432447, "grad_norm": 0.0997808501124382, "learning_rate": 0.0001272092046344455, "loss": 0.33023878931999207, "memory(GiB)": 78.33, "step": 2948, "token_acc": 0.9004624871531346, "train_speed(iter/s)": 0.03259 }, { "epoch": 0.5714285714285714, "grad_norm": 0.10305418819189072, "learning_rate": 0.0001271141934541923, "loss": 0.3459644019603729, "memory(GiB)": 78.33, "step": 2949, "token_acc": 0.8978806469604016, "train_speed(iter/s)": 0.032591 }, { "epoch": 0.5716223417138981, "grad_norm": 0.09685331583023071, "learning_rate": 0.00012701919167375488, "loss": 0.34739017486572266, "memory(GiB)": 78.33, "step": 2950, "token_acc": 0.8971735019041944, "train_speed(iter/s)": 0.032592 }, { "epoch": 0.5718161119992249, "grad_norm": 0.09449600428342819, "learning_rate": 0.00012692419933215288, "loss": 0.3246384859085083, "memory(GiB)": 78.33, "step": 2951, "token_acc": 0.9017080904162831, "train_speed(iter/s)": 0.032593 }, { "epoch": 0.5720098822845516, "grad_norm": 0.10447492450475693, "learning_rate": 0.00012682921646840233, "loss": 0.37390848994255066, "memory(GiB)": 78.33, "step": 2952, "token_acc": 0.8906833284795214, "train_speed(iter/s)": 0.032594 }, { "epoch": 0.5722036525698784, "grad_norm": 0.11614307761192322, "learning_rate": 0.00012673424312151517, "loss": 0.40665432810783386, "memory(GiB)": 78.33, "step": 2953, "token_acc": 0.8813434035031336, "train_speed(iter/s)": 0.032594 }, { "epoch": 0.5723974228552051, "grad_norm": 0.09071079641580582, "learning_rate": 0.0001266392793304996, "loss": 0.33611175417900085, "memory(GiB)": 78.33, "step": 2954, "token_acc": 0.8994031782555547, "train_speed(iter/s)": 0.032595 }, { "epoch": 0.5725911931405319, "grad_norm": 0.10135752707719803, "learning_rate": 0.00012654432513435965, "loss": 0.36446985602378845, "memory(GiB)": 78.33, "step": 2955, "token_acc": 0.8912203779100115, "train_speed(iter/s)": 0.032596 }, { "epoch": 0.5727849634258586, "grad_norm": 0.10277879238128662, "learning_rate": 0.00012644938057209567, "loss": 0.3852466940879822, "memory(GiB)": 78.33, "step": 2956, "token_acc": 0.8869683281547821, "train_speed(iter/s)": 0.032597 }, { "epoch": 0.5729787337111854, "grad_norm": 0.10757278650999069, "learning_rate": 0.00012635444568270398, "loss": 0.35662007331848145, "memory(GiB)": 78.33, "step": 2957, "token_acc": 0.893989887382211, "train_speed(iter/s)": 0.032598 }, { "epoch": 0.5731725039965121, "grad_norm": 0.11290912330150604, "learning_rate": 0.00012625952050517673, "loss": 0.365123450756073, "memory(GiB)": 78.33, "step": 2958, "token_acc": 0.890861820997898, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.5733662742818388, "grad_norm": 0.11363532394170761, "learning_rate": 0.00012616460507850242, "loss": 0.3688386082649231, "memory(GiB)": 78.33, "step": 2959, "token_acc": 0.8900687757909216, "train_speed(iter/s)": 0.032599 }, { "epoch": 0.5735600445671656, "grad_norm": 0.10175785422325134, "learning_rate": 0.00012606969944166523, "loss": 0.3480740785598755, "memory(GiB)": 78.33, "step": 2960, "token_acc": 0.8961689531507941, "train_speed(iter/s)": 0.0326 }, { "epoch": 0.5737538148524923, "grad_norm": 0.10226535052061081, "learning_rate": 0.00012597480363364558, "loss": 0.36537787318229675, "memory(GiB)": 78.33, "step": 2961, "token_acc": 0.8930911178973194, "train_speed(iter/s)": 0.032601 }, { "epoch": 0.5739475851378191, "grad_norm": 0.10284057259559631, "learning_rate": 0.0001258799176934196, "loss": 0.3630596399307251, "memory(GiB)": 78.33, "step": 2962, "token_acc": 0.8929182958930716, "train_speed(iter/s)": 0.032602 }, { "epoch": 0.5741413554231458, "grad_norm": 0.10394224524497986, "learning_rate": 0.00012578504165995953, "loss": 0.3422529995441437, "memory(GiB)": 78.33, "step": 2963, "token_acc": 0.8979251265505924, "train_speed(iter/s)": 0.032603 }, { "epoch": 0.5743351257084726, "grad_norm": 0.10181540995836258, "learning_rate": 0.00012569017557223362, "loss": 0.35082393884658813, "memory(GiB)": 78.33, "step": 2964, "token_acc": 0.8960905890389365, "train_speed(iter/s)": 0.032604 }, { "epoch": 0.5745288959937993, "grad_norm": 0.10336862504482269, "learning_rate": 0.00012559531946920578, "loss": 0.36020544171333313, "memory(GiB)": 78.33, "step": 2965, "token_acc": 0.8921231485743684, "train_speed(iter/s)": 0.032604 }, { "epoch": 0.574722666279126, "grad_norm": 0.10156133770942688, "learning_rate": 0.00012550047338983603, "loss": 0.36286041140556335, "memory(GiB)": 78.33, "step": 2966, "token_acc": 0.8922610015174507, "train_speed(iter/s)": 0.032605 }, { "epoch": 0.5749164365644528, "grad_norm": 0.09440826624631882, "learning_rate": 0.00012540563737308016, "loss": 0.3350200057029724, "memory(GiB)": 78.33, "step": 2967, "token_acc": 0.8994114241605558, "train_speed(iter/s)": 0.032606 }, { "epoch": 0.5751102068497795, "grad_norm": 0.117142453789711, "learning_rate": 0.00012531081145788987, "loss": 0.4059712290763855, "memory(GiB)": 78.33, "step": 2968, "token_acc": 0.8823188237682337, "train_speed(iter/s)": 0.032607 }, { "epoch": 0.5753039771351063, "grad_norm": 0.10292242467403412, "learning_rate": 0.00012521599568321283, "loss": 0.3511313199996948, "memory(GiB)": 78.33, "step": 2969, "token_acc": 0.8962899832210296, "train_speed(iter/s)": 0.032608 }, { "epoch": 0.575497747420433, "grad_norm": 0.10700348764657974, "learning_rate": 0.00012512119008799226, "loss": 0.3738468885421753, "memory(GiB)": 78.33, "step": 2970, "token_acc": 0.8902907594626264, "train_speed(iter/s)": 0.032609 }, { "epoch": 0.5756915177057598, "grad_norm": 0.10004039853811264, "learning_rate": 0.0001250263947111675, "loss": 0.3366740942001343, "memory(GiB)": 78.33, "step": 2971, "token_acc": 0.9001073537305422, "train_speed(iter/s)": 0.032609 }, { "epoch": 0.5758852879910865, "grad_norm": 0.10136931389570236, "learning_rate": 0.00012493160959167347, "loss": 0.35105177760124207, "memory(GiB)": 78.33, "step": 2972, "token_acc": 0.8961308889362609, "train_speed(iter/s)": 0.03261 }, { "epoch": 0.5760790582764133, "grad_norm": 0.10934333503246307, "learning_rate": 0.000124836834768441, "loss": 0.37874335050582886, "memory(GiB)": 78.33, "step": 2973, "token_acc": 0.8891520475888849, "train_speed(iter/s)": 0.032611 }, { "epoch": 0.57627282856174, "grad_norm": 0.09537496417760849, "learning_rate": 0.0001247420702803966, "loss": 0.3463536500930786, "memory(GiB)": 78.33, "step": 2974, "token_acc": 0.8957062728096603, "train_speed(iter/s)": 0.032612 }, { "epoch": 0.5764665988470667, "grad_norm": 0.11320365220308304, "learning_rate": 0.00012464731616646267, "loss": 0.3580982983112335, "memory(GiB)": 78.33, "step": 2975, "token_acc": 0.8927235580700927, "train_speed(iter/s)": 0.032613 }, { "epoch": 0.5766603691323936, "grad_norm": 0.1089281439781189, "learning_rate": 0.0001245525724655573, "loss": 0.3534199297428131, "memory(GiB)": 78.33, "step": 2976, "token_acc": 0.8954800657857427, "train_speed(iter/s)": 0.032614 }, { "epoch": 0.5768541394177203, "grad_norm": 0.11043395847082138, "learning_rate": 0.00012445783921659416, "loss": 0.3937772214412689, "memory(GiB)": 78.33, "step": 2977, "token_acc": 0.8828118422898835, "train_speed(iter/s)": 0.032614 }, { "epoch": 0.5770479097030471, "grad_norm": 0.09332658350467682, "learning_rate": 0.00012436311645848286, "loss": 0.3024745285511017, "memory(GiB)": 78.33, "step": 2978, "token_acc": 0.9083586811047573, "train_speed(iter/s)": 0.032615 }, { "epoch": 0.5772416799883738, "grad_norm": 0.10384024679660797, "learning_rate": 0.00012426840423012845, "loss": 0.3482080101966858, "memory(GiB)": 78.33, "step": 2979, "token_acc": 0.8957580870528065, "train_speed(iter/s)": 0.032616 }, { "epoch": 0.5774354502737006, "grad_norm": 0.10794834792613983, "learning_rate": 0.0001241737025704319, "loss": 0.35951748490333557, "memory(GiB)": 78.33, "step": 2980, "token_acc": 0.894366402850384, "train_speed(iter/s)": 0.032617 }, { "epoch": 0.5776292205590273, "grad_norm": 0.13303066790103912, "learning_rate": 0.00012407901151828963, "loss": 0.3524818420410156, "memory(GiB)": 78.33, "step": 2981, "token_acc": 0.8969385499557914, "train_speed(iter/s)": 0.032618 }, { "epoch": 0.5778229908443541, "grad_norm": 0.11768164485692978, "learning_rate": 0.00012398433111259386, "loss": 0.3718627691268921, "memory(GiB)": 78.33, "step": 2982, "token_acc": 0.8906458313955291, "train_speed(iter/s)": 0.032619 }, { "epoch": 0.5780167611296808, "grad_norm": 0.10189583152532578, "learning_rate": 0.00012388966139223245, "loss": 0.3596772253513336, "memory(GiB)": 78.33, "step": 2983, "token_acc": 0.8941813261163735, "train_speed(iter/s)": 0.03262 }, { "epoch": 0.5782105314150076, "grad_norm": 0.10249454528093338, "learning_rate": 0.00012379500239608865, "loss": 0.3092750906944275, "memory(GiB)": 78.33, "step": 2984, "token_acc": 0.9063748245708734, "train_speed(iter/s)": 0.03262 }, { "epoch": 0.5784043017003343, "grad_norm": 0.12180888652801514, "learning_rate": 0.00012370035416304153, "loss": 0.3937700092792511, "memory(GiB)": 78.33, "step": 2985, "token_acc": 0.8859492919528534, "train_speed(iter/s)": 0.032621 }, { "epoch": 0.578598071985661, "grad_norm": 0.0898217260837555, "learning_rate": 0.00012360571673196565, "loss": 0.3232322931289673, "memory(GiB)": 78.33, "step": 2986, "token_acc": 0.9051626763526832, "train_speed(iter/s)": 0.032622 }, { "epoch": 0.5787918422709878, "grad_norm": 0.09598486870527267, "learning_rate": 0.0001235110901417312, "loss": 0.32468363642692566, "memory(GiB)": 78.33, "step": 2987, "token_acc": 0.901153603034134, "train_speed(iter/s)": 0.032623 }, { "epoch": 0.5789856125563145, "grad_norm": 0.10317398607730865, "learning_rate": 0.00012341647443120374, "loss": 0.33831608295440674, "memory(GiB)": 78.33, "step": 2988, "token_acc": 0.900511402902557, "train_speed(iter/s)": 0.032624 }, { "epoch": 0.5791793828416413, "grad_norm": 0.10351257771253586, "learning_rate": 0.0001233218696392446, "loss": 0.3458552956581116, "memory(GiB)": 78.33, "step": 2989, "token_acc": 0.8962962962962963, "train_speed(iter/s)": 0.032624 }, { "epoch": 0.579373153126968, "grad_norm": 0.09751173853874207, "learning_rate": 0.00012322727580471048, "loss": 0.338079035282135, "memory(GiB)": 78.33, "step": 2990, "token_acc": 0.8986410108266975, "train_speed(iter/s)": 0.032625 }, { "epoch": 0.5795669234122948, "grad_norm": 0.10420162230730057, "learning_rate": 0.00012313269296645356, "loss": 0.3703789710998535, "memory(GiB)": 78.33, "step": 2991, "token_acc": 0.8920427978106005, "train_speed(iter/s)": 0.032626 }, { "epoch": 0.5797606936976215, "grad_norm": 0.10459905862808228, "learning_rate": 0.00012303812116332163, "loss": 0.35847145318984985, "memory(GiB)": 78.33, "step": 2992, "token_acc": 0.893757727364214, "train_speed(iter/s)": 0.032627 }, { "epoch": 0.5799544639829483, "grad_norm": 0.09815093129873276, "learning_rate": 0.0001229435604341578, "loss": 0.3507353663444519, "memory(GiB)": 78.33, "step": 2993, "token_acc": 0.8970371099517669, "train_speed(iter/s)": 0.032628 }, { "epoch": 0.580148234268275, "grad_norm": 0.10171345621347427, "learning_rate": 0.00012284901081780077, "loss": 0.34216856956481934, "memory(GiB)": 78.33, "step": 2994, "token_acc": 0.8977114312267658, "train_speed(iter/s)": 0.032629 }, { "epoch": 0.5803420045536017, "grad_norm": 0.09991803765296936, "learning_rate": 0.00012275447235308453, "loss": 0.33179599046707153, "memory(GiB)": 78.33, "step": 2995, "token_acc": 0.9008316831683169, "train_speed(iter/s)": 0.032629 }, { "epoch": 0.5805357748389285, "grad_norm": 0.12053931504487991, "learning_rate": 0.00012265994507883863, "loss": 0.3269277811050415, "memory(GiB)": 78.33, "step": 2996, "token_acc": 0.9013863084178934, "train_speed(iter/s)": 0.03263 }, { "epoch": 0.5807295451242552, "grad_norm": 0.10479523986577988, "learning_rate": 0.00012256542903388797, "loss": 0.3800506591796875, "memory(GiB)": 78.33, "step": 2997, "token_acc": 0.8882348086815394, "train_speed(iter/s)": 0.032631 }, { "epoch": 0.580923315409582, "grad_norm": 0.10834317654371262, "learning_rate": 0.00012247092425705274, "loss": 0.3906557559967041, "memory(GiB)": 78.33, "step": 2998, "token_acc": 0.8865690162121335, "train_speed(iter/s)": 0.032632 }, { "epoch": 0.5811170856949087, "grad_norm": 0.09533150494098663, "learning_rate": 0.0001223764307871487, "loss": 0.32510894536972046, "memory(GiB)": 78.33, "step": 2999, "token_acc": 0.9044620191368873, "train_speed(iter/s)": 0.032633 }, { "epoch": 0.5813108559802355, "grad_norm": 0.11682204157114029, "learning_rate": 0.00012228194866298678, "loss": 0.3705749809741974, "memory(GiB)": 78.33, "step": 3000, "token_acc": 0.8889822334235664, "train_speed(iter/s)": 0.032633 }, { "epoch": 0.5813108559802355, "eval_loss": 0.40707629919052124, "eval_runtime": 1344.136, "eval_samples_per_second": 5.021, "eval_steps_per_second": 5.021, "eval_token_acc": 0.8961095095372348, "step": 3000 }, { "epoch": 0.5815046262655622, "grad_norm": 0.1150721088051796, "learning_rate": 0.00012218747792337335, "loss": 0.3748778998851776, "memory(GiB)": 78.33, "step": 3001, "token_acc": 0.8879601402956698, "train_speed(iter/s)": 0.03216 }, { "epoch": 0.581698396550889, "grad_norm": 0.10332752019166946, "learning_rate": 0.00012209301860711017, "loss": 0.34594491124153137, "memory(GiB)": 78.33, "step": 3002, "token_acc": 0.8974166376049575, "train_speed(iter/s)": 0.032161 }, { "epoch": 0.5818921668362157, "grad_norm": 0.10606271028518677, "learning_rate": 0.00012199857075299403, "loss": 0.33615994453430176, "memory(GiB)": 78.33, "step": 3003, "token_acc": 0.900671290493301, "train_speed(iter/s)": 0.032162 }, { "epoch": 0.5820859371215424, "grad_norm": 0.11355333030223846, "learning_rate": 0.00012190413439981741, "loss": 0.3375508785247803, "memory(GiB)": 78.33, "step": 3004, "token_acc": 0.8986658671863289, "train_speed(iter/s)": 0.032163 }, { "epoch": 0.5822797074068692, "grad_norm": 0.0899207592010498, "learning_rate": 0.00012180970958636769, "loss": 0.32201769948005676, "memory(GiB)": 78.33, "step": 3005, "token_acc": 0.9033656775887123, "train_speed(iter/s)": 0.032164 }, { "epoch": 0.5824734776921959, "grad_norm": 0.10488248616456985, "learning_rate": 0.00012171529635142777, "loss": 0.3354138433933258, "memory(GiB)": 78.33, "step": 3006, "token_acc": 0.8999153020892151, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.5826672479775227, "grad_norm": 0.11078818142414093, "learning_rate": 0.00012162089473377564, "loss": 0.39330416917800903, "memory(GiB)": 78.33, "step": 3007, "token_acc": 0.8855033101341029, "train_speed(iter/s)": 0.032166 }, { "epoch": 0.5828610182628494, "grad_norm": 0.09363757818937302, "learning_rate": 0.00012152650477218462, "loss": 0.34882262349128723, "memory(GiB)": 78.33, "step": 3008, "token_acc": 0.8959220498015157, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.5830547885481762, "grad_norm": 0.11309264600276947, "learning_rate": 0.00012143212650542327, "loss": 0.38344091176986694, "memory(GiB)": 78.33, "step": 3009, "token_acc": 0.8874630723781388, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.5832485588335029, "grad_norm": 0.1011514663696289, "learning_rate": 0.00012133775997225515, "loss": 0.3377688527107239, "memory(GiB)": 78.33, "step": 3010, "token_acc": 0.8985852981969487, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.5834423291188297, "grad_norm": 0.09630289673805237, "learning_rate": 0.00012124340521143926, "loss": 0.3347204327583313, "memory(GiB)": 78.33, "step": 3011, "token_acc": 0.9005485624724494, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.5836360994041564, "grad_norm": 0.09772694110870361, "learning_rate": 0.0001211490622617295, "loss": 0.33570656180381775, "memory(GiB)": 78.33, "step": 3012, "token_acc": 0.9012391298992437, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.5838298696894831, "grad_norm": 0.11302408576011658, "learning_rate": 0.00012105473116187517, "loss": 0.36947059631347656, "memory(GiB)": 78.33, "step": 3013, "token_acc": 0.8906702025072324, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.5840236399748099, "grad_norm": 0.09968356788158417, "learning_rate": 0.0001209604119506205, "loss": 0.3836635947227478, "memory(GiB)": 78.33, "step": 3014, "token_acc": 0.8870716703312993, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.5842174102601366, "grad_norm": 0.1118093952536583, "learning_rate": 0.00012086610466670495, "loss": 0.3891493082046509, "memory(GiB)": 78.33, "step": 3015, "token_acc": 0.8857653605512787, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.5844111805454634, "grad_norm": 0.10725325345993042, "learning_rate": 0.00012077180934886317, "loss": 0.3714507520198822, "memory(GiB)": 78.33, "step": 3016, "token_acc": 0.8898680855780956, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.5846049508307901, "grad_norm": 0.10335146635770798, "learning_rate": 0.00012067752603582458, "loss": 0.32297301292419434, "memory(GiB)": 78.33, "step": 3017, "token_acc": 0.9023583923365265, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.5847987211161169, "grad_norm": 0.09637671709060669, "learning_rate": 0.00012058325476631404, "loss": 0.33302274346351624, "memory(GiB)": 78.33, "step": 3018, "token_acc": 0.9011041882171811, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.5849924914014436, "grad_norm": 0.10183451324701309, "learning_rate": 0.0001204889955790512, "loss": 0.35752299427986145, "memory(GiB)": 78.33, "step": 3019, "token_acc": 0.8940680214357722, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.5851862616867703, "grad_norm": 0.09284891933202744, "learning_rate": 0.00012039474851275087, "loss": 0.31136441230773926, "memory(GiB)": 78.33, "step": 3020, "token_acc": 0.906412598570867, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.5853800319720971, "grad_norm": 0.12801574170589447, "learning_rate": 0.00012030051360612282, "loss": 0.36925309896469116, "memory(GiB)": 78.33, "step": 3021, "token_acc": 0.890402707664066, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.5855738022574238, "grad_norm": 0.10945330560207367, "learning_rate": 0.0001202062908978719, "loss": 0.37747853994369507, "memory(GiB)": 78.33, "step": 3022, "token_acc": 0.8870110767568217, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.5857675725427506, "grad_norm": 0.10129844397306442, "learning_rate": 0.00012011208042669797, "loss": 0.3541242778301239, "memory(GiB)": 78.33, "step": 3023, "token_acc": 0.893968527045489, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.5859613428280773, "grad_norm": 0.10958780348300934, "learning_rate": 0.00012001788223129563, "loss": 0.37422338128089905, "memory(GiB)": 78.33, "step": 3024, "token_acc": 0.8888448160126929, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.5861551131134041, "grad_norm": 0.09564699977636337, "learning_rate": 0.00011992369635035475, "loss": 0.3554079532623291, "memory(GiB)": 78.33, "step": 3025, "token_acc": 0.8962656629191224, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.5863488833987308, "grad_norm": 0.11680517345666885, "learning_rate": 0.00011982952282255994, "loss": 0.37712615728378296, "memory(GiB)": 78.33, "step": 3026, "token_acc": 0.8882045642012648, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.5865426536840576, "grad_norm": 0.1049395278096199, "learning_rate": 0.00011973536168659089, "loss": 0.36447393894195557, "memory(GiB)": 78.33, "step": 3027, "token_acc": 0.8916041342205862, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.5867364239693843, "grad_norm": 0.1006232500076294, "learning_rate": 0.00011964121298112194, "loss": 0.34536200761795044, "memory(GiB)": 78.33, "step": 3028, "token_acc": 0.897648835202761, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.586930194254711, "grad_norm": 0.12608642876148224, "learning_rate": 0.00011954707674482263, "loss": 0.3553128242492676, "memory(GiB)": 78.33, "step": 3029, "token_acc": 0.8939438229119608, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.5871239645400378, "grad_norm": 0.10169053822755814, "learning_rate": 0.00011945295301635724, "loss": 0.3616080582141876, "memory(GiB)": 78.33, "step": 3030, "token_acc": 0.892506928160307, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.5873177348253645, "grad_norm": 0.09629808366298676, "learning_rate": 0.00011935884183438483, "loss": 0.33636754751205444, "memory(GiB)": 78.33, "step": 3031, "token_acc": 0.9002651087436792, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.5875115051106913, "grad_norm": 0.10230912268161774, "learning_rate": 0.00011926474323755947, "loss": 0.352095365524292, "memory(GiB)": 78.33, "step": 3032, "token_acc": 0.8956463675213675, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.587705275396018, "grad_norm": 0.09654974937438965, "learning_rate": 0.00011917065726452991, "loss": 0.33314049243927, "memory(GiB)": 78.33, "step": 3033, "token_acc": 0.9005795098091339, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.5878990456813448, "grad_norm": 0.10360792279243469, "learning_rate": 0.00011907658395393982, "loss": 0.35966095328330994, "memory(GiB)": 78.33, "step": 3034, "token_acc": 0.8929588157145568, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.5880928159666715, "grad_norm": 0.09588982164859772, "learning_rate": 0.00011898252334442771, "loss": 0.3092660903930664, "memory(GiB)": 78.33, "step": 3035, "token_acc": 0.9049737273763518, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.5882865862519983, "grad_norm": 0.11765430122613907, "learning_rate": 0.00011888847547462669, "loss": 0.3882688879966736, "memory(GiB)": 78.33, "step": 3036, "token_acc": 0.8858762254901961, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.588480356537325, "grad_norm": 0.1067943125963211, "learning_rate": 0.00011879444038316485, "loss": 0.34796375036239624, "memory(GiB)": 78.33, "step": 3037, "token_acc": 0.8972947233713282, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.5886741268226517, "grad_norm": 0.12499988824129105, "learning_rate": 0.0001187004181086648, "loss": 0.34971049427986145, "memory(GiB)": 78.33, "step": 3038, "token_acc": 0.8963723439392272, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.5888678971079785, "grad_norm": 0.10188789665699005, "learning_rate": 0.0001186064086897441, "loss": 0.355500727891922, "memory(GiB)": 78.33, "step": 3039, "token_acc": 0.8935444389469024, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.5890616673933052, "grad_norm": 0.10140416026115417, "learning_rate": 0.00011851241216501492, "loss": 0.3354438245296478, "memory(GiB)": 78.33, "step": 3040, "token_acc": 0.8993545502808126, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.589255437678632, "grad_norm": 0.10051760822534561, "learning_rate": 0.00011841842857308416, "loss": 0.35278022289276123, "memory(GiB)": 78.33, "step": 3041, "token_acc": 0.8940331066341589, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.5894492079639587, "grad_norm": 0.11390508711338043, "learning_rate": 0.00011832445795255348, "loss": 0.36132004857063293, "memory(GiB)": 78.33, "step": 3042, "token_acc": 0.891473121085595, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.5896429782492855, "grad_norm": 0.1137322410941124, "learning_rate": 0.00011823050034201902, "loss": 0.38379916548728943, "memory(GiB)": 78.33, "step": 3043, "token_acc": 0.8879169618381741, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.5898367485346122, "grad_norm": 0.09486231952905655, "learning_rate": 0.00011813655578007181, "loss": 0.3457268476486206, "memory(GiB)": 78.33, "step": 3044, "token_acc": 0.8960932211584953, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.590030518819939, "grad_norm": 0.09855726361274719, "learning_rate": 0.00011804262430529727, "loss": 0.33907079696655273, "memory(GiB)": 78.33, "step": 3045, "token_acc": 0.8978774011737636, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.5902242891052657, "grad_norm": 0.10438596457242966, "learning_rate": 0.0001179487059562757, "loss": 0.32949021458625793, "memory(GiB)": 78.33, "step": 3046, "token_acc": 0.9006081617063784, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.5904180593905924, "grad_norm": 0.10170579701662064, "learning_rate": 0.0001178548007715818, "loss": 0.3779696226119995, "memory(GiB)": 78.33, "step": 3047, "token_acc": 0.887333483856401, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.5906118296759192, "grad_norm": 0.10463224351406097, "learning_rate": 0.000117760908789785, "loss": 0.37798070907592773, "memory(GiB)": 78.33, "step": 3048, "token_acc": 0.8907643133516514, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.5908055999612459, "grad_norm": 0.09821716696023941, "learning_rate": 0.00011766703004944934, "loss": 0.3446789085865021, "memory(GiB)": 78.33, "step": 3049, "token_acc": 0.8958164505672609, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.5909993702465727, "grad_norm": 0.12338458746671677, "learning_rate": 0.00011757316458913317, "loss": 0.3765670657157898, "memory(GiB)": 78.33, "step": 3050, "token_acc": 0.8903645271623579, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.5911931405318994, "grad_norm": 0.10490836948156357, "learning_rate": 0.00011747931244738973, "loss": 0.3376193344593048, "memory(GiB)": 78.33, "step": 3051, "token_acc": 0.8987225491603272, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.5913869108172262, "grad_norm": 0.10726680606603622, "learning_rate": 0.00011738547366276645, "loss": 0.36437898874282837, "memory(GiB)": 78.33, "step": 3052, "token_acc": 0.8919469627617164, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.5915806811025529, "grad_norm": 0.10234256833791733, "learning_rate": 0.00011729164827380557, "loss": 0.335938960313797, "memory(GiB)": 78.33, "step": 3053, "token_acc": 0.8999602227525855, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.5917744513878797, "grad_norm": 0.09306668490171432, "learning_rate": 0.00011719783631904362, "loss": 0.3333013653755188, "memory(GiB)": 78.33, "step": 3054, "token_acc": 0.8992164035860263, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.5919682216732064, "grad_norm": 0.10699167847633362, "learning_rate": 0.00011710403783701172, "loss": 0.36375150084495544, "memory(GiB)": 78.33, "step": 3055, "token_acc": 0.8928638271703965, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.5921619919585331, "grad_norm": 0.1067671999335289, "learning_rate": 0.0001170102528662355, "loss": 0.37255626916885376, "memory(GiB)": 78.33, "step": 3056, "token_acc": 0.8900267627942542, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.5923557622438599, "grad_norm": 0.11016660928726196, "learning_rate": 0.00011691648144523482, "loss": 0.3919016420841217, "memory(GiB)": 78.33, "step": 3057, "token_acc": 0.8844034568760513, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.5925495325291866, "grad_norm": 0.10009429603815079, "learning_rate": 0.00011682272361252423, "loss": 0.32291102409362793, "memory(GiB)": 78.33, "step": 3058, "token_acc": 0.9015109432285302, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.5927433028145134, "grad_norm": 0.1145942285656929, "learning_rate": 0.00011672897940661254, "loss": 0.3807049095630646, "memory(GiB)": 78.33, "step": 3059, "token_acc": 0.8866095219750233, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.5929370730998401, "grad_norm": 0.12313009053468704, "learning_rate": 0.00011663524886600309, "loss": 0.40620920062065125, "memory(GiB)": 78.33, "step": 3060, "token_acc": 0.8780752287360217, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.5931308433851669, "grad_norm": 0.09408815950155258, "learning_rate": 0.00011654153202919341, "loss": 0.3425576388835907, "memory(GiB)": 78.33, "step": 3061, "token_acc": 0.8985101178563487, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.5933246136704936, "grad_norm": 0.09209151566028595, "learning_rate": 0.00011644782893467559, "loss": 0.2937813997268677, "memory(GiB)": 78.33, "step": 3062, "token_acc": 0.9103508585965656, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.5935183839558204, "grad_norm": 0.10529588907957077, "learning_rate": 0.00011635413962093607, "loss": 0.33743685483932495, "memory(GiB)": 78.33, "step": 3063, "token_acc": 0.9011610773855131, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.5937121542411471, "grad_norm": 0.10583112388849258, "learning_rate": 0.00011626046412645546, "loss": 0.384732186794281, "memory(GiB)": 78.33, "step": 3064, "token_acc": 0.886413967142195, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.5939059245264738, "grad_norm": 0.10110019147396088, "learning_rate": 0.00011616680248970887, "loss": 0.35833442211151123, "memory(GiB)": 78.33, "step": 3065, "token_acc": 0.8952516405918607, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.5940996948118006, "grad_norm": 0.10245722532272339, "learning_rate": 0.0001160731547491656, "loss": 0.39178815484046936, "memory(GiB)": 78.33, "step": 3066, "token_acc": 0.8862267136788917, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.5942934650971273, "grad_norm": 0.11041680723428726, "learning_rate": 0.00011597952094328933, "loss": 0.36065673828125, "memory(GiB)": 78.33, "step": 3067, "token_acc": 0.8902277926345974, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.5944872353824541, "grad_norm": 0.1108374372124672, "learning_rate": 0.00011588590111053803, "loss": 0.3488714396953583, "memory(GiB)": 78.33, "step": 3068, "token_acc": 0.8970215739365883, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.5946810056677808, "grad_norm": 0.09806838631629944, "learning_rate": 0.00011579229528936375, "loss": 0.3352605104446411, "memory(GiB)": 78.33, "step": 3069, "token_acc": 0.9009188114555908, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.5948747759531076, "grad_norm": 0.0958922728896141, "learning_rate": 0.00011569870351821308, "loss": 0.3334873616695404, "memory(GiB)": 78.33, "step": 3070, "token_acc": 0.8991558523371551, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.5950685462384343, "grad_norm": 0.08912528306245804, "learning_rate": 0.00011560512583552649, "loss": 0.2977232336997986, "memory(GiB)": 78.33, "step": 3071, "token_acc": 0.9110933022487736, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.595262316523761, "grad_norm": 0.11086632311344147, "learning_rate": 0.000115511562279739, "loss": 0.385816752910614, "memory(GiB)": 78.33, "step": 3072, "token_acc": 0.8883094428021641, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.5954560868090878, "grad_norm": 0.09204145520925522, "learning_rate": 0.0001154180128892796, "loss": 0.31663045287132263, "memory(GiB)": 78.33, "step": 3073, "token_acc": 0.9048406026202965, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.5956498570944145, "grad_norm": 0.11149836331605911, "learning_rate": 0.00011532447770257153, "loss": 0.4004219174385071, "memory(GiB)": 78.33, "step": 3074, "token_acc": 0.883167884804189, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.5958436273797413, "grad_norm": 0.10164281725883484, "learning_rate": 0.00011523095675803232, "loss": 0.3479783535003662, "memory(GiB)": 78.33, "step": 3075, "token_acc": 0.8950982001203567, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.596037397665068, "grad_norm": 0.09977372735738754, "learning_rate": 0.00011513745009407339, "loss": 0.34006303548812866, "memory(GiB)": 78.33, "step": 3076, "token_acc": 0.8972231350640815, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.5962311679503948, "grad_norm": 0.09573015570640564, "learning_rate": 0.00011504395774910056, "loss": 0.32083189487457275, "memory(GiB)": 78.33, "step": 3077, "token_acc": 0.9044013992586017, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.5964249382357215, "grad_norm": 0.10882751643657684, "learning_rate": 0.00011495047976151352, "loss": 0.35326480865478516, "memory(GiB)": 78.33, "step": 3078, "token_acc": 0.8945277761309065, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.5966187085210483, "grad_norm": 0.10612433403730392, "learning_rate": 0.00011485701616970628, "loss": 0.37883704900741577, "memory(GiB)": 78.33, "step": 3079, "token_acc": 0.8884327457135958, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.596812478806375, "grad_norm": 0.114121213555336, "learning_rate": 0.00011476356701206683, "loss": 0.3798381984233856, "memory(GiB)": 78.33, "step": 3080, "token_acc": 0.887470047620492, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.5970062490917017, "grad_norm": 0.09286189824342728, "learning_rate": 0.00011467013232697721, "loss": 0.30042147636413574, "memory(GiB)": 78.33, "step": 3081, "token_acc": 0.9096791788565478, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.5972000193770285, "grad_norm": 0.103480763733387, "learning_rate": 0.00011457671215281367, "loss": 0.35093623399734497, "memory(GiB)": 78.33, "step": 3082, "token_acc": 0.8954995525533328, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.5973937896623552, "grad_norm": 0.0931040421128273, "learning_rate": 0.00011448330652794625, "loss": 0.3155452013015747, "memory(GiB)": 78.33, "step": 3083, "token_acc": 0.9038670039754246, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.597587559947682, "grad_norm": 0.10366553068161011, "learning_rate": 0.00011438991549073928, "loss": 0.3622359037399292, "memory(GiB)": 78.33, "step": 3084, "token_acc": 0.8940353881278539, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.5977813302330087, "grad_norm": 0.09329306334257126, "learning_rate": 0.00011429653907955083, "loss": 0.3334829807281494, "memory(GiB)": 78.33, "step": 3085, "token_acc": 0.9010285939268665, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.5979751005183355, "grad_norm": 0.09546185284852982, "learning_rate": 0.00011420317733273319, "loss": 0.32923221588134766, "memory(GiB)": 78.33, "step": 3086, "token_acc": 0.9020269413227124, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.5981688708036622, "grad_norm": 0.09393859654664993, "learning_rate": 0.00011410983028863249, "loss": 0.315865159034729, "memory(GiB)": 78.33, "step": 3087, "token_acc": 0.9042561262423184, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.598362641088989, "grad_norm": 0.10992894321680069, "learning_rate": 0.0001140164979855889, "loss": 0.39187929034233093, "memory(GiB)": 78.33, "step": 3088, "token_acc": 0.8844834915686552, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.5985564113743157, "grad_norm": 0.10544363409280777, "learning_rate": 0.00011392318046193656, "loss": 0.3612055778503418, "memory(GiB)": 78.33, "step": 3089, "token_acc": 0.8903434157031227, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.5987501816596424, "grad_norm": 0.10547579824924469, "learning_rate": 0.00011382987775600336, "loss": 0.37269455194473267, "memory(GiB)": 78.33, "step": 3090, "token_acc": 0.8905209503713244, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.5989439519449692, "grad_norm": 0.10508310049772263, "learning_rate": 0.00011373658990611134, "loss": 0.36013132333755493, "memory(GiB)": 78.33, "step": 3091, "token_acc": 0.8923727863707689, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.5991377222302959, "grad_norm": 0.10769172757863998, "learning_rate": 0.00011364331695057627, "loss": 0.36448052525520325, "memory(GiB)": 78.33, "step": 3092, "token_acc": 0.8934169278996865, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.5993314925156227, "grad_norm": 0.10621456056833267, "learning_rate": 0.00011355005892770788, "loss": 0.35971012711524963, "memory(GiB)": 78.33, "step": 3093, "token_acc": 0.8954783003862758, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.5995252628009494, "grad_norm": 0.10547253489494324, "learning_rate": 0.00011345681587580971, "loss": 0.38230541348457336, "memory(GiB)": 78.33, "step": 3094, "token_acc": 0.8890573803785643, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.5997190330862762, "grad_norm": 0.11314172297716141, "learning_rate": 0.00011336358783317918, "loss": 0.3828127086162567, "memory(GiB)": 78.33, "step": 3095, "token_acc": 0.8876047830374754, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.5999128033716029, "grad_norm": 0.1083948016166687, "learning_rate": 0.00011327037483810767, "loss": 0.34444913268089294, "memory(GiB)": 78.33, "step": 3096, "token_acc": 0.8984524686809138, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.6001065736569298, "grad_norm": 0.1084941178560257, "learning_rate": 0.00011317717692888012, "loss": 0.35603395104408264, "memory(GiB)": 78.33, "step": 3097, "token_acc": 0.8943460416891673, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.6003003439422565, "grad_norm": 0.10329198837280273, "learning_rate": 0.0001130839941437755, "loss": 0.3441343903541565, "memory(GiB)": 78.33, "step": 3098, "token_acc": 0.8977043317759704, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.6004941142275833, "grad_norm": 0.10437928140163422, "learning_rate": 0.00011299082652106642, "loss": 0.32754242420196533, "memory(GiB)": 78.33, "step": 3099, "token_acc": 0.9024354801209021, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.60068788451291, "grad_norm": 0.10441198199987411, "learning_rate": 0.00011289767409901936, "loss": 0.34080564975738525, "memory(GiB)": 78.33, "step": 3100, "token_acc": 0.9004198514371837, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.6008816547982367, "grad_norm": 0.09256397187709808, "learning_rate": 0.00011280453691589461, "loss": 0.34055426716804504, "memory(GiB)": 78.33, "step": 3101, "token_acc": 0.8970888269749687, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.6010754250835635, "grad_norm": 0.10573244839906693, "learning_rate": 0.00011271141500994595, "loss": 0.37066394090652466, "memory(GiB)": 78.33, "step": 3102, "token_acc": 0.890913167778463, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.6012691953688902, "grad_norm": 0.1301407516002655, "learning_rate": 0.0001126183084194212, "loss": 0.34675294160842896, "memory(GiB)": 78.33, "step": 3103, "token_acc": 0.8968794248992484, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.601462965654217, "grad_norm": 0.09512253850698471, "learning_rate": 0.00011252521718256159, "loss": 0.34905996918678284, "memory(GiB)": 78.33, "step": 3104, "token_acc": 0.8967058594888738, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.6016567359395437, "grad_norm": 0.11211273819208145, "learning_rate": 0.0001124321413376023, "loss": 0.3654071092605591, "memory(GiB)": 78.33, "step": 3105, "token_acc": 0.8934378629500581, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.6018505062248705, "grad_norm": 0.10533998906612396, "learning_rate": 0.00011233908092277203, "loss": 0.3517376780509949, "memory(GiB)": 78.33, "step": 3106, "token_acc": 0.8942503737936659, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.6020442765101972, "grad_norm": 0.11167991906404495, "learning_rate": 0.00011224603597629322, "loss": 0.3245575428009033, "memory(GiB)": 78.33, "step": 3107, "token_acc": 0.9026755252289459, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.602238046795524, "grad_norm": 0.11581658571958542, "learning_rate": 0.00011215300653638199, "loss": 0.36745405197143555, "memory(GiB)": 78.33, "step": 3108, "token_acc": 0.892201612120035, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.6024318170808507, "grad_norm": 0.10512839257717133, "learning_rate": 0.00011205999264124786, "loss": 0.3423347771167755, "memory(GiB)": 78.33, "step": 3109, "token_acc": 0.8962487920583326, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.6026255873661774, "grad_norm": 0.10966593772172928, "learning_rate": 0.00011196699432909435, "loss": 0.3704321086406708, "memory(GiB)": 78.33, "step": 3110, "token_acc": 0.8911972524843059, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.6028193576515042, "grad_norm": 0.09262005239725113, "learning_rate": 0.00011187401163811816, "loss": 0.31054481863975525, "memory(GiB)": 78.33, "step": 3111, "token_acc": 0.9069608679751311, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.6030131279368309, "grad_norm": 0.11650776118040085, "learning_rate": 0.00011178104460650993, "loss": 0.40273237228393555, "memory(GiB)": 78.33, "step": 3112, "token_acc": 0.8809349247778527, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.6032068982221577, "grad_norm": 0.10926227271556854, "learning_rate": 0.00011168809327245361, "loss": 0.3614426851272583, "memory(GiB)": 78.33, "step": 3113, "token_acc": 0.8933805533961015, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.6034006685074844, "grad_norm": 0.09592621773481369, "learning_rate": 0.00011159515767412688, "loss": 0.3322632312774658, "memory(GiB)": 78.33, "step": 3114, "token_acc": 0.9001508923461158, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.6035944387928112, "grad_norm": 0.09553615748882294, "learning_rate": 0.00011150223784970092, "loss": 0.31129273772239685, "memory(GiB)": 78.33, "step": 3115, "token_acc": 0.9059551739629611, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.6037882090781379, "grad_norm": 0.113133504986763, "learning_rate": 0.0001114093338373403, "loss": 0.38860899209976196, "memory(GiB)": 78.33, "step": 3116, "token_acc": 0.885463585843814, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.6039819793634646, "grad_norm": 0.09849842637777328, "learning_rate": 0.0001113164456752033, "loss": 0.3431742489337921, "memory(GiB)": 78.33, "step": 3117, "token_acc": 0.8975163928200713, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.6041757496487914, "grad_norm": 0.10167428106069565, "learning_rate": 0.00011122357340144148, "loss": 0.37376028299331665, "memory(GiB)": 78.33, "step": 3118, "token_acc": 0.8888616757830571, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.6043695199341181, "grad_norm": 0.11035677045583725, "learning_rate": 0.00011113071705420004, "loss": 0.36835113167762756, "memory(GiB)": 78.33, "step": 3119, "token_acc": 0.8914904111178809, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.6045632902194449, "grad_norm": 0.10013226419687271, "learning_rate": 0.00011103787667161753, "loss": 0.3682772219181061, "memory(GiB)": 78.33, "step": 3120, "token_acc": 0.8906798812862472, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.6047570605047716, "grad_norm": 0.10240922123193741, "learning_rate": 0.00011094505229182605, "loss": 0.33320745825767517, "memory(GiB)": 78.33, "step": 3121, "token_acc": 0.8995880980499467, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.6049508307900984, "grad_norm": 0.10636462271213531, "learning_rate": 0.00011085224395295109, "loss": 0.3461098074913025, "memory(GiB)": 78.33, "step": 3122, "token_acc": 0.8970420099088271, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.6051446010754251, "grad_norm": 0.15055952966213226, "learning_rate": 0.00011075945169311141, "loss": 0.4095023572444916, "memory(GiB)": 78.33, "step": 3123, "token_acc": 0.8795589223532286, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.6053383713607519, "grad_norm": 0.10108061134815216, "learning_rate": 0.00011066667555041942, "loss": 0.33705589175224304, "memory(GiB)": 78.33, "step": 3124, "token_acc": 0.8990451697627971, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.6055321416460786, "grad_norm": 0.10497518628835678, "learning_rate": 0.00011057391556298065, "loss": 0.3509282171726227, "memory(GiB)": 78.33, "step": 3125, "token_acc": 0.8944882780464354, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.6057259119314053, "grad_norm": 0.0899055078625679, "learning_rate": 0.0001104811717688942, "loss": 0.3112187385559082, "memory(GiB)": 78.33, "step": 3126, "token_acc": 0.9061983165066279, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.6059196822167321, "grad_norm": 0.10332870483398438, "learning_rate": 0.00011038844420625239, "loss": 0.40198662877082825, "memory(GiB)": 78.33, "step": 3127, "token_acc": 0.8823683727275112, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.6061134525020588, "grad_norm": 0.1093793734908104, "learning_rate": 0.00011029573291314094, "loss": 0.36452385783195496, "memory(GiB)": 78.33, "step": 3128, "token_acc": 0.8916964258920266, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.6063072227873856, "grad_norm": 0.10296124219894409, "learning_rate": 0.00011020303792763896, "loss": 0.31651031970977783, "memory(GiB)": 78.33, "step": 3129, "token_acc": 0.9070627097315436, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.6065009930727123, "grad_norm": 0.10293302685022354, "learning_rate": 0.00011011035928781861, "loss": 0.34327608346939087, "memory(GiB)": 78.33, "step": 3130, "token_acc": 0.896756412974348, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.6066947633580391, "grad_norm": 0.1089029610157013, "learning_rate": 0.00011001769703174564, "loss": 0.3363596796989441, "memory(GiB)": 78.33, "step": 3131, "token_acc": 0.9004472523691865, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.6068885336433658, "grad_norm": 0.10041207820177078, "learning_rate": 0.0001099250511974788, "loss": 0.3575053811073303, "memory(GiB)": 78.33, "step": 3132, "token_acc": 0.8955265421790395, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.6070823039286926, "grad_norm": 0.11202222853899002, "learning_rate": 0.00010983242182307032, "loss": 0.363272488117218, "memory(GiB)": 78.33, "step": 3133, "token_acc": 0.8899773926149209, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.6072760742140193, "grad_norm": 0.10849933326244354, "learning_rate": 0.00010973980894656555, "loss": 0.3696746826171875, "memory(GiB)": 78.33, "step": 3134, "token_acc": 0.8896282494898412, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.607469844499346, "grad_norm": 0.10315241664648056, "learning_rate": 0.00010964721260600305, "loss": 0.34507814049720764, "memory(GiB)": 78.33, "step": 3135, "token_acc": 0.897193022987178, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.6076636147846728, "grad_norm": 0.11026407033205032, "learning_rate": 0.00010955463283941472, "loss": 0.3476986289024353, "memory(GiB)": 78.33, "step": 3136, "token_acc": 0.8966703526231724, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.6078573850699995, "grad_norm": 0.09877464920282364, "learning_rate": 0.00010946206968482542, "loss": 0.33894383907318115, "memory(GiB)": 78.33, "step": 3137, "token_acc": 0.8988343465809174, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.6080511553553263, "grad_norm": 0.09642918407917023, "learning_rate": 0.00010936952318025344, "loss": 0.3351293206214905, "memory(GiB)": 78.33, "step": 3138, "token_acc": 0.8997663427327279, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.608244925640653, "grad_norm": 0.10105688869953156, "learning_rate": 0.00010927699336371003, "loss": 0.32609879970550537, "memory(GiB)": 78.33, "step": 3139, "token_acc": 0.9029436501261564, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.6084386959259798, "grad_norm": 0.10417565703392029, "learning_rate": 0.00010918448027319971, "loss": 0.35680675506591797, "memory(GiB)": 78.33, "step": 3140, "token_acc": 0.8950784207679827, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.6086324662113065, "grad_norm": 0.10269538313150406, "learning_rate": 0.00010909198394672018, "loss": 0.36641865968704224, "memory(GiB)": 78.33, "step": 3141, "token_acc": 0.8919290565190549, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.6088262364966333, "grad_norm": 0.10089493542909622, "learning_rate": 0.000108999504422262, "loss": 0.37600627541542053, "memory(GiB)": 78.33, "step": 3142, "token_acc": 0.8877864583333334, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.60902000678196, "grad_norm": 0.10720504820346832, "learning_rate": 0.00010890704173780916, "loss": 0.35292261838912964, "memory(GiB)": 78.33, "step": 3143, "token_acc": 0.8938151494093121, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.6092137770672867, "grad_norm": 0.09673202037811279, "learning_rate": 0.00010881459593133842, "loss": 0.32286348938941956, "memory(GiB)": 78.33, "step": 3144, "token_acc": 0.9035548686244204, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.6094075473526135, "grad_norm": 0.10169202834367752, "learning_rate": 0.00010872216704081986, "loss": 0.345810204744339, "memory(GiB)": 78.33, "step": 3145, "token_acc": 0.8962894744659993, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.6096013176379402, "grad_norm": 0.09147030860185623, "learning_rate": 0.00010862975510421642, "loss": 0.30978208780288696, "memory(GiB)": 78.33, "step": 3146, "token_acc": 0.9076807434741014, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.609795087923267, "grad_norm": 0.09909648448228836, "learning_rate": 0.00010853736015948425, "loss": 0.339855819940567, "memory(GiB)": 78.33, "step": 3147, "token_acc": 0.8987607001405391, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.6099888582085937, "grad_norm": 0.21334044635295868, "learning_rate": 0.00010844498224457246, "loss": 0.34740781784057617, "memory(GiB)": 78.33, "step": 3148, "token_acc": 0.8966986427186447, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.6101826284939205, "grad_norm": 0.11788555234670639, "learning_rate": 0.00010835262139742303, "loss": 0.36575233936309814, "memory(GiB)": 78.33, "step": 3149, "token_acc": 0.8924587929111414, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.6103763987792472, "grad_norm": 0.09829849749803543, "learning_rate": 0.00010826027765597116, "loss": 0.3394019901752472, "memory(GiB)": 78.33, "step": 3150, "token_acc": 0.8985713198162762, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.610570169064574, "grad_norm": 0.1058797761797905, "learning_rate": 0.00010816795105814479, "loss": 0.3209468424320221, "memory(GiB)": 78.33, "step": 3151, "token_acc": 0.9035917319430982, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.6107639393499007, "grad_norm": 0.09663695096969604, "learning_rate": 0.000108075641641865, "loss": 0.3185003399848938, "memory(GiB)": 78.33, "step": 3152, "token_acc": 0.904547132985177, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.6109577096352274, "grad_norm": 0.09900177270174026, "learning_rate": 0.00010798334944504572, "loss": 0.3523489832878113, "memory(GiB)": 78.33, "step": 3153, "token_acc": 0.8945492208011177, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.6111514799205542, "grad_norm": 0.11104562878608704, "learning_rate": 0.00010789107450559386, "loss": 0.37791165709495544, "memory(GiB)": 78.33, "step": 3154, "token_acc": 0.8904507301368287, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.6113452502058809, "grad_norm": 0.11416266113519669, "learning_rate": 0.00010779881686140927, "loss": 0.34979140758514404, "memory(GiB)": 78.33, "step": 3155, "token_acc": 0.895737220889463, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.6115390204912077, "grad_norm": 0.10298382490873337, "learning_rate": 0.00010770657655038453, "loss": 0.339542031288147, "memory(GiB)": 78.33, "step": 3156, "token_acc": 0.8982814065053508, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.6117327907765344, "grad_norm": 0.09547599405050278, "learning_rate": 0.00010761435361040531, "loss": 0.36547231674194336, "memory(GiB)": 78.33, "step": 3157, "token_acc": 0.8895893839578759, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.6119265610618612, "grad_norm": 0.10469914972782135, "learning_rate": 0.00010752214807934996, "loss": 0.31865012645721436, "memory(GiB)": 78.33, "step": 3158, "token_acc": 0.9054883979435876, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.6121203313471879, "grad_norm": 0.10441134870052338, "learning_rate": 0.00010742995999508987, "loss": 0.36503875255584717, "memory(GiB)": 78.33, "step": 3159, "token_acc": 0.8934720034758051, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.6123141016325147, "grad_norm": 0.10514453798532486, "learning_rate": 0.00010733778939548905, "loss": 0.3756176829338074, "memory(GiB)": 78.33, "step": 3160, "token_acc": 0.8889959795120339, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.6125078719178414, "grad_norm": 0.10831741243600845, "learning_rate": 0.00010724563631840451, "loss": 0.39807701110839844, "memory(GiB)": 78.33, "step": 3161, "token_acc": 0.8838846894490271, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.6127016422031681, "grad_norm": 0.10723893344402313, "learning_rate": 0.00010715350080168606, "loss": 0.38960257172584534, "memory(GiB)": 78.33, "step": 3162, "token_acc": 0.8852941176470588, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.6128954124884949, "grad_norm": 0.10730850696563721, "learning_rate": 0.00010706138288317609, "loss": 0.3630613386631012, "memory(GiB)": 78.33, "step": 3163, "token_acc": 0.8937855052504734, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.6130891827738216, "grad_norm": 0.09366155415773392, "learning_rate": 0.00010696928260070999, "loss": 0.3480615019798279, "memory(GiB)": 78.33, "step": 3164, "token_acc": 0.8973462201951934, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.6132829530591484, "grad_norm": 0.10169912874698639, "learning_rate": 0.00010687719999211583, "loss": 0.36037591099739075, "memory(GiB)": 78.33, "step": 3165, "token_acc": 0.892155500597438, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.6134767233444751, "grad_norm": 0.10275442153215408, "learning_rate": 0.00010678513509521435, "loss": 0.3725361227989197, "memory(GiB)": 78.33, "step": 3166, "token_acc": 0.8887706641060908, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.6136704936298019, "grad_norm": 0.10723954439163208, "learning_rate": 0.00010669308794781914, "loss": 0.3526693880558014, "memory(GiB)": 78.33, "step": 3167, "token_acc": 0.8959228785459904, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.6138642639151286, "grad_norm": 0.1202649399638176, "learning_rate": 0.0001066010585877364, "loss": 0.4182667136192322, "memory(GiB)": 78.33, "step": 3168, "token_acc": 0.8773098963363596, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.6140580342004553, "grad_norm": 0.1005726233124733, "learning_rate": 0.00010650904705276513, "loss": 0.33930516242980957, "memory(GiB)": 78.33, "step": 3169, "token_acc": 0.8973083132625784, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.6142518044857821, "grad_norm": 0.10162294656038284, "learning_rate": 0.0001064170533806968, "loss": 0.3408072292804718, "memory(GiB)": 78.33, "step": 3170, "token_acc": 0.8978023358030893, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.6144455747711088, "grad_norm": 0.0949673056602478, "learning_rate": 0.00010632507760931581, "loss": 0.3130902349948883, "memory(GiB)": 78.33, "step": 3171, "token_acc": 0.9057638586329737, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.6146393450564356, "grad_norm": 0.10987738519906998, "learning_rate": 0.000106233119776399, "loss": 0.35283100605010986, "memory(GiB)": 78.33, "step": 3172, "token_acc": 0.8905293376688052, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.6148331153417623, "grad_norm": 0.10509679466485977, "learning_rate": 0.00010614117991971598, "loss": 0.36292973160743713, "memory(GiB)": 78.33, "step": 3173, "token_acc": 0.8947354138398914, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.6150268856270891, "grad_norm": 0.1026267558336258, "learning_rate": 0.00010604925807702895, "loss": 0.3576149046421051, "memory(GiB)": 78.33, "step": 3174, "token_acc": 0.8960543506663182, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.6152206559124158, "grad_norm": 0.1011863574385643, "learning_rate": 0.00010595735428609256, "loss": 0.3429381251335144, "memory(GiB)": 78.33, "step": 3175, "token_acc": 0.8972618182729163, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.6154144261977426, "grad_norm": 0.09697016328573227, "learning_rate": 0.0001058654685846543, "loss": 0.3507676422595978, "memory(GiB)": 78.33, "step": 3176, "token_acc": 0.8970009395870896, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.6156081964830693, "grad_norm": 0.0992831438779831, "learning_rate": 0.00010577360101045396, "loss": 0.3351018726825714, "memory(GiB)": 78.33, "step": 3177, "token_acc": 0.8992732671719934, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.615801966768396, "grad_norm": 0.1062702089548111, "learning_rate": 0.00010568175160122414, "loss": 0.38858115673065186, "memory(GiB)": 78.33, "step": 3178, "token_acc": 0.8845210155749946, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.6159957370537228, "grad_norm": 0.09542369842529297, "learning_rate": 0.00010558992039468979, "loss": 0.32729557156562805, "memory(GiB)": 78.33, "step": 3179, "token_acc": 0.9012937964120754, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.6161895073390495, "grad_norm": 0.09666220843791962, "learning_rate": 0.00010549810742856847, "loss": 0.3226274847984314, "memory(GiB)": 78.33, "step": 3180, "token_acc": 0.9038066656136471, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.6163832776243763, "grad_norm": 0.12249480187892914, "learning_rate": 0.0001054063127405703, "loss": 0.4036695063114166, "memory(GiB)": 78.33, "step": 3181, "token_acc": 0.8821568293927539, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.616577047909703, "grad_norm": 0.10205438733100891, "learning_rate": 0.00010531453636839771, "loss": 0.3525814116001129, "memory(GiB)": 78.33, "step": 3182, "token_acc": 0.8946187098204389, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.6167708181950298, "grad_norm": 0.10525421798229218, "learning_rate": 0.00010522277834974585, "loss": 0.3661832809448242, "memory(GiB)": 78.33, "step": 3183, "token_acc": 0.8905501755879536, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.6169645884803565, "grad_norm": 0.09824883937835693, "learning_rate": 0.00010513103872230206, "loss": 0.3395775854587555, "memory(GiB)": 78.33, "step": 3184, "token_acc": 0.8979004582756048, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.6171583587656833, "grad_norm": 0.10357562452554703, "learning_rate": 0.00010503931752374637, "loss": 0.330016553401947, "memory(GiB)": 78.33, "step": 3185, "token_acc": 0.903437815975733, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.61735212905101, "grad_norm": 0.09775417298078537, "learning_rate": 0.00010494761479175107, "loss": 0.3158339262008667, "memory(GiB)": 78.33, "step": 3186, "token_acc": 0.9043734015345268, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.6175458993363367, "grad_norm": 0.09946906566619873, "learning_rate": 0.000104855930563981, "loss": 0.3278699517250061, "memory(GiB)": 78.33, "step": 3187, "token_acc": 0.9031324520176881, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.6177396696216635, "grad_norm": 0.09458330273628235, "learning_rate": 0.00010476426487809338, "loss": 0.3177909255027771, "memory(GiB)": 78.33, "step": 3188, "token_acc": 0.9047277202338222, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.6179334399069902, "grad_norm": 0.09574276953935623, "learning_rate": 0.00010467261777173763, "loss": 0.3308386206626892, "memory(GiB)": 78.33, "step": 3189, "token_acc": 0.9015635322616163, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.618127210192317, "grad_norm": 0.12292765825986862, "learning_rate": 0.00010458098928255584, "loss": 0.3934580385684967, "memory(GiB)": 78.33, "step": 3190, "token_acc": 0.8843327802796871, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.6183209804776437, "grad_norm": 0.10484007745981216, "learning_rate": 0.00010448937944818211, "loss": 0.33495286107063293, "memory(GiB)": 78.33, "step": 3191, "token_acc": 0.9003335885376791, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.6185147507629705, "grad_norm": 0.09170569479465485, "learning_rate": 0.00010439778830624321, "loss": 0.32464128732681274, "memory(GiB)": 78.33, "step": 3192, "token_acc": 0.9023982419262373, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.6187085210482972, "grad_norm": 0.10959605127573013, "learning_rate": 0.00010430621589435801, "loss": 0.36965322494506836, "memory(GiB)": 78.33, "step": 3193, "token_acc": 0.8880881729546418, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.618902291333624, "grad_norm": 0.10908178985118866, "learning_rate": 0.00010421466225013776, "loss": 0.39024633169174194, "memory(GiB)": 78.33, "step": 3194, "token_acc": 0.8856543263021954, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.6190960616189507, "grad_norm": 0.10703379660844803, "learning_rate": 0.0001041231274111861, "loss": 0.3596927523612976, "memory(GiB)": 78.33, "step": 3195, "token_acc": 0.8922619047619048, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.6192898319042774, "grad_norm": 0.10117658227682114, "learning_rate": 0.00010403161141509872, "loss": 0.34861043095588684, "memory(GiB)": 78.33, "step": 3196, "token_acc": 0.89493396959272, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.6194836021896042, "grad_norm": 0.09983016550540924, "learning_rate": 0.0001039401142994638, "loss": 0.34936952590942383, "memory(GiB)": 78.33, "step": 3197, "token_acc": 0.8953481619141646, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.6196773724749309, "grad_norm": 0.08768882602453232, "learning_rate": 0.00010384863610186155, "loss": 0.3130777180194855, "memory(GiB)": 78.33, "step": 3198, "token_acc": 0.9038072289156627, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.6198711427602577, "grad_norm": 0.09570645540952682, "learning_rate": 0.00010375717685986459, "loss": 0.31712067127227783, "memory(GiB)": 78.33, "step": 3199, "token_acc": 0.9068876881917851, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.6200649130455844, "grad_norm": 0.09237085282802582, "learning_rate": 0.0001036657366110377, "loss": 0.33153122663497925, "memory(GiB)": 78.33, "step": 3200, "token_acc": 0.9015142538922724, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.6202586833309112, "grad_norm": 0.11354105174541473, "learning_rate": 0.00010357431539293784, "loss": 0.3915308713912964, "memory(GiB)": 78.33, "step": 3201, "token_acc": 0.8843476653936183, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.6204524536162379, "grad_norm": 0.09536038339138031, "learning_rate": 0.00010348291324311418, "loss": 0.3190258741378784, "memory(GiB)": 78.33, "step": 3202, "token_acc": 0.9054259501965924, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.6206462239015647, "grad_norm": 0.10331059992313385, "learning_rate": 0.00010339153019910797, "loss": 0.34562522172927856, "memory(GiB)": 78.33, "step": 3203, "token_acc": 0.8966327964544374, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.6208399941868914, "grad_norm": 0.1046903133392334, "learning_rate": 0.00010330016629845274, "loss": 0.3687852919101715, "memory(GiB)": 78.33, "step": 3204, "token_acc": 0.8896271888020478, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.6210337644722181, "grad_norm": 0.12125846743583679, "learning_rate": 0.00010320882157867408, "loss": 0.32853639125823975, "memory(GiB)": 78.33, "step": 3205, "token_acc": 0.9024209282514908, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.6212275347575449, "grad_norm": 0.09862146526575089, "learning_rate": 0.00010311749607728976, "loss": 0.3248175084590912, "memory(GiB)": 78.33, "step": 3206, "token_acc": 0.9051060112761058, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.6214213050428716, "grad_norm": 0.10406461358070374, "learning_rate": 0.00010302618983180955, "loss": 0.3480234742164612, "memory(GiB)": 78.33, "step": 3207, "token_acc": 0.8964719652212685, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.6216150753281984, "grad_norm": 0.0927383154630661, "learning_rate": 0.00010293490287973539, "loss": 0.31758391857147217, "memory(GiB)": 78.33, "step": 3208, "token_acc": 0.9062631046646439, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.6218088456135251, "grad_norm": 0.10273189842700958, "learning_rate": 0.00010284363525856138, "loss": 0.34117552638053894, "memory(GiB)": 78.33, "step": 3209, "token_acc": 0.897176918619091, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.6220026158988519, "grad_norm": 0.10961279273033142, "learning_rate": 0.00010275238700577344, "loss": 0.3611040711402893, "memory(GiB)": 78.33, "step": 3210, "token_acc": 0.8908705991531813, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.6221963861841786, "grad_norm": 0.11966651678085327, "learning_rate": 0.00010266115815884978, "loss": 0.3134154975414276, "memory(GiB)": 78.33, "step": 3211, "token_acc": 0.9054380748603662, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.6223901564695054, "grad_norm": 0.10242405533790588, "learning_rate": 0.00010256994875526048, "loss": 0.3505587577819824, "memory(GiB)": 78.33, "step": 3212, "token_acc": 0.8964926844453668, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.6225839267548321, "grad_norm": 0.09929542243480682, "learning_rate": 0.0001024787588324677, "loss": 0.3694930672645569, "memory(GiB)": 78.33, "step": 3213, "token_acc": 0.8909321660752674, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.6227776970401588, "grad_norm": 0.11470238864421844, "learning_rate": 0.00010238758842792571, "loss": 0.34870725870132446, "memory(GiB)": 78.33, "step": 3214, "token_acc": 0.8959076482318906, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.6229714673254856, "grad_norm": 0.10671926289796829, "learning_rate": 0.00010229643757908047, "loss": 0.34584519267082214, "memory(GiB)": 78.33, "step": 3215, "token_acc": 0.8956783512316554, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.6231652376108123, "grad_norm": 0.10204023122787476, "learning_rate": 0.00010220530632337022, "loss": 0.33685633540153503, "memory(GiB)": 78.33, "step": 3216, "token_acc": 0.8971209005303604, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.6233590078961391, "grad_norm": 0.09859669208526611, "learning_rate": 0.0001021141946982249, "loss": 0.33895328640937805, "memory(GiB)": 78.33, "step": 3217, "token_acc": 0.8996550051210177, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.6235527781814659, "grad_norm": 0.09662448614835739, "learning_rate": 0.00010202310274106659, "loss": 0.3390568494796753, "memory(GiB)": 78.33, "step": 3218, "token_acc": 0.8996795034847637, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.6237465484667927, "grad_norm": 0.12932774424552917, "learning_rate": 0.00010193203048930914, "loss": 0.3782866597175598, "memory(GiB)": 78.33, "step": 3219, "token_acc": 0.8888921265808031, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.6239403187521194, "grad_norm": 0.09401915222406387, "learning_rate": 0.0001018409779803584, "loss": 0.29952648282051086, "memory(GiB)": 78.33, "step": 3220, "token_acc": 0.9101730793176441, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.6241340890374462, "grad_norm": 0.09950324892997742, "learning_rate": 0.00010174994525161215, "loss": 0.32064443826675415, "memory(GiB)": 78.33, "step": 3221, "token_acc": 0.9026459405633293, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.6243278593227729, "grad_norm": 0.11111953109502792, "learning_rate": 0.00010165893234045988, "loss": 0.3619105815887451, "memory(GiB)": 78.33, "step": 3222, "token_acc": 0.8926043878273178, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.6245216296080996, "grad_norm": 0.10427284240722656, "learning_rate": 0.0001015679392842831, "loss": 0.3563496768474579, "memory(GiB)": 78.33, "step": 3223, "token_acc": 0.8932768896563249, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.6247153998934264, "grad_norm": 0.10940185189247131, "learning_rate": 0.00010147696612045502, "loss": 0.35510045289993286, "memory(GiB)": 78.33, "step": 3224, "token_acc": 0.892183207930252, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.6249091701787531, "grad_norm": 0.10383673757314682, "learning_rate": 0.00010138601288634085, "loss": 0.34664323925971985, "memory(GiB)": 78.33, "step": 3225, "token_acc": 0.897200460505995, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.6251029404640799, "grad_norm": 0.1071397215127945, "learning_rate": 0.00010129507961929748, "loss": 0.3815678358078003, "memory(GiB)": 78.33, "step": 3226, "token_acc": 0.8863624317591562, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.6252967107494066, "grad_norm": 0.09263280034065247, "learning_rate": 0.00010120416635667364, "loss": 0.30259522795677185, "memory(GiB)": 78.33, "step": 3227, "token_acc": 0.9077235976497765, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.6254904810347334, "grad_norm": 0.12750935554504395, "learning_rate": 0.00010111327313580994, "loss": 0.3879462480545044, "memory(GiB)": 78.33, "step": 3228, "token_acc": 0.8852969484232577, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.6256842513200601, "grad_norm": 0.09431038796901703, "learning_rate": 0.00010102239999403857, "loss": 0.3204158842563629, "memory(GiB)": 78.33, "step": 3229, "token_acc": 0.9041399892865342, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.6258780216053869, "grad_norm": 0.1006726399064064, "learning_rate": 0.00010093154696868362, "loss": 0.34341204166412354, "memory(GiB)": 78.33, "step": 3230, "token_acc": 0.8983400146801422, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.6260717918907136, "grad_norm": 0.09969964623451233, "learning_rate": 0.0001008407140970608, "loss": 0.318872332572937, "memory(GiB)": 78.33, "step": 3231, "token_acc": 0.9055216821471309, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.6262655621760403, "grad_norm": 0.1142285168170929, "learning_rate": 0.00010074990141647767, "loss": 0.33352869749069214, "memory(GiB)": 78.33, "step": 3232, "token_acc": 0.9006948304613674, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.6264593324613671, "grad_norm": 0.10268845409154892, "learning_rate": 0.00010065910896423346, "loss": 0.33974435925483704, "memory(GiB)": 78.33, "step": 3233, "token_acc": 0.8981594097137761, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.6266531027466938, "grad_norm": 0.11217531561851501, "learning_rate": 0.000100568336777619, "loss": 0.40757495164871216, "memory(GiB)": 78.33, "step": 3234, "token_acc": 0.8796624013068336, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.6268468730320206, "grad_norm": 0.11023017019033432, "learning_rate": 0.00010047758489391698, "loss": 0.37037602066993713, "memory(GiB)": 78.33, "step": 3235, "token_acc": 0.8889917278880083, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.6270406433173473, "grad_norm": 0.09839659184217453, "learning_rate": 0.00010038685335040149, "loss": 0.330047607421875, "memory(GiB)": 78.33, "step": 3236, "token_acc": 0.9030212027856163, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.6272344136026741, "grad_norm": 0.10475650429725647, "learning_rate": 0.00010029614218433851, "loss": 0.3539055585861206, "memory(GiB)": 78.33, "step": 3237, "token_acc": 0.8929911208269236, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.6274281838880008, "grad_norm": 0.11595990508794785, "learning_rate": 0.00010020545143298555, "loss": 0.3532840311527252, "memory(GiB)": 78.33, "step": 3238, "token_acc": 0.8959765685216587, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.6276219541733276, "grad_norm": 0.0995573177933693, "learning_rate": 0.0001001147811335917, "loss": 0.3350241482257843, "memory(GiB)": 78.33, "step": 3239, "token_acc": 0.900135550084373, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.6278157244586543, "grad_norm": 0.10098245739936829, "learning_rate": 0.0001000241313233977, "loss": 0.3346031606197357, "memory(GiB)": 78.33, "step": 3240, "token_acc": 0.9002930289944479, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.628009494743981, "grad_norm": 0.09921301156282425, "learning_rate": 9.993350203963586e-05, "loss": 0.329167902469635, "memory(GiB)": 78.33, "step": 3241, "token_acc": 0.9022172464638741, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.6282032650293078, "grad_norm": 0.09442011266946793, "learning_rate": 9.984289331953012e-05, "loss": 0.32895606756210327, "memory(GiB)": 78.33, "step": 3242, "token_acc": 0.9014620960805996, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.6283970353146345, "grad_norm": 0.09743113070726395, "learning_rate": 9.975230520029581e-05, "loss": 0.33065280318260193, "memory(GiB)": 78.33, "step": 3243, "token_acc": 0.9017754207977865, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.6285908055999613, "grad_norm": 0.10759622603654861, "learning_rate": 9.966173771913999e-05, "loss": 0.3607182502746582, "memory(GiB)": 78.33, "step": 3244, "token_acc": 0.8951261723983921, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.628784575885288, "grad_norm": 0.10882527381181717, "learning_rate": 9.957119091326111e-05, "loss": 0.3616001307964325, "memory(GiB)": 78.33, "step": 3245, "token_acc": 0.8927234113899291, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.6289783461706148, "grad_norm": 0.10181569308042526, "learning_rate": 9.948066481984919e-05, "loss": 0.32742297649383545, "memory(GiB)": 78.33, "step": 3246, "token_acc": 0.9032007815193656, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.6291721164559415, "grad_norm": 0.09965640306472778, "learning_rate": 9.939015947608579e-05, "loss": 0.32857993245124817, "memory(GiB)": 78.33, "step": 3247, "token_acc": 0.901941986907338, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.6293658867412683, "grad_norm": 0.11179827153682709, "learning_rate": 9.929967491914378e-05, "loss": 0.3481506407260895, "memory(GiB)": 78.33, "step": 3248, "token_acc": 0.8953111091968875, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.629559657026595, "grad_norm": 0.10431916266679764, "learning_rate": 9.920921118618772e-05, "loss": 0.36535415053367615, "memory(GiB)": 78.33, "step": 3249, "token_acc": 0.8926919518963923, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.6297534273119217, "grad_norm": 0.11817894130945206, "learning_rate": 9.911876831437334e-05, "loss": 0.4042690098285675, "memory(GiB)": 78.33, "step": 3250, "token_acc": 0.8816427447580516, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.6299471975972485, "grad_norm": 0.10380962491035461, "learning_rate": 9.902834634084809e-05, "loss": 0.33863088488578796, "memory(GiB)": 78.33, "step": 3251, "token_acc": 0.8970420766782492, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.6301409678825752, "grad_norm": 0.10345399379730225, "learning_rate": 9.893794530275065e-05, "loss": 0.33550775051116943, "memory(GiB)": 78.33, "step": 3252, "token_acc": 0.8967549151027703, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.630334738167902, "grad_norm": 0.10701797902584076, "learning_rate": 9.884756523721115e-05, "loss": 0.33787888288497925, "memory(GiB)": 78.33, "step": 3253, "token_acc": 0.8981328701693443, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.6305285084532287, "grad_norm": 0.11159101128578186, "learning_rate": 9.875720618135118e-05, "loss": 0.36240842938423157, "memory(GiB)": 78.33, "step": 3254, "token_acc": 0.8929006465315301, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.6307222787385555, "grad_norm": 0.10191599279642105, "learning_rate": 9.866686817228351e-05, "loss": 0.35280781984329224, "memory(GiB)": 78.33, "step": 3255, "token_acc": 0.8953659778617565, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.6309160490238822, "grad_norm": 0.10553169250488281, "learning_rate": 9.857655124711252e-05, "loss": 0.3603316843509674, "memory(GiB)": 78.33, "step": 3256, "token_acc": 0.8932977749043693, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.631109819309209, "grad_norm": 0.12384941428899765, "learning_rate": 9.84862554429337e-05, "loss": 0.33838897943496704, "memory(GiB)": 78.33, "step": 3257, "token_acc": 0.8999357294705097, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.6313035895945357, "grad_norm": 0.09418811649084091, "learning_rate": 9.839598079683399e-05, "loss": 0.3201935887336731, "memory(GiB)": 78.33, "step": 3258, "token_acc": 0.9044349822983446, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.6314973598798624, "grad_norm": 0.08840276300907135, "learning_rate": 9.830572734589162e-05, "loss": 0.295772910118103, "memory(GiB)": 78.33, "step": 3259, "token_acc": 0.9113684822640047, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.6316911301651892, "grad_norm": 0.10140841454267502, "learning_rate": 9.82154951271761e-05, "loss": 0.3301541805267334, "memory(GiB)": 78.33, "step": 3260, "token_acc": 0.9020520414639306, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.6318849004505159, "grad_norm": 0.11607446521520615, "learning_rate": 9.81252841777483e-05, "loss": 0.376188725233078, "memory(GiB)": 78.33, "step": 3261, "token_acc": 0.8894073426006934, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.6320786707358427, "grad_norm": 0.09963465481996536, "learning_rate": 9.803509453466015e-05, "loss": 0.3333686590194702, "memory(GiB)": 78.33, "step": 3262, "token_acc": 0.8974518502488639, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.6322724410211694, "grad_norm": 0.0981069952249527, "learning_rate": 9.794492623495509e-05, "loss": 0.3355132043361664, "memory(GiB)": 78.33, "step": 3263, "token_acc": 0.8998677114073471, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.6324662113064962, "grad_norm": 0.11496897041797638, "learning_rate": 9.785477931566753e-05, "loss": 0.3889937400817871, "memory(GiB)": 78.33, "step": 3264, "token_acc": 0.884316427783903, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.6326599815918229, "grad_norm": 0.10167025029659271, "learning_rate": 9.77646538138233e-05, "loss": 0.33570176362991333, "memory(GiB)": 78.33, "step": 3265, "token_acc": 0.8996981339187706, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.6328537518771497, "grad_norm": 0.1109309270977974, "learning_rate": 9.767454976643939e-05, "loss": 0.3936588168144226, "memory(GiB)": 78.33, "step": 3266, "token_acc": 0.8842383328441444, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.6330475221624764, "grad_norm": 0.1110692024230957, "learning_rate": 9.758446721052394e-05, "loss": 0.35908806324005127, "memory(GiB)": 78.33, "step": 3267, "token_acc": 0.8943084600760456, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.6332412924478031, "grad_norm": 0.10611660033464432, "learning_rate": 9.749440618307628e-05, "loss": 0.3543277680873871, "memory(GiB)": 78.33, "step": 3268, "token_acc": 0.8957095521023766, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.6334350627331299, "grad_norm": 0.0958552435040474, "learning_rate": 9.740436672108685e-05, "loss": 0.3312506675720215, "memory(GiB)": 78.33, "step": 3269, "token_acc": 0.901442661563374, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.6336288330184566, "grad_norm": 0.1073455959558487, "learning_rate": 9.731434886153735e-05, "loss": 0.37120985984802246, "memory(GiB)": 78.33, "step": 3270, "token_acc": 0.8908422837761447, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.6338226033037834, "grad_norm": 0.10486573725938797, "learning_rate": 9.722435264140043e-05, "loss": 0.34987497329711914, "memory(GiB)": 78.33, "step": 3271, "token_acc": 0.8957631675770527, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.6340163735891101, "grad_norm": 0.0972420871257782, "learning_rate": 9.713437809764002e-05, "loss": 0.3343609869480133, "memory(GiB)": 78.33, "step": 3272, "token_acc": 0.9014839885446498, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.6342101438744369, "grad_norm": 0.09882809221744537, "learning_rate": 9.704442526721112e-05, "loss": 0.31326884031295776, "memory(GiB)": 78.33, "step": 3273, "token_acc": 0.9058581477465455, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.6344039141597636, "grad_norm": 0.15750029683113098, "learning_rate": 9.69544941870597e-05, "loss": 0.32955771684646606, "memory(GiB)": 78.33, "step": 3274, "token_acc": 0.9013154082401345, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.6345976844450903, "grad_norm": 0.10274738818407059, "learning_rate": 9.686458489412296e-05, "loss": 0.37514811754226685, "memory(GiB)": 78.33, "step": 3275, "token_acc": 0.8886818439601394, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.6347914547304171, "grad_norm": 0.10508158802986145, "learning_rate": 9.677469742532896e-05, "loss": 0.34448888897895813, "memory(GiB)": 78.33, "step": 3276, "token_acc": 0.8970773975620502, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.6349852250157438, "grad_norm": 0.1056324765086174, "learning_rate": 9.668483181759696e-05, "loss": 0.35540270805358887, "memory(GiB)": 78.33, "step": 3277, "token_acc": 0.8941543700340522, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.6351789953010706, "grad_norm": 0.1182514950633049, "learning_rate": 9.659498810783716e-05, "loss": 0.38734346628189087, "memory(GiB)": 78.33, "step": 3278, "token_acc": 0.8861613506065693, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.6353727655863973, "grad_norm": 0.09534314274787903, "learning_rate": 9.65051663329508e-05, "loss": 0.3013858199119568, "memory(GiB)": 78.33, "step": 3279, "token_acc": 0.9102011699243829, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.6355665358717241, "grad_norm": 0.11075269430875778, "learning_rate": 9.641536652983008e-05, "loss": 0.3610069453716278, "memory(GiB)": 78.33, "step": 3280, "token_acc": 0.8919886003799873, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.6357603061570508, "grad_norm": 0.1012001633644104, "learning_rate": 9.63255887353582e-05, "loss": 0.32832014560699463, "memory(GiB)": 78.33, "step": 3281, "token_acc": 0.9021164021164021, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.6359540764423776, "grad_norm": 0.09684737026691437, "learning_rate": 9.623583298640937e-05, "loss": 0.35295921564102173, "memory(GiB)": 78.33, "step": 3282, "token_acc": 0.8938139656941098, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.6361478467277043, "grad_norm": 0.12095388025045395, "learning_rate": 9.614609931984854e-05, "loss": 0.3737364709377289, "memory(GiB)": 78.33, "step": 3283, "token_acc": 0.8893832689984383, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.636341617013031, "grad_norm": 0.10381918400526047, "learning_rate": 9.60563877725319e-05, "loss": 0.3494938313961029, "memory(GiB)": 78.33, "step": 3284, "token_acc": 0.8952139037433156, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.6365353872983578, "grad_norm": 0.10965927690267563, "learning_rate": 9.596669838130627e-05, "loss": 0.36877620220184326, "memory(GiB)": 78.33, "step": 3285, "token_acc": 0.8932712103153104, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.6367291575836845, "grad_norm": 0.09375525265932083, "learning_rate": 9.587703118300955e-05, "loss": 0.33250147104263306, "memory(GiB)": 78.33, "step": 3286, "token_acc": 0.9001272429821601, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.6369229278690113, "grad_norm": 0.11146512627601624, "learning_rate": 9.578738621447052e-05, "loss": 0.3969725966453552, "memory(GiB)": 78.33, "step": 3287, "token_acc": 0.8843091971199565, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.637116698154338, "grad_norm": 0.10300832241773605, "learning_rate": 9.569776351250867e-05, "loss": 0.3498913645744324, "memory(GiB)": 78.33, "step": 3288, "token_acc": 0.896644596566042, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.6373104684396648, "grad_norm": 0.10484420508146286, "learning_rate": 9.560816311393456e-05, "loss": 0.3171748220920563, "memory(GiB)": 78.33, "step": 3289, "token_acc": 0.9038370440549502, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.6375042387249915, "grad_norm": 0.11446377635002136, "learning_rate": 9.551858505554935e-05, "loss": 0.3828757703304291, "memory(GiB)": 78.33, "step": 3290, "token_acc": 0.8870528771384136, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.6376980090103183, "grad_norm": 0.10568659007549286, "learning_rate": 9.542902937414528e-05, "loss": 0.3619765043258667, "memory(GiB)": 78.33, "step": 3291, "token_acc": 0.8938068279293189, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.637891779295645, "grad_norm": 0.09850284457206726, "learning_rate": 9.533949610650519e-05, "loss": 0.3398081660270691, "memory(GiB)": 78.33, "step": 3292, "token_acc": 0.8959392848759878, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.6380855495809717, "grad_norm": 0.10148289799690247, "learning_rate": 9.524998528940282e-05, "loss": 0.3508596420288086, "memory(GiB)": 78.33, "step": 3293, "token_acc": 0.8957725382847403, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.6382793198662985, "grad_norm": 0.09729644656181335, "learning_rate": 9.516049695960277e-05, "loss": 0.33748623728752136, "memory(GiB)": 78.33, "step": 3294, "token_acc": 0.8995384056843525, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.6384730901516252, "grad_norm": 0.11033184826374054, "learning_rate": 9.507103115386013e-05, "loss": 0.36528030037879944, "memory(GiB)": 78.33, "step": 3295, "token_acc": 0.8912418790604698, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.638666860436952, "grad_norm": 0.09952208399772644, "learning_rate": 9.498158790892105e-05, "loss": 0.3313583433628082, "memory(GiB)": 78.33, "step": 3296, "token_acc": 0.9018691588785047, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.6388606307222787, "grad_norm": 0.11369100958108902, "learning_rate": 9.489216726152218e-05, "loss": 0.3779648542404175, "memory(GiB)": 78.33, "step": 3297, "token_acc": 0.8898235347901619, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.6390544010076055, "grad_norm": 0.10706917941570282, "learning_rate": 9.480276924839101e-05, "loss": 0.3606652319431305, "memory(GiB)": 78.33, "step": 3298, "token_acc": 0.8924252794889346, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.6392481712929322, "grad_norm": 0.107958123087883, "learning_rate": 9.471339390624573e-05, "loss": 0.36460253596305847, "memory(GiB)": 78.33, "step": 3299, "token_acc": 0.8916521320743369, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.639441941578259, "grad_norm": 0.10166801512241364, "learning_rate": 9.462404127179517e-05, "loss": 0.34821823239326477, "memory(GiB)": 78.33, "step": 3300, "token_acc": 0.8970175808932179, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.6396357118635857, "grad_norm": 0.11107532680034637, "learning_rate": 9.453471138173893e-05, "loss": 0.3458663523197174, "memory(GiB)": 78.33, "step": 3301, "token_acc": 0.8959231235833094, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.6398294821489124, "grad_norm": 0.11017818003892899, "learning_rate": 9.444540427276707e-05, "loss": 0.36541783809661865, "memory(GiB)": 78.33, "step": 3302, "token_acc": 0.8919458960959115, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.6400232524342392, "grad_norm": 0.09530369192361832, "learning_rate": 9.435611998156055e-05, "loss": 0.31924229860305786, "memory(GiB)": 78.33, "step": 3303, "token_acc": 0.9019935658738, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.6402170227195659, "grad_norm": 0.11861889809370041, "learning_rate": 9.42668585447907e-05, "loss": 0.3566751480102539, "memory(GiB)": 78.33, "step": 3304, "token_acc": 0.8942750741652464, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.6404107930048927, "grad_norm": 0.09427324682474136, "learning_rate": 9.417761999911965e-05, "loss": 0.35689249634742737, "memory(GiB)": 78.33, "step": 3305, "token_acc": 0.8942868072275602, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.6406045632902194, "grad_norm": 0.09987188875675201, "learning_rate": 9.40884043812001e-05, "loss": 0.3596974313259125, "memory(GiB)": 78.33, "step": 3306, "token_acc": 0.8958737616178123, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.6407983335755462, "grad_norm": 0.0936884880065918, "learning_rate": 9.399921172767525e-05, "loss": 0.32182028889656067, "memory(GiB)": 78.33, "step": 3307, "token_acc": 0.9035014440174715, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.6409921038608729, "grad_norm": 0.10136570781469345, "learning_rate": 9.3910042075179e-05, "loss": 0.34020885825157166, "memory(GiB)": 78.33, "step": 3308, "token_acc": 0.8990686593025774, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.6411858741461997, "grad_norm": 0.09547661989927292, "learning_rate": 9.38208954603356e-05, "loss": 0.320523738861084, "memory(GiB)": 78.33, "step": 3309, "token_acc": 0.9048575160868935, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.6413796444315264, "grad_norm": 0.09538991749286652, "learning_rate": 9.373177191976007e-05, "loss": 0.3119708299636841, "memory(GiB)": 78.33, "step": 3310, "token_acc": 0.906541066892464, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.6415734147168531, "grad_norm": 0.09219719469547272, "learning_rate": 9.36426714900578e-05, "loss": 0.3281041383743286, "memory(GiB)": 78.33, "step": 3311, "token_acc": 0.9008394285994796, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.6417671850021799, "grad_norm": 0.09361285716295242, "learning_rate": 9.355359420782467e-05, "loss": 0.3183348476886749, "memory(GiB)": 78.33, "step": 3312, "token_acc": 0.9031101372046116, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.6419609552875066, "grad_norm": 0.10026978701353073, "learning_rate": 9.346454010964722e-05, "loss": 0.3270958364009857, "memory(GiB)": 78.33, "step": 3313, "token_acc": 0.90237444958475, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.6421547255728334, "grad_norm": 0.10751143842935562, "learning_rate": 9.337550923210228e-05, "loss": 0.3549875020980835, "memory(GiB)": 78.33, "step": 3314, "token_acc": 0.8923901567449037, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.6423484958581601, "grad_norm": 0.10847034305334091, "learning_rate": 9.328650161175735e-05, "loss": 0.3592216372489929, "memory(GiB)": 78.33, "step": 3315, "token_acc": 0.8951582706875199, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.6425422661434869, "grad_norm": 0.09726880490779877, "learning_rate": 9.319751728517007e-05, "loss": 0.31801801919937134, "memory(GiB)": 78.33, "step": 3316, "token_acc": 0.9054611953118507, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.6427360364288136, "grad_norm": 0.10297710448503494, "learning_rate": 9.31085562888888e-05, "loss": 0.3436722159385681, "memory(GiB)": 78.33, "step": 3317, "token_acc": 0.897666406481307, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.6429298067141404, "grad_norm": 0.1005515605211258, "learning_rate": 9.301961865945217e-05, "loss": 0.3575522005558014, "memory(GiB)": 78.33, "step": 3318, "token_acc": 0.8908450704225352, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.6431235769994671, "grad_norm": 0.11259230971336365, "learning_rate": 9.29307044333893e-05, "loss": 0.3851853311061859, "memory(GiB)": 78.33, "step": 3319, "token_acc": 0.8852057033875216, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.6433173472847938, "grad_norm": 0.1026308611035347, "learning_rate": 9.284181364721967e-05, "loss": 0.33121898770332336, "memory(GiB)": 78.33, "step": 3320, "token_acc": 0.9000698616738857, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.6435111175701206, "grad_norm": 0.10448215901851654, "learning_rate": 9.275294633745302e-05, "loss": 0.35694530606269836, "memory(GiB)": 78.33, "step": 3321, "token_acc": 0.8939416251877595, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.6437048878554473, "grad_norm": 0.09308988600969315, "learning_rate": 9.266410254058966e-05, "loss": 0.3261651396751404, "memory(GiB)": 78.33, "step": 3322, "token_acc": 0.9015043921939929, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.6438986581407741, "grad_norm": 0.10853360593318939, "learning_rate": 9.257528229312e-05, "loss": 0.36176615953445435, "memory(GiB)": 78.33, "step": 3323, "token_acc": 0.8933630754077465, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.6440924284261008, "grad_norm": 0.10240910202264786, "learning_rate": 9.248648563152499e-05, "loss": 0.3467825651168823, "memory(GiB)": 78.33, "step": 3324, "token_acc": 0.8980282133305365, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.6442861987114276, "grad_norm": 0.10371696203947067, "learning_rate": 9.239771259227577e-05, "loss": 0.35027819871902466, "memory(GiB)": 78.33, "step": 3325, "token_acc": 0.8956741618858588, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.6444799689967543, "grad_norm": 0.11719199270009995, "learning_rate": 9.230896321183379e-05, "loss": 0.3709411919116974, "memory(GiB)": 78.33, "step": 3326, "token_acc": 0.8905724445013935, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.644673739282081, "grad_norm": 0.09655088186264038, "learning_rate": 9.222023752665094e-05, "loss": 0.35260531306266785, "memory(GiB)": 78.33, "step": 3327, "token_acc": 0.8940183178216111, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.6448675095674078, "grad_norm": 0.10662583261728287, "learning_rate": 9.213153557316904e-05, "loss": 0.3466799259185791, "memory(GiB)": 78.33, "step": 3328, "token_acc": 0.8973979206598129, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.6450612798527345, "grad_norm": 0.1059870645403862, "learning_rate": 9.204285738782053e-05, "loss": 0.35201188921928406, "memory(GiB)": 78.33, "step": 3329, "token_acc": 0.8970674486803519, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.6452550501380613, "grad_norm": 0.09518367052078247, "learning_rate": 9.195420300702782e-05, "loss": 0.3428179919719696, "memory(GiB)": 78.33, "step": 3330, "token_acc": 0.894964203373377, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.645448820423388, "grad_norm": 0.10190161317586899, "learning_rate": 9.18655724672037e-05, "loss": 0.34315353631973267, "memory(GiB)": 78.33, "step": 3331, "token_acc": 0.8971641880907941, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.6456425907087148, "grad_norm": 0.09689725935459137, "learning_rate": 9.177696580475109e-05, "loss": 0.32769232988357544, "memory(GiB)": 78.33, "step": 3332, "token_acc": 0.9002272727272728, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.6458363609940415, "grad_norm": 0.09515693038702011, "learning_rate": 9.168838305606311e-05, "loss": 0.2983517050743103, "memory(GiB)": 78.33, "step": 3333, "token_acc": 0.9090839107005388, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.6460301312793683, "grad_norm": 0.11088063567876816, "learning_rate": 9.159982425752319e-05, "loss": 0.38608160614967346, "memory(GiB)": 78.33, "step": 3334, "token_acc": 0.8878626217586326, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.646223901564695, "grad_norm": 0.09727376699447632, "learning_rate": 9.151128944550465e-05, "loss": 0.3358725905418396, "memory(GiB)": 78.33, "step": 3335, "token_acc": 0.9006044153050249, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.6464176718500217, "grad_norm": 0.1008610799908638, "learning_rate": 9.142277865637124e-05, "loss": 0.350691556930542, "memory(GiB)": 78.33, "step": 3336, "token_acc": 0.8959597901617732, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.6466114421353485, "grad_norm": 0.10170338302850723, "learning_rate": 9.133429192647661e-05, "loss": 0.3640455901622772, "memory(GiB)": 78.33, "step": 3337, "token_acc": 0.8921849481941582, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.6468052124206752, "grad_norm": 0.10477067530155182, "learning_rate": 9.124582929216471e-05, "loss": 0.3344863951206207, "memory(GiB)": 78.33, "step": 3338, "token_acc": 0.8980883180952892, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.646998982706002, "grad_norm": 0.10310588032007217, "learning_rate": 9.11573907897695e-05, "loss": 0.35701289772987366, "memory(GiB)": 78.33, "step": 3339, "token_acc": 0.895986649522611, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.6471927529913288, "grad_norm": 0.0928465947508812, "learning_rate": 9.106897645561506e-05, "loss": 0.3364104628562927, "memory(GiB)": 78.33, "step": 3340, "token_acc": 0.8999741468459153, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.6473865232766556, "grad_norm": 0.09853307157754898, "learning_rate": 9.098058632601557e-05, "loss": 0.33070456981658936, "memory(GiB)": 78.33, "step": 3341, "token_acc": 0.900615836532763, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.6475802935619823, "grad_norm": 0.10872960090637207, "learning_rate": 9.089222043727512e-05, "loss": 0.38177114725112915, "memory(GiB)": 78.33, "step": 3342, "token_acc": 0.8859876130382949, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.6477740638473091, "grad_norm": 0.10483535379171371, "learning_rate": 9.08038788256881e-05, "loss": 0.3313630223274231, "memory(GiB)": 78.33, "step": 3343, "token_acc": 0.9004806641905178, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.6479678341326358, "grad_norm": 0.09030815213918686, "learning_rate": 9.071556152753866e-05, "loss": 0.3115072548389435, "memory(GiB)": 78.33, "step": 3344, "token_acc": 0.9051830718021874, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.6481616044179626, "grad_norm": 0.1190243512392044, "learning_rate": 9.062726857910111e-05, "loss": 0.3789428472518921, "memory(GiB)": 78.33, "step": 3345, "token_acc": 0.8881057268722466, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.6483553747032893, "grad_norm": 0.10349465906620026, "learning_rate": 9.05390000166398e-05, "loss": 0.33017659187316895, "memory(GiB)": 78.33, "step": 3346, "token_acc": 0.9012410514386799, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.648549144988616, "grad_norm": 0.10818509757518768, "learning_rate": 9.045075587640894e-05, "loss": 0.3425545394420624, "memory(GiB)": 78.33, "step": 3347, "token_acc": 0.8974606721217429, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.6487429152739428, "grad_norm": 0.09324289113283157, "learning_rate": 9.036253619465285e-05, "loss": 0.3058326244354248, "memory(GiB)": 78.33, "step": 3348, "token_acc": 0.9082648317882186, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.6489366855592695, "grad_norm": 0.10658305138349533, "learning_rate": 9.027434100760559e-05, "loss": 0.3650735318660736, "memory(GiB)": 78.33, "step": 3349, "token_acc": 0.8911853628023353, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.6491304558445963, "grad_norm": 0.10458887368440628, "learning_rate": 9.018617035149141e-05, "loss": 0.34233736991882324, "memory(GiB)": 78.33, "step": 3350, "token_acc": 0.89612028460196, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.649324226129923, "grad_norm": 0.10068278759717941, "learning_rate": 9.00980242625243e-05, "loss": 0.3308500647544861, "memory(GiB)": 78.33, "step": 3351, "token_acc": 0.8995980595980596, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.6495179964152498, "grad_norm": 0.10845746099948883, "learning_rate": 9.000990277690828e-05, "loss": 0.349687397480011, "memory(GiB)": 78.33, "step": 3352, "token_acc": 0.8954418825203011, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.6497117667005765, "grad_norm": 0.10593696683645248, "learning_rate": 8.992180593083718e-05, "loss": 0.35012173652648926, "memory(GiB)": 78.33, "step": 3353, "token_acc": 0.8978386677689855, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.6499055369859033, "grad_norm": 0.10230911523103714, "learning_rate": 8.983373376049473e-05, "loss": 0.32801395654678345, "memory(GiB)": 78.33, "step": 3354, "token_acc": 0.9028890794570777, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.65009930727123, "grad_norm": 0.09752920269966125, "learning_rate": 8.97456863020546e-05, "loss": 0.33961349725723267, "memory(GiB)": 78.33, "step": 3355, "token_acc": 0.8986005964670796, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.6502930775565567, "grad_norm": 0.09952106326818466, "learning_rate": 8.965766359168017e-05, "loss": 0.3362662196159363, "memory(GiB)": 78.33, "step": 3356, "token_acc": 0.8996824741203501, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.6504868478418835, "grad_norm": 0.09671690315008163, "learning_rate": 8.956966566552476e-05, "loss": 0.3496435880661011, "memory(GiB)": 78.33, "step": 3357, "token_acc": 0.8963326523493946, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.6506806181272102, "grad_norm": 0.10482124984264374, "learning_rate": 8.948169255973147e-05, "loss": 0.3635352551937103, "memory(GiB)": 78.33, "step": 3358, "token_acc": 0.8922592685440517, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.650874388412537, "grad_norm": 0.104715995490551, "learning_rate": 8.939374431043325e-05, "loss": 0.33896604180336, "memory(GiB)": 78.33, "step": 3359, "token_acc": 0.8968829586978186, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.6510681586978637, "grad_norm": 0.10908566415309906, "learning_rate": 8.930582095375283e-05, "loss": 0.354704350233078, "memory(GiB)": 78.33, "step": 3360, "token_acc": 0.8975314829897876, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.6512619289831905, "grad_norm": 0.10956370085477829, "learning_rate": 8.921792252580263e-05, "loss": 0.3756663203239441, "memory(GiB)": 78.33, "step": 3361, "token_acc": 0.8876317598533455, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.6514556992685172, "grad_norm": 0.08828612416982651, "learning_rate": 8.913004906268495e-05, "loss": 0.3033355474472046, "memory(GiB)": 78.33, "step": 3362, "token_acc": 0.9071128935601346, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.651649469553844, "grad_norm": 0.10207577049732208, "learning_rate": 8.904220060049172e-05, "loss": 0.3187861144542694, "memory(GiB)": 78.33, "step": 3363, "token_acc": 0.9046379128103328, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.6518432398391707, "grad_norm": 0.10097214579582214, "learning_rate": 8.895437717530473e-05, "loss": 0.31431207060813904, "memory(GiB)": 78.33, "step": 3364, "token_acc": 0.9061064038885495, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.6520370101244974, "grad_norm": 0.10009994357824326, "learning_rate": 8.886657882319537e-05, "loss": 0.32470712065696716, "memory(GiB)": 78.33, "step": 3365, "token_acc": 0.9029475799698543, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.6522307804098242, "grad_norm": 0.1002596840262413, "learning_rate": 8.877880558022478e-05, "loss": 0.340162992477417, "memory(GiB)": 78.33, "step": 3366, "token_acc": 0.9000396877893901, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.6524245506951509, "grad_norm": 0.09453920274972916, "learning_rate": 8.869105748244392e-05, "loss": 0.3194851279258728, "memory(GiB)": 78.33, "step": 3367, "token_acc": 0.9037842692144585, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.6526183209804777, "grad_norm": 0.09737507253885269, "learning_rate": 8.86033345658931e-05, "loss": 0.33026716113090515, "memory(GiB)": 78.33, "step": 3368, "token_acc": 0.901304817031296, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.6528120912658044, "grad_norm": 0.27890968322753906, "learning_rate": 8.851563686660263e-05, "loss": 0.3396485149860382, "memory(GiB)": 78.33, "step": 3369, "token_acc": 0.8994968003062954, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.6530058615511312, "grad_norm": 0.09766080230474472, "learning_rate": 8.842796442059217e-05, "loss": 0.33352982997894287, "memory(GiB)": 78.33, "step": 3370, "token_acc": 0.9007515752714021, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.6531996318364579, "grad_norm": 0.0987740233540535, "learning_rate": 8.834031726387126e-05, "loss": 0.33398640155792236, "memory(GiB)": 78.33, "step": 3371, "token_acc": 0.9004823690252018, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.6533934021217846, "grad_norm": 0.09064597636461258, "learning_rate": 8.825269543243891e-05, "loss": 0.3236205577850342, "memory(GiB)": 78.33, "step": 3372, "token_acc": 0.9059911385089578, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.6535871724071114, "grad_norm": 0.0983690693974495, "learning_rate": 8.816509896228374e-05, "loss": 0.32531166076660156, "memory(GiB)": 78.33, "step": 3373, "token_acc": 0.9019883621521745, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.6537809426924381, "grad_norm": 0.10600029677152634, "learning_rate": 8.807752788938406e-05, "loss": 0.33703696727752686, "memory(GiB)": 78.33, "step": 3374, "token_acc": 0.898842851810377, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.6539747129777649, "grad_norm": 0.10279665142297745, "learning_rate": 8.798998224970756e-05, "loss": 0.36671680212020874, "memory(GiB)": 78.33, "step": 3375, "token_acc": 0.8890161118835365, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.6541684832630916, "grad_norm": 0.10394495725631714, "learning_rate": 8.790246207921164e-05, "loss": 0.344325453042984, "memory(GiB)": 78.33, "step": 3376, "token_acc": 0.8953350437253441, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.6543622535484184, "grad_norm": 0.10396216064691544, "learning_rate": 8.781496741384314e-05, "loss": 0.3480278551578522, "memory(GiB)": 78.33, "step": 3377, "token_acc": 0.8966755763448045, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.6545560238337451, "grad_norm": 0.10462364554405212, "learning_rate": 8.772749828953848e-05, "loss": 0.3206149935722351, "memory(GiB)": 78.33, "step": 3378, "token_acc": 0.9034575662325999, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.6547497941190719, "grad_norm": 0.10066306591033936, "learning_rate": 8.764005474222365e-05, "loss": 0.32818686962127686, "memory(GiB)": 78.33, "step": 3379, "token_acc": 0.9018111463427743, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.6549435644043986, "grad_norm": 0.10638129711151123, "learning_rate": 8.7552636807814e-05, "loss": 0.3548268675804138, "memory(GiB)": 78.33, "step": 3380, "token_acc": 0.8971172104263767, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.6551373346897253, "grad_norm": 0.08810947835445404, "learning_rate": 8.746524452221442e-05, "loss": 0.31981220841407776, "memory(GiB)": 78.33, "step": 3381, "token_acc": 0.9033225494131816, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.6553311049750521, "grad_norm": 0.0968799740076065, "learning_rate": 8.737787792131926e-05, "loss": 0.29892611503601074, "memory(GiB)": 78.33, "step": 3382, "token_acc": 0.9107401514971624, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.6555248752603788, "grad_norm": 0.10424499958753586, "learning_rate": 8.729053704101246e-05, "loss": 0.3598312735557556, "memory(GiB)": 78.33, "step": 3383, "token_acc": 0.8916409545107142, "train_speed(iter/s)": 0.032483 }, { "epoch": 0.6557186455457056, "grad_norm": 0.1125241369009018, "learning_rate": 8.720322191716708e-05, "loss": 0.3586179316043854, "memory(GiB)": 78.33, "step": 3384, "token_acc": 0.8950719017611892, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.6559124158310323, "grad_norm": 0.1094302088022232, "learning_rate": 8.71159325856459e-05, "loss": 0.35498055815696716, "memory(GiB)": 78.33, "step": 3385, "token_acc": 0.8951116185440015, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.6561061861163591, "grad_norm": 0.09570290893316269, "learning_rate": 8.702866908230096e-05, "loss": 0.30832698941230774, "memory(GiB)": 78.33, "step": 3386, "token_acc": 0.9092654575646714, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.6562999564016858, "grad_norm": 0.09967100620269775, "learning_rate": 8.694143144297376e-05, "loss": 0.32282644510269165, "memory(GiB)": 78.33, "step": 3387, "token_acc": 0.9025936599423631, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.6564937266870126, "grad_norm": 0.09364413470029831, "learning_rate": 8.685421970349511e-05, "loss": 0.3126744031906128, "memory(GiB)": 78.33, "step": 3388, "token_acc": 0.9047869806163645, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.6566874969723393, "grad_norm": 0.11045239120721817, "learning_rate": 8.676703389968515e-05, "loss": 0.3734998106956482, "memory(GiB)": 78.33, "step": 3389, "token_acc": 0.8892863324294698, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.656881267257666, "grad_norm": 0.0982174277305603, "learning_rate": 8.667987406735363e-05, "loss": 0.3432157337665558, "memory(GiB)": 78.33, "step": 3390, "token_acc": 0.8978511530398323, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.6570750375429928, "grad_norm": 0.09781992435455322, "learning_rate": 8.659274024229918e-05, "loss": 0.34358012676239014, "memory(GiB)": 78.33, "step": 3391, "token_acc": 0.8971234051399319, "train_speed(iter/s)": 0.032489 }, { "epoch": 0.6572688078283195, "grad_norm": 0.09683690965175629, "learning_rate": 8.65056324603102e-05, "loss": 0.3415444493293762, "memory(GiB)": 78.33, "step": 3392, "token_acc": 0.8975164282953227, "train_speed(iter/s)": 0.03249 }, { "epoch": 0.6574625781136463, "grad_norm": 0.09927819669246674, "learning_rate": 8.641855075716413e-05, "loss": 0.33681708574295044, "memory(GiB)": 78.33, "step": 3393, "token_acc": 0.9005583061202992, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.657656348398973, "grad_norm": 0.10389687120914459, "learning_rate": 8.633149516862775e-05, "loss": 0.3464776277542114, "memory(GiB)": 78.33, "step": 3394, "token_acc": 0.8982288677996028, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.6578501186842998, "grad_norm": 0.09941697865724564, "learning_rate": 8.624446573045717e-05, "loss": 0.33485954999923706, "memory(GiB)": 78.33, "step": 3395, "token_acc": 0.8993164654649689, "train_speed(iter/s)": 0.032492 }, { "epoch": 0.6580438889696265, "grad_norm": 0.10718297958374023, "learning_rate": 8.61574624783976e-05, "loss": 0.3694123923778534, "memory(GiB)": 78.33, "step": 3396, "token_acc": 0.8908513223063265, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.6582376592549533, "grad_norm": 0.09040777385234833, "learning_rate": 8.607048544818386e-05, "loss": 0.29911673069000244, "memory(GiB)": 78.33, "step": 3397, "token_acc": 0.9073398896617894, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.65843142954028, "grad_norm": 0.09987498074769974, "learning_rate": 8.598353467553945e-05, "loss": 0.3411368727684021, "memory(GiB)": 78.33, "step": 3398, "token_acc": 0.898384837029792, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.6586251998256067, "grad_norm": 0.10136095434427261, "learning_rate": 8.58966101961776e-05, "loss": 0.3573547601699829, "memory(GiB)": 78.33, "step": 3399, "token_acc": 0.8953447859995176, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.6588189701109335, "grad_norm": 0.10064728558063507, "learning_rate": 8.580971204580049e-05, "loss": 0.3405356705188751, "memory(GiB)": 78.33, "step": 3400, "token_acc": 0.8976995468804462, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.6590127403962602, "grad_norm": 0.11121074110269547, "learning_rate": 8.572284026009947e-05, "loss": 0.3688167333602905, "memory(GiB)": 78.33, "step": 3401, "token_acc": 0.8922276364073873, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.659206510681587, "grad_norm": 0.09966862946748734, "learning_rate": 8.563599487475517e-05, "loss": 0.32531312108039856, "memory(GiB)": 78.33, "step": 3402, "token_acc": 0.9004441763544637, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.6594002809669137, "grad_norm": 0.09778161346912384, "learning_rate": 8.554917592543724e-05, "loss": 0.3067454695701599, "memory(GiB)": 78.33, "step": 3403, "token_acc": 0.9069717187542206, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.6595940512522405, "grad_norm": 0.10241376608610153, "learning_rate": 8.546238344780468e-05, "loss": 0.3466450572013855, "memory(GiB)": 78.33, "step": 3404, "token_acc": 0.8972174435433221, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.6597878215375672, "grad_norm": 0.10399215668439865, "learning_rate": 8.537561747750542e-05, "loss": 0.35718533396720886, "memory(GiB)": 78.33, "step": 3405, "token_acc": 0.894998862084661, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.659981591822894, "grad_norm": 0.12148671597242355, "learning_rate": 8.528887805017661e-05, "loss": 0.38953131437301636, "memory(GiB)": 78.33, "step": 3406, "token_acc": 0.8871596564074931, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.6601753621082207, "grad_norm": 0.10579821467399597, "learning_rate": 8.520216520144442e-05, "loss": 0.33564966917037964, "memory(GiB)": 78.33, "step": 3407, "token_acc": 0.8999078135039249, "train_speed(iter/s)": 0.032498 }, { "epoch": 0.6603691323935474, "grad_norm": 0.11077598482370377, "learning_rate": 8.51154789669241e-05, "loss": 0.3519205152988434, "memory(GiB)": 78.33, "step": 3408, "token_acc": 0.89746127867707, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.6605629026788742, "grad_norm": 0.09937475621700287, "learning_rate": 8.502881938222021e-05, "loss": 0.36071473360061646, "memory(GiB)": 78.33, "step": 3409, "token_acc": 0.8904598045082784, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.6607566729642009, "grad_norm": 0.09916140139102936, "learning_rate": 8.494218648292594e-05, "loss": 0.3261912763118744, "memory(GiB)": 78.33, "step": 3410, "token_acc": 0.9021432978352424, "train_speed(iter/s)": 0.0325 }, { "epoch": 0.6609504432495277, "grad_norm": 0.10734451562166214, "learning_rate": 8.485558030462389e-05, "loss": 0.32132577896118164, "memory(GiB)": 78.33, "step": 3411, "token_acc": 0.9042697182232066, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.6611442135348544, "grad_norm": 0.09797913581132889, "learning_rate": 8.476900088288554e-05, "loss": 0.33835476636886597, "memory(GiB)": 78.33, "step": 3412, "token_acc": 0.9016252123424863, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.6613379838201812, "grad_norm": 0.10399965941905975, "learning_rate": 8.468244825327132e-05, "loss": 0.3561461269855499, "memory(GiB)": 78.33, "step": 3413, "token_acc": 0.8911394426552394, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.6615317541055079, "grad_norm": 0.14848816394805908, "learning_rate": 8.459592245133076e-05, "loss": 0.3276277780532837, "memory(GiB)": 78.33, "step": 3414, "token_acc": 0.9025031223980017, "train_speed(iter/s)": 0.032503 }, { "epoch": 0.6617255243908347, "grad_norm": 0.10193125158548355, "learning_rate": 8.450942351260228e-05, "loss": 0.3469090163707733, "memory(GiB)": 78.33, "step": 3415, "token_acc": 0.89754166435967, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.6619192946761614, "grad_norm": 0.0984477773308754, "learning_rate": 8.442295147261347e-05, "loss": 0.33425408601760864, "memory(GiB)": 78.33, "step": 3416, "token_acc": 0.8991217336922533, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.6621130649614881, "grad_norm": 0.09679713100194931, "learning_rate": 8.43365063668805e-05, "loss": 0.3265281617641449, "memory(GiB)": 78.33, "step": 3417, "token_acc": 0.9010822632198733, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.6623068352468149, "grad_norm": 0.10674044489860535, "learning_rate": 8.425008823090885e-05, "loss": 0.32397177815437317, "memory(GiB)": 78.33, "step": 3418, "token_acc": 0.9022934871751365, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.6625006055321416, "grad_norm": 0.09785594791173935, "learning_rate": 8.416369710019276e-05, "loss": 0.36127400398254395, "memory(GiB)": 78.33, "step": 3419, "token_acc": 0.8940439348587084, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.6626943758174684, "grad_norm": 0.0925799012184143, "learning_rate": 8.407733301021534e-05, "loss": 0.3258381485939026, "memory(GiB)": 78.33, "step": 3420, "token_acc": 0.9027204212265125, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.6628881461027951, "grad_norm": 0.09700040519237518, "learning_rate": 8.399099599644869e-05, "loss": 0.3296799957752228, "memory(GiB)": 78.33, "step": 3421, "token_acc": 0.9014329474066767, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.6630819163881219, "grad_norm": 0.09748223423957825, "learning_rate": 8.390468609435364e-05, "loss": 0.3318982422351837, "memory(GiB)": 78.33, "step": 3422, "token_acc": 0.8991830021301265, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.6632756866734486, "grad_norm": 0.1090392917394638, "learning_rate": 8.381840333938017e-05, "loss": 0.36316922307014465, "memory(GiB)": 78.33, "step": 3423, "token_acc": 0.8909560870306198, "train_speed(iter/s)": 0.03251 }, { "epoch": 0.6634694569587753, "grad_norm": 0.10358504951000214, "learning_rate": 8.37321477669667e-05, "loss": 0.3546435236930847, "memory(GiB)": 78.33, "step": 3424, "token_acc": 0.8954622502570373, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.6636632272441021, "grad_norm": 0.1012730523943901, "learning_rate": 8.364591941254091e-05, "loss": 0.3311161994934082, "memory(GiB)": 78.33, "step": 3425, "token_acc": 0.9006039512744396, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.6638569975294288, "grad_norm": 0.10826534032821655, "learning_rate": 8.355971831151901e-05, "loss": 0.36814382672309875, "memory(GiB)": 78.33, "step": 3426, "token_acc": 0.8914054979142154, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.6640507678147556, "grad_norm": 0.10928687453269958, "learning_rate": 8.347354449930611e-05, "loss": 0.35287556052207947, "memory(GiB)": 78.33, "step": 3427, "token_acc": 0.8944273647136077, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.6642445381000823, "grad_norm": 0.10338309407234192, "learning_rate": 8.338739801129611e-05, "loss": 0.3381505608558655, "memory(GiB)": 78.33, "step": 3428, "token_acc": 0.8976415225684292, "train_speed(iter/s)": 0.032514 }, { "epoch": 0.6644383083854091, "grad_norm": 0.09947198629379272, "learning_rate": 8.330127888287165e-05, "loss": 0.3421975374221802, "memory(GiB)": 78.33, "step": 3429, "token_acc": 0.8979885810067572, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.6646320786707358, "grad_norm": 0.11111465096473694, "learning_rate": 8.321518714940434e-05, "loss": 0.35146766901016235, "memory(GiB)": 78.33, "step": 3430, "token_acc": 0.8957091043216328, "train_speed(iter/s)": 0.032516 }, { "epoch": 0.6648258489560626, "grad_norm": 0.09252513200044632, "learning_rate": 8.312912284625412e-05, "loss": 0.3317872881889343, "memory(GiB)": 78.33, "step": 3431, "token_acc": 0.8990752267472879, "train_speed(iter/s)": 0.032516 }, { "epoch": 0.6650196192413893, "grad_norm": 0.09670988470315933, "learning_rate": 8.30430860087701e-05, "loss": 0.3309570848941803, "memory(GiB)": 78.33, "step": 3432, "token_acc": 0.9016609407692506, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.665213389526716, "grad_norm": 0.10486641526222229, "learning_rate": 8.295707667228987e-05, "loss": 0.33131322264671326, "memory(GiB)": 78.33, "step": 3433, "token_acc": 0.9014130083054687, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.6654071598120428, "grad_norm": 0.09246989339590073, "learning_rate": 8.287109487213974e-05, "loss": 0.3162573575973511, "memory(GiB)": 78.33, "step": 3434, "token_acc": 0.9054944021074419, "train_speed(iter/s)": 0.032519 }, { "epoch": 0.6656009300973695, "grad_norm": 0.08992563188076019, "learning_rate": 8.278514064363477e-05, "loss": 0.31209853291511536, "memory(GiB)": 78.33, "step": 3435, "token_acc": 0.9090142626291013, "train_speed(iter/s)": 0.032519 }, { "epoch": 0.6657947003826963, "grad_norm": 0.102995365858078, "learning_rate": 8.269921402207863e-05, "loss": 0.3250240385532379, "memory(GiB)": 78.33, "step": 3436, "token_acc": 0.9044458079991075, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.665988470668023, "grad_norm": 0.09776205569505692, "learning_rate": 8.261331504276378e-05, "loss": 0.31458067893981934, "memory(GiB)": 78.33, "step": 3437, "token_acc": 0.9068092088396656, "train_speed(iter/s)": 0.032521 }, { "epoch": 0.6661822409533498, "grad_norm": 0.12002206593751907, "learning_rate": 8.25274437409712e-05, "loss": 0.4206904470920563, "memory(GiB)": 78.33, "step": 3438, "token_acc": 0.8826992485651737, "train_speed(iter/s)": 0.032522 }, { "epoch": 0.6663760112386765, "grad_norm": 0.09399055689573288, "learning_rate": 8.244160015197054e-05, "loss": 0.3172173798084259, "memory(GiB)": 78.33, "step": 3439, "token_acc": 0.9048228875854827, "train_speed(iter/s)": 0.032522 }, { "epoch": 0.6665697815240033, "grad_norm": 0.10963206738233566, "learning_rate": 8.235578431102004e-05, "loss": 0.357374906539917, "memory(GiB)": 78.33, "step": 3440, "token_acc": 0.8945577178923919, "train_speed(iter/s)": 0.032523 }, { "epoch": 0.66676355180933, "grad_norm": 0.09495268017053604, "learning_rate": 8.226999625336662e-05, "loss": 0.3170054852962494, "memory(GiB)": 78.33, "step": 3441, "token_acc": 0.9031417624521073, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.6669573220946567, "grad_norm": 0.10406588762998581, "learning_rate": 8.21842360142457e-05, "loss": 0.3348080813884735, "memory(GiB)": 78.33, "step": 3442, "token_acc": 0.8991230567735322, "train_speed(iter/s)": 0.032524 }, { "epoch": 0.6671510923799835, "grad_norm": 0.0943104475736618, "learning_rate": 8.209850362888126e-05, "loss": 0.3240373432636261, "memory(GiB)": 78.33, "step": 3443, "token_acc": 0.9041046505639388, "train_speed(iter/s)": 0.032525 }, { "epoch": 0.6673448626653102, "grad_norm": 0.10466547310352325, "learning_rate": 8.201279913248606e-05, "loss": 0.35648593306541443, "memory(GiB)": 78.33, "step": 3444, "token_acc": 0.8950629795859273, "train_speed(iter/s)": 0.032526 }, { "epoch": 0.667538632950637, "grad_norm": 0.1078016608953476, "learning_rate": 8.192712256026111e-05, "loss": 0.3920597434043884, "memory(GiB)": 78.33, "step": 3445, "token_acc": 0.8864499320972257, "train_speed(iter/s)": 0.032527 }, { "epoch": 0.6677324032359637, "grad_norm": 0.0912395566701889, "learning_rate": 8.18414739473961e-05, "loss": 0.3125567138195038, "memory(GiB)": 78.33, "step": 3446, "token_acc": 0.9068691030238295, "train_speed(iter/s)": 0.032528 }, { "epoch": 0.6679261735212905, "grad_norm": 0.10371241718530655, "learning_rate": 8.175585332906928e-05, "loss": 0.35416972637176514, "memory(GiB)": 78.33, "step": 3447, "token_acc": 0.8940065267570507, "train_speed(iter/s)": 0.032528 }, { "epoch": 0.6681199438066172, "grad_norm": 0.1040647029876709, "learning_rate": 8.167026074044719e-05, "loss": 0.34062814712524414, "memory(GiB)": 78.33, "step": 3448, "token_acc": 0.8997950416935733, "train_speed(iter/s)": 0.032529 }, { "epoch": 0.668313714091944, "grad_norm": 0.1095561683177948, "learning_rate": 8.158469621668522e-05, "loss": 0.35409224033355713, "memory(GiB)": 78.33, "step": 3449, "token_acc": 0.8959530893440019, "train_speed(iter/s)": 0.03253 }, { "epoch": 0.6685074843772707, "grad_norm": 0.09262026101350784, "learning_rate": 8.149915979292683e-05, "loss": 0.3020830750465393, "memory(GiB)": 78.33, "step": 3450, "token_acc": 0.9076835711816701, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.6687012546625974, "grad_norm": 0.109112448990345, "learning_rate": 8.141365150430421e-05, "loss": 0.3594379723072052, "memory(GiB)": 78.33, "step": 3451, "token_acc": 0.893763065114973, "train_speed(iter/s)": 0.032531 }, { "epoch": 0.6688950249479242, "grad_norm": 0.10545790940523148, "learning_rate": 8.132817138593792e-05, "loss": 0.339529812335968, "memory(GiB)": 78.33, "step": 3452, "token_acc": 0.8981471052038748, "train_speed(iter/s)": 0.032532 }, { "epoch": 0.6690887952332509, "grad_norm": 0.10486025363206863, "learning_rate": 8.124271947293695e-05, "loss": 0.3370751142501831, "memory(GiB)": 78.33, "step": 3453, "token_acc": 0.9002499368137269, "train_speed(iter/s)": 0.032533 }, { "epoch": 0.6692825655185777, "grad_norm": 0.10516992211341858, "learning_rate": 8.115729580039863e-05, "loss": 0.31734785437583923, "memory(GiB)": 78.33, "step": 3454, "token_acc": 0.9048377947651333, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.6694763358039044, "grad_norm": 0.09757381677627563, "learning_rate": 8.107190040340878e-05, "loss": 0.3384143114089966, "memory(GiB)": 78.33, "step": 3455, "token_acc": 0.8986423042761071, "train_speed(iter/s)": 0.032534 }, { "epoch": 0.6696701060892312, "grad_norm": 0.10361029207706451, "learning_rate": 8.09865333170417e-05, "loss": 0.36140722036361694, "memory(GiB)": 78.33, "step": 3456, "token_acc": 0.8917902428540726, "train_speed(iter/s)": 0.032535 }, { "epoch": 0.6698638763745579, "grad_norm": 0.10952255129814148, "learning_rate": 8.090119457635973e-05, "loss": 0.3548586666584015, "memory(GiB)": 78.33, "step": 3457, "token_acc": 0.8944674205396651, "train_speed(iter/s)": 0.032536 }, { "epoch": 0.6700576466598847, "grad_norm": 0.09695498645305634, "learning_rate": 8.081588421641399e-05, "loss": 0.34991052746772766, "memory(GiB)": 78.33, "step": 3458, "token_acc": 0.8968775374621687, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.6702514169452114, "grad_norm": 0.096768319606781, "learning_rate": 8.073060227224364e-05, "loss": 0.3430905044078827, "memory(GiB)": 78.33, "step": 3459, "token_acc": 0.8984231274638633, "train_speed(iter/s)": 0.032537 }, { "epoch": 0.6704451872305381, "grad_norm": 0.10539654642343521, "learning_rate": 8.064534877887625e-05, "loss": 0.3595712184906006, "memory(GiB)": 78.33, "step": 3460, "token_acc": 0.8929441752331867, "train_speed(iter/s)": 0.032538 }, { "epoch": 0.670638957515865, "grad_norm": 0.109794020652771, "learning_rate": 8.056012377132778e-05, "loss": 0.38728106021881104, "memory(GiB)": 78.33, "step": 3461, "token_acc": 0.8862599615278923, "train_speed(iter/s)": 0.032539 }, { "epoch": 0.6708327278011917, "grad_norm": 0.10039215534925461, "learning_rate": 8.047492728460232e-05, "loss": 0.3494102358818054, "memory(GiB)": 78.33, "step": 3462, "token_acc": 0.8965799655454828, "train_speed(iter/s)": 0.03254 }, { "epoch": 0.6710264980865185, "grad_norm": 0.11146795004606247, "learning_rate": 8.038975935369256e-05, "loss": 0.34334322810173035, "memory(GiB)": 78.33, "step": 3463, "token_acc": 0.8990417830644919, "train_speed(iter/s)": 0.032541 }, { "epoch": 0.6712202683718452, "grad_norm": 0.10396383702754974, "learning_rate": 8.030462001357903e-05, "loss": 0.3301684558391571, "memory(GiB)": 78.33, "step": 3464, "token_acc": 0.9044740024183797, "train_speed(iter/s)": 0.032541 }, { "epoch": 0.671414038657172, "grad_norm": 0.10532711446285248, "learning_rate": 8.02195092992309e-05, "loss": 0.3416173756122589, "memory(GiB)": 78.33, "step": 3465, "token_acc": 0.8965047796620201, "train_speed(iter/s)": 0.032542 }, { "epoch": 0.6716078089424987, "grad_norm": 0.10811363905668259, "learning_rate": 8.013442724560537e-05, "loss": 0.3821423649787903, "memory(GiB)": 78.33, "step": 3466, "token_acc": 0.8860308779126075, "train_speed(iter/s)": 0.032543 }, { "epoch": 0.6718015792278255, "grad_norm": 0.10519671440124512, "learning_rate": 8.004937388764793e-05, "loss": 0.36607247591018677, "memory(GiB)": 78.33, "step": 3467, "token_acc": 0.892551655187736, "train_speed(iter/s)": 0.032543 }, { "epoch": 0.6719953495131522, "grad_norm": 0.10614033788442612, "learning_rate": 7.996434926029227e-05, "loss": 0.36225974559783936, "memory(GiB)": 78.33, "step": 3468, "token_acc": 0.8928445644517492, "train_speed(iter/s)": 0.032544 }, { "epoch": 0.672189119798479, "grad_norm": 0.11390534043312073, "learning_rate": 7.987935339846025e-05, "loss": 0.3544899821281433, "memory(GiB)": 78.33, "step": 3469, "token_acc": 0.895393061480967, "train_speed(iter/s)": 0.032545 }, { "epoch": 0.6723828900838057, "grad_norm": 0.09529854357242584, "learning_rate": 7.979438633706206e-05, "loss": 0.3296326994895935, "memory(GiB)": 78.33, "step": 3470, "token_acc": 0.8999336332128899, "train_speed(iter/s)": 0.032546 }, { "epoch": 0.6725766603691324, "grad_norm": 0.09383497387170792, "learning_rate": 7.97094481109959e-05, "loss": 0.31189560890197754, "memory(GiB)": 78.33, "step": 3471, "token_acc": 0.9070854413313131, "train_speed(iter/s)": 0.032547 }, { "epoch": 0.6727704306544592, "grad_norm": 0.10056065768003464, "learning_rate": 7.962453875514821e-05, "loss": 0.3454468250274658, "memory(GiB)": 78.33, "step": 3472, "token_acc": 0.8972385445115727, "train_speed(iter/s)": 0.032547 }, { "epoch": 0.6729642009397859, "grad_norm": 0.09275511652231216, "learning_rate": 7.953965830439349e-05, "loss": 0.3189311623573303, "memory(GiB)": 78.33, "step": 3473, "token_acc": 0.9043669819878191, "train_speed(iter/s)": 0.032548 }, { "epoch": 0.6731579712251127, "grad_norm": 0.09983082860708237, "learning_rate": 7.945480679359443e-05, "loss": 0.33363524079322815, "memory(GiB)": 78.33, "step": 3474, "token_acc": 0.901058171118153, "train_speed(iter/s)": 0.032549 }, { "epoch": 0.6733517415104394, "grad_norm": 0.3244771361351013, "learning_rate": 7.936998425760186e-05, "loss": 0.3636043071746826, "memory(GiB)": 78.33, "step": 3475, "token_acc": 0.8894176781990268, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.6735455117957662, "grad_norm": 0.10669998079538345, "learning_rate": 7.928519073125461e-05, "loss": 0.35683298110961914, "memory(GiB)": 78.33, "step": 3476, "token_acc": 0.8926932475117707, "train_speed(iter/s)": 0.03255 }, { "epoch": 0.6737392820810929, "grad_norm": 0.09464793652296066, "learning_rate": 7.920042624937976e-05, "loss": 0.3355555236339569, "memory(GiB)": 78.33, "step": 3477, "token_acc": 0.9, "train_speed(iter/s)": 0.032551 }, { "epoch": 0.6739330523664196, "grad_norm": 0.10269086062908173, "learning_rate": 7.911569084679229e-05, "loss": 0.357306569814682, "memory(GiB)": 78.33, "step": 3478, "token_acc": 0.8936895083236547, "train_speed(iter/s)": 0.032552 }, { "epoch": 0.6741268226517464, "grad_norm": 0.09656988829374313, "learning_rate": 7.903098455829535e-05, "loss": 0.32650524377822876, "memory(GiB)": 78.33, "step": 3479, "token_acc": 0.9009720369210662, "train_speed(iter/s)": 0.032552 }, { "epoch": 0.6743205929370731, "grad_norm": 0.10245799273252487, "learning_rate": 7.894630741868004e-05, "loss": 0.3412097096443176, "memory(GiB)": 78.33, "step": 3480, "token_acc": 0.8984286408553378, "train_speed(iter/s)": 0.032553 }, { "epoch": 0.6745143632223999, "grad_norm": 0.0951901376247406, "learning_rate": 7.88616594627255e-05, "loss": 0.31834906339645386, "memory(GiB)": 78.33, "step": 3481, "token_acc": 0.9045248313917842, "train_speed(iter/s)": 0.032554 }, { "epoch": 0.6747081335077266, "grad_norm": 0.09806448966264725, "learning_rate": 7.877704072519911e-05, "loss": 0.3509221076965332, "memory(GiB)": 78.33, "step": 3482, "token_acc": 0.8939818589482267, "train_speed(iter/s)": 0.032555 }, { "epoch": 0.6749019037930534, "grad_norm": 0.10363534837961197, "learning_rate": 7.869245124085581e-05, "loss": 0.3548724353313446, "memory(GiB)": 78.33, "step": 3483, "token_acc": 0.8933054967997143, "train_speed(iter/s)": 0.032555 }, { "epoch": 0.6750956740783801, "grad_norm": 0.10435101389884949, "learning_rate": 7.860789104443896e-05, "loss": 0.36100128293037415, "memory(GiB)": 78.33, "step": 3484, "token_acc": 0.8941173294457493, "train_speed(iter/s)": 0.032556 }, { "epoch": 0.6752894443637069, "grad_norm": 0.09948544204235077, "learning_rate": 7.852336017067964e-05, "loss": 0.32601821422576904, "memory(GiB)": 78.33, "step": 3485, "token_acc": 0.901149788196911, "train_speed(iter/s)": 0.032557 }, { "epoch": 0.6754832146490336, "grad_norm": 0.10575611889362335, "learning_rate": 7.843885865429693e-05, "loss": 0.36941444873809814, "memory(GiB)": 78.33, "step": 3486, "token_acc": 0.8913294131211806, "train_speed(iter/s)": 0.032557 }, { "epoch": 0.6756769849343603, "grad_norm": 0.10058741271495819, "learning_rate": 7.835438652999791e-05, "loss": 0.3273415267467499, "memory(GiB)": 78.33, "step": 3487, "token_acc": 0.9005714607437435, "train_speed(iter/s)": 0.032558 }, { "epoch": 0.6758707552196871, "grad_norm": 0.09746947884559631, "learning_rate": 7.826994383247747e-05, "loss": 0.31902948021888733, "memory(GiB)": 78.33, "step": 3488, "token_acc": 0.9043132861435046, "train_speed(iter/s)": 0.032559 }, { "epoch": 0.6760645255050138, "grad_norm": 0.0960841104388237, "learning_rate": 7.818553059641867e-05, "loss": 0.32989510893821716, "memory(GiB)": 78.33, "step": 3489, "token_acc": 0.9024306199803854, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.6762582957903406, "grad_norm": 0.10085577517747879, "learning_rate": 7.810114685649207e-05, "loss": 0.3315393626689911, "memory(GiB)": 78.33, "step": 3490, "token_acc": 0.8987733391228832, "train_speed(iter/s)": 0.03256 }, { "epoch": 0.6764520660756673, "grad_norm": 0.1091570183634758, "learning_rate": 7.801679264735652e-05, "loss": 0.3644520044326782, "memory(GiB)": 78.33, "step": 3491, "token_acc": 0.8908445452104651, "train_speed(iter/s)": 0.032561 }, { "epoch": 0.6766458363609941, "grad_norm": 0.11124227195978165, "learning_rate": 7.793246800365848e-05, "loss": 0.36782822012901306, "memory(GiB)": 78.33, "step": 3492, "token_acc": 0.8893680254071851, "train_speed(iter/s)": 0.032562 }, { "epoch": 0.6768396066463208, "grad_norm": 0.11075468361377716, "learning_rate": 7.784817296003237e-05, "loss": 0.3444291353225708, "memory(GiB)": 78.33, "step": 3493, "token_acc": 0.8969756577329727, "train_speed(iter/s)": 0.032563 }, { "epoch": 0.6770333769316476, "grad_norm": 0.10527750104665756, "learning_rate": 7.776390755110041e-05, "loss": 0.36560899019241333, "memory(GiB)": 78.33, "step": 3494, "token_acc": 0.8897428155792825, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.6772271472169743, "grad_norm": 0.10618400573730469, "learning_rate": 7.767967181147265e-05, "loss": 0.35203975439071655, "memory(GiB)": 78.33, "step": 3495, "token_acc": 0.8978139451084018, "train_speed(iter/s)": 0.032564 }, { "epoch": 0.677420917502301, "grad_norm": 0.09277181327342987, "learning_rate": 7.759546577574708e-05, "loss": 0.3013547956943512, "memory(GiB)": 78.33, "step": 3496, "token_acc": 0.9089381973172872, "train_speed(iter/s)": 0.032565 }, { "epoch": 0.6776146877876278, "grad_norm": 0.09669741243124008, "learning_rate": 7.751128947850921e-05, "loss": 0.3188782334327698, "memory(GiB)": 78.33, "step": 3497, "token_acc": 0.9045035268583831, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.6778084580729545, "grad_norm": 0.09220936894416809, "learning_rate": 7.742714295433265e-05, "loss": 0.3000409007072449, "memory(GiB)": 78.33, "step": 3498, "token_acc": 0.9094853531300161, "train_speed(iter/s)": 0.032566 }, { "epoch": 0.6780022283582813, "grad_norm": 0.1013851910829544, "learning_rate": 7.734302623777857e-05, "loss": 0.35849493741989136, "memory(GiB)": 78.33, "step": 3499, "token_acc": 0.8939226809538817, "train_speed(iter/s)": 0.032567 }, { "epoch": 0.678195998643608, "grad_norm": 0.10033900290727615, "learning_rate": 7.7258939363396e-05, "loss": 0.3273484408855438, "memory(GiB)": 78.33, "step": 3500, "token_acc": 0.902882797731569, "train_speed(iter/s)": 0.032568 }, { "epoch": 0.678195998643608, "eval_loss": 0.3950374126434326, "eval_runtime": 1346.3478, "eval_samples_per_second": 5.013, "eval_steps_per_second": 5.013, "eval_token_acc": 0.8989412672013904, "step": 3500 }, { "epoch": 0.6783897689289348, "grad_norm": 0.09484906494617462, "learning_rate": 7.717488236572166e-05, "loss": 0.3013468384742737, "memory(GiB)": 78.33, "step": 3501, "token_acc": 0.9093957031354772, "train_speed(iter/s)": 0.032166 }, { "epoch": 0.6785835392142615, "grad_norm": 0.1061227023601532, "learning_rate": 7.709085527927994e-05, "loss": 0.3452511131763458, "memory(GiB)": 78.33, "step": 3502, "token_acc": 0.8964009033475315, "train_speed(iter/s)": 0.032166 }, { "epoch": 0.6787773094995883, "grad_norm": 0.10096945613622665, "learning_rate": 7.700685813858317e-05, "loss": 0.330334335565567, "memory(GiB)": 78.33, "step": 3503, "token_acc": 0.9003261023677868, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.678971079784915, "grad_norm": 0.09609825909137726, "learning_rate": 7.692289097813119e-05, "loss": 0.32408520579338074, "memory(GiB)": 78.33, "step": 3504, "token_acc": 0.9045769211614124, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.6791648500702417, "grad_norm": 0.08847975730895996, "learning_rate": 7.683895383241152e-05, "loss": 0.3026714622974396, "memory(GiB)": 78.33, "step": 3505, "token_acc": 0.9074179743223966, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.6793586203555685, "grad_norm": 0.10310973227024078, "learning_rate": 7.675504673589942e-05, "loss": 0.3564712107181549, "memory(GiB)": 78.33, "step": 3506, "token_acc": 0.8931326434619002, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.6795523906408952, "grad_norm": 0.09145442396402359, "learning_rate": 7.66711697230578e-05, "loss": 0.29558998346328735, "memory(GiB)": 78.33, "step": 3507, "token_acc": 0.9112166695232732, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.679746160926222, "grad_norm": 0.11162301152944565, "learning_rate": 7.65873228283372e-05, "loss": 0.37096238136291504, "memory(GiB)": 78.33, "step": 3508, "token_acc": 0.8897501419647927, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.6799399312115487, "grad_norm": 0.09270161390304565, "learning_rate": 7.650350608617573e-05, "loss": 0.30103829503059387, "memory(GiB)": 78.33, "step": 3509, "token_acc": 0.9094411328318193, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.6801337014968755, "grad_norm": 0.11339203268289566, "learning_rate": 7.641971953099932e-05, "loss": 0.36674901843070984, "memory(GiB)": 78.33, "step": 3510, "token_acc": 0.8918313357088867, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.6803274717822022, "grad_norm": 0.11586851626634598, "learning_rate": 7.633596319722123e-05, "loss": 0.3334874212741852, "memory(GiB)": 78.33, "step": 3511, "token_acc": 0.8987653673752917, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.680521242067529, "grad_norm": 0.09603973478078842, "learning_rate": 7.625223711924251e-05, "loss": 0.3322643041610718, "memory(GiB)": 78.33, "step": 3512, "token_acc": 0.9002838519764508, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.6807150123528557, "grad_norm": 0.09271606057882309, "learning_rate": 7.616854133145168e-05, "loss": 0.3315150737762451, "memory(GiB)": 78.33, "step": 3513, "token_acc": 0.9017941454202077, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.6809087826381824, "grad_norm": 0.10619889944791794, "learning_rate": 7.608487586822484e-05, "loss": 0.3216935396194458, "memory(GiB)": 78.33, "step": 3514, "token_acc": 0.9023508686672336, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.6811025529235092, "grad_norm": 0.10402272641658783, "learning_rate": 7.600124076392569e-05, "loss": 0.3527657687664032, "memory(GiB)": 78.33, "step": 3515, "token_acc": 0.8967574223559994, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.6812963232088359, "grad_norm": 0.09466518461704254, "learning_rate": 7.591763605290532e-05, "loss": 0.33050382137298584, "memory(GiB)": 78.33, "step": 3516, "token_acc": 0.9007306626354246, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.6814900934941627, "grad_norm": 0.10368410497903824, "learning_rate": 7.583406176950252e-05, "loss": 0.35686179995536804, "memory(GiB)": 78.33, "step": 3517, "token_acc": 0.8919963619827194, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.6816838637794894, "grad_norm": 0.1045493334531784, "learning_rate": 7.57505179480435e-05, "loss": 0.34543779492378235, "memory(GiB)": 78.33, "step": 3518, "token_acc": 0.89681269102023, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.6818776340648162, "grad_norm": 0.10913045704364777, "learning_rate": 7.56670046228419e-05, "loss": 0.3246757388114929, "memory(GiB)": 78.33, "step": 3519, "token_acc": 0.901501614470673, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.6820714043501429, "grad_norm": 0.10240820050239563, "learning_rate": 7.55835218281989e-05, "loss": 0.35042083263397217, "memory(GiB)": 78.33, "step": 3520, "token_acc": 0.8953667078178932, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.6822651746354697, "grad_norm": 0.08281584084033966, "learning_rate": 7.55000695984031e-05, "loss": 0.28523337841033936, "memory(GiB)": 78.33, "step": 3521, "token_acc": 0.9129096006250287, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.6824589449207964, "grad_norm": 0.10515667498111725, "learning_rate": 7.54166479677307e-05, "loss": 0.3321034014225006, "memory(GiB)": 78.33, "step": 3522, "token_acc": 0.8979174190441797, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.6826527152061231, "grad_norm": 0.0955941304564476, "learning_rate": 7.5333256970445e-05, "loss": 0.3286648988723755, "memory(GiB)": 78.33, "step": 3523, "token_acc": 0.9015049247537623, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.6828464854914499, "grad_norm": 0.09993550926446915, "learning_rate": 7.52498966407971e-05, "loss": 0.3626023232936859, "memory(GiB)": 78.33, "step": 3524, "token_acc": 0.8902110817941953, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.6830402557767766, "grad_norm": 0.0978488177061081, "learning_rate": 7.516656701302527e-05, "loss": 0.3279617726802826, "memory(GiB)": 78.33, "step": 3525, "token_acc": 0.9018437274393292, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.6832340260621034, "grad_norm": 0.11416416615247726, "learning_rate": 7.508326812135521e-05, "loss": 0.36074215173721313, "memory(GiB)": 78.33, "step": 3526, "token_acc": 0.891769873711183, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.6834277963474301, "grad_norm": 0.10453812032938004, "learning_rate": 7.500000000000002e-05, "loss": 0.3612178862094879, "memory(GiB)": 78.33, "step": 3527, "token_acc": 0.8947847093932073, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.6836215666327569, "grad_norm": 0.09948738664388657, "learning_rate": 7.49167626831601e-05, "loss": 0.34740930795669556, "memory(GiB)": 78.33, "step": 3528, "token_acc": 0.8971372214521804, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.6838153369180836, "grad_norm": 0.09419828653335571, "learning_rate": 7.483355620502344e-05, "loss": 0.3287207782268524, "memory(GiB)": 78.33, "step": 3529, "token_acc": 0.9001759014951627, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.6840091072034103, "grad_norm": 0.0952015146613121, "learning_rate": 7.475038059976492e-05, "loss": 0.3348086476325989, "memory(GiB)": 78.33, "step": 3530, "token_acc": 0.8985031033223805, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.6842028774887371, "grad_norm": 0.09707789123058319, "learning_rate": 7.466723590154719e-05, "loss": 0.32328489422798157, "memory(GiB)": 78.33, "step": 3531, "token_acc": 0.9013299458170223, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.6843966477740638, "grad_norm": 0.09463740885257721, "learning_rate": 7.458412214451992e-05, "loss": 0.3230316638946533, "memory(GiB)": 78.33, "step": 3532, "token_acc": 0.9013585495772858, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.6845904180593906, "grad_norm": 0.09360900521278381, "learning_rate": 7.450103936282022e-05, "loss": 0.3111358880996704, "memory(GiB)": 78.33, "step": 3533, "token_acc": 0.9069836922693888, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.6847841883447173, "grad_norm": 0.10580860823392868, "learning_rate": 7.441798759057238e-05, "loss": 0.3405246138572693, "memory(GiB)": 78.33, "step": 3534, "token_acc": 0.897554945054945, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.6849779586300441, "grad_norm": 0.10613741725683212, "learning_rate": 7.433496686188794e-05, "loss": 0.3461315929889679, "memory(GiB)": 78.33, "step": 3535, "token_acc": 0.8966271542167963, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.6851717289153708, "grad_norm": 0.1079440712928772, "learning_rate": 7.425197721086587e-05, "loss": 0.3699726462364197, "memory(GiB)": 78.33, "step": 3536, "token_acc": 0.8911746065279391, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.6853654992006976, "grad_norm": 0.09260550886392593, "learning_rate": 7.416901867159219e-05, "loss": 0.29420557618141174, "memory(GiB)": 78.33, "step": 3537, "token_acc": 0.9136413222721161, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.6855592694860243, "grad_norm": 0.10380339622497559, "learning_rate": 7.408609127814019e-05, "loss": 0.346148818731308, "memory(GiB)": 78.33, "step": 3538, "token_acc": 0.8991937710530675, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.685753039771351, "grad_norm": 0.09581930190324783, "learning_rate": 7.400319506457039e-05, "loss": 0.3182103633880615, "memory(GiB)": 78.33, "step": 3539, "token_acc": 0.9065670467951239, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.6859468100566778, "grad_norm": 0.11510851234197617, "learning_rate": 7.39203300649305e-05, "loss": 0.3609999716281891, "memory(GiB)": 78.33, "step": 3540, "token_acc": 0.892073988583549, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.6861405803420045, "grad_norm": 0.11120989918708801, "learning_rate": 7.383749631325538e-05, "loss": 0.3553025722503662, "memory(GiB)": 78.33, "step": 3541, "token_acc": 0.8961922030825022, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.6863343506273313, "grad_norm": 0.1011735275387764, "learning_rate": 7.375469384356705e-05, "loss": 0.33759036660194397, "memory(GiB)": 78.33, "step": 3542, "token_acc": 0.8999340949033392, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.686528120912658, "grad_norm": 0.11077761650085449, "learning_rate": 7.367192268987479e-05, "loss": 0.38028085231781006, "memory(GiB)": 78.33, "step": 3543, "token_acc": 0.8891689052590767, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.6867218911979848, "grad_norm": 0.10262443870306015, "learning_rate": 7.35891828861749e-05, "loss": 0.35626545548439026, "memory(GiB)": 78.33, "step": 3544, "token_acc": 0.8935800392486684, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.6869156614833115, "grad_norm": 0.09497423470020294, "learning_rate": 7.350647446645084e-05, "loss": 0.3267231583595276, "memory(GiB)": 78.33, "step": 3545, "token_acc": 0.9022584504935687, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.6871094317686383, "grad_norm": 0.09908232092857361, "learning_rate": 7.342379746467317e-05, "loss": 0.3486153185367584, "memory(GiB)": 78.33, "step": 3546, "token_acc": 0.8962997272399773, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.687303202053965, "grad_norm": 0.10710590332746506, "learning_rate": 7.334115191479958e-05, "loss": 0.35125356912612915, "memory(GiB)": 78.33, "step": 3547, "token_acc": 0.8947753459732976, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.6874969723392917, "grad_norm": 0.09003735333681107, "learning_rate": 7.325853785077478e-05, "loss": 0.3048159182071686, "memory(GiB)": 78.33, "step": 3548, "token_acc": 0.907744732028764, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.6876907426246185, "grad_norm": 0.11373089253902435, "learning_rate": 7.317595530653055e-05, "loss": 0.3735848367214203, "memory(GiB)": 78.33, "step": 3549, "token_acc": 0.8891771564011403, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.6878845129099452, "grad_norm": 0.10183597356081009, "learning_rate": 7.30934043159859e-05, "loss": 0.3513960838317871, "memory(GiB)": 78.33, "step": 3550, "token_acc": 0.8948632459312839, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.688078283195272, "grad_norm": 0.10609740763902664, "learning_rate": 7.301088491304664e-05, "loss": 0.32522451877593994, "memory(GiB)": 78.33, "step": 3551, "token_acc": 0.9038668263767314, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.6882720534805987, "grad_norm": 0.10257891565561295, "learning_rate": 7.292839713160572e-05, "loss": 0.34625405073165894, "memory(GiB)": 78.33, "step": 3552, "token_acc": 0.8976674404121611, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.6884658237659255, "grad_norm": 0.09951046854257584, "learning_rate": 7.28459410055431e-05, "loss": 0.32491254806518555, "memory(GiB)": 78.33, "step": 3553, "token_acc": 0.9030543047794716, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.6886595940512522, "grad_norm": 0.10017047822475433, "learning_rate": 7.276351656872567e-05, "loss": 0.3391422927379608, "memory(GiB)": 78.33, "step": 3554, "token_acc": 0.8989633042619714, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.688853364336579, "grad_norm": 0.09386296570301056, "learning_rate": 7.268112385500751e-05, "loss": 0.3156377077102661, "memory(GiB)": 78.33, "step": 3555, "token_acc": 0.9045845939027228, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.6890471346219057, "grad_norm": 0.0977504625916481, "learning_rate": 7.259876289822932e-05, "loss": 0.2982153594493866, "memory(GiB)": 78.33, "step": 3556, "token_acc": 0.9096926576077448, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.6892409049072324, "grad_norm": 0.10796993225812912, "learning_rate": 7.25164337322191e-05, "loss": 0.33437347412109375, "memory(GiB)": 78.33, "step": 3557, "token_acc": 0.9011005762019451, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.6894346751925592, "grad_norm": 0.09524484723806381, "learning_rate": 7.243413639079164e-05, "loss": 0.3119055926799774, "memory(GiB)": 78.33, "step": 3558, "token_acc": 0.9061108966936833, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.6896284454778859, "grad_norm": 0.09212891757488251, "learning_rate": 7.235187090774861e-05, "loss": 0.31174278259277344, "memory(GiB)": 78.33, "step": 3559, "token_acc": 0.9064973268274461, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.6898222157632127, "grad_norm": 0.09447701275348663, "learning_rate": 7.22696373168787e-05, "loss": 0.3395736515522003, "memory(GiB)": 78.33, "step": 3560, "token_acc": 0.899402390438247, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.6900159860485394, "grad_norm": 0.09689807891845703, "learning_rate": 7.218743565195736e-05, "loss": 0.33678123354911804, "memory(GiB)": 78.33, "step": 3561, "token_acc": 0.8996187046549806, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.6902097563338662, "grad_norm": 0.10127212852239609, "learning_rate": 7.210526594674724e-05, "loss": 0.35242077708244324, "memory(GiB)": 78.33, "step": 3562, "token_acc": 0.8943310420532657, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.6904035266191929, "grad_norm": 0.0959462970495224, "learning_rate": 7.202312823499738e-05, "loss": 0.30481332540512085, "memory(GiB)": 78.33, "step": 3563, "token_acc": 0.9080480651942592, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.6905972969045197, "grad_norm": 0.09734071046113968, "learning_rate": 7.194102255044415e-05, "loss": 0.31086277961730957, "memory(GiB)": 78.33, "step": 3564, "token_acc": 0.9077373688064437, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.6907910671898464, "grad_norm": 0.10370344668626785, "learning_rate": 7.185894892681048e-05, "loss": 0.3449724316596985, "memory(GiB)": 78.33, "step": 3565, "token_acc": 0.8976901609168102, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.6909848374751731, "grad_norm": 0.10260160267353058, "learning_rate": 7.17769073978062e-05, "loss": 0.3641132712364197, "memory(GiB)": 78.33, "step": 3566, "token_acc": 0.8912745666262711, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.6911786077604999, "grad_norm": 0.10054480284452438, "learning_rate": 7.169489799712799e-05, "loss": 0.3622581660747528, "memory(GiB)": 78.33, "step": 3567, "token_acc": 0.8936806148590948, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.6913723780458266, "grad_norm": 0.0974610224366188, "learning_rate": 7.161292075845926e-05, "loss": 0.3504323363304138, "memory(GiB)": 78.33, "step": 3568, "token_acc": 0.8961522654885441, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.6915661483311534, "grad_norm": 0.10981731861829758, "learning_rate": 7.153097571547038e-05, "loss": 0.3665009140968323, "memory(GiB)": 78.33, "step": 3569, "token_acc": 0.8888823146559375, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.6917599186164801, "grad_norm": 0.10058823972940445, "learning_rate": 7.144906290181832e-05, "loss": 0.3594769239425659, "memory(GiB)": 78.33, "step": 3570, "token_acc": 0.8944776275804255, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.6919536889018069, "grad_norm": 0.11253203451633453, "learning_rate": 7.136718235114686e-05, "loss": 0.36248651146888733, "memory(GiB)": 78.33, "step": 3571, "token_acc": 0.8918202731428905, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.6921474591871336, "grad_norm": 0.11419709771871567, "learning_rate": 7.128533409708656e-05, "loss": 0.3548899292945862, "memory(GiB)": 78.33, "step": 3572, "token_acc": 0.8954283004658169, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.6923412294724604, "grad_norm": 0.10111390799283981, "learning_rate": 7.120351817325469e-05, "loss": 0.33687660098075867, "memory(GiB)": 78.33, "step": 3573, "token_acc": 0.8996950958880688, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.6925349997577871, "grad_norm": 0.10509663820266724, "learning_rate": 7.112173461325525e-05, "loss": 0.36578071117401123, "memory(GiB)": 78.33, "step": 3574, "token_acc": 0.889935521532624, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.6927287700431138, "grad_norm": 0.11243990808725357, "learning_rate": 7.10399834506789e-05, "loss": 0.3173210620880127, "memory(GiB)": 78.33, "step": 3575, "token_acc": 0.9059147392523605, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.6929225403284406, "grad_norm": 0.10139571875333786, "learning_rate": 7.095826471910313e-05, "loss": 0.3282420039176941, "memory(GiB)": 78.33, "step": 3576, "token_acc": 0.902722545120832, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.6931163106137673, "grad_norm": 0.0943324863910675, "learning_rate": 7.087657845209196e-05, "loss": 0.32712027430534363, "memory(GiB)": 78.33, "step": 3577, "token_acc": 0.9013381123058543, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.6933100808990941, "grad_norm": 0.13730654120445251, "learning_rate": 7.079492468319618e-05, "loss": 0.3298265337944031, "memory(GiB)": 78.33, "step": 3578, "token_acc": 0.901703923714241, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.6935038511844208, "grad_norm": 0.10465666651725769, "learning_rate": 7.071330344595314e-05, "loss": 0.34704989194869995, "memory(GiB)": 78.33, "step": 3579, "token_acc": 0.8968518114360541, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.6936976214697476, "grad_norm": 0.10772595554590225, "learning_rate": 7.063171477388688e-05, "loss": 0.35015714168548584, "memory(GiB)": 78.33, "step": 3580, "token_acc": 0.894221193443329, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.6938913917550743, "grad_norm": 0.10779722779989243, "learning_rate": 7.055015870050809e-05, "loss": 0.3421753942966461, "memory(GiB)": 78.33, "step": 3581, "token_acc": 0.8975127263354598, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.694085162040401, "grad_norm": 0.09361686557531357, "learning_rate": 7.046863525931395e-05, "loss": 0.3193609416484833, "memory(GiB)": 78.33, "step": 3582, "token_acc": 0.9052085463095482, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.6942789323257279, "grad_norm": 0.10432402044534683, "learning_rate": 7.038714448378846e-05, "loss": 0.3353957235813141, "memory(GiB)": 78.33, "step": 3583, "token_acc": 0.8987451070688465, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.6944727026110546, "grad_norm": 0.09659415483474731, "learning_rate": 7.030568640740201e-05, "loss": 0.3202442526817322, "memory(GiB)": 78.33, "step": 3584, "token_acc": 0.9025401069518717, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.6946664728963814, "grad_norm": 0.09830014407634735, "learning_rate": 7.022426106361163e-05, "loss": 0.31335949897766113, "memory(GiB)": 78.33, "step": 3585, "token_acc": 0.906624622120296, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.6948602431817081, "grad_norm": 0.09905708581209183, "learning_rate": 7.014286848586088e-05, "loss": 0.3123447895050049, "memory(GiB)": 78.33, "step": 3586, "token_acc": 0.9041099546303709, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.6950540134670349, "grad_norm": 0.09144581854343414, "learning_rate": 7.006150870757989e-05, "loss": 0.3135398030281067, "memory(GiB)": 78.33, "step": 3587, "token_acc": 0.9057776084938225, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.6952477837523616, "grad_norm": 0.10966409742832184, "learning_rate": 6.99801817621853e-05, "loss": 0.3710702657699585, "memory(GiB)": 78.33, "step": 3588, "token_acc": 0.8925160936022211, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.6954415540376884, "grad_norm": 0.09838887304067612, "learning_rate": 6.989888768308024e-05, "loss": 0.3230520486831665, "memory(GiB)": 78.33, "step": 3589, "token_acc": 0.901048865761444, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.6956353243230151, "grad_norm": 0.0968567356467247, "learning_rate": 6.981762650365443e-05, "loss": 0.34151309728622437, "memory(GiB)": 78.33, "step": 3590, "token_acc": 0.8988749172733289, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.6958290946083419, "grad_norm": 0.09684547781944275, "learning_rate": 6.973639825728401e-05, "loss": 0.3080379366874695, "memory(GiB)": 78.33, "step": 3591, "token_acc": 0.9065901444686325, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.6960228648936686, "grad_norm": 0.09826657176017761, "learning_rate": 6.965520297733161e-05, "loss": 0.3160586953163147, "memory(GiB)": 78.33, "step": 3592, "token_acc": 0.9051656151419558, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.6962166351789953, "grad_norm": 0.10795781761407852, "learning_rate": 6.957404069714629e-05, "loss": 0.3557586073875427, "memory(GiB)": 78.33, "step": 3593, "token_acc": 0.8951461742901105, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.6964104054643221, "grad_norm": 0.10063203424215317, "learning_rate": 6.949291145006353e-05, "loss": 0.31674492359161377, "memory(GiB)": 78.33, "step": 3594, "token_acc": 0.9046947410840218, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.6966041757496488, "grad_norm": 0.10855990648269653, "learning_rate": 6.941181526940546e-05, "loss": 0.3493427634239197, "memory(GiB)": 78.33, "step": 3595, "token_acc": 0.895584936843348, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.6967979460349756, "grad_norm": 0.09571418911218643, "learning_rate": 6.933075218848022e-05, "loss": 0.3080763518810272, "memory(GiB)": 78.33, "step": 3596, "token_acc": 0.9050366723085603, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.6969917163203023, "grad_norm": 0.0985865592956543, "learning_rate": 6.924972224058278e-05, "loss": 0.3179894983768463, "memory(GiB)": 78.33, "step": 3597, "token_acc": 0.9029965847747053, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.6971854866056291, "grad_norm": 0.10527476668357849, "learning_rate": 6.916872545899427e-05, "loss": 0.34273964166641235, "memory(GiB)": 78.33, "step": 3598, "token_acc": 0.8985260287978188, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.6973792568909558, "grad_norm": 0.10250242054462433, "learning_rate": 6.908776187698222e-05, "loss": 0.34843844175338745, "memory(GiB)": 78.33, "step": 3599, "token_acc": 0.8955212090892335, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.6975730271762826, "grad_norm": 0.10623464733362198, "learning_rate": 6.900683152780059e-05, "loss": 0.34149301052093506, "memory(GiB)": 78.33, "step": 3600, "token_acc": 0.8972149929701756, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.6977667974616093, "grad_norm": 0.10202284902334213, "learning_rate": 6.892593444468954e-05, "loss": 0.34131869673728943, "memory(GiB)": 78.33, "step": 3601, "token_acc": 0.8982538616521155, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.697960567746936, "grad_norm": 0.24693740904331207, "learning_rate": 6.884507066087584e-05, "loss": 0.341753214597702, "memory(GiB)": 78.33, "step": 3602, "token_acc": 0.8990424814565071, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.6981543380322628, "grad_norm": 0.11156189441680908, "learning_rate": 6.87642402095723e-05, "loss": 0.3889864385128021, "memory(GiB)": 78.33, "step": 3603, "token_acc": 0.885625468967735, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.6983481083175895, "grad_norm": 0.10035014897584915, "learning_rate": 6.868344312397823e-05, "loss": 0.3296211063861847, "memory(GiB)": 78.33, "step": 3604, "token_acc": 0.9014160142449337, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.6985418786029163, "grad_norm": 0.09430637955665588, "learning_rate": 6.860267943727912e-05, "loss": 0.31993281841278076, "memory(GiB)": 78.33, "step": 3605, "token_acc": 0.9017905151091515, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.698735648888243, "grad_norm": 0.09148237109184265, "learning_rate": 6.852194918264679e-05, "loss": 0.3246019780635834, "memory(GiB)": 78.33, "step": 3606, "token_acc": 0.9024793779101373, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.6989294191735698, "grad_norm": 0.10248222947120667, "learning_rate": 6.844125239323933e-05, "loss": 0.33933860063552856, "memory(GiB)": 78.33, "step": 3607, "token_acc": 0.8989493117584775, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.6991231894588965, "grad_norm": 0.09796436131000519, "learning_rate": 6.836058910220102e-05, "loss": 0.3447071611881256, "memory(GiB)": 78.33, "step": 3608, "token_acc": 0.8983826213034023, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.6993169597442233, "grad_norm": 0.10065195709466934, "learning_rate": 6.827995934266259e-05, "loss": 0.35696181654930115, "memory(GiB)": 78.33, "step": 3609, "token_acc": 0.8937146709536897, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.69951073002955, "grad_norm": 0.08694633841514587, "learning_rate": 6.819936314774074e-05, "loss": 0.3089888095855713, "memory(GiB)": 78.33, "step": 3610, "token_acc": 0.9059485912481589, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.6997045003148767, "grad_norm": 0.11966582387685776, "learning_rate": 6.81188005505385e-05, "loss": 0.3933137357234955, "memory(GiB)": 78.33, "step": 3611, "token_acc": 0.8836271567891973, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.6998982706002035, "grad_norm": 0.11115437746047974, "learning_rate": 6.803827158414512e-05, "loss": 0.36456286907196045, "memory(GiB)": 78.33, "step": 3612, "token_acc": 0.8943506748392162, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.7000920408855302, "grad_norm": 0.09643102437257767, "learning_rate": 6.795777628163599e-05, "loss": 0.3220784366130829, "memory(GiB)": 78.33, "step": 3613, "token_acc": 0.9028170546008266, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.700285811170857, "grad_norm": 0.09812808781862259, "learning_rate": 6.78773146760727e-05, "loss": 0.3644194006919861, "memory(GiB)": 78.33, "step": 3614, "token_acc": 0.8943334712050802, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.7004795814561837, "grad_norm": 0.08493607491254807, "learning_rate": 6.779688680050296e-05, "loss": 0.2795659899711609, "memory(GiB)": 78.33, "step": 3615, "token_acc": 0.9142221903464104, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.7006733517415105, "grad_norm": 0.10760082304477692, "learning_rate": 6.771649268796073e-05, "loss": 0.3571077883243561, "memory(GiB)": 78.33, "step": 3616, "token_acc": 0.8955498133792708, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.7008671220268372, "grad_norm": 0.10084082186222076, "learning_rate": 6.7636132371466e-05, "loss": 0.3274276554584503, "memory(GiB)": 78.33, "step": 3617, "token_acc": 0.9002256477176956, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.701060892312164, "grad_norm": 0.10685470700263977, "learning_rate": 6.755580588402492e-05, "loss": 0.34894925355911255, "memory(GiB)": 78.33, "step": 3618, "token_acc": 0.8940826727066817, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.7012546625974907, "grad_norm": 0.11531514674425125, "learning_rate": 6.74755132586297e-05, "loss": 0.40001946687698364, "memory(GiB)": 78.33, "step": 3619, "token_acc": 0.885813818463325, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.7014484328828174, "grad_norm": 0.1036171242594719, "learning_rate": 6.739525452825871e-05, "loss": 0.35469454526901245, "memory(GiB)": 78.33, "step": 3620, "token_acc": 0.8936814976351638, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.7016422031681442, "grad_norm": 0.10101023316383362, "learning_rate": 6.731502972587637e-05, "loss": 0.31730735301971436, "memory(GiB)": 78.33, "step": 3621, "token_acc": 0.9038396016249508, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.7018359734534709, "grad_norm": 0.10922136157751083, "learning_rate": 6.72348388844331e-05, "loss": 0.3624739944934845, "memory(GiB)": 78.33, "step": 3622, "token_acc": 0.8922535011220635, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.7020297437387977, "grad_norm": 0.09917715191841125, "learning_rate": 6.715468203686553e-05, "loss": 0.3212898373603821, "memory(GiB)": 78.33, "step": 3623, "token_acc": 0.9037098911227606, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.7022235140241244, "grad_norm": 0.1009821966290474, "learning_rate": 6.70745592160962e-05, "loss": 0.3310352563858032, "memory(GiB)": 78.33, "step": 3624, "token_acc": 0.9000106797671811, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.7024172843094512, "grad_norm": 0.1044231653213501, "learning_rate": 6.699447045503368e-05, "loss": 0.3453122079372406, "memory(GiB)": 78.33, "step": 3625, "token_acc": 0.8964209722297998, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.7026110545947779, "grad_norm": 0.10527423024177551, "learning_rate": 6.69144157865726e-05, "loss": 0.34135711193084717, "memory(GiB)": 78.33, "step": 3626, "token_acc": 0.9015418633837328, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.7028048248801047, "grad_norm": 0.10108046233654022, "learning_rate": 6.683439524359351e-05, "loss": 0.3644823431968689, "memory(GiB)": 78.33, "step": 3627, "token_acc": 0.8922698922698923, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.7029985951654314, "grad_norm": 0.10416973382234573, "learning_rate": 6.675440885896313e-05, "loss": 0.3607743978500366, "memory(GiB)": 78.33, "step": 3628, "token_acc": 0.8915753781950965, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.7031923654507581, "grad_norm": 0.09729216992855072, "learning_rate": 6.66744566655338e-05, "loss": 0.3052827715873718, "memory(GiB)": 78.33, "step": 3629, "token_acc": 0.907312711319753, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.7033861357360849, "grad_norm": 0.11203377693891525, "learning_rate": 6.659453869614426e-05, "loss": 0.35420122742652893, "memory(GiB)": 78.33, "step": 3630, "token_acc": 0.8935774567300815, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.7035799060214116, "grad_norm": 0.09870520234107971, "learning_rate": 6.651465498361885e-05, "loss": 0.34778672456741333, "memory(GiB)": 78.33, "step": 3631, "token_acc": 0.8956560099647083, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.7037736763067384, "grad_norm": 0.10209493339061737, "learning_rate": 6.643480556076796e-05, "loss": 0.3381112515926361, "memory(GiB)": 78.33, "step": 3632, "token_acc": 0.9000748406410488, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.7039674465920651, "grad_norm": 0.10792220383882523, "learning_rate": 6.635499046038794e-05, "loss": 0.3817068338394165, "memory(GiB)": 78.33, "step": 3633, "token_acc": 0.8868650435828074, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.7041612168773919, "grad_norm": 0.1084747463464737, "learning_rate": 6.627520971526088e-05, "loss": 0.36603009700775146, "memory(GiB)": 78.33, "step": 3634, "token_acc": 0.8899790989317232, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.7043549871627186, "grad_norm": 0.10230504721403122, "learning_rate": 6.619546335815503e-05, "loss": 0.3367150127887726, "memory(GiB)": 78.33, "step": 3635, "token_acc": 0.898678290999525, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.7045487574480453, "grad_norm": 0.0944862887263298, "learning_rate": 6.61157514218243e-05, "loss": 0.33380186557769775, "memory(GiB)": 78.33, "step": 3636, "token_acc": 0.9002516989680343, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.7047425277333721, "grad_norm": 0.1018281877040863, "learning_rate": 6.603607393900852e-05, "loss": 0.3347662687301636, "memory(GiB)": 78.33, "step": 3637, "token_acc": 0.8994067307187483, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.7049362980186988, "grad_norm": 0.10647895187139511, "learning_rate": 6.595643094243335e-05, "loss": 0.3350790739059448, "memory(GiB)": 78.33, "step": 3638, "token_acc": 0.9001211906239839, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.7051300683040256, "grad_norm": 0.10115791112184525, "learning_rate": 6.587682246481036e-05, "loss": 0.3523610234260559, "memory(GiB)": 78.33, "step": 3639, "token_acc": 0.8971207273951843, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.7053238385893523, "grad_norm": 0.10161978751420975, "learning_rate": 6.579724853883684e-05, "loss": 0.3423236012458801, "memory(GiB)": 78.33, "step": 3640, "token_acc": 0.90029210201101, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.7055176088746791, "grad_norm": 0.09845045208930969, "learning_rate": 6.571770919719592e-05, "loss": 0.32784488797187805, "memory(GiB)": 78.33, "step": 3641, "token_acc": 0.9032830751813925, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.7057113791600058, "grad_norm": 0.09614937752485275, "learning_rate": 6.563820447255663e-05, "loss": 0.33316075801849365, "memory(GiB)": 78.33, "step": 3642, "token_acc": 0.8991082090500266, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.7059051494453326, "grad_norm": 0.10282139480113983, "learning_rate": 6.555873439757366e-05, "loss": 0.32991480827331543, "memory(GiB)": 78.33, "step": 3643, "token_acc": 0.9022564374834086, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.7060989197306593, "grad_norm": 0.11456986516714096, "learning_rate": 6.547929900488749e-05, "loss": 0.36666494607925415, "memory(GiB)": 78.33, "step": 3644, "token_acc": 0.889168765743073, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.706292690015986, "grad_norm": 0.10417237877845764, "learning_rate": 6.539989832712439e-05, "loss": 0.36223533749580383, "memory(GiB)": 78.33, "step": 3645, "token_acc": 0.8901788429112419, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.7064864603013128, "grad_norm": 0.09668152034282684, "learning_rate": 6.532053239689631e-05, "loss": 0.32515013217926025, "memory(GiB)": 78.33, "step": 3646, "token_acc": 0.901520462181827, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.7066802305866395, "grad_norm": 0.10676920413970947, "learning_rate": 6.524120124680104e-05, "loss": 0.35744109749794006, "memory(GiB)": 78.33, "step": 3647, "token_acc": 0.8961123110151188, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.7068740008719663, "grad_norm": 0.10558585822582245, "learning_rate": 6.51619049094219e-05, "loss": 0.3541412353515625, "memory(GiB)": 78.33, "step": 3648, "token_acc": 0.8950543561476038, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.707067771157293, "grad_norm": 0.10470271855592728, "learning_rate": 6.508264341732815e-05, "loss": 0.36191800236701965, "memory(GiB)": 78.33, "step": 3649, "token_acc": 0.8917385076885658, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.7072615414426198, "grad_norm": 0.10651751607656479, "learning_rate": 6.500341680307457e-05, "loss": 0.34071311354637146, "memory(GiB)": 78.33, "step": 3650, "token_acc": 0.8990374215877136, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.7074553117279465, "grad_norm": 0.10043511539697647, "learning_rate": 6.492422509920167e-05, "loss": 0.32280540466308594, "memory(GiB)": 78.33, "step": 3651, "token_acc": 0.9044051318031472, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.7076490820132733, "grad_norm": 0.11648175865411758, "learning_rate": 6.484506833823559e-05, "loss": 0.36728546023368835, "memory(GiB)": 78.33, "step": 3652, "token_acc": 0.890401807018102, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.7078428522986, "grad_norm": 0.10505972057580948, "learning_rate": 6.476594655268814e-05, "loss": 0.3796302080154419, "memory(GiB)": 78.33, "step": 3653, "token_acc": 0.8889887273321353, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.7080366225839267, "grad_norm": 0.1008792519569397, "learning_rate": 6.468685977505676e-05, "loss": 0.31763315200805664, "memory(GiB)": 78.33, "step": 3654, "token_acc": 0.905323467447716, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.7082303928692535, "grad_norm": 0.09355062991380692, "learning_rate": 6.460780803782448e-05, "loss": 0.31916338205337524, "memory(GiB)": 78.33, "step": 3655, "token_acc": 0.9061786058790678, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.7084241631545802, "grad_norm": 0.10963544994592667, "learning_rate": 6.452879137346007e-05, "loss": 0.3544541895389557, "memory(GiB)": 78.33, "step": 3656, "token_acc": 0.8935794884272636, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.708617933439907, "grad_norm": 0.09566653519868851, "learning_rate": 6.444980981441775e-05, "loss": 0.3308408558368683, "memory(GiB)": 78.33, "step": 3657, "token_acc": 0.9017979944484733, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.7088117037252337, "grad_norm": 0.10379898548126221, "learning_rate": 6.437086339313735e-05, "loss": 0.32990795373916626, "memory(GiB)": 78.33, "step": 3658, "token_acc": 0.8986880466472303, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.7090054740105605, "grad_norm": 0.11237087845802307, "learning_rate": 6.429195214204428e-05, "loss": 0.3397267162799835, "memory(GiB)": 78.33, "step": 3659, "token_acc": 0.8990950108233576, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.7091992442958872, "grad_norm": 0.10298223793506622, "learning_rate": 6.421307609354957e-05, "loss": 0.32215797901153564, "memory(GiB)": 78.33, "step": 3660, "token_acc": 0.9034365070024715, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.709393014581214, "grad_norm": 0.11267991364002228, "learning_rate": 6.413423528004968e-05, "loss": 0.3442709445953369, "memory(GiB)": 78.33, "step": 3661, "token_acc": 0.8958262848706671, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.7095867848665407, "grad_norm": 0.09616924822330475, "learning_rate": 6.40554297339266e-05, "loss": 0.3290918469429016, "memory(GiB)": 78.33, "step": 3662, "token_acc": 0.9001304801670146, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.7097805551518674, "grad_norm": 0.09400229156017303, "learning_rate": 6.3976659487548e-05, "loss": 0.3193153440952301, "memory(GiB)": 78.33, "step": 3663, "token_acc": 0.9033483612785573, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.7099743254371942, "grad_norm": 0.0955566018819809, "learning_rate": 6.38979245732669e-05, "loss": 0.3172321915626526, "memory(GiB)": 78.33, "step": 3664, "token_acc": 0.905316643945966, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.7101680957225209, "grad_norm": 0.1006002202630043, "learning_rate": 6.381922502342182e-05, "loss": 0.31889939308166504, "memory(GiB)": 78.33, "step": 3665, "token_acc": 0.9029685900544152, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.7103618660078477, "grad_norm": 0.10536913573741913, "learning_rate": 6.37405608703368e-05, "loss": 0.3331918716430664, "memory(GiB)": 78.33, "step": 3666, "token_acc": 0.9007023825919295, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.7105556362931744, "grad_norm": 0.09231302887201309, "learning_rate": 6.366193214632123e-05, "loss": 0.32499587535858154, "memory(GiB)": 78.33, "step": 3667, "token_acc": 0.903846596533229, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.7107494065785012, "grad_norm": 0.1009543314576149, "learning_rate": 6.35833388836702e-05, "loss": 0.3330029845237732, "memory(GiB)": 78.33, "step": 3668, "token_acc": 0.9015039132436442, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.7109431768638279, "grad_norm": 0.10333767533302307, "learning_rate": 6.350478111466399e-05, "loss": 0.3277633786201477, "memory(GiB)": 78.33, "step": 3669, "token_acc": 0.9013294198895028, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.7111369471491547, "grad_norm": 0.12599217891693115, "learning_rate": 6.342625887156839e-05, "loss": 0.38569512963294983, "memory(GiB)": 78.33, "step": 3670, "token_acc": 0.8859112900889416, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.7113307174344814, "grad_norm": 0.10265874862670898, "learning_rate": 6.334777218663461e-05, "loss": 0.36148056387901306, "memory(GiB)": 78.33, "step": 3671, "token_acc": 0.8917856569870046, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.7115244877198081, "grad_norm": 0.1083201915025711, "learning_rate": 6.326932109209922e-05, "loss": 0.34301403164863586, "memory(GiB)": 78.33, "step": 3672, "token_acc": 0.8956853082741233, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.7117182580051349, "grad_norm": 0.11269722878932953, "learning_rate": 6.319090562018419e-05, "loss": 0.34771838784217834, "memory(GiB)": 78.33, "step": 3673, "token_acc": 0.8975308282074455, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.7119120282904616, "grad_norm": 0.1110716462135315, "learning_rate": 6.311252580309682e-05, "loss": 0.361664742231369, "memory(GiB)": 78.33, "step": 3674, "token_acc": 0.8944661718966526, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.7121057985757884, "grad_norm": 0.11600884050130844, "learning_rate": 6.303418167302994e-05, "loss": 0.38586270809173584, "memory(GiB)": 78.33, "step": 3675, "token_acc": 0.8842967627548681, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.7122995688611151, "grad_norm": 0.1069970428943634, "learning_rate": 6.295587326216149e-05, "loss": 0.34455546736717224, "memory(GiB)": 78.33, "step": 3676, "token_acc": 0.898117085260072, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.7124933391464419, "grad_norm": 0.10031075775623322, "learning_rate": 6.287760060265485e-05, "loss": 0.33782869577407837, "memory(GiB)": 78.33, "step": 3677, "token_acc": 0.8992902208201893, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.7126871094317686, "grad_norm": 0.10248647630214691, "learning_rate": 6.279936372665874e-05, "loss": 0.33945292234420776, "memory(GiB)": 78.33, "step": 3678, "token_acc": 0.8977860913237329, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.7128808797170954, "grad_norm": 0.10595156252384186, "learning_rate": 6.27211626663071e-05, "loss": 0.36181285977363586, "memory(GiB)": 78.33, "step": 3679, "token_acc": 0.8937350415317471, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.7130746500024221, "grad_norm": 0.09816795587539673, "learning_rate": 6.264299745371922e-05, "loss": 0.33072012662887573, "memory(GiB)": 78.33, "step": 3680, "token_acc": 0.9027671022290545, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.7132684202877488, "grad_norm": 0.09826884418725967, "learning_rate": 6.256486812099961e-05, "loss": 0.32538434863090515, "memory(GiB)": 78.33, "step": 3681, "token_acc": 0.901966256501332, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.7134621905730756, "grad_norm": 0.09893721342086792, "learning_rate": 6.248677470023819e-05, "loss": 0.31855225563049316, "memory(GiB)": 78.33, "step": 3682, "token_acc": 0.9033751256414326, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.7136559608584023, "grad_norm": 0.10306079685688019, "learning_rate": 6.240871722350998e-05, "loss": 0.3382839262485504, "memory(GiB)": 78.33, "step": 3683, "token_acc": 0.8988074461896451, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.7138497311437291, "grad_norm": 0.10051840543746948, "learning_rate": 6.233069572287527e-05, "loss": 0.31791266798973083, "memory(GiB)": 78.33, "step": 3684, "token_acc": 0.90387971539804, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.7140435014290558, "grad_norm": 0.11294718831777573, "learning_rate": 6.22527102303796e-05, "loss": 0.3825894892215729, "memory(GiB)": 78.33, "step": 3685, "token_acc": 0.8891299885640559, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.7142372717143826, "grad_norm": 0.0918063074350357, "learning_rate": 6.217476077805369e-05, "loss": 0.31590160727500916, "memory(GiB)": 78.33, "step": 3686, "token_acc": 0.9046886822386566, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.7144310419997093, "grad_norm": 0.09278496354818344, "learning_rate": 6.209684739791347e-05, "loss": 0.3343659043312073, "memory(GiB)": 78.33, "step": 3687, "token_acc": 0.8985444692228377, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.714624812285036, "grad_norm": 0.11030339449644089, "learning_rate": 6.201897012196005e-05, "loss": 0.33175480365753174, "memory(GiB)": 78.33, "step": 3688, "token_acc": 0.9004147113390811, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.7148185825703628, "grad_norm": 0.09746501594781876, "learning_rate": 6.194112898217978e-05, "loss": 0.3286242187023163, "memory(GiB)": 78.33, "step": 3689, "token_acc": 0.9018423513356415, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.7150123528556895, "grad_norm": 0.10385642945766449, "learning_rate": 6.186332401054406e-05, "loss": 0.3613872528076172, "memory(GiB)": 78.33, "step": 3690, "token_acc": 0.891545143039286, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.7152061231410163, "grad_norm": 0.10428830236196518, "learning_rate": 6.17855552390095e-05, "loss": 0.3543146252632141, "memory(GiB)": 78.33, "step": 3691, "token_acc": 0.8936885245901639, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.715399893426343, "grad_norm": 0.09830185770988464, "learning_rate": 6.170782269951783e-05, "loss": 0.2993859648704529, "memory(GiB)": 78.33, "step": 3692, "token_acc": 0.9107328114684523, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.7155936637116698, "grad_norm": 0.10561109334230423, "learning_rate": 6.163012642399587e-05, "loss": 0.3516858220100403, "memory(GiB)": 78.33, "step": 3693, "token_acc": 0.8974372357638708, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.7157874339969965, "grad_norm": 0.0973869115114212, "learning_rate": 6.155246644435558e-05, "loss": 0.3680126965045929, "memory(GiB)": 78.33, "step": 3694, "token_acc": 0.8894600330110823, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.7159812042823233, "grad_norm": 0.09466208517551422, "learning_rate": 6.147484279249396e-05, "loss": 0.32024601101875305, "memory(GiB)": 78.33, "step": 3695, "token_acc": 0.9047858675212543, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.71617497456765, "grad_norm": 0.10687924921512604, "learning_rate": 6.13972555002932e-05, "loss": 0.33103132247924805, "memory(GiB)": 78.33, "step": 3696, "token_acc": 0.9001981178801387, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.7163687448529767, "grad_norm": 0.09712281078100204, "learning_rate": 6.131970459962046e-05, "loss": 0.3220095634460449, "memory(GiB)": 78.33, "step": 3697, "token_acc": 0.9038578331704998, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.7165625151383035, "grad_norm": 0.11033158749341965, "learning_rate": 6.124219012232798e-05, "loss": 0.36744678020477295, "memory(GiB)": 78.33, "step": 3698, "token_acc": 0.8917987391739822, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.7167562854236302, "grad_norm": 0.09989314526319504, "learning_rate": 6.1164712100253e-05, "loss": 0.33150678873062134, "memory(GiB)": 78.33, "step": 3699, "token_acc": 0.8993894415495552, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.716950055708957, "grad_norm": 0.09984250366687775, "learning_rate": 6.108727056521783e-05, "loss": 0.30265843868255615, "memory(GiB)": 78.33, "step": 3700, "token_acc": 0.9069431182649025, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.7171438259942837, "grad_norm": 0.09416959434747696, "learning_rate": 6.100986554902988e-05, "loss": 0.3297763168811798, "memory(GiB)": 78.33, "step": 3701, "token_acc": 0.9030869331246025, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.7173375962796105, "grad_norm": 0.10124395787715912, "learning_rate": 6.0932497083481404e-05, "loss": 0.3468609154224396, "memory(GiB)": 78.33, "step": 3702, "token_acc": 0.8960674907226953, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.7175313665649372, "grad_norm": 0.10942904651165009, "learning_rate": 6.08551652003497e-05, "loss": 0.3687775135040283, "memory(GiB)": 78.33, "step": 3703, "token_acc": 0.890260192821466, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.7177251368502641, "grad_norm": 0.0992734357714653, "learning_rate": 6.077786993139706e-05, "loss": 0.34562569856643677, "memory(GiB)": 78.33, "step": 3704, "token_acc": 0.8980179107439656, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.7179189071355908, "grad_norm": 0.09762708097696304, "learning_rate": 6.070061130837074e-05, "loss": 0.3264696002006531, "memory(GiB)": 78.33, "step": 3705, "token_acc": 0.9032848106089475, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.7181126774209176, "grad_norm": 0.1083015501499176, "learning_rate": 6.0623389363002925e-05, "loss": 0.3502708375453949, "memory(GiB)": 78.33, "step": 3706, "token_acc": 0.8949533057084544, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.7183064477062443, "grad_norm": 0.1071147471666336, "learning_rate": 6.054620412701069e-05, "loss": 0.33522728085517883, "memory(GiB)": 78.33, "step": 3707, "token_acc": 0.89790950744559, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.718500217991571, "grad_norm": 0.08930321782827377, "learning_rate": 6.0469055632096186e-05, "loss": 0.2909185290336609, "memory(GiB)": 78.33, "step": 3708, "token_acc": 0.9108066749253079, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.7186939882768978, "grad_norm": 0.09592770040035248, "learning_rate": 6.039194390994632e-05, "loss": 0.3235243260860443, "memory(GiB)": 78.33, "step": 3709, "token_acc": 0.9030587138380254, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.7188877585622245, "grad_norm": 0.11995202302932739, "learning_rate": 6.031486899223295e-05, "loss": 0.39347177743911743, "memory(GiB)": 78.33, "step": 3710, "token_acc": 0.8830674846625767, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.7190815288475513, "grad_norm": 0.11326012015342712, "learning_rate": 6.0237830910612816e-05, "loss": 0.36819055676460266, "memory(GiB)": 78.33, "step": 3711, "token_acc": 0.8888051139157516, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.719275299132878, "grad_norm": 0.099938303232193, "learning_rate": 6.0160829696727535e-05, "loss": 0.3334549367427826, "memory(GiB)": 78.33, "step": 3712, "token_acc": 0.9019944353433705, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.7194690694182048, "grad_norm": 0.10359780490398407, "learning_rate": 6.008386538220357e-05, "loss": 0.3458460569381714, "memory(GiB)": 78.33, "step": 3713, "token_acc": 0.8952391251453135, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.7196628397035315, "grad_norm": 0.09753096848726273, "learning_rate": 6.0006937998652174e-05, "loss": 0.3544035255908966, "memory(GiB)": 78.33, "step": 3714, "token_acc": 0.8957159256940096, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.7198566099888583, "grad_norm": 0.10259803384542465, "learning_rate": 5.993004757766961e-05, "loss": 0.3598324954509735, "memory(GiB)": 78.33, "step": 3715, "token_acc": 0.8931450536543698, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.720050380274185, "grad_norm": 0.10409785062074661, "learning_rate": 5.9853194150836776e-05, "loss": 0.34401389956474304, "memory(GiB)": 78.33, "step": 3716, "token_acc": 0.8969972797656414, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.7202441505595117, "grad_norm": 0.10885158181190491, "learning_rate": 5.977637774971945e-05, "loss": 0.34587976336479187, "memory(GiB)": 78.33, "step": 3717, "token_acc": 0.8954475087437647, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.7204379208448385, "grad_norm": 0.0933261513710022, "learning_rate": 5.9699598405868184e-05, "loss": 0.30791398882865906, "memory(GiB)": 78.33, "step": 3718, "token_acc": 0.9086631971823582, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.7206316911301652, "grad_norm": 0.10228514671325684, "learning_rate": 5.962285615081831e-05, "loss": 0.32020220160484314, "memory(GiB)": 78.33, "step": 3719, "token_acc": 0.9048829981248876, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.720825461415492, "grad_norm": 0.10021097958087921, "learning_rate": 5.9546151016089935e-05, "loss": 0.3199518024921417, "memory(GiB)": 78.33, "step": 3720, "token_acc": 0.9031611499255526, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.7210192317008187, "grad_norm": 0.0932881161570549, "learning_rate": 5.946948303318788e-05, "loss": 0.30388596653938293, "memory(GiB)": 78.33, "step": 3721, "token_acc": 0.9094827586206896, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.7212130019861455, "grad_norm": 0.09003807604312897, "learning_rate": 5.939285223360185e-05, "loss": 0.3002552092075348, "memory(GiB)": 78.33, "step": 3722, "token_acc": 0.9080838912957857, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.7214067722714722, "grad_norm": 0.08356068283319473, "learning_rate": 5.931625864880612e-05, "loss": 0.2760453224182129, "memory(GiB)": 78.33, "step": 3723, "token_acc": 0.9180010863661053, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.721600542556799, "grad_norm": 0.09654924273490906, "learning_rate": 5.9239702310259726e-05, "loss": 0.3264361023902893, "memory(GiB)": 78.33, "step": 3724, "token_acc": 0.9010648901012432, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.7217943128421257, "grad_norm": 0.1332063376903534, "learning_rate": 5.916318324940643e-05, "loss": 0.3769557476043701, "memory(GiB)": 78.33, "step": 3725, "token_acc": 0.8894034827713968, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.7219880831274524, "grad_norm": 0.10913225263357162, "learning_rate": 5.9086701497674636e-05, "loss": 0.3281722962856293, "memory(GiB)": 78.33, "step": 3726, "token_acc": 0.9037930929929353, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.7221818534127792, "grad_norm": 0.0964721292257309, "learning_rate": 5.9010257086477465e-05, "loss": 0.31369680166244507, "memory(GiB)": 78.33, "step": 3727, "token_acc": 0.9058114664301737, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.7223756236981059, "grad_norm": 0.09561733901500702, "learning_rate": 5.893385004721265e-05, "loss": 0.31413954496383667, "memory(GiB)": 78.33, "step": 3728, "token_acc": 0.9042112486339492, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.7225693939834327, "grad_norm": 0.09478065371513367, "learning_rate": 5.885748041126273e-05, "loss": 0.32629698514938354, "memory(GiB)": 78.33, "step": 3729, "token_acc": 0.9058655593044381, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.7227631642687594, "grad_norm": 0.09354487806558609, "learning_rate": 5.8781148209994684e-05, "loss": 0.31036120653152466, "memory(GiB)": 78.33, "step": 3730, "token_acc": 0.9080553710987384, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.7229569345540862, "grad_norm": 0.10508932918310165, "learning_rate": 5.870485347476023e-05, "loss": 0.3312036693096161, "memory(GiB)": 78.33, "step": 3731, "token_acc": 0.9002311682453766, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.7231507048394129, "grad_norm": 0.09594292938709259, "learning_rate": 5.862859623689564e-05, "loss": 0.33407771587371826, "memory(GiB)": 78.33, "step": 3732, "token_acc": 0.8988324898531843, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.7233444751247396, "grad_norm": 0.09291541576385498, "learning_rate": 5.855237652772182e-05, "loss": 0.30694496631622314, "memory(GiB)": 78.33, "step": 3733, "token_acc": 0.9065133287402618, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.7235382454100664, "grad_norm": 0.10045888274908066, "learning_rate": 5.847619437854425e-05, "loss": 0.3402080535888672, "memory(GiB)": 78.33, "step": 3734, "token_acc": 0.8967975233486823, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.7237320156953931, "grad_norm": 0.10660912096500397, "learning_rate": 5.8400049820652944e-05, "loss": 0.36076852679252625, "memory(GiB)": 78.33, "step": 3735, "token_acc": 0.8916170432607337, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.7239257859807199, "grad_norm": 0.10676801949739456, "learning_rate": 5.8323942885322605e-05, "loss": 0.3120606243610382, "memory(GiB)": 78.33, "step": 3736, "token_acc": 0.907629155894226, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.7241195562660466, "grad_norm": 0.10707957297563553, "learning_rate": 5.8247873603812364e-05, "loss": 0.35654643177986145, "memory(GiB)": 78.33, "step": 3737, "token_acc": 0.8950555588218002, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.7243133265513734, "grad_norm": 0.11162468791007996, "learning_rate": 5.8171842007365906e-05, "loss": 0.3537403345108032, "memory(GiB)": 78.33, "step": 3738, "token_acc": 0.895508172545164, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.7245070968367001, "grad_norm": 0.10323680192232132, "learning_rate": 5.809584812721145e-05, "loss": 0.34856048226356506, "memory(GiB)": 78.33, "step": 3739, "token_acc": 0.8983748949285514, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.7247008671220269, "grad_norm": 0.1021660715341568, "learning_rate": 5.801989199456167e-05, "loss": 0.3216470181941986, "memory(GiB)": 78.33, "step": 3740, "token_acc": 0.9034610943704413, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.7248946374073536, "grad_norm": 0.09654397517442703, "learning_rate": 5.794397364061391e-05, "loss": 0.33012211322784424, "memory(GiB)": 78.33, "step": 3741, "token_acc": 0.9008311162385515, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.7250884076926803, "grad_norm": 0.11007391661405563, "learning_rate": 5.786809309654982e-05, "loss": 0.3086721897125244, "memory(GiB)": 78.33, "step": 3742, "token_acc": 0.9068431837791199, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.7252821779780071, "grad_norm": 0.10440944880247116, "learning_rate": 5.7792250393535575e-05, "loss": 0.3401702344417572, "memory(GiB)": 78.33, "step": 3743, "token_acc": 0.8993664717348928, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.7254759482633338, "grad_norm": 0.11093514412641525, "learning_rate": 5.771644556272181e-05, "loss": 0.3583501875400543, "memory(GiB)": 78.33, "step": 3744, "token_acc": 0.8940592570311325, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.7256697185486606, "grad_norm": 0.10289919376373291, "learning_rate": 5.7640678635243606e-05, "loss": 0.3268412947654724, "memory(GiB)": 78.33, "step": 3745, "token_acc": 0.9035391502660698, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.7258634888339873, "grad_norm": 0.095411017537117, "learning_rate": 5.756494964222047e-05, "loss": 0.32108423113822937, "memory(GiB)": 78.33, "step": 3746, "token_acc": 0.9025717749632419, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.7260572591193141, "grad_norm": 0.10212237387895584, "learning_rate": 5.748925861475631e-05, "loss": 0.35688862204551697, "memory(GiB)": 78.33, "step": 3747, "token_acc": 0.8916946508368809, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.7262510294046408, "grad_norm": 0.11504284292459488, "learning_rate": 5.741360558393953e-05, "loss": 0.3791845738887787, "memory(GiB)": 78.33, "step": 3748, "token_acc": 0.8880055524007824, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.7264447996899676, "grad_norm": 0.09927377104759216, "learning_rate": 5.733799058084284e-05, "loss": 0.33466947078704834, "memory(GiB)": 78.33, "step": 3749, "token_acc": 0.899494293530826, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.7266385699752943, "grad_norm": 0.09528510272502899, "learning_rate": 5.7262413636523343e-05, "loss": 0.3107728362083435, "memory(GiB)": 78.33, "step": 3750, "token_acc": 0.9060210094798873, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.726832340260621, "grad_norm": 0.09576904028654099, "learning_rate": 5.718687478202252e-05, "loss": 0.3216266930103302, "memory(GiB)": 78.33, "step": 3751, "token_acc": 0.9044459023011707, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.7270261105459478, "grad_norm": 0.10115578025579453, "learning_rate": 5.7111374048366204e-05, "loss": 0.3389231264591217, "memory(GiB)": 78.33, "step": 3752, "token_acc": 0.8973095364944059, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.7272198808312745, "grad_norm": 0.10029512643814087, "learning_rate": 5.703591146656458e-05, "loss": 0.31098711490631104, "memory(GiB)": 78.33, "step": 3753, "token_acc": 0.906193576626361, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.7274136511166013, "grad_norm": 0.10533425211906433, "learning_rate": 5.696048706761211e-05, "loss": 0.33651649951934814, "memory(GiB)": 78.33, "step": 3754, "token_acc": 0.8975294455616202, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.727607421401928, "grad_norm": 0.09996731579303741, "learning_rate": 5.688510088248772e-05, "loss": 0.31849730014801025, "memory(GiB)": 78.33, "step": 3755, "token_acc": 0.9020276110740658, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.7278011916872548, "grad_norm": 0.10525427013635635, "learning_rate": 5.6809752942154505e-05, "loss": 0.3365975618362427, "memory(GiB)": 78.33, "step": 3756, "token_acc": 0.8993686078063035, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.7279949619725815, "grad_norm": 0.11113554239273071, "learning_rate": 5.673444327755986e-05, "loss": 0.36573588848114014, "memory(GiB)": 78.33, "step": 3757, "token_acc": 0.8919042258663178, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.7281887322579083, "grad_norm": 0.11570943146944046, "learning_rate": 5.6659171919635504e-05, "loss": 0.3126344382762909, "memory(GiB)": 78.33, "step": 3758, "token_acc": 0.9062464954581138, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.728382502543235, "grad_norm": 0.11019326746463776, "learning_rate": 5.6583938899297404e-05, "loss": 0.3683561086654663, "memory(GiB)": 78.33, "step": 3759, "token_acc": 0.8913200775664959, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.7285762728285617, "grad_norm": 0.11105161160230637, "learning_rate": 5.650874424744579e-05, "loss": 0.34568020701408386, "memory(GiB)": 78.33, "step": 3760, "token_acc": 0.898310245090577, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.7287700431138885, "grad_norm": 0.10734663158655167, "learning_rate": 5.643358799496508e-05, "loss": 0.3574303090572357, "memory(GiB)": 78.33, "step": 3761, "token_acc": 0.8934692112234416, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.7289638133992152, "grad_norm": 0.1054970771074295, "learning_rate": 5.635847017272404e-05, "loss": 0.33780547976493835, "memory(GiB)": 78.33, "step": 3762, "token_acc": 0.898160262743318, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.729157583684542, "grad_norm": 0.10268279910087585, "learning_rate": 5.628339081157556e-05, "loss": 0.3292725682258606, "memory(GiB)": 78.33, "step": 3763, "token_acc": 0.9022685641820487, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.7293513539698687, "grad_norm": 0.10835665464401245, "learning_rate": 5.620834994235673e-05, "loss": 0.3525397479534149, "memory(GiB)": 78.33, "step": 3764, "token_acc": 0.8940980485483103, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.7295451242551955, "grad_norm": 0.09963172674179077, "learning_rate": 5.613334759588885e-05, "loss": 0.3277450501918793, "memory(GiB)": 78.33, "step": 3765, "token_acc": 0.9008741844812914, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.7297388945405222, "grad_norm": 0.10072033107280731, "learning_rate": 5.605838380297742e-05, "loss": 0.3499065041542053, "memory(GiB)": 78.33, "step": 3766, "token_acc": 0.8946452543131308, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.729932664825849, "grad_norm": 0.09813085943460464, "learning_rate": 5.5983458594412075e-05, "loss": 0.3212866187095642, "memory(GiB)": 78.33, "step": 3767, "token_acc": 0.9034384826047267, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.7301264351111757, "grad_norm": 0.1046958863735199, "learning_rate": 5.5908572000966545e-05, "loss": 0.3336329162120819, "memory(GiB)": 78.33, "step": 3768, "token_acc": 0.9022138594066974, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.7303202053965024, "grad_norm": 0.10364958643913269, "learning_rate": 5.583372405339888e-05, "loss": 0.3599414527416229, "memory(GiB)": 78.33, "step": 3769, "token_acc": 0.8923859525040124, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.7305139756818292, "grad_norm": 0.11301209777593613, "learning_rate": 5.5758914782451094e-05, "loss": 0.3293749690055847, "memory(GiB)": 78.33, "step": 3770, "token_acc": 0.9023539911864785, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.7307077459671559, "grad_norm": 0.10696696490049362, "learning_rate": 5.5684144218849364e-05, "loss": 0.3482256531715393, "memory(GiB)": 78.33, "step": 3771, "token_acc": 0.8958611481975968, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.7309015162524827, "grad_norm": 0.1035601943731308, "learning_rate": 5.5609412393303983e-05, "loss": 0.35395896434783936, "memory(GiB)": 78.33, "step": 3772, "token_acc": 0.8959904183792906, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.7310952865378094, "grad_norm": 0.10243549197912216, "learning_rate": 5.553471933650922e-05, "loss": 0.37264150381088257, "memory(GiB)": 78.33, "step": 3773, "token_acc": 0.8881558441558441, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.7312890568231362, "grad_norm": 0.10226655006408691, "learning_rate": 5.546006507914369e-05, "loss": 0.3363930284976959, "memory(GiB)": 78.33, "step": 3774, "token_acc": 0.8988527079466847, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.7314828271084629, "grad_norm": 0.11039669811725616, "learning_rate": 5.5385449651869815e-05, "loss": 0.34301939606666565, "memory(GiB)": 78.33, "step": 3775, "token_acc": 0.8971273781025576, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.7316765973937897, "grad_norm": 0.10423394292593002, "learning_rate": 5.531087308533414e-05, "loss": 0.3568193316459656, "memory(GiB)": 78.33, "step": 3776, "token_acc": 0.8942443919716647, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.7318703676791164, "grad_norm": 0.11283637583255768, "learning_rate": 5.52363354101673e-05, "loss": 0.36575639247894287, "memory(GiB)": 78.33, "step": 3777, "token_acc": 0.8924577682607662, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.7320641379644431, "grad_norm": 0.09643404930830002, "learning_rate": 5.51618366569839e-05, "loss": 0.30832773447036743, "memory(GiB)": 78.33, "step": 3778, "token_acc": 0.9075776332077857, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.7322579082497699, "grad_norm": 0.10145960003137589, "learning_rate": 5.508737685638259e-05, "loss": 0.32317882776260376, "memory(GiB)": 78.33, "step": 3779, "token_acc": 0.9013355214758644, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.7324516785350966, "grad_norm": 0.10353747755289078, "learning_rate": 5.501295603894594e-05, "loss": 0.34720805287361145, "memory(GiB)": 78.33, "step": 3780, "token_acc": 0.8948657321398412, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.7326454488204234, "grad_norm": 0.09833744168281555, "learning_rate": 5.49385742352407e-05, "loss": 0.33571141958236694, "memory(GiB)": 78.33, "step": 3781, "token_acc": 0.9020605793832017, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.7328392191057501, "grad_norm": 0.09903203696012497, "learning_rate": 5.486423147581744e-05, "loss": 0.3094678521156311, "memory(GiB)": 78.33, "step": 3782, "token_acc": 0.9053752361325778, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.7330329893910769, "grad_norm": 0.1019536554813385, "learning_rate": 5.4789927791210694e-05, "loss": 0.34034597873687744, "memory(GiB)": 78.33, "step": 3783, "token_acc": 0.8967783128042779, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.7332267596764036, "grad_norm": 0.10213874280452728, "learning_rate": 5.4715663211939e-05, "loss": 0.34146547317504883, "memory(GiB)": 78.33, "step": 3784, "token_acc": 0.8989790738338316, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.7334205299617303, "grad_norm": 0.09893448650836945, "learning_rate": 5.4641437768504824e-05, "loss": 0.3442443907260895, "memory(GiB)": 78.33, "step": 3785, "token_acc": 0.8979535398230089, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.7336143002470571, "grad_norm": 0.10614677518606186, "learning_rate": 5.456725149139454e-05, "loss": 0.34340110421180725, "memory(GiB)": 78.33, "step": 3786, "token_acc": 0.895698897732694, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.7338080705323838, "grad_norm": 0.0985707938671112, "learning_rate": 5.449310441107838e-05, "loss": 0.33455923199653625, "memory(GiB)": 78.33, "step": 3787, "token_acc": 0.8980871664348876, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.7340018408177106, "grad_norm": 0.1156158521771431, "learning_rate": 5.4418996558010667e-05, "loss": 0.3170754015445709, "memory(GiB)": 78.33, "step": 3788, "token_acc": 0.9032800672834315, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.7341956111030373, "grad_norm": 0.11739075183868408, "learning_rate": 5.434492796262942e-05, "loss": 0.38899776339530945, "memory(GiB)": 78.33, "step": 3789, "token_acc": 0.8846523748952937, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.7343893813883641, "grad_norm": 0.1062496230006218, "learning_rate": 5.4270898655356625e-05, "loss": 0.335146427154541, "memory(GiB)": 78.33, "step": 3790, "token_acc": 0.8996969696969697, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.7345831516736908, "grad_norm": 0.11981339007616043, "learning_rate": 5.4196908666598075e-05, "loss": 0.41245871782302856, "memory(GiB)": 78.33, "step": 3791, "token_acc": 0.8796483136350534, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.7347769219590176, "grad_norm": 0.10465206950902939, "learning_rate": 5.412295802674348e-05, "loss": 0.3469778895378113, "memory(GiB)": 78.33, "step": 3792, "token_acc": 0.8962869653889989, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.7349706922443443, "grad_norm": 0.0999392494559288, "learning_rate": 5.4049046766166335e-05, "loss": 0.3266690969467163, "memory(GiB)": 78.33, "step": 3793, "token_acc": 0.9025028714907711, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.735164462529671, "grad_norm": 0.10005468130111694, "learning_rate": 5.397517491522393e-05, "loss": 0.34978044033050537, "memory(GiB)": 78.33, "step": 3794, "token_acc": 0.8954549081141459, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.7353582328149978, "grad_norm": 0.11191490292549133, "learning_rate": 5.390134250425753e-05, "loss": 0.3621137738227844, "memory(GiB)": 78.33, "step": 3795, "token_acc": 0.8916416576208416, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.7355520031003245, "grad_norm": 0.0992647185921669, "learning_rate": 5.382754956359204e-05, "loss": 0.33923065662384033, "memory(GiB)": 78.33, "step": 3796, "token_acc": 0.8980301274623407, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.7357457733856513, "grad_norm": 0.10284343361854553, "learning_rate": 5.37537961235362e-05, "loss": 0.3275540769100189, "memory(GiB)": 78.33, "step": 3797, "token_acc": 0.9016500897717233, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.735939543670978, "grad_norm": 0.09948836266994476, "learning_rate": 5.368008221438251e-05, "loss": 0.3380359709262848, "memory(GiB)": 78.33, "step": 3798, "token_acc": 0.8977125965043193, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.7361333139563048, "grad_norm": 0.09476862102746964, "learning_rate": 5.360640786640729e-05, "loss": 0.3325924873352051, "memory(GiB)": 78.33, "step": 3799, "token_acc": 0.9025462002294754, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.7363270842416315, "grad_norm": 0.10117123275995255, "learning_rate": 5.3532773109870544e-05, "loss": 0.3492782711982727, "memory(GiB)": 78.33, "step": 3800, "token_acc": 0.8976592977893368, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.7365208545269583, "grad_norm": 0.11379625648260117, "learning_rate": 5.3459177975016e-05, "loss": 0.34902098774909973, "memory(GiB)": 78.33, "step": 3801, "token_acc": 0.8949467686309791, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.736714624812285, "grad_norm": 0.10552668571472168, "learning_rate": 5.338562249207128e-05, "loss": 0.35885700583457947, "memory(GiB)": 78.33, "step": 3802, "token_acc": 0.8908405111749141, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.7369083950976117, "grad_norm": 0.10673682391643524, "learning_rate": 5.331210669124752e-05, "loss": 0.34370338916778564, "memory(GiB)": 78.33, "step": 3803, "token_acc": 0.897491594013152, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.7371021653829385, "grad_norm": 0.09526897221803665, "learning_rate": 5.323863060273966e-05, "loss": 0.303422749042511, "memory(GiB)": 78.33, "step": 3804, "token_acc": 0.909460807476243, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.7372959356682652, "grad_norm": 0.08512071520090103, "learning_rate": 5.3165194256726275e-05, "loss": 0.2895103394985199, "memory(GiB)": 78.33, "step": 3805, "token_acc": 0.9112764240057593, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.737489705953592, "grad_norm": 0.103308767080307, "learning_rate": 5.309179768336967e-05, "loss": 0.34746530652046204, "memory(GiB)": 78.33, "step": 3806, "token_acc": 0.8955654981027354, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.7376834762389187, "grad_norm": 0.10745726525783539, "learning_rate": 5.301844091281573e-05, "loss": 0.33433717489242554, "memory(GiB)": 78.33, "step": 3807, "token_acc": 0.9007903055848261, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.7378772465242455, "grad_norm": 0.09775236248970032, "learning_rate": 5.294512397519414e-05, "loss": 0.3289939761161804, "memory(GiB)": 78.33, "step": 3808, "token_acc": 0.9027567020738493, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.7380710168095722, "grad_norm": 0.10481875389814377, "learning_rate": 5.287184690061811e-05, "loss": 0.32689011096954346, "memory(GiB)": 78.33, "step": 3809, "token_acc": 0.9029928528886242, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.738264787094899, "grad_norm": 0.1059269830584526, "learning_rate": 5.279860971918449e-05, "loss": 0.37165161967277527, "memory(GiB)": 78.33, "step": 3810, "token_acc": 0.8881279218652295, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.7384585573802257, "grad_norm": 0.09881359338760376, "learning_rate": 5.272541246097376e-05, "loss": 0.309497207403183, "memory(GiB)": 78.33, "step": 3811, "token_acc": 0.9069961956668375, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.7386523276655524, "grad_norm": 0.10563083738088608, "learning_rate": 5.265225515605001e-05, "loss": 0.3451386094093323, "memory(GiB)": 78.33, "step": 3812, "token_acc": 0.8983088749126485, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.7388460979508792, "grad_norm": 0.10384704172611237, "learning_rate": 5.257913783446086e-05, "loss": 0.35985156893730164, "memory(GiB)": 78.33, "step": 3813, "token_acc": 0.8926537745954783, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.7390398682362059, "grad_norm": 0.10183234512805939, "learning_rate": 5.250606052623762e-05, "loss": 0.315267413854599, "memory(GiB)": 78.33, "step": 3814, "token_acc": 0.9050984876045943, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.7392336385215327, "grad_norm": 0.10309257358312607, "learning_rate": 5.2433023261395113e-05, "loss": 0.34021520614624023, "memory(GiB)": 78.33, "step": 3815, "token_acc": 0.8987062341073574, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.7394274088068594, "grad_norm": 0.11030527949333191, "learning_rate": 5.236002606993167e-05, "loss": 0.35504722595214844, "memory(GiB)": 78.33, "step": 3816, "token_acc": 0.8941146380971327, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.7396211790921862, "grad_norm": 0.09362789988517761, "learning_rate": 5.228706898182921e-05, "loss": 0.3009899854660034, "memory(GiB)": 78.33, "step": 3817, "token_acc": 0.909036220077704, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.7398149493775129, "grad_norm": 0.1043652817606926, "learning_rate": 5.221415202705316e-05, "loss": 0.3409806489944458, "memory(GiB)": 78.33, "step": 3818, "token_acc": 0.8996104618809126, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.7400087196628397, "grad_norm": 0.10487966984510422, "learning_rate": 5.214127523555249e-05, "loss": 0.3638575077056885, "memory(GiB)": 78.33, "step": 3819, "token_acc": 0.8908792978947954, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.7402024899481664, "grad_norm": 0.10344404727220535, "learning_rate": 5.206843863725959e-05, "loss": 0.3182569742202759, "memory(GiB)": 78.33, "step": 3820, "token_acc": 0.9035023713973003, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.7403962602334931, "grad_norm": 0.09493482857942581, "learning_rate": 5.199564226209051e-05, "loss": 0.31004172563552856, "memory(GiB)": 78.33, "step": 3821, "token_acc": 0.905801490532747, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.7405900305188199, "grad_norm": 0.10709976404905319, "learning_rate": 5.192288613994464e-05, "loss": 0.35697808861732483, "memory(GiB)": 78.33, "step": 3822, "token_acc": 0.8928056450375824, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.7407838008041466, "grad_norm": 0.09773747622966766, "learning_rate": 5.185017030070487e-05, "loss": 0.34137359261512756, "memory(GiB)": 78.33, "step": 3823, "token_acc": 0.8971325975603316, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.7409775710894734, "grad_norm": 0.10251982510089874, "learning_rate": 5.1777494774237534e-05, "loss": 0.3589448034763336, "memory(GiB)": 78.33, "step": 3824, "token_acc": 0.8923671206762486, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.7411713413748001, "grad_norm": 0.09967760741710663, "learning_rate": 5.170485959039244e-05, "loss": 0.33076420426368713, "memory(GiB)": 78.33, "step": 3825, "token_acc": 0.9000113526707157, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.741365111660127, "grad_norm": 0.09976795315742493, "learning_rate": 5.163226477900281e-05, "loss": 0.33364495635032654, "memory(GiB)": 78.33, "step": 3826, "token_acc": 0.9020033344801122, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.7415588819454537, "grad_norm": 0.09653392434120178, "learning_rate": 5.155971036988522e-05, "loss": 0.32705751061439514, "memory(GiB)": 78.33, "step": 3827, "token_acc": 0.8996669334641502, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.7417526522307805, "grad_norm": 0.10461395233869553, "learning_rate": 5.148719639283984e-05, "loss": 0.35183149576187134, "memory(GiB)": 78.33, "step": 3828, "token_acc": 0.8943066752210661, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.7419464225161072, "grad_norm": 0.09616634994745255, "learning_rate": 5.1414722877650025e-05, "loss": 0.34106606245040894, "memory(GiB)": 78.33, "step": 3829, "token_acc": 0.8989504013171434, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.742140192801434, "grad_norm": 0.0909215584397316, "learning_rate": 5.134228985408262e-05, "loss": 0.3328753709793091, "memory(GiB)": 78.33, "step": 3830, "token_acc": 0.8994169096209913, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.7423339630867607, "grad_norm": 0.107168048620224, "learning_rate": 5.126989735188782e-05, "loss": 0.3591649532318115, "memory(GiB)": 78.33, "step": 3831, "token_acc": 0.8922912205567451, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.7425277333720874, "grad_norm": 0.10682724416255951, "learning_rate": 5.119754540079914e-05, "loss": 0.35992226004600525, "memory(GiB)": 78.33, "step": 3832, "token_acc": 0.893421090615167, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.7427215036574142, "grad_norm": 0.10819026827812195, "learning_rate": 5.1125234030533494e-05, "loss": 0.3347112536430359, "memory(GiB)": 78.33, "step": 3833, "token_acc": 0.8996055354371444, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.7429152739427409, "grad_norm": 0.09705478698015213, "learning_rate": 5.1052963270791045e-05, "loss": 0.33945244550704956, "memory(GiB)": 78.33, "step": 3834, "token_acc": 0.89829610196889, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.7431090442280677, "grad_norm": 0.09978172183036804, "learning_rate": 5.098073315125545e-05, "loss": 0.314132958650589, "memory(GiB)": 78.33, "step": 3835, "token_acc": 0.9039825686962838, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.7433028145133944, "grad_norm": 0.1035098284482956, "learning_rate": 5.09085437015935e-05, "loss": 0.3336668908596039, "memory(GiB)": 78.33, "step": 3836, "token_acc": 0.9000965741587883, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.7434965847987212, "grad_norm": 0.10066191852092743, "learning_rate": 5.083639495145534e-05, "loss": 0.3203166127204895, "memory(GiB)": 78.33, "step": 3837, "token_acc": 0.9036123032904149, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.7436903550840479, "grad_norm": 0.11868193745613098, "learning_rate": 5.076428693047439e-05, "loss": 0.3131275177001953, "memory(GiB)": 78.33, "step": 3838, "token_acc": 0.9023334587881069, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.7438841253693746, "grad_norm": 0.108455590903759, "learning_rate": 5.069221966826738e-05, "loss": 0.3502568006515503, "memory(GiB)": 78.33, "step": 3839, "token_acc": 0.8950985915492958, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.7440778956547014, "grad_norm": 0.10078644752502441, "learning_rate": 5.06201931944342e-05, "loss": 0.33233729004859924, "memory(GiB)": 78.33, "step": 3840, "token_acc": 0.8990275860042892, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.7442716659400281, "grad_norm": 0.10141783952713013, "learning_rate": 5.054820753855817e-05, "loss": 0.3332747220993042, "memory(GiB)": 78.33, "step": 3841, "token_acc": 0.902451309603761, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.7444654362253549, "grad_norm": 0.0925682932138443, "learning_rate": 5.047626273020568e-05, "loss": 0.30119389295578003, "memory(GiB)": 78.33, "step": 3842, "token_acc": 0.9076820241130888, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.7446592065106816, "grad_norm": 0.09351833164691925, "learning_rate": 5.040435879892639e-05, "loss": 0.3276900053024292, "memory(GiB)": 78.33, "step": 3843, "token_acc": 0.9007225727661307, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.7448529767960084, "grad_norm": 0.1048940047621727, "learning_rate": 5.0332495774253165e-05, "loss": 0.33195894956588745, "memory(GiB)": 78.33, "step": 3844, "token_acc": 0.8993898230210663, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.7450467470813351, "grad_norm": 0.09521888941526413, "learning_rate": 5.026067368570211e-05, "loss": 0.28278443217277527, "memory(GiB)": 78.33, "step": 3845, "token_acc": 0.9150288711129947, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.7452405173666619, "grad_norm": 0.12543639540672302, "learning_rate": 5.018889256277241e-05, "loss": 0.32016804814338684, "memory(GiB)": 78.33, "step": 3846, "token_acc": 0.9048030793261185, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.7454342876519886, "grad_norm": 0.09576548635959625, "learning_rate": 5.011715243494663e-05, "loss": 0.3216573894023895, "memory(GiB)": 78.33, "step": 3847, "token_acc": 0.9055984653440355, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.7456280579373153, "grad_norm": 0.09915050864219666, "learning_rate": 5.004545333169028e-05, "loss": 0.3430394232273102, "memory(GiB)": 78.33, "step": 3848, "token_acc": 0.8962812609632637, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.7458218282226421, "grad_norm": 0.09718713909387589, "learning_rate": 4.997379528245215e-05, "loss": 0.3420059084892273, "memory(GiB)": 78.33, "step": 3849, "token_acc": 0.8961742910262117, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.7460155985079688, "grad_norm": 0.10262469947338104, "learning_rate": 4.990217831666409e-05, "loss": 0.3337242007255554, "memory(GiB)": 78.33, "step": 3850, "token_acc": 0.9016568394653292, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.7462093687932956, "grad_norm": 0.09261801093816757, "learning_rate": 4.983060246374115e-05, "loss": 0.2943536341190338, "memory(GiB)": 78.33, "step": 3851, "token_acc": 0.9127831715210356, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.7464031390786223, "grad_norm": 0.11604174971580505, "learning_rate": 4.9759067753081414e-05, "loss": 0.3443485200405121, "memory(GiB)": 78.33, "step": 3852, "token_acc": 0.8987715146996856, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.7465969093639491, "grad_norm": 0.10266567766666412, "learning_rate": 4.968757421406608e-05, "loss": 0.3430730998516083, "memory(GiB)": 78.33, "step": 3853, "token_acc": 0.897044232074782, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.7467906796492758, "grad_norm": 0.10636512190103531, "learning_rate": 4.961612187605958e-05, "loss": 0.3752942383289337, "memory(GiB)": 78.33, "step": 3854, "token_acc": 0.886795650331874, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.7469844499346026, "grad_norm": 0.11186108738183975, "learning_rate": 4.954471076840922e-05, "loss": 0.37107086181640625, "memory(GiB)": 78.33, "step": 3855, "token_acc": 0.8894728014759322, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.7471782202199293, "grad_norm": 0.10150814801454544, "learning_rate": 4.947334092044552e-05, "loss": 0.3377690315246582, "memory(GiB)": 78.33, "step": 3856, "token_acc": 0.8990276738967838, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.747371990505256, "grad_norm": 0.10091419517993927, "learning_rate": 4.9402012361481934e-05, "loss": 0.30891650915145874, "memory(GiB)": 78.33, "step": 3857, "token_acc": 0.9063477562771641, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.7475657607905828, "grad_norm": 0.09730595350265503, "learning_rate": 4.9330725120815054e-05, "loss": 0.3267110586166382, "memory(GiB)": 78.33, "step": 3858, "token_acc": 0.900251940973829, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.7477595310759095, "grad_norm": 0.10822945088148117, "learning_rate": 4.925947922772445e-05, "loss": 0.4060097932815552, "memory(GiB)": 78.33, "step": 3859, "token_acc": 0.8794587622081238, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.7479533013612363, "grad_norm": 0.10585605353116989, "learning_rate": 4.918827471147268e-05, "loss": 0.338094025850296, "memory(GiB)": 78.33, "step": 3860, "token_acc": 0.8971483116573201, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.748147071646563, "grad_norm": 0.12144403904676437, "learning_rate": 4.911711160130546e-05, "loss": 0.36428892612457275, "memory(GiB)": 78.33, "step": 3861, "token_acc": 0.8935286935286936, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.7483408419318898, "grad_norm": 0.09023165702819824, "learning_rate": 4.904598992645132e-05, "loss": 0.3032684028148651, "memory(GiB)": 78.33, "step": 3862, "token_acc": 0.9088954518606025, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.7485346122172165, "grad_norm": 0.10547629743814468, "learning_rate": 4.897490971612187e-05, "loss": 0.33661413192749023, "memory(GiB)": 78.33, "step": 3863, "token_acc": 0.8987709009233841, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.7487283825025433, "grad_norm": 0.10550445318222046, "learning_rate": 4.890387099951164e-05, "loss": 0.34583577513694763, "memory(GiB)": 78.33, "step": 3864, "token_acc": 0.8972740894421393, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.74892215278787, "grad_norm": 0.09834988415241241, "learning_rate": 4.883287380579816e-05, "loss": 0.3329737186431885, "memory(GiB)": 78.33, "step": 3865, "token_acc": 0.900711867118138, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.7491159230731967, "grad_norm": 0.0946660041809082, "learning_rate": 4.876191816414186e-05, "loss": 0.310972660779953, "memory(GiB)": 78.33, "step": 3866, "token_acc": 0.9068650482902636, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.7493096933585235, "grad_norm": 0.09578848630189896, "learning_rate": 4.869100410368609e-05, "loss": 0.31944969296455383, "memory(GiB)": 78.33, "step": 3867, "token_acc": 0.904564120614807, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.7495034636438502, "grad_norm": 0.09447766840457916, "learning_rate": 4.862013165355728e-05, "loss": 0.30866914987564087, "memory(GiB)": 78.33, "step": 3868, "token_acc": 0.9051225521981585, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.749697233929177, "grad_norm": 0.09313666820526123, "learning_rate": 4.8549300842864576e-05, "loss": 0.32266178727149963, "memory(GiB)": 78.33, "step": 3869, "token_acc": 0.90316239526255, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.7498910042145037, "grad_norm": 0.10308795422315598, "learning_rate": 4.84785117007001e-05, "loss": 0.3472236096858978, "memory(GiB)": 78.33, "step": 3870, "token_acc": 0.894967925873129, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.7500847744998305, "grad_norm": 0.10628514736890793, "learning_rate": 4.840776425613886e-05, "loss": 0.3184339702129364, "memory(GiB)": 78.33, "step": 3871, "token_acc": 0.9032356532356532, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.7502785447851572, "grad_norm": 0.09150487929582596, "learning_rate": 4.833705853823872e-05, "loss": 0.3118763864040375, "memory(GiB)": 78.33, "step": 3872, "token_acc": 0.9086181652950976, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.750472315070484, "grad_norm": 0.10045349597930908, "learning_rate": 4.826639457604039e-05, "loss": 0.317714124917984, "memory(GiB)": 78.33, "step": 3873, "token_acc": 0.9055200119029906, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.7506660853558107, "grad_norm": 0.09495769441127777, "learning_rate": 4.819577239856754e-05, "loss": 0.3022938072681427, "memory(GiB)": 78.33, "step": 3874, "token_acc": 0.9077078844206755, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.7508598556411374, "grad_norm": 0.09479624778032303, "learning_rate": 4.812519203482655e-05, "loss": 0.3185397982597351, "memory(GiB)": 78.33, "step": 3875, "token_acc": 0.9046973580441641, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.7510536259264642, "grad_norm": 0.10061098635196686, "learning_rate": 4.805465351380666e-05, "loss": 0.3666335940361023, "memory(GiB)": 78.33, "step": 3876, "token_acc": 0.8898552194528259, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.7512473962117909, "grad_norm": 0.1007319763302803, "learning_rate": 4.798415686447997e-05, "loss": 0.3402915298938751, "memory(GiB)": 78.33, "step": 3877, "token_acc": 0.8979099678456591, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.7514411664971177, "grad_norm": 0.09538646787405014, "learning_rate": 4.791370211580132e-05, "loss": 0.3278900384902954, "memory(GiB)": 78.33, "step": 3878, "token_acc": 0.9009969149180286, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.7516349367824444, "grad_norm": 0.10622713714838028, "learning_rate": 4.7843289296708384e-05, "loss": 0.33736902475357056, "memory(GiB)": 78.33, "step": 3879, "token_acc": 0.8988551338682051, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.7518287070677712, "grad_norm": 0.10398625582456589, "learning_rate": 4.777291843612153e-05, "loss": 0.31338000297546387, "memory(GiB)": 78.33, "step": 3880, "token_acc": 0.9074961626413558, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.7520224773530979, "grad_norm": 0.1060367301106453, "learning_rate": 4.770258956294408e-05, "loss": 0.3464204668998718, "memory(GiB)": 78.33, "step": 3881, "token_acc": 0.8942851343906753, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.7522162476384247, "grad_norm": 0.11519785970449448, "learning_rate": 4.7632302706061925e-05, "loss": 0.3710397779941559, "memory(GiB)": 78.33, "step": 3882, "token_acc": 0.8889779944825816, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.7524100179237514, "grad_norm": 0.0994989275932312, "learning_rate": 4.756205789434379e-05, "loss": 0.3080540597438812, "memory(GiB)": 78.33, "step": 3883, "token_acc": 0.9073027181909703, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.7526037882090781, "grad_norm": 0.11023864895105362, "learning_rate": 4.749185515664109e-05, "loss": 0.3553957939147949, "memory(GiB)": 78.33, "step": 3884, "token_acc": 0.8958528037383178, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.7527975584944049, "grad_norm": 0.10181353241205215, "learning_rate": 4.742169452178796e-05, "loss": 0.3252992630004883, "memory(GiB)": 78.33, "step": 3885, "token_acc": 0.9014353801817778, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.7529913287797316, "grad_norm": 0.0931096225976944, "learning_rate": 4.735157601860123e-05, "loss": 0.3285648226737976, "memory(GiB)": 78.33, "step": 3886, "token_acc": 0.9007917114651549, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.7531850990650584, "grad_norm": 0.09871333092451096, "learning_rate": 4.7281499675880564e-05, "loss": 0.31424829363822937, "memory(GiB)": 78.33, "step": 3887, "token_acc": 0.9050426360669545, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.7533788693503851, "grad_norm": 0.0981900617480278, "learning_rate": 4.7211465522408124e-05, "loss": 0.2891225814819336, "memory(GiB)": 78.33, "step": 3888, "token_acc": 0.9127433168959252, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.7535726396357119, "grad_norm": 0.10310321301221848, "learning_rate": 4.714147358694883e-05, "loss": 0.341362327337265, "memory(GiB)": 78.33, "step": 3889, "token_acc": 0.899036050593414, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.7537664099210386, "grad_norm": 0.08967840671539307, "learning_rate": 4.7071523898250246e-05, "loss": 0.3140985369682312, "memory(GiB)": 78.33, "step": 3890, "token_acc": 0.902944590528984, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.7539601802063653, "grad_norm": 0.10281458497047424, "learning_rate": 4.700161648504261e-05, "loss": 0.34009885787963867, "memory(GiB)": 78.33, "step": 3891, "token_acc": 0.8981371841155235, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.7541539504916921, "grad_norm": 0.0905076265335083, "learning_rate": 4.6931751376038735e-05, "loss": 0.2962040305137634, "memory(GiB)": 78.33, "step": 3892, "token_acc": 0.9114481536497017, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.7543477207770188, "grad_norm": 0.11229556798934937, "learning_rate": 4.6861928599934086e-05, "loss": 0.38348227739334106, "memory(GiB)": 78.33, "step": 3893, "token_acc": 0.8880430168388284, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.7545414910623456, "grad_norm": 0.10800474882125854, "learning_rate": 4.679214818540683e-05, "loss": 0.3647887110710144, "memory(GiB)": 78.33, "step": 3894, "token_acc": 0.8906179648806843, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.7547352613476723, "grad_norm": 0.09147538244724274, "learning_rate": 4.672241016111761e-05, "loss": 0.32499605417251587, "memory(GiB)": 78.33, "step": 3895, "token_acc": 0.9020622260544657, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.7549290316329991, "grad_norm": 0.09846985340118408, "learning_rate": 4.6652714555709734e-05, "loss": 0.32619956135749817, "memory(GiB)": 78.33, "step": 3896, "token_acc": 0.9018954111099443, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.7551228019183258, "grad_norm": 0.09857229888439178, "learning_rate": 4.658306139780902e-05, "loss": 0.31706997752189636, "memory(GiB)": 78.33, "step": 3897, "token_acc": 0.9044509455647268, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.7553165722036526, "grad_norm": 0.09921880811452866, "learning_rate": 4.6513450716023924e-05, "loss": 0.32637161016464233, "memory(GiB)": 78.33, "step": 3898, "token_acc": 0.9035741835947544, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.7555103424889793, "grad_norm": 0.10863330215215683, "learning_rate": 4.64438825389454e-05, "loss": 0.33484184741973877, "memory(GiB)": 78.33, "step": 3899, "token_acc": 0.8991859737006888, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.755704112774306, "grad_norm": 0.10528206825256348, "learning_rate": 4.637435689514693e-05, "loss": 0.3199610710144043, "memory(GiB)": 78.33, "step": 3900, "token_acc": 0.9030449124587668, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.7558978830596328, "grad_norm": 0.09049303084611893, "learning_rate": 4.630487381318466e-05, "loss": 0.3117978572845459, "memory(GiB)": 78.33, "step": 3901, "token_acc": 0.906941374967325, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.7560916533449595, "grad_norm": 0.09448360651731491, "learning_rate": 4.6235433321597124e-05, "loss": 0.2956331968307495, "memory(GiB)": 78.33, "step": 3902, "token_acc": 0.9114258416231102, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.7562854236302863, "grad_norm": 0.09764683246612549, "learning_rate": 4.616603544890537e-05, "loss": 0.32111823558807373, "memory(GiB)": 78.33, "step": 3903, "token_acc": 0.9027792510873024, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.756479193915613, "grad_norm": 0.10489743947982788, "learning_rate": 4.609668022361299e-05, "loss": 0.32993102073669434, "memory(GiB)": 78.33, "step": 3904, "token_acc": 0.9007670182166826, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.7566729642009398, "grad_norm": 0.09750455617904663, "learning_rate": 4.6027367674206034e-05, "loss": 0.32114407420158386, "memory(GiB)": 78.33, "step": 3905, "token_acc": 0.9016605685336335, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.7568667344862665, "grad_norm": 0.10215258598327637, "learning_rate": 4.595809782915298e-05, "loss": 0.3243396580219269, "memory(GiB)": 78.33, "step": 3906, "token_acc": 0.9038705677423164, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.7570605047715933, "grad_norm": 0.10810741782188416, "learning_rate": 4.58888707169049e-05, "loss": 0.33883118629455566, "memory(GiB)": 78.33, "step": 3907, "token_acc": 0.8982128982128982, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.75725427505692, "grad_norm": 0.09572840481996536, "learning_rate": 4.581968636589521e-05, "loss": 0.3282477557659149, "memory(GiB)": 78.33, "step": 3908, "token_acc": 0.9006182113741041, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.7574480453422467, "grad_norm": 0.11090946942567825, "learning_rate": 4.575054480453975e-05, "loss": 0.3538365960121155, "memory(GiB)": 78.33, "step": 3909, "token_acc": 0.8962534285800404, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.7576418156275735, "grad_norm": 0.10753259807825089, "learning_rate": 4.568144606123683e-05, "loss": 0.3474193513393402, "memory(GiB)": 78.33, "step": 3910, "token_acc": 0.8973380854262961, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.7578355859129002, "grad_norm": 0.10958699882030487, "learning_rate": 4.561239016436716e-05, "loss": 0.3798098564147949, "memory(GiB)": 78.33, "step": 3911, "token_acc": 0.8861535899641975, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.758029356198227, "grad_norm": 0.10337840020656586, "learning_rate": 4.5543377142293856e-05, "loss": 0.3272544741630554, "memory(GiB)": 78.33, "step": 3912, "token_acc": 0.9024021501763817, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.7582231264835537, "grad_norm": 0.10478585213422775, "learning_rate": 4.5474407023362374e-05, "loss": 0.334266722202301, "memory(GiB)": 78.33, "step": 3913, "token_acc": 0.8995371061669404, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.7584168967688805, "grad_norm": 0.10417360812425613, "learning_rate": 4.5405479835900685e-05, "loss": 0.29992321133613586, "memory(GiB)": 78.33, "step": 3914, "token_acc": 0.9098251531019792, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.7586106670542072, "grad_norm": 0.09397785365581512, "learning_rate": 4.533659560821898e-05, "loss": 0.31968510150909424, "memory(GiB)": 78.33, "step": 3915, "token_acc": 0.9051098506427563, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.758804437339534, "grad_norm": 0.09900712221860886, "learning_rate": 4.526775436860988e-05, "loss": 0.316898375749588, "memory(GiB)": 78.33, "step": 3916, "token_acc": 0.9043134151167392, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.7589982076248607, "grad_norm": 0.08824368566274643, "learning_rate": 4.519895614534833e-05, "loss": 0.3089524209499359, "memory(GiB)": 78.33, "step": 3917, "token_acc": 0.9057074210072986, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.7591919779101874, "grad_norm": 0.09777352958917618, "learning_rate": 4.513020096669161e-05, "loss": 0.31515491008758545, "memory(GiB)": 78.33, "step": 3918, "token_acc": 0.9042631075068986, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.7593857481955142, "grad_norm": 0.10553678125143051, "learning_rate": 4.506148886087925e-05, "loss": 0.3489447236061096, "memory(GiB)": 78.33, "step": 3919, "token_acc": 0.8933819207711297, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.7595795184808409, "grad_norm": 0.10306905955076218, "learning_rate": 4.4992819856133285e-05, "loss": 0.35067906975746155, "memory(GiB)": 78.33, "step": 3920, "token_acc": 0.8936492337113867, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.7597732887661677, "grad_norm": 0.12322760373353958, "learning_rate": 4.492419398065784e-05, "loss": 0.42081135511398315, "memory(GiB)": 78.33, "step": 3921, "token_acc": 0.8778385546162771, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.7599670590514944, "grad_norm": 0.0956936627626419, "learning_rate": 4.485561126263944e-05, "loss": 0.30873122811317444, "memory(GiB)": 78.33, "step": 3922, "token_acc": 0.9064496298078208, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.7601608293368212, "grad_norm": 0.10779135674238205, "learning_rate": 4.4787071730246834e-05, "loss": 0.33779773116111755, "memory(GiB)": 78.33, "step": 3923, "token_acc": 0.8981919669942968, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.7603545996221479, "grad_norm": 0.094157375395298, "learning_rate": 4.471857541163103e-05, "loss": 0.3073067367076874, "memory(GiB)": 78.33, "step": 3924, "token_acc": 0.9095217049695541, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.7605483699074747, "grad_norm": 0.09603159129619598, "learning_rate": 4.465012233492535e-05, "loss": 0.31469446420669556, "memory(GiB)": 78.33, "step": 3925, "token_acc": 0.9049290982825726, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.7607421401928014, "grad_norm": 0.09002597630023956, "learning_rate": 4.4581712528245226e-05, "loss": 0.30716589093208313, "memory(GiB)": 78.33, "step": 3926, "token_acc": 0.9074883267996704, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.7609359104781281, "grad_norm": 0.09838228672742844, "learning_rate": 4.4513346019688514e-05, "loss": 0.3335438072681427, "memory(GiB)": 78.33, "step": 3927, "token_acc": 0.8979319720777014, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.7611296807634549, "grad_norm": 0.10846085846424103, "learning_rate": 4.444502283733512e-05, "loss": 0.3647676110267639, "memory(GiB)": 78.33, "step": 3928, "token_acc": 0.8943367089359427, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.7613234510487816, "grad_norm": 0.09758076071739197, "learning_rate": 4.437674300924724e-05, "loss": 0.32494428753852844, "memory(GiB)": 78.33, "step": 3929, "token_acc": 0.9025488420235477, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.7615172213341084, "grad_norm": 0.09471366554498672, "learning_rate": 4.430850656346919e-05, "loss": 0.31988587975502014, "memory(GiB)": 78.33, "step": 3930, "token_acc": 0.9036783713916415, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.7617109916194351, "grad_norm": 0.11201035976409912, "learning_rate": 4.4240313528027545e-05, "loss": 0.35305842757225037, "memory(GiB)": 78.33, "step": 3931, "token_acc": 0.8956776695356096, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.7619047619047619, "grad_norm": 0.09610579162836075, "learning_rate": 4.417216393093102e-05, "loss": 0.3121870160102844, "memory(GiB)": 78.33, "step": 3932, "token_acc": 0.9052815243581577, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.7620985321900886, "grad_norm": 0.09443981200456619, "learning_rate": 4.410405780017041e-05, "loss": 0.2963784635066986, "memory(GiB)": 78.33, "step": 3933, "token_acc": 0.9106874456134799, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.7622923024754154, "grad_norm": 0.0956021174788475, "learning_rate": 4.403599516371884e-05, "loss": 0.28646087646484375, "memory(GiB)": 78.33, "step": 3934, "token_acc": 0.9147495102155052, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.7624860727607421, "grad_norm": 0.08942447602748871, "learning_rate": 4.396797604953143e-05, "loss": 0.31425604224205017, "memory(GiB)": 78.33, "step": 3935, "token_acc": 0.9043874975732867, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.7626798430460688, "grad_norm": 0.10449796169996262, "learning_rate": 4.3900000485545445e-05, "loss": 0.3526667654514313, "memory(GiB)": 78.33, "step": 3936, "token_acc": 0.8961538461538462, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.7628736133313956, "grad_norm": 0.10736709088087082, "learning_rate": 4.3832068499680276e-05, "loss": 0.3539313077926636, "memory(GiB)": 78.33, "step": 3937, "token_acc": 0.8955620144206584, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.7630673836167223, "grad_norm": 0.10252580791711807, "learning_rate": 4.376418011983741e-05, "loss": 0.3328987956047058, "memory(GiB)": 78.33, "step": 3938, "token_acc": 0.9002593667342903, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.7632611539020491, "grad_norm": 0.09203853458166122, "learning_rate": 4.369633537390041e-05, "loss": 0.30216261744499207, "memory(GiB)": 78.33, "step": 3939, "token_acc": 0.9080275516593613, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.7634549241873758, "grad_norm": 0.10732089728116989, "learning_rate": 4.3628534289734996e-05, "loss": 0.3707207441329956, "memory(GiB)": 78.33, "step": 3940, "token_acc": 0.8905606159184558, "train_speed(iter/s)": 0.03248 }, { "epoch": 0.7636486944727026, "grad_norm": 0.10623017698526382, "learning_rate": 4.3560776895188856e-05, "loss": 0.3556334972381592, "memory(GiB)": 78.33, "step": 3941, "token_acc": 0.8951635685631957, "train_speed(iter/s)": 0.032481 }, { "epoch": 0.7638424647580293, "grad_norm": 0.10812726616859436, "learning_rate": 4.3493063218091784e-05, "loss": 0.3475589156150818, "memory(GiB)": 78.33, "step": 3942, "token_acc": 0.8958073637165181, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.764036235043356, "grad_norm": 0.10460913926362991, "learning_rate": 4.342539328625559e-05, "loss": 0.36832839250564575, "memory(GiB)": 78.33, "step": 3943, "token_acc": 0.8916668762733607, "train_speed(iter/s)": 0.032482 }, { "epoch": 0.7642300053286828, "grad_norm": 0.09330563992261887, "learning_rate": 4.335776712747416e-05, "loss": 0.2897856533527374, "memory(GiB)": 78.33, "step": 3944, "token_acc": 0.9121778725053284, "train_speed(iter/s)": 0.032483 }, { "epoch": 0.7644237756140095, "grad_norm": 0.1091540977358818, "learning_rate": 4.329018476952336e-05, "loss": 0.35247859358787537, "memory(GiB)": 78.33, "step": 3945, "token_acc": 0.8945392125308443, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.7646175458993363, "grad_norm": 0.1135362833738327, "learning_rate": 4.3222646240161014e-05, "loss": 0.37347984313964844, "memory(GiB)": 78.33, "step": 3946, "token_acc": 0.8913823511524661, "train_speed(iter/s)": 0.032484 }, { "epoch": 0.7648113161846631, "grad_norm": 0.09469881653785706, "learning_rate": 4.315515156712714e-05, "loss": 0.2968969941139221, "memory(GiB)": 78.33, "step": 3947, "token_acc": 0.9084439266538344, "train_speed(iter/s)": 0.032485 }, { "epoch": 0.7650050864699899, "grad_norm": 0.09792491793632507, "learning_rate": 4.308770077814354e-05, "loss": 0.33856552839279175, "memory(GiB)": 78.33, "step": 3948, "token_acc": 0.8967915156396832, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.7651988567553166, "grad_norm": 0.10359305143356323, "learning_rate": 4.3020293900914075e-05, "loss": 0.31331631541252136, "memory(GiB)": 78.33, "step": 3949, "token_acc": 0.9054741250373916, "train_speed(iter/s)": 0.032486 }, { "epoch": 0.7653926270406434, "grad_norm": 0.10234098881483078, "learning_rate": 4.295293096312457e-05, "loss": 0.3446482717990875, "memory(GiB)": 78.33, "step": 3950, "token_acc": 0.8954361313351653, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.7655863973259701, "grad_norm": 0.10561185330152512, "learning_rate": 4.288561199244277e-05, "loss": 0.33969467878341675, "memory(GiB)": 78.33, "step": 3951, "token_acc": 0.8973197015750207, "train_speed(iter/s)": 0.032487 }, { "epoch": 0.7657801676112969, "grad_norm": 0.10193517059087753, "learning_rate": 4.281833701651841e-05, "loss": 0.34807661175727844, "memory(GiB)": 78.33, "step": 3952, "token_acc": 0.8959937746531784, "train_speed(iter/s)": 0.032488 }, { "epoch": 0.7659739378966236, "grad_norm": 0.10420801490545273, "learning_rate": 4.275110606298307e-05, "loss": 0.34150460362434387, "memory(GiB)": 78.33, "step": 3953, "token_acc": 0.8981601881311385, "train_speed(iter/s)": 0.032489 }, { "epoch": 0.7661677081819503, "grad_norm": 0.1057552695274353, "learning_rate": 4.26839191594504e-05, "loss": 0.34214961528778076, "memory(GiB)": 78.33, "step": 3954, "token_acc": 0.8992475734944477, "train_speed(iter/s)": 0.032489 }, { "epoch": 0.7663614784672771, "grad_norm": 0.10361424833536148, "learning_rate": 4.2616776333515844e-05, "loss": 0.3399190306663513, "memory(GiB)": 78.33, "step": 3955, "token_acc": 0.895477586158072, "train_speed(iter/s)": 0.03249 }, { "epoch": 0.7665552487526038, "grad_norm": 0.10096311569213867, "learning_rate": 4.254967761275672e-05, "loss": 0.3292155861854553, "memory(GiB)": 78.33, "step": 3956, "token_acc": 0.8995320229977269, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.7667490190379306, "grad_norm": 0.10260067880153656, "learning_rate": 4.2482623024732334e-05, "loss": 0.3111547827720642, "memory(GiB)": 78.33, "step": 3957, "token_acc": 0.9036062791684345, "train_speed(iter/s)": 0.032491 }, { "epoch": 0.7669427893232573, "grad_norm": 0.10281860083341599, "learning_rate": 4.241561259698376e-05, "loss": 0.3256504237651825, "memory(GiB)": 78.33, "step": 3958, "token_acc": 0.900353960698157, "train_speed(iter/s)": 0.032492 }, { "epoch": 0.7671365596085841, "grad_norm": 0.1012672632932663, "learning_rate": 4.2348646357033944e-05, "loss": 0.3229145407676697, "memory(GiB)": 78.33, "step": 3959, "token_acc": 0.902934926196083, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.7673303298939108, "grad_norm": 0.11049380153417587, "learning_rate": 4.228172433238783e-05, "loss": 0.3576306700706482, "memory(GiB)": 78.33, "step": 3960, "token_acc": 0.892226424530366, "train_speed(iter/s)": 0.032493 }, { "epoch": 0.7675241001792376, "grad_norm": 0.10522151738405228, "learning_rate": 4.2214846550532026e-05, "loss": 0.34801146388053894, "memory(GiB)": 78.33, "step": 3961, "token_acc": 0.8971957437375305, "train_speed(iter/s)": 0.032494 }, { "epoch": 0.7677178704645643, "grad_norm": 0.10627347230911255, "learning_rate": 4.2148013038935054e-05, "loss": 0.34363991022109985, "memory(GiB)": 78.33, "step": 3962, "token_acc": 0.8970234186911797, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.767911640749891, "grad_norm": 0.09957989305257797, "learning_rate": 4.2081223825047214e-05, "loss": 0.3352537453174591, "memory(GiB)": 78.33, "step": 3963, "token_acc": 0.8995469112609128, "train_speed(iter/s)": 0.032495 }, { "epoch": 0.7681054110352178, "grad_norm": 0.10130537301301956, "learning_rate": 4.201447893630065e-05, "loss": 0.331424355506897, "memory(GiB)": 78.33, "step": 3964, "token_acc": 0.9031016042780748, "train_speed(iter/s)": 0.032496 }, { "epoch": 0.7682991813205445, "grad_norm": 0.08747418969869614, "learning_rate": 4.194777840010926e-05, "loss": 0.29280680418014526, "memory(GiB)": 78.33, "step": 3965, "token_acc": 0.9117611623816294, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.7684929516058713, "grad_norm": 0.09481216967105865, "learning_rate": 4.1881122243868715e-05, "loss": 0.34195125102996826, "memory(GiB)": 78.33, "step": 3966, "token_acc": 0.8987415491352898, "train_speed(iter/s)": 0.032497 }, { "epoch": 0.768686721891198, "grad_norm": 0.09969401359558105, "learning_rate": 4.181451049495657e-05, "loss": 0.3363822400569916, "memory(GiB)": 78.33, "step": 3967, "token_acc": 0.9003247963367232, "train_speed(iter/s)": 0.032498 }, { "epoch": 0.7688804921765248, "grad_norm": 0.10244782269001007, "learning_rate": 4.174794318073202e-05, "loss": 0.3243018090724945, "memory(GiB)": 78.33, "step": 3968, "token_acc": 0.900588806951352, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.7690742624618515, "grad_norm": 0.09450022131204605, "learning_rate": 4.168142032853605e-05, "loss": 0.3295527398586273, "memory(GiB)": 78.33, "step": 3969, "token_acc": 0.9003620957213587, "train_speed(iter/s)": 0.032499 }, { "epoch": 0.7692680327471783, "grad_norm": 0.09936364740133286, "learning_rate": 4.16149419656914e-05, "loss": 0.31701233983039856, "memory(GiB)": 78.33, "step": 3970, "token_acc": 0.9036584700943242, "train_speed(iter/s)": 0.0325 }, { "epoch": 0.769461803032505, "grad_norm": 0.11445298790931702, "learning_rate": 4.15485081195025e-05, "loss": 0.3477141261100769, "memory(GiB)": 78.33, "step": 3971, "token_acc": 0.8983733184957288, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.7696555733178317, "grad_norm": 0.09958308935165405, "learning_rate": 4.148211881725547e-05, "loss": 0.31291618943214417, "memory(GiB)": 78.33, "step": 3972, "token_acc": 0.9050311502015601, "train_speed(iter/s)": 0.032501 }, { "epoch": 0.7698493436031585, "grad_norm": 0.10363567620515823, "learning_rate": 4.141577408621827e-05, "loss": 0.33989396691322327, "memory(GiB)": 78.33, "step": 3973, "token_acc": 0.8975318087789396, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.7700431138884852, "grad_norm": 0.10342077165842056, "learning_rate": 4.134947395364043e-05, "loss": 0.33245235681533813, "memory(GiB)": 78.33, "step": 3974, "token_acc": 0.8993934211581113, "train_speed(iter/s)": 0.032502 }, { "epoch": 0.770236884173812, "grad_norm": 0.10198888927698135, "learning_rate": 4.128321844675318e-05, "loss": 0.32281598448753357, "memory(GiB)": 78.33, "step": 3975, "token_acc": 0.9067534827377347, "train_speed(iter/s)": 0.032503 }, { "epoch": 0.7704306544591387, "grad_norm": 0.09553690254688263, "learning_rate": 4.121700759276946e-05, "loss": 0.31830698251724243, "memory(GiB)": 78.33, "step": 3976, "token_acc": 0.905037092785722, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.7706244247444655, "grad_norm": 0.09600663185119629, "learning_rate": 4.1150841418883845e-05, "loss": 0.3234773874282837, "memory(GiB)": 78.33, "step": 3977, "token_acc": 0.904327039295211, "train_speed(iter/s)": 0.032504 }, { "epoch": 0.7708181950297922, "grad_norm": 0.10290994495153427, "learning_rate": 4.1084719952272524e-05, "loss": 0.32649847865104675, "memory(GiB)": 78.33, "step": 3978, "token_acc": 0.9009803039841713, "train_speed(iter/s)": 0.032505 }, { "epoch": 0.771011965315119, "grad_norm": 0.09713493287563324, "learning_rate": 4.101864322009335e-05, "loss": 0.326259970664978, "memory(GiB)": 78.33, "step": 3979, "token_acc": 0.9027269351159213, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.7712057356004457, "grad_norm": 0.1079777181148529, "learning_rate": 4.0952611249485906e-05, "loss": 0.33296194672584534, "memory(GiB)": 78.33, "step": 3980, "token_acc": 0.9001692971437933, "train_speed(iter/s)": 0.032506 }, { "epoch": 0.7713995058857724, "grad_norm": 0.10135926306247711, "learning_rate": 4.0886624067571215e-05, "loss": 0.3391050100326538, "memory(GiB)": 78.33, "step": 3981, "token_acc": 0.9001635300313202, "train_speed(iter/s)": 0.032507 }, { "epoch": 0.7715932761710992, "grad_norm": 0.09656066447496414, "learning_rate": 4.0820681701452034e-05, "loss": 0.3262328803539276, "memory(GiB)": 78.33, "step": 3982, "token_acc": 0.9005832056541294, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.7717870464564259, "grad_norm": 0.0958666205406189, "learning_rate": 4.0754784178212616e-05, "loss": 0.30787983536720276, "memory(GiB)": 78.33, "step": 3983, "token_acc": 0.9089610766847405, "train_speed(iter/s)": 0.032508 }, { "epoch": 0.7719808167417527, "grad_norm": 0.11135300248861313, "learning_rate": 4.068893152491888e-05, "loss": 0.3504694402217865, "memory(GiB)": 78.33, "step": 3984, "token_acc": 0.8946419448868588, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.7721745870270794, "grad_norm": 0.09396642446517944, "learning_rate": 4.062312376861828e-05, "loss": 0.2973524332046509, "memory(GiB)": 78.33, "step": 3985, "token_acc": 0.9072842658845541, "train_speed(iter/s)": 0.032509 }, { "epoch": 0.7723683573124062, "grad_norm": 0.10528262704610825, "learning_rate": 4.0557360936339754e-05, "loss": 0.3348216414451599, "memory(GiB)": 78.33, "step": 3986, "token_acc": 0.8983733985893191, "train_speed(iter/s)": 0.03251 }, { "epoch": 0.7725621275977329, "grad_norm": 0.10733997821807861, "learning_rate": 4.049164305509398e-05, "loss": 0.36273688077926636, "memory(GiB)": 78.33, "step": 3987, "token_acc": 0.8929391901242769, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.7727558978830597, "grad_norm": 0.09470584243535995, "learning_rate": 4.042597015187301e-05, "loss": 0.3208099603652954, "memory(GiB)": 78.33, "step": 3988, "token_acc": 0.9038011841385144, "train_speed(iter/s)": 0.032511 }, { "epoch": 0.7729496681683864, "grad_norm": 0.10173063725233078, "learning_rate": 4.036034225365047e-05, "loss": 0.33486083149909973, "memory(GiB)": 78.33, "step": 3989, "token_acc": 0.9000950410912953, "train_speed(iter/s)": 0.032512 }, { "epoch": 0.7731434384537131, "grad_norm": 0.09967856109142303, "learning_rate": 4.029475938738149e-05, "loss": 0.31119605898857117, "memory(GiB)": 78.33, "step": 3990, "token_acc": 0.9062775816416593, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.7733372087390399, "grad_norm": 0.10067766904830933, "learning_rate": 4.0229221580002736e-05, "loss": 0.31823796033859253, "memory(GiB)": 78.33, "step": 3991, "token_acc": 0.9016983016983017, "train_speed(iter/s)": 0.032513 }, { "epoch": 0.7735309790243666, "grad_norm": 0.10268165916204453, "learning_rate": 4.016372885843228e-05, "loss": 0.3410794138908386, "memory(GiB)": 78.33, "step": 3992, "token_acc": 0.8990299823633157, "train_speed(iter/s)": 0.032514 }, { "epoch": 0.7737247493096934, "grad_norm": 0.1047457829117775, "learning_rate": 4.0098281249569845e-05, "loss": 0.3615070581436157, "memory(GiB)": 78.33, "step": 3993, "token_acc": 0.8931236597569693, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.7739185195950201, "grad_norm": 0.10737626999616623, "learning_rate": 4.0032878780296476e-05, "loss": 0.3728790879249573, "memory(GiB)": 78.33, "step": 3994, "token_acc": 0.8893093379417198, "train_speed(iter/s)": 0.032515 }, { "epoch": 0.7741122898803469, "grad_norm": 0.10214757919311523, "learning_rate": 3.9967521477474726e-05, "loss": 0.34999844431877136, "memory(GiB)": 78.33, "step": 3995, "token_acc": 0.8950407514850117, "train_speed(iter/s)": 0.032516 }, { "epoch": 0.7743060601656736, "grad_norm": 0.09131192415952682, "learning_rate": 3.990220936794859e-05, "loss": 0.302643358707428, "memory(GiB)": 78.33, "step": 3996, "token_acc": 0.9078601502709995, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.7744998304510003, "grad_norm": 0.10554607212543488, "learning_rate": 3.98369424785435e-05, "loss": 0.33304092288017273, "memory(GiB)": 78.33, "step": 3997, "token_acc": 0.9006529325678262, "train_speed(iter/s)": 0.032517 }, { "epoch": 0.7746936007363271, "grad_norm": 0.0956023558974266, "learning_rate": 3.977172083606634e-05, "loss": 0.32441043853759766, "memory(GiB)": 78.33, "step": 3998, "token_acc": 0.9003093260588937, "train_speed(iter/s)": 0.032518 }, { "epoch": 0.7748873710216538, "grad_norm": 0.09569355845451355, "learning_rate": 3.9706544467305316e-05, "loss": 0.2999122738838196, "memory(GiB)": 78.33, "step": 3999, "token_acc": 0.9101693571824883, "train_speed(iter/s)": 0.032519 }, { "epoch": 0.7750811413069806, "grad_norm": 0.1037716194987297, "learning_rate": 3.964141339903026e-05, "loss": 0.33561694622039795, "memory(GiB)": 78.33, "step": 4000, "token_acc": 0.9002222414639074, "train_speed(iter/s)": 0.03252 }, { "epoch": 0.7750811413069806, "eval_loss": 0.38567423820495605, "eval_runtime": 1344.6983, "eval_samples_per_second": 5.019, "eval_steps_per_second": 5.019, "eval_token_acc": 0.9009321879927275, "step": 4000 }, { "epoch": 0.7752749115923073, "grad_norm": 0.09706872701644897, "learning_rate": 3.9576327657992144e-05, "loss": 0.31833964586257935, "memory(GiB)": 78.33, "step": 4001, "token_acc": 0.9042262932234534, "train_speed(iter/s)": 0.032165 }, { "epoch": 0.7754686818776341, "grad_norm": 0.10196474194526672, "learning_rate": 3.951128727092346e-05, "loss": 0.3550868630409241, "memory(GiB)": 78.33, "step": 4002, "token_acc": 0.8949007501286321, "train_speed(iter/s)": 0.032166 }, { "epoch": 0.7756624521629608, "grad_norm": 0.09914345294237137, "learning_rate": 3.9446292264538046e-05, "loss": 0.33100420236587524, "memory(GiB)": 78.33, "step": 4003, "token_acc": 0.900188852592895, "train_speed(iter/s)": 0.032167 }, { "epoch": 0.7758562224482876, "grad_norm": 0.11188572645187378, "learning_rate": 3.93813426655311e-05, "loss": 0.35677969455718994, "memory(GiB)": 78.33, "step": 4004, "token_acc": 0.8938544116760148, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.7760499927336143, "grad_norm": 0.10255058854818344, "learning_rate": 3.9316438500579103e-05, "loss": 0.32014256715774536, "memory(GiB)": 78.33, "step": 4005, "token_acc": 0.9037872184890489, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.776243763018941, "grad_norm": 0.10134012997150421, "learning_rate": 3.925157979634005e-05, "loss": 0.32840850949287415, "memory(GiB)": 78.33, "step": 4006, "token_acc": 0.9019783197831979, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.7764375333042678, "grad_norm": 0.10212652385234833, "learning_rate": 3.918676657945308e-05, "loss": 0.3236843943595886, "memory(GiB)": 78.33, "step": 4007, "token_acc": 0.9032230290158711, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.7766313035895945, "grad_norm": 0.09699150919914246, "learning_rate": 3.9121998876538775e-05, "loss": 0.3039090037345886, "memory(GiB)": 78.33, "step": 4008, "token_acc": 0.9065003465003465, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.7768250738749213, "grad_norm": 0.10348644107580185, "learning_rate": 3.905727671419891e-05, "loss": 0.36719679832458496, "memory(GiB)": 78.33, "step": 4009, "token_acc": 0.8896081591671731, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.777018844160248, "grad_norm": 0.09221908450126648, "learning_rate": 3.899260011901666e-05, "loss": 0.2919962406158447, "memory(GiB)": 78.33, "step": 4010, "token_acc": 0.9112894709050586, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.7772126144455748, "grad_norm": 0.10690614581108093, "learning_rate": 3.892796911755642e-05, "loss": 0.3488962650299072, "memory(GiB)": 78.33, "step": 4011, "token_acc": 0.8949435444280806, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.7774063847309015, "grad_norm": 0.11350135505199432, "learning_rate": 3.886338373636385e-05, "loss": 0.36426064372062683, "memory(GiB)": 78.33, "step": 4012, "token_acc": 0.892725139337049, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.7776001550162283, "grad_norm": 0.10248113423585892, "learning_rate": 3.8798844001965976e-05, "loss": 0.3667606711387634, "memory(GiB)": 78.33, "step": 4013, "token_acc": 0.8912181013079423, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.777793925301555, "grad_norm": 0.09691166877746582, "learning_rate": 3.873434994087095e-05, "loss": 0.3389664888381958, "memory(GiB)": 78.33, "step": 4014, "token_acc": 0.8992298909214586, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.7779876955868817, "grad_norm": 0.09862842410802841, "learning_rate": 3.866990157956823e-05, "loss": 0.35453882813453674, "memory(GiB)": 78.33, "step": 4015, "token_acc": 0.8935185185185185, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.7781814658722085, "grad_norm": 0.11863162368535995, "learning_rate": 3.86054989445285e-05, "loss": 0.3620249032974243, "memory(GiB)": 78.33, "step": 4016, "token_acc": 0.8918080939947781, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.7783752361575352, "grad_norm": 0.10086744278669357, "learning_rate": 3.854114206220364e-05, "loss": 0.3244988024234772, "memory(GiB)": 78.33, "step": 4017, "token_acc": 0.9012764208850194, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.778569006442862, "grad_norm": 0.12051405757665634, "learning_rate": 3.8476830959026735e-05, "loss": 0.38493812084198, "memory(GiB)": 78.33, "step": 4018, "token_acc": 0.8884372177055104, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.7787627767281887, "grad_norm": 0.10893973708152771, "learning_rate": 3.8412565661412056e-05, "loss": 0.35063496232032776, "memory(GiB)": 78.33, "step": 4019, "token_acc": 0.8946015424164524, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.7789565470135155, "grad_norm": 0.09621429443359375, "learning_rate": 3.834834619575519e-05, "loss": 0.3024739623069763, "memory(GiB)": 78.33, "step": 4020, "token_acc": 0.9086043745798645, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.7791503172988422, "grad_norm": 0.09711124747991562, "learning_rate": 3.8284172588432716e-05, "loss": 0.32116764783859253, "memory(GiB)": 78.33, "step": 4021, "token_acc": 0.9038179359553471, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.779344087584169, "grad_norm": 0.10393381118774414, "learning_rate": 3.822004486580251e-05, "loss": 0.32372620701789856, "memory(GiB)": 78.33, "step": 4022, "token_acc": 0.9042860117188661, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.7795378578694957, "grad_norm": 0.1017252653837204, "learning_rate": 3.815596305420349e-05, "loss": 0.353562593460083, "memory(GiB)": 78.33, "step": 4023, "token_acc": 0.8964561961709364, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.7797316281548224, "grad_norm": 0.09764081239700317, "learning_rate": 3.809192717995584e-05, "loss": 0.3372447192668915, "memory(GiB)": 78.33, "step": 4024, "token_acc": 0.9003608608738545, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.7799253984401492, "grad_norm": 0.1030493900179863, "learning_rate": 3.8027937269360757e-05, "loss": 0.32105734944343567, "memory(GiB)": 78.33, "step": 4025, "token_acc": 0.9018271999087826, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.7801191687254759, "grad_norm": 0.10457610338926315, "learning_rate": 3.796399334870061e-05, "loss": 0.3225201964378357, "memory(GiB)": 78.33, "step": 4026, "token_acc": 0.9034423897581793, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.7803129390108027, "grad_norm": 0.10154607892036438, "learning_rate": 3.7900095444238965e-05, "loss": 0.3304111063480377, "memory(GiB)": 78.33, "step": 4027, "token_acc": 0.9023520164459762, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.7805067092961294, "grad_norm": 0.09185813367366791, "learning_rate": 3.783624358222036e-05, "loss": 0.29117369651794434, "memory(GiB)": 78.33, "step": 4028, "token_acc": 0.9135486512770641, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.7807004795814562, "grad_norm": 0.10176742821931839, "learning_rate": 3.777243778887047e-05, "loss": 0.33322426676750183, "memory(GiB)": 78.33, "step": 4029, "token_acc": 0.8997748686733928, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.7808942498667829, "grad_norm": 0.10486084967851639, "learning_rate": 3.770867809039604e-05, "loss": 0.3594956398010254, "memory(GiB)": 78.33, "step": 4030, "token_acc": 0.8941469133331427, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.7810880201521097, "grad_norm": 0.09931764006614685, "learning_rate": 3.764496451298492e-05, "loss": 0.30685773491859436, "memory(GiB)": 78.33, "step": 4031, "token_acc": 0.9082772375966093, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.7812817904374364, "grad_norm": 0.1024843156337738, "learning_rate": 3.758129708280593e-05, "loss": 0.3311740458011627, "memory(GiB)": 78.33, "step": 4032, "token_acc": 0.8998150183641189, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.7814755607227631, "grad_norm": 0.10810218751430511, "learning_rate": 3.751767582600908e-05, "loss": 0.3451961576938629, "memory(GiB)": 78.33, "step": 4033, "token_acc": 0.8986039894484128, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.7816693310080899, "grad_norm": 0.11089378595352173, "learning_rate": 3.745410076872528e-05, "loss": 0.3630661964416504, "memory(GiB)": 78.33, "step": 4034, "token_acc": 0.8932108218478816, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.7818631012934166, "grad_norm": 0.09974395483732224, "learning_rate": 3.739057193706651e-05, "loss": 0.3139222264289856, "memory(GiB)": 78.33, "step": 4035, "token_acc": 0.9046637100646875, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.7820568715787434, "grad_norm": 0.10395882278680801, "learning_rate": 3.7327089357125794e-05, "loss": 0.3373940885066986, "memory(GiB)": 78.33, "step": 4036, "token_acc": 0.9000392222782541, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.7822506418640701, "grad_norm": 0.09552538394927979, "learning_rate": 3.7263653054977106e-05, "loss": 0.3279712200164795, "memory(GiB)": 78.33, "step": 4037, "token_acc": 0.9012946233238103, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.7824444121493969, "grad_norm": 0.09712161868810654, "learning_rate": 3.7200263056675424e-05, "loss": 0.3257930278778076, "memory(GiB)": 78.33, "step": 4038, "token_acc": 0.9013559672911707, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.7826381824347236, "grad_norm": 0.09889403730630875, "learning_rate": 3.713691938825677e-05, "loss": 0.30284392833709717, "memory(GiB)": 78.33, "step": 4039, "token_acc": 0.9077128339058109, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.7828319527200504, "grad_norm": 0.11424127966165543, "learning_rate": 3.7073622075738085e-05, "loss": 0.340787798166275, "memory(GiB)": 78.33, "step": 4040, "token_acc": 0.8969184444871549, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.7830257230053771, "grad_norm": 0.1108805313706398, "learning_rate": 3.701037114511727e-05, "loss": 0.3701401948928833, "memory(GiB)": 78.33, "step": 4041, "token_acc": 0.8902578623930594, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.7832194932907038, "grad_norm": 0.10004852712154388, "learning_rate": 3.694716662237317e-05, "loss": 0.3417167067527771, "memory(GiB)": 78.33, "step": 4042, "token_acc": 0.8959614939126406, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.7834132635760306, "grad_norm": 0.09257663786411285, "learning_rate": 3.6884008533465575e-05, "loss": 0.2933918833732605, "memory(GiB)": 78.33, "step": 4043, "token_acc": 0.9114590016681637, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.7836070338613573, "grad_norm": 0.10794571042060852, "learning_rate": 3.682089690433522e-05, "loss": 0.34912464022636414, "memory(GiB)": 78.33, "step": 4044, "token_acc": 0.8977625199506128, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.7838008041466841, "grad_norm": 0.09921461343765259, "learning_rate": 3.675783176090373e-05, "loss": 0.3479025363922119, "memory(GiB)": 78.33, "step": 4045, "token_acc": 0.8996169970498421, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.7839945744320108, "grad_norm": 0.091604083776474, "learning_rate": 3.669481312907369e-05, "loss": 0.3121592104434967, "memory(GiB)": 78.33, "step": 4046, "token_acc": 0.9041123022455497, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.7841883447173376, "grad_norm": 0.09730125963687897, "learning_rate": 3.663184103472852e-05, "loss": 0.35410815477371216, "memory(GiB)": 78.33, "step": 4047, "token_acc": 0.8955283408408409, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.7843821150026643, "grad_norm": 0.09932620078325272, "learning_rate": 3.6568915503732577e-05, "loss": 0.3147258162498474, "memory(GiB)": 78.33, "step": 4048, "token_acc": 0.9058458354888774, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.784575885287991, "grad_norm": 0.09612176567316055, "learning_rate": 3.650603656193105e-05, "loss": 0.3254881203174591, "memory(GiB)": 78.33, "step": 4049, "token_acc": 0.902174750301692, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.7847696555733178, "grad_norm": 0.08921913802623749, "learning_rate": 3.6443204235149995e-05, "loss": 0.30229154229164124, "memory(GiB)": 78.33, "step": 4050, "token_acc": 0.9090634861704102, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.7849634258586445, "grad_norm": 0.09413807839155197, "learning_rate": 3.638041854919634e-05, "loss": 0.32846495509147644, "memory(GiB)": 78.33, "step": 4051, "token_acc": 0.901771947119638, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.7851571961439713, "grad_norm": 0.09915482252836227, "learning_rate": 3.6317679529857844e-05, "loss": 0.32399147748947144, "memory(GiB)": 78.33, "step": 4052, "token_acc": 0.903708087800108, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.785350966429298, "grad_norm": 0.10495869070291519, "learning_rate": 3.625498720290315e-05, "loss": 0.3514251112937927, "memory(GiB)": 78.33, "step": 4053, "token_acc": 0.8951106706309878, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.7855447367146248, "grad_norm": 0.10493289679288864, "learning_rate": 3.619234159408168e-05, "loss": 0.34921297430992126, "memory(GiB)": 78.33, "step": 4054, "token_acc": 0.8932814420319541, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.7857385069999515, "grad_norm": 0.09735289216041565, "learning_rate": 3.6129742729123625e-05, "loss": 0.30153387784957886, "memory(GiB)": 78.33, "step": 4055, "token_acc": 0.9098055563826937, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.7859322772852783, "grad_norm": 0.09159845113754272, "learning_rate": 3.606719063374006e-05, "loss": 0.30557680130004883, "memory(GiB)": 78.33, "step": 4056, "token_acc": 0.907328966162636, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.786126047570605, "grad_norm": 0.0924990177154541, "learning_rate": 3.600468533362279e-05, "loss": 0.2820108234882355, "memory(GiB)": 78.33, "step": 4057, "token_acc": 0.9154507656948067, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.7863198178559317, "grad_norm": 0.1056913435459137, "learning_rate": 3.594222685444441e-05, "loss": 0.33108171820640564, "memory(GiB)": 78.33, "step": 4058, "token_acc": 0.8996641845524894, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.7865135881412585, "grad_norm": 0.11276807636022568, "learning_rate": 3.587981522185829e-05, "loss": 0.31702518463134766, "memory(GiB)": 78.33, "step": 4059, "token_acc": 0.9057384014048604, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.7867073584265852, "grad_norm": 0.09739526361227036, "learning_rate": 3.5817450461498634e-05, "loss": 0.3203843832015991, "memory(GiB)": 78.33, "step": 4060, "token_acc": 0.9029685310230603, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.786901128711912, "grad_norm": 0.09344157576560974, "learning_rate": 3.575513259898027e-05, "loss": 0.31456148624420166, "memory(GiB)": 78.33, "step": 4061, "token_acc": 0.9028077753779697, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.7870948989972387, "grad_norm": 0.09507586807012558, "learning_rate": 3.569286165989881e-05, "loss": 0.326376736164093, "memory(GiB)": 78.33, "step": 4062, "token_acc": 0.900634411067798, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.7872886692825655, "grad_norm": 0.10197113454341888, "learning_rate": 3.5630637669830645e-05, "loss": 0.3019747734069824, "memory(GiB)": 78.33, "step": 4063, "token_acc": 0.9101308469795503, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.7874824395678922, "grad_norm": 0.10532142966985703, "learning_rate": 3.556846065433279e-05, "loss": 0.33741429448127747, "memory(GiB)": 78.33, "step": 4064, "token_acc": 0.900261802212651, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.787676209853219, "grad_norm": 0.10591472685337067, "learning_rate": 3.550633063894301e-05, "loss": 0.3443160951137543, "memory(GiB)": 78.33, "step": 4065, "token_acc": 0.8973382917044889, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.7878699801385457, "grad_norm": 0.10201407968997955, "learning_rate": 3.544424764917983e-05, "loss": 0.33403322100639343, "memory(GiB)": 78.33, "step": 4066, "token_acc": 0.8988485568234625, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.7880637504238724, "grad_norm": 0.09689588844776154, "learning_rate": 3.538221171054239e-05, "loss": 0.3143858015537262, "memory(GiB)": 78.33, "step": 4067, "token_acc": 0.9059065757302792, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.7882575207091993, "grad_norm": 0.09144670516252518, "learning_rate": 3.532022284851048e-05, "loss": 0.3066175878047943, "memory(GiB)": 78.33, "step": 4068, "token_acc": 0.9066947038975011, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.788451290994526, "grad_norm": 0.09848953783512115, "learning_rate": 3.525828108854464e-05, "loss": 0.3331592381000519, "memory(GiB)": 78.33, "step": 4069, "token_acc": 0.9009377322888673, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.7886450612798528, "grad_norm": 0.11075198650360107, "learning_rate": 3.519638645608596e-05, "loss": 0.37963664531707764, "memory(GiB)": 78.33, "step": 4070, "token_acc": 0.8859161767085604, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.7888388315651795, "grad_norm": 0.09600254893302917, "learning_rate": 3.513453897655622e-05, "loss": 0.30960047245025635, "memory(GiB)": 78.33, "step": 4071, "token_acc": 0.9076073849781613, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.7890326018505063, "grad_norm": 0.10882295668125153, "learning_rate": 3.507273867535793e-05, "loss": 0.35110461711883545, "memory(GiB)": 78.33, "step": 4072, "token_acc": 0.8942715048811936, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.789226372135833, "grad_norm": 0.10820236057043076, "learning_rate": 3.5010985577874066e-05, "loss": 0.34106817841529846, "memory(GiB)": 78.33, "step": 4073, "token_acc": 0.8956751190629425, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.7894201424211598, "grad_norm": 0.11212671548128128, "learning_rate": 3.494927970946831e-05, "loss": 0.3203110694885254, "memory(GiB)": 78.33, "step": 4074, "token_acc": 0.9028558826772521, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.7896139127064865, "grad_norm": 0.09849805384874344, "learning_rate": 3.4887621095484905e-05, "loss": 0.3194783627986908, "memory(GiB)": 78.33, "step": 4075, "token_acc": 0.9021505376344086, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.7898076829918133, "grad_norm": 0.10881268978118896, "learning_rate": 3.482600976124871e-05, "loss": 0.33694878220558167, "memory(GiB)": 78.33, "step": 4076, "token_acc": 0.8999273783587509, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.79000145327714, "grad_norm": 0.08902069926261902, "learning_rate": 3.476444573206515e-05, "loss": 0.28375521302223206, "memory(GiB)": 78.33, "step": 4077, "token_acc": 0.9137976536071591, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.7901952235624667, "grad_norm": 0.11011825501918793, "learning_rate": 3.4702929033220174e-05, "loss": 0.36214667558670044, "memory(GiB)": 78.33, "step": 4078, "token_acc": 0.8909107047754306, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.7903889938477935, "grad_norm": 0.09892592579126358, "learning_rate": 3.464145968998045e-05, "loss": 0.3139350414276123, "memory(GiB)": 78.33, "step": 4079, "token_acc": 0.904188324670132, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.7905827641331202, "grad_norm": 0.09487756341695786, "learning_rate": 3.4580037727593033e-05, "loss": 0.32229354977607727, "memory(GiB)": 78.33, "step": 4080, "token_acc": 0.9033888688296281, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.790776534418447, "grad_norm": 0.09870006889104843, "learning_rate": 3.4518663171285563e-05, "loss": 0.3076024651527405, "memory(GiB)": 78.33, "step": 4081, "token_acc": 0.9079937641113859, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.7909703047037737, "grad_norm": 0.08845412731170654, "learning_rate": 3.445733604626626e-05, "loss": 0.3001633286476135, "memory(GiB)": 78.33, "step": 4082, "token_acc": 0.9098186843296562, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.7911640749891005, "grad_norm": 0.10530709475278854, "learning_rate": 3.4396056377723766e-05, "loss": 0.32672393321990967, "memory(GiB)": 78.33, "step": 4083, "token_acc": 0.9015249886758266, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.7913578452744272, "grad_norm": 0.09617147594690323, "learning_rate": 3.433482419082734e-05, "loss": 0.31130674481391907, "memory(GiB)": 78.33, "step": 4084, "token_acc": 0.9056026236676687, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.791551615559754, "grad_norm": 0.09556283056735992, "learning_rate": 3.4273639510726617e-05, "loss": 0.3225637972354889, "memory(GiB)": 78.33, "step": 4085, "token_acc": 0.9032013734599071, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.7917453858450807, "grad_norm": 0.10910790413618088, "learning_rate": 3.4212502362551864e-05, "loss": 0.3229255974292755, "memory(GiB)": 78.33, "step": 4086, "token_acc": 0.9025904897090135, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.7919391561304074, "grad_norm": 0.10885477811098099, "learning_rate": 3.415141277141372e-05, "loss": 0.36971014738082886, "memory(GiB)": 78.33, "step": 4087, "token_acc": 0.8883316991175234, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.7921329264157342, "grad_norm": 0.10303046554327011, "learning_rate": 3.409037076240334e-05, "loss": 0.3518933653831482, "memory(GiB)": 78.33, "step": 4088, "token_acc": 0.8961959684672769, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.7923266967010609, "grad_norm": 0.09691599756479263, "learning_rate": 3.4029376360592284e-05, "loss": 0.3284243941307068, "memory(GiB)": 78.33, "step": 4089, "token_acc": 0.9007073741985853, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.7925204669863877, "grad_norm": 0.10007129609584808, "learning_rate": 3.396842959103262e-05, "loss": 0.30757999420166016, "memory(GiB)": 78.33, "step": 4090, "token_acc": 0.9069416756121249, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.7927142372717144, "grad_norm": 0.12611159682273865, "learning_rate": 3.3907530478756793e-05, "loss": 0.3721444308757782, "memory(GiB)": 78.33, "step": 4091, "token_acc": 0.8881480744352672, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.7929080075570412, "grad_norm": 0.10851044952869415, "learning_rate": 3.38466790487777e-05, "loss": 0.3504268229007721, "memory(GiB)": 78.33, "step": 4092, "token_acc": 0.8952597994530538, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.7931017778423679, "grad_norm": 0.09891311824321747, "learning_rate": 3.378587532608872e-05, "loss": 0.3422066867351532, "memory(GiB)": 78.33, "step": 4093, "token_acc": 0.8977621295777654, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.7932955481276946, "grad_norm": 0.10003877431154251, "learning_rate": 3.372511933566355e-05, "loss": 0.3378649652004242, "memory(GiB)": 78.33, "step": 4094, "token_acc": 0.8983621269912497, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.7934893184130214, "grad_norm": 0.10703529417514801, "learning_rate": 3.366441110245627e-05, "loss": 0.34462597966194153, "memory(GiB)": 78.33, "step": 4095, "token_acc": 0.896659132256464, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.7936830886983481, "grad_norm": 0.10170795023441315, "learning_rate": 3.360375065140142e-05, "loss": 0.33412742614746094, "memory(GiB)": 78.33, "step": 4096, "token_acc": 0.9000777389083237, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.7938768589836749, "grad_norm": 0.09521227329969406, "learning_rate": 3.354313800741387e-05, "loss": 0.32723385095596313, "memory(GiB)": 78.33, "step": 4097, "token_acc": 0.9021288088982738, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.7940706292690016, "grad_norm": 0.10300496965646744, "learning_rate": 3.3482573195388854e-05, "loss": 0.35852038860321045, "memory(GiB)": 78.33, "step": 4098, "token_acc": 0.8925953869368398, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.7942643995543284, "grad_norm": 0.09719771891832352, "learning_rate": 3.342205624020194e-05, "loss": 0.3195003569126129, "memory(GiB)": 78.33, "step": 4099, "token_acc": 0.9037184087959075, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.7944581698396551, "grad_norm": 0.09721704572439194, "learning_rate": 3.336158716670913e-05, "loss": 0.3213385343551636, "memory(GiB)": 78.33, "step": 4100, "token_acc": 0.9026950799122532, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.7946519401249819, "grad_norm": 0.1064034178853035, "learning_rate": 3.330116599974666e-05, "loss": 0.34772008657455444, "memory(GiB)": 78.33, "step": 4101, "token_acc": 0.8958247453035746, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.7948457104103086, "grad_norm": 0.09646876156330109, "learning_rate": 3.324079276413114e-05, "loss": 0.3242574632167816, "memory(GiB)": 78.33, "step": 4102, "token_acc": 0.9017802644964394, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.7950394806956353, "grad_norm": 0.09698193520307541, "learning_rate": 3.318046748465949e-05, "loss": 0.3176778554916382, "memory(GiB)": 78.33, "step": 4103, "token_acc": 0.9066922523386903, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.7952332509809621, "grad_norm": 0.09748407453298569, "learning_rate": 3.312019018610884e-05, "loss": 0.31994813680648804, "memory(GiB)": 78.33, "step": 4104, "token_acc": 0.9049721189591078, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.7954270212662888, "grad_norm": 0.09335286915302277, "learning_rate": 3.305996089323681e-05, "loss": 0.3147365152835846, "memory(GiB)": 78.33, "step": 4105, "token_acc": 0.9052823315118397, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.7956207915516156, "grad_norm": 0.09451750665903091, "learning_rate": 3.299977963078115e-05, "loss": 0.31313663721084595, "memory(GiB)": 78.33, "step": 4106, "token_acc": 0.9066659892287369, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.7958145618369423, "grad_norm": 0.10167912393808365, "learning_rate": 3.29396464234599e-05, "loss": 0.3258751928806305, "memory(GiB)": 78.33, "step": 4107, "token_acc": 0.9005933489719884, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.7960083321222691, "grad_norm": 0.11654622852802277, "learning_rate": 3.287956129597142e-05, "loss": 0.39499345421791077, "memory(GiB)": 78.33, "step": 4108, "token_acc": 0.883764646907701, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.7962021024075958, "grad_norm": 0.09777145832777023, "learning_rate": 3.281952427299424e-05, "loss": 0.3374737501144409, "memory(GiB)": 78.33, "step": 4109, "token_acc": 0.8980137309365337, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.7963958726929226, "grad_norm": 0.09370385110378265, "learning_rate": 3.2759535379187214e-05, "loss": 0.3268592059612274, "memory(GiB)": 78.33, "step": 4110, "token_acc": 0.9006880405179416, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.7965896429782493, "grad_norm": 0.09882562607526779, "learning_rate": 3.269959463918934e-05, "loss": 0.32429930567741394, "memory(GiB)": 78.33, "step": 4111, "token_acc": 0.9020517759682456, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.796783413263576, "grad_norm": 0.09826599806547165, "learning_rate": 3.263970207761997e-05, "loss": 0.32509344816207886, "memory(GiB)": 78.33, "step": 4112, "token_acc": 0.9012852928782132, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.7969771835489028, "grad_norm": 0.0987883061170578, "learning_rate": 3.257985771907856e-05, "loss": 0.3253635764122009, "memory(GiB)": 78.33, "step": 4113, "token_acc": 0.9022095821483264, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.7971709538342295, "grad_norm": 0.10667404532432556, "learning_rate": 3.252006158814478e-05, "loss": 0.333732932806015, "memory(GiB)": 78.33, "step": 4114, "token_acc": 0.8992663220134353, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.7973647241195563, "grad_norm": 0.10525521636009216, "learning_rate": 3.246031370937851e-05, "loss": 0.3293308913707733, "memory(GiB)": 78.33, "step": 4115, "token_acc": 0.9034758242413402, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.797558494404883, "grad_norm": 0.09787974506616592, "learning_rate": 3.240061410731981e-05, "loss": 0.31691908836364746, "memory(GiB)": 78.33, "step": 4116, "token_acc": 0.9044979630296816, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.7977522646902098, "grad_norm": 0.10533692687749863, "learning_rate": 3.234096280648892e-05, "loss": 0.35985928773880005, "memory(GiB)": 78.33, "step": 4117, "token_acc": 0.8940023752969121, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.7979460349755365, "grad_norm": 0.09776397794485092, "learning_rate": 3.228135983138618e-05, "loss": 0.320921391248703, "memory(GiB)": 78.33, "step": 4118, "token_acc": 0.9044611133043822, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.7981398052608633, "grad_norm": 0.10431236773729324, "learning_rate": 3.222180520649224e-05, "loss": 0.3434464633464813, "memory(GiB)": 78.33, "step": 4119, "token_acc": 0.8964882943143813, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.79833357554619, "grad_norm": 0.09289419651031494, "learning_rate": 3.216229895626769e-05, "loss": 0.2984315752983093, "memory(GiB)": 78.33, "step": 4120, "token_acc": 0.9101831539207184, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.7985273458315167, "grad_norm": 0.09537661075592041, "learning_rate": 3.2102841105153414e-05, "loss": 0.3033888041973114, "memory(GiB)": 78.33, "step": 4121, "token_acc": 0.9094319549819424, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.7987211161168435, "grad_norm": 0.09890120476484299, "learning_rate": 3.2043431677570295e-05, "loss": 0.33709731698036194, "memory(GiB)": 78.33, "step": 4122, "token_acc": 0.899075500770416, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.7989148864021702, "grad_norm": 0.10694926232099533, "learning_rate": 3.19840706979194e-05, "loss": 0.3586742877960205, "memory(GiB)": 78.33, "step": 4123, "token_acc": 0.8929906273359783, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.799108656687497, "grad_norm": 0.09874139726161957, "learning_rate": 3.1924758190581886e-05, "loss": 0.3332917392253876, "memory(GiB)": 78.33, "step": 4124, "token_acc": 0.9008399249775748, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.7993024269728237, "grad_norm": 0.10014494508504868, "learning_rate": 3.186549417991895e-05, "loss": 0.3542085289955139, "memory(GiB)": 78.33, "step": 4125, "token_acc": 0.892789227527368, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.7994961972581505, "grad_norm": 0.10449165850877762, "learning_rate": 3.1806278690272005e-05, "loss": 0.3253108263015747, "memory(GiB)": 78.33, "step": 4126, "token_acc": 0.9014500959391023, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.7996899675434772, "grad_norm": 0.1035015657544136, "learning_rate": 3.174711174596238e-05, "loss": 0.3380778133869171, "memory(GiB)": 78.33, "step": 4127, "token_acc": 0.9001489459211732, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.799883737828804, "grad_norm": 0.10077136009931564, "learning_rate": 3.1687993371291525e-05, "loss": 0.32532066106796265, "memory(GiB)": 78.33, "step": 4128, "token_acc": 0.9027537506026488, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.8000775081141307, "grad_norm": 0.10059913992881775, "learning_rate": 3.162892359054098e-05, "loss": 0.3570432662963867, "memory(GiB)": 78.33, "step": 4129, "token_acc": 0.8933640100302458, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.8002712783994574, "grad_norm": 0.1088043823838234, "learning_rate": 3.156990242797226e-05, "loss": 0.3387261927127838, "memory(GiB)": 78.33, "step": 4130, "token_acc": 0.8997582550409318, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.8004650486847842, "grad_norm": 0.10076975077390671, "learning_rate": 3.151092990782695e-05, "loss": 0.32494884729385376, "memory(GiB)": 78.33, "step": 4131, "token_acc": 0.9004781745476916, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.8006588189701109, "grad_norm": 0.1097509041428566, "learning_rate": 3.145200605432662e-05, "loss": 0.3796813189983368, "memory(GiB)": 78.33, "step": 4132, "token_acc": 0.8863678979936246, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.8008525892554377, "grad_norm": 0.10167836397886276, "learning_rate": 3.1393130891672944e-05, "loss": 0.3483812212944031, "memory(GiB)": 78.33, "step": 4133, "token_acc": 0.896118628960376, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.8010463595407644, "grad_norm": 0.10103829205036163, "learning_rate": 3.1334304444047495e-05, "loss": 0.3420543372631073, "memory(GiB)": 78.33, "step": 4134, "token_acc": 0.8974770039421813, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.8012401298260912, "grad_norm": 0.10454893857240677, "learning_rate": 3.1275526735611896e-05, "loss": 0.31737667322158813, "memory(GiB)": 78.33, "step": 4135, "token_acc": 0.9039817974971559, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.8014339001114179, "grad_norm": 0.14402469992637634, "learning_rate": 3.12167977905077e-05, "loss": 0.36191561818122864, "memory(GiB)": 78.33, "step": 4136, "token_acc": 0.8928331193053263, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.8016276703967447, "grad_norm": 0.09334685653448105, "learning_rate": 3.1158117632856454e-05, "loss": 0.3147181570529938, "memory(GiB)": 78.33, "step": 4137, "token_acc": 0.905297142173861, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.8018214406820714, "grad_norm": 0.09789223968982697, "learning_rate": 3.109948628675974e-05, "loss": 0.3045772612094879, "memory(GiB)": 78.33, "step": 4138, "token_acc": 0.9074807619528195, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.8020152109673981, "grad_norm": 0.11379068344831467, "learning_rate": 3.104090377629899e-05, "loss": 0.3640574514865875, "memory(GiB)": 78.33, "step": 4139, "token_acc": 0.8914368285480933, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.8022089812527249, "grad_norm": 0.11807750910520554, "learning_rate": 3.098237012553562e-05, "loss": 0.35527053475379944, "memory(GiB)": 78.33, "step": 4140, "token_acc": 0.8920514040932889, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.8024027515380516, "grad_norm": 0.09791693091392517, "learning_rate": 3.0923885358510946e-05, "loss": 0.3340778946876526, "memory(GiB)": 78.33, "step": 4141, "token_acc": 0.9000325538494927, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.8025965218233784, "grad_norm": 0.10278962552547455, "learning_rate": 3.086544949924627e-05, "loss": 0.34071099758148193, "memory(GiB)": 78.33, "step": 4142, "token_acc": 0.9005618602581219, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.8027902921087051, "grad_norm": 0.100987508893013, "learning_rate": 3.0807062571742755e-05, "loss": 0.32279205322265625, "memory(GiB)": 78.33, "step": 4143, "token_acc": 0.9030330758672659, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.8029840623940319, "grad_norm": 0.10697422176599503, "learning_rate": 3.074872459998143e-05, "loss": 0.3390357792377472, "memory(GiB)": 78.33, "step": 4144, "token_acc": 0.9002963590177815, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.8031778326793586, "grad_norm": 0.09046490490436554, "learning_rate": 3.069043560792336e-05, "loss": 0.3146419823169708, "memory(GiB)": 78.33, "step": 4145, "token_acc": 0.9062027231467473, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.8033716029646853, "grad_norm": 0.09514185786247253, "learning_rate": 3.063219561950936e-05, "loss": 0.3226296007633209, "memory(GiB)": 78.33, "step": 4146, "token_acc": 0.9025114098311594, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.8035653732500121, "grad_norm": 0.10075315088033676, "learning_rate": 3.057400465866016e-05, "loss": 0.31182047724723816, "memory(GiB)": 78.33, "step": 4147, "token_acc": 0.9075695604585517, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.8037591435353388, "grad_norm": 0.10024786740541458, "learning_rate": 3.0515862749276353e-05, "loss": 0.3048425018787384, "memory(GiB)": 78.33, "step": 4148, "token_acc": 0.9080320590439946, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.8039529138206656, "grad_norm": 0.09824330359697342, "learning_rate": 3.0457769915238368e-05, "loss": 0.3147960603237152, "memory(GiB)": 78.33, "step": 4149, "token_acc": 0.9054583112323508, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.8041466841059923, "grad_norm": 0.10100057721138, "learning_rate": 3.03997261804065e-05, "loss": 0.35047298669815063, "memory(GiB)": 78.33, "step": 4150, "token_acc": 0.8985602958658038, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.8043404543913191, "grad_norm": 0.10795464366674423, "learning_rate": 3.034173156862084e-05, "loss": 0.3506011664867401, "memory(GiB)": 78.33, "step": 4151, "token_acc": 0.8946405657028967, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.8045342246766458, "grad_norm": 0.09722840040922165, "learning_rate": 3.028378610370141e-05, "loss": 0.30743473768234253, "memory(GiB)": 78.33, "step": 4152, "token_acc": 0.9074273940345369, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.8047279949619726, "grad_norm": 0.10855165868997574, "learning_rate": 3.022588980944792e-05, "loss": 0.37705856561660767, "memory(GiB)": 78.33, "step": 4153, "token_acc": 0.889402452187872, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.8049217652472993, "grad_norm": 0.09210513532161713, "learning_rate": 3.0168042709639932e-05, "loss": 0.30485376715660095, "memory(GiB)": 78.33, "step": 4154, "token_acc": 0.9075583735909822, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.805115535532626, "grad_norm": 0.10625799745321274, "learning_rate": 3.011024482803684e-05, "loss": 0.3092432916164398, "memory(GiB)": 78.33, "step": 4155, "token_acc": 0.9070884146341464, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.8053093058179528, "grad_norm": 0.10007067769765854, "learning_rate": 3.0052496188377735e-05, "loss": 0.3402232229709625, "memory(GiB)": 78.33, "step": 4156, "token_acc": 0.8997760214195608, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.8055030761032795, "grad_norm": 0.10713425278663635, "learning_rate": 2.999479681438156e-05, "loss": 0.3407253921031952, "memory(GiB)": 78.33, "step": 4157, "token_acc": 0.8987352889513438, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.8056968463886063, "grad_norm": 0.09479415416717529, "learning_rate": 2.993714672974698e-05, "loss": 0.3190673887729645, "memory(GiB)": 78.33, "step": 4158, "token_acc": 0.9022564050323435, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.805890616673933, "grad_norm": 0.10293829441070557, "learning_rate": 2.987954595815247e-05, "loss": 0.3430511951446533, "memory(GiB)": 78.33, "step": 4159, "token_acc": 0.8964779192630723, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.8060843869592598, "grad_norm": 0.09652558714151382, "learning_rate": 2.98219945232562e-05, "loss": 0.3083397150039673, "memory(GiB)": 78.33, "step": 4160, "token_acc": 0.9067033176387339, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.8062781572445865, "grad_norm": 0.11536381393671036, "learning_rate": 2.9764492448696098e-05, "loss": 0.3753798305988312, "memory(GiB)": 78.33, "step": 4161, "token_acc": 0.8890807651434643, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.8064719275299133, "grad_norm": 0.09727875888347626, "learning_rate": 2.970703975808979e-05, "loss": 0.32429182529449463, "memory(GiB)": 78.33, "step": 4162, "token_acc": 0.9036051879533964, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.80666569781524, "grad_norm": 0.10008195042610168, "learning_rate": 2.964963647503465e-05, "loss": 0.326107919216156, "memory(GiB)": 78.33, "step": 4163, "token_acc": 0.9037652447142781, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.8068594681005667, "grad_norm": 0.0974980965256691, "learning_rate": 2.9592282623107765e-05, "loss": 0.31634485721588135, "memory(GiB)": 78.33, "step": 4164, "token_acc": 0.9050801412659604, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.8070532383858935, "grad_norm": 0.10140351206064224, "learning_rate": 2.953497822586583e-05, "loss": 0.32958346605300903, "memory(GiB)": 78.33, "step": 4165, "token_acc": 0.900670556920557, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.8072470086712202, "grad_norm": 0.10700841248035431, "learning_rate": 2.9477723306845414e-05, "loss": 0.34634923934936523, "memory(GiB)": 78.33, "step": 4166, "token_acc": 0.8981357595218982, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.807440778956547, "grad_norm": 0.09883598238229752, "learning_rate": 2.9420517889562574e-05, "loss": 0.3215301036834717, "memory(GiB)": 78.33, "step": 4167, "token_acc": 0.9049081311541975, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.8076345492418737, "grad_norm": 0.10146182030439377, "learning_rate": 2.9363361997513145e-05, "loss": 0.3307911157608032, "memory(GiB)": 78.33, "step": 4168, "token_acc": 0.9003170028818444, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.8078283195272005, "grad_norm": 0.10559407621622086, "learning_rate": 2.9306255654172572e-05, "loss": 0.33857250213623047, "memory(GiB)": 78.33, "step": 4169, "token_acc": 0.8989095106466997, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.8080220898125272, "grad_norm": 0.10372511297464371, "learning_rate": 2.9249198882995973e-05, "loss": 0.31755688786506653, "memory(GiB)": 78.33, "step": 4170, "token_acc": 0.9028887891034276, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.808215860097854, "grad_norm": 0.10989030450582504, "learning_rate": 2.91921917074181e-05, "loss": 0.3677654266357422, "memory(GiB)": 78.33, "step": 4171, "token_acc": 0.8904222209172588, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.8084096303831807, "grad_norm": 0.10606292635202408, "learning_rate": 2.9135234150853276e-05, "loss": 0.35702183842658997, "memory(GiB)": 78.33, "step": 4172, "token_acc": 0.8924521259797307, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.8086034006685074, "grad_norm": 0.09554623067378998, "learning_rate": 2.907832623669559e-05, "loss": 0.32034456729888916, "memory(GiB)": 78.33, "step": 4173, "token_acc": 0.9017811962351988, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.8087971709538342, "grad_norm": 0.09658126533031464, "learning_rate": 2.90214679883186e-05, "loss": 0.3179253339767456, "memory(GiB)": 78.33, "step": 4174, "token_acc": 0.9031473726867486, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.8089909412391609, "grad_norm": 0.09535779058933258, "learning_rate": 2.8964659429075543e-05, "loss": 0.3256949186325073, "memory(GiB)": 78.33, "step": 4175, "token_acc": 0.9016154247003648, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.8091847115244877, "grad_norm": 0.09355632960796356, "learning_rate": 2.890790058229919e-05, "loss": 0.3242107629776001, "memory(GiB)": 78.33, "step": 4176, "token_acc": 0.9035344140313579, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.8093784818098144, "grad_norm": 0.10124616324901581, "learning_rate": 2.8851191471301903e-05, "loss": 0.31790977716445923, "memory(GiB)": 78.33, "step": 4177, "token_acc": 0.9053812224322622, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.8095722520951412, "grad_norm": 0.10209498554468155, "learning_rate": 2.8794532119375712e-05, "loss": 0.3283519148826599, "memory(GiB)": 78.33, "step": 4178, "token_acc": 0.9008973858759266, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.8097660223804679, "grad_norm": 0.11376667767763138, "learning_rate": 2.8737922549792103e-05, "loss": 0.3559627830982208, "memory(GiB)": 78.33, "step": 4179, "token_acc": 0.8943670846197467, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.8099597926657947, "grad_norm": 0.0886598601937294, "learning_rate": 2.868136278580214e-05, "loss": 0.3123997449874878, "memory(GiB)": 78.33, "step": 4180, "token_acc": 0.9062552047395845, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.8101535629511214, "grad_norm": 0.09667520970106125, "learning_rate": 2.8624852850636432e-05, "loss": 0.3109186589717865, "memory(GiB)": 78.33, "step": 4181, "token_acc": 0.9074359598582596, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.8103473332364481, "grad_norm": 0.09185932576656342, "learning_rate": 2.856839276750514e-05, "loss": 0.3074433505535126, "memory(GiB)": 78.33, "step": 4182, "token_acc": 0.9070412315028378, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.8105411035217749, "grad_norm": 0.09254490584135056, "learning_rate": 2.851198255959793e-05, "loss": 0.3106043040752411, "memory(GiB)": 78.33, "step": 4183, "token_acc": 0.9067268041237113, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.8107348738071016, "grad_norm": 0.10701245069503784, "learning_rate": 2.8455622250083953e-05, "loss": 0.32825660705566406, "memory(GiB)": 78.33, "step": 4184, "token_acc": 0.9028698224852071, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.8109286440924284, "grad_norm": 0.09500328451395035, "learning_rate": 2.8399311862111978e-05, "loss": 0.28734296560287476, "memory(GiB)": 78.33, "step": 4185, "token_acc": 0.9113909591935806, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.8111224143777551, "grad_norm": 0.11596144735813141, "learning_rate": 2.834305141881017e-05, "loss": 0.3888537287712097, "memory(GiB)": 78.33, "step": 4186, "token_acc": 0.8838215903227655, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.8113161846630819, "grad_norm": 0.10639077425003052, "learning_rate": 2.8286840943286178e-05, "loss": 0.34377238154411316, "memory(GiB)": 78.33, "step": 4187, "token_acc": 0.9001137980085349, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.8115099549484086, "grad_norm": 0.10659915953874588, "learning_rate": 2.823068045862718e-05, "loss": 0.36640581488609314, "memory(GiB)": 78.33, "step": 4188, "token_acc": 0.8934043246767358, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.8117037252337354, "grad_norm": 0.10051631182432175, "learning_rate": 2.817456998789978e-05, "loss": 0.3765765428543091, "memory(GiB)": 78.33, "step": 4189, "token_acc": 0.8884732824427481, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.8118974955190622, "grad_norm": 0.10304338485002518, "learning_rate": 2.8118509554150076e-05, "loss": 0.34053856134414673, "memory(GiB)": 78.33, "step": 4190, "token_acc": 0.8995534483709136, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.812091265804389, "grad_norm": 0.09566718339920044, "learning_rate": 2.8062499180403532e-05, "loss": 0.3288200795650482, "memory(GiB)": 78.33, "step": 4191, "token_acc": 0.900259556989139, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.8122850360897157, "grad_norm": 0.10790709406137466, "learning_rate": 2.800653888966519e-05, "loss": 0.32764533162117004, "memory(GiB)": 78.33, "step": 4192, "token_acc": 0.9018782286721013, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.8124788063750424, "grad_norm": 0.09332863241434097, "learning_rate": 2.7950628704919426e-05, "loss": 0.3051561415195465, "memory(GiB)": 78.33, "step": 4193, "token_acc": 0.9071980963712076, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.8126725766603692, "grad_norm": 0.10486903041601181, "learning_rate": 2.7894768649130044e-05, "loss": 0.358365923166275, "memory(GiB)": 78.33, "step": 4194, "token_acc": 0.8943505007578234, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.8128663469456959, "grad_norm": 0.08717334270477295, "learning_rate": 2.783895874524028e-05, "loss": 0.30397284030914307, "memory(GiB)": 78.33, "step": 4195, "token_acc": 0.9074197783971805, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.8130601172310227, "grad_norm": 0.09161286801099777, "learning_rate": 2.7783199016172765e-05, "loss": 0.3110318183898926, "memory(GiB)": 78.33, "step": 4196, "token_acc": 0.907268415519644, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.8132538875163494, "grad_norm": 0.10017764568328857, "learning_rate": 2.772748948482949e-05, "loss": 0.33178234100341797, "memory(GiB)": 78.33, "step": 4197, "token_acc": 0.8992754418706773, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.8134476578016762, "grad_norm": 0.09984217584133148, "learning_rate": 2.7671830174091824e-05, "loss": 0.33245033025741577, "memory(GiB)": 78.33, "step": 4198, "token_acc": 0.9006675354378872, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.8136414280870029, "grad_norm": 0.09028760343790054, "learning_rate": 2.7616221106820645e-05, "loss": 0.29679808020591736, "memory(GiB)": 78.33, "step": 4199, "token_acc": 0.908898198152728, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.8138351983723296, "grad_norm": 0.10453902184963226, "learning_rate": 2.7560662305856036e-05, "loss": 0.3445345163345337, "memory(GiB)": 78.33, "step": 4200, "token_acc": 0.8972627996028932, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.8140289686576564, "grad_norm": 0.09469582140445709, "learning_rate": 2.7505153794017487e-05, "loss": 0.3133549690246582, "memory(GiB)": 78.33, "step": 4201, "token_acc": 0.905493996966347, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.8142227389429831, "grad_norm": 0.10252521187067032, "learning_rate": 2.744969559410385e-05, "loss": 0.32331135869026184, "memory(GiB)": 78.33, "step": 4202, "token_acc": 0.9024316857682103, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.8144165092283099, "grad_norm": 0.09786242991685867, "learning_rate": 2.7394287728893265e-05, "loss": 0.3233289122581482, "memory(GiB)": 78.33, "step": 4203, "token_acc": 0.9021999564365062, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.8146102795136366, "grad_norm": 0.09682147949934006, "learning_rate": 2.733893022114327e-05, "loss": 0.3290054202079773, "memory(GiB)": 78.33, "step": 4204, "token_acc": 0.9020085944667086, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.8148040497989634, "grad_norm": 0.09070180356502533, "learning_rate": 2.728362309359062e-05, "loss": 0.3250106871128082, "memory(GiB)": 78.33, "step": 4205, "token_acc": 0.9020958593997225, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.8149978200842901, "grad_norm": 0.10017143189907074, "learning_rate": 2.7228366368951525e-05, "loss": 0.36836662888526917, "memory(GiB)": 78.33, "step": 4206, "token_acc": 0.8892802334649929, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.8151915903696169, "grad_norm": 0.10824721306562424, "learning_rate": 2.7173160069921357e-05, "loss": 0.31380200386047363, "memory(GiB)": 78.33, "step": 4207, "token_acc": 0.9043143002803976, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.8153853606549436, "grad_norm": 0.09697490185499191, "learning_rate": 2.7118004219174838e-05, "loss": 0.3323417901992798, "memory(GiB)": 78.33, "step": 4208, "token_acc": 0.8995884773662551, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.8155791309402703, "grad_norm": 0.0982755497097969, "learning_rate": 2.706289883936595e-05, "loss": 0.3338177800178528, "memory(GiB)": 78.33, "step": 4209, "token_acc": 0.8988449691991787, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.8157729012255971, "grad_norm": 0.09161195158958435, "learning_rate": 2.7007843953127917e-05, "loss": 0.3077443540096283, "memory(GiB)": 78.33, "step": 4210, "token_acc": 0.9083794426997657, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.8159666715109238, "grad_norm": 0.10864771902561188, "learning_rate": 2.6952839583073355e-05, "loss": 0.36505550146102905, "memory(GiB)": 78.33, "step": 4211, "token_acc": 0.8941932795618918, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.8161604417962506, "grad_norm": 0.10426463186740875, "learning_rate": 2.6897885751793956e-05, "loss": 0.3455352485179901, "memory(GiB)": 78.33, "step": 4212, "token_acc": 0.8963470566017374, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.8163542120815773, "grad_norm": 0.1076427772641182, "learning_rate": 2.6842982481860768e-05, "loss": 0.3745838403701782, "memory(GiB)": 78.33, "step": 4213, "token_acc": 0.8894979479395064, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.8165479823669041, "grad_norm": 0.09672832489013672, "learning_rate": 2.6788129795824054e-05, "loss": 0.34184157848358154, "memory(GiB)": 78.33, "step": 4214, "token_acc": 0.8963274582336017, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.8167417526522308, "grad_norm": 0.10097566246986389, "learning_rate": 2.6733327716213236e-05, "loss": 0.30746835470199585, "memory(GiB)": 78.33, "step": 4215, "token_acc": 0.9063103281853282, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.8169355229375576, "grad_norm": 0.10790715366601944, "learning_rate": 2.667857626553705e-05, "loss": 0.34957319498062134, "memory(GiB)": 78.33, "step": 4216, "token_acc": 0.8963655436258176, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.8171292932228843, "grad_norm": 0.10515126585960388, "learning_rate": 2.662387546628332e-05, "loss": 0.3403055965900421, "memory(GiB)": 78.33, "step": 4217, "token_acc": 0.8979972597148937, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.817323063508211, "grad_norm": 0.10571117699146271, "learning_rate": 2.6569225340919202e-05, "loss": 0.3421645164489746, "memory(GiB)": 78.33, "step": 4218, "token_acc": 0.8976865691740056, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.8175168337935378, "grad_norm": 0.10033620893955231, "learning_rate": 2.651462591189097e-05, "loss": 0.32853877544403076, "memory(GiB)": 78.33, "step": 4219, "token_acc": 0.9037694765715578, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.8177106040788645, "grad_norm": 0.10828068852424622, "learning_rate": 2.6460077201624058e-05, "loss": 0.3596659302711487, "memory(GiB)": 78.33, "step": 4220, "token_acc": 0.8916654015485046, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.8179043743641913, "grad_norm": 0.09712978452444077, "learning_rate": 2.6405579232523066e-05, "loss": 0.31051602959632874, "memory(GiB)": 78.33, "step": 4221, "token_acc": 0.9060798808091227, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.818098144649518, "grad_norm": 0.10436290502548218, "learning_rate": 2.6351132026971823e-05, "loss": 0.3625693619251251, "memory(GiB)": 78.33, "step": 4222, "token_acc": 0.8920728157908663, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.8182919149348448, "grad_norm": 0.10246887058019638, "learning_rate": 2.6296735607333202e-05, "loss": 0.34993377327919006, "memory(GiB)": 78.33, "step": 4223, "token_acc": 0.8966694249069094, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.8184856852201715, "grad_norm": 0.09719887375831604, "learning_rate": 2.6242389995949286e-05, "loss": 0.31395548582077026, "memory(GiB)": 78.33, "step": 4224, "token_acc": 0.9058810845776508, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.8186794555054983, "grad_norm": 0.09809587150812149, "learning_rate": 2.618809521514132e-05, "loss": 0.3162482976913452, "memory(GiB)": 78.33, "step": 4225, "token_acc": 0.9043280182232346, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.818873225790825, "grad_norm": 0.09690183401107788, "learning_rate": 2.613385128720961e-05, "loss": 0.308881551027298, "memory(GiB)": 78.33, "step": 4226, "token_acc": 0.9085266774992675, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.8190669960761517, "grad_norm": 0.10261222720146179, "learning_rate": 2.6079658234433575e-05, "loss": 0.3542589843273163, "memory(GiB)": 78.33, "step": 4227, "token_acc": 0.8961422008770098, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.8192607663614785, "grad_norm": 0.09491068869829178, "learning_rate": 2.602551607907179e-05, "loss": 0.31204402446746826, "memory(GiB)": 78.33, "step": 4228, "token_acc": 0.9075091359188966, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.8194545366468052, "grad_norm": 0.10251081734895706, "learning_rate": 2.5971424843361865e-05, "loss": 0.3522550165653229, "memory(GiB)": 78.33, "step": 4229, "token_acc": 0.8963797611666712, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.819648306932132, "grad_norm": 0.11336258798837662, "learning_rate": 2.591738454952055e-05, "loss": 0.33237776160240173, "memory(GiB)": 78.33, "step": 4230, "token_acc": 0.8990595976358925, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.8198420772174587, "grad_norm": 0.0893642008304596, "learning_rate": 2.5863395219743565e-05, "loss": 0.3061623275279999, "memory(GiB)": 78.33, "step": 4231, "token_acc": 0.9074907292954264, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.8200358475027855, "grad_norm": 0.10706465691328049, "learning_rate": 2.5809456876205897e-05, "loss": 0.35384026169776917, "memory(GiB)": 78.33, "step": 4232, "token_acc": 0.8951451380857006, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.8202296177881122, "grad_norm": 0.11483049392700195, "learning_rate": 2.575556954106142e-05, "loss": 0.345024049282074, "memory(GiB)": 78.33, "step": 4233, "token_acc": 0.8970334598137288, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.820423388073439, "grad_norm": 0.08666758239269257, "learning_rate": 2.57017332364431e-05, "loss": 0.2846605181694031, "memory(GiB)": 78.33, "step": 4234, "token_acc": 0.9113689315771956, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.8206171583587657, "grad_norm": 0.10516150295734406, "learning_rate": 2.564794798446298e-05, "loss": 0.353068470954895, "memory(GiB)": 78.33, "step": 4235, "token_acc": 0.8956983240223464, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.8208109286440924, "grad_norm": 0.10344908386468887, "learning_rate": 2.559421380721207e-05, "loss": 0.3344540596008301, "memory(GiB)": 78.33, "step": 4236, "token_acc": 0.8966224100413132, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.8210046989294192, "grad_norm": 0.09849216789007187, "learning_rate": 2.554053072676049e-05, "loss": 0.3291811943054199, "memory(GiB)": 78.33, "step": 4237, "token_acc": 0.9034900284900285, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.8211984692147459, "grad_norm": 0.10524845868349075, "learning_rate": 2.5486898765157227e-05, "loss": 0.36191216111183167, "memory(GiB)": 78.33, "step": 4238, "token_acc": 0.8912652571926766, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.8213922395000727, "grad_norm": 0.10183076560497284, "learning_rate": 2.5433317944430497e-05, "loss": 0.31616219878196716, "memory(GiB)": 78.33, "step": 4239, "token_acc": 0.9054447035789353, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.8215860097853994, "grad_norm": 0.10111679881811142, "learning_rate": 2.5379788286587317e-05, "loss": 0.3214479386806488, "memory(GiB)": 78.33, "step": 4240, "token_acc": 0.9007918449076181, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.8217797800707262, "grad_norm": 0.10107513517141342, "learning_rate": 2.532630981361376e-05, "loss": 0.3232714831829071, "memory(GiB)": 78.33, "step": 4241, "token_acc": 0.9016899178246744, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.8219735503560529, "grad_norm": 0.10409308224916458, "learning_rate": 2.5272882547474877e-05, "loss": 0.3329227566719055, "memory(GiB)": 78.33, "step": 4242, "token_acc": 0.8991776806795668, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.8221673206413797, "grad_norm": 0.09440822154283524, "learning_rate": 2.5219506510114647e-05, "loss": 0.33597031235694885, "memory(GiB)": 78.33, "step": 4243, "token_acc": 0.9017857142857143, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.8223610909267064, "grad_norm": 0.09774498641490936, "learning_rate": 2.5166181723456147e-05, "loss": 0.3483470678329468, "memory(GiB)": 78.33, "step": 4244, "token_acc": 0.8975424647632815, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.8225548612120331, "grad_norm": 0.09778609871864319, "learning_rate": 2.5112908209401144e-05, "loss": 0.29115307331085205, "memory(GiB)": 78.33, "step": 4245, "token_acc": 0.9137274419007227, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.8227486314973599, "grad_norm": 0.09965585917234421, "learning_rate": 2.5059685989830636e-05, "loss": 0.32971906661987305, "memory(GiB)": 78.33, "step": 4246, "token_acc": 0.9021879021879022, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.8229424017826866, "grad_norm": 0.10306161642074585, "learning_rate": 2.5006515086604368e-05, "loss": 0.3302563726902008, "memory(GiB)": 78.33, "step": 4247, "token_acc": 0.9004143366267783, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.8231361720680134, "grad_norm": 0.08937028795480728, "learning_rate": 2.4953395521561053e-05, "loss": 0.30359315872192383, "memory(GiB)": 78.33, "step": 4248, "token_acc": 0.9071417392665241, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.8233299423533401, "grad_norm": 0.10544510930776596, "learning_rate": 2.4900327316518326e-05, "loss": 0.3514764606952667, "memory(GiB)": 78.33, "step": 4249, "token_acc": 0.8938008836033232, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.8235237126386669, "grad_norm": 0.10431473702192307, "learning_rate": 2.48473104932727e-05, "loss": 0.3315693438053131, "memory(GiB)": 78.33, "step": 4250, "token_acc": 0.8996884388561255, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.8237174829239936, "grad_norm": 0.10015621036291122, "learning_rate": 2.479434507359967e-05, "loss": 0.321992427110672, "memory(GiB)": 78.33, "step": 4251, "token_acc": 0.9025872260867175, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.8239112532093203, "grad_norm": 0.10041700303554535, "learning_rate": 2.474143107925352e-05, "loss": 0.3204496204853058, "memory(GiB)": 78.33, "step": 4252, "token_acc": 0.9054709141274239, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.8241050234946471, "grad_norm": 0.09637311100959778, "learning_rate": 2.4688568531967467e-05, "loss": 0.31289243698120117, "memory(GiB)": 78.33, "step": 4253, "token_acc": 0.9049771363624349, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.8242987937799738, "grad_norm": 0.10459338128566742, "learning_rate": 2.463575745345356e-05, "loss": 0.34315067529678345, "memory(GiB)": 78.33, "step": 4254, "token_acc": 0.8960445130013222, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.8244925640653006, "grad_norm": 0.09525644779205322, "learning_rate": 2.4582997865402727e-05, "loss": 0.32378891110420227, "memory(GiB)": 78.33, "step": 4255, "token_acc": 0.9033018867924528, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.8246863343506273, "grad_norm": 0.0998750701546669, "learning_rate": 2.453028978948477e-05, "loss": 0.31236085295677185, "memory(GiB)": 78.33, "step": 4256, "token_acc": 0.9072579542034853, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.8248801046359541, "grad_norm": 0.09134574234485626, "learning_rate": 2.4477633247348238e-05, "loss": 0.3001454472541809, "memory(GiB)": 78.33, "step": 4257, "token_acc": 0.9097239492663517, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.8250738749212808, "grad_norm": 0.09457145631313324, "learning_rate": 2.4425028260620715e-05, "loss": 0.32455000281333923, "memory(GiB)": 78.33, "step": 4258, "token_acc": 0.902986820556812, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.8252676452066076, "grad_norm": 0.1064552590250969, "learning_rate": 2.4372474850908404e-05, "loss": 0.3257259130477905, "memory(GiB)": 78.33, "step": 4259, "token_acc": 0.9031152183633925, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.8254614154919343, "grad_norm": 0.10509302467107773, "learning_rate": 2.4319973039796397e-05, "loss": 0.3298068344593048, "memory(GiB)": 78.33, "step": 4260, "token_acc": 0.8993223921422355, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.825655185777261, "grad_norm": 0.09444321691989899, "learning_rate": 2.4267522848848635e-05, "loss": 0.29824700951576233, "memory(GiB)": 78.33, "step": 4261, "token_acc": 0.908746618575293, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.8258489560625878, "grad_norm": 0.09932407736778259, "learning_rate": 2.4215124299607802e-05, "loss": 0.33362001180648804, "memory(GiB)": 78.33, "step": 4262, "token_acc": 0.8990427838595377, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.8260427263479145, "grad_norm": 0.09288123995065689, "learning_rate": 2.416277741359538e-05, "loss": 0.31670305132865906, "memory(GiB)": 78.33, "step": 4263, "token_acc": 0.9042440573573285, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.8262364966332413, "grad_norm": 0.11048420518636703, "learning_rate": 2.411048221231162e-05, "loss": 0.3662915825843811, "memory(GiB)": 78.33, "step": 4264, "token_acc": 0.8945862079354081, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.826430266918568, "grad_norm": 0.10002768039703369, "learning_rate": 2.4058238717235628e-05, "loss": 0.32539236545562744, "memory(GiB)": 78.33, "step": 4265, "token_acc": 0.9030578297219892, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.8266240372038948, "grad_norm": 0.10625031590461731, "learning_rate": 2.4006046949825186e-05, "loss": 0.31954246759414673, "memory(GiB)": 78.33, "step": 4266, "token_acc": 0.9040422214225741, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.8268178074892215, "grad_norm": 0.09548264741897583, "learning_rate": 2.3953906931516848e-05, "loss": 0.31130972504615784, "memory(GiB)": 78.33, "step": 4267, "token_acc": 0.9068991470145509, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.8270115777745483, "grad_norm": 0.1018620952963829, "learning_rate": 2.390181868372593e-05, "loss": 0.37592288851737976, "memory(GiB)": 78.33, "step": 4268, "token_acc": 0.8873175527489979, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.827205348059875, "grad_norm": 0.10313120484352112, "learning_rate": 2.384978222784646e-05, "loss": 0.3224887251853943, "memory(GiB)": 78.33, "step": 4269, "token_acc": 0.9022740524781341, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.8273991183452017, "grad_norm": 0.1032845601439476, "learning_rate": 2.379779758525123e-05, "loss": 0.33453071117401123, "memory(GiB)": 78.33, "step": 4270, "token_acc": 0.8983879998821207, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.8275928886305285, "grad_norm": 0.09773199260234833, "learning_rate": 2.3745864777291674e-05, "loss": 0.3257528841495514, "memory(GiB)": 78.33, "step": 4271, "token_acc": 0.9016068290233492, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.8277866589158552, "grad_norm": 0.09065309911966324, "learning_rate": 2.369398382529807e-05, "loss": 0.29769429564476013, "memory(GiB)": 78.33, "step": 4272, "token_acc": 0.9092235329627487, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.827980429201182, "grad_norm": 0.08960139006376266, "learning_rate": 2.3642154750579272e-05, "loss": 0.31141579151153564, "memory(GiB)": 78.33, "step": 4273, "token_acc": 0.9071020707684546, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.8281741994865087, "grad_norm": 0.10664994269609451, "learning_rate": 2.3590377574422892e-05, "loss": 0.37568601965904236, "memory(GiB)": 78.33, "step": 4274, "token_acc": 0.8949000498366465, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.8283679697718355, "grad_norm": 0.11605322360992432, "learning_rate": 2.3538652318095198e-05, "loss": 0.3583415746688843, "memory(GiB)": 78.33, "step": 4275, "token_acc": 0.8928342520189719, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.8285617400571622, "grad_norm": 0.10450875759124756, "learning_rate": 2.348697900284111e-05, "loss": 0.34997832775115967, "memory(GiB)": 78.33, "step": 4276, "token_acc": 0.8950859618248274, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.828755510342489, "grad_norm": 0.09127886593341827, "learning_rate": 2.3435357649884357e-05, "loss": 0.31003209948539734, "memory(GiB)": 78.33, "step": 4277, "token_acc": 0.9049302739590545, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.8289492806278157, "grad_norm": 0.10241317003965378, "learning_rate": 2.3383788280427074e-05, "loss": 0.347523957490921, "memory(GiB)": 78.33, "step": 4278, "token_acc": 0.8969976905311778, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.8291430509131424, "grad_norm": 0.09595993906259537, "learning_rate": 2.3332270915650285e-05, "loss": 0.3249264061450958, "memory(GiB)": 78.33, "step": 4279, "token_acc": 0.9030240097254584, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.8293368211984692, "grad_norm": 0.0956515520811081, "learning_rate": 2.328080557671352e-05, "loss": 0.3111531734466553, "memory(GiB)": 78.33, "step": 4280, "token_acc": 0.9046177726038028, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.8295305914837959, "grad_norm": 0.08913633227348328, "learning_rate": 2.3229392284754994e-05, "loss": 0.30270490050315857, "memory(GiB)": 78.33, "step": 4281, "token_acc": 0.9090954225002482, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.8297243617691227, "grad_norm": 0.10194297879934311, "learning_rate": 2.3178031060891507e-05, "loss": 0.35240015387535095, "memory(GiB)": 78.33, "step": 4282, "token_acc": 0.8955933833943508, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.8299181320544494, "grad_norm": 0.10122035443782806, "learning_rate": 2.312672192621846e-05, "loss": 0.33214977383613586, "memory(GiB)": 78.33, "step": 4283, "token_acc": 0.9003510196977169, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.8301119023397762, "grad_norm": 0.09273627400398254, "learning_rate": 2.307546490180997e-05, "loss": 0.3047739863395691, "memory(GiB)": 78.33, "step": 4284, "token_acc": 0.909302266165913, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.8303056726251029, "grad_norm": 0.1077071949839592, "learning_rate": 2.3024260008718642e-05, "loss": 0.34293609857559204, "memory(GiB)": 78.33, "step": 4285, "token_acc": 0.896573135034446, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.8304994429104297, "grad_norm": 0.0920027419924736, "learning_rate": 2.2973107267975703e-05, "loss": 0.3134732246398926, "memory(GiB)": 78.33, "step": 4286, "token_acc": 0.9056259577262666, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.8306932131957564, "grad_norm": 0.09819093346595764, "learning_rate": 2.292200670059095e-05, "loss": 0.32229092717170715, "memory(GiB)": 78.33, "step": 4287, "token_acc": 0.9032614234754544, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.8308869834810831, "grad_norm": 0.10240425914525986, "learning_rate": 2.2870958327552774e-05, "loss": 0.35109835863113403, "memory(GiB)": 78.33, "step": 4288, "token_acc": 0.8959884028881269, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.8310807537664099, "grad_norm": 0.10077088326215744, "learning_rate": 2.2819962169828088e-05, "loss": 0.32499995827674866, "memory(GiB)": 78.33, "step": 4289, "token_acc": 0.9027956087719758, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.8312745240517366, "grad_norm": 0.09702709317207336, "learning_rate": 2.276901824836237e-05, "loss": 0.3244347870349884, "memory(GiB)": 78.33, "step": 4290, "token_acc": 0.9018396633577902, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.8314682943370634, "grad_norm": 0.10198424011468887, "learning_rate": 2.2718126584079734e-05, "loss": 0.3321649730205536, "memory(GiB)": 78.33, "step": 4291, "token_acc": 0.9009141765588757, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.8316620646223901, "grad_norm": 0.1072782501578331, "learning_rate": 2.266728719788269e-05, "loss": 0.3425625264644623, "memory(GiB)": 78.33, "step": 4292, "token_acc": 0.8971817900278725, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.8318558349077169, "grad_norm": 0.10138484835624695, "learning_rate": 2.2616500110652352e-05, "loss": 0.32518693804740906, "memory(GiB)": 78.33, "step": 4293, "token_acc": 0.9017971942055487, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.8320496051930436, "grad_norm": 0.09211868792772293, "learning_rate": 2.2565765343248353e-05, "loss": 0.3022071123123169, "memory(GiB)": 78.33, "step": 4294, "token_acc": 0.9080617941061924, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.8322433754783704, "grad_norm": 0.09903164952993393, "learning_rate": 2.2515082916508824e-05, "loss": 0.3520633280277252, "memory(GiB)": 78.33, "step": 4295, "token_acc": 0.8949784239892258, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.8324371457636971, "grad_norm": 0.10028848052024841, "learning_rate": 2.24644528512504e-05, "loss": 0.34844347834587097, "memory(GiB)": 78.33, "step": 4296, "token_acc": 0.8961258624855694, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.8326309160490238, "grad_norm": 0.0993753969669342, "learning_rate": 2.2413875168268154e-05, "loss": 0.35692083835601807, "memory(GiB)": 78.33, "step": 4297, "token_acc": 0.8944187141847869, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.8328246863343506, "grad_norm": 0.10167410969734192, "learning_rate": 2.2363349888335775e-05, "loss": 0.349956214427948, "memory(GiB)": 78.33, "step": 4298, "token_acc": 0.895724891202321, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.8330184566196773, "grad_norm": 0.0991780236363411, "learning_rate": 2.2312877032205346e-05, "loss": 0.3107830584049225, "memory(GiB)": 78.33, "step": 4299, "token_acc": 0.9071795583946675, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.8332122269050041, "grad_norm": 0.09722703695297241, "learning_rate": 2.22624566206074e-05, "loss": 0.3044697344303131, "memory(GiB)": 78.33, "step": 4300, "token_acc": 0.9069954220544535, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.8334059971903308, "grad_norm": 0.09022455662488937, "learning_rate": 2.2212088674250956e-05, "loss": 0.2870331108570099, "memory(GiB)": 78.33, "step": 4301, "token_acc": 0.91363100759646, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.8335997674756576, "grad_norm": 0.1041305810213089, "learning_rate": 2.216177321382348e-05, "loss": 0.3358173966407776, "memory(GiB)": 78.33, "step": 4302, "token_acc": 0.8962132245849112, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.8337935377609843, "grad_norm": 0.10265124589204788, "learning_rate": 2.2111510259990913e-05, "loss": 0.3599543869495392, "memory(GiB)": 78.33, "step": 4303, "token_acc": 0.8918628427620197, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.833987308046311, "grad_norm": 0.0928591713309288, "learning_rate": 2.2061299833397532e-05, "loss": 0.29637521505355835, "memory(GiB)": 78.33, "step": 4304, "token_acc": 0.910638866613205, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.8341810783316378, "grad_norm": 0.0997898206114769, "learning_rate": 2.2011141954666185e-05, "loss": 0.33638525009155273, "memory(GiB)": 78.33, "step": 4305, "token_acc": 0.8974762101779065, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.8343748486169645, "grad_norm": 0.10566650331020355, "learning_rate": 2.1961036644398035e-05, "loss": 0.3401532769203186, "memory(GiB)": 78.33, "step": 4306, "token_acc": 0.8974769961412882, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.8345686189022913, "grad_norm": 0.11155321449041367, "learning_rate": 2.1910983923172686e-05, "loss": 0.29791954159736633, "memory(GiB)": 78.33, "step": 4307, "token_acc": 0.910283068563032, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.834762389187618, "grad_norm": 0.10973121225833893, "learning_rate": 2.1860983811548118e-05, "loss": 0.3326932191848755, "memory(GiB)": 78.33, "step": 4308, "token_acc": 0.8998096353785328, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.8349561594729448, "grad_norm": 0.08969270437955856, "learning_rate": 2.1811036330060676e-05, "loss": 0.30071404576301575, "memory(GiB)": 78.33, "step": 4309, "token_acc": 0.9097683498797705, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.8351499297582715, "grad_norm": 0.09654086083173752, "learning_rate": 2.1761141499225278e-05, "loss": 0.3438301384449005, "memory(GiB)": 78.33, "step": 4310, "token_acc": 0.8972290781405053, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.8353437000435984, "grad_norm": 0.09115693718194962, "learning_rate": 2.171129933953489e-05, "loss": 0.2993430197238922, "memory(GiB)": 78.33, "step": 4311, "token_acc": 0.9102314363897779, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.8355374703289251, "grad_norm": 0.1047009527683258, "learning_rate": 2.1661509871461168e-05, "loss": 0.33541637659072876, "memory(GiB)": 78.33, "step": 4312, "token_acc": 0.8988593808067237, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.8357312406142519, "grad_norm": 0.09559044986963272, "learning_rate": 2.1611773115453913e-05, "loss": 0.3060503900051117, "memory(GiB)": 78.33, "step": 4313, "token_acc": 0.9065379777703622, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.8359250108995786, "grad_norm": 0.10747367143630981, "learning_rate": 2.1562089091941376e-05, "loss": 0.3518577218055725, "memory(GiB)": 78.33, "step": 4314, "token_acc": 0.8972739541160594, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.8361187811849053, "grad_norm": 0.10569079220294952, "learning_rate": 2.1512457821330102e-05, "loss": 0.33122116327285767, "memory(GiB)": 78.33, "step": 4315, "token_acc": 0.9022820362785254, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.8363125514702321, "grad_norm": 0.10672451555728912, "learning_rate": 2.1462879324004973e-05, "loss": 0.356486439704895, "memory(GiB)": 78.33, "step": 4316, "token_acc": 0.892833182626958, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.8365063217555588, "grad_norm": 0.10485262423753738, "learning_rate": 2.1413353620329294e-05, "loss": 0.33778226375579834, "memory(GiB)": 78.33, "step": 4317, "token_acc": 0.8977509922093194, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.8367000920408856, "grad_norm": 0.1112034022808075, "learning_rate": 2.136388073064446e-05, "loss": 0.31891724467277527, "memory(GiB)": 78.33, "step": 4318, "token_acc": 0.9021532012195121, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.8368938623262123, "grad_norm": 0.09961036592721939, "learning_rate": 2.131446067527044e-05, "loss": 0.3315172493457794, "memory(GiB)": 78.33, "step": 4319, "token_acc": 0.9015923147782603, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.8370876326115391, "grad_norm": 0.10558105260133743, "learning_rate": 2.126509347450534e-05, "loss": 0.36373376846313477, "memory(GiB)": 78.33, "step": 4320, "token_acc": 0.8906225980015373, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.8372814028968658, "grad_norm": 0.09915536642074585, "learning_rate": 2.1215779148625578e-05, "loss": 0.3262394964694977, "memory(GiB)": 78.33, "step": 4321, "token_acc": 0.8996407847471677, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.8374751731821926, "grad_norm": 0.11209924519062042, "learning_rate": 2.11665177178859e-05, "loss": 0.3615911900997162, "memory(GiB)": 78.33, "step": 4322, "token_acc": 0.8943596998400787, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.8376689434675193, "grad_norm": 0.09516075998544693, "learning_rate": 2.111730920251924e-05, "loss": 0.3138526678085327, "memory(GiB)": 78.33, "step": 4323, "token_acc": 0.9044929966462814, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.837862713752846, "grad_norm": 0.10355053097009659, "learning_rate": 2.1068153622736943e-05, "loss": 0.3259844183921814, "memory(GiB)": 78.33, "step": 4324, "token_acc": 0.902735473289597, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.8380564840381728, "grad_norm": 0.11187773942947388, "learning_rate": 2.101905099872848e-05, "loss": 0.3713458776473999, "memory(GiB)": 78.33, "step": 4325, "token_acc": 0.888154201235107, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.8382502543234995, "grad_norm": 0.10848337411880493, "learning_rate": 2.0970001350661635e-05, "loss": 0.3444962799549103, "memory(GiB)": 78.33, "step": 4326, "token_acc": 0.8968367889420521, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.8384440246088263, "grad_norm": 0.08781024068593979, "learning_rate": 2.0921004698682407e-05, "loss": 0.328144371509552, "memory(GiB)": 78.33, "step": 4327, "token_acc": 0.9009247631008106, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.838637794894153, "grad_norm": 0.09519962966442108, "learning_rate": 2.087206106291502e-05, "loss": 0.31252622604370117, "memory(GiB)": 78.33, "step": 4328, "token_acc": 0.9068911656474201, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.8388315651794798, "grad_norm": 0.09435832500457764, "learning_rate": 2.082317046346197e-05, "loss": 0.32768622040748596, "memory(GiB)": 78.33, "step": 4329, "token_acc": 0.900981393912031, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.8390253354648065, "grad_norm": 0.10018561035394669, "learning_rate": 2.077433292040388e-05, "loss": 0.3233901560306549, "memory(GiB)": 78.33, "step": 4330, "token_acc": 0.9020335985853227, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.8392191057501333, "grad_norm": 0.09643685072660446, "learning_rate": 2.072554845379974e-05, "loss": 0.316950261592865, "memory(GiB)": 78.33, "step": 4331, "token_acc": 0.9074275988617929, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.83941287603546, "grad_norm": 0.0906091183423996, "learning_rate": 2.067681708368657e-05, "loss": 0.31494781374931335, "memory(GiB)": 78.33, "step": 4332, "token_acc": 0.9049370490875654, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.8396066463207867, "grad_norm": 0.10366849601268768, "learning_rate": 2.0628138830079695e-05, "loss": 0.3571404218673706, "memory(GiB)": 78.33, "step": 4333, "token_acc": 0.8948438109026116, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.8398004166061135, "grad_norm": 0.10383996367454529, "learning_rate": 2.0579513712972535e-05, "loss": 0.3651527166366577, "memory(GiB)": 78.33, "step": 4334, "token_acc": 0.8928912539227726, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.8399941868914402, "grad_norm": 0.1002940759062767, "learning_rate": 2.0530941752336767e-05, "loss": 0.33056554198265076, "memory(GiB)": 78.33, "step": 4335, "token_acc": 0.9013376077824453, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.840187957176767, "grad_norm": 0.09768969565629959, "learning_rate": 2.0482422968122198e-05, "loss": 0.327396422624588, "memory(GiB)": 78.33, "step": 4336, "token_acc": 0.9023116889636305, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.8403817274620937, "grad_norm": 0.0857066735625267, "learning_rate": 2.043395738025674e-05, "loss": 0.30105772614479065, "memory(GiB)": 78.33, "step": 4337, "token_acc": 0.9095752105336621, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.8405754977474205, "grad_norm": 0.09931961447000504, "learning_rate": 2.0385545008646597e-05, "loss": 0.3311472535133362, "memory(GiB)": 78.33, "step": 4338, "token_acc": 0.9012172284644194, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.8407692680327472, "grad_norm": 0.10935309529304504, "learning_rate": 2.0337185873176004e-05, "loss": 0.34328311681747437, "memory(GiB)": 78.33, "step": 4339, "token_acc": 0.8972643180476284, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.840963038318074, "grad_norm": 0.08845790475606918, "learning_rate": 2.0288879993707335e-05, "loss": 0.2846664786338806, "memory(GiB)": 78.33, "step": 4340, "token_acc": 0.9113034491667743, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.8411568086034007, "grad_norm": 0.10597343742847443, "learning_rate": 2.0240627390081137e-05, "loss": 0.32449671626091003, "memory(GiB)": 78.33, "step": 4341, "token_acc": 0.9027229011608319, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.8413505788887274, "grad_norm": 0.1017051637172699, "learning_rate": 2.0192428082115992e-05, "loss": 0.32706570625305176, "memory(GiB)": 78.33, "step": 4342, "token_acc": 0.9033635878270155, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.8415443491740542, "grad_norm": 0.10161115974187851, "learning_rate": 2.0144282089608778e-05, "loss": 0.34076470136642456, "memory(GiB)": 78.33, "step": 4343, "token_acc": 0.8972409669428547, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.8417381194593809, "grad_norm": 0.10345038771629333, "learning_rate": 2.009618943233419e-05, "loss": 0.3462626338005066, "memory(GiB)": 78.33, "step": 4344, "token_acc": 0.8954670063412068, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.8419318897447077, "grad_norm": 0.10364454239606857, "learning_rate": 2.0048150130045303e-05, "loss": 0.35689064860343933, "memory(GiB)": 78.33, "step": 4345, "token_acc": 0.893178860336336, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.8421256600300344, "grad_norm": 0.08873011916875839, "learning_rate": 2.000016420247308e-05, "loss": 0.28760528564453125, "memory(GiB)": 78.33, "step": 4346, "token_acc": 0.9117839149094981, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.8423194303153612, "grad_norm": 0.10340781509876251, "learning_rate": 1.9952231669326668e-05, "loss": 0.3450697958469391, "memory(GiB)": 78.33, "step": 4347, "token_acc": 0.896965636602183, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.8425132006006879, "grad_norm": 0.08939331769943237, "learning_rate": 1.9904352550293224e-05, "loss": 0.28838780522346497, "memory(GiB)": 78.33, "step": 4348, "token_acc": 0.9093310888218653, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.8427069708860147, "grad_norm": 0.09946542233228683, "learning_rate": 1.9856526865037947e-05, "loss": 0.322831392288208, "memory(GiB)": 78.33, "step": 4349, "token_acc": 0.90141040206157, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.8429007411713414, "grad_norm": 0.10646877437829971, "learning_rate": 1.980875463320426e-05, "loss": 0.3633573651313782, "memory(GiB)": 78.33, "step": 4350, "token_acc": 0.8931559868080351, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.8430945114566681, "grad_norm": 0.09976527839899063, "learning_rate": 1.9761035874413333e-05, "loss": 0.3256649374961853, "memory(GiB)": 78.33, "step": 4351, "token_acc": 0.9018026445528784, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.8432882817419949, "grad_norm": 0.10646556317806244, "learning_rate": 1.9713370608264674e-05, "loss": 0.32194364070892334, "memory(GiB)": 78.33, "step": 4352, "token_acc": 0.9037880343886623, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.8434820520273216, "grad_norm": 0.10743288695812225, "learning_rate": 1.966575885433565e-05, "loss": 0.34353500604629517, "memory(GiB)": 78.33, "step": 4353, "token_acc": 0.8954018292107992, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.8436758223126484, "grad_norm": 0.1006690040230751, "learning_rate": 1.9618200632181673e-05, "loss": 0.3364703059196472, "memory(GiB)": 78.33, "step": 4354, "token_acc": 0.8980853059647961, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.8438695925979751, "grad_norm": 0.09362746775150299, "learning_rate": 1.9570695961336203e-05, "loss": 0.28917554020881653, "memory(GiB)": 78.33, "step": 4355, "token_acc": 0.9127130038690557, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.8440633628833019, "grad_norm": 0.11022236943244934, "learning_rate": 1.9523244861310626e-05, "loss": 0.31461644172668457, "memory(GiB)": 78.33, "step": 4356, "token_acc": 0.905908453993461, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.8442571331686286, "grad_norm": 0.09002205729484558, "learning_rate": 1.9475847351594458e-05, "loss": 0.28434687852859497, "memory(GiB)": 78.33, "step": 4357, "token_acc": 0.9123356771629471, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.8444509034539553, "grad_norm": 0.09616533666849136, "learning_rate": 1.9428503451655125e-05, "loss": 0.33912795782089233, "memory(GiB)": 78.33, "step": 4358, "token_acc": 0.8976422492479367, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.8446446737392821, "grad_norm": 0.09272696822881699, "learning_rate": 1.9381213180938003e-05, "loss": 0.30497756600379944, "memory(GiB)": 78.33, "step": 4359, "token_acc": 0.9095019342359768, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.8448384440246088, "grad_norm": 0.09096968919038773, "learning_rate": 1.9333976558866476e-05, "loss": 0.295366108417511, "memory(GiB)": 78.33, "step": 4360, "token_acc": 0.9114807813484562, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.8450322143099356, "grad_norm": 0.1177552342414856, "learning_rate": 1.928679360484194e-05, "loss": 0.3394724428653717, "memory(GiB)": 78.33, "step": 4361, "token_acc": 0.8994290351563632, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.8452259845952623, "grad_norm": 0.10825799405574799, "learning_rate": 1.9239664338243637e-05, "loss": 0.32095205783843994, "memory(GiB)": 78.33, "step": 4362, "token_acc": 0.90408615136876, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.8454197548805891, "grad_norm": 0.09334609657526016, "learning_rate": 1.9192588778428842e-05, "loss": 0.30651578307151794, "memory(GiB)": 78.33, "step": 4363, "token_acc": 0.907121790842721, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.8456135251659158, "grad_norm": 0.10030210018157959, "learning_rate": 1.91455669447328e-05, "loss": 0.3436662554740906, "memory(GiB)": 78.33, "step": 4364, "token_acc": 0.9008844323772145, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.8458072954512426, "grad_norm": 0.09695718437433243, "learning_rate": 1.909859885646861e-05, "loss": 0.33061671257019043, "memory(GiB)": 78.33, "step": 4365, "token_acc": 0.9015153412648717, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.8460010657365693, "grad_norm": 0.1165538877248764, "learning_rate": 1.9051684532927332e-05, "loss": 0.3574296534061432, "memory(GiB)": 78.33, "step": 4366, "token_acc": 0.8923136197264496, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.846194836021896, "grad_norm": 0.09451232105493546, "learning_rate": 1.9004823993377927e-05, "loss": 0.3224382996559143, "memory(GiB)": 78.33, "step": 4367, "token_acc": 0.9037022153011504, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.8463886063072228, "grad_norm": 0.08888131380081177, "learning_rate": 1.895801725706727e-05, "loss": 0.2944706082344055, "memory(GiB)": 78.33, "step": 4368, "token_acc": 0.9106517341753416, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.8465823765925495, "grad_norm": 0.10048877447843552, "learning_rate": 1.8911264343220184e-05, "loss": 0.3280665874481201, "memory(GiB)": 78.33, "step": 4369, "token_acc": 0.9011113631208891, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.8467761468778763, "grad_norm": 0.10502710193395615, "learning_rate": 1.8864565271039274e-05, "loss": 0.3245999217033386, "memory(GiB)": 78.33, "step": 4370, "token_acc": 0.9021815622800844, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.846969917163203, "grad_norm": 0.1014811173081398, "learning_rate": 1.8817920059705194e-05, "loss": 0.3340781629085541, "memory(GiB)": 78.33, "step": 4371, "token_acc": 0.8999215070643642, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.8471636874485298, "grad_norm": 0.09874725341796875, "learning_rate": 1.8771328728376338e-05, "loss": 0.3380013704299927, "memory(GiB)": 78.33, "step": 4372, "token_acc": 0.8971284634760706, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.8473574577338565, "grad_norm": 0.14172475039958954, "learning_rate": 1.8724791296189034e-05, "loss": 0.32453781366348267, "memory(GiB)": 78.33, "step": 4373, "token_acc": 0.9030038451530833, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.8475512280191833, "grad_norm": 0.11031965911388397, "learning_rate": 1.867830778225744e-05, "loss": 0.3707287907600403, "memory(GiB)": 78.33, "step": 4374, "token_acc": 0.892270504861836, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.84774499830451, "grad_norm": 0.10298004001379013, "learning_rate": 1.8631878205673552e-05, "loss": 0.34424299001693726, "memory(GiB)": 78.33, "step": 4375, "token_acc": 0.8972931413876296, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.8479387685898367, "grad_norm": 0.11782942712306976, "learning_rate": 1.858550258550736e-05, "loss": 0.32887881994247437, "memory(GiB)": 78.33, "step": 4376, "token_acc": 0.9019160805238904, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.8481325388751635, "grad_norm": 0.09481247514486313, "learning_rate": 1.8539180940806436e-05, "loss": 0.32029709219932556, "memory(GiB)": 78.33, "step": 4377, "token_acc": 0.9027707297269867, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.8483263091604902, "grad_norm": 0.11217660456895828, "learning_rate": 1.8492913290596407e-05, "loss": 0.3478485345840454, "memory(GiB)": 78.33, "step": 4378, "token_acc": 0.8978001381079215, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.848520079445817, "grad_norm": 0.10071595013141632, "learning_rate": 1.8446699653880638e-05, "loss": 0.32309237122535706, "memory(GiB)": 78.33, "step": 4379, "token_acc": 0.9025203747208446, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.8487138497311437, "grad_norm": 0.10318909585475922, "learning_rate": 1.8400540049640278e-05, "loss": 0.3701336681842804, "memory(GiB)": 78.33, "step": 4380, "token_acc": 0.8895177546906498, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.8489076200164705, "grad_norm": 0.1191054955124855, "learning_rate": 1.8354434496834346e-05, "loss": 0.37648195028305054, "memory(GiB)": 78.33, "step": 4381, "token_acc": 0.8879827766179541, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.8491013903017972, "grad_norm": 0.09198344498872757, "learning_rate": 1.830838301439958e-05, "loss": 0.32460615038871765, "memory(GiB)": 78.33, "step": 4382, "token_acc": 0.9025374855824683, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.849295160587124, "grad_norm": 0.09382152557373047, "learning_rate": 1.826238562125068e-05, "loss": 0.31965401768684387, "memory(GiB)": 78.33, "step": 4383, "token_acc": 0.9034426770175306, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.8494889308724507, "grad_norm": 0.10211214423179626, "learning_rate": 1.821644233627985e-05, "loss": 0.35185420513153076, "memory(GiB)": 78.33, "step": 4384, "token_acc": 0.8940360240160107, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.8496827011577774, "grad_norm": 0.09837459772825241, "learning_rate": 1.8170553178357366e-05, "loss": 0.3056308627128601, "memory(GiB)": 78.33, "step": 4385, "token_acc": 0.9083345412378605, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.8498764714431042, "grad_norm": 0.09455987811088562, "learning_rate": 1.8124718166331066e-05, "loss": 0.2900204658508301, "memory(GiB)": 78.33, "step": 4386, "token_acc": 0.9121223003129328, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.8500702417284309, "grad_norm": 0.09906947612762451, "learning_rate": 1.8078937319026654e-05, "loss": 0.33766764402389526, "memory(GiB)": 78.33, "step": 4387, "token_acc": 0.8973069206205668, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.8502640120137577, "grad_norm": 0.09509405493736267, "learning_rate": 1.8033210655247527e-05, "loss": 0.3071034550666809, "memory(GiB)": 78.33, "step": 4388, "token_acc": 0.9057726952298896, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.8504577822990844, "grad_norm": 0.10163518041372299, "learning_rate": 1.7987538193774857e-05, "loss": 0.34781065583229065, "memory(GiB)": 78.33, "step": 4389, "token_acc": 0.8967510195262917, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.8506515525844112, "grad_norm": 0.09907959401607513, "learning_rate": 1.794191995336761e-05, "loss": 0.3162297308444977, "memory(GiB)": 78.33, "step": 4390, "token_acc": 0.9030727923627685, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.8508453228697379, "grad_norm": 0.10068470239639282, "learning_rate": 1.7896355952762314e-05, "loss": 0.32280969619750977, "memory(GiB)": 78.33, "step": 4391, "token_acc": 0.9020573146084884, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.8510390931550647, "grad_norm": 0.1039271429181099, "learning_rate": 1.785084621067343e-05, "loss": 0.3339514434337616, "memory(GiB)": 78.33, "step": 4392, "token_acc": 0.9003369695585127, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.8512328634403914, "grad_norm": 0.1004716232419014, "learning_rate": 1.780539074579299e-05, "loss": 0.3353673815727234, "memory(GiB)": 78.33, "step": 4393, "token_acc": 0.8975802074107934, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.8514266337257181, "grad_norm": 0.10031532496213913, "learning_rate": 1.7759989576790778e-05, "loss": 0.32534271478652954, "memory(GiB)": 78.33, "step": 4394, "token_acc": 0.9057053186123409, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.8516204040110449, "grad_norm": 0.10088721662759781, "learning_rate": 1.7714642722314278e-05, "loss": 0.3248327970504761, "memory(GiB)": 78.33, "step": 4395, "token_acc": 0.9025890722491108, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.8518141742963716, "grad_norm": 0.10467974096536636, "learning_rate": 1.766935020098862e-05, "loss": 0.33937883377075195, "memory(GiB)": 78.33, "step": 4396, "token_acc": 0.8982584532423584, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.8520079445816984, "grad_norm": 0.0988173633813858, "learning_rate": 1.7624112031416725e-05, "loss": 0.33925455808639526, "memory(GiB)": 78.33, "step": 4397, "token_acc": 0.8979691440036875, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.8522017148670251, "grad_norm": 0.10114793479442596, "learning_rate": 1.7578928232179102e-05, "loss": 0.29743602871894836, "memory(GiB)": 78.33, "step": 4398, "token_acc": 0.9106254871918932, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.8523954851523519, "grad_norm": 0.1041836142539978, "learning_rate": 1.753379882183395e-05, "loss": 0.3369101881980896, "memory(GiB)": 78.33, "step": 4399, "token_acc": 0.8994292404507537, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.8525892554376786, "grad_norm": 0.10185372084379196, "learning_rate": 1.748872381891713e-05, "loss": 0.3253825902938843, "memory(GiB)": 78.33, "step": 4400, "token_acc": 0.9039803494668677, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.8527830257230054, "grad_norm": 0.09283772110939026, "learning_rate": 1.7443703241942143e-05, "loss": 0.2999337911605835, "memory(GiB)": 78.33, "step": 4401, "token_acc": 0.9094778884305121, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.8529767960083321, "grad_norm": 0.10008923709392548, "learning_rate": 1.739873710940015e-05, "loss": 0.32751649618148804, "memory(GiB)": 78.33, "step": 4402, "token_acc": 0.9021663945992309, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.8531705662936588, "grad_norm": 0.11449175328016281, "learning_rate": 1.7353825439759948e-05, "loss": 0.3371378481388092, "memory(GiB)": 78.33, "step": 4403, "token_acc": 0.8989161168708766, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.8533643365789856, "grad_norm": 0.10284436494112015, "learning_rate": 1.7308968251467997e-05, "loss": 0.34078723192214966, "memory(GiB)": 78.33, "step": 4404, "token_acc": 0.8978489916236173, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.8535581068643123, "grad_norm": 0.09477044641971588, "learning_rate": 1.726416556294834e-05, "loss": 0.31810081005096436, "memory(GiB)": 78.33, "step": 4405, "token_acc": 0.9051635577183023, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.8537518771496391, "grad_norm": 0.09910459071397781, "learning_rate": 1.721941739260264e-05, "loss": 0.340251088142395, "memory(GiB)": 78.33, "step": 4406, "token_acc": 0.8982416457979913, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.8539456474349658, "grad_norm": 0.09987306594848633, "learning_rate": 1.7174723758810166e-05, "loss": 0.3514068126678467, "memory(GiB)": 78.33, "step": 4407, "token_acc": 0.8941308390766874, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.8541394177202926, "grad_norm": 0.10393253713846207, "learning_rate": 1.7130084679927763e-05, "loss": 0.34633970260620117, "memory(GiB)": 78.33, "step": 4408, "token_acc": 0.896145340536909, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.8543331880056193, "grad_norm": 0.10434923321008682, "learning_rate": 1.708550017429e-05, "loss": 0.34398695826530457, "memory(GiB)": 78.33, "step": 4409, "token_acc": 0.8962004803657725, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.854526958290946, "grad_norm": 0.09756119549274445, "learning_rate": 1.70409702602088e-05, "loss": 0.31827184557914734, "memory(GiB)": 78.33, "step": 4410, "token_acc": 0.9032197224841708, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.8547207285762728, "grad_norm": 0.09933782368898392, "learning_rate": 1.699649495597389e-05, "loss": 0.30512192845344543, "memory(GiB)": 78.33, "step": 4411, "token_acc": 0.9053738951249932, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.8549144988615995, "grad_norm": 0.0985947772860527, "learning_rate": 1.695207427985246e-05, "loss": 0.31603682041168213, "memory(GiB)": 78.33, "step": 4412, "token_acc": 0.9057937693987814, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.8551082691469263, "grad_norm": 0.09728779643774033, "learning_rate": 1.690770825008924e-05, "loss": 0.3370038568973541, "memory(GiB)": 78.33, "step": 4413, "token_acc": 0.898977370525996, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.855302039432253, "grad_norm": 0.09609750658273697, "learning_rate": 1.6863396884906583e-05, "loss": 0.30580419301986694, "memory(GiB)": 78.33, "step": 4414, "token_acc": 0.9062045771938325, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.8554958097175798, "grad_norm": 0.08634244650602341, "learning_rate": 1.681914020250431e-05, "loss": 0.3033207356929779, "memory(GiB)": 78.33, "step": 4415, "token_acc": 0.9060820975714593, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.8556895800029065, "grad_norm": 0.10316123068332672, "learning_rate": 1.677493822105992e-05, "loss": 0.3632691502571106, "memory(GiB)": 78.33, "step": 4416, "token_acc": 0.8925554382259767, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.8558833502882333, "grad_norm": 0.08851286768913269, "learning_rate": 1.6730790958728253e-05, "loss": 0.2938583493232727, "memory(GiB)": 78.33, "step": 4417, "token_acc": 0.9093023255813953, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.85607712057356, "grad_norm": 0.10702818632125854, "learning_rate": 1.6686698433641836e-05, "loss": 0.33791351318359375, "memory(GiB)": 78.33, "step": 4418, "token_acc": 0.8968233799237612, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.8562708908588867, "grad_norm": 0.10163046419620514, "learning_rate": 1.6642660663910658e-05, "loss": 0.34928232431411743, "memory(GiB)": 78.33, "step": 4419, "token_acc": 0.8963705698818635, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.8564646611442135, "grad_norm": 0.09223022311925888, "learning_rate": 1.6598677667622175e-05, "loss": 0.29846811294555664, "memory(GiB)": 78.33, "step": 4420, "token_acc": 0.9110857397454819, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.8566584314295402, "grad_norm": 0.0997534915804863, "learning_rate": 1.655474946284142e-05, "loss": 0.3577570915222168, "memory(GiB)": 78.33, "step": 4421, "token_acc": 0.8929023263019287, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.856852201714867, "grad_norm": 0.10170114785432816, "learning_rate": 1.6510876067610833e-05, "loss": 0.3271295130252838, "memory(GiB)": 78.33, "step": 4422, "token_acc": 0.9011473179963333, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.8570459720001937, "grad_norm": 0.10372772812843323, "learning_rate": 1.6467057499950497e-05, "loss": 0.3079608380794525, "memory(GiB)": 78.33, "step": 4423, "token_acc": 0.9084609878310665, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.8572397422855205, "grad_norm": 0.09529042989015579, "learning_rate": 1.6423293777857765e-05, "loss": 0.31907567381858826, "memory(GiB)": 78.33, "step": 4424, "token_acc": 0.9041177847722602, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.8574335125708472, "grad_norm": 0.10136647522449493, "learning_rate": 1.6379584919307644e-05, "loss": 0.3494371175765991, "memory(GiB)": 78.33, "step": 4425, "token_acc": 0.8960502692998205, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.857627282856174, "grad_norm": 0.091824971139431, "learning_rate": 1.6335930942252535e-05, "loss": 0.30003249645233154, "memory(GiB)": 78.33, "step": 4426, "token_acc": 0.9094136715634568, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.8578210531415007, "grad_norm": 0.10738231241703033, "learning_rate": 1.6292331864622265e-05, "loss": 0.338476300239563, "memory(GiB)": 78.33, "step": 4427, "token_acc": 0.8967210902712185, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.8580148234268274, "grad_norm": 0.09496748447418213, "learning_rate": 1.6248787704324163e-05, "loss": 0.3244837522506714, "memory(GiB)": 78.33, "step": 4428, "token_acc": 0.9002851103573711, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.8582085937121542, "grad_norm": 0.10821764171123505, "learning_rate": 1.620529847924295e-05, "loss": 0.3445894420146942, "memory(GiB)": 78.33, "step": 4429, "token_acc": 0.8967734961898071, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.8584023639974809, "grad_norm": 0.11172088235616684, "learning_rate": 1.616186420724089e-05, "loss": 0.3518655598163605, "memory(GiB)": 78.33, "step": 4430, "token_acc": 0.8950802436686354, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.8585961342828077, "grad_norm": 0.09562534093856812, "learning_rate": 1.611848490615757e-05, "loss": 0.31965699791908264, "memory(GiB)": 78.33, "step": 4431, "token_acc": 0.9038684271484827, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.8587899045681344, "grad_norm": 0.10791201889514923, "learning_rate": 1.6075160593810044e-05, "loss": 0.3649354577064514, "memory(GiB)": 78.33, "step": 4432, "token_acc": 0.8911983213293257, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.8589836748534613, "grad_norm": 0.10779910534620285, "learning_rate": 1.6031891287992747e-05, "loss": 0.3215060234069824, "memory(GiB)": 78.33, "step": 4433, "token_acc": 0.9031464957595514, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.859177445138788, "grad_norm": 0.09658445417881012, "learning_rate": 1.5988677006477568e-05, "loss": 0.33394524455070496, "memory(GiB)": 78.33, "step": 4434, "token_acc": 0.8990924985430022, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.8593712154241148, "grad_norm": 0.10684984922409058, "learning_rate": 1.594551776701377e-05, "loss": 0.33571135997772217, "memory(GiB)": 78.33, "step": 4435, "token_acc": 0.9019298688193743, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.8595649857094415, "grad_norm": 0.09724808484315872, "learning_rate": 1.5902413587327978e-05, "loss": 0.32527902722358704, "memory(GiB)": 78.33, "step": 4436, "token_acc": 0.9014134468456669, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.8597587559947683, "grad_norm": 0.1140824407339096, "learning_rate": 1.5859364485124294e-05, "loss": 0.3552241027355194, "memory(GiB)": 78.33, "step": 4437, "token_acc": 0.8934269717466584, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.859952526280095, "grad_norm": 0.11029309034347534, "learning_rate": 1.5816370478084106e-05, "loss": 0.3606140613555908, "memory(GiB)": 78.33, "step": 4438, "token_acc": 0.8945596964690044, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.8601462965654217, "grad_norm": 0.10048742592334747, "learning_rate": 1.5773431583866226e-05, "loss": 0.3234768509864807, "memory(GiB)": 78.33, "step": 4439, "token_acc": 0.9028618152085037, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.8603400668507485, "grad_norm": 0.10467264801263809, "learning_rate": 1.573054782010681e-05, "loss": 0.33086928725242615, "memory(GiB)": 78.33, "step": 4440, "token_acc": 0.9003659258462036, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.8605338371360752, "grad_norm": 0.11301259696483612, "learning_rate": 1.568771920441932e-05, "loss": 0.36950597167015076, "memory(GiB)": 78.33, "step": 4441, "token_acc": 0.8924123686288804, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.860727607421402, "grad_norm": 0.1058172732591629, "learning_rate": 1.5644945754394732e-05, "loss": 0.36208900809288025, "memory(GiB)": 78.33, "step": 4442, "token_acc": 0.8900826684333324, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.8609213777067287, "grad_norm": 0.10789795219898224, "learning_rate": 1.5602227487601114e-05, "loss": 0.35568827390670776, "memory(GiB)": 78.33, "step": 4443, "token_acc": 0.8932357022646078, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.8611151479920555, "grad_norm": 0.10026465356349945, "learning_rate": 1.5559564421584114e-05, "loss": 0.32759472727775574, "memory(GiB)": 78.33, "step": 4444, "token_acc": 0.8990959539379413, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.8613089182773822, "grad_norm": 0.09213128685951233, "learning_rate": 1.5516956573866564e-05, "loss": 0.3294154107570648, "memory(GiB)": 78.33, "step": 4445, "token_acc": 0.9012261580381471, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.861502688562709, "grad_norm": 0.09709835052490234, "learning_rate": 1.5474403961948627e-05, "loss": 0.31836092472076416, "memory(GiB)": 78.33, "step": 4446, "token_acc": 0.9042838018741634, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.8616964588480357, "grad_norm": 0.10115919262170792, "learning_rate": 1.5431906603307846e-05, "loss": 0.3285466432571411, "memory(GiB)": 78.33, "step": 4447, "token_acc": 0.8986334681362831, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.8618902291333624, "grad_norm": 0.10198438912630081, "learning_rate": 1.5389464515398976e-05, "loss": 0.326136976480484, "memory(GiB)": 78.33, "step": 4448, "token_acc": 0.9012800087897599, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.8620839994186892, "grad_norm": 0.10833071917295456, "learning_rate": 1.5347077715654198e-05, "loss": 0.3257608115673065, "memory(GiB)": 78.33, "step": 4449, "token_acc": 0.9010049449672994, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.8622777697040159, "grad_norm": 0.09669304639101028, "learning_rate": 1.5304746221482827e-05, "loss": 0.30072930455207825, "memory(GiB)": 78.33, "step": 4450, "token_acc": 0.9068726155150487, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.8624715399893427, "grad_norm": 0.11190321296453476, "learning_rate": 1.52624700502716e-05, "loss": 0.3294280469417572, "memory(GiB)": 78.33, "step": 4451, "token_acc": 0.9005425479444106, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.8626653102746694, "grad_norm": 0.09287480264902115, "learning_rate": 1.5220249219384484e-05, "loss": 0.3197273015975952, "memory(GiB)": 78.33, "step": 4452, "token_acc": 0.9034961140224298, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.8628590805599962, "grad_norm": 0.09530606120824814, "learning_rate": 1.5178083746162666e-05, "loss": 0.3117508590221405, "memory(GiB)": 78.33, "step": 4453, "token_acc": 0.9065753927662811, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.8630528508453229, "grad_norm": 0.10420526564121246, "learning_rate": 1.5135973647924665e-05, "loss": 0.30883246660232544, "memory(GiB)": 78.33, "step": 4454, "token_acc": 0.9062847265094456, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.8632466211306496, "grad_norm": 0.08543951064348221, "learning_rate": 1.5093918941966193e-05, "loss": 0.27868953347206116, "memory(GiB)": 78.33, "step": 4455, "token_acc": 0.9159319449077825, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.8634403914159764, "grad_norm": 0.09152313321828842, "learning_rate": 1.5051919645560334e-05, "loss": 0.31067970395088196, "memory(GiB)": 78.33, "step": 4456, "token_acc": 0.9047191907111177, "train_speed(iter/s)": 0.032453 }, { "epoch": 0.8636341617013031, "grad_norm": 0.11151348054409027, "learning_rate": 1.5009975775957207e-05, "loss": 0.3601279556751251, "memory(GiB)": 78.33, "step": 4457, "token_acc": 0.8918564063150479, "train_speed(iter/s)": 0.032454 }, { "epoch": 0.8638279319866299, "grad_norm": 0.10080854594707489, "learning_rate": 1.4968087350384395e-05, "loss": 0.34047406911849976, "memory(GiB)": 78.33, "step": 4458, "token_acc": 0.8975244985557326, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.8640217022719566, "grad_norm": 0.0961776152253151, "learning_rate": 1.4926254386046554e-05, "loss": 0.3052217662334442, "memory(GiB)": 78.33, "step": 4459, "token_acc": 0.908130910695084, "train_speed(iter/s)": 0.032455 }, { "epoch": 0.8642154725572834, "grad_norm": 0.10806053876876831, "learning_rate": 1.4884476900125591e-05, "loss": 0.33401399850845337, "memory(GiB)": 78.33, "step": 4460, "token_acc": 0.9004724605246799, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.8644092428426101, "grad_norm": 0.09109724313020706, "learning_rate": 1.484275490978068e-05, "loss": 0.32588818669319153, "memory(GiB)": 78.33, "step": 4461, "token_acc": 0.90133139220889, "train_speed(iter/s)": 0.032456 }, { "epoch": 0.8646030131279369, "grad_norm": 0.08890355378389359, "learning_rate": 1.4801088432148112e-05, "loss": 0.2869184613227844, "memory(GiB)": 78.33, "step": 4462, "token_acc": 0.911701588985605, "train_speed(iter/s)": 0.032457 }, { "epoch": 0.8647967834132636, "grad_norm": 0.09064006060361862, "learning_rate": 1.4759477484341513e-05, "loss": 0.29877418279647827, "memory(GiB)": 78.33, "step": 4463, "token_acc": 0.9077449822904369, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.8649905536985903, "grad_norm": 0.10824266821146011, "learning_rate": 1.47179220834515e-05, "loss": 0.3239610493183136, "memory(GiB)": 78.33, "step": 4464, "token_acc": 0.9016067776803973, "train_speed(iter/s)": 0.032458 }, { "epoch": 0.8651843239839171, "grad_norm": 0.1040419191122055, "learning_rate": 1.46764222465461e-05, "loss": 0.35535627603530884, "memory(GiB)": 78.33, "step": 4465, "token_acc": 0.8935964513193813, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.8653780942692438, "grad_norm": 0.13735035061836243, "learning_rate": 1.463497799067036e-05, "loss": 0.3214113116264343, "memory(GiB)": 78.33, "step": 4466, "token_acc": 0.9032424242424243, "train_speed(iter/s)": 0.032459 }, { "epoch": 0.8655718645545706, "grad_norm": 0.11260360479354858, "learning_rate": 1.4593589332846567e-05, "loss": 0.3748854696750641, "memory(GiB)": 78.33, "step": 4467, "token_acc": 0.8932340525328331, "train_speed(iter/s)": 0.03246 }, { "epoch": 0.8657656348398973, "grad_norm": 0.09724284708499908, "learning_rate": 1.4552256290074138e-05, "loss": 0.32235243916511536, "memory(GiB)": 78.33, "step": 4468, "token_acc": 0.9010934523343511, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.8659594051252241, "grad_norm": 0.1042628064751625, "learning_rate": 1.451097887932966e-05, "loss": 0.3547298014163971, "memory(GiB)": 78.33, "step": 4469, "token_acc": 0.8934734091775081, "train_speed(iter/s)": 0.032461 }, { "epoch": 0.8661531754105508, "grad_norm": 0.11357161402702332, "learning_rate": 1.4469757117566888e-05, "loss": 0.3611631393432617, "memory(GiB)": 78.33, "step": 4470, "token_acc": 0.8912460223373058, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.8663469456958776, "grad_norm": 0.11139743030071259, "learning_rate": 1.4428591021716729e-05, "loss": 0.35104861855506897, "memory(GiB)": 78.33, "step": 4471, "token_acc": 0.8951944796856144, "train_speed(iter/s)": 0.032462 }, { "epoch": 0.8665407159812043, "grad_norm": 0.11307670921087265, "learning_rate": 1.4387480608687174e-05, "loss": 0.36476895213127136, "memory(GiB)": 78.33, "step": 4472, "token_acc": 0.8911314038074601, "train_speed(iter/s)": 0.032463 }, { "epoch": 0.866734486266531, "grad_norm": 0.10000620037317276, "learning_rate": 1.4346425895363384e-05, "loss": 0.33012938499450684, "memory(GiB)": 78.33, "step": 4473, "token_acc": 0.9017330185642236, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.8669282565518578, "grad_norm": 0.09574344009160995, "learning_rate": 1.4305426898607602e-05, "loss": 0.31818804144859314, "memory(GiB)": 78.33, "step": 4474, "token_acc": 0.9036768787502313, "train_speed(iter/s)": 0.032464 }, { "epoch": 0.8671220268371845, "grad_norm": 0.09933432936668396, "learning_rate": 1.426448363525931e-05, "loss": 0.3508920669555664, "memory(GiB)": 78.33, "step": 4475, "token_acc": 0.8937370802838147, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.8673157971225113, "grad_norm": 0.10301670432090759, "learning_rate": 1.4223596122134873e-05, "loss": 0.3236181437969208, "memory(GiB)": 78.33, "step": 4476, "token_acc": 0.9032188377619357, "train_speed(iter/s)": 0.032465 }, { "epoch": 0.867509567407838, "grad_norm": 0.11474580317735672, "learning_rate": 1.4182764376028006e-05, "loss": 0.3661247491836548, "memory(GiB)": 78.33, "step": 4477, "token_acc": 0.8923494649417026, "train_speed(iter/s)": 0.032466 }, { "epoch": 0.8677033376931648, "grad_norm": 0.10032869875431061, "learning_rate": 1.414198841370936e-05, "loss": 0.3041604459285736, "memory(GiB)": 78.33, "step": 4478, "token_acc": 0.9077770323157777, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.8678971079784915, "grad_norm": 0.10304979979991913, "learning_rate": 1.4101268251926707e-05, "loss": 0.3436087667942047, "memory(GiB)": 78.33, "step": 4479, "token_acc": 0.8969530197494768, "train_speed(iter/s)": 0.032467 }, { "epoch": 0.8680908782638183, "grad_norm": 0.10889364033937454, "learning_rate": 1.4060603907404933e-05, "loss": 0.35096222162246704, "memory(GiB)": 78.33, "step": 4480, "token_acc": 0.8916971772764852, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.868284648549145, "grad_norm": 0.09136100858449936, "learning_rate": 1.401999539684593e-05, "loss": 0.3105822205543518, "memory(GiB)": 78.33, "step": 4481, "token_acc": 0.9048854192898514, "train_speed(iter/s)": 0.032468 }, { "epoch": 0.8684784188344717, "grad_norm": 0.09702181816101074, "learning_rate": 1.3979442736928803e-05, "loss": 0.31895220279693604, "memory(GiB)": 78.33, "step": 4482, "token_acc": 0.9040166789823709, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.8686721891197985, "grad_norm": 0.09936001151800156, "learning_rate": 1.3938945944309499e-05, "loss": 0.3287636339664459, "memory(GiB)": 78.33, "step": 4483, "token_acc": 0.9009811937857727, "train_speed(iter/s)": 0.032469 }, { "epoch": 0.8688659594051252, "grad_norm": 0.09482026845216751, "learning_rate": 1.3898505035621226e-05, "loss": 0.30222681164741516, "memory(GiB)": 78.33, "step": 4484, "token_acc": 0.9081456725285801, "train_speed(iter/s)": 0.03247 }, { "epoch": 0.869059729690452, "grad_norm": 0.09608285129070282, "learning_rate": 1.3858120027474134e-05, "loss": 0.2800081968307495, "memory(GiB)": 78.33, "step": 4485, "token_acc": 0.9137849484969519, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.8692534999757787, "grad_norm": 0.09473740309476852, "learning_rate": 1.3817790936455402e-05, "loss": 0.3255942165851593, "memory(GiB)": 78.33, "step": 4486, "token_acc": 0.9004664035672185, "train_speed(iter/s)": 0.032471 }, { "epoch": 0.8694472702611055, "grad_norm": 0.09641645103693008, "learning_rate": 1.3777517779129316e-05, "loss": 0.31738942861557007, "memory(GiB)": 78.33, "step": 4487, "token_acc": 0.9032964135021097, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.8696410405464322, "grad_norm": 0.12395960092544556, "learning_rate": 1.3737300572037075e-05, "loss": 0.30116546154022217, "memory(GiB)": 78.33, "step": 4488, "token_acc": 0.9070750061682704, "train_speed(iter/s)": 0.032472 }, { "epoch": 0.869834810831759, "grad_norm": 0.09995721280574799, "learning_rate": 1.3697139331697065e-05, "loss": 0.34251344203948975, "memory(GiB)": 78.33, "step": 4489, "token_acc": 0.896267974399468, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.8700285811170857, "grad_norm": 0.10301598161458969, "learning_rate": 1.3657034074604478e-05, "loss": 0.3315487205982208, "memory(GiB)": 78.33, "step": 4490, "token_acc": 0.9004393623612911, "train_speed(iter/s)": 0.032473 }, { "epoch": 0.8702223514024124, "grad_norm": 0.099857397377491, "learning_rate": 1.3616984817231685e-05, "loss": 0.3251858353614807, "memory(GiB)": 78.33, "step": 4491, "token_acc": 0.9022668466644167, "train_speed(iter/s)": 0.032474 }, { "epoch": 0.8704161216877392, "grad_norm": 0.09973510354757309, "learning_rate": 1.3576991576028013e-05, "loss": 0.3382074534893036, "memory(GiB)": 78.33, "step": 4492, "token_acc": 0.9003876364378252, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.8706098919730659, "grad_norm": 0.08957747370004654, "learning_rate": 1.3537054367419703e-05, "loss": 0.29314741492271423, "memory(GiB)": 78.33, "step": 4493, "token_acc": 0.9126814476038974, "train_speed(iter/s)": 0.032475 }, { "epoch": 0.8708036622583927, "grad_norm": 0.10090366750955582, "learning_rate": 1.3497173207810068e-05, "loss": 0.32285887002944946, "memory(GiB)": 78.33, "step": 4494, "token_acc": 0.9043573045641298, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.8709974325437194, "grad_norm": 0.09529086202383041, "learning_rate": 1.3457348113579358e-05, "loss": 0.30941274762153625, "memory(GiB)": 78.33, "step": 4495, "token_acc": 0.9068646306126112, "train_speed(iter/s)": 0.032476 }, { "epoch": 0.8711912028290462, "grad_norm": 0.09300097078084946, "learning_rate": 1.3417579101084869e-05, "loss": 0.3176164925098419, "memory(GiB)": 78.33, "step": 4496, "token_acc": 0.9025844421699079, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.8713849731143729, "grad_norm": 0.10441552102565765, "learning_rate": 1.3377866186660701e-05, "loss": 0.33653199672698975, "memory(GiB)": 78.33, "step": 4497, "token_acc": 0.8984428473648186, "train_speed(iter/s)": 0.032477 }, { "epoch": 0.8715787433996997, "grad_norm": 0.10322162508964539, "learning_rate": 1.3338209386618092e-05, "loss": 0.32316023111343384, "memory(GiB)": 78.33, "step": 4498, "token_acc": 0.9009378129837021, "train_speed(iter/s)": 0.032478 }, { "epoch": 0.8717725136850264, "grad_norm": 0.09797031432390213, "learning_rate": 1.329860871724513e-05, "loss": 0.3057693541049957, "memory(GiB)": 78.33, "step": 4499, "token_acc": 0.9071016563475685, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.8719662839703531, "grad_norm": 0.10819875448942184, "learning_rate": 1.3259064194806885e-05, "loss": 0.3662481904029846, "memory(GiB)": 78.33, "step": 4500, "token_acc": 0.8935516888433982, "train_speed(iter/s)": 0.032479 }, { "epoch": 0.8719662839703531, "eval_loss": 0.3799753785133362, "eval_runtime": 1344.7448, "eval_samples_per_second": 5.019, "eval_steps_per_second": 5.019, "eval_token_acc": 0.9022423587138677, "step": 4500 }, { "epoch": 0.8721600542556799, "grad_norm": 0.09284574538469315, "learning_rate": 1.3219575835545332e-05, "loss": 0.30319368839263916, "memory(GiB)": 78.33, "step": 4501, "token_acc": 0.9082558951495945, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.8723538245410066, "grad_norm": 0.098308265209198, "learning_rate": 1.3180143655679397e-05, "loss": 0.3219000995159149, "memory(GiB)": 78.33, "step": 4502, "token_acc": 0.9041865907009868, "train_speed(iter/s)": 0.032168 }, { "epoch": 0.8725475948263334, "grad_norm": 0.10174136608839035, "learning_rate": 1.3140767671404995e-05, "loss": 0.33587872982025146, "memory(GiB)": 78.33, "step": 4503, "token_acc": 0.8989667675363268, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.8727413651116601, "grad_norm": 0.09019061923027039, "learning_rate": 1.3101447898894852e-05, "loss": 0.2792420983314514, "memory(GiB)": 78.33, "step": 4504, "token_acc": 0.9144691759092409, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.8729351353969869, "grad_norm": 0.1039118766784668, "learning_rate": 1.306218435429865e-05, "loss": 0.35971078276634216, "memory(GiB)": 78.33, "step": 4505, "token_acc": 0.8914640875954808, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.8731289056823136, "grad_norm": 0.10616055876016617, "learning_rate": 1.3022977053743005e-05, "loss": 0.3402102291584015, "memory(GiB)": 78.33, "step": 4506, "token_acc": 0.8962266862085811, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.8733226759676403, "grad_norm": 0.09928639978170395, "learning_rate": 1.298382601333139e-05, "loss": 0.3082139194011688, "memory(GiB)": 78.33, "step": 4507, "token_acc": 0.9083105179101072, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.8735164462529671, "grad_norm": 0.09501931816339493, "learning_rate": 1.294473124914422e-05, "loss": 0.3152191638946533, "memory(GiB)": 78.33, "step": 4508, "token_acc": 0.9062588283637671, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.8737102165382938, "grad_norm": 0.09296244382858276, "learning_rate": 1.2905692777238719e-05, "loss": 0.27433520555496216, "memory(GiB)": 78.33, "step": 4509, "token_acc": 0.9149881928045562, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.8739039868236206, "grad_norm": 0.09587821364402771, "learning_rate": 1.2866710613649062e-05, "loss": 0.2978772521018982, "memory(GiB)": 78.33, "step": 4510, "token_acc": 0.9090977199510526, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.8740977571089473, "grad_norm": 0.10060916841030121, "learning_rate": 1.282778477438629e-05, "loss": 0.321035236120224, "memory(GiB)": 78.33, "step": 4511, "token_acc": 0.9027070865260921, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.8742915273942741, "grad_norm": 0.097939632833004, "learning_rate": 1.2788915275438267e-05, "loss": 0.2997584044933319, "memory(GiB)": 78.33, "step": 4512, "token_acc": 0.9089030803906837, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.8744852976796008, "grad_norm": 0.08998732268810272, "learning_rate": 1.2750102132769735e-05, "loss": 0.2946266531944275, "memory(GiB)": 78.33, "step": 4513, "token_acc": 0.9110179997024842, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.8746790679649276, "grad_norm": 0.10302318632602692, "learning_rate": 1.2711345362322295e-05, "loss": 0.3208523988723755, "memory(GiB)": 78.33, "step": 4514, "token_acc": 0.9040156111839498, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.8748728382502543, "grad_norm": 0.10049934685230255, "learning_rate": 1.2672644980014445e-05, "loss": 0.35380083322525024, "memory(GiB)": 78.33, "step": 4515, "token_acc": 0.8943812514845214, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.875066608535581, "grad_norm": 0.09158279001712799, "learning_rate": 1.2634001001741373e-05, "loss": 0.3017105460166931, "memory(GiB)": 78.33, "step": 4516, "token_acc": 0.908971506687753, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.8752603788209078, "grad_norm": 0.10271737724542618, "learning_rate": 1.2595413443375297e-05, "loss": 0.3498397767543793, "memory(GiB)": 78.33, "step": 4517, "token_acc": 0.896111417539989, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.8754541491062345, "grad_norm": 0.09800441563129425, "learning_rate": 1.2556882320765122e-05, "loss": 0.3219001591205597, "memory(GiB)": 78.33, "step": 4518, "token_acc": 0.9034795042897998, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.8756479193915613, "grad_norm": 0.10516981035470963, "learning_rate": 1.2518407649736607e-05, "loss": 0.3301500082015991, "memory(GiB)": 78.33, "step": 4519, "token_acc": 0.8994242640075973, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.875841689676888, "grad_norm": 0.10065259039402008, "learning_rate": 1.2479989446092359e-05, "loss": 0.3331465423107147, "memory(GiB)": 78.33, "step": 4520, "token_acc": 0.8990757119011435, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.8760354599622148, "grad_norm": 0.11081155389547348, "learning_rate": 1.2441627725611708e-05, "loss": 0.34419184923171997, "memory(GiB)": 78.33, "step": 4521, "token_acc": 0.8977344241661422, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.8762292302475415, "grad_norm": 0.0998803898692131, "learning_rate": 1.240332250405095e-05, "loss": 0.31978899240493774, "memory(GiB)": 78.33, "step": 4522, "token_acc": 0.9027906976744186, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.8764230005328683, "grad_norm": 0.10122760385274887, "learning_rate": 1.236507379714295e-05, "loss": 0.31909558176994324, "memory(GiB)": 78.33, "step": 4523, "token_acc": 0.9035904628330996, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.876616770818195, "grad_norm": 0.10056735575199127, "learning_rate": 1.2326881620597556e-05, "loss": 0.3240450620651245, "memory(GiB)": 78.33, "step": 4524, "token_acc": 0.9028395198933096, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.8768105411035217, "grad_norm": 0.08745722472667694, "learning_rate": 1.2288745990101323e-05, "loss": 0.3004869520664215, "memory(GiB)": 78.33, "step": 4525, "token_acc": 0.907592263761578, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.8770043113888485, "grad_norm": 0.10847844928503036, "learning_rate": 1.2250666921317537e-05, "loss": 0.34397241473197937, "memory(GiB)": 78.33, "step": 4526, "token_acc": 0.8972039724980901, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.8771980816741752, "grad_norm": 0.09692453593015671, "learning_rate": 1.221264442988632e-05, "loss": 0.33984488248825073, "memory(GiB)": 78.33, "step": 4527, "token_acc": 0.9006118417883123, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.877391851959502, "grad_norm": 0.09908100217580795, "learning_rate": 1.2174678531424497e-05, "loss": 0.31903359293937683, "memory(GiB)": 78.33, "step": 4528, "token_acc": 0.9019696566409369, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.8775856222448287, "grad_norm": 0.10452962666749954, "learning_rate": 1.2136769241525762e-05, "loss": 0.33924567699432373, "memory(GiB)": 78.33, "step": 4529, "token_acc": 0.897684942468475, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.8777793925301555, "grad_norm": 0.09859445691108704, "learning_rate": 1.2098916575760376e-05, "loss": 0.3426741659641266, "memory(GiB)": 78.33, "step": 4530, "token_acc": 0.896549017555794, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.8779731628154822, "grad_norm": 0.1066657230257988, "learning_rate": 1.2061120549675518e-05, "loss": 0.3669888377189636, "memory(GiB)": 78.33, "step": 4531, "token_acc": 0.8908226988867771, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.878166933100809, "grad_norm": 0.1010642871260643, "learning_rate": 1.2023381178795022e-05, "loss": 0.31678637862205505, "memory(GiB)": 78.33, "step": 4532, "token_acc": 0.9021789270887037, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.8783607033861357, "grad_norm": 0.10661202669143677, "learning_rate": 1.1985698478619454e-05, "loss": 0.3384348452091217, "memory(GiB)": 78.33, "step": 4533, "token_acc": 0.899502755811167, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.8785544736714624, "grad_norm": 0.09901930391788483, "learning_rate": 1.1948072464626101e-05, "loss": 0.3115423619747162, "memory(GiB)": 78.33, "step": 4534, "token_acc": 0.9044517138387143, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.8787482439567892, "grad_norm": 0.09338109940290451, "learning_rate": 1.1910503152268952e-05, "loss": 0.2875381410121918, "memory(GiB)": 78.33, "step": 4535, "token_acc": 0.9132529457108248, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.8789420142421159, "grad_norm": 0.10732939094305038, "learning_rate": 1.187299055697883e-05, "loss": 0.3420470058917999, "memory(GiB)": 78.33, "step": 4536, "token_acc": 0.8977093546629971, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.8791357845274427, "grad_norm": 0.10017619282007217, "learning_rate": 1.1835534694163057e-05, "loss": 0.3340654969215393, "memory(GiB)": 78.33, "step": 4537, "token_acc": 0.8995184135977337, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.8793295548127694, "grad_norm": 0.09547953307628632, "learning_rate": 1.1798135579205831e-05, "loss": 0.3285997807979584, "memory(GiB)": 78.33, "step": 4538, "token_acc": 0.9024012202922796, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.8795233250980962, "grad_norm": 0.09887305647134781, "learning_rate": 1.1760793227467947e-05, "loss": 0.34439530968666077, "memory(GiB)": 78.33, "step": 4539, "token_acc": 0.8960512218764348, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.8797170953834229, "grad_norm": 0.10936824232339859, "learning_rate": 1.1723507654286885e-05, "loss": 0.3436868488788605, "memory(GiB)": 78.33, "step": 4540, "token_acc": 0.8984078161751402, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.8799108656687497, "grad_norm": 0.09683515876531601, "learning_rate": 1.1686278874976912e-05, "loss": 0.32740306854248047, "memory(GiB)": 78.33, "step": 4541, "token_acc": 0.9006859993763642, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.8801046359540764, "grad_norm": 0.10903044044971466, "learning_rate": 1.1649106904828798e-05, "loss": 0.3218260705471039, "memory(GiB)": 78.33, "step": 4542, "token_acc": 0.9046106833637916, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.8802984062394031, "grad_norm": 0.09545977413654327, "learning_rate": 1.1611991759110128e-05, "loss": 0.31498417258262634, "memory(GiB)": 78.33, "step": 4543, "token_acc": 0.904716222873292, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.8804921765247299, "grad_norm": 0.09254451096057892, "learning_rate": 1.1574933453065078e-05, "loss": 0.2897379696369171, "memory(GiB)": 78.33, "step": 4544, "token_acc": 0.9108022299442514, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.8806859468100566, "grad_norm": 0.10882263630628586, "learning_rate": 1.1537932001914485e-05, "loss": 0.3277333974838257, "memory(GiB)": 78.33, "step": 4545, "token_acc": 0.9019514213860503, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.8808797170953834, "grad_norm": 0.0869324579834938, "learning_rate": 1.1500987420855845e-05, "loss": 0.2924439609050751, "memory(GiB)": 78.33, "step": 4546, "token_acc": 0.9105908898603864, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.8810734873807101, "grad_norm": 0.09957166761159897, "learning_rate": 1.1464099725063237e-05, "loss": 0.3209323287010193, "memory(GiB)": 78.33, "step": 4547, "token_acc": 0.9023074369189907, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.8812672576660369, "grad_norm": 0.11669695377349854, "learning_rate": 1.1427268929687555e-05, "loss": 0.3712838888168335, "memory(GiB)": 78.33, "step": 4548, "token_acc": 0.8882153971045866, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.8814610279513636, "grad_norm": 0.0909273773431778, "learning_rate": 1.1390495049856036e-05, "loss": 0.31092870235443115, "memory(GiB)": 78.33, "step": 4549, "token_acc": 0.9068354688855991, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.8816547982366904, "grad_norm": 0.08977729827165604, "learning_rate": 1.1353778100672828e-05, "loss": 0.29384180903434753, "memory(GiB)": 78.33, "step": 4550, "token_acc": 0.9103132495094121, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.8818485685220171, "grad_norm": 0.09765194356441498, "learning_rate": 1.131711809721852e-05, "loss": 0.31177225708961487, "memory(GiB)": 78.33, "step": 4551, "token_acc": 0.9054578904333606, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.8820423388073438, "grad_norm": 0.11460500210523605, "learning_rate": 1.1280515054550366e-05, "loss": 0.3225688934326172, "memory(GiB)": 78.33, "step": 4552, "token_acc": 0.9011132164850781, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.8822361090926706, "grad_norm": 0.09452386945486069, "learning_rate": 1.1243968987702206e-05, "loss": 0.29615986347198486, "memory(GiB)": 78.33, "step": 4553, "token_acc": 0.9087365694797036, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.8824298793779974, "grad_norm": 0.09222650527954102, "learning_rate": 1.1207479911684487e-05, "loss": 0.2946526110172272, "memory(GiB)": 78.33, "step": 4554, "token_acc": 0.9109519196451333, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.8826236496633242, "grad_norm": 0.10203875601291656, "learning_rate": 1.11710478414843e-05, "loss": 0.34442439675331116, "memory(GiB)": 78.33, "step": 4555, "token_acc": 0.8982755761896866, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.8828174199486509, "grad_norm": 0.10598360747098923, "learning_rate": 1.1134672792065209e-05, "loss": 0.329555869102478, "memory(GiB)": 78.33, "step": 4556, "token_acc": 0.9016222021252543, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.8830111902339777, "grad_norm": 0.10227588564157486, "learning_rate": 1.109835477836748e-05, "loss": 0.33669596910476685, "memory(GiB)": 78.33, "step": 4557, "token_acc": 0.8991185669604776, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.8832049605193044, "grad_norm": 0.09594012796878815, "learning_rate": 1.1062093815307865e-05, "loss": 0.3027840554714203, "memory(GiB)": 78.33, "step": 4558, "token_acc": 0.9077541203159593, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.8833987308046312, "grad_norm": 0.11179719865322113, "learning_rate": 1.1025889917779735e-05, "loss": 0.3367825746536255, "memory(GiB)": 78.33, "step": 4559, "token_acc": 0.9006670902160102, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.8835925010899579, "grad_norm": 0.09386321157217026, "learning_rate": 1.0989743100653008e-05, "loss": 0.3008558750152588, "memory(GiB)": 78.33, "step": 4560, "token_acc": 0.908220500378284, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.8837862713752846, "grad_norm": 0.08674740046262741, "learning_rate": 1.0953653378774097e-05, "loss": 0.27751424908638, "memory(GiB)": 78.33, "step": 4561, "token_acc": 0.9156325156325157, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.8839800416606114, "grad_norm": 0.10045383870601654, "learning_rate": 1.0917620766966123e-05, "loss": 0.30626630783081055, "memory(GiB)": 78.33, "step": 4562, "token_acc": 0.9081538638719336, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.8841738119459381, "grad_norm": 0.10451143980026245, "learning_rate": 1.0881645280028534e-05, "loss": 0.3387284576892853, "memory(GiB)": 78.33, "step": 4563, "token_acc": 0.8987345454545455, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.8843675822312649, "grad_norm": 0.09835859388113022, "learning_rate": 1.0845726932737509e-05, "loss": 0.33612358570098877, "memory(GiB)": 78.33, "step": 4564, "token_acc": 0.8987608426270136, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.8845613525165916, "grad_norm": 0.10052934288978577, "learning_rate": 1.0809865739845646e-05, "loss": 0.31725549697875977, "memory(GiB)": 78.33, "step": 4565, "token_acc": 0.9065557583555249, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.8847551228019184, "grad_norm": 0.10602091997861862, "learning_rate": 1.0774061716082117e-05, "loss": 0.34389835596084595, "memory(GiB)": 78.33, "step": 4566, "token_acc": 0.8980959097320169, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.8849488930872451, "grad_norm": 0.09629754722118378, "learning_rate": 1.0738314876152587e-05, "loss": 0.29018452763557434, "memory(GiB)": 78.33, "step": 4567, "token_acc": 0.9128355904015205, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.8851426633725719, "grad_norm": 0.11927231401205063, "learning_rate": 1.0702625234739215e-05, "loss": 0.3924431800842285, "memory(GiB)": 78.33, "step": 4568, "token_acc": 0.8830037082818294, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.8853364336578986, "grad_norm": 0.10137398540973663, "learning_rate": 1.0666992806500774e-05, "loss": 0.3232523202896118, "memory(GiB)": 78.33, "step": 4569, "token_acc": 0.9024434010228191, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.8855302039432253, "grad_norm": 0.09592381864786148, "learning_rate": 1.0631417606072356e-05, "loss": 0.31686073541641235, "memory(GiB)": 78.33, "step": 4570, "token_acc": 0.9042706275456697, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.8857239742285521, "grad_norm": 0.10683749616146088, "learning_rate": 1.0595899648065742e-05, "loss": 0.33387985825538635, "memory(GiB)": 78.33, "step": 4571, "token_acc": 0.8986985376504726, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.8859177445138788, "grad_norm": 0.10224435478448868, "learning_rate": 1.0560438947069077e-05, "loss": 0.32763242721557617, "memory(GiB)": 78.33, "step": 4572, "token_acc": 0.9004544285346824, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.8861115147992056, "grad_norm": 0.1058693677186966, "learning_rate": 1.0525035517647012e-05, "loss": 0.33532196283340454, "memory(GiB)": 78.33, "step": 4573, "token_acc": 0.899757553151809, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.8863052850845323, "grad_norm": 0.10156890749931335, "learning_rate": 1.0489689374340699e-05, "loss": 0.32516196370124817, "memory(GiB)": 78.33, "step": 4574, "token_acc": 0.9034920634920635, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.8864990553698591, "grad_norm": 0.09546992182731628, "learning_rate": 1.0454400531667723e-05, "loss": 0.32520344853401184, "memory(GiB)": 78.33, "step": 4575, "token_acc": 0.9030310559006212, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.8866928256551858, "grad_norm": 0.0944749042391777, "learning_rate": 1.0419169004122208e-05, "loss": 0.308064341545105, "memory(GiB)": 78.33, "step": 4576, "token_acc": 0.9058090431558348, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.8868865959405126, "grad_norm": 0.10865960270166397, "learning_rate": 1.0383994806174678e-05, "loss": 0.34614354372024536, "memory(GiB)": 78.33, "step": 4577, "token_acc": 0.8971537001897533, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.8870803662258393, "grad_norm": 0.11621136218309402, "learning_rate": 1.0348877952272094e-05, "loss": 0.3003866672515869, "memory(GiB)": 78.33, "step": 4578, "token_acc": 0.908298606977021, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.887274136511166, "grad_norm": 0.09108857810497284, "learning_rate": 1.0313818456837918e-05, "loss": 0.29279422760009766, "memory(GiB)": 78.33, "step": 4579, "token_acc": 0.9107714628777266, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.8874679067964928, "grad_norm": 0.1540272831916809, "learning_rate": 1.0278816334271984e-05, "loss": 0.3414173722267151, "memory(GiB)": 78.33, "step": 4580, "token_acc": 0.8987891249714416, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.8876616770818195, "grad_norm": 0.10262391716241837, "learning_rate": 1.024387159895067e-05, "loss": 0.35209545493125916, "memory(GiB)": 78.33, "step": 4581, "token_acc": 0.894268269722155, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.8878554473671463, "grad_norm": 0.10410673916339874, "learning_rate": 1.0208984265226649e-05, "loss": 0.3540557324886322, "memory(GiB)": 78.33, "step": 4582, "token_acc": 0.896634891466636, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.888049217652473, "grad_norm": 0.09851502627134323, "learning_rate": 1.0174154347429141e-05, "loss": 0.3153938055038452, "memory(GiB)": 78.33, "step": 4583, "token_acc": 0.904395755821965, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.8882429879377998, "grad_norm": 0.09433472901582718, "learning_rate": 1.013938185986372e-05, "loss": 0.30990996956825256, "memory(GiB)": 78.33, "step": 4584, "token_acc": 0.9047421073145381, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.8884367582231265, "grad_norm": 0.10146637260913849, "learning_rate": 1.0104666816812362e-05, "loss": 0.33719319105148315, "memory(GiB)": 78.33, "step": 4585, "token_acc": 0.89920724801812, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.8886305285084533, "grad_norm": 0.11201319843530655, "learning_rate": 1.0070009232533476e-05, "loss": 0.3517708480358124, "memory(GiB)": 78.33, "step": 4586, "token_acc": 0.8955111278762731, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.88882429879378, "grad_norm": 0.09309983253479004, "learning_rate": 1.0035409121261828e-05, "loss": 0.2867148518562317, "memory(GiB)": 78.33, "step": 4587, "token_acc": 0.912418334951329, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.8890180690791067, "grad_norm": 0.09345608949661255, "learning_rate": 1.0000866497208714e-05, "loss": 0.310814768075943, "memory(GiB)": 78.33, "step": 4588, "token_acc": 0.9047558526727841, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.8892118393644335, "grad_norm": 0.1012081727385521, "learning_rate": 9.96638137456159e-06, "loss": 0.3262358605861664, "memory(GiB)": 78.33, "step": 4589, "token_acc": 0.9022921972678861, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.8894056096497602, "grad_norm": 0.10403067618608475, "learning_rate": 9.931953767484518e-06, "loss": 0.3487986922264099, "memory(GiB)": 78.33, "step": 4590, "token_acc": 0.8952626667398583, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.889599379935087, "grad_norm": 0.11049600690603256, "learning_rate": 9.89758369011781e-06, "loss": 0.3473038971424103, "memory(GiB)": 78.33, "step": 4591, "token_acc": 0.8961088918479065, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.8897931502204137, "grad_norm": 0.08729095757007599, "learning_rate": 9.863271156578174e-06, "loss": 0.28431570529937744, "memory(GiB)": 78.33, "step": 4592, "token_acc": 0.9126174143741486, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.8899869205057405, "grad_norm": 0.09780146926641464, "learning_rate": 9.829016180958681e-06, "loss": 0.3170766234397888, "memory(GiB)": 78.33, "step": 4593, "token_acc": 0.9040569020021075, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.8901806907910672, "grad_norm": 0.11150769889354706, "learning_rate": 9.794818777328767e-06, "loss": 0.29845190048217773, "memory(GiB)": 78.33, "step": 4594, "token_acc": 0.909273330479452, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.890374461076394, "grad_norm": 0.10244564712047577, "learning_rate": 9.760678959734292e-06, "loss": 0.34681934118270874, "memory(GiB)": 78.33, "step": 4595, "token_acc": 0.8985476177715848, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.8905682313617207, "grad_norm": 0.10472705215215683, "learning_rate": 9.726596742197307e-06, "loss": 0.3317742645740509, "memory(GiB)": 78.33, "step": 4596, "token_acc": 0.8990034710558727, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.8907620016470474, "grad_norm": 0.0967646911740303, "learning_rate": 9.692572138716347e-06, "loss": 0.34411588311195374, "memory(GiB)": 78.33, "step": 4597, "token_acc": 0.8969721095713183, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.8909557719323742, "grad_norm": 0.10965237766504288, "learning_rate": 9.658605163266203e-06, "loss": 0.3249712884426117, "memory(GiB)": 78.33, "step": 4598, "token_acc": 0.9018522218099831, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.8911495422177009, "grad_norm": 0.10542988777160645, "learning_rate": 9.624695829798045e-06, "loss": 0.33859795331954956, "memory(GiB)": 78.33, "step": 4599, "token_acc": 0.898973819608289, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.8913433125030277, "grad_norm": 0.09840527176856995, "learning_rate": 9.590844152239353e-06, "loss": 0.3379114270210266, "memory(GiB)": 78.33, "step": 4600, "token_acc": 0.8994015604878418, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.8915370827883544, "grad_norm": 0.0945395678281784, "learning_rate": 9.557050144493884e-06, "loss": 0.30370837450027466, "memory(GiB)": 78.33, "step": 4601, "token_acc": 0.9077565971057858, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.8917308530736812, "grad_norm": 0.10549890249967575, "learning_rate": 9.523313820441803e-06, "loss": 0.3490893840789795, "memory(GiB)": 78.33, "step": 4602, "token_acc": 0.897390101976148, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.8919246233590079, "grad_norm": 0.10837141424417496, "learning_rate": 9.489635193939444e-06, "loss": 0.36473971605300903, "memory(GiB)": 78.33, "step": 4603, "token_acc": 0.8911379629370041, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.8921183936443347, "grad_norm": 0.09465762972831726, "learning_rate": 9.456014278819606e-06, "loss": 0.3148060142993927, "memory(GiB)": 78.33, "step": 4604, "token_acc": 0.9028953344343518, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.8923121639296614, "grad_norm": 0.09781907498836517, "learning_rate": 9.422451088891264e-06, "loss": 0.3031347990036011, "memory(GiB)": 78.33, "step": 4605, "token_acc": 0.908634026587614, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.8925059342149881, "grad_norm": 0.09446154534816742, "learning_rate": 9.38894563793972e-06, "loss": 0.3180665373802185, "memory(GiB)": 78.33, "step": 4606, "token_acc": 0.905185446482564, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.8926997045003149, "grad_norm": 0.09713321179151535, "learning_rate": 9.355497939726569e-06, "loss": 0.31975382566452026, "memory(GiB)": 78.33, "step": 4607, "token_acc": 0.904540804281105, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.8928934747856416, "grad_norm": 0.0889185443520546, "learning_rate": 9.322108007989654e-06, "loss": 0.28848981857299805, "memory(GiB)": 78.33, "step": 4608, "token_acc": 0.9133419560242761, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.8930872450709684, "grad_norm": 0.10388769209384918, "learning_rate": 9.288775856443187e-06, "loss": 0.3318116068840027, "memory(GiB)": 78.33, "step": 4609, "token_acc": 0.9007829299222584, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.8932810153562951, "grad_norm": 0.10328590124845505, "learning_rate": 9.255501498777485e-06, "loss": 0.35095512866973877, "memory(GiB)": 78.33, "step": 4610, "token_acc": 0.8963482109922538, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.8934747856416219, "grad_norm": 0.09728133678436279, "learning_rate": 9.222284948659297e-06, "loss": 0.32896798849105835, "memory(GiB)": 78.33, "step": 4611, "token_acc": 0.9007275166400083, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.8936685559269486, "grad_norm": 0.09340497106313705, "learning_rate": 9.189126219731513e-06, "loss": 0.3199424147605896, "memory(GiB)": 78.33, "step": 4612, "token_acc": 0.9017920064431693, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.8938623262122753, "grad_norm": 0.12456195801496506, "learning_rate": 9.156025325613319e-06, "loss": 0.38881370425224304, "memory(GiB)": 78.33, "step": 4613, "token_acc": 0.8865035516969219, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.8940560964976021, "grad_norm": 0.10831289738416672, "learning_rate": 9.122982279900192e-06, "loss": 0.31916946172714233, "memory(GiB)": 78.33, "step": 4614, "token_acc": 0.9046099809746817, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.8942498667829288, "grad_norm": 0.09680543094873428, "learning_rate": 9.089997096163692e-06, "loss": 0.31975850462913513, "memory(GiB)": 78.33, "step": 4615, "token_acc": 0.9020113186492016, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.8944436370682556, "grad_norm": 0.09476674348115921, "learning_rate": 9.057069787951832e-06, "loss": 0.3149741291999817, "memory(GiB)": 78.33, "step": 4616, "token_acc": 0.9046471784987686, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.8946374073535823, "grad_norm": 0.09538931399583817, "learning_rate": 9.024200368788676e-06, "loss": 0.298635333776474, "memory(GiB)": 78.33, "step": 4617, "token_acc": 0.9104565695584527, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.8948311776389091, "grad_norm": 0.09865786135196686, "learning_rate": 8.991388852174592e-06, "loss": 0.3136424124240875, "memory(GiB)": 78.33, "step": 4618, "token_acc": 0.9061650538218985, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.8950249479242358, "grad_norm": 0.09958308935165405, "learning_rate": 8.958635251586166e-06, "loss": 0.32274237275123596, "memory(GiB)": 78.33, "step": 4619, "token_acc": 0.9019686621132985, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.8952187182095626, "grad_norm": 0.1041649580001831, "learning_rate": 8.925939580476138e-06, "loss": 0.33783018589019775, "memory(GiB)": 78.33, "step": 4620, "token_acc": 0.8964275424949583, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.8954124884948893, "grad_norm": 0.09779240190982819, "learning_rate": 8.893301852273582e-06, "loss": 0.3440534174442291, "memory(GiB)": 78.33, "step": 4621, "token_acc": 0.8966321509124462, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.895606258780216, "grad_norm": 0.11845773458480835, "learning_rate": 8.860722080383593e-06, "loss": 0.35379868745803833, "memory(GiB)": 78.33, "step": 4622, "token_acc": 0.8932370241687239, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.8958000290655428, "grad_norm": 0.0978410616517067, "learning_rate": 8.82820027818763e-06, "loss": 0.3412013053894043, "memory(GiB)": 78.33, "step": 4623, "token_acc": 0.9002301375594958, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.8959937993508695, "grad_norm": 0.09728839993476868, "learning_rate": 8.795736459043246e-06, "loss": 0.3061705529689789, "memory(GiB)": 78.33, "step": 4624, "token_acc": 0.9070262390670554, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.8961875696361963, "grad_norm": 0.09861348569393158, "learning_rate": 8.763330636284204e-06, "loss": 0.3142586350440979, "memory(GiB)": 78.33, "step": 4625, "token_acc": 0.9079194559839123, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.896381339921523, "grad_norm": 0.09855727106332779, "learning_rate": 8.730982823220445e-06, "loss": 0.32500770688056946, "memory(GiB)": 78.33, "step": 4626, "token_acc": 0.9023219335979276, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.8965751102068498, "grad_norm": 0.10426346212625504, "learning_rate": 8.698693033138054e-06, "loss": 0.32699429988861084, "memory(GiB)": 78.33, "step": 4627, "token_acc": 0.9018576936558009, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.8967688804921765, "grad_norm": 0.09685856848955154, "learning_rate": 8.666461279299408e-06, "loss": 0.3376917243003845, "memory(GiB)": 78.33, "step": 4628, "token_acc": 0.8998025786638719, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.8969626507775033, "grad_norm": 0.10653463006019592, "learning_rate": 8.634287574942833e-06, "loss": 0.3381001055240631, "memory(GiB)": 78.33, "step": 4629, "token_acc": 0.9000566335807326, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.89715642106283, "grad_norm": 0.1089102178812027, "learning_rate": 8.602171933283025e-06, "loss": 0.3288399279117584, "memory(GiB)": 78.33, "step": 4630, "token_acc": 0.9028040685495066, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.8973501913481567, "grad_norm": 0.10240910202264786, "learning_rate": 8.570114367510717e-06, "loss": 0.33557647466659546, "memory(GiB)": 78.33, "step": 4631, "token_acc": 0.897394733136178, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.8975439616334835, "grad_norm": 0.10204704850912094, "learning_rate": 8.53811489079279e-06, "loss": 0.3257525861263275, "memory(GiB)": 78.33, "step": 4632, "token_acc": 0.9036119817088836, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.8977377319188102, "grad_norm": 0.10609758645296097, "learning_rate": 8.506173516272319e-06, "loss": 0.362753689289093, "memory(GiB)": 78.33, "step": 4633, "token_acc": 0.8909109816971714, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.897931502204137, "grad_norm": 0.09235696494579315, "learning_rate": 8.474290257068456e-06, "loss": 0.3118291199207306, "memory(GiB)": 78.33, "step": 4634, "token_acc": 0.9042984381050729, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.8981252724894637, "grad_norm": 0.10148902982473373, "learning_rate": 8.44246512627656e-06, "loss": 0.3247118592262268, "memory(GiB)": 78.33, "step": 4635, "token_acc": 0.9009333957432749, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.8983190427747905, "grad_norm": 0.10464513301849365, "learning_rate": 8.41069813696799e-06, "loss": 0.3417477309703827, "memory(GiB)": 78.33, "step": 4636, "token_acc": 0.898022001552421, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.8985128130601172, "grad_norm": 0.0930318832397461, "learning_rate": 8.37898930219038e-06, "loss": 0.3098987638950348, "memory(GiB)": 78.33, "step": 4637, "token_acc": 0.9070895522388059, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.898706583345444, "grad_norm": 0.09498867392539978, "learning_rate": 8.34733863496736e-06, "loss": 0.3298591375350952, "memory(GiB)": 78.33, "step": 4638, "token_acc": 0.9012791647079199, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.8989003536307707, "grad_norm": 0.09717161953449249, "learning_rate": 8.315746148298713e-06, "loss": 0.3042880892753601, "memory(GiB)": 78.33, "step": 4639, "token_acc": 0.9077266387726639, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.8990941239160974, "grad_norm": 0.09742318093776703, "learning_rate": 8.284211855160328e-06, "loss": 0.32740819454193115, "memory(GiB)": 78.33, "step": 4640, "token_acc": 0.9031217784356744, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.8992878942014242, "grad_norm": 0.10585065186023712, "learning_rate": 8.252735768504176e-06, "loss": 0.32093313336372375, "memory(GiB)": 78.33, "step": 4641, "token_acc": 0.9033283470456245, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.8994816644867509, "grad_norm": 0.10464109480381012, "learning_rate": 8.221317901258367e-06, "loss": 0.3451445698738098, "memory(GiB)": 78.33, "step": 4642, "token_acc": 0.8969477704080168, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.8996754347720777, "grad_norm": 0.09843490272760391, "learning_rate": 8.189958266326996e-06, "loss": 0.3220871090888977, "memory(GiB)": 78.33, "step": 4643, "token_acc": 0.9026759937819232, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.8998692050574044, "grad_norm": 0.10574258118867874, "learning_rate": 8.158656876590375e-06, "loss": 0.36457881331443787, "memory(GiB)": 78.33, "step": 4644, "token_acc": 0.8903435022820082, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.9000629753427312, "grad_norm": 0.0989030972123146, "learning_rate": 8.127413744904804e-06, "loss": 0.3353177309036255, "memory(GiB)": 78.33, "step": 4645, "token_acc": 0.8995719135954338, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.9002567456280579, "grad_norm": 0.10432767868041992, "learning_rate": 8.096228884102652e-06, "loss": 0.32852283120155334, "memory(GiB)": 78.33, "step": 4646, "token_acc": 0.9020051081818983, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.9004505159133847, "grad_norm": 0.09884372353553772, "learning_rate": 8.065102306992439e-06, "loss": 0.34128913283348083, "memory(GiB)": 78.33, "step": 4647, "token_acc": 0.8975473369505038, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.9006442861987114, "grad_norm": 0.09996260702610016, "learning_rate": 8.034034026358587e-06, "loss": 0.32066965103149414, "memory(GiB)": 78.33, "step": 4648, "token_acc": 0.9021558379855089, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.9008380564840381, "grad_norm": 0.10087880492210388, "learning_rate": 8.003024054961776e-06, "loss": 0.34059831500053406, "memory(GiB)": 78.33, "step": 4649, "token_acc": 0.89795299887043, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.9010318267693649, "grad_norm": 0.10195588320493698, "learning_rate": 7.972072405538582e-06, "loss": 0.3243820369243622, "memory(GiB)": 78.33, "step": 4650, "token_acc": 0.9031757307864341, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.9012255970546916, "grad_norm": 0.11210127919912338, "learning_rate": 7.941179090801687e-06, "loss": 0.33561965823173523, "memory(GiB)": 78.33, "step": 4651, "token_acc": 0.8998974967630556, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.9014193673400184, "grad_norm": 0.09735889732837677, "learning_rate": 7.91034412343982e-06, "loss": 0.3177420496940613, "memory(GiB)": 78.33, "step": 4652, "token_acc": 0.9025480310669028, "train_speed(iter/s)": 0.032258 }, { "epoch": 0.9016131376253451, "grad_norm": 0.1058276817202568, "learning_rate": 7.879567516117691e-06, "loss": 0.34389030933380127, "memory(GiB)": 78.33, "step": 4653, "token_acc": 0.9005196304849884, "train_speed(iter/s)": 0.032259 }, { "epoch": 0.9018069079106719, "grad_norm": 0.10890252888202667, "learning_rate": 7.848849281476149e-06, "loss": 0.3455137014389038, "memory(GiB)": 78.33, "step": 4654, "token_acc": 0.89581208468552, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.9020006781959986, "grad_norm": 0.09760237485170364, "learning_rate": 7.818189432131921e-06, "loss": 0.3121528923511505, "memory(GiB)": 78.33, "step": 4655, "token_acc": 0.9046669988374024, "train_speed(iter/s)": 0.03226 }, { "epoch": 0.9021944484813254, "grad_norm": 0.0973569005727768, "learning_rate": 7.787587980677868e-06, "loss": 0.3131091594696045, "memory(GiB)": 78.33, "step": 4656, "token_acc": 0.9063661202185792, "train_speed(iter/s)": 0.032261 }, { "epoch": 0.9023882187666521, "grad_norm": 0.09090343117713928, "learning_rate": 7.75704493968285e-06, "loss": 0.29967737197875977, "memory(GiB)": 78.33, "step": 4657, "token_acc": 0.9070998676422846, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.9025819890519788, "grad_norm": 0.10325030982494354, "learning_rate": 7.726560321691682e-06, "loss": 0.3381047248840332, "memory(GiB)": 78.33, "step": 4658, "token_acc": 0.8999558238845531, "train_speed(iter/s)": 0.032262 }, { "epoch": 0.9027757593373056, "grad_norm": 0.10501549392938614, "learning_rate": 7.696134139225219e-06, "loss": 0.33340954780578613, "memory(GiB)": 78.33, "step": 4659, "token_acc": 0.8982804760006833, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.9029695296226323, "grad_norm": 0.098429374396801, "learning_rate": 7.66576640478031e-06, "loss": 0.30632802844047546, "memory(GiB)": 78.33, "step": 4660, "token_acc": 0.9067490984028851, "train_speed(iter/s)": 0.032263 }, { "epoch": 0.9031632999079591, "grad_norm": 0.11389485746622086, "learning_rate": 7.635457130829832e-06, "loss": 0.34137189388275146, "memory(GiB)": 78.33, "step": 4661, "token_acc": 0.8986180210060807, "train_speed(iter/s)": 0.032264 }, { "epoch": 0.9033570701932858, "grad_norm": 0.09606049209833145, "learning_rate": 7.6052063298225715e-06, "loss": 0.29577910900115967, "memory(GiB)": 78.33, "step": 4662, "token_acc": 0.9094975613979395, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.9035508404786126, "grad_norm": 0.10640320926904678, "learning_rate": 7.575014014183378e-06, "loss": 0.30323442816734314, "memory(GiB)": 78.33, "step": 4663, "token_acc": 0.9066974069962713, "train_speed(iter/s)": 0.032265 }, { "epoch": 0.9037446107639393, "grad_norm": 0.10014378279447556, "learning_rate": 7.5448801963130305e-06, "loss": 0.337171733379364, "memory(GiB)": 78.33, "step": 4664, "token_acc": 0.8995919717688575, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.903938381049266, "grad_norm": 0.10196644067764282, "learning_rate": 7.5148048885883105e-06, "loss": 0.31075039505958557, "memory(GiB)": 78.33, "step": 4665, "token_acc": 0.9083940206975852, "train_speed(iter/s)": 0.032266 }, { "epoch": 0.9041321513345928, "grad_norm": 0.09376231580972672, "learning_rate": 7.484788103361955e-06, "loss": 0.3261268436908722, "memory(GiB)": 78.33, "step": 4666, "token_acc": 0.9021167228378149, "train_speed(iter/s)": 0.032267 }, { "epoch": 0.9043259216199195, "grad_norm": 0.09824056923389435, "learning_rate": 7.454829852962635e-06, "loss": 0.32310348749160767, "memory(GiB)": 78.33, "step": 4667, "token_acc": 0.9040854978354979, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.9045196919052463, "grad_norm": 0.09825246036052704, "learning_rate": 7.424930149695074e-06, "loss": 0.32509177923202515, "memory(GiB)": 78.33, "step": 4668, "token_acc": 0.9037440744840898, "train_speed(iter/s)": 0.032268 }, { "epoch": 0.904713462190573, "grad_norm": 0.11145921051502228, "learning_rate": 7.395089005839783e-06, "loss": 0.32568415999412537, "memory(GiB)": 78.33, "step": 4669, "token_acc": 0.9009174311926605, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.9049072324758998, "grad_norm": 0.10170602053403854, "learning_rate": 7.365306433653423e-06, "loss": 0.3407144844532013, "memory(GiB)": 78.33, "step": 4670, "token_acc": 0.8984231756508985, "train_speed(iter/s)": 0.032269 }, { "epoch": 0.9051010027612265, "grad_norm": 0.1278442144393921, "learning_rate": 7.335582445368443e-06, "loss": 0.3306085467338562, "memory(GiB)": 78.33, "step": 4671, "token_acc": 0.8980235373613379, "train_speed(iter/s)": 0.03227 }, { "epoch": 0.9052947730465533, "grad_norm": 0.0913781076669693, "learning_rate": 7.305917053193294e-06, "loss": 0.28779953718185425, "memory(GiB)": 78.33, "step": 4672, "token_acc": 0.9126665944305992, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.90548854333188, "grad_norm": 0.08933483064174652, "learning_rate": 7.276310269312347e-06, "loss": 0.3098442852497101, "memory(GiB)": 78.33, "step": 4673, "token_acc": 0.9050055824339412, "train_speed(iter/s)": 0.032271 }, { "epoch": 0.9056823136172067, "grad_norm": 0.09485527127981186, "learning_rate": 7.24676210588589e-06, "loss": 0.3187693655490875, "memory(GiB)": 78.33, "step": 4674, "token_acc": 0.9054063940302876, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.9058760839025335, "grad_norm": 0.10101988166570663, "learning_rate": 7.217272575050198e-06, "loss": 0.3332398235797882, "memory(GiB)": 78.33, "step": 4675, "token_acc": 0.9006496386548044, "train_speed(iter/s)": 0.032272 }, { "epoch": 0.9060698541878603, "grad_norm": 0.1058175191283226, "learning_rate": 7.187841688917351e-06, "loss": 0.30771443247795105, "memory(GiB)": 78.33, "step": 4676, "token_acc": 0.9090541632983024, "train_speed(iter/s)": 0.032273 }, { "epoch": 0.9062636244731871, "grad_norm": 0.10288235545158386, "learning_rate": 7.158469459575444e-06, "loss": 0.31234169006347656, "memory(GiB)": 78.33, "step": 4677, "token_acc": 0.9043364814657251, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.9064573947585138, "grad_norm": 0.11373648047447205, "learning_rate": 7.129155899088429e-06, "loss": 0.36738601326942444, "memory(GiB)": 78.33, "step": 4678, "token_acc": 0.8934572225850501, "train_speed(iter/s)": 0.032274 }, { "epoch": 0.9066511650438406, "grad_norm": 0.09895353019237518, "learning_rate": 7.099901019496157e-06, "loss": 0.32815736532211304, "memory(GiB)": 78.33, "step": 4679, "token_acc": 0.9006158244900203, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.9068449353291673, "grad_norm": 0.10247547179460526, "learning_rate": 7.070704832814467e-06, "loss": 0.32758232951164246, "memory(GiB)": 78.33, "step": 4680, "token_acc": 0.9017008208823009, "train_speed(iter/s)": 0.032275 }, { "epoch": 0.9070387056144941, "grad_norm": 0.09913128614425659, "learning_rate": 7.041567351034899e-06, "loss": 0.31721755862236023, "memory(GiB)": 78.33, "step": 4681, "token_acc": 0.9058820262163964, "train_speed(iter/s)": 0.032276 }, { "epoch": 0.9072324758998208, "grad_norm": 0.1092270091176033, "learning_rate": 7.0124885861251145e-06, "loss": 0.31192123889923096, "memory(GiB)": 78.33, "step": 4682, "token_acc": 0.9066539550794942, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.9074262461851476, "grad_norm": 0.09576813131570816, "learning_rate": 6.983468550028442e-06, "loss": 0.3214895725250244, "memory(GiB)": 78.33, "step": 4683, "token_acc": 0.9031646569427624, "train_speed(iter/s)": 0.032277 }, { "epoch": 0.9076200164704743, "grad_norm": 0.09397298097610474, "learning_rate": 6.954507254664266e-06, "loss": 0.3123416006565094, "memory(GiB)": 78.33, "step": 4684, "token_acc": 0.9053452877968654, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.907813786755801, "grad_norm": 0.11117564886808395, "learning_rate": 6.925604711927751e-06, "loss": 0.36098116636276245, "memory(GiB)": 78.33, "step": 4685, "token_acc": 0.8910591965540965, "train_speed(iter/s)": 0.032278 }, { "epoch": 0.9080075570411278, "grad_norm": 0.0936315655708313, "learning_rate": 6.896760933689904e-06, "loss": 0.3093935251235962, "memory(GiB)": 78.33, "step": 4686, "token_acc": 0.9063284652831155, "train_speed(iter/s)": 0.032279 }, { "epoch": 0.9082013273264545, "grad_norm": 0.09810250252485275, "learning_rate": 6.867975931797715e-06, "loss": 0.2804984450340271, "memory(GiB)": 78.33, "step": 4687, "token_acc": 0.9146265467818037, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.9083950976117813, "grad_norm": 0.09873296320438385, "learning_rate": 6.839249718073875e-06, "loss": 0.3098883032798767, "memory(GiB)": 78.33, "step": 4688, "token_acc": 0.9055900621118013, "train_speed(iter/s)": 0.03228 }, { "epoch": 0.908588867897108, "grad_norm": 0.10540574043989182, "learning_rate": 6.810582304317081e-06, "loss": 0.31421953439712524, "memory(GiB)": 78.33, "step": 4689, "token_acc": 0.9060614765871876, "train_speed(iter/s)": 0.032281 }, { "epoch": 0.9087826381824348, "grad_norm": 0.12720443308353424, "learning_rate": 6.781973702301796e-06, "loss": 0.3550341725349426, "memory(GiB)": 78.33, "step": 4690, "token_acc": 0.8944157949020186, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.9089764084677615, "grad_norm": 0.1082942858338356, "learning_rate": 6.7534239237783065e-06, "loss": 0.3174242675304413, "memory(GiB)": 78.33, "step": 4691, "token_acc": 0.9045280815373182, "train_speed(iter/s)": 0.032282 }, { "epoch": 0.9091701787530883, "grad_norm": 0.0936630368232727, "learning_rate": 6.724932980472813e-06, "loss": 0.3189206123352051, "memory(GiB)": 78.33, "step": 4692, "token_acc": 0.9040681037771843, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.909363949038415, "grad_norm": 0.1038210541009903, "learning_rate": 6.696500884087258e-06, "loss": 0.3283487558364868, "memory(GiB)": 78.33, "step": 4693, "token_acc": 0.901529364943999, "train_speed(iter/s)": 0.032283 }, { "epoch": 0.9095577193237417, "grad_norm": 0.09301093220710754, "learning_rate": 6.668127646299548e-06, "loss": 0.3160119652748108, "memory(GiB)": 78.33, "step": 4694, "token_acc": 0.906036029207408, "train_speed(iter/s)": 0.032284 }, { "epoch": 0.9097514896090685, "grad_norm": 0.10908481478691101, "learning_rate": 6.639813278763262e-06, "loss": 0.35018086433410645, "memory(GiB)": 78.33, "step": 4695, "token_acc": 0.8955263546434372, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.9099452598943952, "grad_norm": 0.09090767800807953, "learning_rate": 6.611557793107914e-06, "loss": 0.30631187558174133, "memory(GiB)": 78.33, "step": 4696, "token_acc": 0.9081879446779534, "train_speed(iter/s)": 0.032285 }, { "epoch": 0.910139030179722, "grad_norm": 0.10230287909507751, "learning_rate": 6.583361200938769e-06, "loss": 0.3390125036239624, "memory(GiB)": 78.33, "step": 4697, "token_acc": 0.8979241998125084, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.9103328004650487, "grad_norm": 0.10016355663537979, "learning_rate": 6.5552235138369494e-06, "loss": 0.32282742857933044, "memory(GiB)": 78.33, "step": 4698, "token_acc": 0.904220375941925, "train_speed(iter/s)": 0.032286 }, { "epoch": 0.9105265707503755, "grad_norm": 0.10659368336200714, "learning_rate": 6.527144743359342e-06, "loss": 0.35568249225616455, "memory(GiB)": 78.33, "step": 4699, "token_acc": 0.8939569935121965, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.9107203410357022, "grad_norm": 0.10241632908582687, "learning_rate": 6.499124901038621e-06, "loss": 0.33742234110832214, "memory(GiB)": 78.33, "step": 4700, "token_acc": 0.8987963512711178, "train_speed(iter/s)": 0.032287 }, { "epoch": 0.910914111321029, "grad_norm": 0.09770441055297852, "learning_rate": 6.471163998383366e-06, "loss": 0.3216272294521332, "memory(GiB)": 78.33, "step": 4701, "token_acc": 0.903465820499494, "train_speed(iter/s)": 0.032288 }, { "epoch": 0.9111078816063557, "grad_norm": 0.09685520827770233, "learning_rate": 6.44326204687779e-06, "loss": 0.2898436188697815, "memory(GiB)": 78.33, "step": 4702, "token_acc": 0.9125583603703411, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.9113016518916824, "grad_norm": 0.10244952887296677, "learning_rate": 6.415419057982024e-06, "loss": 0.3338771164417267, "memory(GiB)": 78.33, "step": 4703, "token_acc": 0.8992232679442114, "train_speed(iter/s)": 0.032289 }, { "epoch": 0.9114954221770092, "grad_norm": 0.09772542864084244, "learning_rate": 6.387635043131923e-06, "loss": 0.31243184208869934, "memory(GiB)": 78.33, "step": 4704, "token_acc": 0.9057063523545078, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.9116891924623359, "grad_norm": 0.10280278325080872, "learning_rate": 6.359910013739122e-06, "loss": 0.3368726372718811, "memory(GiB)": 78.33, "step": 4705, "token_acc": 0.8969188578803032, "train_speed(iter/s)": 0.03229 }, { "epoch": 0.9118829627476627, "grad_norm": 0.09057314693927765, "learning_rate": 6.332243981191032e-06, "loss": 0.30282315611839294, "memory(GiB)": 78.33, "step": 4706, "token_acc": 0.9087763447625039, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.9120767330329894, "grad_norm": 0.10076764971017838, "learning_rate": 6.304636956850828e-06, "loss": 0.32998421788215637, "memory(GiB)": 78.33, "step": 4707, "token_acc": 0.900506177543134, "train_speed(iter/s)": 0.032291 }, { "epoch": 0.9122705033183162, "grad_norm": 0.10298432409763336, "learning_rate": 6.277088952057508e-06, "loss": 0.3607329726219177, "memory(GiB)": 78.33, "step": 4708, "token_acc": 0.8920216927933519, "train_speed(iter/s)": 0.032292 }, { "epoch": 0.9124642736036429, "grad_norm": 0.09257783740758896, "learning_rate": 6.249599978125685e-06, "loss": 0.31179288029670715, "memory(GiB)": 78.33, "step": 4709, "token_acc": 0.9052415912583234, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.9126580438889697, "grad_norm": 0.10241623967885971, "learning_rate": 6.222170046345914e-06, "loss": 0.3377891480922699, "memory(GiB)": 78.33, "step": 4710, "token_acc": 0.8971870153484144, "train_speed(iter/s)": 0.032293 }, { "epoch": 0.9128518141742964, "grad_norm": 0.09825358539819717, "learning_rate": 6.194799167984365e-06, "loss": 0.2925589084625244, "memory(GiB)": 78.33, "step": 4711, "token_acc": 0.9106943825234062, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.9130455844596231, "grad_norm": 0.09656988829374313, "learning_rate": 6.167487354282963e-06, "loss": 0.32839635014533997, "memory(GiB)": 78.33, "step": 4712, "token_acc": 0.9005813656994237, "train_speed(iter/s)": 0.032294 }, { "epoch": 0.9132393547449499, "grad_norm": 0.10096679627895355, "learning_rate": 6.140234616459483e-06, "loss": 0.33763110637664795, "memory(GiB)": 78.33, "step": 4713, "token_acc": 0.895675275944236, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.9134331250302766, "grad_norm": 0.10386110842227936, "learning_rate": 6.113040965707256e-06, "loss": 0.30816999077796936, "memory(GiB)": 78.33, "step": 4714, "token_acc": 0.904618748291883, "train_speed(iter/s)": 0.032295 }, { "epoch": 0.9136268953156034, "grad_norm": 0.09449145197868347, "learning_rate": 6.085906413195546e-06, "loss": 0.30260464549064636, "memory(GiB)": 78.33, "step": 4715, "token_acc": 0.9089724446857608, "train_speed(iter/s)": 0.032296 }, { "epoch": 0.9138206656009301, "grad_norm": 0.1062229797244072, "learning_rate": 6.058830970069156e-06, "loss": 0.3432777523994446, "memory(GiB)": 78.33, "step": 4716, "token_acc": 0.8961035016855469, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.9140144358862569, "grad_norm": 0.10210466384887695, "learning_rate": 6.03181464744874e-06, "loss": 0.31807243824005127, "memory(GiB)": 78.33, "step": 4717, "token_acc": 0.9045282715354299, "train_speed(iter/s)": 0.032297 }, { "epoch": 0.9142082061715836, "grad_norm": 0.10354772955179214, "learning_rate": 6.00485745643064e-06, "loss": 0.3176378011703491, "memory(GiB)": 78.33, "step": 4718, "token_acc": 0.9056022642137684, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.9144019764569103, "grad_norm": 0.10732652246952057, "learning_rate": 5.977959408086863e-06, "loss": 0.3399035930633545, "memory(GiB)": 78.33, "step": 4719, "token_acc": 0.8993071737036692, "train_speed(iter/s)": 0.032298 }, { "epoch": 0.9145957467422371, "grad_norm": 0.08825431764125824, "learning_rate": 5.951120513465207e-06, "loss": 0.2766910195350647, "memory(GiB)": 78.33, "step": 4720, "token_acc": 0.9151678797839868, "train_speed(iter/s)": 0.032299 }, { "epoch": 0.9147895170275638, "grad_norm": 0.08927903324365616, "learning_rate": 5.924340783589071e-06, "loss": 0.2686108350753784, "memory(GiB)": 78.33, "step": 4721, "token_acc": 0.917844274653106, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.9149832873128906, "grad_norm": 0.09068787842988968, "learning_rate": 5.897620229457639e-06, "loss": 0.2903996407985687, "memory(GiB)": 78.33, "step": 4722, "token_acc": 0.9110736173485845, "train_speed(iter/s)": 0.0323 }, { "epoch": 0.9151770575982173, "grad_norm": 0.10634169727563858, "learning_rate": 5.870958862045782e-06, "loss": 0.3310278356075287, "memory(GiB)": 78.33, "step": 4723, "token_acc": 0.9006753122572163, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.9153708278835441, "grad_norm": 0.09822243452072144, "learning_rate": 5.844356692304009e-06, "loss": 0.316002756357193, "memory(GiB)": 78.33, "step": 4724, "token_acc": 0.9054254372961755, "train_speed(iter/s)": 0.032301 }, { "epoch": 0.9155645981688708, "grad_norm": 0.10107910633087158, "learning_rate": 5.817813731158544e-06, "loss": 0.3441876769065857, "memory(GiB)": 78.33, "step": 4725, "token_acc": 0.8951170707621944, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.9157583684541976, "grad_norm": 0.10910195857286453, "learning_rate": 5.791329989511301e-06, "loss": 0.34992867708206177, "memory(GiB)": 78.33, "step": 4726, "token_acc": 0.8951081299872788, "train_speed(iter/s)": 0.032302 }, { "epoch": 0.9159521387395243, "grad_norm": 0.10190389305353165, "learning_rate": 5.764905478239895e-06, "loss": 0.3404694199562073, "memory(GiB)": 78.33, "step": 4727, "token_acc": 0.8978785857238158, "train_speed(iter/s)": 0.032303 }, { "epoch": 0.916145909024851, "grad_norm": 0.09621675312519073, "learning_rate": 5.7385402081975284e-06, "loss": 0.3171209990978241, "memory(GiB)": 78.33, "step": 4728, "token_acc": 0.9050346044239381, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.9163396793101778, "grad_norm": 0.09636913985013962, "learning_rate": 5.712234190213172e-06, "loss": 0.317484587430954, "memory(GiB)": 78.33, "step": 4729, "token_acc": 0.9054396568531877, "train_speed(iter/s)": 0.032304 }, { "epoch": 0.9165334495955045, "grad_norm": 0.10614298284053802, "learning_rate": 5.685987435091399e-06, "loss": 0.3546099066734314, "memory(GiB)": 78.33, "step": 4730, "token_acc": 0.8933270676691729, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.9167272198808313, "grad_norm": 0.1044028177857399, "learning_rate": 5.659799953612438e-06, "loss": 0.349134236574173, "memory(GiB)": 78.33, "step": 4731, "token_acc": 0.8962899543378996, "train_speed(iter/s)": 0.032305 }, { "epoch": 0.916920990166158, "grad_norm": 0.09492190182209015, "learning_rate": 5.633671756532232e-06, "loss": 0.2976999878883362, "memory(GiB)": 78.33, "step": 4732, "token_acc": 0.9110198151256294, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.9171147604514848, "grad_norm": 0.10704229772090912, "learning_rate": 5.607602854582266e-06, "loss": 0.38176852464675903, "memory(GiB)": 78.33, "step": 4733, "token_acc": 0.8874598960061953, "train_speed(iter/s)": 0.032306 }, { "epoch": 0.9173085307368115, "grad_norm": 0.09060212224721909, "learning_rate": 5.581593258469841e-06, "loss": 0.30857348442077637, "memory(GiB)": 78.33, "step": 4734, "token_acc": 0.9065131425913003, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.9175023010221383, "grad_norm": 0.10411375015974045, "learning_rate": 5.555642978877678e-06, "loss": 0.3482334315776825, "memory(GiB)": 78.33, "step": 4735, "token_acc": 0.8967308452374231, "train_speed(iter/s)": 0.032307 }, { "epoch": 0.917696071307465, "grad_norm": 0.11206993460655212, "learning_rate": 5.529752026464351e-06, "loss": 0.35386189818382263, "memory(GiB)": 78.33, "step": 4736, "token_acc": 0.8936384358071106, "train_speed(iter/s)": 0.032308 }, { "epoch": 0.9178898415927917, "grad_norm": 0.0973845049738884, "learning_rate": 5.5039204118639215e-06, "loss": 0.34648823738098145, "memory(GiB)": 78.33, "step": 4737, "token_acc": 0.8961357595583725, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.9180836118781185, "grad_norm": 0.09547542035579681, "learning_rate": 5.478148145686151e-06, "loss": 0.32375043630599976, "memory(GiB)": 78.33, "step": 4738, "token_acc": 0.9037339556592765, "train_speed(iter/s)": 0.032309 }, { "epoch": 0.9182773821634452, "grad_norm": 0.10374420881271362, "learning_rate": 5.452435238516373e-06, "loss": 0.34971579909324646, "memory(GiB)": 78.33, "step": 4739, "token_acc": 0.8960960267766973, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.918471152448772, "grad_norm": 0.098647341132164, "learning_rate": 5.426781700915573e-06, "loss": 0.33870449662208557, "memory(GiB)": 78.33, "step": 4740, "token_acc": 0.8994294051744353, "train_speed(iter/s)": 0.03231 }, { "epoch": 0.9186649227340987, "grad_norm": 0.1018560454249382, "learning_rate": 5.401187543420405e-06, "loss": 0.3089270293712616, "memory(GiB)": 78.33, "step": 4741, "token_acc": 0.9068556551923633, "train_speed(iter/s)": 0.032311 }, { "epoch": 0.9188586930194255, "grad_norm": 0.10648074001073837, "learning_rate": 5.375652776542994e-06, "loss": 0.33178946375846863, "memory(GiB)": 78.33, "step": 4742, "token_acc": 0.8994858134060215, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.9190524633047522, "grad_norm": 0.09511461853981018, "learning_rate": 5.350177410771217e-06, "loss": 0.31413573026657104, "memory(GiB)": 78.33, "step": 4743, "token_acc": 0.9058918735479156, "train_speed(iter/s)": 0.032312 }, { "epoch": 0.919246233590079, "grad_norm": 0.10511302202939987, "learning_rate": 5.324761456568455e-06, "loss": 0.3475594222545624, "memory(GiB)": 78.33, "step": 4744, "token_acc": 0.8972395423451783, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.9194400038754057, "grad_norm": 0.0897306576371193, "learning_rate": 5.29940492437374e-06, "loss": 0.30537813901901245, "memory(GiB)": 78.33, "step": 4745, "token_acc": 0.9067023393016396, "train_speed(iter/s)": 0.032313 }, { "epoch": 0.9196337741607324, "grad_norm": 0.10397239774465561, "learning_rate": 5.274107824601692e-06, "loss": 0.3287217617034912, "memory(GiB)": 78.33, "step": 4746, "token_acc": 0.9034060827107477, "train_speed(iter/s)": 0.032314 }, { "epoch": 0.9198275444460592, "grad_norm": 0.10913705080747604, "learning_rate": 5.248870167642466e-06, "loss": 0.37385323643684387, "memory(GiB)": 78.33, "step": 4747, "token_acc": 0.8894808050059192, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.9200213147313859, "grad_norm": 0.09678813070058823, "learning_rate": 5.22369196386192e-06, "loss": 0.29931795597076416, "memory(GiB)": 78.33, "step": 4748, "token_acc": 0.9094277587871513, "train_speed(iter/s)": 0.032315 }, { "epoch": 0.9202150850167127, "grad_norm": 0.1072593703866005, "learning_rate": 5.198573223601332e-06, "loss": 0.33650124073028564, "memory(GiB)": 78.33, "step": 4749, "token_acc": 0.9004353969032035, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.9204088553020394, "grad_norm": 0.10376523435115814, "learning_rate": 5.173513957177716e-06, "loss": 0.35175377130508423, "memory(GiB)": 78.33, "step": 4750, "token_acc": 0.8930210695128021, "train_speed(iter/s)": 0.032316 }, { "epoch": 0.9206026255873662, "grad_norm": 0.09530597925186157, "learning_rate": 5.148514174883539e-06, "loss": 0.3145235478878021, "memory(GiB)": 78.33, "step": 4751, "token_acc": 0.9033812729498164, "train_speed(iter/s)": 0.032317 }, { "epoch": 0.9207963958726929, "grad_norm": 0.09804235398769379, "learning_rate": 5.123573886986887e-06, "loss": 0.32991063594818115, "memory(GiB)": 78.33, "step": 4752, "token_acc": 0.90162206627079, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.9209901661580197, "grad_norm": 0.09455542266368866, "learning_rate": 5.098693103731466e-06, "loss": 0.3050658106803894, "memory(GiB)": 78.33, "step": 4753, "token_acc": 0.9097025408482637, "train_speed(iter/s)": 0.032318 }, { "epoch": 0.9211839364433464, "grad_norm": 0.09994572401046753, "learning_rate": 5.073871835336402e-06, "loss": 0.3111793100833893, "memory(GiB)": 78.33, "step": 4754, "token_acc": 0.9068566536879021, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.9213777067286731, "grad_norm": 0.0913093090057373, "learning_rate": 5.049110091996505e-06, "loss": 0.2801298499107361, "memory(GiB)": 78.33, "step": 4755, "token_acc": 0.9132923800810352, "train_speed(iter/s)": 0.032319 }, { "epoch": 0.9215714770139999, "grad_norm": 0.1138007864356041, "learning_rate": 5.024407883882059e-06, "loss": 0.32883748412132263, "memory(GiB)": 78.33, "step": 4756, "token_acc": 0.8995948792740237, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.9217652472993266, "grad_norm": 0.10155371576547623, "learning_rate": 4.999765221138946e-06, "loss": 0.338005006313324, "memory(GiB)": 78.33, "step": 4757, "token_acc": 0.8987489089322083, "train_speed(iter/s)": 0.03232 }, { "epoch": 0.9219590175846534, "grad_norm": 0.09977617859840393, "learning_rate": 4.975182113888571e-06, "loss": 0.3259299397468567, "memory(GiB)": 78.33, "step": 4758, "token_acc": 0.9017150933652252, "train_speed(iter/s)": 0.032321 }, { "epoch": 0.9221527878699801, "grad_norm": 0.10653609782457352, "learning_rate": 4.950658572227856e-06, "loss": 0.3374466300010681, "memory(GiB)": 78.33, "step": 4759, "token_acc": 0.8985345429169574, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.9223465581553069, "grad_norm": 0.11073584109544754, "learning_rate": 4.926194606229311e-06, "loss": 0.3533399701118469, "memory(GiB)": 78.33, "step": 4760, "token_acc": 0.8946103673189152, "train_speed(iter/s)": 0.032322 }, { "epoch": 0.9225403284406336, "grad_norm": 0.10663000494241714, "learning_rate": 4.901790225940916e-06, "loss": 0.3574818968772888, "memory(GiB)": 78.33, "step": 4761, "token_acc": 0.8953725220185329, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.9227340987259603, "grad_norm": 0.08861353248357773, "learning_rate": 4.877445441386218e-06, "loss": 0.2784247100353241, "memory(GiB)": 78.33, "step": 4762, "token_acc": 0.9163010066405425, "train_speed(iter/s)": 0.032323 }, { "epoch": 0.9229278690112871, "grad_norm": 0.10696634650230408, "learning_rate": 4.853160262564271e-06, "loss": 0.34982746839523315, "memory(GiB)": 78.33, "step": 4763, "token_acc": 0.8978552430535907, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.9231216392966138, "grad_norm": 0.09375255554914474, "learning_rate": 4.8289346994496434e-06, "loss": 0.3086707890033722, "memory(GiB)": 78.33, "step": 4764, "token_acc": 0.9071370640713706, "train_speed(iter/s)": 0.032324 }, { "epoch": 0.9233154095819406, "grad_norm": 0.10051163285970688, "learning_rate": 4.804768761992445e-06, "loss": 0.33212947845458984, "memory(GiB)": 78.33, "step": 4765, "token_acc": 0.9005619215513442, "train_speed(iter/s)": 0.032325 }, { "epoch": 0.9235091798672673, "grad_norm": 0.0955612063407898, "learning_rate": 4.780662460118234e-06, "loss": 0.31439507007598877, "memory(GiB)": 78.33, "step": 4766, "token_acc": 0.9068176100628931, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.9237029501525941, "grad_norm": 0.09498281031847, "learning_rate": 4.756615803728192e-06, "loss": 0.31122851371765137, "memory(GiB)": 78.33, "step": 4767, "token_acc": 0.9053871107818846, "train_speed(iter/s)": 0.032326 }, { "epoch": 0.9238967204379208, "grad_norm": 0.10263197124004364, "learning_rate": 4.732628802698835e-06, "loss": 0.31166380643844604, "memory(GiB)": 78.33, "step": 4768, "token_acc": 0.9051054675202015, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.9240904907232476, "grad_norm": 0.08937201648950577, "learning_rate": 4.708701466882348e-06, "loss": 0.2815870940685272, "memory(GiB)": 78.33, "step": 4769, "token_acc": 0.9132003898738179, "train_speed(iter/s)": 0.032327 }, { "epoch": 0.9242842610085743, "grad_norm": 0.09476013481616974, "learning_rate": 4.684833806106286e-06, "loss": 0.32919424772262573, "memory(GiB)": 78.33, "step": 4770, "token_acc": 0.9013624509493285, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.924478031293901, "grad_norm": 0.09523847699165344, "learning_rate": 4.661025830173742e-06, "loss": 0.3461284935474396, "memory(GiB)": 78.33, "step": 4771, "token_acc": 0.8952430099455808, "train_speed(iter/s)": 0.032328 }, { "epoch": 0.9246718015792278, "grad_norm": 0.09518402069807053, "learning_rate": 4.63727754886331e-06, "loss": 0.30630695819854736, "memory(GiB)": 78.33, "step": 4772, "token_acc": 0.9087172109035788, "train_speed(iter/s)": 0.032329 }, { "epoch": 0.9248655718645545, "grad_norm": 0.09036926180124283, "learning_rate": 4.613588971929022e-06, "loss": 0.31284967064857483, "memory(GiB)": 78.33, "step": 4773, "token_acc": 0.9051664182811724, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.9250593421498813, "grad_norm": 0.09823027998209, "learning_rate": 4.589960109100444e-06, "loss": 0.30616480112075806, "memory(GiB)": 78.33, "step": 4774, "token_acc": 0.904891304347826, "train_speed(iter/s)": 0.03233 }, { "epoch": 0.925253112435208, "grad_norm": 0.10480865091085434, "learning_rate": 4.566390970082562e-06, "loss": 0.32189154624938965, "memory(GiB)": 78.33, "step": 4775, "token_acc": 0.9040938430251407, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.9254468827205348, "grad_norm": 0.09449231624603271, "learning_rate": 4.54288156455585e-06, "loss": 0.3326607942581177, "memory(GiB)": 78.33, "step": 4776, "token_acc": 0.9016650796510282, "train_speed(iter/s)": 0.032331 }, { "epoch": 0.9256406530058615, "grad_norm": 0.09553948044776917, "learning_rate": 4.519431902176285e-06, "loss": 0.318975567817688, "memory(GiB)": 78.33, "step": 4777, "token_acc": 0.9035005686844433, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.9258344232911883, "grad_norm": 0.09830733388662338, "learning_rate": 4.496041992575227e-06, "loss": 0.3239940404891968, "memory(GiB)": 78.33, "step": 4778, "token_acc": 0.900301950436604, "train_speed(iter/s)": 0.032332 }, { "epoch": 0.926028193576515, "grad_norm": 0.09941928833723068, "learning_rate": 4.472711845359594e-06, "loss": 0.3222883641719818, "memory(GiB)": 78.33, "step": 4779, "token_acc": 0.9041993341209322, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.9262219638618417, "grad_norm": 0.10502801835536957, "learning_rate": 4.449441470111653e-06, "loss": 0.35170549154281616, "memory(GiB)": 78.33, "step": 4780, "token_acc": 0.895190294701809, "train_speed(iter/s)": 0.032333 }, { "epoch": 0.9264157341471685, "grad_norm": 0.08964619785547256, "learning_rate": 4.426230876389208e-06, "loss": 0.2930634617805481, "memory(GiB)": 78.33, "step": 4781, "token_acc": 0.9114340692728378, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.9266095044324952, "grad_norm": 0.09876509755849838, "learning_rate": 4.403080073725451e-06, "loss": 0.31545448303222656, "memory(GiB)": 78.33, "step": 4782, "token_acc": 0.9063012667079041, "train_speed(iter/s)": 0.032334 }, { "epoch": 0.926803274717822, "grad_norm": 0.10196632146835327, "learning_rate": 4.379989071629059e-06, "loss": 0.3357621133327484, "memory(GiB)": 78.33, "step": 4783, "token_acc": 0.9003710320186891, "train_speed(iter/s)": 0.032335 }, { "epoch": 0.9269970450031487, "grad_norm": 0.10657868534326553, "learning_rate": 4.356957879584111e-06, "loss": 0.3701134920120239, "memory(GiB)": 78.33, "step": 4784, "token_acc": 0.8883001847625258, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.9271908152884755, "grad_norm": 0.09407640993595123, "learning_rate": 4.333986507050125e-06, "loss": 0.3258771002292633, "memory(GiB)": 78.33, "step": 4785, "token_acc": 0.9031313034290654, "train_speed(iter/s)": 0.032336 }, { "epoch": 0.9273845855738022, "grad_norm": 0.09928041696548462, "learning_rate": 4.311074963462119e-06, "loss": 0.34973931312561035, "memory(GiB)": 78.33, "step": 4786, "token_acc": 0.8954454830273685, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.927578355859129, "grad_norm": 0.10799898207187653, "learning_rate": 4.2882232582304e-06, "loss": 0.34097224473953247, "memory(GiB)": 78.33, "step": 4787, "token_acc": 0.9003566184182924, "train_speed(iter/s)": 0.032337 }, { "epoch": 0.9277721261444557, "grad_norm": 0.09870768338441849, "learning_rate": 4.265431400740843e-06, "loss": 0.3039855659008026, "memory(GiB)": 78.33, "step": 4788, "token_acc": 0.907001012860256, "train_speed(iter/s)": 0.032338 }, { "epoch": 0.9279658964297824, "grad_norm": 0.0973040908575058, "learning_rate": 4.242699400354627e-06, "loss": 0.34012576937675476, "memory(GiB)": 78.33, "step": 4789, "token_acc": 0.8992445949466007, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.9281596667151092, "grad_norm": 0.09325951337814331, "learning_rate": 4.220027266408432e-06, "loss": 0.30864644050598145, "memory(GiB)": 78.33, "step": 4790, "token_acc": 0.9057424223046425, "train_speed(iter/s)": 0.032339 }, { "epoch": 0.9283534370004359, "grad_norm": 0.2708342671394348, "learning_rate": 4.197415008214294e-06, "loss": 0.3377147316932678, "memory(GiB)": 78.33, "step": 4791, "token_acc": 0.897419232539876, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.9285472072857627, "grad_norm": 0.10447484999895096, "learning_rate": 4.174862635059667e-06, "loss": 0.36220014095306396, "memory(GiB)": 78.33, "step": 4792, "token_acc": 0.8931025706286587, "train_speed(iter/s)": 0.03234 }, { "epoch": 0.9287409775710894, "grad_norm": 0.10496804118156433, "learning_rate": 4.152370156207457e-06, "loss": 0.3289092481136322, "memory(GiB)": 78.33, "step": 4793, "token_acc": 0.9012323899291816, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.9289347478564162, "grad_norm": 0.09275829046964645, "learning_rate": 4.129937580895876e-06, "loss": 0.30164456367492676, "memory(GiB)": 78.33, "step": 4794, "token_acc": 0.9096051284813877, "train_speed(iter/s)": 0.032341 }, { "epoch": 0.9291285181417429, "grad_norm": 0.09018470346927643, "learning_rate": 4.107564918338635e-06, "loss": 0.2863234579563141, "memory(GiB)": 78.33, "step": 4795, "token_acc": 0.9150141643059491, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.9293222884270697, "grad_norm": 0.09511598944664001, "learning_rate": 4.085252177724751e-06, "loss": 0.29719942808151245, "memory(GiB)": 78.33, "step": 4796, "token_acc": 0.9094084830157199, "train_speed(iter/s)": 0.032342 }, { "epoch": 0.9295160587123965, "grad_norm": 0.11475857347249985, "learning_rate": 4.062999368218678e-06, "loss": 0.38321453332901, "memory(GiB)": 78.33, "step": 4797, "token_acc": 0.8886180329842301, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.9297098289977233, "grad_norm": 0.10678976774215698, "learning_rate": 4.040806498960236e-06, "loss": 0.3360079824924469, "memory(GiB)": 78.33, "step": 4798, "token_acc": 0.8982757019471875, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.92990359928305, "grad_norm": 0.09599542617797852, "learning_rate": 4.0186735790646355e-06, "loss": 0.30816134810447693, "memory(GiB)": 78.33, "step": 4799, "token_acc": 0.9078661001140015, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.9300973695683767, "grad_norm": 0.10428732633590698, "learning_rate": 3.996600617622503e-06, "loss": 0.3343293070793152, "memory(GiB)": 78.33, "step": 4800, "token_acc": 0.8990470605483067, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.9302911398537035, "grad_norm": 0.10038130730390549, "learning_rate": 3.974587623699721e-06, "loss": 0.2978077828884125, "memory(GiB)": 78.33, "step": 4801, "token_acc": 0.9088284412239237, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.9304849101390302, "grad_norm": 0.10180116444826126, "learning_rate": 3.9526346063376735e-06, "loss": 0.31616196036338806, "memory(GiB)": 78.33, "step": 4802, "token_acc": 0.9038762241790799, "train_speed(iter/s)": 0.032343 }, { "epoch": 0.930678680424357, "grad_norm": 0.10563566535711288, "learning_rate": 3.930741574553048e-06, "loss": 0.35403674840927124, "memory(GiB)": 78.33, "step": 4803, "token_acc": 0.8924158321943599, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.9308724507096837, "grad_norm": 0.10199618339538574, "learning_rate": 3.908908537337868e-06, "loss": 0.35334300994873047, "memory(GiB)": 78.33, "step": 4804, "token_acc": 0.8947171598813293, "train_speed(iter/s)": 0.032344 }, { "epoch": 0.9310662209950105, "grad_norm": 0.08841365575790405, "learning_rate": 3.887135503659594e-06, "loss": 0.27694904804229736, "memory(GiB)": 78.33, "step": 4805, "token_acc": 0.9153520015775981, "train_speed(iter/s)": 0.032345 }, { "epoch": 0.9312599912803372, "grad_norm": 0.10381640493869781, "learning_rate": 3.8654224824609396e-06, "loss": 0.2975887358188629, "memory(GiB)": 78.33, "step": 4806, "token_acc": 0.9085034107542324, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.931453761565664, "grad_norm": 0.10239759087562561, "learning_rate": 3.8437694826601025e-06, "loss": 0.34857773780822754, "memory(GiB)": 78.33, "step": 4807, "token_acc": 0.8943446895898057, "train_speed(iter/s)": 0.032346 }, { "epoch": 0.9316475318509907, "grad_norm": 0.12824027240276337, "learning_rate": 3.8221765131504714e-06, "loss": 0.32302480936050415, "memory(GiB)": 78.33, "step": 4808, "token_acc": 0.904497486048953, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.9318413021363174, "grad_norm": 0.09815438836812973, "learning_rate": 3.8006435828009162e-06, "loss": 0.30667853355407715, "memory(GiB)": 78.33, "step": 4809, "token_acc": 0.907749177788046, "train_speed(iter/s)": 0.032347 }, { "epoch": 0.9320350724216442, "grad_norm": 0.09541095048189163, "learning_rate": 3.7791707004555802e-06, "loss": 0.3058740794658661, "memory(GiB)": 78.33, "step": 4810, "token_acc": 0.9084823790877341, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.9322288427069709, "grad_norm": 0.10464680939912796, "learning_rate": 3.7577578749339255e-06, "loss": 0.3602639138698578, "memory(GiB)": 78.33, "step": 4811, "token_acc": 0.8935190262090141, "train_speed(iter/s)": 0.032348 }, { "epoch": 0.9324226129922977, "grad_norm": 0.10377727448940277, "learning_rate": 3.7364051150307993e-06, "loss": 0.33127960562705994, "memory(GiB)": 78.33, "step": 4812, "token_acc": 0.9014416334773097, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.9326163832776244, "grad_norm": 0.09491308033466339, "learning_rate": 3.715112429516337e-06, "loss": 0.32317915558815, "memory(GiB)": 78.33, "step": 4813, "token_acc": 0.9034436015745158, "train_speed(iter/s)": 0.032349 }, { "epoch": 0.9328101535629512, "grad_norm": 0.10311052948236465, "learning_rate": 3.6938798271360594e-06, "loss": 0.3292618691921234, "memory(GiB)": 78.33, "step": 4814, "token_acc": 0.8985615698012804, "train_speed(iter/s)": 0.03235 }, { "epoch": 0.9330039238482779, "grad_norm": 0.0882132351398468, "learning_rate": 3.672707316610707e-06, "loss": 0.30853813886642456, "memory(GiB)": 78.33, "step": 4815, "token_acc": 0.9053889463403424, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.9331976941336046, "grad_norm": 0.09400478005409241, "learning_rate": 3.6515949066364236e-06, "loss": 0.3131456971168518, "memory(GiB)": 78.33, "step": 4816, "token_acc": 0.9059575833888848, "train_speed(iter/s)": 0.032351 }, { "epoch": 0.9333914644189314, "grad_norm": 0.09559616446495056, "learning_rate": 3.6305426058846565e-06, "loss": 0.3201538324356079, "memory(GiB)": 78.33, "step": 4817, "token_acc": 0.9050924765675906, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.9335852347042581, "grad_norm": 0.1044517233967781, "learning_rate": 3.6095504230021387e-06, "loss": 0.34499791264533997, "memory(GiB)": 78.33, "step": 4818, "token_acc": 0.8969500515691764, "train_speed(iter/s)": 0.032352 }, { "epoch": 0.9337790049895849, "grad_norm": 0.0850808322429657, "learning_rate": 3.5886183666109405e-06, "loss": 0.27842819690704346, "memory(GiB)": 78.33, "step": 4819, "token_acc": 0.9156751866093644, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.9339727752749116, "grad_norm": 0.09250815957784653, "learning_rate": 3.567746445308367e-06, "loss": 0.30809280276298523, "memory(GiB)": 78.33, "step": 4820, "token_acc": 0.9050716781110709, "train_speed(iter/s)": 0.032353 }, { "epoch": 0.9341665455602384, "grad_norm": 0.09440483152866364, "learning_rate": 3.5469346676671616e-06, "loss": 0.2813994288444519, "memory(GiB)": 78.33, "step": 4821, "token_acc": 0.9127701474449607, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.9343603158455651, "grad_norm": 0.09050828963518143, "learning_rate": 3.526183042235203e-06, "loss": 0.2954739034175873, "memory(GiB)": 78.33, "step": 4822, "token_acc": 0.9097762259958737, "train_speed(iter/s)": 0.032354 }, { "epoch": 0.9345540861308919, "grad_norm": 0.10041210055351257, "learning_rate": 3.5054915775357907e-06, "loss": 0.3556936979293823, "memory(GiB)": 78.33, "step": 4823, "token_acc": 0.8935934907970614, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.9347478564162186, "grad_norm": 0.10176945477724075, "learning_rate": 3.4848602820674255e-06, "loss": 0.3064349889755249, "memory(GiB)": 78.33, "step": 4824, "token_acc": 0.9054652880354506, "train_speed(iter/s)": 0.032355 }, { "epoch": 0.9349416267015453, "grad_norm": 0.10641314834356308, "learning_rate": 3.464289164303963e-06, "loss": 0.32750222086906433, "memory(GiB)": 78.33, "step": 4825, "token_acc": 0.9015335861015222, "train_speed(iter/s)": 0.032356 }, { "epoch": 0.9351353969868721, "grad_norm": 0.09517652541399002, "learning_rate": 3.4437782326945274e-06, "loss": 0.31871315836906433, "memory(GiB)": 78.33, "step": 4826, "token_acc": 0.9053494708163051, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.9353291672721988, "grad_norm": 0.09240631759166718, "learning_rate": 3.4233274956634803e-06, "loss": 0.2959226667881012, "memory(GiB)": 78.33, "step": 4827, "token_acc": 0.9104759299781182, "train_speed(iter/s)": 0.032357 }, { "epoch": 0.9355229375575256, "grad_norm": 0.10840350389480591, "learning_rate": 3.402936961610503e-06, "loss": 0.33262190222740173, "memory(GiB)": 78.33, "step": 4828, "token_acc": 0.9010374853331686, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.9357167078428523, "grad_norm": 0.09562506526708603, "learning_rate": 3.3826066389105123e-06, "loss": 0.3282950818538666, "memory(GiB)": 78.33, "step": 4829, "token_acc": 0.9013872354899055, "train_speed(iter/s)": 0.032358 }, { "epoch": 0.9359104781281791, "grad_norm": 0.1005750298500061, "learning_rate": 3.3623365359137453e-06, "loss": 0.3179299831390381, "memory(GiB)": 78.33, "step": 4830, "token_acc": 0.9042720884875364, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.9361042484135058, "grad_norm": 0.09987672418355942, "learning_rate": 3.3421266609456766e-06, "loss": 0.27341562509536743, "memory(GiB)": 78.33, "step": 4831, "token_acc": 0.9170077307867213, "train_speed(iter/s)": 0.032359 }, { "epoch": 0.9362980186988326, "grad_norm": 0.10075198113918304, "learning_rate": 3.321977022307032e-06, "loss": 0.3398180305957794, "memory(GiB)": 78.33, "step": 4832, "token_acc": 0.8986838767860665, "train_speed(iter/s)": 0.03236 }, { "epoch": 0.9364917889841593, "grad_norm": 0.10808294266462326, "learning_rate": 3.301887628273825e-06, "loss": 0.35527899861335754, "memory(GiB)": 78.33, "step": 4833, "token_acc": 0.8941093763730631, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.936685559269486, "grad_norm": 0.09991144388914108, "learning_rate": 3.2818584870972887e-06, "loss": 0.319998562335968, "memory(GiB)": 78.33, "step": 4834, "token_acc": 0.902095910695979, "train_speed(iter/s)": 0.032361 }, { "epoch": 0.9368793295548128, "grad_norm": 0.10339634865522385, "learning_rate": 3.2618896070039422e-06, "loss": 0.3507855236530304, "memory(GiB)": 78.33, "step": 4835, "token_acc": 0.8963165437078712, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.9370730998401395, "grad_norm": 0.10384315997362137, "learning_rate": 3.241980996195559e-06, "loss": 0.34698525071144104, "memory(GiB)": 78.33, "step": 4836, "token_acc": 0.8970609075192019, "train_speed(iter/s)": 0.032362 }, { "epoch": 0.9372668701254663, "grad_norm": 0.0977269858121872, "learning_rate": 3.2221326628490973e-06, "loss": 0.33199065923690796, "memory(GiB)": 78.33, "step": 4837, "token_acc": 0.900308274874384, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.937460640410793, "grad_norm": 0.11115771532058716, "learning_rate": 3.2023446151168363e-06, "loss": 0.35356926918029785, "memory(GiB)": 78.33, "step": 4838, "token_acc": 0.8959869212498843, "train_speed(iter/s)": 0.032363 }, { "epoch": 0.9376544106961198, "grad_norm": 0.09477823227643967, "learning_rate": 3.1826168611262417e-06, "loss": 0.30472081899642944, "memory(GiB)": 78.33, "step": 4839, "token_acc": 0.9091226468297661, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.9378481809814465, "grad_norm": 0.09087901562452316, "learning_rate": 3.162949408980048e-06, "loss": 0.295918732881546, "memory(GiB)": 78.33, "step": 4840, "token_acc": 0.9117901101274023, "train_speed(iter/s)": 0.032364 }, { "epoch": 0.9380419512667733, "grad_norm": 0.10526914894580841, "learning_rate": 3.143342266756177e-06, "loss": 0.3462876081466675, "memory(GiB)": 78.33, "step": 4841, "token_acc": 0.8977767847167915, "train_speed(iter/s)": 0.032365 }, { "epoch": 0.9382357215521, "grad_norm": 0.08861321955919266, "learning_rate": 3.1237954425078537e-06, "loss": 0.30201205611228943, "memory(GiB)": 78.33, "step": 4842, "token_acc": 0.9064061563062162, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.9384294918374267, "grad_norm": 0.09867441654205322, "learning_rate": 3.1043089442634394e-06, "loss": 0.3132460415363312, "memory(GiB)": 78.33, "step": 4843, "token_acc": 0.9052049214169717, "train_speed(iter/s)": 0.032366 }, { "epoch": 0.9386232621227535, "grad_norm": 0.09859292209148407, "learning_rate": 3.0848827800265817e-06, "loss": 0.3280273675918579, "memory(GiB)": 78.33, "step": 4844, "token_acc": 0.900389837343729, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.9388170324080802, "grad_norm": 0.10763488709926605, "learning_rate": 3.0655169577761483e-06, "loss": 0.3330923914909363, "memory(GiB)": 78.33, "step": 4845, "token_acc": 0.8993531614620408, "train_speed(iter/s)": 0.032367 }, { "epoch": 0.939010802693407, "grad_norm": 0.10462969541549683, "learning_rate": 3.04621148546616e-06, "loss": 0.34643369913101196, "memory(GiB)": 78.33, "step": 4846, "token_acc": 0.8975769701011448, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.9392045729787337, "grad_norm": 0.10669691115617752, "learning_rate": 3.0269663710259405e-06, "loss": 0.3267556130886078, "memory(GiB)": 78.33, "step": 4847, "token_acc": 0.9032104437471986, "train_speed(iter/s)": 0.032368 }, { "epoch": 0.9393983432640605, "grad_norm": 0.09605014324188232, "learning_rate": 3.007781622359934e-06, "loss": 0.31911730766296387, "memory(GiB)": 78.33, "step": 4848, "token_acc": 0.9046072137571867, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.9395921135493872, "grad_norm": 0.09630642086267471, "learning_rate": 2.988657247347853e-06, "loss": 0.31961339712142944, "memory(GiB)": 78.33, "step": 4849, "token_acc": 0.9028399069553437, "train_speed(iter/s)": 0.032369 }, { "epoch": 0.939785883834714, "grad_norm": 0.09434531629085541, "learning_rate": 2.969593253844582e-06, "loss": 0.29251518845558167, "memory(GiB)": 78.33, "step": 4850, "token_acc": 0.9108287448821121, "train_speed(iter/s)": 0.03237 }, { "epoch": 0.9399796541200407, "grad_norm": 0.09503661096096039, "learning_rate": 2.950589649680224e-06, "loss": 0.327890545129776, "memory(GiB)": 78.33, "step": 4851, "token_acc": 0.9029296643340055, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.9401734244053674, "grad_norm": 0.09823209792375565, "learning_rate": 2.931646442660085e-06, "loss": 0.3063351511955261, "memory(GiB)": 78.33, "step": 4852, "token_acc": 0.9096577564030869, "train_speed(iter/s)": 0.032371 }, { "epoch": 0.9403671946906942, "grad_norm": 0.10142441838979721, "learning_rate": 2.912763640564608e-06, "loss": 0.3455069363117218, "memory(GiB)": 78.33, "step": 4853, "token_acc": 0.8964419326593966, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.9405609649760209, "grad_norm": 0.09713559597730637, "learning_rate": 2.8939412511495066e-06, "loss": 0.3110349178314209, "memory(GiB)": 78.33, "step": 4854, "token_acc": 0.9043400246719748, "train_speed(iter/s)": 0.032372 }, { "epoch": 0.9407547352613477, "grad_norm": 0.09745633602142334, "learning_rate": 2.875179282145612e-06, "loss": 0.33706194162368774, "memory(GiB)": 78.33, "step": 4855, "token_acc": 0.9004770564824333, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.9409485055466744, "grad_norm": 0.10170414298772812, "learning_rate": 2.8564777412589944e-06, "loss": 0.33610811829566956, "memory(GiB)": 78.33, "step": 4856, "token_acc": 0.9004509018036072, "train_speed(iter/s)": 0.032373 }, { "epoch": 0.9411422758320012, "grad_norm": 0.09948156028985977, "learning_rate": 2.8378366361708593e-06, "loss": 0.3124832808971405, "memory(GiB)": 78.33, "step": 4857, "token_acc": 0.9069445621169759, "train_speed(iter/s)": 0.032374 }, { "epoch": 0.9413360461173279, "grad_norm": 0.10482876747846603, "learning_rate": 2.8192559745376152e-06, "loss": 0.35402458906173706, "memory(GiB)": 78.33, "step": 4858, "token_acc": 0.8951886335504009, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.9415298164026547, "grad_norm": 0.12244974076747894, "learning_rate": 2.8007357639908743e-06, "loss": 0.34525981545448303, "memory(GiB)": 78.33, "step": 4859, "token_acc": 0.8989004930936265, "train_speed(iter/s)": 0.032375 }, { "epoch": 0.9417235866879814, "grad_norm": 0.10524637997150421, "learning_rate": 2.7822760121373187e-06, "loss": 0.3427123427391052, "memory(GiB)": 78.33, "step": 4860, "token_acc": 0.8977062523122457, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.9419173569733081, "grad_norm": 0.10324079543352127, "learning_rate": 2.7638767265589168e-06, "loss": 0.3470621109008789, "memory(GiB)": 78.33, "step": 4861, "token_acc": 0.8988676079966448, "train_speed(iter/s)": 0.032376 }, { "epoch": 0.9421111272586349, "grad_norm": 0.10326438397169113, "learning_rate": 2.7455379148127064e-06, "loss": 0.3370799422264099, "memory(GiB)": 78.33, "step": 4862, "token_acc": 0.8991185653891851, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.9423048975439616, "grad_norm": 0.10224196314811707, "learning_rate": 2.7272595844309797e-06, "loss": 0.3339768052101135, "memory(GiB)": 78.33, "step": 4863, "token_acc": 0.9005996573386637, "train_speed(iter/s)": 0.032377 }, { "epoch": 0.9424986678292884, "grad_norm": 0.09687656909227371, "learning_rate": 2.7090417429211143e-06, "loss": 0.316772997379303, "memory(GiB)": 78.33, "step": 4864, "token_acc": 0.9048740035311112, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.9426924381146151, "grad_norm": 0.10042627900838852, "learning_rate": 2.6908843977656415e-06, "loss": 0.3032311499118805, "memory(GiB)": 78.33, "step": 4865, "token_acc": 0.9067732763020391, "train_speed(iter/s)": 0.032378 }, { "epoch": 0.9428862083999419, "grad_norm": 0.10065774619579315, "learning_rate": 2.6727875564223287e-06, "loss": 0.32765907049179077, "memory(GiB)": 78.33, "step": 4866, "token_acc": 0.9011883622455948, "train_speed(iter/s)": 0.032379 }, { "epoch": 0.9430799786852686, "grad_norm": 0.10036752372980118, "learning_rate": 2.654751226323981e-06, "loss": 0.3179859519004822, "memory(GiB)": 78.33, "step": 4867, "token_acc": 0.9036099036099036, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.9432737489705953, "grad_norm": 0.09917581081390381, "learning_rate": 2.6367754148786225e-06, "loss": 0.3333044648170471, "memory(GiB)": 78.33, "step": 4868, "token_acc": 0.8983144059174526, "train_speed(iter/s)": 0.03238 }, { "epoch": 0.9434675192559221, "grad_norm": 0.09930333495140076, "learning_rate": 2.6188601294694135e-06, "loss": 0.3436489999294281, "memory(GiB)": 78.33, "step": 4869, "token_acc": 0.8961160505381376, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.9436612895412488, "grad_norm": 0.10882118344306946, "learning_rate": 2.601005377454635e-06, "loss": 0.3218170702457428, "memory(GiB)": 78.33, "step": 4870, "token_acc": 0.9025652352254441, "train_speed(iter/s)": 0.032381 }, { "epoch": 0.9438550598265756, "grad_norm": 0.10568146407604218, "learning_rate": 2.5832111661677203e-06, "loss": 0.3271356225013733, "memory(GiB)": 78.33, "step": 4871, "token_acc": 0.9018029976218129, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.9440488301119023, "grad_norm": 0.11422929167747498, "learning_rate": 2.5654775029171903e-06, "loss": 0.3454614281654358, "memory(GiB)": 78.33, "step": 4872, "token_acc": 0.8961740297778863, "train_speed(iter/s)": 0.032382 }, { "epoch": 0.9442426003972291, "grad_norm": 0.10132316499948502, "learning_rate": 2.547804394986819e-06, "loss": 0.3422181010246277, "memory(GiB)": 78.33, "step": 4873, "token_acc": 0.8978968880151073, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.9444363706825558, "grad_norm": 0.09934362024068832, "learning_rate": 2.5301918496353322e-06, "loss": 0.31417742371559143, "memory(GiB)": 78.33, "step": 4874, "token_acc": 0.9052588444201611, "train_speed(iter/s)": 0.032383 }, { "epoch": 0.9446301409678826, "grad_norm": 0.10042758285999298, "learning_rate": 2.5126398740967446e-06, "loss": 0.3282407522201538, "memory(GiB)": 78.33, "step": 4875, "token_acc": 0.901354957914186, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.9448239112532093, "grad_norm": 0.09526004642248154, "learning_rate": 2.4951484755800886e-06, "loss": 0.29993996024131775, "memory(GiB)": 78.33, "step": 4876, "token_acc": 0.9110852110852111, "train_speed(iter/s)": 0.032384 }, { "epoch": 0.945017681538536, "grad_norm": 0.21196365356445312, "learning_rate": 2.4777176612695513e-06, "loss": 0.3271010220050812, "memory(GiB)": 78.33, "step": 4877, "token_acc": 0.9029842588543944, "train_speed(iter/s)": 0.032385 }, { "epoch": 0.9452114518238628, "grad_norm": 0.10081423819065094, "learning_rate": 2.4603474383244724e-06, "loss": 0.32736343145370483, "memory(GiB)": 78.33, "step": 4878, "token_acc": 0.904177119187912, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.9454052221091895, "grad_norm": 0.10198832303285599, "learning_rate": 2.4430378138792282e-06, "loss": 0.3308171331882477, "memory(GiB)": 78.33, "step": 4879, "token_acc": 0.899286116294914, "train_speed(iter/s)": 0.032386 }, { "epoch": 0.9455989923945163, "grad_norm": 0.10578960925340652, "learning_rate": 2.425788795043382e-06, "loss": 0.3238624930381775, "memory(GiB)": 78.33, "step": 4880, "token_acc": 0.904650030083374, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.945792762679843, "grad_norm": 0.09681179374456406, "learning_rate": 2.4086003889015326e-06, "loss": 0.2874050438404083, "memory(GiB)": 78.33, "step": 4881, "token_acc": 0.9121931027345872, "train_speed(iter/s)": 0.032387 }, { "epoch": 0.9459865329651698, "grad_norm": 0.10076259076595306, "learning_rate": 2.3914726025134335e-06, "loss": 0.3133833706378937, "memory(GiB)": 78.33, "step": 4882, "token_acc": 0.9046173554639635, "train_speed(iter/s)": 0.032388 }, { "epoch": 0.9461803032504965, "grad_norm": 0.09847967326641083, "learning_rate": 2.3744054429139402e-06, "loss": 0.31684496998786926, "memory(GiB)": 78.33, "step": 4883, "token_acc": 0.903607284929505, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.9463740735358233, "grad_norm": 0.09512414038181305, "learning_rate": 2.3573989171129792e-06, "loss": 0.3404044806957245, "memory(GiB)": 78.33, "step": 4884, "token_acc": 0.8973556187081547, "train_speed(iter/s)": 0.032389 }, { "epoch": 0.94656784382115, "grad_norm": 0.111565001308918, "learning_rate": 2.340453032095613e-06, "loss": 0.36841219663619995, "memory(GiB)": 78.33, "step": 4885, "token_acc": 0.8914011037920598, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.9467616141064767, "grad_norm": 0.09721893817186356, "learning_rate": 2.3235677948219234e-06, "loss": 0.3191075921058655, "memory(GiB)": 78.33, "step": 4886, "token_acc": 0.9030001088968747, "train_speed(iter/s)": 0.03239 }, { "epoch": 0.9469553843918035, "grad_norm": 0.09971962869167328, "learning_rate": 2.3067432122271966e-06, "loss": 0.33436092734336853, "memory(GiB)": 78.33, "step": 4887, "token_acc": 0.8994865769240593, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.9471491546771302, "grad_norm": 0.103705994784832, "learning_rate": 2.289979291221672e-06, "loss": 0.3510754108428955, "memory(GiB)": 78.33, "step": 4888, "token_acc": 0.8964129530759531, "train_speed(iter/s)": 0.032391 }, { "epoch": 0.947342924962457, "grad_norm": 0.10477303713560104, "learning_rate": 2.273276038690791e-06, "loss": 0.3268052637577057, "memory(GiB)": 78.33, "step": 4889, "token_acc": 0.9006867252517022, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.9475366952477837, "grad_norm": 0.10907679796218872, "learning_rate": 2.256633461495e-06, "loss": 0.35595622658729553, "memory(GiB)": 78.33, "step": 4890, "token_acc": 0.8929480987090542, "train_speed(iter/s)": 0.032392 }, { "epoch": 0.9477304655331105, "grad_norm": 0.09888530522584915, "learning_rate": 2.240051566469864e-06, "loss": 0.3075358271598816, "memory(GiB)": 78.33, "step": 4891, "token_acc": 0.9061078252957234, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.9479242358184372, "grad_norm": 0.09880520403385162, "learning_rate": 2.2235303604260347e-06, "loss": 0.34165963530540466, "memory(GiB)": 78.33, "step": 4892, "token_acc": 0.8993901612684646, "train_speed(iter/s)": 0.032393 }, { "epoch": 0.948118006103764, "grad_norm": 0.09279303252696991, "learning_rate": 2.207069850149168e-06, "loss": 0.31139740347862244, "memory(GiB)": 78.33, "step": 4893, "token_acc": 0.906871677108554, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.9483117763890907, "grad_norm": 0.10540986061096191, "learning_rate": 2.190670042400089e-06, "loss": 0.3575151264667511, "memory(GiB)": 78.33, "step": 4894, "token_acc": 0.8956390443200994, "train_speed(iter/s)": 0.032394 }, { "epoch": 0.9485055466744174, "grad_norm": 0.10701259970664978, "learning_rate": 2.174330943914593e-06, "loss": 0.3582616150379181, "memory(GiB)": 78.33, "step": 4895, "token_acc": 0.8956076759061834, "train_speed(iter/s)": 0.032395 }, { "epoch": 0.9486993169597442, "grad_norm": 0.09107998758554459, "learning_rate": 2.1580525614036115e-06, "loss": 0.325961172580719, "memory(GiB)": 78.33, "step": 4896, "token_acc": 0.9035096153846154, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.9488930872450709, "grad_norm": 0.09532765299081802, "learning_rate": 2.141834901553113e-06, "loss": 0.304240345954895, "memory(GiB)": 78.33, "step": 4897, "token_acc": 0.9070364131139044, "train_speed(iter/s)": 0.032396 }, { "epoch": 0.9490868575303977, "grad_norm": 0.0961633175611496, "learning_rate": 2.12567797102412e-06, "loss": 0.317226380109787, "memory(GiB)": 78.33, "step": 4898, "token_acc": 0.9035757011830386, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.9492806278157244, "grad_norm": 0.092696912586689, "learning_rate": 2.10958177645274e-06, "loss": 0.31177228689193726, "memory(GiB)": 78.33, "step": 4899, "token_acc": 0.9065967318942909, "train_speed(iter/s)": 0.032397 }, { "epoch": 0.9494743981010512, "grad_norm": 0.09380333125591278, "learning_rate": 2.0935463244500683e-06, "loss": 0.3175070583820343, "memory(GiB)": 78.33, "step": 4900, "token_acc": 0.9043746832999542, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.9496681683863779, "grad_norm": 0.09854871779680252, "learning_rate": 2.077571621602353e-06, "loss": 0.3263704180717468, "memory(GiB)": 78.33, "step": 4901, "token_acc": 0.9024825089818742, "train_speed(iter/s)": 0.032398 }, { "epoch": 0.9498619386717047, "grad_norm": 0.11277089267969131, "learning_rate": 2.0616576744707624e-06, "loss": 0.35801947116851807, "memory(GiB)": 78.33, "step": 4902, "token_acc": 0.895363334124911, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.9500557089570314, "grad_norm": 0.09970259666442871, "learning_rate": 2.0458044895916513e-06, "loss": 0.31966596841812134, "memory(GiB)": 78.33, "step": 4903, "token_acc": 0.9028877820418606, "train_speed(iter/s)": 0.032399 }, { "epoch": 0.9502494792423581, "grad_norm": 0.10125202685594559, "learning_rate": 2.0300120734763113e-06, "loss": 0.3093150556087494, "memory(GiB)": 78.33, "step": 4904, "token_acc": 0.905799933852491, "train_speed(iter/s)": 0.0324 }, { "epoch": 0.9504432495276849, "grad_norm": 0.09156003594398499, "learning_rate": 2.014280432611104e-06, "loss": 0.29941630363464355, "memory(GiB)": 78.33, "step": 4905, "token_acc": 0.9120291488636662, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.9506370198130116, "grad_norm": 0.10564015060663223, "learning_rate": 1.998609573457477e-06, "loss": 0.3411896824836731, "memory(GiB)": 78.33, "step": 4906, "token_acc": 0.8986840243384746, "train_speed(iter/s)": 0.032401 }, { "epoch": 0.9508307900983384, "grad_norm": 0.10158051550388336, "learning_rate": 1.982999502451832e-06, "loss": 0.33457452058792114, "memory(GiB)": 78.33, "step": 4907, "token_acc": 0.8990842883153047, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.9510245603836651, "grad_norm": 0.09677699208259583, "learning_rate": 1.9674502260056733e-06, "loss": 0.3045703172683716, "memory(GiB)": 78.33, "step": 4908, "token_acc": 0.9080135296660817, "train_speed(iter/s)": 0.032402 }, { "epoch": 0.9512183306689919, "grad_norm": 0.09688906371593475, "learning_rate": 1.9519617505055098e-06, "loss": 0.3192395865917206, "memory(GiB)": 78.33, "step": 4909, "token_acc": 0.9055065341226405, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.9514121009543186, "grad_norm": 0.1135900616645813, "learning_rate": 1.936534082312835e-06, "loss": 0.35068824887275696, "memory(GiB)": 78.33, "step": 4910, "token_acc": 0.898571652801758, "train_speed(iter/s)": 0.032403 }, { "epoch": 0.9516058712396454, "grad_norm": 0.09443049877882004, "learning_rate": 1.9211672277642475e-06, "loss": 0.30319830775260925, "memory(GiB)": 78.33, "step": 4911, "token_acc": 0.906641655112416, "train_speed(iter/s)": 0.032404 }, { "epoch": 0.9517996415249721, "grad_norm": 0.10316184908151627, "learning_rate": 1.9058611931712986e-06, "loss": 0.31675225496292114, "memory(GiB)": 78.33, "step": 4912, "token_acc": 0.9031579571039222, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.9519934118102988, "grad_norm": 0.10155156254768372, "learning_rate": 1.8906159848206092e-06, "loss": 0.30963414907455444, "memory(GiB)": 78.33, "step": 4913, "token_acc": 0.905279359704479, "train_speed(iter/s)": 0.032405 }, { "epoch": 0.9521871820956256, "grad_norm": 0.11472687125205994, "learning_rate": 1.8754316089737876e-06, "loss": 0.3777286112308502, "memory(GiB)": 78.33, "step": 4914, "token_acc": 0.8904687163389038, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.9523809523809523, "grad_norm": 0.09569194912910461, "learning_rate": 1.8603080718674612e-06, "loss": 0.3033876419067383, "memory(GiB)": 78.33, "step": 4915, "token_acc": 0.9085435990308607, "train_speed(iter/s)": 0.032406 }, { "epoch": 0.9525747226662791, "grad_norm": 0.09913970530033112, "learning_rate": 1.8452453797132948e-06, "loss": 0.3156486749649048, "memory(GiB)": 78.33, "step": 4916, "token_acc": 0.9022938238862399, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.9527684929516058, "grad_norm": 0.09308433532714844, "learning_rate": 1.8302435386978897e-06, "loss": 0.30741560459136963, "memory(GiB)": 78.33, "step": 4917, "token_acc": 0.907486671172186, "train_speed(iter/s)": 0.032407 }, { "epoch": 0.9529622632369327, "grad_norm": 0.09139852225780487, "learning_rate": 1.8153025549829836e-06, "loss": 0.32832008600234985, "memory(GiB)": 78.33, "step": 4918, "token_acc": 0.9023957409050577, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.9531560335222594, "grad_norm": 0.11089830100536346, "learning_rate": 1.800422434705151e-06, "loss": 0.3461150527000427, "memory(GiB)": 78.33, "step": 4919, "token_acc": 0.8961630109366407, "train_speed(iter/s)": 0.032408 }, { "epoch": 0.9533498038075862, "grad_norm": 0.10732929408550262, "learning_rate": 1.7856031839761363e-06, "loss": 0.3544648289680481, "memory(GiB)": 78.33, "step": 4920, "token_acc": 0.8927505075839006, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.9535435740929129, "grad_norm": 0.10486234724521637, "learning_rate": 1.7708448088825545e-06, "loss": 0.341609388589859, "memory(GiB)": 78.33, "step": 4921, "token_acc": 0.8979067310163635, "train_speed(iter/s)": 0.032409 }, { "epoch": 0.9537373443782396, "grad_norm": 0.10882783681154251, "learning_rate": 1.7561473154860728e-06, "loss": 0.34817302227020264, "memory(GiB)": 78.33, "step": 4922, "token_acc": 0.8951537080828513, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.9539311146635664, "grad_norm": 0.09959416091442108, "learning_rate": 1.7415107098233628e-06, "loss": 0.3298611044883728, "memory(GiB)": 78.33, "step": 4923, "token_acc": 0.9004360465116279, "train_speed(iter/s)": 0.03241 }, { "epoch": 0.9541248849488931, "grad_norm": 0.08811165392398834, "learning_rate": 1.7269349979060654e-06, "loss": 0.28269749879837036, "memory(GiB)": 78.33, "step": 4924, "token_acc": 0.9127474034881442, "train_speed(iter/s)": 0.032411 }, { "epoch": 0.9543186552342199, "grad_norm": 0.10352582484483719, "learning_rate": 1.7124201857208252e-06, "loss": 0.3340131640434265, "memory(GiB)": 78.33, "step": 4925, "token_acc": 0.8997261368174816, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.9545124255195466, "grad_norm": 0.10261107236146927, "learning_rate": 1.6979662792292404e-06, "loss": 0.32005396485328674, "memory(GiB)": 78.33, "step": 4926, "token_acc": 0.9025667147748818, "train_speed(iter/s)": 0.032412 }, { "epoch": 0.9547061958048734, "grad_norm": 0.09662662446498871, "learning_rate": 1.6835732843679451e-06, "loss": 0.32958412170410156, "memory(GiB)": 78.33, "step": 4927, "token_acc": 0.9002728335715213, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.9548999660902001, "grad_norm": 0.09711389243602753, "learning_rate": 1.6692412070485106e-06, "loss": 0.3142922818660736, "memory(GiB)": 78.33, "step": 4928, "token_acc": 0.904241246186658, "train_speed(iter/s)": 0.032413 }, { "epoch": 0.9550937363755269, "grad_norm": 0.10424994677305222, "learning_rate": 1.6549700531575284e-06, "loss": 0.335157573223114, "memory(GiB)": 78.33, "step": 4929, "token_acc": 0.899737302977233, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.9552875066608536, "grad_norm": 0.09868663549423218, "learning_rate": 1.6407598285565093e-06, "loss": 0.32267120480537415, "memory(GiB)": 78.33, "step": 4930, "token_acc": 0.9046480108427011, "train_speed(iter/s)": 0.032414 }, { "epoch": 0.9554812769461803, "grad_norm": 0.09154357016086578, "learning_rate": 1.6266105390820017e-06, "loss": 0.31675025820732117, "memory(GiB)": 78.33, "step": 4931, "token_acc": 0.903805316214699, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.9556750472315071, "grad_norm": 0.09340998530387878, "learning_rate": 1.6125221905455231e-06, "loss": 0.30397865176200867, "memory(GiB)": 78.33, "step": 4932, "token_acc": 0.9084064579411999, "train_speed(iter/s)": 0.032415 }, { "epoch": 0.9558688175168338, "grad_norm": 0.10670307278633118, "learning_rate": 1.598494788733462e-06, "loss": 0.32941100001335144, "memory(GiB)": 78.33, "step": 4933, "token_acc": 0.9001266174608566, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.9560625878021606, "grad_norm": 0.09027762711048126, "learning_rate": 1.584528339407326e-06, "loss": 0.2942469120025635, "memory(GiB)": 78.33, "step": 4934, "token_acc": 0.9111908000494621, "train_speed(iter/s)": 0.032416 }, { "epoch": 0.9562563580874873, "grad_norm": 0.09194032102823257, "learning_rate": 1.57062284830346e-06, "loss": 0.303377628326416, "memory(GiB)": 78.33, "step": 4935, "token_acc": 0.9077935247705968, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.9564501283728141, "grad_norm": 0.13369497656822205, "learning_rate": 1.5567783211332619e-06, "loss": 0.3248521685600281, "memory(GiB)": 78.33, "step": 4936, "token_acc": 0.9012696041822256, "train_speed(iter/s)": 0.032417 }, { "epoch": 0.9566438986581408, "grad_norm": 0.0936022698879242, "learning_rate": 1.5429947635830164e-06, "loss": 0.3220188021659851, "memory(GiB)": 78.33, "step": 4937, "token_acc": 0.9041003304295584, "train_speed(iter/s)": 0.032418 }, { "epoch": 0.9568376689434676, "grad_norm": 0.09517528116703033, "learning_rate": 1.529272181314012e-06, "loss": 0.3031097650527954, "memory(GiB)": 78.33, "step": 4938, "token_acc": 0.9060737583491436, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.9570314392287943, "grad_norm": 0.09666939824819565, "learning_rate": 1.5156105799625063e-06, "loss": 0.3083738684654236, "memory(GiB)": 78.33, "step": 4939, "token_acc": 0.9067667594099428, "train_speed(iter/s)": 0.032419 }, { "epoch": 0.957225209514121, "grad_norm": 0.09546789526939392, "learning_rate": 1.5020099651396444e-06, "loss": 0.3306988775730133, "memory(GiB)": 78.33, "step": 4940, "token_acc": 0.9035588697408784, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.9574189797994478, "grad_norm": 0.10004635155200958, "learning_rate": 1.4884703424315915e-06, "loss": 0.3262169063091278, "memory(GiB)": 78.33, "step": 4941, "token_acc": 0.9031863057731064, "train_speed(iter/s)": 0.03242 }, { "epoch": 0.9576127500847745, "grad_norm": 0.09645616263151169, "learning_rate": 1.474991717399432e-06, "loss": 0.2961626946926117, "memory(GiB)": 78.33, "step": 4942, "token_acc": 0.9100462809072081, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.9578065203701013, "grad_norm": 0.09985015541315079, "learning_rate": 1.4615740955792044e-06, "loss": 0.3318878412246704, "memory(GiB)": 78.33, "step": 4943, "token_acc": 0.9009440305532396, "train_speed(iter/s)": 0.032421 }, { "epoch": 0.958000290655428, "grad_norm": 0.10011833906173706, "learning_rate": 1.4482174824818671e-06, "loss": 0.337127149105072, "memory(GiB)": 78.33, "step": 4944, "token_acc": 0.8997083603370059, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.9581940609407548, "grad_norm": 0.10610119998455048, "learning_rate": 1.4349218835933486e-06, "loss": 0.35346800088882446, "memory(GiB)": 78.33, "step": 4945, "token_acc": 0.8942115189322501, "train_speed(iter/s)": 0.032422 }, { "epoch": 0.9583878312260815, "grad_norm": 0.09986083954572678, "learning_rate": 1.4216873043745137e-06, "loss": 0.3105732202529907, "memory(GiB)": 78.33, "step": 4946, "token_acc": 0.9050974597942831, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.9585816015114083, "grad_norm": 0.10908082872629166, "learning_rate": 1.4085137502611477e-06, "loss": 0.3552319407463074, "memory(GiB)": 78.33, "step": 4947, "token_acc": 0.8946395037842144, "train_speed(iter/s)": 0.032423 }, { "epoch": 0.958775371796735, "grad_norm": 0.09129244834184647, "learning_rate": 1.3954012266640059e-06, "loss": 0.27050110697746277, "memory(GiB)": 78.33, "step": 4948, "token_acc": 0.9176398959428793, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.9589691420820617, "grad_norm": 0.10130368173122406, "learning_rate": 1.3823497389687466e-06, "loss": 0.3376561403274536, "memory(GiB)": 78.33, "step": 4949, "token_acc": 0.8982817684028808, "train_speed(iter/s)": 0.032424 }, { "epoch": 0.9591629123673885, "grad_norm": 0.10016176849603653, "learning_rate": 1.369359292535932e-06, "loss": 0.3218695819377899, "memory(GiB)": 78.33, "step": 4950, "token_acc": 0.9014069086330542, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.9593566826527152, "grad_norm": 0.09046775102615356, "learning_rate": 1.356429892701144e-06, "loss": 0.30442506074905396, "memory(GiB)": 78.33, "step": 4951, "token_acc": 0.9069761971963496, "train_speed(iter/s)": 0.032425 }, { "epoch": 0.959550452938042, "grad_norm": 0.09234574437141418, "learning_rate": 1.343561544774785e-06, "loss": 0.30501365661621094, "memory(GiB)": 78.33, "step": 4952, "token_acc": 0.9067395915863726, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.9597442232233687, "grad_norm": 0.09642866253852844, "learning_rate": 1.3307542540422766e-06, "loss": 0.29419055581092834, "memory(GiB)": 78.33, "step": 4953, "token_acc": 0.9106110240706452, "train_speed(iter/s)": 0.032426 }, { "epoch": 0.9599379935086955, "grad_norm": 0.10034112632274628, "learning_rate": 1.3180080257638782e-06, "loss": 0.339307963848114, "memory(GiB)": 78.33, "step": 4954, "token_acc": 0.8978796816739941, "train_speed(iter/s)": 0.032427 }, { "epoch": 0.9601317637940222, "grad_norm": 0.10334701836109161, "learning_rate": 1.3053228651748349e-06, "loss": 0.3403671681880951, "memory(GiB)": 78.33, "step": 4955, "token_acc": 0.8992078746578135, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.960325534079349, "grad_norm": 0.10150493681430817, "learning_rate": 1.2926987774852627e-06, "loss": 0.3322233259677887, "memory(GiB)": 78.33, "step": 4956, "token_acc": 0.9011913901447297, "train_speed(iter/s)": 0.032428 }, { "epoch": 0.9605193043646757, "grad_norm": 0.09975534677505493, "learning_rate": 1.2801357678802138e-06, "loss": 0.3389909267425537, "memory(GiB)": 78.33, "step": 4957, "token_acc": 0.8986236035148656, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.9607130746500024, "grad_norm": 0.10334469377994537, "learning_rate": 1.2676338415196774e-06, "loss": 0.34487882256507874, "memory(GiB)": 78.33, "step": 4958, "token_acc": 0.897090561398716, "train_speed(iter/s)": 0.032429 }, { "epoch": 0.9609068449353292, "grad_norm": 0.10029633343219757, "learning_rate": 1.2551930035385126e-06, "loss": 0.32758665084838867, "memory(GiB)": 78.33, "step": 4959, "token_acc": 0.9024535719127016, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.9611006152206559, "grad_norm": 0.1052929162979126, "learning_rate": 1.2428132590465156e-06, "loss": 0.3499680757522583, "memory(GiB)": 78.33, "step": 4960, "token_acc": 0.8937179730499146, "train_speed(iter/s)": 0.03243 }, { "epoch": 0.9612943855059827, "grad_norm": 0.10789167135953903, "learning_rate": 1.2304946131283521e-06, "loss": 0.347482293844223, "memory(GiB)": 78.33, "step": 4961, "token_acc": 0.8981575675370183, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.9614881557913094, "grad_norm": 0.09253862500190735, "learning_rate": 1.2182370708436584e-06, "loss": 0.2795042097568512, "memory(GiB)": 78.33, "step": 4962, "token_acc": 0.9132865314612585, "train_speed(iter/s)": 0.032431 }, { "epoch": 0.9616819260766362, "grad_norm": 0.11232099682092667, "learning_rate": 1.2060406372269238e-06, "loss": 0.3673211932182312, "memory(GiB)": 78.33, "step": 4963, "token_acc": 0.8896090319882919, "train_speed(iter/s)": 0.032432 }, { "epoch": 0.9618756963619629, "grad_norm": 0.09660880267620087, "learning_rate": 1.1939053172875245e-06, "loss": 0.31402865052223206, "memory(GiB)": 78.33, "step": 4964, "token_acc": 0.9040212859337283, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.9620694666472897, "grad_norm": 0.10611861944198608, "learning_rate": 1.1818311160098237e-06, "loss": 0.3377360999584198, "memory(GiB)": 78.33, "step": 4965, "token_acc": 0.9000355008579374, "train_speed(iter/s)": 0.032433 }, { "epoch": 0.9622632369326164, "grad_norm": 0.1003570705652237, "learning_rate": 1.1698180383529542e-06, "loss": 0.32752174139022827, "memory(GiB)": 78.33, "step": 4966, "token_acc": 0.9013712208822072, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.9624570072179431, "grad_norm": 0.08952160179615021, "learning_rate": 1.1578660892510528e-06, "loss": 0.2788778245449066, "memory(GiB)": 78.33, "step": 4967, "token_acc": 0.9154586305821423, "train_speed(iter/s)": 0.032434 }, { "epoch": 0.9626507775032699, "grad_norm": 0.10649926215410233, "learning_rate": 1.1459752736130756e-06, "loss": 0.3794569969177246, "memory(GiB)": 78.33, "step": 4968, "token_acc": 0.888167308750688, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.9628445477885966, "grad_norm": 0.08885262906551361, "learning_rate": 1.1341455963229329e-06, "loss": 0.29271164536476135, "memory(GiB)": 78.33, "step": 4969, "token_acc": 0.9096743030637593, "train_speed(iter/s)": 0.032435 }, { "epoch": 0.9630383180739234, "grad_norm": 0.10036831349134445, "learning_rate": 1.1223770622393714e-06, "loss": 0.3443622887134552, "memory(GiB)": 78.33, "step": 4970, "token_acc": 0.8970222654561176, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.9632320883592501, "grad_norm": 0.09779267013072968, "learning_rate": 1.110669676196041e-06, "loss": 0.3445608615875244, "memory(GiB)": 78.33, "step": 4971, "token_acc": 0.8951095773995004, "train_speed(iter/s)": 0.032436 }, { "epoch": 0.9634258586445769, "grad_norm": 0.10155529528856277, "learning_rate": 1.0990234430014954e-06, "loss": 0.3322911262512207, "memory(GiB)": 78.33, "step": 4972, "token_acc": 0.9017332921313593, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.9636196289299036, "grad_norm": 0.09281053394079208, "learning_rate": 1.087438367439125e-06, "loss": 0.29993924498558044, "memory(GiB)": 78.33, "step": 4973, "token_acc": 0.9089661368003393, "train_speed(iter/s)": 0.032437 }, { "epoch": 0.9638133992152303, "grad_norm": 0.09526029974222183, "learning_rate": 1.0759144542672737e-06, "loss": 0.29557523131370544, "memory(GiB)": 78.33, "step": 4974, "token_acc": 0.9109068897204491, "train_speed(iter/s)": 0.032438 }, { "epoch": 0.9640071695005571, "grad_norm": 0.10219217091798782, "learning_rate": 1.0644517082190883e-06, "loss": 0.3419698476791382, "memory(GiB)": 78.33, "step": 4975, "token_acc": 0.8975590462199119, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.9642009397858838, "grad_norm": 0.09392113983631134, "learning_rate": 1.0530501340026532e-06, "loss": 0.3047352135181427, "memory(GiB)": 78.33, "step": 4976, "token_acc": 0.9068194872330426, "train_speed(iter/s)": 0.032439 }, { "epoch": 0.9643947100712106, "grad_norm": 0.10133232921361923, "learning_rate": 1.0417097363008886e-06, "loss": 0.32953399419784546, "memory(GiB)": 78.33, "step": 4977, "token_acc": 0.9026956897794436, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.9645884803565373, "grad_norm": 0.09701909869909286, "learning_rate": 1.030430519771569e-06, "loss": 0.3382679224014282, "memory(GiB)": 78.33, "step": 4978, "token_acc": 0.8983569375214284, "train_speed(iter/s)": 0.03244 }, { "epoch": 0.9647822506418641, "grad_norm": 0.09171733260154724, "learning_rate": 1.0192124890474385e-06, "loss": 0.3097023665904999, "memory(GiB)": 78.33, "step": 4979, "token_acc": 0.9057717083225972, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.9649760209271908, "grad_norm": 0.0920836329460144, "learning_rate": 1.0080556487359947e-06, "loss": 0.30298176407814026, "memory(GiB)": 78.33, "step": 4980, "token_acc": 0.9079930043486482, "train_speed(iter/s)": 0.032441 }, { "epoch": 0.9651697912125176, "grad_norm": 0.09740208089351654, "learning_rate": 9.969600034196557e-07, "loss": 0.3513997197151184, "memory(GiB)": 78.33, "step": 4981, "token_acc": 0.8963819470346885, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.9653635614978443, "grad_norm": 0.09901650995016098, "learning_rate": 9.859255576557257e-07, "loss": 0.32587093114852905, "memory(GiB)": 78.33, "step": 4982, "token_acc": 0.9019118199881854, "train_speed(iter/s)": 0.032442 }, { "epoch": 0.965557331783171, "grad_norm": 0.10267394781112671, "learning_rate": 9.749523159763295e-07, "loss": 0.32968056201934814, "memory(GiB)": 78.33, "step": 4983, "token_acc": 0.9018928833455613, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.9657511020684978, "grad_norm": 0.0950947031378746, "learning_rate": 9.64040282888462e-07, "loss": 0.32611650228500366, "memory(GiB)": 78.33, "step": 4984, "token_acc": 0.9009006650259203, "train_speed(iter/s)": 0.032443 }, { "epoch": 0.9659448723538245, "grad_norm": 0.10337654501199722, "learning_rate": 9.531894628740044e-07, "loss": 0.3508460521697998, "memory(GiB)": 78.33, "step": 4985, "token_acc": 0.8948303758520995, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.9661386426391513, "grad_norm": 0.08894824236631393, "learning_rate": 9.423998603896921e-07, "loss": 0.29168346524238586, "memory(GiB)": 78.33, "step": 4986, "token_acc": 0.9120740535223346, "train_speed(iter/s)": 0.032444 }, { "epoch": 0.966332412924478, "grad_norm": 0.0989081859588623, "learning_rate": 9.316714798670799e-07, "loss": 0.32772505283355713, "memory(GiB)": 78.33, "step": 4987, "token_acc": 0.899692881430295, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.9665261832098048, "grad_norm": 0.08811601996421814, "learning_rate": 9.210043257126098e-07, "loss": 0.289480984210968, "memory(GiB)": 78.33, "step": 4988, "token_acc": 0.9128572101125182, "train_speed(iter/s)": 0.032445 }, { "epoch": 0.9667199534951315, "grad_norm": 0.10313121974468231, "learning_rate": 9.103984023075772e-07, "loss": 0.35241398215293884, "memory(GiB)": 78.33, "step": 4989, "token_acc": 0.8971772553485724, "train_speed(iter/s)": 0.032446 }, { "epoch": 0.9669137237804583, "grad_norm": 0.09983363002538681, "learning_rate": 8.998537140081141e-07, "loss": 0.31030166149139404, "memory(GiB)": 78.33, "step": 4990, "token_acc": 0.9058147247402166, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.967107494065785, "grad_norm": 0.10593412816524506, "learning_rate": 8.893702651452062e-07, "loss": 0.3430192768573761, "memory(GiB)": 78.33, "step": 4991, "token_acc": 0.8970623145400594, "train_speed(iter/s)": 0.032447 }, { "epoch": 0.9673012643511117, "grad_norm": 0.09164869040250778, "learning_rate": 8.789480600246757e-07, "loss": 0.3059519827365875, "memory(GiB)": 78.33, "step": 4992, "token_acc": 0.9062849909936225, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.9674950346364385, "grad_norm": 0.11753620952367783, "learning_rate": 8.685871029272318e-07, "loss": 0.35421380400657654, "memory(GiB)": 78.33, "step": 4993, "token_acc": 0.8949581180397317, "train_speed(iter/s)": 0.032448 }, { "epoch": 0.9676888049217652, "grad_norm": 0.09262137115001678, "learning_rate": 8.582873981083705e-07, "loss": 0.29231324791908264, "memory(GiB)": 78.33, "step": 4994, "token_acc": 0.9102221280876863, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.967882575207092, "grad_norm": 0.09335288405418396, "learning_rate": 8.480489497984744e-07, "loss": 0.3362676203250885, "memory(GiB)": 78.33, "step": 4995, "token_acc": 0.8979033950843279, "train_speed(iter/s)": 0.032449 }, { "epoch": 0.9680763454924187, "grad_norm": 0.12949591875076294, "learning_rate": 8.378717622027465e-07, "loss": 0.32554298639297485, "memory(GiB)": 78.33, "step": 4996, "token_acc": 0.9030780971762911, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.9682701157777455, "grad_norm": 0.10350757837295532, "learning_rate": 8.277558395012096e-07, "loss": 0.335059255361557, "memory(GiB)": 78.33, "step": 4997, "token_acc": 0.8990599887317261, "train_speed(iter/s)": 0.03245 }, { "epoch": 0.9684638860630722, "grad_norm": 0.11247258633375168, "learning_rate": 8.177011858487903e-07, "loss": 0.3429482579231262, "memory(GiB)": 78.33, "step": 4998, "token_acc": 0.895786360575093, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.968657656348399, "grad_norm": 0.10709885507822037, "learning_rate": 8.077078053751518e-07, "loss": 0.3340757191181183, "memory(GiB)": 78.33, "step": 4999, "token_acc": 0.8992043255199533, "train_speed(iter/s)": 0.032451 }, { "epoch": 0.9688514266337257, "grad_norm": 0.09967630356550217, "learning_rate": 7.97775702184894e-07, "loss": 0.30826178193092346, "memory(GiB)": 78.33, "step": 5000, "token_acc": 0.9074506820281506, "train_speed(iter/s)": 0.032452 }, { "epoch": 0.9688514266337257, "eval_loss": 0.3782345950603485, "eval_runtime": 1344.6451, "eval_samples_per_second": 5.019, "eval_steps_per_second": 5.019, "eval_token_acc": 0.9026852655344548, "step": 5000 }, { "epoch": 0.9690451969190524, "grad_norm": 0.10758214443922043, "learning_rate": 7.87904880357354e-07, "loss": 0.3177351951599121, "memory(GiB)": 78.33, "step": 5001, "token_acc": 0.9042991375981465, "train_speed(iter/s)": 0.032169 }, { "epoch": 0.9692389672043792, "grad_norm": 0.0970822274684906, "learning_rate": 7.780953439467719e-07, "loss": 0.3285868167877197, "memory(GiB)": 78.33, "step": 5002, "token_acc": 0.9016248076571994, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.9694327374897059, "grad_norm": 0.10737350583076477, "learning_rate": 7.683470969821748e-07, "loss": 0.3421315550804138, "memory(GiB)": 78.33, "step": 5003, "token_acc": 0.894730186830209, "train_speed(iter/s)": 0.03217 }, { "epoch": 0.9696265077750327, "grad_norm": 0.09953954815864563, "learning_rate": 7.586601434674266e-07, "loss": 0.31448549032211304, "memory(GiB)": 78.33, "step": 5004, "token_acc": 0.9061734010562758, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.9698202780603594, "grad_norm": 0.10338281095027924, "learning_rate": 7.490344873812615e-07, "loss": 0.33236753940582275, "memory(GiB)": 78.33, "step": 5005, "token_acc": 0.8993050377307842, "train_speed(iter/s)": 0.032171 }, { "epoch": 0.9700140483456862, "grad_norm": 0.09365373104810715, "learning_rate": 7.394701326771335e-07, "loss": 0.3056509792804718, "memory(GiB)": 78.33, "step": 5006, "token_acc": 0.906607994493338, "train_speed(iter/s)": 0.032172 }, { "epoch": 0.9702078186310129, "grad_norm": 0.10147716104984283, "learning_rate": 7.29967083283417e-07, "loss": 0.31711748242378235, "memory(GiB)": 78.33, "step": 5007, "token_acc": 0.902543880455408, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.9704015889163397, "grad_norm": 0.09770286083221436, "learning_rate": 7.205253431032564e-07, "loss": 0.324236124753952, "memory(GiB)": 78.33, "step": 5008, "token_acc": 0.9028327266972622, "train_speed(iter/s)": 0.032173 }, { "epoch": 0.9705953592016664, "grad_norm": 0.10440776497125626, "learning_rate": 7.111449160146332e-07, "loss": 0.3207136392593384, "memory(GiB)": 78.33, "step": 5009, "token_acc": 0.9042985518859825, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.9707891294869931, "grad_norm": 0.09528158605098724, "learning_rate": 7.018258058703319e-07, "loss": 0.2945985496044159, "memory(GiB)": 78.33, "step": 5010, "token_acc": 0.9116842726151536, "train_speed(iter/s)": 0.032174 }, { "epoch": 0.9709828997723199, "grad_norm": 0.09875106066465378, "learning_rate": 6.925680164979741e-07, "loss": 0.33711299300193787, "memory(GiB)": 78.33, "step": 5011, "token_acc": 0.8979212309573547, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.9711766700576466, "grad_norm": 0.09752146899700165, "learning_rate": 6.833715516999849e-07, "loss": 0.33156195282936096, "memory(GiB)": 78.33, "step": 5012, "token_acc": 0.9005984838409363, "train_speed(iter/s)": 0.032175 }, { "epoch": 0.9713704403429734, "grad_norm": 0.11073119193315506, "learning_rate": 6.742364152535929e-07, "loss": 0.3404943645000458, "memory(GiB)": 78.33, "step": 5013, "token_acc": 0.8999028182701652, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.9715642106283001, "grad_norm": 0.08712846785783768, "learning_rate": 6.651626109108465e-07, "loss": 0.30490002036094666, "memory(GiB)": 78.33, "step": 5014, "token_acc": 0.907512204600102, "train_speed(iter/s)": 0.032176 }, { "epoch": 0.9717579809136269, "grad_norm": 0.0969480574131012, "learning_rate": 6.561501423985816e-07, "loss": 0.32621052861213684, "memory(GiB)": 78.33, "step": 5015, "token_acc": 0.9022739990842956, "train_speed(iter/s)": 0.032177 }, { "epoch": 0.9719517511989536, "grad_norm": 0.11033257842063904, "learning_rate": 6.471990134185035e-07, "loss": 0.34958821535110474, "memory(GiB)": 78.33, "step": 5016, "token_acc": 0.8971919453168299, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.9721455214842804, "grad_norm": 0.11200974881649017, "learning_rate": 6.383092276470381e-07, "loss": 0.37351301312446594, "memory(GiB)": 78.33, "step": 5017, "token_acc": 0.890080579498554, "train_speed(iter/s)": 0.032178 }, { "epoch": 0.9723392917696071, "grad_norm": 0.10688678920269012, "learning_rate": 6.294807887354647e-07, "loss": 0.32566171884536743, "memory(GiB)": 78.33, "step": 5018, "token_acc": 0.9027785449925426, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.9725330620549338, "grad_norm": 0.11659563332796097, "learning_rate": 6.207137003098994e-07, "loss": 0.3676939904689789, "memory(GiB)": 78.33, "step": 5019, "token_acc": 0.8909811380567443, "train_speed(iter/s)": 0.032179 }, { "epoch": 0.9727268323402606, "grad_norm": 0.10240488499403, "learning_rate": 6.120079659711786e-07, "loss": 0.3170427680015564, "memory(GiB)": 78.33, "step": 5020, "token_acc": 0.9030933713471133, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.9729206026255873, "grad_norm": 0.1029011458158493, "learning_rate": 6.033635892950084e-07, "loss": 0.3273800313472748, "memory(GiB)": 78.33, "step": 5021, "token_acc": 0.9038344491783323, "train_speed(iter/s)": 0.03218 }, { "epoch": 0.9731143729109141, "grad_norm": 0.09669654816389084, "learning_rate": 5.94780573831849e-07, "loss": 0.3153877854347229, "memory(GiB)": 78.33, "step": 5022, "token_acc": 0.9058483637541851, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.9733081431962408, "grad_norm": 0.09650097042322159, "learning_rate": 5.862589231069803e-07, "loss": 0.3036450445652008, "memory(GiB)": 78.33, "step": 5023, "token_acc": 0.9092272045795284, "train_speed(iter/s)": 0.032181 }, { "epoch": 0.9735019134815676, "grad_norm": 0.11860737949609756, "learning_rate": 5.777986406204694e-07, "loss": 0.3921282887458801, "memory(GiB)": 78.33, "step": 5024, "token_acc": 0.8855097849722156, "train_speed(iter/s)": 0.032182 }, { "epoch": 0.9736956837668943, "grad_norm": 0.10267064720392227, "learning_rate": 5.693997298472031e-07, "loss": 0.31443023681640625, "memory(GiB)": 78.33, "step": 5025, "token_acc": 0.9051502501866032, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.973889454052221, "grad_norm": 0.09616965055465698, "learning_rate": 5.610621942368054e-07, "loss": 0.3315185606479645, "memory(GiB)": 78.33, "step": 5026, "token_acc": 0.902337848564771, "train_speed(iter/s)": 0.032183 }, { "epoch": 0.9740832243375478, "grad_norm": 0.09976300597190857, "learning_rate": 5.527860372137538e-07, "loss": 0.32565930485725403, "memory(GiB)": 78.33, "step": 5027, "token_acc": 0.9027216527952472, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.9742769946228745, "grad_norm": 0.09962465614080429, "learning_rate": 5.445712621772791e-07, "loss": 0.327309787273407, "memory(GiB)": 78.33, "step": 5028, "token_acc": 0.9021066306645948, "train_speed(iter/s)": 0.032184 }, { "epoch": 0.9744707649082013, "grad_norm": 0.10362134128808975, "learning_rate": 5.364178725014157e-07, "loss": 0.3133904039859772, "memory(GiB)": 78.33, "step": 5029, "token_acc": 0.9041495198902606, "train_speed(iter/s)": 0.032185 }, { "epoch": 0.974664535193528, "grad_norm": 0.10740216076374054, "learning_rate": 5.283258715349514e-07, "loss": 0.33964803814888, "memory(GiB)": 78.33, "step": 5030, "token_acc": 0.8985633557311141, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.9748583054788548, "grad_norm": 0.0993296429514885, "learning_rate": 5.202952626015445e-07, "loss": 0.3082018196582794, "memory(GiB)": 78.33, "step": 5031, "token_acc": 0.9063476667744369, "train_speed(iter/s)": 0.032186 }, { "epoch": 0.9750520757641815, "grad_norm": 0.09285027533769608, "learning_rate": 5.123260489995229e-07, "loss": 0.2862505316734314, "memory(GiB)": 78.33, "step": 5032, "token_acc": 0.9113854235062376, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.9752458460495083, "grad_norm": 0.11154574900865555, "learning_rate": 5.044182340021019e-07, "loss": 0.36379557847976685, "memory(GiB)": 78.33, "step": 5033, "token_acc": 0.8903923823000501, "train_speed(iter/s)": 0.032187 }, { "epoch": 0.975439616334835, "grad_norm": 0.09221196174621582, "learning_rate": 4.965718208572001e-07, "loss": 0.3152211308479309, "memory(GiB)": 78.33, "step": 5034, "token_acc": 0.9047773077880069, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.9756333866201617, "grad_norm": 0.09758051484823227, "learning_rate": 4.887868127875561e-07, "loss": 0.3170757293701172, "memory(GiB)": 78.33, "step": 5035, "token_acc": 0.9045536265328575, "train_speed(iter/s)": 0.032188 }, { "epoch": 0.9758271569054885, "grad_norm": 0.09600334614515305, "learning_rate": 4.810632129907122e-07, "loss": 0.31009405851364136, "memory(GiB)": 78.33, "step": 5036, "token_acc": 0.9058032803330491, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.9760209271908152, "grad_norm": 0.10319238901138306, "learning_rate": 4.7340102463891415e-07, "loss": 0.33604225516319275, "memory(GiB)": 78.33, "step": 5037, "token_acc": 0.8996856559863233, "train_speed(iter/s)": 0.032189 }, { "epoch": 0.976214697476142, "grad_norm": 0.10404365509748459, "learning_rate": 4.6580025087926134e-07, "loss": 0.3151894509792328, "memory(GiB)": 78.33, "step": 5038, "token_acc": 0.9064724919093851, "train_speed(iter/s)": 0.03219 }, { "epoch": 0.9764084677614687, "grad_norm": 0.10096472501754761, "learning_rate": 4.5826089483358973e-07, "loss": 0.3288641571998596, "memory(GiB)": 78.33, "step": 5039, "token_acc": 0.9003461989642643, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.9766022380467956, "grad_norm": 0.08929524570703506, "learning_rate": 4.5078295959850576e-07, "loss": 0.2969154715538025, "memory(GiB)": 78.33, "step": 5040, "token_acc": 0.9102983397827986, "train_speed(iter/s)": 0.032191 }, { "epoch": 0.9767960083321223, "grad_norm": 0.09447763115167618, "learning_rate": 4.4336644824540245e-07, "loss": 0.3048000633716583, "memory(GiB)": 78.33, "step": 5041, "token_acc": 0.9062621145943711, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.9769897786174491, "grad_norm": 0.10533607751131058, "learning_rate": 4.360113638204432e-07, "loss": 0.3521908223628998, "memory(GiB)": 78.33, "step": 5042, "token_acc": 0.8957643566617194, "train_speed(iter/s)": 0.032192 }, { "epoch": 0.9771835489027758, "grad_norm": 0.09984603524208069, "learning_rate": 4.287177093445615e-07, "loss": 0.3105589747428894, "memory(GiB)": 78.33, "step": 5043, "token_acc": 0.9076200993926008, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.9773773191881026, "grad_norm": 0.09986462444067001, "learning_rate": 4.2148548781344437e-07, "loss": 0.2763623893260956, "memory(GiB)": 78.33, "step": 5044, "token_acc": 0.9144782780290841, "train_speed(iter/s)": 0.032193 }, { "epoch": 0.9775710894734293, "grad_norm": 0.09815599024295807, "learning_rate": 4.143147021975823e-07, "loss": 0.33960387110710144, "memory(GiB)": 78.33, "step": 5045, "token_acc": 0.9006058664958497, "train_speed(iter/s)": 0.032194 }, { "epoch": 0.977764859758756, "grad_norm": 0.10120173543691635, "learning_rate": 4.0720535544216945e-07, "loss": 0.3160873055458069, "memory(GiB)": 78.33, "step": 5046, "token_acc": 0.9033082947099249, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.9779586300440828, "grad_norm": 0.09940861910581589, "learning_rate": 4.0015745046725336e-07, "loss": 0.31816571950912476, "memory(GiB)": 78.33, "step": 5047, "token_acc": 0.9058804471083752, "train_speed(iter/s)": 0.032195 }, { "epoch": 0.9781524003294095, "grad_norm": 0.11098282784223557, "learning_rate": 3.931709901675684e-07, "loss": 0.353601336479187, "memory(GiB)": 78.33, "step": 5048, "token_acc": 0.8934923500340327, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.9783461706147363, "grad_norm": 0.09338078647851944, "learning_rate": 3.862459774126525e-07, "loss": 0.3041617274284363, "memory(GiB)": 78.33, "step": 5049, "token_acc": 0.9068331108843003, "train_speed(iter/s)": 0.032196 }, { "epoch": 0.978539940900063, "grad_norm": 0.10786343365907669, "learning_rate": 3.793824150467806e-07, "loss": 0.3523224890232086, "memory(GiB)": 78.33, "step": 5050, "token_acc": 0.8945492180312787, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.9787337111853898, "grad_norm": 0.09506326168775558, "learning_rate": 3.7258030588901424e-07, "loss": 0.3265533447265625, "memory(GiB)": 78.33, "step": 5051, "token_acc": 0.9006531536959823, "train_speed(iter/s)": 0.032197 }, { "epoch": 0.9789274814707165, "grad_norm": 0.0921454057097435, "learning_rate": 3.6583965273316864e-07, "loss": 0.29769742488861084, "memory(GiB)": 78.33, "step": 5052, "token_acc": 0.9098529003608105, "train_speed(iter/s)": 0.032198 }, { "epoch": 0.9791212517560433, "grad_norm": 0.10511661320924759, "learning_rate": 3.591604583478125e-07, "loss": 0.33176693320274353, "memory(GiB)": 78.33, "step": 5053, "token_acc": 0.9007189710979348, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.97931502204137, "grad_norm": 0.09943993389606476, "learning_rate": 3.5254272547623474e-07, "loss": 0.32589173316955566, "memory(GiB)": 78.33, "step": 5054, "token_acc": 0.9014144342263095, "train_speed(iter/s)": 0.032199 }, { "epoch": 0.9795087923266967, "grad_norm": 0.10941484570503235, "learning_rate": 3.4598645683656113e-07, "loss": 0.3675105571746826, "memory(GiB)": 78.33, "step": 5055, "token_acc": 0.8914224336351082, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.9797025626120235, "grad_norm": 0.10779840499162674, "learning_rate": 3.3949165512160423e-07, "loss": 0.35344696044921875, "memory(GiB)": 78.33, "step": 5056, "token_acc": 0.8957955624622291, "train_speed(iter/s)": 0.0322 }, { "epoch": 0.9798963328973502, "grad_norm": 0.09050939232110977, "learning_rate": 3.330583229989636e-07, "loss": 0.30384790897369385, "memory(GiB)": 78.33, "step": 5057, "token_acc": 0.9091451737259681, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.980090103182677, "grad_norm": 0.10062376409769058, "learning_rate": 3.2668646311097556e-07, "loss": 0.30701377987861633, "memory(GiB)": 78.33, "step": 5058, "token_acc": 0.9093784940958303, "train_speed(iter/s)": 0.032201 }, { "epoch": 0.9802838734680037, "grad_norm": 0.09237212687730789, "learning_rate": 3.2037607807473e-07, "loss": 0.30352169275283813, "memory(GiB)": 78.33, "step": 5059, "token_acc": 0.9058777531604327, "train_speed(iter/s)": 0.032202 }, { "epoch": 0.9804776437533305, "grad_norm": 0.0979812890291214, "learning_rate": 3.1412717048207025e-07, "loss": 0.31397053599357605, "memory(GiB)": 78.33, "step": 5060, "token_acc": 0.9068339778781405, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.9806714140386572, "grad_norm": 0.09479079395532608, "learning_rate": 3.0793974289961e-07, "loss": 0.31937700510025024, "memory(GiB)": 78.33, "step": 5061, "token_acc": 0.9035309120858683, "train_speed(iter/s)": 0.032203 }, { "epoch": 0.980865184323984, "grad_norm": 0.12520615756511688, "learning_rate": 3.01813797868683e-07, "loss": 0.35134872794151306, "memory(GiB)": 78.33, "step": 5062, "token_acc": 0.8932484641205903, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.9810589546093107, "grad_norm": 0.10819295048713684, "learning_rate": 2.957493379053599e-07, "loss": 0.3610383868217468, "memory(GiB)": 78.33, "step": 5063, "token_acc": 0.8935895511184507, "train_speed(iter/s)": 0.032204 }, { "epoch": 0.9812527248946374, "grad_norm": 0.09377988427877426, "learning_rate": 2.8974636550049833e-07, "loss": 0.3174287676811218, "memory(GiB)": 78.33, "step": 5064, "token_acc": 0.9046624721817732, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.9814464951799642, "grad_norm": 0.09167510271072388, "learning_rate": 2.83804883119676e-07, "loss": 0.297576367855072, "memory(GiB)": 78.33, "step": 5065, "token_acc": 0.9104416645391882, "train_speed(iter/s)": 0.032205 }, { "epoch": 0.9816402654652909, "grad_norm": 0.09928658604621887, "learning_rate": 2.7792489320322407e-07, "loss": 0.3319474458694458, "memory(GiB)": 78.33, "step": 5066, "token_acc": 0.9005888179616993, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.9818340357506177, "grad_norm": 0.09505660086870193, "learning_rate": 2.721063981661942e-07, "loss": 0.32442140579223633, "memory(GiB)": 78.33, "step": 5067, "token_acc": 0.9013683579704355, "train_speed(iter/s)": 0.032206 }, { "epoch": 0.9820278060359444, "grad_norm": 0.09601055830717087, "learning_rate": 2.663494003984079e-07, "loss": 0.3260546922683716, "memory(GiB)": 78.33, "step": 5068, "token_acc": 0.9013065431263338, "train_speed(iter/s)": 0.032207 }, { "epoch": 0.9822215763212712, "grad_norm": 0.10125764459371567, "learning_rate": 2.6065390226444047e-07, "loss": 0.3628811836242676, "memory(GiB)": 78.33, "step": 5069, "token_acc": 0.8924914675767918, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.9824153466065979, "grad_norm": 0.10014388710260391, "learning_rate": 2.5501990610355406e-07, "loss": 0.32086047530174255, "memory(GiB)": 78.33, "step": 5070, "token_acc": 0.9021128125605737, "train_speed(iter/s)": 0.032208 }, { "epoch": 0.9826091168919246, "grad_norm": 0.09349211305379868, "learning_rate": 2.4944741422979754e-07, "loss": 0.31767165660858154, "memory(GiB)": 78.33, "step": 5071, "token_acc": 0.9040567600306544, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.9828028871772514, "grad_norm": 0.09731042385101318, "learning_rate": 2.4393642893194007e-07, "loss": 0.3093053102493286, "memory(GiB)": 78.33, "step": 5072, "token_acc": 0.9075150674702651, "train_speed(iter/s)": 0.032209 }, { "epoch": 0.9829966574625781, "grad_norm": 0.10741175711154938, "learning_rate": 2.3848695247350446e-07, "loss": 0.31964316964149475, "memory(GiB)": 78.33, "step": 5073, "token_acc": 0.9033793824646376, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.9831904277479049, "grad_norm": 0.09962072223424911, "learning_rate": 2.330989870927169e-07, "loss": 0.33166712522506714, "memory(GiB)": 78.33, "step": 5074, "token_acc": 0.8991797207209679, "train_speed(iter/s)": 0.03221 }, { "epoch": 0.9833841980332316, "grad_norm": 0.11339244991540909, "learning_rate": 2.2777253500257386e-07, "loss": 0.3660873770713806, "memory(GiB)": 78.33, "step": 5075, "token_acc": 0.8902705205370726, "train_speed(iter/s)": 0.032211 }, { "epoch": 0.9835779683185584, "grad_norm": 0.10337600857019424, "learning_rate": 2.2250759839077536e-07, "loss": 0.3016257882118225, "memory(GiB)": 78.33, "step": 5076, "token_acc": 0.9081414405155412, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.9837717386038851, "grad_norm": 0.09644091874361038, "learning_rate": 2.173041794197916e-07, "loss": 0.3238067924976349, "memory(GiB)": 78.33, "step": 5077, "token_acc": 0.9044397813242034, "train_speed(iter/s)": 0.032212 }, { "epoch": 0.9839655088892119, "grad_norm": 0.10839894413948059, "learning_rate": 2.1216228022679638e-07, "loss": 0.3413659334182739, "memory(GiB)": 78.33, "step": 5078, "token_acc": 0.8964040304440267, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.9841592791745386, "grad_norm": 0.08885491639375687, "learning_rate": 2.070819029237003e-07, "loss": 0.29514098167419434, "memory(GiB)": 78.33, "step": 5079, "token_acc": 0.9091107924858441, "train_speed(iter/s)": 0.032213 }, { "epoch": 0.9843530494598653, "grad_norm": 0.09718841314315796, "learning_rate": 2.0206304959716756e-07, "loss": 0.3231916129589081, "memory(GiB)": 78.33, "step": 5080, "token_acc": 0.9021267809209168, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.9845468197451921, "grad_norm": 0.11555752903223038, "learning_rate": 1.971057223085659e-07, "loss": 0.3575592637062073, "memory(GiB)": 78.33, "step": 5081, "token_acc": 0.893569844789357, "train_speed(iter/s)": 0.032214 }, { "epoch": 0.9847405900305188, "grad_norm": 0.11206049472093582, "learning_rate": 1.9220992309399997e-07, "loss": 0.37476587295532227, "memory(GiB)": 78.33, "step": 5082, "token_acc": 0.8887654848355404, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.9849343603158456, "grad_norm": 0.08786389976739883, "learning_rate": 1.873756539643112e-07, "loss": 0.2945099174976349, "memory(GiB)": 78.33, "step": 5083, "token_acc": 0.9125009193655153, "train_speed(iter/s)": 0.032215 }, { "epoch": 0.9851281306011723, "grad_norm": 0.09714756906032562, "learning_rate": 1.8260291690506135e-07, "loss": 0.32386794686317444, "memory(GiB)": 78.33, "step": 5084, "token_acc": 0.9020605635215081, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.9853219008864991, "grad_norm": 0.09232458472251892, "learning_rate": 1.7789171387654898e-07, "loss": 0.3251858949661255, "memory(GiB)": 78.33, "step": 5085, "token_acc": 0.901725535610886, "train_speed(iter/s)": 0.032216 }, { "epoch": 0.9855156711718258, "grad_norm": 0.11348365992307663, "learning_rate": 1.7324204681377628e-07, "loss": 0.35106420516967773, "memory(GiB)": 78.33, "step": 5086, "token_acc": 0.895291405992756, "train_speed(iter/s)": 0.032217 }, { "epoch": 0.9857094414571526, "grad_norm": 0.09008847177028656, "learning_rate": 1.6865391762649893e-07, "loss": 0.2719075083732605, "memory(GiB)": 78.33, "step": 5087, "token_acc": 0.9164248403946604, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.9859032117424793, "grad_norm": 0.1007675901055336, "learning_rate": 1.6412732819919284e-07, "loss": 0.29824161529541016, "memory(GiB)": 78.33, "step": 5088, "token_acc": 0.9085992132867133, "train_speed(iter/s)": 0.032218 }, { "epoch": 0.986096982027806, "grad_norm": 0.12334459275007248, "learning_rate": 1.596622803910208e-07, "loss": 0.3122141361236572, "memory(GiB)": 78.33, "step": 5089, "token_acc": 0.90404706917409, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.9862907523131328, "grad_norm": 0.1014440655708313, "learning_rate": 1.552587760359325e-07, "loss": 0.31267303228378296, "memory(GiB)": 78.33, "step": 5090, "token_acc": 0.9055354659248956, "train_speed(iter/s)": 0.032219 }, { "epoch": 0.9864845225984595, "grad_norm": 0.11133712530136108, "learning_rate": 1.5091681694253122e-07, "loss": 0.35740742087364197, "memory(GiB)": 78.33, "step": 5091, "token_acc": 0.8951927600808125, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.9866782928837863, "grad_norm": 0.10128425806760788, "learning_rate": 1.4663640489420702e-07, "loss": 0.3182716965675354, "memory(GiB)": 78.33, "step": 5092, "token_acc": 0.9042219609160648, "train_speed(iter/s)": 0.03222 }, { "epoch": 0.986872063169113, "grad_norm": 0.09385867416858673, "learning_rate": 1.4241754164903696e-07, "loss": 0.2903617322444916, "memory(GiB)": 78.33, "step": 5093, "token_acc": 0.9133545725178879, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.9870658334544398, "grad_norm": 0.09520326554775238, "learning_rate": 1.3826022893980159e-07, "loss": 0.32819217443466187, "memory(GiB)": 78.33, "step": 5094, "token_acc": 0.901990578939371, "train_speed(iter/s)": 0.032221 }, { "epoch": 0.9872596037397665, "grad_norm": 0.09930814802646637, "learning_rate": 1.3416446847401842e-07, "loss": 0.30430757999420166, "memory(GiB)": 78.33, "step": 5095, "token_acc": 0.9086877119749543, "train_speed(iter/s)": 0.032222 }, { "epoch": 0.9874533740250933, "grad_norm": 0.10373541712760925, "learning_rate": 1.3013026193395836e-07, "loss": 0.3304864168167114, "memory(GiB)": 78.33, "step": 5096, "token_acc": 0.9003137958085845, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.98764714431042, "grad_norm": 0.09495776146650314, "learning_rate": 1.2615761097654608e-07, "loss": 0.31185808777809143, "memory(GiB)": 78.33, "step": 5097, "token_acc": 0.9045740484060134, "train_speed(iter/s)": 0.032223 }, { "epoch": 0.9878409145957467, "grad_norm": 0.10554935038089752, "learning_rate": 1.2224651723347634e-07, "loss": 0.3295019567012787, "memory(GiB)": 78.33, "step": 5098, "token_acc": 0.9011663040850858, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.9880346848810735, "grad_norm": 0.09783808141946793, "learning_rate": 1.1839698231113082e-07, "loss": 0.32202041149139404, "memory(GiB)": 78.33, "step": 5099, "token_acc": 0.9030985169491526, "train_speed(iter/s)": 0.032224 }, { "epoch": 0.9882284551664002, "grad_norm": 0.1201200857758522, "learning_rate": 1.1460900779061144e-07, "loss": 0.31718727946281433, "memory(GiB)": 78.33, "step": 5100, "token_acc": 0.9047082558230932, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.988422225451727, "grad_norm": 0.10204530507326126, "learning_rate": 1.1088259522777365e-07, "loss": 0.33351755142211914, "memory(GiB)": 78.33, "step": 5101, "token_acc": 0.90131747431921, "train_speed(iter/s)": 0.032225 }, { "epoch": 0.9886159957370537, "grad_norm": 0.1120908334851265, "learning_rate": 1.0721774615310985e-07, "loss": 0.35009273886680603, "memory(GiB)": 78.33, "step": 5102, "token_acc": 0.895470053070508, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.9888097660223805, "grad_norm": 0.10349807143211365, "learning_rate": 1.0361446207189928e-07, "loss": 0.3232646584510803, "memory(GiB)": 78.33, "step": 5103, "token_acc": 0.9045182551383227, "train_speed(iter/s)": 0.032226 }, { "epoch": 0.9890035363077072, "grad_norm": 0.10206515341997147, "learning_rate": 1.0007274446409141e-07, "loss": 0.32843074202537537, "memory(GiB)": 78.33, "step": 5104, "token_acc": 0.9033802574615097, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.989197306593034, "grad_norm": 0.10101883858442307, "learning_rate": 9.65925947843893e-08, "loss": 0.3663754165172577, "memory(GiB)": 78.33, "step": 5105, "token_acc": 0.8890428585568643, "train_speed(iter/s)": 0.032227 }, { "epoch": 0.9893910768783607, "grad_norm": 0.10111556947231293, "learning_rate": 9.317401446216621e-08, "loss": 0.3458866775035858, "memory(GiB)": 78.33, "step": 5106, "token_acc": 0.898968688533305, "train_speed(iter/s)": 0.032228 }, { "epoch": 0.9895848471636874, "grad_norm": 0.10001836717128754, "learning_rate": 8.981700490151567e-08, "loss": 0.34546831250190735, "memory(GiB)": 78.33, "step": 5107, "token_acc": 0.8980206216602694, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.9897786174490142, "grad_norm": 0.10675647109746933, "learning_rate": 8.652156748126804e-08, "loss": 0.34267866611480713, "memory(GiB)": 78.33, "step": 5108, "token_acc": 0.8978237122930847, "train_speed(iter/s)": 0.032229 }, { "epoch": 0.9899723877343409, "grad_norm": 0.10720111429691315, "learning_rate": 8.328770355495729e-08, "loss": 0.3194417953491211, "memory(GiB)": 78.33, "step": 5109, "token_acc": 0.9027690371302706, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.9901661580196677, "grad_norm": 0.08818720281124115, "learning_rate": 8.011541445078762e-08, "loss": 0.2857103645801544, "memory(GiB)": 78.33, "step": 5110, "token_acc": 0.91203895313451, "train_speed(iter/s)": 0.03223 }, { "epoch": 0.9903599283049944, "grad_norm": 0.09907843917608261, "learning_rate": 7.700470147173343e-08, "loss": 0.3331596553325653, "memory(GiB)": 78.33, "step": 5111, "token_acc": 0.8999431495167709, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.9905536985903212, "grad_norm": 0.13084760308265686, "learning_rate": 7.395556589542274e-08, "loss": 0.3520573377609253, "memory(GiB)": 78.33, "step": 5112, "token_acc": 0.8953529427741111, "train_speed(iter/s)": 0.032231 }, { "epoch": 0.9907474688756479, "grad_norm": 0.09977789223194122, "learning_rate": 7.09680089742537e-08, "loss": 0.34126073122024536, "memory(GiB)": 78.33, "step": 5113, "token_acc": 0.8968176914778857, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.9909412391609747, "grad_norm": 0.0944044291973114, "learning_rate": 6.804203193524483e-08, "loss": 0.32394513487815857, "memory(GiB)": 78.33, "step": 5114, "token_acc": 0.9009689518649718, "train_speed(iter/s)": 0.032232 }, { "epoch": 0.9911350094463014, "grad_norm": 0.09428620338439941, "learning_rate": 6.517763598021808e-08, "loss": 0.2987945079803467, "memory(GiB)": 78.33, "step": 5115, "token_acc": 0.9094635777663906, "train_speed(iter/s)": 0.032233 }, { "epoch": 0.9913287797316281, "grad_norm": 0.10901875793933868, "learning_rate": 6.237482228563239e-08, "loss": 0.3245546817779541, "memory(GiB)": 78.33, "step": 5116, "token_acc": 0.9043344214726151, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.9915225500169549, "grad_norm": 0.10262294858694077, "learning_rate": 5.963359200270024e-08, "loss": 0.3375054895877838, "memory(GiB)": 78.33, "step": 5117, "token_acc": 0.8971592035573682, "train_speed(iter/s)": 0.032234 }, { "epoch": 0.9917163203022816, "grad_norm": 0.0967864915728569, "learning_rate": 5.6953946257287665e-08, "loss": 0.3116954267024994, "memory(GiB)": 78.33, "step": 5118, "token_acc": 0.9069773955911599, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.9919100905876084, "grad_norm": 0.09405119717121124, "learning_rate": 5.433588615003093e-08, "loss": 0.30092549324035645, "memory(GiB)": 78.33, "step": 5119, "token_acc": 0.9092479884464617, "train_speed(iter/s)": 0.032235 }, { "epoch": 0.9921038608729351, "grad_norm": 0.0960017740726471, "learning_rate": 5.177941275620323e-08, "loss": 0.29913923144340515, "memory(GiB)": 78.33, "step": 5120, "token_acc": 0.9077315436241611, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.9922976311582619, "grad_norm": 0.10961098968982697, "learning_rate": 4.928452712584796e-08, "loss": 0.3457680344581604, "memory(GiB)": 78.33, "step": 5121, "token_acc": 0.8968839910971175, "train_speed(iter/s)": 0.032236 }, { "epoch": 0.9924914014435886, "grad_norm": 0.10494101792573929, "learning_rate": 4.6851230283678766e-08, "loss": 0.32835787534713745, "memory(GiB)": 78.33, "step": 5122, "token_acc": 0.9018704634282524, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.9926851717289153, "grad_norm": 0.09790334105491638, "learning_rate": 4.44795232290962e-08, "loss": 0.30867090821266174, "memory(GiB)": 78.33, "step": 5123, "token_acc": 0.9083395542284313, "train_speed(iter/s)": 0.032237 }, { "epoch": 0.9928789420142421, "grad_norm": 0.09199753403663635, "learning_rate": 4.216940693622106e-08, "loss": 0.3020906448364258, "memory(GiB)": 78.33, "step": 5124, "token_acc": 0.9092012383900929, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.9930727122995688, "grad_norm": 0.10397046059370041, "learning_rate": 3.9920882353911e-08, "loss": 0.3302524983882904, "memory(GiB)": 78.33, "step": 5125, "token_acc": 0.9000474552141416, "train_speed(iter/s)": 0.032238 }, { "epoch": 0.9932664825848956, "grad_norm": 0.1026199460029602, "learning_rate": 3.773395040567728e-08, "loss": 0.3645437955856323, "memory(GiB)": 78.33, "step": 5126, "token_acc": 0.8931143232588699, "train_speed(iter/s)": 0.032239 }, { "epoch": 0.9934602528702223, "grad_norm": 0.1036728173494339, "learning_rate": 3.56086119897514e-08, "loss": 0.34227946400642395, "memory(GiB)": 78.33, "step": 5127, "token_acc": 0.8958297382801266, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.9936540231555491, "grad_norm": 0.10324281454086304, "learning_rate": 3.354486797906841e-08, "loss": 0.3703176975250244, "memory(GiB)": 78.33, "step": 5128, "token_acc": 0.8903992961943412, "train_speed(iter/s)": 0.03224 }, { "epoch": 0.9938477934408758, "grad_norm": 0.10356750339269638, "learning_rate": 3.154271922125029e-08, "loss": 0.3422203063964844, "memory(GiB)": 78.33, "step": 5129, "token_acc": 0.9005716619028175, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.9940415637262026, "grad_norm": 0.10634325444698334, "learning_rate": 2.960216653865588e-08, "loss": 0.33742478489875793, "memory(GiB)": 78.33, "step": 5130, "token_acc": 0.8997738043946575, "train_speed(iter/s)": 0.032241 }, { "epoch": 0.9942353340115293, "grad_norm": 0.09013240784406662, "learning_rate": 2.7723210728314292e-08, "loss": 0.29299721121788025, "memory(GiB)": 78.33, "step": 5131, "token_acc": 0.910865125192264, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.994429104296856, "grad_norm": 0.09943090379238129, "learning_rate": 2.5905852561958208e-08, "loss": 0.3339230716228485, "memory(GiB)": 78.33, "step": 5132, "token_acc": 0.8997598211081357, "train_speed(iter/s)": 0.032242 }, { "epoch": 0.9946228745821828, "grad_norm": 0.10124190896749496, "learning_rate": 2.415009278604052e-08, "loss": 0.3518436551094055, "memory(GiB)": 78.33, "step": 5133, "token_acc": 0.8950833333333333, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.9948166448675095, "grad_norm": 0.09643140435218811, "learning_rate": 2.245593212166774e-08, "loss": 0.3150833249092102, "memory(GiB)": 78.33, "step": 5134, "token_acc": 0.9049085264157237, "train_speed(iter/s)": 0.032243 }, { "epoch": 0.9950104151528363, "grad_norm": 0.10642070323228836, "learning_rate": 2.0823371264699907e-08, "loss": 0.33650028705596924, "memory(GiB)": 78.33, "step": 5135, "token_acc": 0.9001945581787031, "train_speed(iter/s)": 0.032244 }, { "epoch": 0.995204185438163, "grad_norm": 0.10311167687177658, "learning_rate": 1.9252410885683965e-08, "loss": 0.3397340178489685, "memory(GiB)": 78.33, "step": 5136, "token_acc": 0.8996154508408426, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.9953979557234898, "grad_norm": 0.10818962752819061, "learning_rate": 1.7743051629837135e-08, "loss": 0.3583690822124481, "memory(GiB)": 78.33, "step": 5137, "token_acc": 0.8913712208308734, "train_speed(iter/s)": 0.032245 }, { "epoch": 0.9955917260088165, "grad_norm": 0.11495489627122879, "learning_rate": 1.6295294117080192e-08, "loss": 0.3481709063053131, "memory(GiB)": 78.33, "step": 5138, "token_acc": 0.8962670979044539, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.9957854962941433, "grad_norm": 0.09410503506660461, "learning_rate": 1.490913894208745e-08, "loss": 0.30858170986175537, "memory(GiB)": 78.33, "step": 5139, "token_acc": 0.9072154599071401, "train_speed(iter/s)": 0.032246 }, { "epoch": 0.99597926657947, "grad_norm": 0.10258731245994568, "learning_rate": 1.3584586674153519e-08, "loss": 0.3513551652431488, "memory(GiB)": 78.33, "step": 5140, "token_acc": 0.8945655624933856, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.9961730368647967, "grad_norm": 0.10465652495622635, "learning_rate": 1.2321637857326538e-08, "loss": 0.32509326934814453, "memory(GiB)": 78.33, "step": 5141, "token_acc": 0.9022335312411648, "train_speed(iter/s)": 0.032247 }, { "epoch": 0.9963668071501235, "grad_norm": 0.17421969771385193, "learning_rate": 1.112029301032491e-08, "loss": 0.3402435779571533, "memory(GiB)": 78.33, "step": 5142, "token_acc": 0.8987543069175722, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.9965605774354502, "grad_norm": 0.09737160056829453, "learning_rate": 9.980552626587257e-09, "loss": 0.31006062030792236, "memory(GiB)": 78.33, "step": 5143, "token_acc": 0.9062544199598314, "train_speed(iter/s)": 0.032248 }, { "epoch": 0.996754347720777, "grad_norm": 0.0954391285777092, "learning_rate": 8.902417174205812e-09, "loss": 0.3203745186328888, "memory(GiB)": 78.33, "step": 5144, "token_acc": 0.903609002530297, "train_speed(iter/s)": 0.032249 }, { "epoch": 0.9969481180061037, "grad_norm": 0.10394702851772308, "learning_rate": 7.885887096026333e-09, "loss": 0.33040565252304077, "memory(GiB)": 78.33, "step": 5145, "token_acc": 0.9019830523281209, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.9971418882914305, "grad_norm": 0.1102806031703949, "learning_rate": 6.930962809564844e-09, "loss": 0.38149887323379517, "memory(GiB)": 78.33, "step": 5146, "token_acc": 0.8889834487615917, "train_speed(iter/s)": 0.03225 }, { "epoch": 0.9973356585767572, "grad_norm": 0.09441733360290527, "learning_rate": 6.0376447070242805e-09, "loss": 0.2967412769794464, "memory(GiB)": 78.33, "step": 5147, "token_acc": 0.910029761147712, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.997529428862084, "grad_norm": 0.09879467636346817, "learning_rate": 5.205933155311149e-09, "loss": 0.31042206287384033, "memory(GiB)": 78.33, "step": 5148, "token_acc": 0.9073388532511939, "train_speed(iter/s)": 0.032251 }, { "epoch": 0.9977231991474107, "grad_norm": 0.09496015310287476, "learning_rate": 4.435828496035521e-09, "loss": 0.30281057953834534, "memory(GiB)": 78.33, "step": 5149, "token_acc": 0.9083017847485128, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.9979169694327374, "grad_norm": 0.09966279566287994, "learning_rate": 3.727331045511039e-09, "loss": 0.3183283805847168, "memory(GiB)": 78.33, "step": 5150, "token_acc": 0.9036394691893312, "train_speed(iter/s)": 0.032252 }, { "epoch": 0.9981107397180642, "grad_norm": 0.0968567430973053, "learning_rate": 3.0804410947216084e-09, "loss": 0.34515106678009033, "memory(GiB)": 78.33, "step": 5151, "token_acc": 0.8957895251601545, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.9983045100033909, "grad_norm": 0.09464351087808609, "learning_rate": 2.4951589093713533e-09, "loss": 0.32018736004829407, "memory(GiB)": 78.33, "step": 5152, "token_acc": 0.90508582795118, "train_speed(iter/s)": 0.032253 }, { "epoch": 0.9984982802887177, "grad_norm": 0.0978541150689125, "learning_rate": 1.9714847298513135e-09, "loss": 0.3221741318702698, "memory(GiB)": 78.33, "step": 5153, "token_acc": 0.90467557008248, "train_speed(iter/s)": 0.032254 }, { "epoch": 0.9986920505740444, "grad_norm": 0.09578622877597809, "learning_rate": 1.5094187712394456e-09, "loss": 0.3303835988044739, "memory(GiB)": 78.33, "step": 5154, "token_acc": 0.9011171856429759, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.9988858208593712, "grad_norm": 0.09848620742559433, "learning_rate": 1.1089612233339261e-09, "loss": 0.30692458152770996, "memory(GiB)": 78.33, "step": 5155, "token_acc": 0.9065349757288363, "train_speed(iter/s)": 0.032255 }, { "epoch": 0.9990795911446979, "grad_norm": 0.09920880943536758, "learning_rate": 7.701122505865409e-10, "loss": 0.3253172039985657, "memory(GiB)": 78.33, "step": 5156, "token_acc": 0.9011366073343341, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.9992733614300247, "grad_norm": 0.09585346281528473, "learning_rate": 4.928719922026037e-10, "loss": 0.30521735548973083, "memory(GiB)": 78.33, "step": 5157, "token_acc": 0.906159781992823, "train_speed(iter/s)": 0.032256 }, { "epoch": 0.9994671317153514, "grad_norm": 0.10266774892807007, "learning_rate": 2.772405620410367e-10, "loss": 0.341879278421402, "memory(GiB)": 78.33, "step": 5158, "token_acc": 0.8973592287271203, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.9996609020006781, "grad_norm": 0.10256272554397583, "learning_rate": 1.2321804866433082e-10, "loss": 0.3247736990451813, "memory(GiB)": 78.33, "step": 5159, "token_acc": 0.9010874626783905, "train_speed(iter/s)": 0.032257 }, { "epoch": 0.9998546722860049, "grad_norm": 0.09452533721923828, "learning_rate": 3.0804515321891657e-11, "loss": 0.3123696446418762, "memory(GiB)": 78.33, "step": 5160, "token_acc": 0.9040158570691574, "train_speed(iter/s)": 0.032258 }, { "epoch": 1.0, "grad_norm": 0.1213284358382225, "learning_rate": 0.0, "loss": 0.31489574909210205, "memory(GiB)": 78.33, "step": 5161, "token_acc": 0.9069757440220196, "train_speed(iter/s)": 0.03226 }, { "epoch": 1.0, "eval_loss": 0.37820467352867126, "eval_runtime": 1344.7251, "eval_samples_per_second": 5.019, "eval_steps_per_second": 5.019, "eval_token_acc": 0.9026974626241785, "step": 5161 } ], "logging_steps": 1, "max_steps": 5161, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.701470853092039e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }