diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,51743 @@ +{ + "best_global_step": 5161, + "best_metric": 0.37820467, + "best_model_checkpoint": "/home/work/newrag/qwen3/ms-swift-finetuning/output/qwen2.5-bnk-phase2/v1-20250804-040453/checkpoint-5161", + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5161, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00019377028532674515, + "grad_norm": 0.8763577342033386, + "learning_rate": 1.1583011583011583e-06, + "loss": 0.9684686660766602, + "memory(GiB)": 55.14, + "step": 1, + "token_acc": 0.7912246865959498, + "train_speed(iter/s)": 0.010196 + }, + { + "epoch": 0.0003875405706534903, + "grad_norm": 0.848241925239563, + "learning_rate": 2.3166023166023166e-06, + "loss": 0.9568785429000854, + "memory(GiB)": 55.14, + "step": 2, + "token_acc": 0.7863654591632134, + "train_speed(iter/s)": 0.015869 + }, + { + "epoch": 0.0005813108559802354, + "grad_norm": 0.8443148136138916, + "learning_rate": 3.4749034749034742e-06, + "loss": 0.8965896964073181, + "memory(GiB)": 66.69, + "step": 3, + "token_acc": 0.7995744899005435, + "train_speed(iter/s)": 0.019428 + }, + { + "epoch": 0.0007750811413069806, + "grad_norm": 0.8933870196342468, + "learning_rate": 4.633204633204633e-06, + "loss": 0.9800578355789185, + "memory(GiB)": 78.26, + "step": 4, + "token_acc": 0.7830852365415987, + "train_speed(iter/s)": 0.021886 + }, + { + "epoch": 0.0009688514266337257, + "grad_norm": 0.9806432723999023, + "learning_rate": 5.791505791505791e-06, + "loss": 1.0483475923538208, + "memory(GiB)": 78.26, + "step": 5, + "token_acc": 0.7753844505170658, + "train_speed(iter/s)": 0.023671 + }, + { + "epoch": 0.0011626217119604708, + "grad_norm": 0.897155225276947, + "learning_rate": 6.9498069498069484e-06, + "loss": 1.0103771686553955, + "memory(GiB)": 78.26, + "step": 6, + "token_acc": 0.7809635036496351, + "train_speed(iter/s)": 0.025066 + }, + { + "epoch": 0.0013563919972872161, + "grad_norm": 0.9799181818962097, + "learning_rate": 8.108108108108107e-06, + "loss": 1.0178213119506836, + "memory(GiB)": 78.26, + "step": 7, + "token_acc": 0.7772091722595078, + "train_speed(iter/s)": 0.02613 + }, + { + "epoch": 0.0015501622826139612, + "grad_norm": 1.0471524000167847, + "learning_rate": 9.266409266409266e-06, + "loss": 1.1097385883331299, + "memory(GiB)": 78.26, + "step": 8, + "token_acc": 0.7599808682440987, + "train_speed(iter/s)": 0.026976 + }, + { + "epoch": 0.0017439325679407063, + "grad_norm": 0.8407416939735413, + "learning_rate": 1.0424710424710423e-05, + "loss": 0.9527825117111206, + "memory(GiB)": 78.26, + "step": 9, + "token_acc": 0.7823414284532899, + "train_speed(iter/s)": 0.027706 + }, + { + "epoch": 0.0019377028532674514, + "grad_norm": 0.8665539622306824, + "learning_rate": 1.1583011583011582e-05, + "loss": 1.0066951513290405, + "memory(GiB)": 78.26, + "step": 10, + "token_acc": 0.7773281507906927, + "train_speed(iter/s)": 0.028311 + }, + { + "epoch": 0.0021314731385941965, + "grad_norm": 0.8096230030059814, + "learning_rate": 1.274131274131274e-05, + "loss": 0.9802660346031189, + "memory(GiB)": 78.26, + "step": 11, + "token_acc": 0.7735733563339918, + "train_speed(iter/s)": 0.028814 + }, + { + "epoch": 0.0023252434239209416, + "grad_norm": 0.6567773818969727, + "learning_rate": 1.3899613899613897e-05, + "loss": 0.9128895998001099, + "memory(GiB)": 78.26, + "step": 12, + "token_acc": 0.7855364418288818, + "train_speed(iter/s)": 0.029273 + }, + { + "epoch": 0.0025190137092476867, + "grad_norm": 0.5237937569618225, + "learning_rate": 1.5057915057915056e-05, + "loss": 0.9058799743652344, + "memory(GiB)": 78.26, + "step": 13, + "token_acc": 0.7805104547360853, + "train_speed(iter/s)": 0.029674 + }, + { + "epoch": 0.0027127839945744322, + "grad_norm": 0.4156467914581299, + "learning_rate": 1.6216216216216215e-05, + "loss": 0.8571181893348694, + "memory(GiB)": 78.26, + "step": 14, + "token_acc": 0.7879932696276072, + "train_speed(iter/s)": 0.030038 + }, + { + "epoch": 0.0029065542799011773, + "grad_norm": 0.2674636244773865, + "learning_rate": 1.7374517374517374e-05, + "loss": 0.7994478344917297, + "memory(GiB)": 78.26, + "step": 15, + "token_acc": 0.7901431703158408, + "train_speed(iter/s)": 0.030338 + }, + { + "epoch": 0.0031003245652279224, + "grad_norm": 0.24086351692676544, + "learning_rate": 1.8532818532818533e-05, + "loss": 0.8486983180046082, + "memory(GiB)": 78.26, + "step": 16, + "token_acc": 0.7774675145147912, + "train_speed(iter/s)": 0.0306 + }, + { + "epoch": 0.0032940948505546675, + "grad_norm": 0.21650397777557373, + "learning_rate": 1.9691119691119688e-05, + "loss": 0.8528725504875183, + "memory(GiB)": 78.26, + "step": 17, + "token_acc": 0.775322540195614, + "train_speed(iter/s)": 0.030832 + }, + { + "epoch": 0.0034878651358814126, + "grad_norm": 0.15441368520259857, + "learning_rate": 2.0849420849420847e-05, + "loss": 0.803363025188446, + "memory(GiB)": 78.26, + "step": 18, + "token_acc": 0.7852022395479044, + "train_speed(iter/s)": 0.031049 + }, + { + "epoch": 0.0036816354212081577, + "grad_norm": 0.14429929852485657, + "learning_rate": 2.200772200772201e-05, + "loss": 0.8013445734977722, + "memory(GiB)": 78.26, + "step": 19, + "token_acc": 0.7839805825242718, + "train_speed(iter/s)": 0.031259 + }, + { + "epoch": 0.003875405706534903, + "grad_norm": 0.1348273605108261, + "learning_rate": 2.3166023166023165e-05, + "loss": 0.7239266633987427, + "memory(GiB)": 78.26, + "step": 20, + "token_acc": 0.8037726258718549, + "train_speed(iter/s)": 0.031439 + }, + { + "epoch": 0.004069175991861648, + "grad_norm": 0.1491738110780716, + "learning_rate": 2.4324324324324324e-05, + "loss": 0.8465297222137451, + "memory(GiB)": 78.26, + "step": 21, + "token_acc": 0.7757449962935508, + "train_speed(iter/s)": 0.031603 + }, + { + "epoch": 0.004262946277188393, + "grad_norm": 0.167159304022789, + "learning_rate": 2.548262548262548e-05, + "loss": 0.736566424369812, + "memory(GiB)": 78.26, + "step": 22, + "token_acc": 0.8000630083629282, + "train_speed(iter/s)": 0.03176 + }, + { + "epoch": 0.004456716562515138, + "grad_norm": 0.15523645281791687, + "learning_rate": 2.6640926640926638e-05, + "loss": 0.6989044547080994, + "memory(GiB)": 78.26, + "step": 23, + "token_acc": 0.8113520593800029, + "train_speed(iter/s)": 0.031902 + }, + { + "epoch": 0.004650486847841883, + "grad_norm": 0.16765183210372925, + "learning_rate": 2.7799227799227794e-05, + "loss": 0.6959363222122192, + "memory(GiB)": 78.26, + "step": 24, + "token_acc": 0.8058972412047583, + "train_speed(iter/s)": 0.03202 + }, + { + "epoch": 0.004844257133168628, + "grad_norm": 0.16556301712989807, + "learning_rate": 2.8957528957528956e-05, + "loss": 0.7282753586769104, + "memory(GiB)": 78.26, + "step": 25, + "token_acc": 0.8006341844340851, + "train_speed(iter/s)": 0.03212 + }, + { + "epoch": 0.005038027418495373, + "grad_norm": 0.15321362018585205, + "learning_rate": 3.011583011583011e-05, + "loss": 0.7563999891281128, + "memory(GiB)": 78.26, + "step": 26, + "token_acc": 0.7961533964690751, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.0052317977038221185, + "grad_norm": 0.14468924701213837, + "learning_rate": 3.1274131274131274e-05, + "loss": 0.7149499654769897, + "memory(GiB)": 78.26, + "step": 27, + "token_acc": 0.8046370478271425, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.0054255679891488644, + "grad_norm": 0.13254722952842712, + "learning_rate": 3.243243243243243e-05, + "loss": 0.7311335802078247, + "memory(GiB)": 78.26, + "step": 28, + "token_acc": 0.8027618690788417, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.0056193382744756095, + "grad_norm": 0.12081703543663025, + "learning_rate": 3.3590733590733585e-05, + "loss": 0.7195248603820801, + "memory(GiB)": 78.26, + "step": 29, + "token_acc": 0.8041594893099605, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.005813108559802355, + "grad_norm": 0.11528925597667694, + "learning_rate": 3.474903474903475e-05, + "loss": 0.6729803681373596, + "memory(GiB)": 78.26, + "step": 30, + "token_acc": 0.8157344552564334, + "train_speed(iter/s)": 0.03258 + }, + { + "epoch": 0.0060068788451291, + "grad_norm": 0.13018707931041718, + "learning_rate": 3.59073359073359e-05, + "loss": 0.7402101159095764, + "memory(GiB)": 78.26, + "step": 31, + "token_acc": 0.7976209655764996, + "train_speed(iter/s)": 0.032663 + }, + { + "epoch": 0.006200649130455845, + "grad_norm": 0.12700200080871582, + "learning_rate": 3.7065637065637065e-05, + "loss": 0.7580655813217163, + "memory(GiB)": 78.26, + "step": 32, + "token_acc": 0.7951273532668881, + "train_speed(iter/s)": 0.032732 + }, + { + "epoch": 0.00639441941578259, + "grad_norm": 0.12156156450510025, + "learning_rate": 3.822393822393822e-05, + "loss": 0.6386986374855042, + "memory(GiB)": 78.26, + "step": 33, + "token_acc": 0.8224443651076996, + "train_speed(iter/s)": 0.032803 + }, + { + "epoch": 0.006588189701109335, + "grad_norm": 0.13328500092029572, + "learning_rate": 3.9382239382239376e-05, + "loss": 0.7081706523895264, + "memory(GiB)": 78.26, + "step": 34, + "token_acc": 0.8066472273114339, + "train_speed(iter/s)": 0.03287 + }, + { + "epoch": 0.00678195998643608, + "grad_norm": 0.12701700627803802, + "learning_rate": 4.054054054054054e-05, + "loss": 0.6880778074264526, + "memory(GiB)": 78.26, + "step": 35, + "token_acc": 0.8144979985743269, + "train_speed(iter/s)": 0.032933 + }, + { + "epoch": 0.006975730271762825, + "grad_norm": 0.13268497586250305, + "learning_rate": 4.1698841698841694e-05, + "loss": 0.7258732914924622, + "memory(GiB)": 78.26, + "step": 36, + "token_acc": 0.8062436274215798, + "train_speed(iter/s)": 0.033003 + }, + { + "epoch": 0.00716950055708957, + "grad_norm": 0.1224246695637703, + "learning_rate": 4.285714285714285e-05, + "loss": 0.7171946167945862, + "memory(GiB)": 78.26, + "step": 37, + "token_acc": 0.8046310103255212, + "train_speed(iter/s)": 0.033068 + }, + { + "epoch": 0.007363270842416315, + "grad_norm": 0.12160563468933105, + "learning_rate": 4.401544401544402e-05, + "loss": 0.7064481377601624, + "memory(GiB)": 78.26, + "step": 38, + "token_acc": 0.8088185898806071, + "train_speed(iter/s)": 0.033127 + }, + { + "epoch": 0.0075570411277430605, + "grad_norm": 0.11494564265012741, + "learning_rate": 4.5173745173745174e-05, + "loss": 0.7347713708877563, + "memory(GiB)": 78.26, + "step": 39, + "token_acc": 0.798022857441745, + "train_speed(iter/s)": 0.033179 + }, + { + "epoch": 0.007750811413069806, + "grad_norm": 0.11666765064001083, + "learning_rate": 4.633204633204633e-05, + "loss": 0.759428083896637, + "memory(GiB)": 78.26, + "step": 40, + "token_acc": 0.7921686007998233, + "train_speed(iter/s)": 0.033222 + }, + { + "epoch": 0.007944581698396552, + "grad_norm": 0.11888981610536575, + "learning_rate": 4.7490347490347485e-05, + "loss": 0.7375974655151367, + "memory(GiB)": 78.26, + "step": 41, + "token_acc": 0.8020120724346076, + "train_speed(iter/s)": 0.033273 + }, + { + "epoch": 0.008138351983723296, + "grad_norm": 0.11488386988639832, + "learning_rate": 4.864864864864865e-05, + "loss": 0.696284294128418, + "memory(GiB)": 78.26, + "step": 42, + "token_acc": 0.8061018470051596, + "train_speed(iter/s)": 0.033319 + }, + { + "epoch": 0.008332122269050042, + "grad_norm": 0.1082736924290657, + "learning_rate": 4.98069498069498e-05, + "loss": 0.6931319832801819, + "memory(GiB)": 78.26, + "step": 43, + "token_acc": 0.8120705042391789, + "train_speed(iter/s)": 0.033355 + }, + { + "epoch": 0.008525892554376786, + "grad_norm": 0.12314610928297043, + "learning_rate": 5.096525096525096e-05, + "loss": 0.7261099815368652, + "memory(GiB)": 78.26, + "step": 44, + "token_acc": 0.8016441410059914, + "train_speed(iter/s)": 0.033397 + }, + { + "epoch": 0.008719662839703532, + "grad_norm": 0.12343846261501312, + "learning_rate": 5.212355212355212e-05, + "loss": 0.6880357265472412, + "memory(GiB)": 78.26, + "step": 45, + "token_acc": 0.8112455396966993, + "train_speed(iter/s)": 0.033441 + }, + { + "epoch": 0.008913433125030276, + "grad_norm": 0.11987273395061493, + "learning_rate": 5.3281853281853276e-05, + "loss": 0.6877344250679016, + "memory(GiB)": 78.26, + "step": 46, + "token_acc": 0.8103002813343285, + "train_speed(iter/s)": 0.033486 + }, + { + "epoch": 0.009107203410357022, + "grad_norm": 0.1239103302359581, + "learning_rate": 5.444015444015443e-05, + "loss": 0.7504675388336182, + "memory(GiB)": 78.26, + "step": 47, + "token_acc": 0.7963265423078025, + "train_speed(iter/s)": 0.033526 + }, + { + "epoch": 0.009300973695683766, + "grad_norm": 0.11254343390464783, + "learning_rate": 5.559845559845559e-05, + "loss": 0.652977466583252, + "memory(GiB)": 78.26, + "step": 48, + "token_acc": 0.8192933347417345, + "train_speed(iter/s)": 0.033559 + }, + { + "epoch": 0.009494743981010512, + "grad_norm": 0.13146735727787018, + "learning_rate": 5.6756756756756757e-05, + "loss": 0.7184028029441833, + "memory(GiB)": 78.26, + "step": 49, + "token_acc": 0.8021164329650468, + "train_speed(iter/s)": 0.033598 + }, + { + "epoch": 0.009688514266337257, + "grad_norm": 0.11965050548315048, + "learning_rate": 5.791505791505791e-05, + "loss": 0.6229060888290405, + "memory(GiB)": 78.26, + "step": 50, + "token_acc": 0.8271026669398988, + "train_speed(iter/s)": 0.03363 + }, + { + "epoch": 0.009882284551664003, + "grad_norm": 0.12212494015693665, + "learning_rate": 5.907335907335907e-05, + "loss": 0.698147177696228, + "memory(GiB)": 78.26, + "step": 51, + "token_acc": 0.8134602899805237, + "train_speed(iter/s)": 0.033662 + }, + { + "epoch": 0.010076054836990747, + "grad_norm": 0.1269523799419403, + "learning_rate": 6.023166023166022e-05, + "loss": 0.6502314805984497, + "memory(GiB)": 78.26, + "step": 52, + "token_acc": 0.8226499256566991, + "train_speed(iter/s)": 0.033693 + }, + { + "epoch": 0.010269825122317493, + "grad_norm": 0.13087137043476105, + "learning_rate": 6.138996138996139e-05, + "loss": 0.6692061424255371, + "memory(GiB)": 78.26, + "step": 53, + "token_acc": 0.8201304240156289, + "train_speed(iter/s)": 0.033727 + }, + { + "epoch": 0.010463595407644237, + "grad_norm": 0.14609168469905853, + "learning_rate": 6.254826254826255e-05, + "loss": 0.7044385671615601, + "memory(GiB)": 78.26, + "step": 54, + "token_acc": 0.8073001694132143, + "train_speed(iter/s)": 0.033764 + }, + { + "epoch": 0.010657365692970983, + "grad_norm": 0.12461866438388824, + "learning_rate": 6.37065637065637e-05, + "loss": 0.6916706562042236, + "memory(GiB)": 78.26, + "step": 55, + "token_acc": 0.8121318182938552, + "train_speed(iter/s)": 0.033788 + }, + { + "epoch": 0.010851135978297729, + "grad_norm": 0.12224052846431732, + "learning_rate": 6.486486486486486e-05, + "loss": 0.6395901441574097, + "memory(GiB)": 78.26, + "step": 56, + "token_acc": 0.824472191901174, + "train_speed(iter/s)": 0.033815 + }, + { + "epoch": 0.011044906263624473, + "grad_norm": 0.1317582130432129, + "learning_rate": 6.602316602316601e-05, + "loss": 0.6729621291160583, + "memory(GiB)": 78.26, + "step": 57, + "token_acc": 0.8167814800579487, + "train_speed(iter/s)": 0.033843 + }, + { + "epoch": 0.011238676548951219, + "grad_norm": 0.12963414192199707, + "learning_rate": 6.718146718146717e-05, + "loss": 0.667634129524231, + "memory(GiB)": 78.26, + "step": 58, + "token_acc": 0.8161623128430612, + "train_speed(iter/s)": 0.033872 + }, + { + "epoch": 0.011432446834277963, + "grad_norm": 0.14076951146125793, + "learning_rate": 6.833976833976833e-05, + "loss": 0.659672200679779, + "memory(GiB)": 78.26, + "step": 59, + "token_acc": 0.8184602051133901, + "train_speed(iter/s)": 0.033893 + }, + { + "epoch": 0.01162621711960471, + "grad_norm": 0.1304904669523239, + "learning_rate": 6.94980694980695e-05, + "loss": 0.6900058388710022, + "memory(GiB)": 78.26, + "step": 60, + "token_acc": 0.8142272123233802, + "train_speed(iter/s)": 0.033915 + }, + { + "epoch": 0.011819987404931454, + "grad_norm": 0.12284844368696213, + "learning_rate": 7.065637065637065e-05, + "loss": 0.6068597435951233, + "memory(GiB)": 78.26, + "step": 61, + "token_acc": 0.8333287809674688, + "train_speed(iter/s)": 0.03394 + }, + { + "epoch": 0.0120137576902582, + "grad_norm": 0.16575318574905396, + "learning_rate": 7.18146718146718e-05, + "loss": 0.6391609311103821, + "memory(GiB)": 78.26, + "step": 62, + "token_acc": 0.8243709681258777, + "train_speed(iter/s)": 0.033959 + }, + { + "epoch": 0.012207527975584944, + "grad_norm": 0.1536901593208313, + "learning_rate": 7.297297297297297e-05, + "loss": 0.6720226407051086, + "memory(GiB)": 78.26, + "step": 63, + "token_acc": 0.8132890573603584, + "train_speed(iter/s)": 0.03398 + }, + { + "epoch": 0.01240129826091169, + "grad_norm": 0.12783144414424896, + "learning_rate": 7.413127413127413e-05, + "loss": 0.6299335956573486, + "memory(GiB)": 78.26, + "step": 64, + "token_acc": 0.8247123041659774, + "train_speed(iter/s)": 0.033999 + }, + { + "epoch": 0.012595068546238434, + "grad_norm": 0.12650437653064728, + "learning_rate": 7.528957528957529e-05, + "loss": 0.5913993120193481, + "memory(GiB)": 78.26, + "step": 65, + "token_acc": 0.8381598287733052, + "train_speed(iter/s)": 0.034015 + }, + { + "epoch": 0.01278883883156518, + "grad_norm": 0.13705813884735107, + "learning_rate": 7.644787644787644e-05, + "loss": 0.6303204298019409, + "memory(GiB)": 78.26, + "step": 66, + "token_acc": 0.8255334138486312, + "train_speed(iter/s)": 0.034034 + }, + { + "epoch": 0.012982609116891924, + "grad_norm": 0.15545400977134705, + "learning_rate": 7.76061776061776e-05, + "loss": 0.6292211413383484, + "memory(GiB)": 78.26, + "step": 67, + "token_acc": 0.8286270117314154, + "train_speed(iter/s)": 0.034055 + }, + { + "epoch": 0.01317637940221867, + "grad_norm": 0.14907345175743103, + "learning_rate": 7.876447876447875e-05, + "loss": 0.6275659799575806, + "memory(GiB)": 78.26, + "step": 68, + "token_acc": 0.8254116669263935, + "train_speed(iter/s)": 0.03407 + }, + { + "epoch": 0.013370149687545414, + "grad_norm": 0.15840691328048706, + "learning_rate": 7.992277992277992e-05, + "loss": 0.6730965375900269, + "memory(GiB)": 78.26, + "step": 69, + "token_acc": 0.8142665820821879, + "train_speed(iter/s)": 0.034091 + }, + { + "epoch": 0.01356391997287216, + "grad_norm": 0.14479568600654602, + "learning_rate": 8.108108108108108e-05, + "loss": 0.6207985877990723, + "memory(GiB)": 78.26, + "step": 70, + "token_acc": 0.8265187594377976, + "train_speed(iter/s)": 0.034111 + }, + { + "epoch": 0.013757690258198904, + "grad_norm": 0.13668787479400635, + "learning_rate": 8.223938223938223e-05, + "loss": 0.6429966688156128, + "memory(GiB)": 78.26, + "step": 71, + "token_acc": 0.8240099449476115, + "train_speed(iter/s)": 0.034127 + }, + { + "epoch": 0.01395146054352565, + "grad_norm": 0.14420422911643982, + "learning_rate": 8.339768339768339e-05, + "loss": 0.6110091805458069, + "memory(GiB)": 78.26, + "step": 72, + "token_acc": 0.8311519082643192, + "train_speed(iter/s)": 0.034142 + }, + { + "epoch": 0.014145230828852395, + "grad_norm": 0.16625623404979706, + "learning_rate": 8.455598455598454e-05, + "loss": 0.6166950464248657, + "memory(GiB)": 78.26, + "step": 73, + "token_acc": 0.8307741268199539, + "train_speed(iter/s)": 0.034155 + }, + { + "epoch": 0.01433900111417914, + "grad_norm": 0.14389079809188843, + "learning_rate": 8.57142857142857e-05, + "loss": 0.5988722443580627, + "memory(GiB)": 78.26, + "step": 74, + "token_acc": 0.8348723657246945, + "train_speed(iter/s)": 0.034166 + }, + { + "epoch": 0.014532771399505887, + "grad_norm": 0.18358756601810455, + "learning_rate": 8.687258687258685e-05, + "loss": 0.7113780379295349, + "memory(GiB)": 78.26, + "step": 75, + "token_acc": 0.8083578854685225, + "train_speed(iter/s)": 0.034184 + }, + { + "epoch": 0.01472654168483263, + "grad_norm": 0.1523071825504303, + "learning_rate": 8.803088803088804e-05, + "loss": 0.6291996240615845, + "memory(GiB)": 78.26, + "step": 76, + "token_acc": 0.8278446372484995, + "train_speed(iter/s)": 0.0342 + }, + { + "epoch": 0.014920311970159377, + "grad_norm": 0.16895808279514313, + "learning_rate": 8.918918918918919e-05, + "loss": 0.6497290730476379, + "memory(GiB)": 78.26, + "step": 77, + "token_acc": 0.819365872911459, + "train_speed(iter/s)": 0.034215 + }, + { + "epoch": 0.015114082255486121, + "grad_norm": 0.17319287359714508, + "learning_rate": 9.034749034749035e-05, + "loss": 0.5960345268249512, + "memory(GiB)": 78.26, + "step": 78, + "token_acc": 0.8366524602867106, + "train_speed(iter/s)": 0.034224 + }, + { + "epoch": 0.015307852540812867, + "grad_norm": 0.14983628690242767, + "learning_rate": 9.15057915057915e-05, + "loss": 0.6000748872756958, + "memory(GiB)": 78.26, + "step": 79, + "token_acc": 0.8339163897148855, + "train_speed(iter/s)": 0.034241 + }, + { + "epoch": 0.015501622826139611, + "grad_norm": 0.14565478265285492, + "learning_rate": 9.266409266409266e-05, + "loss": 0.6301745772361755, + "memory(GiB)": 78.26, + "step": 80, + "token_acc": 0.8267408675799087, + "train_speed(iter/s)": 0.034254 + }, + { + "epoch": 0.015695393111466355, + "grad_norm": 0.16253505647182465, + "learning_rate": 9.382239382239381e-05, + "loss": 0.6245601177215576, + "memory(GiB)": 78.26, + "step": 81, + "token_acc": 0.8304850012464338, + "train_speed(iter/s)": 0.034267 + }, + { + "epoch": 0.015889163396793103, + "grad_norm": 0.15641474723815918, + "learning_rate": 9.498069498069497e-05, + "loss": 0.6229044795036316, + "memory(GiB)": 78.26, + "step": 82, + "token_acc": 0.8290063166096224, + "train_speed(iter/s)": 0.034278 + }, + { + "epoch": 0.016082933682119847, + "grad_norm": 0.19741681218147278, + "learning_rate": 9.613899613899614e-05, + "loss": 0.5958088636398315, + "memory(GiB)": 78.26, + "step": 83, + "token_acc": 0.8380999073922755, + "train_speed(iter/s)": 0.034294 + }, + { + "epoch": 0.01627670396744659, + "grad_norm": 0.16509681940078735, + "learning_rate": 9.72972972972973e-05, + "loss": 0.652334988117218, + "memory(GiB)": 78.26, + "step": 84, + "token_acc": 0.8223709473915191, + "train_speed(iter/s)": 0.034307 + }, + { + "epoch": 0.016470474252773336, + "grad_norm": 0.17223341763019562, + "learning_rate": 9.845559845559845e-05, + "loss": 0.655316174030304, + "memory(GiB)": 78.26, + "step": 85, + "token_acc": 0.82221954379727, + "train_speed(iter/s)": 0.034319 + }, + { + "epoch": 0.016664244538100084, + "grad_norm": 0.1611337959766388, + "learning_rate": 9.96138996138996e-05, + "loss": 0.6308309435844421, + "memory(GiB)": 78.26, + "step": 86, + "token_acc": 0.8256332148699723, + "train_speed(iter/s)": 0.034331 + }, + { + "epoch": 0.016858014823426828, + "grad_norm": 0.1462010145187378, + "learning_rate": 0.00010077220077220076, + "loss": 0.5816379189491272, + "memory(GiB)": 78.26, + "step": 87, + "token_acc": 0.8376213592233009, + "train_speed(iter/s)": 0.03434 + }, + { + "epoch": 0.017051785108753572, + "grad_norm": 0.15553230047225952, + "learning_rate": 0.00010193050193050192, + "loss": 0.5978987216949463, + "memory(GiB)": 78.26, + "step": 88, + "token_acc": 0.8393552427369511, + "train_speed(iter/s)": 0.03435 + }, + { + "epoch": 0.017245555394080316, + "grad_norm": 0.172433540225029, + "learning_rate": 0.00010308880308880307, + "loss": 0.6419100165367126, + "memory(GiB)": 78.26, + "step": 89, + "token_acc": 0.8243931496649293, + "train_speed(iter/s)": 0.034363 + }, + { + "epoch": 0.017439325679407064, + "grad_norm": 0.17001327872276306, + "learning_rate": 0.00010424710424710424, + "loss": 0.6268438100814819, + "memory(GiB)": 78.26, + "step": 90, + "token_acc": 0.8299691153761306, + "train_speed(iter/s)": 0.034376 + }, + { + "epoch": 0.017633095964733808, + "grad_norm": 0.19090093672275543, + "learning_rate": 0.0001054054054054054, + "loss": 0.6875048875808716, + "memory(GiB)": 78.26, + "step": 91, + "token_acc": 0.8123634272570442, + "train_speed(iter/s)": 0.034391 + }, + { + "epoch": 0.017826866250060552, + "grad_norm": 0.16166290640830994, + "learning_rate": 0.00010656370656370655, + "loss": 0.5984062552452087, + "memory(GiB)": 78.26, + "step": 92, + "token_acc": 0.8348241568976055, + "train_speed(iter/s)": 0.034401 + }, + { + "epoch": 0.0180206365353873, + "grad_norm": 0.14463870227336884, + "learning_rate": 0.00010772200772200771, + "loss": 0.5481261014938354, + "memory(GiB)": 78.26, + "step": 93, + "token_acc": 0.8472176412382793, + "train_speed(iter/s)": 0.034409 + }, + { + "epoch": 0.018214406820714044, + "grad_norm": 0.17527909576892853, + "learning_rate": 0.00010888030888030886, + "loss": 0.6553927063941956, + "memory(GiB)": 78.26, + "step": 94, + "token_acc": 0.8222487233587132, + "train_speed(iter/s)": 0.034415 + }, + { + "epoch": 0.01840817710604079, + "grad_norm": 0.16232283413410187, + "learning_rate": 0.00011003861003861002, + "loss": 0.6176364421844482, + "memory(GiB)": 78.26, + "step": 95, + "token_acc": 0.8296051451559177, + "train_speed(iter/s)": 0.034422 + }, + { + "epoch": 0.018601947391367533, + "grad_norm": 0.1550573855638504, + "learning_rate": 0.00011119691119691117, + "loss": 0.5948160886764526, + "memory(GiB)": 78.26, + "step": 96, + "token_acc": 0.8375553097345133, + "train_speed(iter/s)": 0.034429 + }, + { + "epoch": 0.01879571767669428, + "grad_norm": 0.16793379187583923, + "learning_rate": 0.00011235521235521234, + "loss": 0.6064925193786621, + "memory(GiB)": 78.26, + "step": 97, + "token_acc": 0.8337825430204662, + "train_speed(iter/s)": 0.034438 + }, + { + "epoch": 0.018989487962021025, + "grad_norm": 0.17881280183792114, + "learning_rate": 0.00011351351351351351, + "loss": 0.5967330932617188, + "memory(GiB)": 78.26, + "step": 98, + "token_acc": 0.829712168876606, + "train_speed(iter/s)": 0.03445 + }, + { + "epoch": 0.01918325824734777, + "grad_norm": 0.1565878838300705, + "learning_rate": 0.00011467181467181467, + "loss": 0.5600182414054871, + "memory(GiB)": 78.26, + "step": 99, + "token_acc": 0.8435590667538151, + "train_speed(iter/s)": 0.034461 + }, + { + "epoch": 0.019377028532674513, + "grad_norm": 0.15280094742774963, + "learning_rate": 0.00011583011583011582, + "loss": 0.5975373983383179, + "memory(GiB)": 78.26, + "step": 100, + "token_acc": 0.8333461057041408, + "train_speed(iter/s)": 0.034468 + }, + { + "epoch": 0.01957079881800126, + "grad_norm": 0.15495073795318604, + "learning_rate": 0.00011698841698841698, + "loss": 0.6017282009124756, + "memory(GiB)": 78.26, + "step": 101, + "token_acc": 0.8339026241596184, + "train_speed(iter/s)": 0.034475 + }, + { + "epoch": 0.019764569103328005, + "grad_norm": 0.14694905281066895, + "learning_rate": 0.00011814671814671814, + "loss": 0.5602938532829285, + "memory(GiB)": 78.26, + "step": 102, + "token_acc": 0.8411804083454201, + "train_speed(iter/s)": 0.034482 + }, + { + "epoch": 0.01995833938865475, + "grad_norm": 0.1619928628206253, + "learning_rate": 0.00011930501930501929, + "loss": 0.5849668979644775, + "memory(GiB)": 78.26, + "step": 103, + "token_acc": 0.8391262944887513, + "train_speed(iter/s)": 0.03449 + }, + { + "epoch": 0.020152109673981494, + "grad_norm": 0.1454261839389801, + "learning_rate": 0.00012046332046332045, + "loss": 0.5901338458061218, + "memory(GiB)": 78.26, + "step": 104, + "token_acc": 0.8340299917345614, + "train_speed(iter/s)": 0.034498 + }, + { + "epoch": 0.02034587995930824, + "grad_norm": 0.2236863374710083, + "learning_rate": 0.00012162162162162162, + "loss": 0.5838300585746765, + "memory(GiB)": 78.26, + "step": 105, + "token_acc": 0.8396284829721362, + "train_speed(iter/s)": 0.034506 + }, + { + "epoch": 0.020539650244634985, + "grad_norm": 0.15272359549999237, + "learning_rate": 0.00012277992277992278, + "loss": 0.5919955372810364, + "memory(GiB)": 78.26, + "step": 106, + "token_acc": 0.8308620948755198, + "train_speed(iter/s)": 0.034513 + }, + { + "epoch": 0.02073342052996173, + "grad_norm": 0.13382184505462646, + "learning_rate": 0.00012393822393822393, + "loss": 0.5290847420692444, + "memory(GiB)": 78.26, + "step": 107, + "token_acc": 0.8508332939898966, + "train_speed(iter/s)": 0.034521 + }, + { + "epoch": 0.020927190815288474, + "grad_norm": 0.1610032171010971, + "learning_rate": 0.0001250965250965251, + "loss": 0.6414130330085754, + "memory(GiB)": 78.26, + "step": 108, + "token_acc": 0.8247312177217657, + "train_speed(iter/s)": 0.034529 + }, + { + "epoch": 0.02112096110061522, + "grad_norm": 0.1506713628768921, + "learning_rate": 0.00012625482625482624, + "loss": 0.6079325675964355, + "memory(GiB)": 78.26, + "step": 109, + "token_acc": 0.8329596412556054, + "train_speed(iter/s)": 0.034536 + }, + { + "epoch": 0.021314731385941966, + "grad_norm": 0.16280919313430786, + "learning_rate": 0.0001274131274131274, + "loss": 0.6136084198951721, + "memory(GiB)": 78.26, + "step": 110, + "token_acc": 0.8316319235938349, + "train_speed(iter/s)": 0.034545 + }, + { + "epoch": 0.02150850167126871, + "grad_norm": 0.14740246534347534, + "learning_rate": 0.00012857142857142855, + "loss": 0.5931621789932251, + "memory(GiB)": 78.26, + "step": 111, + "token_acc": 0.8302635446262369, + "train_speed(iter/s)": 0.034549 + }, + { + "epoch": 0.021702271956595458, + "grad_norm": 0.15233321487903595, + "learning_rate": 0.00012972972972972972, + "loss": 0.5877476930618286, + "memory(GiB)": 78.26, + "step": 112, + "token_acc": 0.8364121451149842, + "train_speed(iter/s)": 0.034559 + }, + { + "epoch": 0.021896042241922202, + "grad_norm": 0.1440098136663437, + "learning_rate": 0.0001308880308880309, + "loss": 0.553906261920929, + "memory(GiB)": 78.26, + "step": 113, + "token_acc": 0.8466698357365446, + "train_speed(iter/s)": 0.034565 + }, + { + "epoch": 0.022089812527248946, + "grad_norm": 0.16069145500659943, + "learning_rate": 0.00013204633204633203, + "loss": 0.6046357750892639, + "memory(GiB)": 78.26, + "step": 114, + "token_acc": 0.832103537128133, + "train_speed(iter/s)": 0.034573 + }, + { + "epoch": 0.02228358281257569, + "grad_norm": 0.1648361086845398, + "learning_rate": 0.0001332046332046332, + "loss": 0.6004931330680847, + "memory(GiB)": 78.26, + "step": 115, + "token_acc": 0.8346290569636716, + "train_speed(iter/s)": 0.034581 + }, + { + "epoch": 0.022477353097902438, + "grad_norm": 0.15239156782627106, + "learning_rate": 0.00013436293436293434, + "loss": 0.6015850901603699, + "memory(GiB)": 78.26, + "step": 116, + "token_acc": 0.8332898444838743, + "train_speed(iter/s)": 0.034589 + }, + { + "epoch": 0.022671123383229182, + "grad_norm": 0.15159296989440918, + "learning_rate": 0.0001355212355212355, + "loss": 0.5867636203765869, + "memory(GiB)": 78.26, + "step": 117, + "token_acc": 0.8370697910212358, + "train_speed(iter/s)": 0.034594 + }, + { + "epoch": 0.022864893668555927, + "grad_norm": 0.1747017502784729, + "learning_rate": 0.00013667953667953665, + "loss": 0.636696457862854, + "memory(GiB)": 78.26, + "step": 118, + "token_acc": 0.8233567399562565, + "train_speed(iter/s)": 0.034599 + }, + { + "epoch": 0.02305866395388267, + "grad_norm": 0.15044157207012177, + "learning_rate": 0.00013783783783783782, + "loss": 0.5734996199607849, + "memory(GiB)": 78.26, + "step": 119, + "token_acc": 0.8417291220556745, + "train_speed(iter/s)": 0.034603 + }, + { + "epoch": 0.02325243423920942, + "grad_norm": 0.16326741874217987, + "learning_rate": 0.000138996138996139, + "loss": 0.5967447757720947, + "memory(GiB)": 78.26, + "step": 120, + "token_acc": 0.8323221786037829, + "train_speed(iter/s)": 0.034609 + }, + { + "epoch": 0.023446204524536163, + "grad_norm": 0.13857780396938324, + "learning_rate": 0.00014015444015444016, + "loss": 0.5407213568687439, + "memory(GiB)": 78.26, + "step": 121, + "token_acc": 0.8461235837180026, + "train_speed(iter/s)": 0.034612 + }, + { + "epoch": 0.023639974809862907, + "grad_norm": 0.13604214787483215, + "learning_rate": 0.0001413127413127413, + "loss": 0.5105344653129578, + "memory(GiB)": 78.26, + "step": 122, + "token_acc": 0.8567164915396379, + "train_speed(iter/s)": 0.034616 + }, + { + "epoch": 0.02383374509518965, + "grad_norm": 0.14465397596359253, + "learning_rate": 0.00014247104247104247, + "loss": 0.5738057494163513, + "memory(GiB)": 78.26, + "step": 123, + "token_acc": 0.8395190358188781, + "train_speed(iter/s)": 0.034624 + }, + { + "epoch": 0.0240275153805164, + "grad_norm": 0.14596134424209595, + "learning_rate": 0.0001436293436293436, + "loss": 0.5468869209289551, + "memory(GiB)": 78.26, + "step": 124, + "token_acc": 0.8466944373600839, + "train_speed(iter/s)": 0.034632 + }, + { + "epoch": 0.024221285665843143, + "grad_norm": 0.14446629583835602, + "learning_rate": 0.00014478764478764478, + "loss": 0.5509823560714722, + "memory(GiB)": 78.26, + "step": 125, + "token_acc": 0.8461442816999478, + "train_speed(iter/s)": 0.034635 + }, + { + "epoch": 0.024415055951169887, + "grad_norm": 0.13912013173103333, + "learning_rate": 0.00014594594594594595, + "loss": 0.5554008483886719, + "memory(GiB)": 78.26, + "step": 126, + "token_acc": 0.8426597276608645, + "train_speed(iter/s)": 0.03464 + }, + { + "epoch": 0.02460882623649663, + "grad_norm": 0.14441823959350586, + "learning_rate": 0.0001471042471042471, + "loss": 0.5533645749092102, + "memory(GiB)": 78.26, + "step": 127, + "token_acc": 0.8452828346917859, + "train_speed(iter/s)": 0.034642 + }, + { + "epoch": 0.02480259652182338, + "grad_norm": 0.1425672024488449, + "learning_rate": 0.00014826254826254826, + "loss": 0.5448683500289917, + "memory(GiB)": 78.26, + "step": 128, + "token_acc": 0.8480569849679332, + "train_speed(iter/s)": 0.034646 + }, + { + "epoch": 0.024996366807150124, + "grad_norm": 0.15022799372673035, + "learning_rate": 0.0001494208494208494, + "loss": 0.5586391687393188, + "memory(GiB)": 78.26, + "step": 129, + "token_acc": 0.843522056269538, + "train_speed(iter/s)": 0.034653 + }, + { + "epoch": 0.025190137092476868, + "grad_norm": 0.14275088906288147, + "learning_rate": 0.00015057915057915057, + "loss": 0.5584322810173035, + "memory(GiB)": 78.26, + "step": 130, + "token_acc": 0.8456992938407218, + "train_speed(iter/s)": 0.034657 + }, + { + "epoch": 0.025383907377803615, + "grad_norm": 0.15598376095294952, + "learning_rate": 0.0001517374517374517, + "loss": 0.5684648156166077, + "memory(GiB)": 78.26, + "step": 131, + "token_acc": 0.841918682337542, + "train_speed(iter/s)": 0.03466 + }, + { + "epoch": 0.02557767766313036, + "grad_norm": 0.13116468489170074, + "learning_rate": 0.00015289575289575288, + "loss": 0.518610954284668, + "memory(GiB)": 78.26, + "step": 132, + "token_acc": 0.8523458494613052, + "train_speed(iter/s)": 0.034665 + }, + { + "epoch": 0.025771447948457104, + "grad_norm": 0.140156552195549, + "learning_rate": 0.00015405405405405402, + "loss": 0.5425719022750854, + "memory(GiB)": 78.26, + "step": 133, + "token_acc": 0.8470962689771895, + "train_speed(iter/s)": 0.03467 + }, + { + "epoch": 0.025965218233783848, + "grad_norm": 0.15615837275981903, + "learning_rate": 0.0001552123552123552, + "loss": 0.58672034740448, + "memory(GiB)": 78.26, + "step": 134, + "token_acc": 0.8424782073907695, + "train_speed(iter/s)": 0.034673 + }, + { + "epoch": 0.026158988519110596, + "grad_norm": 0.1514168381690979, + "learning_rate": 0.00015637065637065634, + "loss": 0.6196274161338806, + "memory(GiB)": 78.26, + "step": 135, + "token_acc": 0.8292035111586176, + "train_speed(iter/s)": 0.034681 + }, + { + "epoch": 0.02635275880443734, + "grad_norm": 0.15724033117294312, + "learning_rate": 0.0001575289575289575, + "loss": 0.5608164668083191, + "memory(GiB)": 78.26, + "step": 136, + "token_acc": 0.8445253505933118, + "train_speed(iter/s)": 0.034686 + }, + { + "epoch": 0.026546529089764084, + "grad_norm": 0.17307862639427185, + "learning_rate": 0.0001586872586872587, + "loss": 0.6565619111061096, + "memory(GiB)": 78.26, + "step": 137, + "token_acc": 0.8214262132717068, + "train_speed(iter/s)": 0.034691 + }, + { + "epoch": 0.02674029937509083, + "grad_norm": 0.1405402421951294, + "learning_rate": 0.00015984555984555984, + "loss": 0.5477691888809204, + "memory(GiB)": 78.26, + "step": 138, + "token_acc": 0.8483550808604967, + "train_speed(iter/s)": 0.034696 + }, + { + "epoch": 0.026934069660417576, + "grad_norm": 0.15360456705093384, + "learning_rate": 0.000161003861003861, + "loss": 0.6261464357376099, + "memory(GiB)": 78.26, + "step": 139, + "token_acc": 0.8274440827470335, + "train_speed(iter/s)": 0.0347 + }, + { + "epoch": 0.02712783994574432, + "grad_norm": 0.12648895382881165, + "learning_rate": 0.00016216216216216215, + "loss": 0.4741262197494507, + "memory(GiB)": 78.26, + "step": 140, + "token_acc": 0.8678006997225238, + "train_speed(iter/s)": 0.034702 + }, + { + "epoch": 0.027321610231071065, + "grad_norm": 0.1415882259607315, + "learning_rate": 0.00016332046332046332, + "loss": 0.5620299577713013, + "memory(GiB)": 78.26, + "step": 141, + "token_acc": 0.8392420713649041, + "train_speed(iter/s)": 0.034706 + }, + { + "epoch": 0.02751538051639781, + "grad_norm": 0.143955260515213, + "learning_rate": 0.00016447876447876446, + "loss": 0.5819345712661743, + "memory(GiB)": 78.26, + "step": 142, + "token_acc": 0.8385089572540735, + "train_speed(iter/s)": 0.03471 + }, + { + "epoch": 0.027709150801724557, + "grad_norm": 0.14960965514183044, + "learning_rate": 0.00016563706563706563, + "loss": 0.5373987555503845, + "memory(GiB)": 78.26, + "step": 143, + "token_acc": 0.8491564847739486, + "train_speed(iter/s)": 0.034715 + }, + { + "epoch": 0.0279029210870513, + "grad_norm": 0.13836169242858887, + "learning_rate": 0.00016679536679536678, + "loss": 0.5637241005897522, + "memory(GiB)": 78.26, + "step": 144, + "token_acc": 0.8398485764610005, + "train_speed(iter/s)": 0.034719 + }, + { + "epoch": 0.028096691372378045, + "grad_norm": 0.15270289778709412, + "learning_rate": 0.00016795366795366795, + "loss": 0.6161123514175415, + "memory(GiB)": 78.26, + "step": 145, + "token_acc": 0.828073843783531, + "train_speed(iter/s)": 0.034725 + }, + { + "epoch": 0.02829046165770479, + "grad_norm": 0.14659975469112396, + "learning_rate": 0.0001691119691119691, + "loss": 0.5577420592308044, + "memory(GiB)": 78.26, + "step": 146, + "token_acc": 0.8441303927792653, + "train_speed(iter/s)": 0.034731 + }, + { + "epoch": 0.028484231943031537, + "grad_norm": 0.13934315741062164, + "learning_rate": 0.00017027027027027026, + "loss": 0.5665932893753052, + "memory(GiB)": 78.26, + "step": 147, + "token_acc": 0.8396104567059309, + "train_speed(iter/s)": 0.034731 + }, + { + "epoch": 0.02867800222835828, + "grad_norm": 0.15382340550422668, + "learning_rate": 0.0001714285714285714, + "loss": 0.6315574645996094, + "memory(GiB)": 78.26, + "step": 148, + "token_acc": 0.8281886687133687, + "train_speed(iter/s)": 0.034735 + }, + { + "epoch": 0.028871772513685025, + "grad_norm": 0.15263354778289795, + "learning_rate": 0.00017258687258687257, + "loss": 0.5572026968002319, + "memory(GiB)": 78.26, + "step": 149, + "token_acc": 0.8416983617968031, + "train_speed(iter/s)": 0.034739 + }, + { + "epoch": 0.029065542799011773, + "grad_norm": 0.14366725087165833, + "learning_rate": 0.0001737451737451737, + "loss": 0.5455084443092346, + "memory(GiB)": 78.26, + "step": 150, + "token_acc": 0.8478349198978408, + "train_speed(iter/s)": 0.034744 + }, + { + "epoch": 0.029259313084338517, + "grad_norm": 0.14723645150661469, + "learning_rate": 0.00017490347490347488, + "loss": 0.5214795470237732, + "memory(GiB)": 78.26, + "step": 151, + "token_acc": 0.8530382721575649, + "train_speed(iter/s)": 0.034748 + }, + { + "epoch": 0.02945308336966526, + "grad_norm": 0.1369503289461136, + "learning_rate": 0.00017606177606177607, + "loss": 0.5099643468856812, + "memory(GiB)": 78.26, + "step": 152, + "token_acc": 0.8562671739589939, + "train_speed(iter/s)": 0.034751 + }, + { + "epoch": 0.029646853654992006, + "grad_norm": 0.16232426464557648, + "learning_rate": 0.00017722007722007722, + "loss": 0.6399052143096924, + "memory(GiB)": 78.26, + "step": 153, + "token_acc": 0.8246472248353716, + "train_speed(iter/s)": 0.034755 + }, + { + "epoch": 0.029840623940318754, + "grad_norm": 0.15433335304260254, + "learning_rate": 0.00017837837837837839, + "loss": 0.5725199580192566, + "memory(GiB)": 78.26, + "step": 154, + "token_acc": 0.8409897602206277, + "train_speed(iter/s)": 0.034758 + }, + { + "epoch": 0.030034394225645498, + "grad_norm": 0.13948160409927368, + "learning_rate": 0.00017953667953667953, + "loss": 0.5367989540100098, + "memory(GiB)": 78.26, + "step": 155, + "token_acc": 0.8473491551719958, + "train_speed(iter/s)": 0.034762 + }, + { + "epoch": 0.030228164510972242, + "grad_norm": 0.14237907528877258, + "learning_rate": 0.0001806949806949807, + "loss": 0.5689682364463806, + "memory(GiB)": 78.26, + "step": 156, + "token_acc": 0.8421181996941228, + "train_speed(iter/s)": 0.034765 + }, + { + "epoch": 0.030421934796298986, + "grad_norm": 0.13025622069835663, + "learning_rate": 0.00018185328185328184, + "loss": 0.5721440315246582, + "memory(GiB)": 78.26, + "step": 157, + "token_acc": 0.8421661117277099, + "train_speed(iter/s)": 0.034767 + }, + { + "epoch": 0.030615705081625734, + "grad_norm": 0.15070052444934845, + "learning_rate": 0.000183011583011583, + "loss": 0.5769027471542358, + "memory(GiB)": 78.26, + "step": 158, + "token_acc": 0.8369094922737307, + "train_speed(iter/s)": 0.034772 + }, + { + "epoch": 0.030809475366952478, + "grad_norm": 0.15805287659168243, + "learning_rate": 0.00018416988416988415, + "loss": 0.6211342215538025, + "memory(GiB)": 78.26, + "step": 159, + "token_acc": 0.8280907315607631, + "train_speed(iter/s)": 0.034775 + }, + { + "epoch": 0.031003245652279222, + "grad_norm": 0.1476234793663025, + "learning_rate": 0.00018532818532818532, + "loss": 0.5813598036766052, + "memory(GiB)": 78.26, + "step": 160, + "token_acc": 0.8377501760032183, + "train_speed(iter/s)": 0.034778 + }, + { + "epoch": 0.031197015937605967, + "grad_norm": 0.1391780972480774, + "learning_rate": 0.00018648648648648646, + "loss": 0.5335437059402466, + "memory(GiB)": 78.26, + "step": 161, + "token_acc": 0.848728354978355, + "train_speed(iter/s)": 0.034779 + }, + { + "epoch": 0.03139078622293271, + "grad_norm": 0.1483832597732544, + "learning_rate": 0.00018764478764478763, + "loss": 0.5789586305618286, + "memory(GiB)": 78.26, + "step": 162, + "token_acc": 0.8385598141695703, + "train_speed(iter/s)": 0.034781 + }, + { + "epoch": 0.031584556508259455, + "grad_norm": 0.14238472282886505, + "learning_rate": 0.00018880308880308877, + "loss": 0.5635126233100891, + "memory(GiB)": 78.26, + "step": 163, + "token_acc": 0.8438286425038327, + "train_speed(iter/s)": 0.034786 + }, + { + "epoch": 0.031778326793586206, + "grad_norm": 0.12377775460481644, + "learning_rate": 0.00018996138996138994, + "loss": 0.493597149848938, + "memory(GiB)": 78.26, + "step": 164, + "token_acc": 0.8575653587393326, + "train_speed(iter/s)": 0.034785 + }, + { + "epoch": 0.03197209707891295, + "grad_norm": 0.13860811293125153, + "learning_rate": 0.00019111969111969108, + "loss": 0.5463928580284119, + "memory(GiB)": 78.26, + "step": 165, + "token_acc": 0.844146549588483, + "train_speed(iter/s)": 0.034789 + }, + { + "epoch": 0.032165867364239695, + "grad_norm": 0.13424870371818542, + "learning_rate": 0.00019227799227799228, + "loss": 0.5533477067947388, + "memory(GiB)": 78.26, + "step": 166, + "token_acc": 0.8481163938685373, + "train_speed(iter/s)": 0.03479 + }, + { + "epoch": 0.03235963764956644, + "grad_norm": 0.12482903897762299, + "learning_rate": 0.00019343629343629342, + "loss": 0.5704807043075562, + "memory(GiB)": 78.26, + "step": 167, + "token_acc": 0.8384424192212097, + "train_speed(iter/s)": 0.034792 + }, + { + "epoch": 0.03255340793489318, + "grad_norm": 0.13756157457828522, + "learning_rate": 0.0001945945945945946, + "loss": 0.5534645318984985, + "memory(GiB)": 78.26, + "step": 168, + "token_acc": 0.8454033863318016, + "train_speed(iter/s)": 0.034794 + }, + { + "epoch": 0.03274717822021993, + "grad_norm": 0.13826580345630646, + "learning_rate": 0.00019575289575289573, + "loss": 0.534266471862793, + "memory(GiB)": 78.26, + "step": 169, + "token_acc": 0.8489137497528039, + "train_speed(iter/s)": 0.034796 + }, + { + "epoch": 0.03294094850554667, + "grad_norm": 0.14315061271190643, + "learning_rate": 0.0001969111969111969, + "loss": 0.5491966009140015, + "memory(GiB)": 78.26, + "step": 170, + "token_acc": 0.8464807519310109, + "train_speed(iter/s)": 0.034798 + }, + { + "epoch": 0.03313471879087342, + "grad_norm": 0.12808647751808167, + "learning_rate": 0.00019806949806949804, + "loss": 0.5022028684616089, + "memory(GiB)": 78.26, + "step": 171, + "token_acc": 0.8551667255802993, + "train_speed(iter/s)": 0.034798 + }, + { + "epoch": 0.03332848907620017, + "grad_norm": 0.14019440114498138, + "learning_rate": 0.0001992277992277992, + "loss": 0.5900301933288574, + "memory(GiB)": 78.26, + "step": 172, + "token_acc": 0.8387483645878762, + "train_speed(iter/s)": 0.034802 + }, + { + "epoch": 0.03352225936152691, + "grad_norm": 0.13789018988609314, + "learning_rate": 0.00020038610038610038, + "loss": 0.541344940662384, + "memory(GiB)": 78.26, + "step": 173, + "token_acc": 0.8496686986482905, + "train_speed(iter/s)": 0.034806 + }, + { + "epoch": 0.033716029646853655, + "grad_norm": 0.1350238174200058, + "learning_rate": 0.00020154440154440152, + "loss": 0.5568801164627075, + "memory(GiB)": 78.26, + "step": 174, + "token_acc": 0.8458727972794469, + "train_speed(iter/s)": 0.034809 + }, + { + "epoch": 0.0339097999321804, + "grad_norm": 0.14095835387706757, + "learning_rate": 0.0002027027027027027, + "loss": 0.5708956122398376, + "memory(GiB)": 78.26, + "step": 175, + "token_acc": 0.845259973423054, + "train_speed(iter/s)": 0.034811 + }, + { + "epoch": 0.034103570217507144, + "grad_norm": 0.13087715208530426, + "learning_rate": 0.00020386100386100383, + "loss": 0.5266304016113281, + "memory(GiB)": 78.26, + "step": 176, + "token_acc": 0.8541874507744814, + "train_speed(iter/s)": 0.034813 + }, + { + "epoch": 0.03429734050283389, + "grad_norm": 0.13467884063720703, + "learning_rate": 0.000205019305019305, + "loss": 0.4999612867832184, + "memory(GiB)": 78.26, + "step": 177, + "token_acc": 0.859081674116213, + "train_speed(iter/s)": 0.034816 + }, + { + "epoch": 0.03449111078816063, + "grad_norm": 0.1308935433626175, + "learning_rate": 0.00020617760617760615, + "loss": 0.5238428115844727, + "memory(GiB)": 78.26, + "step": 178, + "token_acc": 0.8521208527878653, + "train_speed(iter/s)": 0.034818 + }, + { + "epoch": 0.034684881073487384, + "grad_norm": 0.1268441081047058, + "learning_rate": 0.00020733590733590731, + "loss": 0.5368191599845886, + "memory(GiB)": 78.26, + "step": 179, + "token_acc": 0.8482607365092318, + "train_speed(iter/s)": 0.034819 + }, + { + "epoch": 0.03487865135881413, + "grad_norm": 0.12708643078804016, + "learning_rate": 0.00020849420849420848, + "loss": 0.49307650327682495, + "memory(GiB)": 78.26, + "step": 180, + "token_acc": 0.8607868020304569, + "train_speed(iter/s)": 0.034821 + }, + { + "epoch": 0.03507242164414087, + "grad_norm": 0.12540721893310547, + "learning_rate": 0.00020965250965250965, + "loss": 0.507483720779419, + "memory(GiB)": 78.26, + "step": 181, + "token_acc": 0.8567486597287922, + "train_speed(iter/s)": 0.034824 + }, + { + "epoch": 0.035266191929467616, + "grad_norm": 0.1270364373922348, + "learning_rate": 0.0002108108108108108, + "loss": 0.5022760629653931, + "memory(GiB)": 78.26, + "step": 182, + "token_acc": 0.8590840060746543, + "train_speed(iter/s)": 0.034825 + }, + { + "epoch": 0.03545996221479436, + "grad_norm": 0.14542265236377716, + "learning_rate": 0.00021196911196911196, + "loss": 0.516878604888916, + "memory(GiB)": 78.26, + "step": 183, + "token_acc": 0.8556428363617128, + "train_speed(iter/s)": 0.034829 + }, + { + "epoch": 0.035653732500121105, + "grad_norm": 0.14277629554271698, + "learning_rate": 0.0002131274131274131, + "loss": 0.5679965019226074, + "memory(GiB)": 78.26, + "step": 184, + "token_acc": 0.841894944113048, + "train_speed(iter/s)": 0.034833 + }, + { + "epoch": 0.03584750278544785, + "grad_norm": 0.13046088814735413, + "learning_rate": 0.00021428571428571427, + "loss": 0.5434874296188354, + "memory(GiB)": 78.26, + "step": 185, + "token_acc": 0.8491733876369872, + "train_speed(iter/s)": 0.034835 + }, + { + "epoch": 0.0360412730707746, + "grad_norm": 0.12494053691625595, + "learning_rate": 0.00021544401544401542, + "loss": 0.537643313407898, + "memory(GiB)": 78.26, + "step": 186, + "token_acc": 0.8506308711310349, + "train_speed(iter/s)": 0.034836 + }, + { + "epoch": 0.036235043356101344, + "grad_norm": 0.12971843779087067, + "learning_rate": 0.00021660231660231659, + "loss": 0.5533092617988586, + "memory(GiB)": 78.26, + "step": 187, + "token_acc": 0.845800933125972, + "train_speed(iter/s)": 0.034839 + }, + { + "epoch": 0.03642881364142809, + "grad_norm": 0.12924546003341675, + "learning_rate": 0.00021776061776061773, + "loss": 0.5168436765670776, + "memory(GiB)": 78.26, + "step": 188, + "token_acc": 0.8556058890147226, + "train_speed(iter/s)": 0.034839 + }, + { + "epoch": 0.03662258392675483, + "grad_norm": 0.1508742719888687, + "learning_rate": 0.0002189189189189189, + "loss": 0.6136130690574646, + "memory(GiB)": 78.26, + "step": 189, + "token_acc": 0.8350558426404401, + "train_speed(iter/s)": 0.034842 + }, + { + "epoch": 0.03681635421208158, + "grad_norm": 0.1334015280008316, + "learning_rate": 0.00022007722007722004, + "loss": 0.545418918132782, + "memory(GiB)": 78.26, + "step": 190, + "token_acc": 0.8462031558185404, + "train_speed(iter/s)": 0.034844 + }, + { + "epoch": 0.03701012449740832, + "grad_norm": 0.13890019059181213, + "learning_rate": 0.0002212355212355212, + "loss": 0.5522346496582031, + "memory(GiB)": 78.26, + "step": 191, + "token_acc": 0.8448294130112312, + "train_speed(iter/s)": 0.034845 + }, + { + "epoch": 0.037203894782735066, + "grad_norm": 0.13158449530601501, + "learning_rate": 0.00022239382239382235, + "loss": 0.5597431063652039, + "memory(GiB)": 78.26, + "step": 192, + "token_acc": 0.8428118697781619, + "train_speed(iter/s)": 0.034848 + }, + { + "epoch": 0.03739766506806181, + "grad_norm": 0.13762398064136505, + "learning_rate": 0.00022355212355212352, + "loss": 0.5332627892494202, + "memory(GiB)": 78.26, + "step": 193, + "token_acc": 0.8513293253173013, + "train_speed(iter/s)": 0.034851 + }, + { + "epoch": 0.03759143535338856, + "grad_norm": 0.13980168104171753, + "learning_rate": 0.0002247104247104247, + "loss": 0.5322573184967041, + "memory(GiB)": 78.26, + "step": 194, + "token_acc": 0.85326330790953, + "train_speed(iter/s)": 0.034851 + }, + { + "epoch": 0.037785205638715305, + "grad_norm": 0.12236251682043076, + "learning_rate": 0.00022586872586872586, + "loss": 0.5012400150299072, + "memory(GiB)": 78.26, + "step": 195, + "token_acc": 0.8592677644111584, + "train_speed(iter/s)": 0.034853 + }, + { + "epoch": 0.03797897592404205, + "grad_norm": 0.14247579872608185, + "learning_rate": 0.00022702702702702703, + "loss": 0.5467385053634644, + "memory(GiB)": 78.26, + "step": 196, + "token_acc": 0.8513190123595822, + "train_speed(iter/s)": 0.034856 + }, + { + "epoch": 0.038172746209368794, + "grad_norm": 0.12214695662260056, + "learning_rate": 0.00022818532818532817, + "loss": 0.4757518768310547, + "memory(GiB)": 78.26, + "step": 197, + "token_acc": 0.8650738047495954, + "train_speed(iter/s)": 0.034857 + }, + { + "epoch": 0.03836651649469554, + "grad_norm": 0.12631294131278992, + "learning_rate": 0.00022934362934362934, + "loss": 0.5603289604187012, + "memory(GiB)": 78.26, + "step": 198, + "token_acc": 0.8452558741537236, + "train_speed(iter/s)": 0.034857 + }, + { + "epoch": 0.03856028678002228, + "grad_norm": 0.12949825823307037, + "learning_rate": 0.00023050193050193048, + "loss": 0.5440013408660889, + "memory(GiB)": 78.26, + "step": 199, + "token_acc": 0.8481728066281994, + "train_speed(iter/s)": 0.03486 + }, + { + "epoch": 0.038754057065349026, + "grad_norm": 0.13039319217205048, + "learning_rate": 0.00023166023166023165, + "loss": 0.5734332799911499, + "memory(GiB)": 78.26, + "step": 200, + "token_acc": 0.8401181070071356, + "train_speed(iter/s)": 0.034862 + }, + { + "epoch": 0.03894782735067577, + "grad_norm": 0.13796895742416382, + "learning_rate": 0.0002328185328185328, + "loss": 0.5982975959777832, + "memory(GiB)": 78.26, + "step": 201, + "token_acc": 0.832787772216962, + "train_speed(iter/s)": 0.03479 + }, + { + "epoch": 0.03914159763600252, + "grad_norm": 0.12662553787231445, + "learning_rate": 0.00023397683397683396, + "loss": 0.5020790696144104, + "memory(GiB)": 78.26, + "step": 202, + "token_acc": 0.858308341381589, + "train_speed(iter/s)": 0.034793 + }, + { + "epoch": 0.039335367921329266, + "grad_norm": 0.13133689761161804, + "learning_rate": 0.0002351351351351351, + "loss": 0.48208650946617126, + "memory(GiB)": 78.26, + "step": 203, + "token_acc": 0.8654059206966738, + "train_speed(iter/s)": 0.034794 + }, + { + "epoch": 0.03952913820665601, + "grad_norm": 0.12844805419445038, + "learning_rate": 0.00023629343629343627, + "loss": 0.5480844974517822, + "memory(GiB)": 78.26, + "step": 204, + "token_acc": 0.8480197137937178, + "train_speed(iter/s)": 0.034797 + }, + { + "epoch": 0.039722908491982754, + "grad_norm": 0.13410721719264984, + "learning_rate": 0.0002374517374517374, + "loss": 0.524722695350647, + "memory(GiB)": 78.26, + "step": 205, + "token_acc": 0.8535315555041361, + "train_speed(iter/s)": 0.034801 + }, + { + "epoch": 0.0399166787773095, + "grad_norm": 0.15366047620773315, + "learning_rate": 0.00023861003861003858, + "loss": 0.593670129776001, + "memory(GiB)": 78.26, + "step": 206, + "token_acc": 0.834993270524899, + "train_speed(iter/s)": 0.034805 + }, + { + "epoch": 0.04011044906263624, + "grad_norm": 0.1396535038948059, + "learning_rate": 0.00023976833976833972, + "loss": 0.5373449325561523, + "memory(GiB)": 78.26, + "step": 207, + "token_acc": 0.8462800580988756, + "train_speed(iter/s)": 0.034806 + }, + { + "epoch": 0.04030421934796299, + "grad_norm": 0.14069020748138428, + "learning_rate": 0.0002409266409266409, + "loss": 0.583406925201416, + "memory(GiB)": 78.26, + "step": 208, + "token_acc": 0.8375456332490874, + "train_speed(iter/s)": 0.034809 + }, + { + "epoch": 0.04049798963328974, + "grad_norm": 0.13054059445858002, + "learning_rate": 0.0002420849420849421, + "loss": 0.5496135950088501, + "memory(GiB)": 78.26, + "step": 209, + "token_acc": 0.843966505507651, + "train_speed(iter/s)": 0.034811 + }, + { + "epoch": 0.04069175991861648, + "grad_norm": 0.14763560891151428, + "learning_rate": 0.00024324324324324323, + "loss": 0.5195255279541016, + "memory(GiB)": 78.26, + "step": 210, + "token_acc": 0.8529995042141795, + "train_speed(iter/s)": 0.034814 + }, + { + "epoch": 0.04088553020394323, + "grad_norm": 0.12177236378192902, + "learning_rate": 0.0002444015444015444, + "loss": 0.49852877855300903, + "memory(GiB)": 78.26, + "step": 211, + "token_acc": 0.8585345707205675, + "train_speed(iter/s)": 0.034815 + }, + { + "epoch": 0.04107930048926997, + "grad_norm": 0.1217300221323967, + "learning_rate": 0.00024555984555984557, + "loss": 0.48390355706214905, + "memory(GiB)": 78.26, + "step": 212, + "token_acc": 0.8644299537231804, + "train_speed(iter/s)": 0.034818 + }, + { + "epoch": 0.041273070774596715, + "grad_norm": 0.14275750517845154, + "learning_rate": 0.0002467181467181467, + "loss": 0.5250924825668335, + "memory(GiB)": 78.26, + "step": 213, + "token_acc": 0.8536855100046294, + "train_speed(iter/s)": 0.03482 + }, + { + "epoch": 0.04146684105992346, + "grad_norm": 0.1468067467212677, + "learning_rate": 0.00024787644787644785, + "loss": 0.5309884548187256, + "memory(GiB)": 78.26, + "step": 214, + "token_acc": 0.8531648971912378, + "train_speed(iter/s)": 0.034823 + }, + { + "epoch": 0.041660611345250204, + "grad_norm": 0.12400522828102112, + "learning_rate": 0.000249034749034749, + "loss": 0.5384760499000549, + "memory(GiB)": 78.26, + "step": 215, + "token_acc": 0.8493547014607857, + "train_speed(iter/s)": 0.034824 + }, + { + "epoch": 0.04185438163057695, + "grad_norm": 0.14342345297336578, + "learning_rate": 0.0002501930501930502, + "loss": 0.5606729388237, + "memory(GiB)": 78.26, + "step": 216, + "token_acc": 0.8429284181681875, + "train_speed(iter/s)": 0.034825 + }, + { + "epoch": 0.0420481519159037, + "grad_norm": 0.1313794106245041, + "learning_rate": 0.0002513513513513513, + "loss": 0.5184580087661743, + "memory(GiB)": 78.26, + "step": 217, + "token_acc": 0.8552152612420286, + "train_speed(iter/s)": 0.034827 + }, + { + "epoch": 0.04224192220123044, + "grad_norm": 0.13364368677139282, + "learning_rate": 0.0002525096525096525, + "loss": 0.5389662981033325, + "memory(GiB)": 78.26, + "step": 218, + "token_acc": 0.8503248639797159, + "train_speed(iter/s)": 0.034829 + }, + { + "epoch": 0.04243569248655719, + "grad_norm": 0.1303595006465912, + "learning_rate": 0.00025366795366795364, + "loss": 0.5534095764160156, + "memory(GiB)": 78.26, + "step": 219, + "token_acc": 0.8464527027027027, + "train_speed(iter/s)": 0.034832 + }, + { + "epoch": 0.04262946277188393, + "grad_norm": 0.13916410505771637, + "learning_rate": 0.0002548262548262548, + "loss": 0.5153782963752747, + "memory(GiB)": 78.26, + "step": 220, + "token_acc": 0.8532017429948345, + "train_speed(iter/s)": 0.034834 + }, + { + "epoch": 0.042823233057210676, + "grad_norm": 0.12516328692436218, + "learning_rate": 0.00025598455598455593, + "loss": 0.5454630851745605, + "memory(GiB)": 78.26, + "step": 221, + "token_acc": 0.8455304060358847, + "train_speed(iter/s)": 0.034837 + }, + { + "epoch": 0.04301700334253742, + "grad_norm": 0.12558779120445251, + "learning_rate": 0.0002571428571428571, + "loss": 0.5476839542388916, + "memory(GiB)": 78.26, + "step": 222, + "token_acc": 0.8491302437385303, + "train_speed(iter/s)": 0.034838 + }, + { + "epoch": 0.043210773627864164, + "grad_norm": 0.1301163285970688, + "learning_rate": 0.00025830115830115827, + "loss": 0.5624793171882629, + "memory(GiB)": 78.26, + "step": 223, + "token_acc": 0.8417110837775045, + "train_speed(iter/s)": 0.03484 + }, + { + "epoch": 0.043404543913190916, + "grad_norm": 0.1233832985162735, + "learning_rate": 0.00025945945945945944, + "loss": 0.5634196996688843, + "memory(GiB)": 78.26, + "step": 224, + "token_acc": 0.8440744270023572, + "train_speed(iter/s)": 0.034841 + }, + { + "epoch": 0.04359831419851766, + "grad_norm": 0.11998777091503143, + "learning_rate": 0.0002606177606177606, + "loss": 0.5352566838264465, + "memory(GiB)": 78.26, + "step": 225, + "token_acc": 0.850525063369717, + "train_speed(iter/s)": 0.034843 + }, + { + "epoch": 0.043792084483844404, + "grad_norm": 0.11935053765773773, + "learning_rate": 0.0002617760617760618, + "loss": 0.5280268788337708, + "memory(GiB)": 78.26, + "step": 226, + "token_acc": 0.8505288461538462, + "train_speed(iter/s)": 0.034845 + }, + { + "epoch": 0.04398585476917115, + "grad_norm": 0.11708512902259827, + "learning_rate": 0.00026293436293436294, + "loss": 0.5135525465011597, + "memory(GiB)": 78.26, + "step": 227, + "token_acc": 0.8566448341432495, + "train_speed(iter/s)": 0.034846 + }, + { + "epoch": 0.04417962505449789, + "grad_norm": 0.1259176880121231, + "learning_rate": 0.00026409266409266406, + "loss": 0.5575259327888489, + "memory(GiB)": 78.26, + "step": 228, + "token_acc": 0.8424169123390531, + "train_speed(iter/s)": 0.034847 + }, + { + "epoch": 0.04437339533982464, + "grad_norm": 0.12446990609169006, + "learning_rate": 0.0002652509652509652, + "loss": 0.5132482647895813, + "memory(GiB)": 78.26, + "step": 229, + "token_acc": 0.8576641341938295, + "train_speed(iter/s)": 0.034848 + }, + { + "epoch": 0.04456716562515138, + "grad_norm": 0.1376199871301651, + "learning_rate": 0.0002664092664092664, + "loss": 0.5118637681007385, + "memory(GiB)": 78.26, + "step": 230, + "token_acc": 0.8575808249721293, + "train_speed(iter/s)": 0.034849 + }, + { + "epoch": 0.044760935910478125, + "grad_norm": 0.13398443162441254, + "learning_rate": 0.00026756756756756756, + "loss": 0.543663501739502, + "memory(GiB)": 78.26, + "step": 231, + "token_acc": 0.8458760878036109, + "train_speed(iter/s)": 0.034849 + }, + { + "epoch": 0.044954706195804876, + "grad_norm": 0.13235385715961456, + "learning_rate": 0.0002687258687258687, + "loss": 0.5783892869949341, + "memory(GiB)": 78.26, + "step": 232, + "token_acc": 0.8393900968051217, + "train_speed(iter/s)": 0.034852 + }, + { + "epoch": 0.04514847648113162, + "grad_norm": 0.13555991649627686, + "learning_rate": 0.00026988416988416985, + "loss": 0.5653671026229858, + "memory(GiB)": 78.26, + "step": 233, + "token_acc": 0.8414533928152148, + "train_speed(iter/s)": 0.034855 + }, + { + "epoch": 0.045342246766458365, + "grad_norm": 0.1324978917837143, + "learning_rate": 0.000271042471042471, + "loss": 0.561633825302124, + "memory(GiB)": 78.26, + "step": 234, + "token_acc": 0.845039593124316, + "train_speed(iter/s)": 0.034858 + }, + { + "epoch": 0.04553601705178511, + "grad_norm": 0.1257573366165161, + "learning_rate": 0.0002722007722007722, + "loss": 0.5691174864768982, + "memory(GiB)": 78.26, + "step": 235, + "token_acc": 0.8414384744097102, + "train_speed(iter/s)": 0.03486 + }, + { + "epoch": 0.04572978733711185, + "grad_norm": 0.12331625819206238, + "learning_rate": 0.0002733590733590733, + "loss": 0.5002002120018005, + "memory(GiB)": 78.26, + "step": 236, + "token_acc": 0.8585492089747352, + "train_speed(iter/s)": 0.034862 + }, + { + "epoch": 0.0459235576224386, + "grad_norm": 0.12218355387449265, + "learning_rate": 0.00027451737451737447, + "loss": 0.48562729358673096, + "memory(GiB)": 78.26, + "step": 237, + "token_acc": 0.8611403640740184, + "train_speed(iter/s)": 0.034864 + }, + { + "epoch": 0.04611732790776534, + "grad_norm": 0.12444531172513962, + "learning_rate": 0.00027567567567567564, + "loss": 0.5203258991241455, + "memory(GiB)": 78.26, + "step": 238, + "token_acc": 0.8543315991857046, + "train_speed(iter/s)": 0.034866 + }, + { + "epoch": 0.046311098193092086, + "grad_norm": 0.12778066098690033, + "learning_rate": 0.0002768339768339768, + "loss": 0.5473844408988953, + "memory(GiB)": 78.26, + "step": 239, + "token_acc": 0.8458585640138409, + "train_speed(iter/s)": 0.034867 + }, + { + "epoch": 0.04650486847841884, + "grad_norm": 0.12521140277385712, + "learning_rate": 0.000277992277992278, + "loss": 0.5302430391311646, + "memory(GiB)": 78.26, + "step": 240, + "token_acc": 0.8497068457705965, + "train_speed(iter/s)": 0.034868 + }, + { + "epoch": 0.04669863876374558, + "grad_norm": 0.13087455928325653, + "learning_rate": 0.00027915057915057915, + "loss": 0.52344810962677, + "memory(GiB)": 78.26, + "step": 241, + "token_acc": 0.8509464638253397, + "train_speed(iter/s)": 0.034871 + }, + { + "epoch": 0.046892409049072326, + "grad_norm": 0.12300854921340942, + "learning_rate": 0.0002803088803088803, + "loss": 0.5085083842277527, + "memory(GiB)": 78.26, + "step": 242, + "token_acc": 0.8563601071279393, + "train_speed(iter/s)": 0.034873 + }, + { + "epoch": 0.04708617933439907, + "grad_norm": 0.1252821385860443, + "learning_rate": 0.00028146718146718143, + "loss": 0.5789982080459595, + "memory(GiB)": 78.26, + "step": 243, + "token_acc": 0.8396179117080508, + "train_speed(iter/s)": 0.034874 + }, + { + "epoch": 0.047279949619725814, + "grad_norm": 0.12332039326429367, + "learning_rate": 0.0002826254826254826, + "loss": 0.5306107997894287, + "memory(GiB)": 78.26, + "step": 244, + "token_acc": 0.8511578885733525, + "train_speed(iter/s)": 0.034877 + }, + { + "epoch": 0.04747371990505256, + "grad_norm": 0.11339928209781647, + "learning_rate": 0.00028378378378378377, + "loss": 0.533706784248352, + "memory(GiB)": 78.26, + "step": 245, + "token_acc": 0.8514316174230886, + "train_speed(iter/s)": 0.034877 + }, + { + "epoch": 0.0476674901903793, + "grad_norm": 0.12364498525857925, + "learning_rate": 0.00028494208494208494, + "loss": 0.5375621914863586, + "memory(GiB)": 78.26, + "step": 246, + "token_acc": 0.8459392614747498, + "train_speed(iter/s)": 0.03488 + }, + { + "epoch": 0.047861260475706054, + "grad_norm": 0.12653161585330963, + "learning_rate": 0.00028610038610038605, + "loss": 0.4905088245868683, + "memory(GiB)": 78.26, + "step": 247, + "token_acc": 0.8620850743557136, + "train_speed(iter/s)": 0.034882 + }, + { + "epoch": 0.0480550307610328, + "grad_norm": 0.1326380968093872, + "learning_rate": 0.0002872586872586872, + "loss": 0.5300474762916565, + "memory(GiB)": 78.26, + "step": 248, + "token_acc": 0.8534217764115839, + "train_speed(iter/s)": 0.034883 + }, + { + "epoch": 0.04824880104635954, + "grad_norm": 0.11917278915643692, + "learning_rate": 0.0002884169884169884, + "loss": 0.5086590647697449, + "memory(GiB)": 78.26, + "step": 249, + "token_acc": 0.8568408610460755, + "train_speed(iter/s)": 0.034884 + }, + { + "epoch": 0.048442571331686286, + "grad_norm": 0.12199165672063828, + "learning_rate": 0.00028957528957528956, + "loss": 0.5237119197845459, + "memory(GiB)": 78.26, + "step": 250, + "token_acc": 0.8521546095586315, + "train_speed(iter/s)": 0.034885 + }, + { + "epoch": 0.04863634161701303, + "grad_norm": 0.11789362877607346, + "learning_rate": 0.0002907335907335907, + "loss": 0.4931395649909973, + "memory(GiB)": 78.26, + "step": 251, + "token_acc": 0.8595119082622757, + "train_speed(iter/s)": 0.034887 + }, + { + "epoch": 0.048830111902339775, + "grad_norm": 0.11933058500289917, + "learning_rate": 0.0002918918918918919, + "loss": 0.5037907361984253, + "memory(GiB)": 78.26, + "step": 252, + "token_acc": 0.8580700162252839, + "train_speed(iter/s)": 0.034889 + }, + { + "epoch": 0.04902388218766652, + "grad_norm": 0.11169978976249695, + "learning_rate": 0.000293050193050193, + "loss": 0.4690629243850708, + "memory(GiB)": 78.26, + "step": 253, + "token_acc": 0.8666485000123864, + "train_speed(iter/s)": 0.03489 + }, + { + "epoch": 0.04921765247299326, + "grad_norm": 0.11621616035699844, + "learning_rate": 0.0002942084942084942, + "loss": 0.49911874532699585, + "memory(GiB)": 78.26, + "step": 254, + "token_acc": 0.8591189560995347, + "train_speed(iter/s)": 0.034891 + }, + { + "epoch": 0.049411422758320014, + "grad_norm": 0.1331455260515213, + "learning_rate": 0.00029536679536679535, + "loss": 0.5178690552711487, + "memory(GiB)": 78.26, + "step": 255, + "token_acc": 0.8545419436705194, + "train_speed(iter/s)": 0.034894 + }, + { + "epoch": 0.04960519304364676, + "grad_norm": 0.13746346533298492, + "learning_rate": 0.0002965250965250965, + "loss": 0.5907042026519775, + "memory(GiB)": 78.26, + "step": 256, + "token_acc": 0.8399606359750285, + "train_speed(iter/s)": 0.034896 + }, + { + "epoch": 0.0497989633289735, + "grad_norm": 0.13077257573604584, + "learning_rate": 0.00029768339768339764, + "loss": 0.5277935266494751, + "memory(GiB)": 78.26, + "step": 257, + "token_acc": 0.8542142655941911, + "train_speed(iter/s)": 0.034897 + }, + { + "epoch": 0.04999273361430025, + "grad_norm": 0.11512145400047302, + "learning_rate": 0.0002988416988416988, + "loss": 0.5082737803459167, + "memory(GiB)": 78.26, + "step": 258, + "token_acc": 0.8569431737318288, + "train_speed(iter/s)": 0.034898 + }, + { + "epoch": 0.05018650389962699, + "grad_norm": 0.12492549419403076, + "learning_rate": 0.0003, + "loss": 0.5232591032981873, + "memory(GiB)": 78.26, + "step": 259, + "token_acc": 0.8529347048792563, + "train_speed(iter/s)": 0.034899 + }, + { + "epoch": 0.050380274184953736, + "grad_norm": 0.1214490681886673, + "learning_rate": 0.0002999999691954846, + "loss": 0.538935124874115, + "memory(GiB)": 78.26, + "step": 260, + "token_acc": 0.848226576457839, + "train_speed(iter/s)": 0.034901 + }, + { + "epoch": 0.05057404447028048, + "grad_norm": 0.12162759155035019, + "learning_rate": 0.0002999998767819513, + "loss": 0.5226523876190186, + "memory(GiB)": 78.26, + "step": 261, + "token_acc": 0.8527364343343139, + "train_speed(iter/s)": 0.034904 + }, + { + "epoch": 0.05076781475560723, + "grad_norm": 0.14032401144504547, + "learning_rate": 0.0002999997227594379, + "loss": 0.5270801782608032, + "memory(GiB)": 78.26, + "step": 262, + "token_acc": 0.8529345116700657, + "train_speed(iter/s)": 0.034905 + }, + { + "epoch": 0.050961585040933975, + "grad_norm": 0.12755590677261353, + "learning_rate": 0.00029999950712800773, + "loss": 0.5621036887168884, + "memory(GiB)": 78.26, + "step": 263, + "token_acc": 0.8452567559094614, + "train_speed(iter/s)": 0.034908 + }, + { + "epoch": 0.05115535532626072, + "grad_norm": 0.12125645577907562, + "learning_rate": 0.0002999992298877494, + "loss": 0.48772531747817993, + "memory(GiB)": 78.26, + "step": 264, + "token_acc": 0.860484942704546, + "train_speed(iter/s)": 0.03491 + }, + { + "epoch": 0.051349125611587464, + "grad_norm": 0.14849497377872467, + "learning_rate": 0.00029999889103877667, + "loss": 0.5987675189971924, + "memory(GiB)": 78.26, + "step": 265, + "token_acc": 0.8323770133690618, + "train_speed(iter/s)": 0.034912 + }, + { + "epoch": 0.05154289589691421, + "grad_norm": 0.11173395067453384, + "learning_rate": 0.00029999849058122874, + "loss": 0.4839743971824646, + "memory(GiB)": 78.26, + "step": 266, + "token_acc": 0.8618188850746991, + "train_speed(iter/s)": 0.034913 + }, + { + "epoch": 0.05173666618224095, + "grad_norm": 0.13394448161125183, + "learning_rate": 0.0002999980285152701, + "loss": 0.5341227054595947, + "memory(GiB)": 78.26, + "step": 267, + "token_acc": 0.8524495008126306, + "train_speed(iter/s)": 0.034913 + }, + { + "epoch": 0.051930436467567696, + "grad_norm": 0.13987241685390472, + "learning_rate": 0.0002999975048410906, + "loss": 0.5620037913322449, + "memory(GiB)": 78.26, + "step": 268, + "token_acc": 0.8440742478752803, + "train_speed(iter/s)": 0.034913 + }, + { + "epoch": 0.05212420675289444, + "grad_norm": 0.12064801156520844, + "learning_rate": 0.0002999969195589052, + "loss": 0.5743050575256348, + "memory(GiB)": 78.26, + "step": 269, + "token_acc": 0.8411526254595733, + "train_speed(iter/s)": 0.034916 + }, + { + "epoch": 0.05231797703822119, + "grad_norm": 0.1192815750837326, + "learning_rate": 0.00029999627266895444, + "loss": 0.497215211391449, + "memory(GiB)": 78.26, + "step": 270, + "token_acc": 0.8576876267748479, + "train_speed(iter/s)": 0.034917 + }, + { + "epoch": 0.052511747323547936, + "grad_norm": 0.11295323818922043, + "learning_rate": 0.0002999955641715039, + "loss": 0.46060121059417725, + "memory(GiB)": 78.26, + "step": 271, + "token_acc": 0.8695854936493567, + "train_speed(iter/s)": 0.034918 + }, + { + "epoch": 0.05270551760887468, + "grad_norm": 0.11337928473949432, + "learning_rate": 0.00029999479406684466, + "loss": 0.47304582595825195, + "memory(GiB)": 78.26, + "step": 272, + "token_acc": 0.8662467580585402, + "train_speed(iter/s)": 0.034918 + }, + { + "epoch": 0.052899287894201424, + "grad_norm": 0.13473428785800934, + "learning_rate": 0.000299993962355293, + "loss": 0.5001351833343506, + "memory(GiB)": 78.26, + "step": 273, + "token_acc": 0.8586831727649379, + "train_speed(iter/s)": 0.03492 + }, + { + "epoch": 0.05309305817952817, + "grad_norm": 0.12432650476694107, + "learning_rate": 0.00029999306903719043, + "loss": 0.49999624490737915, + "memory(GiB)": 78.26, + "step": 274, + "token_acc": 0.8596558122982282, + "train_speed(iter/s)": 0.034921 + }, + { + "epoch": 0.05328682846485491, + "grad_norm": 0.12385527044534683, + "learning_rate": 0.0002999921141129039, + "loss": 0.5260789394378662, + "memory(GiB)": 78.26, + "step": 275, + "token_acc": 0.8528883832638099, + "train_speed(iter/s)": 0.034922 + }, + { + "epoch": 0.05348059875018166, + "grad_norm": 0.1163427084684372, + "learning_rate": 0.00029999109758282577, + "loss": 0.5076729655265808, + "memory(GiB)": 78.26, + "step": 276, + "token_acc": 0.8572664593754225, + "train_speed(iter/s)": 0.034923 + }, + { + "epoch": 0.0536743690355084, + "grad_norm": 0.10747300833463669, + "learning_rate": 0.0002999900194473734, + "loss": 0.4819478392601013, + "memory(GiB)": 78.26, + "step": 277, + "token_acc": 0.8656782763309278, + "train_speed(iter/s)": 0.034925 + }, + { + "epoch": 0.05386813932083515, + "grad_norm": 0.12642782926559448, + "learning_rate": 0.00029998887970698966, + "loss": 0.5688496232032776, + "memory(GiB)": 78.26, + "step": 278, + "token_acc": 0.8424845950704225, + "train_speed(iter/s)": 0.034926 + }, + { + "epoch": 0.0540619096061619, + "grad_norm": 0.11862245947122574, + "learning_rate": 0.00029998767836214265, + "loss": 0.5431630611419678, + "memory(GiB)": 78.26, + "step": 279, + "token_acc": 0.8460705036731674, + "train_speed(iter/s)": 0.034927 + }, + { + "epoch": 0.05425567989148864, + "grad_norm": 0.10874070972204208, + "learning_rate": 0.00029998641541332583, + "loss": 0.446528822183609, + "memory(GiB)": 78.26, + "step": 280, + "token_acc": 0.875025387583779, + "train_speed(iter/s)": 0.034929 + }, + { + "epoch": 0.054449450176815385, + "grad_norm": 0.1398598551750183, + "learning_rate": 0.0002999850908610579, + "loss": 0.5694330334663391, + "memory(GiB)": 78.26, + "step": 281, + "token_acc": 0.841722914998284, + "train_speed(iter/s)": 0.03493 + }, + { + "epoch": 0.05464322046214213, + "grad_norm": 0.11889787018299103, + "learning_rate": 0.00029998370470588287, + "loss": 0.4838942885398865, + "memory(GiB)": 78.26, + "step": 282, + "token_acc": 0.8644304682040531, + "train_speed(iter/s)": 0.034932 + }, + { + "epoch": 0.054836990747468874, + "grad_norm": 0.10652502626180649, + "learning_rate": 0.00029998225694837015, + "loss": 0.4832991063594818, + "memory(GiB)": 78.26, + "step": 283, + "token_acc": 0.8605131303975627, + "train_speed(iter/s)": 0.034933 + }, + { + "epoch": 0.05503076103279562, + "grad_norm": 0.1357640027999878, + "learning_rate": 0.0002999807475891143, + "loss": 0.5402747392654419, + "memory(GiB)": 78.26, + "step": 284, + "token_acc": 0.8523422441967653, + "train_speed(iter/s)": 0.034934 + }, + { + "epoch": 0.05522453131812237, + "grad_norm": 0.12577500939369202, + "learning_rate": 0.00029997917662873526, + "loss": 0.5225556492805481, + "memory(GiB)": 78.26, + "step": 285, + "token_acc": 0.8524319637512936, + "train_speed(iter/s)": 0.034935 + }, + { + "epoch": 0.05541830160344911, + "grad_norm": 0.11081529408693314, + "learning_rate": 0.0002999775440678783, + "loss": 0.47770658135414124, + "memory(GiB)": 78.26, + "step": 286, + "token_acc": 0.8655869779677737, + "train_speed(iter/s)": 0.034937 + }, + { + "epoch": 0.05561207188877586, + "grad_norm": 0.11565393954515457, + "learning_rate": 0.00029997584990721396, + "loss": 0.5191536545753479, + "memory(GiB)": 78.26, + "step": 287, + "token_acc": 0.8520145631067961, + "train_speed(iter/s)": 0.034937 + }, + { + "epoch": 0.0558058421741026, + "grad_norm": 0.1177186667919159, + "learning_rate": 0.000299974094147438, + "loss": 0.5068661570549011, + "memory(GiB)": 78.26, + "step": 288, + "token_acc": 0.8555767793372644, + "train_speed(iter/s)": 0.034939 + }, + { + "epoch": 0.055999612459429346, + "grad_norm": 0.11559902131557465, + "learning_rate": 0.00029997227678927164, + "loss": 0.5000442862510681, + "memory(GiB)": 78.26, + "step": 289, + "token_acc": 0.8562957392033785, + "train_speed(iter/s)": 0.03494 + }, + { + "epoch": 0.05619338274475609, + "grad_norm": 0.1195712685585022, + "learning_rate": 0.0002999703978334613, + "loss": 0.5059924125671387, + "memory(GiB)": 78.26, + "step": 290, + "token_acc": 0.8566373608427013, + "train_speed(iter/s)": 0.034941 + }, + { + "epoch": 0.056387153030082834, + "grad_norm": 0.11978676170110703, + "learning_rate": 0.00029996845728077874, + "loss": 0.5218163728713989, + "memory(GiB)": 78.26, + "step": 291, + "token_acc": 0.8552988421821216, + "train_speed(iter/s)": 0.034943 + }, + { + "epoch": 0.05658092331540958, + "grad_norm": 0.11734765022993088, + "learning_rate": 0.00029996645513202086, + "loss": 0.5239380598068237, + "memory(GiB)": 78.26, + "step": 292, + "token_acc": 0.85505396631614, + "train_speed(iter/s)": 0.034945 + }, + { + "epoch": 0.05677469360073633, + "grad_norm": 0.11253336071968079, + "learning_rate": 0.0002999643913880102, + "loss": 0.512554943561554, + "memory(GiB)": 78.26, + "step": 293, + "token_acc": 0.8555791147627883, + "train_speed(iter/s)": 0.034947 + }, + { + "epoch": 0.056968463886063074, + "grad_norm": 0.1775335669517517, + "learning_rate": 0.0002999622660495943, + "loss": 0.48465481400489807, + "memory(GiB)": 78.26, + "step": 294, + "token_acc": 0.8632988755461122, + "train_speed(iter/s)": 0.034948 + }, + { + "epoch": 0.05716223417138982, + "grad_norm": 0.11287941783666611, + "learning_rate": 0.0002999600791176461, + "loss": 0.5061824321746826, + "memory(GiB)": 78.26, + "step": 295, + "token_acc": 0.8573063443244403, + "train_speed(iter/s)": 0.034949 + }, + { + "epoch": 0.05735600445671656, + "grad_norm": 0.26354172825813293, + "learning_rate": 0.00029995783059306373, + "loss": 0.5662236213684082, + "memory(GiB)": 78.26, + "step": 296, + "token_acc": 0.8466373350094281, + "train_speed(iter/s)": 0.034951 + }, + { + "epoch": 0.05754977474204331, + "grad_norm": 0.4357684552669525, + "learning_rate": 0.0002999555204767709, + "loss": 0.5678244233131409, + "memory(GiB)": 78.26, + "step": 297, + "token_acc": 0.8449365109151186, + "train_speed(iter/s)": 0.034952 + }, + { + "epoch": 0.05774354502737005, + "grad_norm": 0.12695299088954926, + "learning_rate": 0.00029995314876971627, + "loss": 0.5329843759536743, + "memory(GiB)": 78.26, + "step": 298, + "token_acc": 0.8508813455487176, + "train_speed(iter/s)": 0.034953 + }, + { + "epoch": 0.057937315312696795, + "grad_norm": 0.13667218387126923, + "learning_rate": 0.00029995071547287414, + "loss": 0.5417138934135437, + "memory(GiB)": 78.26, + "step": 299, + "token_acc": 0.8479577535288818, + "train_speed(iter/s)": 0.034954 + }, + { + "epoch": 0.058131085598023546, + "grad_norm": 0.10648724436759949, + "learning_rate": 0.00029994822058724375, + "loss": 0.4887983202934265, + "memory(GiB)": 78.26, + "step": 300, + "token_acc": 0.8605226687784597, + "train_speed(iter/s)": 0.034954 + }, + { + "epoch": 0.05832485588335029, + "grad_norm": 0.11893422901630402, + "learning_rate": 0.00029994566411384993, + "loss": 0.505358099937439, + "memory(GiB)": 78.26, + "step": 301, + "token_acc": 0.8587887578336663, + "train_speed(iter/s)": 0.034955 + }, + { + "epoch": 0.058518626168677035, + "grad_norm": 0.12744036316871643, + "learning_rate": 0.0002999430460537427, + "loss": 0.5472061634063721, + "memory(GiB)": 78.26, + "step": 302, + "token_acc": 0.8475548152800153, + "train_speed(iter/s)": 0.034956 + }, + { + "epoch": 0.05871239645400378, + "grad_norm": 0.12299606949090958, + "learning_rate": 0.00029994036640799726, + "loss": 0.5060437917709351, + "memory(GiB)": 78.26, + "step": 303, + "token_acc": 0.8581589526852902, + "train_speed(iter/s)": 0.034956 + }, + { + "epoch": 0.05890616673933052, + "grad_norm": 0.11920775473117828, + "learning_rate": 0.00029993762517771435, + "loss": 0.5124378204345703, + "memory(GiB)": 78.26, + "step": 304, + "token_acc": 0.853845315310405, + "train_speed(iter/s)": 0.034957 + }, + { + "epoch": 0.05909993702465727, + "grad_norm": 0.11296442151069641, + "learning_rate": 0.0002999348223640198, + "loss": 0.5146512985229492, + "memory(GiB)": 78.26, + "step": 305, + "token_acc": 0.856233997982776, + "train_speed(iter/s)": 0.034958 + }, + { + "epoch": 0.05929370730998401, + "grad_norm": 0.11093028634786606, + "learning_rate": 0.0002999319579680647, + "loss": 0.485245943069458, + "memory(GiB)": 78.26, + "step": 306, + "token_acc": 0.8640687244248123, + "train_speed(iter/s)": 0.034959 + }, + { + "epoch": 0.059487477595310756, + "grad_norm": 0.11484235525131226, + "learning_rate": 0.00029992903199102576, + "loss": 0.4985674023628235, + "memory(GiB)": 78.26, + "step": 307, + "token_acc": 0.8578893469527086, + "train_speed(iter/s)": 0.03496 + }, + { + "epoch": 0.05968124788063751, + "grad_norm": 0.11024551838636398, + "learning_rate": 0.00029992604443410456, + "loss": 0.4915925860404968, + "memory(GiB)": 78.26, + "step": 308, + "token_acc": 0.8619262990183482, + "train_speed(iter/s)": 0.03496 + }, + { + "epoch": 0.05987501816596425, + "grad_norm": 0.11654435843229294, + "learning_rate": 0.00029992299529852827, + "loss": 0.5016142129898071, + "memory(GiB)": 78.26, + "step": 309, + "token_acc": 0.8607442107550769, + "train_speed(iter/s)": 0.03496 + }, + { + "epoch": 0.060068788451290996, + "grad_norm": 0.10757559537887573, + "learning_rate": 0.0002999198845855492, + "loss": 0.4506911039352417, + "memory(GiB)": 78.26, + "step": 310, + "token_acc": 0.8732513679968862, + "train_speed(iter/s)": 0.03496 + }, + { + "epoch": 0.06026255873661774, + "grad_norm": 0.11542216688394547, + "learning_rate": 0.00029991671229644503, + "loss": 0.47681623697280884, + "memory(GiB)": 78.26, + "step": 311, + "token_acc": 0.8678860712584043, + "train_speed(iter/s)": 0.034962 + }, + { + "epoch": 0.060456329021944484, + "grad_norm": 0.12022379785776138, + "learning_rate": 0.0002999134784325187, + "loss": 0.5177062153816223, + "memory(GiB)": 78.26, + "step": 312, + "token_acc": 0.8545514413146945, + "train_speed(iter/s)": 0.034964 + }, + { + "epoch": 0.06065009930727123, + "grad_norm": 0.11384547501802444, + "learning_rate": 0.0002999101829950985, + "loss": 0.5101380348205566, + "memory(GiB)": 78.26, + "step": 313, + "token_acc": 0.8565856442943443, + "train_speed(iter/s)": 0.034964 + }, + { + "epoch": 0.06084386959259797, + "grad_norm": 0.11187402158975601, + "learning_rate": 0.0002999068259855378, + "loss": 0.5241718888282776, + "memory(GiB)": 78.26, + "step": 314, + "token_acc": 0.8541580041580041, + "train_speed(iter/s)": 0.034966 + }, + { + "epoch": 0.061037639877924724, + "grad_norm": 0.10513240844011307, + "learning_rate": 0.0002999034074052156, + "loss": 0.4202113449573517, + "memory(GiB)": 78.26, + "step": 315, + "token_acc": 0.8803593092589247, + "train_speed(iter/s)": 0.034966 + }, + { + "epoch": 0.06123141016325147, + "grad_norm": 0.11992194503545761, + "learning_rate": 0.0002998999272555359, + "loss": 0.506824254989624, + "memory(GiB)": 78.26, + "step": 316, + "token_acc": 0.8580272713024398, + "train_speed(iter/s)": 0.034967 + }, + { + "epoch": 0.06142518044857821, + "grad_norm": 0.11586200445890427, + "learning_rate": 0.0002998963855379281, + "loss": 0.5369887351989746, + "memory(GiB)": 78.26, + "step": 317, + "token_acc": 0.8491955856933918, + "train_speed(iter/s)": 0.034968 + }, + { + "epoch": 0.061618950733904956, + "grad_norm": 0.11802522093057632, + "learning_rate": 0.0002998927822538469, + "loss": 0.5725922584533691, + "memory(GiB)": 78.26, + "step": 318, + "token_acc": 0.8380715205103811, + "train_speed(iter/s)": 0.03497 + }, + { + "epoch": 0.0618127210192317, + "grad_norm": 0.12824736535549164, + "learning_rate": 0.0002998891174047722, + "loss": 0.567888617515564, + "memory(GiB)": 78.26, + "step": 319, + "token_acc": 0.841992673992674, + "train_speed(iter/s)": 0.034972 + }, + { + "epoch": 0.062006491304558445, + "grad_norm": 0.10866405814886093, + "learning_rate": 0.00029988539099220937, + "loss": 0.4857517182826996, + "memory(GiB)": 78.26, + "step": 320, + "token_acc": 0.8629794688168165, + "train_speed(iter/s)": 0.034973 + }, + { + "epoch": 0.06220026158988519, + "grad_norm": 0.10862304270267487, + "learning_rate": 0.00029988160301768884, + "loss": 0.45832955837249756, + "memory(GiB)": 78.26, + "step": 321, + "token_acc": 0.8700757980143056, + "train_speed(iter/s)": 0.034974 + }, + { + "epoch": 0.06239403187521193, + "grad_norm": 0.11096280068159103, + "learning_rate": 0.00029987775348276646, + "loss": 0.5003219842910767, + "memory(GiB)": 78.26, + "step": 322, + "token_acc": 0.8605376800379422, + "train_speed(iter/s)": 0.034975 + }, + { + "epoch": 0.06258780216053868, + "grad_norm": 0.11027340590953827, + "learning_rate": 0.0002998738423890234, + "loss": 0.4482397139072418, + "memory(GiB)": 78.26, + "step": 323, + "token_acc": 0.8740220554858348, + "train_speed(iter/s)": 0.034975 + }, + { + "epoch": 0.06278157244586542, + "grad_norm": 0.11176367849111557, + "learning_rate": 0.000299869869738066, + "loss": 0.5309886932373047, + "memory(GiB)": 78.26, + "step": 324, + "token_acc": 0.8513593064326234, + "train_speed(iter/s)": 0.034976 + }, + { + "epoch": 0.06297534273119217, + "grad_norm": 0.10626234114170074, + "learning_rate": 0.000299865835531526, + "loss": 0.4504719376564026, + "memory(GiB)": 78.26, + "step": 325, + "token_acc": 0.8722543040638998, + "train_speed(iter/s)": 0.034977 + }, + { + "epoch": 0.06316911301651891, + "grad_norm": 0.10870497673749924, + "learning_rate": 0.00029986173977106017, + "loss": 0.5229367017745972, + "memory(GiB)": 78.26, + "step": 326, + "token_acc": 0.8503923012467308, + "train_speed(iter/s)": 0.034978 + }, + { + "epoch": 0.06336288330184567, + "grad_norm": 0.11313097178936005, + "learning_rate": 0.0002998575824583509, + "loss": 0.4801797866821289, + "memory(GiB)": 78.26, + "step": 327, + "token_acc": 0.8636890035268726, + "train_speed(iter/s)": 0.034979 + }, + { + "epoch": 0.06355665358717241, + "grad_norm": 0.11869515478610992, + "learning_rate": 0.0002998533635951058, + "loss": 0.5341205596923828, + "memory(GiB)": 78.26, + "step": 328, + "token_acc": 0.8493568134624201, + "train_speed(iter/s)": 0.03498 + }, + { + "epoch": 0.06375042387249916, + "grad_norm": 0.11751175671815872, + "learning_rate": 0.00029984908318305743, + "loss": 0.47566699981689453, + "memory(GiB)": 78.26, + "step": 329, + "token_acc": 0.864566263149548, + "train_speed(iter/s)": 0.034981 + }, + { + "epoch": 0.0639441941578259, + "grad_norm": 0.11967992782592773, + "learning_rate": 0.000299844741223964, + "loss": 0.4633430540561676, + "memory(GiB)": 78.26, + "step": 330, + "token_acc": 0.8679728375820995, + "train_speed(iter/s)": 0.034981 + }, + { + "epoch": 0.06413796444315265, + "grad_norm": 0.12265921384096146, + "learning_rate": 0.00029984033771960895, + "loss": 0.5029769539833069, + "memory(GiB)": 78.26, + "step": 331, + "token_acc": 0.8578765113276207, + "train_speed(iter/s)": 0.034982 + }, + { + "epoch": 0.06433173472847939, + "grad_norm": 0.11440466344356537, + "learning_rate": 0.0002998358726718008, + "loss": 0.5016182661056519, + "memory(GiB)": 78.26, + "step": 332, + "token_acc": 0.8598369870713884, + "train_speed(iter/s)": 0.034982 + }, + { + "epoch": 0.06452550501380613, + "grad_norm": 0.1108684316277504, + "learning_rate": 0.0002998313460823735, + "loss": 0.5170926451683044, + "memory(GiB)": 78.26, + "step": 333, + "token_acc": 0.8539208882720333, + "train_speed(iter/s)": 0.034983 + }, + { + "epoch": 0.06471927529913288, + "grad_norm": 0.12245868891477585, + "learning_rate": 0.00029982675795318616, + "loss": 0.49607276916503906, + "memory(GiB)": 78.26, + "step": 334, + "token_acc": 0.8618406713164778, + "train_speed(iter/s)": 0.034983 + }, + { + "epoch": 0.06491304558445962, + "grad_norm": 0.11894813925027847, + "learning_rate": 0.0002998221082861234, + "loss": 0.49732956290245056, + "memory(GiB)": 78.26, + "step": 335, + "token_acc": 0.8582034755649832, + "train_speed(iter/s)": 0.034985 + }, + { + "epoch": 0.06510681586978637, + "grad_norm": 0.12267972528934479, + "learning_rate": 0.0002998173970830949, + "loss": 0.5078924298286438, + "memory(GiB)": 78.26, + "step": 336, + "token_acc": 0.8583969800719866, + "train_speed(iter/s)": 0.034986 + }, + { + "epoch": 0.06530058615511311, + "grad_norm": 0.10648109018802643, + "learning_rate": 0.0002998126243460357, + "loss": 0.5070585608482361, + "memory(GiB)": 78.26, + "step": 337, + "token_acc": 0.8565583698958306, + "train_speed(iter/s)": 0.034987 + }, + { + "epoch": 0.06549435644043985, + "grad_norm": 0.11792565882205963, + "learning_rate": 0.000299807790076906, + "loss": 0.5286574959754944, + "memory(GiB)": 78.26, + "step": 338, + "token_acc": 0.8529562054765698, + "train_speed(iter/s)": 0.034989 + }, + { + "epoch": 0.0656881267257666, + "grad_norm": 0.12195685505867004, + "learning_rate": 0.0002998028942776914, + "loss": 0.4869121313095093, + "memory(GiB)": 78.26, + "step": 339, + "token_acc": 0.8612365934096389, + "train_speed(iter/s)": 0.03499 + }, + { + "epoch": 0.06588189701109334, + "grad_norm": 0.1249171569943428, + "learning_rate": 0.0002997979369504028, + "loss": 0.5445664525032043, + "memory(GiB)": 78.26, + "step": 340, + "token_acc": 0.8522178660532028, + "train_speed(iter/s)": 0.034991 + }, + { + "epoch": 0.06607566729642009, + "grad_norm": 0.13457630574703217, + "learning_rate": 0.0002997929180970763, + "loss": 0.5670903325080872, + "memory(GiB)": 78.26, + "step": 341, + "token_acc": 0.8372440096177763, + "train_speed(iter/s)": 0.034991 + }, + { + "epoch": 0.06626943758174685, + "grad_norm": 0.11432075500488281, + "learning_rate": 0.0002997878377197732, + "loss": 0.5261147022247314, + "memory(GiB)": 78.26, + "step": 342, + "token_acc": 0.8552226935312831, + "train_speed(iter/s)": 0.034992 + }, + { + "epoch": 0.06646320786707359, + "grad_norm": 0.10323546081781387, + "learning_rate": 0.00029978269582058015, + "loss": 0.4720154404640198, + "memory(GiB)": 78.26, + "step": 343, + "token_acc": 0.8659866148531952, + "train_speed(iter/s)": 0.034992 + }, + { + "epoch": 0.06665697815240033, + "grad_norm": 0.11725510656833649, + "learning_rate": 0.0002997774924016092, + "loss": 0.5160101652145386, + "memory(GiB)": 78.26, + "step": 344, + "token_acc": 0.8551109929549382, + "train_speed(iter/s)": 0.034992 + }, + { + "epoch": 0.06685074843772708, + "grad_norm": 0.12052306532859802, + "learning_rate": 0.0002997722274649974, + "loss": 0.537044107913971, + "memory(GiB)": 78.26, + "step": 345, + "token_acc": 0.8476571428571429, + "train_speed(iter/s)": 0.034993 + }, + { + "epoch": 0.06704451872305382, + "grad_norm": 0.11849239468574524, + "learning_rate": 0.00029976690101290727, + "loss": 0.5134192705154419, + "memory(GiB)": 78.26, + "step": 346, + "token_acc": 0.8558187985790219, + "train_speed(iter/s)": 0.034995 + }, + { + "epoch": 0.06723828900838057, + "grad_norm": 0.12307219952344894, + "learning_rate": 0.00029976151304752645, + "loss": 0.4876058101654053, + "memory(GiB)": 78.26, + "step": 347, + "token_acc": 0.8650527622594661, + "train_speed(iter/s)": 0.034996 + }, + { + "epoch": 0.06743205929370731, + "grad_norm": 0.11402394622564316, + "learning_rate": 0.00029975606357106804, + "loss": 0.47068169713020325, + "memory(GiB)": 78.26, + "step": 348, + "token_acc": 0.8668408661682147, + "train_speed(iter/s)": 0.034998 + }, + { + "epoch": 0.06762582957903406, + "grad_norm": 0.12601493299007416, + "learning_rate": 0.00029975055258577016, + "loss": 0.5161094665527344, + "memory(GiB)": 78.26, + "step": 349, + "token_acc": 0.8581284381363401, + "train_speed(iter/s)": 0.034999 + }, + { + "epoch": 0.0678195998643608, + "grad_norm": 0.12433503568172455, + "learning_rate": 0.0002997449800938964, + "loss": 0.5517579913139343, + "memory(GiB)": 78.26, + "step": 350, + "token_acc": 0.8489635649712355, + "train_speed(iter/s)": 0.035001 + }, + { + "epoch": 0.06801337014968754, + "grad_norm": 0.11672255396842957, + "learning_rate": 0.0002997393460977355, + "loss": 0.49452394247055054, + "memory(GiB)": 78.26, + "step": 351, + "token_acc": 0.8589091777061701, + "train_speed(iter/s)": 0.035002 + }, + { + "epoch": 0.06820714043501429, + "grad_norm": 0.11498520523309708, + "learning_rate": 0.00029973365059960153, + "loss": 0.5128313302993774, + "memory(GiB)": 78.26, + "step": 352, + "token_acc": 0.8535805294087055, + "train_speed(iter/s)": 0.035002 + }, + { + "epoch": 0.06840091072034103, + "grad_norm": 0.14143286645412445, + "learning_rate": 0.00029972789360183376, + "loss": 0.5255135893821716, + "memory(GiB)": 78.26, + "step": 353, + "token_acc": 0.8556857047731469, + "train_speed(iter/s)": 0.035003 + }, + { + "epoch": 0.06859468100566778, + "grad_norm": 0.12254343181848526, + "learning_rate": 0.00029972207510679675, + "loss": 0.5137450695037842, + "memory(GiB)": 78.26, + "step": 354, + "token_acc": 0.8559991402933749, + "train_speed(iter/s)": 0.035004 + }, + { + "epoch": 0.06878845129099452, + "grad_norm": 0.11112511903047562, + "learning_rate": 0.0002997161951168803, + "loss": 0.46960967779159546, + "memory(GiB)": 78.26, + "step": 355, + "token_acc": 0.8666562937606971, + "train_speed(iter/s)": 0.035004 + }, + { + "epoch": 0.06898222157632126, + "grad_norm": 0.11723387986421585, + "learning_rate": 0.0002997102536344995, + "loss": 0.5213668346405029, + "memory(GiB)": 78.26, + "step": 356, + "token_acc": 0.8504984318996416, + "train_speed(iter/s)": 0.035005 + }, + { + "epoch": 0.06917599186164802, + "grad_norm": 0.11225343495607376, + "learning_rate": 0.0002997042506620946, + "loss": 0.4935987591743469, + "memory(GiB)": 78.26, + "step": 357, + "token_acc": 0.8581589163069739, + "train_speed(iter/s)": 0.035007 + }, + { + "epoch": 0.06936976214697477, + "grad_norm": 0.11436620354652405, + "learning_rate": 0.0002996981862021313, + "loss": 0.47615599632263184, + "memory(GiB)": 78.26, + "step": 358, + "token_acc": 0.8642991737933033, + "train_speed(iter/s)": 0.035007 + }, + { + "epoch": 0.06956353243230151, + "grad_norm": 0.12642012536525726, + "learning_rate": 0.00029969206025710037, + "loss": 0.5081407427787781, + "memory(GiB)": 78.26, + "step": 359, + "token_acc": 0.8585974082543831, + "train_speed(iter/s)": 0.035008 + }, + { + "epoch": 0.06975730271762826, + "grad_norm": 0.1168803945183754, + "learning_rate": 0.0002996858728295179, + "loss": 0.4714093804359436, + "memory(GiB)": 78.26, + "step": 360, + "token_acc": 0.8668951045236009, + "train_speed(iter/s)": 0.035008 + }, + { + "epoch": 0.069951073002955, + "grad_norm": 0.11019770056009293, + "learning_rate": 0.00029967962392192526, + "loss": 0.5050376057624817, + "memory(GiB)": 78.26, + "step": 361, + "token_acc": 0.8604877186782117, + "train_speed(iter/s)": 0.03501 + }, + { + "epoch": 0.07014484328828174, + "grad_norm": 0.12403653562068939, + "learning_rate": 0.000299673313536889, + "loss": 0.49756452441215515, + "memory(GiB)": 78.26, + "step": 362, + "token_acc": 0.8619797028974849, + "train_speed(iter/s)": 0.035011 + }, + { + "epoch": 0.07033861357360849, + "grad_norm": 0.10632438957691193, + "learning_rate": 0.00029966694167700105, + "loss": 0.50919508934021, + "memory(GiB)": 78.26, + "step": 363, + "token_acc": 0.8557894184337602, + "train_speed(iter/s)": 0.035011 + }, + { + "epoch": 0.07053238385893523, + "grad_norm": 0.12035337090492249, + "learning_rate": 0.0002996605083448784, + "loss": 0.5421941876411438, + "memory(GiB)": 78.26, + "step": 364, + "token_acc": 0.8479168897932955, + "train_speed(iter/s)": 0.035012 + }, + { + "epoch": 0.07072615414426198, + "grad_norm": 0.11031734943389893, + "learning_rate": 0.00029965401354316345, + "loss": 0.48488667607307434, + "memory(GiB)": 78.26, + "step": 365, + "token_acc": 0.8627623778240666, + "train_speed(iter/s)": 0.035013 + }, + { + "epoch": 0.07091992442958872, + "grad_norm": 0.10722577571868896, + "learning_rate": 0.00029964745727452375, + "loss": 0.45194217562675476, + "memory(GiB)": 78.26, + "step": 366, + "token_acc": 0.8705380798689725, + "train_speed(iter/s)": 0.035013 + }, + { + "epoch": 0.07111369471491547, + "grad_norm": 0.10588467866182327, + "learning_rate": 0.0002996408395416521, + "loss": 0.45176932215690613, + "memory(GiB)": 78.26, + "step": 367, + "token_acc": 0.8708429432333904, + "train_speed(iter/s)": 0.035014 + }, + { + "epoch": 0.07130746500024221, + "grad_norm": 0.10778294503688812, + "learning_rate": 0.0002996341603472668, + "loss": 0.501011312007904, + "memory(GiB)": 78.26, + "step": 368, + "token_acc": 0.8594550505951623, + "train_speed(iter/s)": 0.035014 + }, + { + "epoch": 0.07150123528556895, + "grad_norm": 0.12546683847904205, + "learning_rate": 0.00029962741969411096, + "loss": 0.4865407943725586, + "memory(GiB)": 78.26, + "step": 369, + "token_acc": 0.8665692482545868, + "train_speed(iter/s)": 0.035016 + }, + { + "epoch": 0.0716950055708957, + "grad_norm": 0.11863457411527634, + "learning_rate": 0.0002996206175849532, + "loss": 0.5059066414833069, + "memory(GiB)": 78.26, + "step": 370, + "token_acc": 0.8579709417580488, + "train_speed(iter/s)": 0.035017 + }, + { + "epoch": 0.07188877585622244, + "grad_norm": 0.1237197294831276, + "learning_rate": 0.0002996137540225873, + "loss": 0.5302804708480835, + "memory(GiB)": 78.26, + "step": 371, + "token_acc": 0.8513494809688581, + "train_speed(iter/s)": 0.035018 + }, + { + "epoch": 0.0720825461415492, + "grad_norm": 0.11564111709594727, + "learning_rate": 0.0002996068290098324, + "loss": 0.4804614186286926, + "memory(GiB)": 78.26, + "step": 372, + "token_acc": 0.8635471113692303, + "train_speed(iter/s)": 0.035019 + }, + { + "epoch": 0.07227631642687594, + "grad_norm": 0.115287646651268, + "learning_rate": 0.0002995998425495327, + "loss": 0.5029768943786621, + "memory(GiB)": 78.26, + "step": 373, + "token_acc": 0.8573006711038692, + "train_speed(iter/s)": 0.035019 + }, + { + "epoch": 0.07247008671220269, + "grad_norm": 0.11475943773984909, + "learning_rate": 0.0002995927946445578, + "loss": 0.4446421265602112, + "memory(GiB)": 78.26, + "step": 374, + "token_acc": 0.8731128990987405, + "train_speed(iter/s)": 0.035021 + }, + { + "epoch": 0.07266385699752943, + "grad_norm": 0.11748611181974411, + "learning_rate": 0.00029958568529780245, + "loss": 0.4998936653137207, + "memory(GiB)": 78.26, + "step": 375, + "token_acc": 0.8600776778413737, + "train_speed(iter/s)": 0.035021 + }, + { + "epoch": 0.07285762728285618, + "grad_norm": 0.11014379560947418, + "learning_rate": 0.00029957851451218654, + "loss": 0.48282113671302795, + "memory(GiB)": 78.26, + "step": 376, + "token_acc": 0.8650632477795992, + "train_speed(iter/s)": 0.035022 + }, + { + "epoch": 0.07305139756818292, + "grad_norm": 0.1131962388753891, + "learning_rate": 0.0002995712822906554, + "loss": 0.47199296951293945, + "memory(GiB)": 78.26, + "step": 377, + "token_acc": 0.8649652360874542, + "train_speed(iter/s)": 0.035023 + }, + { + "epoch": 0.07324516785350967, + "grad_norm": 0.11221049726009369, + "learning_rate": 0.0002995639886361795, + "loss": 0.5101888179779053, + "memory(GiB)": 78.26, + "step": 378, + "token_acc": 0.8550319599324829, + "train_speed(iter/s)": 0.035024 + }, + { + "epoch": 0.07343893813883641, + "grad_norm": 0.1063636839389801, + "learning_rate": 0.0002995566335517546, + "loss": 0.5004944205284119, + "memory(GiB)": 78.26, + "step": 379, + "token_acc": 0.8585596162973695, + "train_speed(iter/s)": 0.035024 + }, + { + "epoch": 0.07363270842416315, + "grad_norm": 0.11803896725177765, + "learning_rate": 0.00029954921704040147, + "loss": 0.510295033454895, + "memory(GiB)": 78.26, + "step": 380, + "token_acc": 0.8577513030528667, + "train_speed(iter/s)": 0.035026 + }, + { + "epoch": 0.0738264787094899, + "grad_norm": 0.11037638783454895, + "learning_rate": 0.00029954173910516635, + "loss": 0.44948601722717285, + "memory(GiB)": 78.26, + "step": 381, + "token_acc": 0.8719704952581665, + "train_speed(iter/s)": 0.035026 + }, + { + "epoch": 0.07402024899481664, + "grad_norm": 0.1230226382613182, + "learning_rate": 0.0002995341997491207, + "loss": 0.5153728723526001, + "memory(GiB)": 78.26, + "step": 382, + "token_acc": 0.8567678516574045, + "train_speed(iter/s)": 0.035026 + }, + { + "epoch": 0.07421401928014339, + "grad_norm": 0.11615514755249023, + "learning_rate": 0.00029952659897536106, + "loss": 0.46452564001083374, + "memory(GiB)": 78.26, + "step": 383, + "token_acc": 0.8670107503877628, + "train_speed(iter/s)": 0.035027 + }, + { + "epoch": 0.07440778956547013, + "grad_norm": 0.13006049394607544, + "learning_rate": 0.00029951893678700927, + "loss": 0.506874144077301, + "memory(GiB)": 78.26, + "step": 384, + "token_acc": 0.8588143291124011, + "train_speed(iter/s)": 0.035028 + }, + { + "epoch": 0.07460155985079688, + "grad_norm": 0.11657480150461197, + "learning_rate": 0.00029951121318721243, + "loss": 0.49863314628601074, + "memory(GiB)": 78.26, + "step": 385, + "token_acc": 0.8593393170109936, + "train_speed(iter/s)": 0.035029 + }, + { + "epoch": 0.07479533013612362, + "grad_norm": 0.11655829846858978, + "learning_rate": 0.0002995034281791428, + "loss": 0.5014276504516602, + "memory(GiB)": 78.26, + "step": 386, + "token_acc": 0.861993529502388, + "train_speed(iter/s)": 0.03503 + }, + { + "epoch": 0.07498910042145036, + "grad_norm": 0.10777109861373901, + "learning_rate": 0.0002994955817659979, + "loss": 0.4714622497558594, + "memory(GiB)": 78.26, + "step": 387, + "token_acc": 0.8677407562147482, + "train_speed(iter/s)": 0.035031 + }, + { + "epoch": 0.07518287070677712, + "grad_norm": 0.12219083309173584, + "learning_rate": 0.00029948767395100045, + "loss": 0.5111258625984192, + "memory(GiB)": 78.26, + "step": 388, + "token_acc": 0.8558166152672241, + "train_speed(iter/s)": 0.035032 + }, + { + "epoch": 0.07537664099210387, + "grad_norm": 0.10944923013448715, + "learning_rate": 0.00029947970473739844, + "loss": 0.4479862451553345, + "memory(GiB)": 78.26, + "step": 389, + "token_acc": 0.8729919678714859, + "train_speed(iter/s)": 0.035033 + }, + { + "epoch": 0.07557041127743061, + "grad_norm": 0.10940490663051605, + "learning_rate": 0.000299471674128465, + "loss": 0.49675190448760986, + "memory(GiB)": 78.26, + "step": 390, + "token_acc": 0.8615333689812339, + "train_speed(iter/s)": 0.035032 + }, + { + "epoch": 0.07576418156275735, + "grad_norm": 0.11179753392934799, + "learning_rate": 0.0002994635821274986, + "loss": 0.4398466646671295, + "memory(GiB)": 78.26, + "step": 391, + "token_acc": 0.8771611786033172, + "train_speed(iter/s)": 0.035033 + }, + { + "epoch": 0.0759579518480841, + "grad_norm": 0.12613913416862488, + "learning_rate": 0.0002994554287378227, + "loss": 0.5353314876556396, + "memory(GiB)": 78.26, + "step": 392, + "token_acc": 0.8493086152908249, + "train_speed(iter/s)": 0.035033 + }, + { + "epoch": 0.07615172213341084, + "grad_norm": 0.11761578917503357, + "learning_rate": 0.00029944721396278623, + "loss": 0.5115870237350464, + "memory(GiB)": 78.26, + "step": 393, + "token_acc": 0.8544060286040279, + "train_speed(iter/s)": 0.035034 + }, + { + "epoch": 0.07634549241873759, + "grad_norm": 0.11017254739999771, + "learning_rate": 0.0002994389378057632, + "loss": 0.48874709010124207, + "memory(GiB)": 78.26, + "step": 394, + "token_acc": 0.8645016225055574, + "train_speed(iter/s)": 0.035034 + }, + { + "epoch": 0.07653926270406433, + "grad_norm": 0.10908011347055435, + "learning_rate": 0.00029943060027015276, + "loss": 0.47319239377975464, + "memory(GiB)": 78.26, + "step": 395, + "token_acc": 0.8659232780237955, + "train_speed(iter/s)": 0.035034 + }, + { + "epoch": 0.07673303298939108, + "grad_norm": 0.11207929253578186, + "learning_rate": 0.0002994222013593795, + "loss": 0.45064985752105713, + "memory(GiB)": 78.26, + "step": 396, + "token_acc": 0.8735906331309627, + "train_speed(iter/s)": 0.035035 + }, + { + "epoch": 0.07692680327471782, + "grad_norm": 0.11263241618871689, + "learning_rate": 0.000299413741076893, + "loss": 0.49612677097320557, + "memory(GiB)": 78.26, + "step": 397, + "token_acc": 0.8607561516527616, + "train_speed(iter/s)": 0.035036 + }, + { + "epoch": 0.07712057356004456, + "grad_norm": 0.12468399852514267, + "learning_rate": 0.0002994052194261681, + "loss": 0.530525267124176, + "memory(GiB)": 78.26, + "step": 398, + "token_acc": 0.8516811113159998, + "train_speed(iter/s)": 0.035037 + }, + { + "epoch": 0.07731434384537131, + "grad_norm": 0.10838499665260315, + "learning_rate": 0.00029939663641070496, + "loss": 0.46940740942955017, + "memory(GiB)": 78.26, + "step": 399, + "token_acc": 0.8676767140734452, + "train_speed(iter/s)": 0.035037 + }, + { + "epoch": 0.07750811413069805, + "grad_norm": 0.107694610953331, + "learning_rate": 0.0002993879920340288, + "loss": 0.4813489019870758, + "memory(GiB)": 78.26, + "step": 400, + "token_acc": 0.8637204826412022, + "train_speed(iter/s)": 0.035037 + }, + { + "epoch": 0.0777018844160248, + "grad_norm": 0.10585100203752518, + "learning_rate": 0.00029937928629969007, + "loss": 0.4711493253707886, + "memory(GiB)": 78.26, + "step": 401, + "token_acc": 0.868919624217119, + "train_speed(iter/s)": 0.034999 + }, + { + "epoch": 0.07789565470135154, + "grad_norm": 0.12802836298942566, + "learning_rate": 0.0002993705192112645, + "loss": 0.537087619304657, + "memory(GiB)": 78.26, + "step": 402, + "token_acc": 0.8504514311327399, + "train_speed(iter/s)": 0.035001 + }, + { + "epoch": 0.0780894249866783, + "grad_norm": 0.10905808210372925, + "learning_rate": 0.00029936169077235294, + "loss": 0.46871036291122437, + "memory(GiB)": 78.26, + "step": 403, + "token_acc": 0.8674802147324147, + "train_speed(iter/s)": 0.035002 + }, + { + "epoch": 0.07828319527200504, + "grad_norm": 0.1310214251279831, + "learning_rate": 0.0002993528009865815, + "loss": 0.5204190611839294, + "memory(GiB)": 78.26, + "step": 404, + "token_acc": 0.8552753875213625, + "train_speed(iter/s)": 0.035003 + }, + { + "epoch": 0.07847696555733179, + "grad_norm": 0.10810894519090652, + "learning_rate": 0.0002993438498576014, + "loss": 0.45181161165237427, + "memory(GiB)": 78.26, + "step": 405, + "token_acc": 0.8717410764238559, + "train_speed(iter/s)": 0.035004 + }, + { + "epoch": 0.07867073584265853, + "grad_norm": 0.10277310758829117, + "learning_rate": 0.0002993348373890891, + "loss": 0.4377118945121765, + "memory(GiB)": 78.26, + "step": 406, + "token_acc": 0.8732325819672131, + "train_speed(iter/s)": 0.035005 + }, + { + "epoch": 0.07886450612798528, + "grad_norm": 0.1221415251493454, + "learning_rate": 0.0002993257635847464, + "loss": 0.5307734608650208, + "memory(GiB)": 78.26, + "step": 407, + "token_acc": 0.8525828880078148, + "train_speed(iter/s)": 0.035005 + }, + { + "epoch": 0.07905827641331202, + "grad_norm": 0.11799792945384979, + "learning_rate": 0.0002993166284483, + "loss": 0.5113755464553833, + "memory(GiB)": 78.26, + "step": 408, + "token_acc": 0.8569156381218984, + "train_speed(iter/s)": 0.035006 + }, + { + "epoch": 0.07925204669863876, + "grad_norm": 0.11527646332979202, + "learning_rate": 0.000299307431983502, + "loss": 0.45992469787597656, + "memory(GiB)": 78.26, + "step": 409, + "token_acc": 0.8694049499736703, + "train_speed(iter/s)": 0.035007 + }, + { + "epoch": 0.07944581698396551, + "grad_norm": 0.11302848160266876, + "learning_rate": 0.00029929817419412964, + "loss": 0.492914617061615, + "memory(GiB)": 78.26, + "step": 410, + "token_acc": 0.8602246439421388, + "train_speed(iter/s)": 0.035007 + }, + { + "epoch": 0.07963958726929225, + "grad_norm": 0.11444272100925446, + "learning_rate": 0.0002992888550839853, + "loss": 0.5189880728721619, + "memory(GiB)": 78.26, + "step": 411, + "token_acc": 0.8538719731479262, + "train_speed(iter/s)": 0.035009 + }, + { + "epoch": 0.079833357554619, + "grad_norm": 0.1215919554233551, + "learning_rate": 0.0002992794746568967, + "loss": 0.5156800150871277, + "memory(GiB)": 78.26, + "step": 412, + "token_acc": 0.8556100806786991, + "train_speed(iter/s)": 0.03501 + }, + { + "epoch": 0.08002712783994574, + "grad_norm": 0.10883322358131409, + "learning_rate": 0.0002992700329167166, + "loss": 0.4452913999557495, + "memory(GiB)": 78.26, + "step": 413, + "token_acc": 0.8729308276689324, + "train_speed(iter/s)": 0.03501 + }, + { + "epoch": 0.08022089812527249, + "grad_norm": 0.10672541707754135, + "learning_rate": 0.00029926052986732285, + "loss": 0.4543689787387848, + "memory(GiB)": 78.26, + "step": 414, + "token_acc": 0.8705642256902761, + "train_speed(iter/s)": 0.03501 + }, + { + "epoch": 0.08041466841059923, + "grad_norm": 0.10857294499874115, + "learning_rate": 0.00029925096551261873, + "loss": 0.48616546392440796, + "memory(GiB)": 78.26, + "step": 415, + "token_acc": 0.8620314125989207, + "train_speed(iter/s)": 0.035011 + }, + { + "epoch": 0.08060843869592597, + "grad_norm": 0.11831134557723999, + "learning_rate": 0.0002992413398565325, + "loss": 0.48336830735206604, + "memory(GiB)": 78.26, + "step": 416, + "token_acc": 0.8604266578468662, + "train_speed(iter/s)": 0.035011 + }, + { + "epoch": 0.08080220898125272, + "grad_norm": 0.12000903487205505, + "learning_rate": 0.0002992316529030178, + "loss": 0.49910253286361694, + "memory(GiB)": 78.26, + "step": 417, + "token_acc": 0.857624620965228, + "train_speed(iter/s)": 0.035012 + }, + { + "epoch": 0.08099597926657948, + "grad_norm": 0.11360272020101547, + "learning_rate": 0.0002992219046560532, + "loss": 0.4726894199848175, + "memory(GiB)": 78.26, + "step": 418, + "token_acc": 0.8649808638600328, + "train_speed(iter/s)": 0.035013 + }, + { + "epoch": 0.08118974955190622, + "grad_norm": 0.11910063773393631, + "learning_rate": 0.0002992120951196426, + "loss": 0.49129027128219604, + "memory(GiB)": 78.26, + "step": 419, + "token_acc": 0.8614566125740939, + "train_speed(iter/s)": 0.035013 + }, + { + "epoch": 0.08138351983723296, + "grad_norm": 0.11072386801242828, + "learning_rate": 0.0002992022242978151, + "loss": 0.4478093385696411, + "memory(GiB)": 78.26, + "step": 420, + "token_acc": 0.8705150236942144, + "train_speed(iter/s)": 0.035014 + }, + { + "epoch": 0.08157729012255971, + "grad_norm": 0.11215101927518845, + "learning_rate": 0.0002991922921946248, + "loss": 0.43505847454071045, + "memory(GiB)": 78.26, + "step": 421, + "token_acc": 0.8739453295629583, + "train_speed(iter/s)": 0.035015 + }, + { + "epoch": 0.08177106040788645, + "grad_norm": 0.12848572432994843, + "learning_rate": 0.0002991822988141512, + "loss": 0.4977684020996094, + "memory(GiB)": 78.26, + "step": 422, + "token_acc": 0.8609223368850855, + "train_speed(iter/s)": 0.035016 + }, + { + "epoch": 0.0819648306932132, + "grad_norm": 0.10522928833961487, + "learning_rate": 0.0002991722441604988, + "loss": 0.415419340133667, + "memory(GiB)": 78.26, + "step": 423, + "token_acc": 0.880544936757575, + "train_speed(iter/s)": 0.035017 + }, + { + "epoch": 0.08215860097853994, + "grad_norm": 0.12996944785118103, + "learning_rate": 0.00029916212823779723, + "loss": 0.4946865737438202, + "memory(GiB)": 78.26, + "step": 424, + "token_acc": 0.8606087064986903, + "train_speed(iter/s)": 0.035018 + }, + { + "epoch": 0.08235237126386669, + "grad_norm": 0.1072884202003479, + "learning_rate": 0.0002991519510502015, + "loss": 0.44100552797317505, + "memory(GiB)": 78.26, + "step": 425, + "token_acc": 0.8742977528089888, + "train_speed(iter/s)": 0.035018 + }, + { + "epoch": 0.08254614154919343, + "grad_norm": 0.11559458076953888, + "learning_rate": 0.0002991417126018916, + "loss": 0.4932180643081665, + "memory(GiB)": 78.26, + "step": 426, + "token_acc": 0.8621398554887082, + "train_speed(iter/s)": 0.035018 + }, + { + "epoch": 0.08273991183452017, + "grad_norm": 0.10428472608327866, + "learning_rate": 0.00029913141289707277, + "loss": 0.44228169322013855, + "memory(GiB)": 78.26, + "step": 427, + "token_acc": 0.8711459857697936, + "train_speed(iter/s)": 0.035018 + }, + { + "epoch": 0.08293368211984692, + "grad_norm": 0.11177484691143036, + "learning_rate": 0.0002991210519399753, + "loss": 0.46969670057296753, + "memory(GiB)": 78.26, + "step": 428, + "token_acc": 0.8665406640525826, + "train_speed(iter/s)": 0.035019 + }, + { + "epoch": 0.08312745240517366, + "grad_norm": 0.12332018464803696, + "learning_rate": 0.00029911062973485476, + "loss": 0.5035005807876587, + "memory(GiB)": 78.26, + "step": 429, + "token_acc": 0.856846298426235, + "train_speed(iter/s)": 0.03502 + }, + { + "epoch": 0.08332122269050041, + "grad_norm": 0.11600656807422638, + "learning_rate": 0.00029910014628599184, + "loss": 0.45422035455703735, + "memory(GiB)": 78.26, + "step": 430, + "token_acc": 0.8697181133128663, + "train_speed(iter/s)": 0.035021 + }, + { + "epoch": 0.08351499297582715, + "grad_norm": 0.12605808675289154, + "learning_rate": 0.0002990896015976924, + "loss": 0.48837852478027344, + "memory(GiB)": 78.26, + "step": 431, + "token_acc": 0.8621029303127309, + "train_speed(iter/s)": 0.035023 + }, + { + "epoch": 0.0837087632611539, + "grad_norm": 0.11456278711557388, + "learning_rate": 0.00029907899567428736, + "loss": 0.48890623450279236, + "memory(GiB)": 78.26, + "step": 432, + "token_acc": 0.8624089155593656, + "train_speed(iter/s)": 0.035022 + }, + { + "epoch": 0.08390253354648065, + "grad_norm": 0.11591736972332001, + "learning_rate": 0.0002990683285201329, + "loss": 0.4833714962005615, + "memory(GiB)": 78.26, + "step": 433, + "token_acc": 0.8619619484549993, + "train_speed(iter/s)": 0.035023 + }, + { + "epoch": 0.0840963038318074, + "grad_norm": 0.11160556972026825, + "learning_rate": 0.00029905760013961024, + "loss": 0.5017392635345459, + "memory(GiB)": 78.26, + "step": 434, + "token_acc": 0.857612434705058, + "train_speed(iter/s)": 0.035023 + }, + { + "epoch": 0.08429007411713414, + "grad_norm": 0.11990434676408768, + "learning_rate": 0.000299046810537126, + "loss": 0.5158663988113403, + "memory(GiB)": 78.26, + "step": 435, + "token_acc": 0.8545139761525788, + "train_speed(iter/s)": 0.035025 + }, + { + "epoch": 0.08448384440246089, + "grad_norm": 0.1072223111987114, + "learning_rate": 0.0002990359597171115, + "loss": 0.4850725531578064, + "memory(GiB)": 78.26, + "step": 436, + "token_acc": 0.8633244854009116, + "train_speed(iter/s)": 0.035025 + }, + { + "epoch": 0.08467761468778763, + "grad_norm": 0.1028514951467514, + "learning_rate": 0.00029902504768402363, + "loss": 0.4222199618816376, + "memory(GiB)": 78.26, + "step": 437, + "token_acc": 0.8768839397139292, + "train_speed(iter/s)": 0.035025 + }, + { + "epoch": 0.08487138497311437, + "grad_norm": 0.12057259678840637, + "learning_rate": 0.0002990140744423443, + "loss": 0.4803910255432129, + "memory(GiB)": 78.26, + "step": 438, + "token_acc": 0.8642994549535108, + "train_speed(iter/s)": 0.035027 + }, + { + "epoch": 0.08506515525844112, + "grad_norm": 0.11526720970869064, + "learning_rate": 0.0002990030399965803, + "loss": 0.505075216293335, + "memory(GiB)": 78.26, + "step": 439, + "token_acc": 0.8569081317921075, + "train_speed(iter/s)": 0.035027 + }, + { + "epoch": 0.08525892554376786, + "grad_norm": 0.115642249584198, + "learning_rate": 0.000298991944351264, + "loss": 0.4736385643482208, + "memory(GiB)": 78.26, + "step": 440, + "token_acc": 0.8655344655344656, + "train_speed(iter/s)": 0.035027 + }, + { + "epoch": 0.08545269582909461, + "grad_norm": 0.10979454219341278, + "learning_rate": 0.0002989807875109525, + "loss": 0.4606488347053528, + "memory(GiB)": 78.26, + "step": 441, + "token_acc": 0.8691641871787936, + "train_speed(iter/s)": 0.035028 + }, + { + "epoch": 0.08564646611442135, + "grad_norm": 0.1096881628036499, + "learning_rate": 0.0002989695694802284, + "loss": 0.46464937925338745, + "memory(GiB)": 78.26, + "step": 442, + "token_acc": 0.8675828444373007, + "train_speed(iter/s)": 0.035028 + }, + { + "epoch": 0.0858402363997481, + "grad_norm": 0.11683948338031769, + "learning_rate": 0.0002989582902636991, + "loss": 0.5202597379684448, + "memory(GiB)": 78.26, + "step": 443, + "token_acc": 0.8540174192205939, + "train_speed(iter/s)": 0.035029 + }, + { + "epoch": 0.08603400668507484, + "grad_norm": 0.11013835668563843, + "learning_rate": 0.00029894694986599735, + "loss": 0.45811498165130615, + "memory(GiB)": 78.26, + "step": 444, + "token_acc": 0.8691748066748066, + "train_speed(iter/s)": 0.035029 + }, + { + "epoch": 0.08622777697040158, + "grad_norm": 0.12029381096363068, + "learning_rate": 0.0002989355482917809, + "loss": 0.5269613862037659, + "memory(GiB)": 78.26, + "step": 445, + "token_acc": 0.8544889657602255, + "train_speed(iter/s)": 0.03503 + }, + { + "epoch": 0.08642154725572833, + "grad_norm": 0.11967018991708755, + "learning_rate": 0.00029892408554573266, + "loss": 0.5077260732650757, + "memory(GiB)": 78.26, + "step": 446, + "token_acc": 0.8567639257294429, + "train_speed(iter/s)": 0.035031 + }, + { + "epoch": 0.08661531754105507, + "grad_norm": 0.1161133274435997, + "learning_rate": 0.00029891256163256085, + "loss": 0.49124279618263245, + "memory(GiB)": 78.26, + "step": 447, + "token_acc": 0.8613902094308108, + "train_speed(iter/s)": 0.035031 + }, + { + "epoch": 0.08680908782638183, + "grad_norm": 0.11311367154121399, + "learning_rate": 0.0002989009765569985, + "loss": 0.4679524898529053, + "memory(GiB)": 78.26, + "step": 448, + "token_acc": 0.8683576233183856, + "train_speed(iter/s)": 0.035033 + }, + { + "epoch": 0.08700285811170858, + "grad_norm": 0.1288428157567978, + "learning_rate": 0.00029888933032380394, + "loss": 0.5291860103607178, + "memory(GiB)": 78.26, + "step": 449, + "token_acc": 0.850703275825085, + "train_speed(iter/s)": 0.035034 + }, + { + "epoch": 0.08719662839703532, + "grad_norm": 0.1069454550743103, + "learning_rate": 0.0002988776229377606, + "loss": 0.44400641322135925, + "memory(GiB)": 78.26, + "step": 450, + "token_acc": 0.8715773445359871, + "train_speed(iter/s)": 0.035035 + }, + { + "epoch": 0.08739039868236206, + "grad_norm": 0.12414740771055222, + "learning_rate": 0.00029886585440367703, + "loss": 0.49137061834335327, + "memory(GiB)": 78.26, + "step": 451, + "token_acc": 0.8623676960997535, + "train_speed(iter/s)": 0.035036 + }, + { + "epoch": 0.08758416896768881, + "grad_norm": 0.11944427341222763, + "learning_rate": 0.0002988540247263869, + "loss": 0.4897725582122803, + "memory(GiB)": 78.26, + "step": 452, + "token_acc": 0.8620128910176807, + "train_speed(iter/s)": 0.035037 + }, + { + "epoch": 0.08777793925301555, + "grad_norm": 0.11256846785545349, + "learning_rate": 0.0002988421339107489, + "loss": 0.45858433842658997, + "memory(GiB)": 78.26, + "step": 453, + "token_acc": 0.8707719882217386, + "train_speed(iter/s)": 0.035037 + }, + { + "epoch": 0.0879717095383423, + "grad_norm": 0.11249680817127228, + "learning_rate": 0.000298830181961647, + "loss": 0.49964413046836853, + "memory(GiB)": 78.26, + "step": 454, + "token_acc": 0.8624331586368352, + "train_speed(iter/s)": 0.035038 + }, + { + "epoch": 0.08816547982366904, + "grad_norm": 0.11055975407361984, + "learning_rate": 0.00029881816888399014, + "loss": 0.458358496427536, + "memory(GiB)": 78.26, + "step": 455, + "token_acc": 0.8681250838813582, + "train_speed(iter/s)": 0.035039 + }, + { + "epoch": 0.08835925010899578, + "grad_norm": 0.11990530788898468, + "learning_rate": 0.0002988060946827124, + "loss": 0.51871657371521, + "memory(GiB)": 78.26, + "step": 456, + "token_acc": 0.8554156908665106, + "train_speed(iter/s)": 0.035039 + }, + { + "epoch": 0.08855302039432253, + "grad_norm": 0.13425587117671967, + "learning_rate": 0.00029879395936277303, + "loss": 0.5187729597091675, + "memory(GiB)": 78.26, + "step": 457, + "token_acc": 0.8552357434987268, + "train_speed(iter/s)": 0.03504 + }, + { + "epoch": 0.08874679067964927, + "grad_norm": 0.12039537727832794, + "learning_rate": 0.0002987817629291563, + "loss": 0.5168625116348267, + "memory(GiB)": 78.26, + "step": 458, + "token_acc": 0.852051777849393, + "train_speed(iter/s)": 0.035041 + }, + { + "epoch": 0.08894056096497602, + "grad_norm": 0.10525672137737274, + "learning_rate": 0.0002987695053868716, + "loss": 0.47165447473526, + "memory(GiB)": 78.26, + "step": 459, + "token_acc": 0.8652151769798828, + "train_speed(iter/s)": 0.03504 + }, + { + "epoch": 0.08913433125030276, + "grad_norm": 0.12413229048252106, + "learning_rate": 0.00029875718674095346, + "loss": 0.4889269471168518, + "memory(GiB)": 78.26, + "step": 460, + "token_acc": 0.8654352837852674, + "train_speed(iter/s)": 0.03504 + }, + { + "epoch": 0.0893281015356295, + "grad_norm": 0.11871378868818283, + "learning_rate": 0.00029874480699646145, + "loss": 0.47212713956832886, + "memory(GiB)": 78.26, + "step": 461, + "token_acc": 0.8666805704006882, + "train_speed(iter/s)": 0.03504 + }, + { + "epoch": 0.08952187182095625, + "grad_norm": 0.11364120990037918, + "learning_rate": 0.0002987323661584803, + "loss": 0.4883894622325897, + "memory(GiB)": 78.26, + "step": 462, + "token_acc": 0.8616672536642194, + "train_speed(iter/s)": 0.035041 + }, + { + "epoch": 0.08971564210628301, + "grad_norm": 0.11412378400564194, + "learning_rate": 0.00029871986423211976, + "loss": 0.48642444610595703, + "memory(GiB)": 78.26, + "step": 463, + "token_acc": 0.8636845270483118, + "train_speed(iter/s)": 0.035042 + }, + { + "epoch": 0.08990941239160975, + "grad_norm": 0.12467852979898453, + "learning_rate": 0.0002987073012225147, + "loss": 0.537085235118866, + "memory(GiB)": 78.26, + "step": 464, + "token_acc": 0.851391779396462, + "train_speed(iter/s)": 0.035042 + }, + { + "epoch": 0.0901031826769365, + "grad_norm": 0.11268845200538635, + "learning_rate": 0.00029869467713482516, + "loss": 0.5060177445411682, + "memory(GiB)": 78.26, + "step": 465, + "token_acc": 0.8563837385366968, + "train_speed(iter/s)": 0.035042 + }, + { + "epoch": 0.09029695296226324, + "grad_norm": 0.1154741421341896, + "learning_rate": 0.00029868199197423607, + "loss": 0.5034860372543335, + "memory(GiB)": 78.26, + "step": 466, + "token_acc": 0.8582380491378634, + "train_speed(iter/s)": 0.035043 + }, + { + "epoch": 0.09049072324758999, + "grad_norm": 0.11442988365888596, + "learning_rate": 0.0002986692457459577, + "loss": 0.4907042980194092, + "memory(GiB)": 78.26, + "step": 467, + "token_acc": 0.8586531036135595, + "train_speed(iter/s)": 0.035045 + }, + { + "epoch": 0.09068449353291673, + "grad_norm": 0.1217266321182251, + "learning_rate": 0.00029865643845522515, + "loss": 0.5039050579071045, + "memory(GiB)": 78.26, + "step": 468, + "token_acc": 0.859063377160585, + "train_speed(iter/s)": 0.035045 + }, + { + "epoch": 0.09087826381824347, + "grad_norm": 0.11309139430522919, + "learning_rate": 0.00029864357010729885, + "loss": 0.459963858127594, + "memory(GiB)": 78.26, + "step": 469, + "token_acc": 0.8677241875771288, + "train_speed(iter/s)": 0.035045 + }, + { + "epoch": 0.09107203410357022, + "grad_norm": 0.10612141340970993, + "learning_rate": 0.00029863064070746406, + "loss": 0.4284835755825043, + "memory(GiB)": 78.26, + "step": 470, + "token_acc": 0.8769310129503435, + "train_speed(iter/s)": 0.035045 + }, + { + "epoch": 0.09126580438889696, + "grad_norm": 0.13069190084934235, + "learning_rate": 0.00029861765026103126, + "loss": 0.570850670337677, + "memory(GiB)": 78.26, + "step": 471, + "token_acc": 0.8367923900751679, + "train_speed(iter/s)": 0.035047 + }, + { + "epoch": 0.0914595746742237, + "grad_norm": 0.11362801492214203, + "learning_rate": 0.000298604598773336, + "loss": 0.47750890254974365, + "memory(GiB)": 78.26, + "step": 472, + "token_acc": 0.8643800970372665, + "train_speed(iter/s)": 0.035048 + }, + { + "epoch": 0.09165334495955045, + "grad_norm": 0.11324970424175262, + "learning_rate": 0.0002985914862497388, + "loss": 0.47157877683639526, + "memory(GiB)": 78.26, + "step": 473, + "token_acc": 0.8665956123989698, + "train_speed(iter/s)": 0.035049 + }, + { + "epoch": 0.0918471152448772, + "grad_norm": 0.11150199174880981, + "learning_rate": 0.00029857831269562544, + "loss": 0.47126272320747375, + "memory(GiB)": 78.26, + "step": 474, + "token_acc": 0.8654503990877993, + "train_speed(iter/s)": 0.035049 + }, + { + "epoch": 0.09204088553020394, + "grad_norm": 0.10614330321550369, + "learning_rate": 0.00029856507811640667, + "loss": 0.4606248736381531, + "memory(GiB)": 78.26, + "step": 475, + "token_acc": 0.8700272670069217, + "train_speed(iter/s)": 0.03505 + }, + { + "epoch": 0.09223465581553068, + "grad_norm": 0.11242176592350006, + "learning_rate": 0.0002985517825175181, + "loss": 0.46226733922958374, + "memory(GiB)": 78.26, + "step": 476, + "token_acc": 0.8658786323456982, + "train_speed(iter/s)": 0.03505 + }, + { + "epoch": 0.09242842610085743, + "grad_norm": 0.11099898815155029, + "learning_rate": 0.0002985384259044208, + "loss": 0.4800049662590027, + "memory(GiB)": 78.26, + "step": 477, + "token_acc": 0.8640604175921812, + "train_speed(iter/s)": 0.03505 + }, + { + "epoch": 0.09262219638618417, + "grad_norm": 0.11142181605100632, + "learning_rate": 0.0002985250082826005, + "loss": 0.48013511300086975, + "memory(GiB)": 78.26, + "step": 478, + "token_acc": 0.8613092239573668, + "train_speed(iter/s)": 0.03505 + }, + { + "epoch": 0.09281596667151093, + "grad_norm": 0.11062056571245193, + "learning_rate": 0.0002985115296575684, + "loss": 0.5107330083847046, + "memory(GiB)": 78.26, + "step": 479, + "token_acc": 0.8574547485466223, + "train_speed(iter/s)": 0.03505 + }, + { + "epoch": 0.09300973695683767, + "grad_norm": 0.10769648849964142, + "learning_rate": 0.00029849799003486035, + "loss": 0.4654473066329956, + "memory(GiB)": 78.26, + "step": 480, + "token_acc": 0.8700014251104461, + "train_speed(iter/s)": 0.035052 + }, + { + "epoch": 0.09320350724216442, + "grad_norm": 0.11566044390201569, + "learning_rate": 0.00029848438942003746, + "loss": 0.5077210664749146, + "memory(GiB)": 78.26, + "step": 481, + "token_acc": 0.857971693966654, + "train_speed(iter/s)": 0.035053 + }, + { + "epoch": 0.09339727752749116, + "grad_norm": 0.10409853607416153, + "learning_rate": 0.00029847072781868597, + "loss": 0.4633353352546692, + "memory(GiB)": 78.26, + "step": 482, + "token_acc": 0.8679831912361242, + "train_speed(iter/s)": 0.035053 + }, + { + "epoch": 0.0935910478128179, + "grad_norm": 0.11067858338356018, + "learning_rate": 0.00029845700523641695, + "loss": 0.4580146074295044, + "memory(GiB)": 78.26, + "step": 483, + "token_acc": 0.8679867986798679, + "train_speed(iter/s)": 0.035054 + }, + { + "epoch": 0.09378481809814465, + "grad_norm": 0.11449804157018661, + "learning_rate": 0.0002984432216788667, + "loss": 0.4559538662433624, + "memory(GiB)": 78.26, + "step": 484, + "token_acc": 0.8718022752208812, + "train_speed(iter/s)": 0.035054 + }, + { + "epoch": 0.0939785883834714, + "grad_norm": 0.11818825453519821, + "learning_rate": 0.0002984293771516965, + "loss": 0.5102649331092834, + "memory(GiB)": 78.26, + "step": 485, + "token_acc": 0.8580015707122576, + "train_speed(iter/s)": 0.035055 + }, + { + "epoch": 0.09417235866879814, + "grad_norm": 0.11284741759300232, + "learning_rate": 0.00029841547166059264, + "loss": 0.48310020565986633, + "memory(GiB)": 78.26, + "step": 486, + "token_acc": 0.8661896877956481, + "train_speed(iter/s)": 0.035056 + }, + { + "epoch": 0.09436612895412488, + "grad_norm": 0.12485720217227936, + "learning_rate": 0.0002984015052112665, + "loss": 0.506294310092926, + "memory(GiB)": 78.26, + "step": 487, + "token_acc": 0.8588789601949635, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09455989923945163, + "grad_norm": 0.10859735310077667, + "learning_rate": 0.0002983874778094545, + "loss": 0.4783238172531128, + "memory(GiB)": 78.26, + "step": 488, + "token_acc": 0.8641949593834618, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09475366952477837, + "grad_norm": 0.10904448479413986, + "learning_rate": 0.00029837338946091794, + "loss": 0.4522710144519806, + "memory(GiB)": 78.26, + "step": 489, + "token_acc": 0.8690276365871533, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09494743981010512, + "grad_norm": 0.10352976620197296, + "learning_rate": 0.0002983592401714435, + "loss": 0.4140471816062927, + "memory(GiB)": 78.26, + "step": 490, + "token_acc": 0.8802987811455075, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09514121009543186, + "grad_norm": 0.10894495248794556, + "learning_rate": 0.00029834502994684247, + "loss": 0.4381650686264038, + "memory(GiB)": 78.26, + "step": 491, + "token_acc": 0.875124829777576, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.0953349803807586, + "grad_norm": 0.10618970543146133, + "learning_rate": 0.00029833075879295146, + "loss": 0.44262564182281494, + "memory(GiB)": 78.26, + "step": 492, + "token_acc": 0.8727838336831989, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09552875066608535, + "grad_norm": 0.11135976016521454, + "learning_rate": 0.00029831642671563203, + "loss": 0.504035472869873, + "memory(GiB)": 78.26, + "step": 493, + "token_acc": 0.8569741773191643, + "train_speed(iter/s)": 0.035057 + }, + { + "epoch": 0.09572252095141211, + "grad_norm": 0.11450373381376266, + "learning_rate": 0.00029830203372077077, + "loss": 0.46363234519958496, + "memory(GiB)": 78.26, + "step": 494, + "token_acc": 0.8658075691564747, + "train_speed(iter/s)": 0.035058 + }, + { + "epoch": 0.09591629123673885, + "grad_norm": 0.1127939373254776, + "learning_rate": 0.0002982875798142791, + "loss": 0.4617582857608795, + "memory(GiB)": 78.26, + "step": 495, + "token_acc": 0.8662511720258121, + "train_speed(iter/s)": 0.035058 + }, + { + "epoch": 0.0961100615220656, + "grad_norm": 0.11518420279026031, + "learning_rate": 0.00029827306500209387, + "loss": 0.45998328924179077, + "memory(GiB)": 78.26, + "step": 496, + "token_acc": 0.8671692940370117, + "train_speed(iter/s)": 0.035059 + }, + { + "epoch": 0.09630383180739234, + "grad_norm": 0.10046973824501038, + "learning_rate": 0.0002982584892901766, + "loss": 0.4228544235229492, + "memory(GiB)": 78.26, + "step": 497, + "token_acc": 0.8777845109683675, + "train_speed(iter/s)": 0.035059 + }, + { + "epoch": 0.09649760209271908, + "grad_norm": 0.10890112817287445, + "learning_rate": 0.00029824385268451394, + "loss": 0.43357470631599426, + "memory(GiB)": 78.26, + "step": 498, + "token_acc": 0.875686566090231, + "train_speed(iter/s)": 0.035059 + }, + { + "epoch": 0.09669137237804583, + "grad_norm": 0.1078747883439064, + "learning_rate": 0.0002982291551911174, + "loss": 0.4050006866455078, + "memory(GiB)": 78.26, + "step": 499, + "token_acc": 0.8838355027744645, + "train_speed(iter/s)": 0.035059 + }, + { + "epoch": 0.09688514266337257, + "grad_norm": 0.12394701689481735, + "learning_rate": 0.0002982143968160238, + "loss": 0.42825788259506226, + "memory(GiB)": 78.26, + "step": 500, + "token_acc": 0.8775999553895054, + "train_speed(iter/s)": 0.03506 + }, + { + "epoch": 0.09688514266337257, + "eval_loss": 0.5390220284461975, + "eval_runtime": 1346.167, + "eval_samples_per_second": 5.013, + "eval_steps_per_second": 5.013, + "eval_token_acc": 0.8670573123458688, + "step": 500 + }, + { + "epoch": 0.09707891294869932, + "grad_norm": 0.11574912816286087, + "learning_rate": 0.0002981995775652948, + "loss": 0.4860585331916809, + "memory(GiB)": 78.26, + "step": 501, + "token_acc": 0.8617111447871986, + "train_speed(iter/s)": 0.03204 + }, + { + "epoch": 0.09727268323402606, + "grad_norm": 0.10734662413597107, + "learning_rate": 0.000298184697445017, + "loss": 0.48478803038597107, + "memory(GiB)": 78.26, + "step": 502, + "token_acc": 0.8619134645052572, + "train_speed(iter/s)": 0.032046 + }, + { + "epoch": 0.0974664535193528, + "grad_norm": 0.10531099885702133, + "learning_rate": 0.00029816975646130206, + "loss": 0.40947991609573364, + "memory(GiB)": 78.26, + "step": 503, + "token_acc": 0.8847219204866391, + "train_speed(iter/s)": 0.032052 + }, + { + "epoch": 0.09766022380467955, + "grad_norm": 0.11789926886558533, + "learning_rate": 0.0002981547546202867, + "loss": 0.48241478204727173, + "memory(GiB)": 78.26, + "step": 504, + "token_acc": 0.8646450249428649, + "train_speed(iter/s)": 0.032058 + }, + { + "epoch": 0.0978539940900063, + "grad_norm": 0.12707959115505219, + "learning_rate": 0.0002981396919281325, + "loss": 0.5034913420677185, + "memory(GiB)": 78.26, + "step": 505, + "token_acc": 0.8592644506741423, + "train_speed(iter/s)": 0.032064 + }, + { + "epoch": 0.09804776437533304, + "grad_norm": 0.11420562863349915, + "learning_rate": 0.0002981245683910262, + "loss": 0.4770240783691406, + "memory(GiB)": 78.26, + "step": 506, + "token_acc": 0.8642017671785194, + "train_speed(iter/s)": 0.03207 + }, + { + "epoch": 0.09824153466065978, + "grad_norm": 0.11006432771682739, + "learning_rate": 0.00029810938401517937, + "loss": 0.4580528140068054, + "memory(GiB)": 78.26, + "step": 507, + "token_acc": 0.8687396218329851, + "train_speed(iter/s)": 0.032076 + }, + { + "epoch": 0.09843530494598653, + "grad_norm": 0.11123590171337128, + "learning_rate": 0.00029809413880682866, + "loss": 0.4647199213504791, + "memory(GiB)": 78.26, + "step": 508, + "token_acc": 0.8687078223879421, + "train_speed(iter/s)": 0.032082 + }, + { + "epoch": 0.09862907523131328, + "grad_norm": 0.11409325152635574, + "learning_rate": 0.00029807883277223573, + "loss": 0.44523417949676514, + "memory(GiB)": 78.26, + "step": 509, + "token_acc": 0.8726046297715775, + "train_speed(iter/s)": 0.032087 + }, + { + "epoch": 0.09882284551664003, + "grad_norm": 0.10299310833215714, + "learning_rate": 0.00029806346591768713, + "loss": 0.4694310128688812, + "memory(GiB)": 78.26, + "step": 510, + "token_acc": 0.8651645115862245, + "train_speed(iter/s)": 0.032093 + }, + { + "epoch": 0.09901661580196677, + "grad_norm": 0.11479201167821884, + "learning_rate": 0.0002980480382494945, + "loss": 0.4353720545768738, + "memory(GiB)": 78.26, + "step": 511, + "token_acc": 0.8747248514197666, + "train_speed(iter/s)": 0.032098 + }, + { + "epoch": 0.09921038608729352, + "grad_norm": 0.12371563911437988, + "learning_rate": 0.0002980325497739943, + "loss": 0.4968646466732025, + "memory(GiB)": 78.26, + "step": 512, + "token_acc": 0.8599567167456011, + "train_speed(iter/s)": 0.032104 + }, + { + "epoch": 0.09940415637262026, + "grad_norm": 0.12062571942806244, + "learning_rate": 0.00029801700049754816, + "loss": 0.5140507817268372, + "memory(GiB)": 78.26, + "step": 513, + "token_acc": 0.8532771431740046, + "train_speed(iter/s)": 0.03211 + }, + { + "epoch": 0.099597926657947, + "grad_norm": 0.11467831581830978, + "learning_rate": 0.0002980013904265425, + "loss": 0.4433932900428772, + "memory(GiB)": 78.26, + "step": 514, + "token_acc": 0.8714406689954483, + "train_speed(iter/s)": 0.032115 + }, + { + "epoch": 0.09979169694327375, + "grad_norm": 0.10191385447978973, + "learning_rate": 0.00029798571956738887, + "loss": 0.44032424688339233, + "memory(GiB)": 78.26, + "step": 515, + "token_acc": 0.8739991993594876, + "train_speed(iter/s)": 0.032121 + }, + { + "epoch": 0.0999854672286005, + "grad_norm": 0.1040392592549324, + "learning_rate": 0.00029796998792652366, + "loss": 0.43922415375709534, + "memory(GiB)": 78.26, + "step": 516, + "token_acc": 0.8740507096930729, + "train_speed(iter/s)": 0.032126 + }, + { + "epoch": 0.10017923751392724, + "grad_norm": 0.1119905412197113, + "learning_rate": 0.00029795419551040833, + "loss": 0.4818880259990692, + "memory(GiB)": 78.26, + "step": 517, + "token_acc": 0.8635854743792163, + "train_speed(iter/s)": 0.032131 + }, + { + "epoch": 0.10037300779925398, + "grad_norm": 0.10882623493671417, + "learning_rate": 0.00029793834232552923, + "loss": 0.4754379093647003, + "memory(GiB)": 78.26, + "step": 518, + "token_acc": 0.8650517086330936, + "train_speed(iter/s)": 0.032137 + }, + { + "epoch": 0.10056677808458073, + "grad_norm": 0.11377903819084167, + "learning_rate": 0.00029792242837839764, + "loss": 0.4642985165119171, + "memory(GiB)": 78.26, + "step": 519, + "token_acc": 0.8642178631297238, + "train_speed(iter/s)": 0.032143 + }, + { + "epoch": 0.10076054836990747, + "grad_norm": 0.1146177351474762, + "learning_rate": 0.0002979064536755499, + "loss": 0.47592130303382874, + "memory(GiB)": 78.26, + "step": 520, + "token_acc": 0.8636443661971831, + "train_speed(iter/s)": 0.032148 + }, + { + "epoch": 0.10095431865523422, + "grad_norm": 0.1103118285536766, + "learning_rate": 0.00029789041822354725, + "loss": 0.4815801978111267, + "memory(GiB)": 78.26, + "step": 521, + "token_acc": 0.864503921461087, + "train_speed(iter/s)": 0.032153 + }, + { + "epoch": 0.10114808894056096, + "grad_norm": 0.10768909007310867, + "learning_rate": 0.00029787432202897586, + "loss": 0.45158839225769043, + "memory(GiB)": 78.26, + "step": 522, + "token_acc": 0.8713771448091021, + "train_speed(iter/s)": 0.032159 + }, + { + "epoch": 0.1013418592258877, + "grad_norm": 0.11492832750082016, + "learning_rate": 0.00029785816509844687, + "loss": 0.510797381401062, + "memory(GiB)": 78.26, + "step": 523, + "token_acc": 0.8555613215706626, + "train_speed(iter/s)": 0.032164 + }, + { + "epoch": 0.10153562951121446, + "grad_norm": 0.11431033164262772, + "learning_rate": 0.00029784194743859635, + "loss": 0.4578917324542999, + "memory(GiB)": 78.26, + "step": 524, + "token_acc": 0.8709328500014617, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.1017293997965412, + "grad_norm": 0.11284809559583664, + "learning_rate": 0.00029782566905608537, + "loss": 0.5204124450683594, + "memory(GiB)": 78.26, + "step": 525, + "token_acc": 0.8546162111927511, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.10192317008186795, + "grad_norm": 0.10947126150131226, + "learning_rate": 0.00029780932995759993, + "loss": 0.47740259766578674, + "memory(GiB)": 78.26, + "step": 526, + "token_acc": 0.8662353223100887, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.1021169403671947, + "grad_norm": 0.10436630249023438, + "learning_rate": 0.0002977929301498508, + "loss": 0.41452687978744507, + "memory(GiB)": 78.26, + "step": 527, + "token_acc": 0.8821526069681326, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.10231071065252144, + "grad_norm": 0.10980913788080215, + "learning_rate": 0.00029777646963957395, + "loss": 0.4662986397743225, + "memory(GiB)": 78.26, + "step": 528, + "token_acc": 0.8664124246946333, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.10250448093784818, + "grad_norm": 0.12164165824651718, + "learning_rate": 0.00029775994843353015, + "loss": 0.5069164037704468, + "memory(GiB)": 78.26, + "step": 529, + "token_acc": 0.855151571221513, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.10269825122317493, + "grad_norm": 0.10582577437162399, + "learning_rate": 0.000297743366538505, + "loss": 0.48306143283843994, + "memory(GiB)": 78.26, + "step": 530, + "token_acc": 0.8629473684210527, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.10289202150850167, + "grad_norm": 0.10868332535028458, + "learning_rate": 0.00029772672396130914, + "loss": 0.46425890922546387, + "memory(GiB)": 78.26, + "step": 531, + "token_acc": 0.867730867643638, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.10308579179382842, + "grad_norm": 0.10470977425575256, + "learning_rate": 0.0002977100207087783, + "loss": 0.4376215934753418, + "memory(GiB)": 78.26, + "step": 532, + "token_acc": 0.8725883947271159, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.10327956207915516, + "grad_norm": 0.10088858008384705, + "learning_rate": 0.0002976932567877728, + "loss": 0.44405412673950195, + "memory(GiB)": 78.26, + "step": 533, + "token_acc": 0.8700791271440507, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.1034733323644819, + "grad_norm": 0.11034450680017471, + "learning_rate": 0.00029767643220517803, + "loss": 0.4524500370025635, + "memory(GiB)": 78.26, + "step": 534, + "token_acc": 0.8718685509782921, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.10366710264980865, + "grad_norm": 0.1017063558101654, + "learning_rate": 0.0002976595469679044, + "loss": 0.45182546973228455, + "memory(GiB)": 78.26, + "step": 535, + "token_acc": 0.8708247343881372, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.10386087293513539, + "grad_norm": 0.12204176932573318, + "learning_rate": 0.000297642601082887, + "loss": 0.4296923875808716, + "memory(GiB)": 78.26, + "step": 536, + "token_acc": 0.8769677967208831, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.10405464322046214, + "grad_norm": 0.11646491289138794, + "learning_rate": 0.00029762559455708606, + "loss": 0.5057516694068909, + "memory(GiB)": 78.26, + "step": 537, + "token_acc": 0.8552889453890566, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.10424841350578888, + "grad_norm": 0.1092115193605423, + "learning_rate": 0.00029760852739748656, + "loss": 0.4715476334095001, + "memory(GiB)": 78.26, + "step": 538, + "token_acc": 0.8671120470554063, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.10444218379111564, + "grad_norm": 0.10526008903980255, + "learning_rate": 0.00029759139961109843, + "loss": 0.4452831447124481, + "memory(GiB)": 78.26, + "step": 539, + "token_acc": 0.8757942310248571, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.10463595407644238, + "grad_norm": 0.10792571306228638, + "learning_rate": 0.00029757421120495657, + "loss": 0.4550265669822693, + "memory(GiB)": 78.26, + "step": 540, + "token_acc": 0.8692174444670878, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.10482972436176913, + "grad_norm": 0.13169942796230316, + "learning_rate": 0.00029755696218612075, + "loss": 0.4817619323730469, + "memory(GiB)": 78.26, + "step": 541, + "token_acc": 0.8636959761549925, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.10502349464709587, + "grad_norm": 0.1143367663025856, + "learning_rate": 0.0002975396525616755, + "loss": 0.4964498281478882, + "memory(GiB)": 78.26, + "step": 542, + "token_acc": 0.8611530156476469, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.10521726493242262, + "grad_norm": 0.11364971101284027, + "learning_rate": 0.0002975222823387304, + "loss": 0.5033693313598633, + "memory(GiB)": 78.26, + "step": 543, + "token_acc": 0.8565000138033846, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.10541103521774936, + "grad_norm": 0.11851288378238678, + "learning_rate": 0.0002975048515244199, + "loss": 0.4963114261627197, + "memory(GiB)": 78.26, + "step": 544, + "token_acc": 0.8588933277681713, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.1056048055030761, + "grad_norm": 0.10574258863925934, + "learning_rate": 0.00029748736012590325, + "loss": 0.422842800617218, + "memory(GiB)": 78.26, + "step": 545, + "token_acc": 0.8765757477808995, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.10579857578840285, + "grad_norm": 0.11669930815696716, + "learning_rate": 0.00029746980815036463, + "loss": 0.504793107509613, + "memory(GiB)": 78.26, + "step": 546, + "token_acc": 0.8590873623674248, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.10599234607372959, + "grad_norm": 0.11788391321897507, + "learning_rate": 0.00029745219560501317, + "loss": 0.5016002058982849, + "memory(GiB)": 78.26, + "step": 547, + "token_acc": 0.8547953818015547, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.10618611635905634, + "grad_norm": 0.1270827203989029, + "learning_rate": 0.0002974345224970828, + "loss": 0.49133074283599854, + "memory(GiB)": 78.26, + "step": 548, + "token_acc": 0.8635658391797085, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.10637988664438308, + "grad_norm": 0.11762789636850357, + "learning_rate": 0.0002974167888338323, + "loss": 0.4606111943721771, + "memory(GiB)": 78.26, + "step": 549, + "token_acc": 0.8683581749914666, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.10657365692970983, + "grad_norm": 0.11224767565727234, + "learning_rate": 0.00029739899462254534, + "loss": 0.4845007658004761, + "memory(GiB)": 78.26, + "step": 550, + "token_acc": 0.8617370771520969, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.10676742721503657, + "grad_norm": 0.12319989502429962, + "learning_rate": 0.00029738113987053057, + "loss": 0.5313453078269958, + "memory(GiB)": 78.26, + "step": 551, + "token_acc": 0.8511862414550404, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.10696119750036331, + "grad_norm": 0.11050526797771454, + "learning_rate": 0.00029736322458512137, + "loss": 0.4900805354118347, + "memory(GiB)": 78.26, + "step": 552, + "token_acc": 0.8623392457978925, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.10715496778569006, + "grad_norm": 0.10729413479566574, + "learning_rate": 0.000297345248773676, + "loss": 0.42731186747550964, + "memory(GiB)": 78.26, + "step": 553, + "token_acc": 0.8775543301978592, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.1073487380710168, + "grad_norm": 0.11160185933113098, + "learning_rate": 0.00029732721244357766, + "loss": 0.43972256779670715, + "memory(GiB)": 78.26, + "step": 554, + "token_acc": 0.8772162864568294, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.10754250835634356, + "grad_norm": 0.1065671369433403, + "learning_rate": 0.0002973091156022343, + "loss": 0.4213421940803528, + "memory(GiB)": 78.26, + "step": 555, + "token_acc": 0.8795612510860121, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.1077362786416703, + "grad_norm": 0.12415304034948349, + "learning_rate": 0.0002972909582570789, + "loss": 0.5269736051559448, + "memory(GiB)": 78.26, + "step": 556, + "token_acc": 0.8518212183871481, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.10793004892699705, + "grad_norm": 0.11127987504005432, + "learning_rate": 0.00029727274041556903, + "loss": 0.43144795298576355, + "memory(GiB)": 78.26, + "step": 557, + "token_acc": 0.8748079670107538, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.1081238192123238, + "grad_norm": 0.12338504940271378, + "learning_rate": 0.0002972544620851873, + "loss": 0.5014538764953613, + "memory(GiB)": 78.26, + "step": 558, + "token_acc": 0.8602012135858572, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.10831758949765054, + "grad_norm": 0.11730282008647919, + "learning_rate": 0.0002972361232734411, + "loss": 0.5035312175750732, + "memory(GiB)": 78.26, + "step": 559, + "token_acc": 0.857784964507907, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.10851135978297728, + "grad_norm": 0.11394292116165161, + "learning_rate": 0.00029721772398786267, + "loss": 0.5110200047492981, + "memory(GiB)": 78.26, + "step": 560, + "token_acc": 0.8569026399865478, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.10870513006830403, + "grad_norm": 0.10496672242879868, + "learning_rate": 0.0002971992642360091, + "loss": 0.4619162976741791, + "memory(GiB)": 78.26, + "step": 561, + "token_acc": 0.8678721452368306, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.10889890035363077, + "grad_norm": 0.11340762674808502, + "learning_rate": 0.0002971807440254623, + "loss": 0.4587838649749756, + "memory(GiB)": 78.26, + "step": 562, + "token_acc": 0.869656622284513, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.10909267063895751, + "grad_norm": 0.11113093048334122, + "learning_rate": 0.0002971621633638291, + "loss": 0.4310606122016907, + "memory(GiB)": 78.26, + "step": 563, + "token_acc": 0.8775456994698572, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.10928644092428426, + "grad_norm": 0.11764062941074371, + "learning_rate": 0.00029714352225874096, + "loss": 0.4371579587459564, + "memory(GiB)": 78.26, + "step": 564, + "token_acc": 0.8744286816643912, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.109480211209611, + "grad_norm": 0.12444280833005905, + "learning_rate": 0.00029712482071785436, + "loss": 0.4438256621360779, + "memory(GiB)": 78.26, + "step": 565, + "token_acc": 0.8732139048837705, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.10967398149493775, + "grad_norm": 0.1286301612854004, + "learning_rate": 0.0002971060587488505, + "loss": 0.48562532663345337, + "memory(GiB)": 78.26, + "step": 566, + "token_acc": 0.8602812731310141, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.10986775178026449, + "grad_norm": 0.10392966866493225, + "learning_rate": 0.00029708723635943536, + "loss": 0.41354262828826904, + "memory(GiB)": 78.26, + "step": 567, + "token_acc": 0.8819511093547112, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.11006152206559124, + "grad_norm": 0.12217994779348373, + "learning_rate": 0.00029706835355733987, + "loss": 0.4606168568134308, + "memory(GiB)": 78.26, + "step": 568, + "token_acc": 0.8694803015044706, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.11025529235091798, + "grad_norm": 0.10960107296705246, + "learning_rate": 0.00029704941035031977, + "loss": 0.4657290577888489, + "memory(GiB)": 78.26, + "step": 569, + "token_acc": 0.8707625272331154, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.11044906263624474, + "grad_norm": 0.1072949692606926, + "learning_rate": 0.0002970304067461554, + "loss": 0.47186678647994995, + "memory(GiB)": 78.26, + "step": 570, + "token_acc": 0.8641140360004161, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.11064283292157148, + "grad_norm": 0.1146392896771431, + "learning_rate": 0.0002970113427526521, + "loss": 0.460860013961792, + "memory(GiB)": 78.26, + "step": 571, + "token_acc": 0.8702294444126566, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.11083660320689823, + "grad_norm": 0.11065780371427536, + "learning_rate": 0.00029699221837764, + "loss": 0.4562651515007019, + "memory(GiB)": 78.26, + "step": 572, + "token_acc": 0.8716815289879143, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.11103037349222497, + "grad_norm": 0.10410984605550766, + "learning_rate": 0.0002969730336289741, + "loss": 0.410195916891098, + "memory(GiB)": 78.26, + "step": 573, + "token_acc": 0.8815111480133825, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.11122414377755171, + "grad_norm": 0.1023547351360321, + "learning_rate": 0.0002969537885145338, + "loss": 0.44055572152137756, + "memory(GiB)": 78.26, + "step": 574, + "token_acc": 0.8741572450990561, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.11141791406287846, + "grad_norm": 0.11351530253887177, + "learning_rate": 0.00029693448304222384, + "loss": 0.48370641469955444, + "memory(GiB)": 78.26, + "step": 575, + "token_acc": 0.8646675213198818, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.1116116843482052, + "grad_norm": 0.10755743831396103, + "learning_rate": 0.0002969151172199734, + "loss": 0.4608403444290161, + "memory(GiB)": 78.26, + "step": 576, + "token_acc": 0.8711053970511804, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.11180545463353195, + "grad_norm": 0.11767973005771637, + "learning_rate": 0.00029689569105573654, + "loss": 0.4457208216190338, + "memory(GiB)": 78.26, + "step": 577, + "token_acc": 0.8742761124613676, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.11199922491885869, + "grad_norm": 0.10909876972436905, + "learning_rate": 0.0002968762045574921, + "loss": 0.4398786425590515, + "memory(GiB)": 78.26, + "step": 578, + "token_acc": 0.8746623305095859, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.11219299520418544, + "grad_norm": 0.10908223688602448, + "learning_rate": 0.0002968566577332438, + "loss": 0.4776487350463867, + "memory(GiB)": 78.26, + "step": 579, + "token_acc": 0.8675198893116569, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.11238676548951218, + "grad_norm": 0.11661187559366226, + "learning_rate": 0.0002968370505910199, + "loss": 0.4867633581161499, + "memory(GiB)": 78.26, + "step": 580, + "token_acc": 0.8616142945163278, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.11258053577483892, + "grad_norm": 0.10946566611528397, + "learning_rate": 0.0002968173831388737, + "loss": 0.4538075029850006, + "memory(GiB)": 78.26, + "step": 581, + "token_acc": 0.8708697120254691, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.11277430606016567, + "grad_norm": 0.10774602741003036, + "learning_rate": 0.00029679765538488315, + "loss": 0.4607040286064148, + "memory(GiB)": 78.26, + "step": 582, + "token_acc": 0.8665272975798022, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.11296807634549241, + "grad_norm": 0.11124128848314285, + "learning_rate": 0.00029677786733715085, + "loss": 0.48860326409339905, + "memory(GiB)": 78.26, + "step": 583, + "token_acc": 0.8632434789299546, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.11316184663081916, + "grad_norm": 0.11296314746141434, + "learning_rate": 0.00029675801900380444, + "loss": 0.44037532806396484, + "memory(GiB)": 78.26, + "step": 584, + "token_acc": 0.8732820680628273, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.11335561691614592, + "grad_norm": 0.12023019045591354, + "learning_rate": 0.000296738110392996, + "loss": 0.4782663881778717, + "memory(GiB)": 78.26, + "step": 585, + "token_acc": 0.8658891296908863, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.11354938720147266, + "grad_norm": 0.10668137669563293, + "learning_rate": 0.0002967181415129027, + "loss": 0.46215036511421204, + "memory(GiB)": 78.26, + "step": 586, + "token_acc": 0.870391061452514, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.1137431574867994, + "grad_norm": 0.11411837488412857, + "learning_rate": 0.00029669811237172615, + "loss": 0.48195797204971313, + "memory(GiB)": 78.26, + "step": 587, + "token_acc": 0.8628970636393407, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.11393692777212615, + "grad_norm": 0.11026424914598465, + "learning_rate": 0.0002966780229776929, + "loss": 0.4768543541431427, + "memory(GiB)": 78.26, + "step": 588, + "token_acc": 0.8641793084597512, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.11413069805745289, + "grad_norm": 0.09665640443563461, + "learning_rate": 0.0002966578733390543, + "loss": 0.40953487157821655, + "memory(GiB)": 78.26, + "step": 589, + "token_acc": 0.8820130831304016, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.11432446834277964, + "grad_norm": 0.10579517483711243, + "learning_rate": 0.00029663766346408623, + "loss": 0.4386206269264221, + "memory(GiB)": 78.26, + "step": 590, + "token_acc": 0.8759878419452888, + "train_speed(iter/s)": 0.032492 + }, + { + "epoch": 0.11451823862810638, + "grad_norm": 0.1087721437215805, + "learning_rate": 0.00029661739336108947, + "loss": 0.456497460603714, + "memory(GiB)": 78.26, + "step": 591, + "token_acc": 0.8705190602070312, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.11471200891343313, + "grad_norm": 0.11170576512813568, + "learning_rate": 0.0002965970630383895, + "loss": 0.5045605897903442, + "memory(GiB)": 78.26, + "step": 592, + "token_acc": 0.8581936467002697, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.11490577919875987, + "grad_norm": 0.10968808829784393, + "learning_rate": 0.00029657667250433645, + "loss": 0.45187485218048096, + "memory(GiB)": 78.26, + "step": 593, + "token_acc": 0.8707526605975126, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.11509954948408661, + "grad_norm": 0.12064143270254135, + "learning_rate": 0.00029655622176730543, + "loss": 0.48971986770629883, + "memory(GiB)": 78.26, + "step": 594, + "token_acc": 0.8627284832344753, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.11529331976941336, + "grad_norm": 0.10990637540817261, + "learning_rate": 0.000296535710835696, + "loss": 0.46801620721817017, + "memory(GiB)": 78.26, + "step": 595, + "token_acc": 0.8661632053702538, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.1154870900547401, + "grad_norm": 0.11462108045816422, + "learning_rate": 0.00029651513971793255, + "loss": 0.4742628037929535, + "memory(GiB)": 78.26, + "step": 596, + "token_acc": 0.8639128007756265, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.11568086034006685, + "grad_norm": 0.10829971730709076, + "learning_rate": 0.0002964945084224642, + "loss": 0.47385069727897644, + "memory(GiB)": 78.26, + "step": 597, + "token_acc": 0.8652491420019095, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.11587463062539359, + "grad_norm": 0.1086253821849823, + "learning_rate": 0.00029647381695776474, + "loss": 0.4455265998840332, + "memory(GiB)": 78.26, + "step": 598, + "token_acc": 0.872541041111267, + "train_speed(iter/s)": 0.032528 + }, + { + "epoch": 0.11606840091072033, + "grad_norm": 0.11207325756549835, + "learning_rate": 0.0002964530653323328, + "loss": 0.4860404133796692, + "memory(GiB)": 78.26, + "step": 599, + "token_acc": 0.8606821106821106, + "train_speed(iter/s)": 0.032532 + }, + { + "epoch": 0.11626217119604709, + "grad_norm": 0.11669515818357468, + "learning_rate": 0.0002964322535546916, + "loss": 0.5320515632629395, + "memory(GiB)": 78.26, + "step": 600, + "token_acc": 0.8462377317339149, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.11645594148137384, + "grad_norm": 0.10914132744073868, + "learning_rate": 0.00029641138163338907, + "loss": 0.4786490797996521, + "memory(GiB)": 78.26, + "step": 601, + "token_acc": 0.8650941795350415, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.11664971176670058, + "grad_norm": 0.10866682976484299, + "learning_rate": 0.0002963904495769978, + "loss": 0.4337434768676758, + "memory(GiB)": 78.26, + "step": 602, + "token_acc": 0.875823794658342, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.11684348205202733, + "grad_norm": 0.11499880999326706, + "learning_rate": 0.0002963694573941153, + "loss": 0.46066945791244507, + "memory(GiB)": 78.26, + "step": 603, + "token_acc": 0.8695754925999232, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.11703725233735407, + "grad_norm": 0.09975297749042511, + "learning_rate": 0.0002963484050933636, + "loss": 0.4278182089328766, + "memory(GiB)": 78.26, + "step": 604, + "token_acc": 0.8778897736293872, + "train_speed(iter/s)": 0.032533 + }, + { + "epoch": 0.11723102262268081, + "grad_norm": 0.11780435591936111, + "learning_rate": 0.0002963272926833893, + "loss": 0.4575609862804413, + "memory(GiB)": 78.26, + "step": 605, + "token_acc": 0.8685906319290465, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.11742479290800756, + "grad_norm": 0.10950589925050735, + "learning_rate": 0.00029630612017286393, + "loss": 0.43958431482315063, + "memory(GiB)": 78.26, + "step": 606, + "token_acc": 0.8741350906095552, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.1176185631933343, + "grad_norm": 0.11566930264234543, + "learning_rate": 0.00029628488757048365, + "loss": 0.4670230448246002, + "memory(GiB)": 78.26, + "step": 607, + "token_acc": 0.8678823105807314, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.11781233347866105, + "grad_norm": 0.11282102763652802, + "learning_rate": 0.00029626359488496914, + "loss": 0.4526304006576538, + "memory(GiB)": 78.26, + "step": 608, + "token_acc": 0.8723770983213429, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.11800610376398779, + "grad_norm": 0.10305222868919373, + "learning_rate": 0.0002962422421250661, + "loss": 0.437641978263855, + "memory(GiB)": 78.26, + "step": 609, + "token_acc": 0.8741574159921818, + "train_speed(iter/s)": 0.032555 + }, + { + "epoch": 0.11819987404931454, + "grad_norm": 0.10737847536802292, + "learning_rate": 0.0002962208292995444, + "loss": 0.46929752826690674, + "memory(GiB)": 78.26, + "step": 610, + "token_acc": 0.8661016542219875, + "train_speed(iter/s)": 0.032559 + }, + { + "epoch": 0.11839364433464128, + "grad_norm": 0.11264858394861221, + "learning_rate": 0.00029619935641719906, + "loss": 0.4528808295726776, + "memory(GiB)": 78.26, + "step": 611, + "token_acc": 0.8737685426339032, + "train_speed(iter/s)": 0.032563 + }, + { + "epoch": 0.11858741461996802, + "grad_norm": 0.10752102732658386, + "learning_rate": 0.00029617782348684946, + "loss": 0.46101704239845276, + "memory(GiB)": 78.26, + "step": 612, + "token_acc": 0.8676157977176102, + "train_speed(iter/s)": 0.032568 + }, + { + "epoch": 0.11878118490529477, + "grad_norm": 0.12059959769248962, + "learning_rate": 0.00029615623051733986, + "loss": 0.4816949665546417, + "memory(GiB)": 78.26, + "step": 613, + "token_acc": 0.8623071979434447, + "train_speed(iter/s)": 0.032572 + }, + { + "epoch": 0.11897495519062151, + "grad_norm": 0.11858416348695755, + "learning_rate": 0.00029613457751753903, + "loss": 0.4759201407432556, + "memory(GiB)": 78.26, + "step": 614, + "token_acc": 0.8641525737563264, + "train_speed(iter/s)": 0.032576 + }, + { + "epoch": 0.11916872547594827, + "grad_norm": 0.11951098591089249, + "learning_rate": 0.0002961128644963404, + "loss": 0.4854854643344879, + "memory(GiB)": 78.26, + "step": 615, + "token_acc": 0.8598115112756648, + "train_speed(iter/s)": 0.032581 + }, + { + "epoch": 0.11936249576127501, + "grad_norm": 0.11944432556629181, + "learning_rate": 0.0002960910914626621, + "loss": 0.49400413036346436, + "memory(GiB)": 78.26, + "step": 616, + "token_acc": 0.8579318625243885, + "train_speed(iter/s)": 0.032585 + }, + { + "epoch": 0.11955626604660176, + "grad_norm": 0.10993971675634384, + "learning_rate": 0.00029606925842544694, + "loss": 0.421941339969635, + "memory(GiB)": 78.26, + "step": 617, + "token_acc": 0.8773255961925123, + "train_speed(iter/s)": 0.032588 + }, + { + "epoch": 0.1197500363319285, + "grad_norm": 0.10016026347875595, + "learning_rate": 0.00029604736539366234, + "loss": 0.4218504726886749, + "memory(GiB)": 78.26, + "step": 618, + "token_acc": 0.8786656708986806, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.11994380661725525, + "grad_norm": 0.11316211521625519, + "learning_rate": 0.00029602541237630026, + "loss": 0.43415647745132446, + "memory(GiB)": 78.26, + "step": 619, + "token_acc": 0.8750876058008518, + "train_speed(iter/s)": 0.032595 + }, + { + "epoch": 0.12013757690258199, + "grad_norm": 0.12216460704803467, + "learning_rate": 0.00029600339938237746, + "loss": 0.4619022309780121, + "memory(GiB)": 78.26, + "step": 620, + "token_acc": 0.867951649787651, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.12033134718790874, + "grad_norm": 0.10427816212177277, + "learning_rate": 0.0002959813264209353, + "loss": 0.4056597054004669, + "memory(GiB)": 78.26, + "step": 621, + "token_acc": 0.8820620175323826, + "train_speed(iter/s)": 0.032603 + }, + { + "epoch": 0.12052511747323548, + "grad_norm": 0.113120436668396, + "learning_rate": 0.0002959591935010397, + "loss": 0.4769364595413208, + "memory(GiB)": 78.26, + "step": 622, + "token_acc": 0.8646665349552651, + "train_speed(iter/s)": 0.032607 + }, + { + "epoch": 0.12071888775856222, + "grad_norm": 0.10840658843517303, + "learning_rate": 0.00029593700063178127, + "loss": 0.4412975609302521, + "memory(GiB)": 78.26, + "step": 623, + "token_acc": 0.8720108105499385, + "train_speed(iter/s)": 0.032611 + }, + { + "epoch": 0.12091265804388897, + "grad_norm": 0.11971483379602432, + "learning_rate": 0.00029591474782227523, + "loss": 0.4542793333530426, + "memory(GiB)": 78.26, + "step": 624, + "token_acc": 0.8705333207178225, + "train_speed(iter/s)": 0.032616 + }, + { + "epoch": 0.12110642832921571, + "grad_norm": 0.1130179837346077, + "learning_rate": 0.00029589243508166136, + "loss": 0.48266369104385376, + "memory(GiB)": 78.26, + "step": 625, + "token_acc": 0.8646267140680548, + "train_speed(iter/s)": 0.032619 + }, + { + "epoch": 0.12130019861454246, + "grad_norm": 0.11181651800870895, + "learning_rate": 0.0002958700624191041, + "loss": 0.41792306303977966, + "memory(GiB)": 78.26, + "step": 626, + "token_acc": 0.8797664711191335, + "train_speed(iter/s)": 0.032623 + }, + { + "epoch": 0.1214939688998692, + "grad_norm": 0.11760863661766052, + "learning_rate": 0.00029584762984379253, + "loss": 0.49022918939590454, + "memory(GiB)": 78.26, + "step": 627, + "token_acc": 0.8612506898254378, + "train_speed(iter/s)": 0.032627 + }, + { + "epoch": 0.12168773918519595, + "grad_norm": 0.10427683591842651, + "learning_rate": 0.00029582513736494027, + "loss": 0.4219359755516052, + "memory(GiB)": 78.26, + "step": 628, + "token_acc": 0.8782604333868379, + "train_speed(iter/s)": 0.032631 + }, + { + "epoch": 0.12188150947052269, + "grad_norm": 0.10674095898866653, + "learning_rate": 0.00029580258499178566, + "loss": 0.4139460325241089, + "memory(GiB)": 78.26, + "step": 629, + "token_acc": 0.8805075685104946, + "train_speed(iter/s)": 0.032635 + }, + { + "epoch": 0.12207527975584945, + "grad_norm": 0.1084229126572609, + "learning_rate": 0.00029577997273359157, + "loss": 0.42610836029052734, + "memory(GiB)": 78.26, + "step": 630, + "token_acc": 0.8765509800395612, + "train_speed(iter/s)": 0.032639 + }, + { + "epoch": 0.12226905004117619, + "grad_norm": 0.12075889855623245, + "learning_rate": 0.00029575730059964534, + "loss": 0.47887349128723145, + "memory(GiB)": 78.26, + "step": 631, + "token_acc": 0.8641042884486059, + "train_speed(iter/s)": 0.032643 + }, + { + "epoch": 0.12246282032650294, + "grad_norm": 0.13688334822654724, + "learning_rate": 0.00029573456859925917, + "loss": 0.5096902847290039, + "memory(GiB)": 78.26, + "step": 632, + "token_acc": 0.8591979075850044, + "train_speed(iter/s)": 0.032647 + }, + { + "epoch": 0.12265659061182968, + "grad_norm": 0.09688639640808105, + "learning_rate": 0.0002957117767417696, + "loss": 0.4519810974597931, + "memory(GiB)": 78.26, + "step": 633, + "token_acc": 0.8703838308049426, + "train_speed(iter/s)": 0.032651 + }, + { + "epoch": 0.12285036089715642, + "grad_norm": 0.10635363310575485, + "learning_rate": 0.0002956889250365379, + "loss": 0.4540587067604065, + "memory(GiB)": 78.26, + "step": 634, + "token_acc": 0.8705026824989194, + "train_speed(iter/s)": 0.032654 + }, + { + "epoch": 0.12304413118248317, + "grad_norm": 0.1146511361002922, + "learning_rate": 0.00029566601349294985, + "loss": 0.44937869906425476, + "memory(GiB)": 78.26, + "step": 635, + "token_acc": 0.8692985300475573, + "train_speed(iter/s)": 0.032658 + }, + { + "epoch": 0.12323790146780991, + "grad_norm": 0.11808592826128006, + "learning_rate": 0.0002956430421204159, + "loss": 0.4610764980316162, + "memory(GiB)": 78.26, + "step": 636, + "token_acc": 0.86909560882543, + "train_speed(iter/s)": 0.032663 + }, + { + "epoch": 0.12343167175313666, + "grad_norm": 0.11165952682495117, + "learning_rate": 0.0002956200109283709, + "loss": 0.44256392121315, + "memory(GiB)": 78.26, + "step": 637, + "token_acc": 0.872780487804878, + "train_speed(iter/s)": 0.032667 + }, + { + "epoch": 0.1236254420384634, + "grad_norm": 0.11056765913963318, + "learning_rate": 0.0002955969199262745, + "loss": 0.44177088141441345, + "memory(GiB)": 78.26, + "step": 638, + "token_acc": 0.8738199880373443, + "train_speed(iter/s)": 0.032671 + }, + { + "epoch": 0.12381921232379015, + "grad_norm": 0.12657155096530914, + "learning_rate": 0.00029557376912361076, + "loss": 0.4548972249031067, + "memory(GiB)": 78.26, + "step": 639, + "token_acc": 0.8688163152817969, + "train_speed(iter/s)": 0.032675 + }, + { + "epoch": 0.12401298260911689, + "grad_norm": 0.1116451621055603, + "learning_rate": 0.00029555055852988836, + "loss": 0.44486871361732483, + "memory(GiB)": 78.26, + "step": 640, + "token_acc": 0.8735002147061053, + "train_speed(iter/s)": 0.032679 + }, + { + "epoch": 0.12420675289444363, + "grad_norm": 0.11732634902000427, + "learning_rate": 0.0002955272881546404, + "loss": 0.4493526816368103, + "memory(GiB)": 78.26, + "step": 641, + "token_acc": 0.8715556519852462, + "train_speed(iter/s)": 0.032683 + }, + { + "epoch": 0.12440052317977038, + "grad_norm": 0.11484182626008987, + "learning_rate": 0.00029550395800742477, + "loss": 0.4882141351699829, + "memory(GiB)": 78.26, + "step": 642, + "token_acc": 0.8629650457276468, + "train_speed(iter/s)": 0.032687 + }, + { + "epoch": 0.12459429346509712, + "grad_norm": 0.10542822629213333, + "learning_rate": 0.0002954805680978237, + "loss": 0.4319552779197693, + "memory(GiB)": 78.26, + "step": 643, + "token_acc": 0.8768453037188416, + "train_speed(iter/s)": 0.03269 + }, + { + "epoch": 0.12478806375042387, + "grad_norm": 0.10408841073513031, + "learning_rate": 0.0002954571184354441, + "loss": 0.402774840593338, + "memory(GiB)": 78.26, + "step": 644, + "token_acc": 0.8839004707464694, + "train_speed(iter/s)": 0.032694 + }, + { + "epoch": 0.12498183403575061, + "grad_norm": 0.10431966185569763, + "learning_rate": 0.0002954336090299174, + "loss": 0.4379886984825134, + "memory(GiB)": 78.26, + "step": 645, + "token_acc": 0.8783049896738991, + "train_speed(iter/s)": 0.032698 + }, + { + "epoch": 0.12517560432107736, + "grad_norm": 0.11031137406826019, + "learning_rate": 0.0002954100398908995, + "loss": 0.44032973051071167, + "memory(GiB)": 78.26, + "step": 646, + "token_acc": 0.8727701606958573, + "train_speed(iter/s)": 0.032702 + }, + { + "epoch": 0.1253693746064041, + "grad_norm": 0.11465989798307419, + "learning_rate": 0.000295386411028071, + "loss": 0.4586394429206848, + "memory(GiB)": 78.26, + "step": 647, + "token_acc": 0.8680211123783254, + "train_speed(iter/s)": 0.032705 + }, + { + "epoch": 0.12556314489173084, + "grad_norm": 0.10031208395957947, + "learning_rate": 0.0002953627224511367, + "loss": 0.4253405034542084, + "memory(GiB)": 78.26, + "step": 648, + "token_acc": 0.8783018191380011, + "train_speed(iter/s)": 0.032708 + }, + { + "epoch": 0.1257569151770576, + "grad_norm": 0.11166791617870331, + "learning_rate": 0.0002953389741698262, + "loss": 0.45778605341911316, + "memory(GiB)": 78.26, + "step": 649, + "token_acc": 0.8692058831525433, + "train_speed(iter/s)": 0.032712 + }, + { + "epoch": 0.12595068546238433, + "grad_norm": 0.13080237805843353, + "learning_rate": 0.0002953151661938937, + "loss": 0.4996272325515747, + "memory(GiB)": 78.26, + "step": 650, + "token_acc": 0.8591764266135061, + "train_speed(iter/s)": 0.032716 + }, + { + "epoch": 0.12614445574771108, + "grad_norm": 0.1140795424580574, + "learning_rate": 0.00029529129853311765, + "loss": 0.4572920799255371, + "memory(GiB)": 78.26, + "step": 651, + "token_acc": 0.8702007294029808, + "train_speed(iter/s)": 0.03272 + }, + { + "epoch": 0.12633822603303782, + "grad_norm": 0.10730257630348206, + "learning_rate": 0.00029526737119730113, + "loss": 0.4401112198829651, + "memory(GiB)": 78.26, + "step": 652, + "token_acc": 0.8765418681736673, + "train_speed(iter/s)": 0.032724 + }, + { + "epoch": 0.1265319963183646, + "grad_norm": 0.10705429315567017, + "learning_rate": 0.0002952433841962718, + "loss": 0.42639651894569397, + "memory(GiB)": 78.26, + "step": 653, + "token_acc": 0.87889592899469, + "train_speed(iter/s)": 0.032727 + }, + { + "epoch": 0.12672576660369134, + "grad_norm": 0.10336251556873322, + "learning_rate": 0.0002952193375398817, + "loss": 0.4257362484931946, + "memory(GiB)": 78.26, + "step": 654, + "token_acc": 0.8773996726028077, + "train_speed(iter/s)": 0.032731 + }, + { + "epoch": 0.12691953688901808, + "grad_norm": 0.11652205884456635, + "learning_rate": 0.0002951952312380075, + "loss": 0.4846917986869812, + "memory(GiB)": 78.26, + "step": 655, + "token_acc": 0.8638244781206257, + "train_speed(iter/s)": 0.032734 + }, + { + "epoch": 0.12711330717434483, + "grad_norm": 0.1111086755990982, + "learning_rate": 0.00029517106530055034, + "loss": 0.4347532093524933, + "memory(GiB)": 78.26, + "step": 656, + "token_acc": 0.8746101794645484, + "train_speed(iter/s)": 0.032738 + }, + { + "epoch": 0.12730707745967157, + "grad_norm": 0.11007381230592728, + "learning_rate": 0.0002951468397374357, + "loss": 0.44459158182144165, + "memory(GiB)": 78.26, + "step": 657, + "token_acc": 0.8722839608558633, + "train_speed(iter/s)": 0.032742 + }, + { + "epoch": 0.1275008477449983, + "grad_norm": 0.10285302996635437, + "learning_rate": 0.00029512255455861375, + "loss": 0.3928186297416687, + "memory(GiB)": 78.26, + "step": 658, + "token_acc": 0.8879543480957698, + "train_speed(iter/s)": 0.032745 + }, + { + "epoch": 0.12769461803032506, + "grad_norm": 0.1072237491607666, + "learning_rate": 0.00029509820977405906, + "loss": 0.4308614730834961, + "memory(GiB)": 78.26, + "step": 659, + "token_acc": 0.876323788978231, + "train_speed(iter/s)": 0.032749 + }, + { + "epoch": 0.1278883883156518, + "grad_norm": 0.12245503067970276, + "learning_rate": 0.0002950738053937707, + "loss": 0.49942547082901, + "memory(GiB)": 78.26, + "step": 660, + "token_acc": 0.8578174506458689, + "train_speed(iter/s)": 0.032753 + }, + { + "epoch": 0.12808215860097855, + "grad_norm": 0.10673464089632034, + "learning_rate": 0.0002950493414277721, + "loss": 0.4471551477909088, + "memory(GiB)": 78.26, + "step": 661, + "token_acc": 0.8700834952475975, + "train_speed(iter/s)": 0.032756 + }, + { + "epoch": 0.1282759288863053, + "grad_norm": 0.11559055000543594, + "learning_rate": 0.0002950248178861114, + "loss": 0.4593106806278229, + "memory(GiB)": 78.26, + "step": 662, + "token_acc": 0.8703804270157487, + "train_speed(iter/s)": 0.032761 + }, + { + "epoch": 0.12846969917163203, + "grad_norm": 0.11425399780273438, + "learning_rate": 0.000295000234778861, + "loss": 0.456471711397171, + "memory(GiB)": 78.26, + "step": 663, + "token_acc": 0.868738057655562, + "train_speed(iter/s)": 0.032764 + }, + { + "epoch": 0.12866346945695878, + "grad_norm": 0.11667713522911072, + "learning_rate": 0.0002949755921161179, + "loss": 0.4741016924381256, + "memory(GiB)": 78.26, + "step": 664, + "token_acc": 0.864152462756677, + "train_speed(iter/s)": 0.032768 + }, + { + "epoch": 0.12885723974228552, + "grad_norm": 0.1101280227303505, + "learning_rate": 0.0002949508899080035, + "loss": 0.4579119384288788, + "memory(GiB)": 78.26, + "step": 665, + "token_acc": 0.8712681302556577, + "train_speed(iter/s)": 0.032772 + }, + { + "epoch": 0.12905101002761227, + "grad_norm": 0.10604141652584076, + "learning_rate": 0.0002949261281646636, + "loss": 0.46181899309158325, + "memory(GiB)": 78.26, + "step": 666, + "token_acc": 0.8664379947229551, + "train_speed(iter/s)": 0.032775 + }, + { + "epoch": 0.129244780312939, + "grad_norm": 0.11616591364145279, + "learning_rate": 0.0002949013068962685, + "loss": 0.4584234952926636, + "memory(GiB)": 78.26, + "step": 667, + "token_acc": 0.8690121073872609, + "train_speed(iter/s)": 0.032778 + }, + { + "epoch": 0.12943855059826576, + "grad_norm": 0.11570385843515396, + "learning_rate": 0.00029487642611301305, + "loss": 0.4578292965888977, + "memory(GiB)": 78.26, + "step": 668, + "token_acc": 0.8677614050350715, + "train_speed(iter/s)": 0.032782 + }, + { + "epoch": 0.1296323208835925, + "grad_norm": 0.09622353315353394, + "learning_rate": 0.0002948514858251164, + "loss": 0.41976287961006165, + "memory(GiB)": 78.26, + "step": 669, + "token_acc": 0.8775399043243499, + "train_speed(iter/s)": 0.032785 + }, + { + "epoch": 0.12982609116891924, + "grad_norm": 0.10756651312112808, + "learning_rate": 0.0002948264860428223, + "loss": 0.41081976890563965, + "memory(GiB)": 78.26, + "step": 670, + "token_acc": 0.8800941338537481, + "train_speed(iter/s)": 0.032789 + }, + { + "epoch": 0.130019861454246, + "grad_norm": 0.11921778321266174, + "learning_rate": 0.00029480142677639864, + "loss": 0.5204399824142456, + "memory(GiB)": 78.26, + "step": 671, + "token_acc": 0.8542876514459217, + "train_speed(iter/s)": 0.032792 + }, + { + "epoch": 0.13021363173957273, + "grad_norm": 0.10356691479682922, + "learning_rate": 0.00029477630803613806, + "loss": 0.4329625368118286, + "memory(GiB)": 78.26, + "step": 672, + "token_acc": 0.8725223288914362, + "train_speed(iter/s)": 0.032795 + }, + { + "epoch": 0.13040740202489948, + "grad_norm": 0.11215896904468536, + "learning_rate": 0.0002947511298323575, + "loss": 0.4640699625015259, + "memory(GiB)": 78.26, + "step": 673, + "token_acc": 0.8692367949865711, + "train_speed(iter/s)": 0.032799 + }, + { + "epoch": 0.13060117231022622, + "grad_norm": 0.10207220911979675, + "learning_rate": 0.0002947258921753983, + "loss": 0.41831666231155396, + "memory(GiB)": 78.26, + "step": 674, + "token_acc": 0.8800374444184413, + "train_speed(iter/s)": 0.032802 + }, + { + "epoch": 0.13079494259555297, + "grad_norm": 0.11593617498874664, + "learning_rate": 0.0002947005950756262, + "loss": 0.4830693006515503, + "memory(GiB)": 78.26, + "step": 675, + "token_acc": 0.8652564297671588, + "train_speed(iter/s)": 0.032806 + }, + { + "epoch": 0.1309887128808797, + "grad_norm": 0.10835679620504379, + "learning_rate": 0.00029467523854343153, + "loss": 0.41629844903945923, + "memory(GiB)": 78.26, + "step": 676, + "token_acc": 0.8807510973825394, + "train_speed(iter/s)": 0.032809 + }, + { + "epoch": 0.13118248316620645, + "grad_norm": 0.1169678345322609, + "learning_rate": 0.00029464982258922874, + "loss": 0.43599221110343933, + "memory(GiB)": 78.26, + "step": 677, + "token_acc": 0.8760780894853193, + "train_speed(iter/s)": 0.032813 + }, + { + "epoch": 0.1313762534515332, + "grad_norm": 0.10601246356964111, + "learning_rate": 0.00029462434722345697, + "loss": 0.43275031447410583, + "memory(GiB)": 78.26, + "step": 678, + "token_acc": 0.8755334681042228, + "train_speed(iter/s)": 0.032817 + }, + { + "epoch": 0.13157002373685994, + "grad_norm": 0.10723179578781128, + "learning_rate": 0.0002945988124565796, + "loss": 0.432290643453598, + "memory(GiB)": 78.26, + "step": 679, + "token_acc": 0.8754735278964755, + "train_speed(iter/s)": 0.03282 + }, + { + "epoch": 0.1317637940221867, + "grad_norm": 0.1098715141415596, + "learning_rate": 0.0002945732182990844, + "loss": 0.46575891971588135, + "memory(GiB)": 78.26, + "step": 680, + "token_acc": 0.8681277056277056, + "train_speed(iter/s)": 0.032823 + }, + { + "epoch": 0.13195756430751343, + "grad_norm": 0.10466992110013962, + "learning_rate": 0.0002945475647614836, + "loss": 0.4427857995033264, + "memory(GiB)": 78.26, + "step": 681, + "token_acc": 0.8734357015159308, + "train_speed(iter/s)": 0.032827 + }, + { + "epoch": 0.13215133459284018, + "grad_norm": 0.10467381030321121, + "learning_rate": 0.0002945218518543138, + "loss": 0.4318023920059204, + "memory(GiB)": 78.26, + "step": 682, + "token_acc": 0.8751313343570678, + "train_speed(iter/s)": 0.03283 + }, + { + "epoch": 0.13234510487816692, + "grad_norm": 0.1041102334856987, + "learning_rate": 0.00029449607958813604, + "loss": 0.43351155519485474, + "memory(GiB)": 78.26, + "step": 683, + "token_acc": 0.873902149545035, + "train_speed(iter/s)": 0.032833 + }, + { + "epoch": 0.1325388751634937, + "grad_norm": 0.11077916622161865, + "learning_rate": 0.0002944702479735356, + "loss": 0.43510738015174866, + "memory(GiB)": 78.26, + "step": 684, + "token_acc": 0.8754168786388559, + "train_speed(iter/s)": 0.032837 + }, + { + "epoch": 0.13273264544882044, + "grad_norm": 0.10889974236488342, + "learning_rate": 0.0002944443570211223, + "loss": 0.4274645745754242, + "memory(GiB)": 78.26, + "step": 685, + "token_acc": 0.8780943484353106, + "train_speed(iter/s)": 0.032841 + }, + { + "epoch": 0.13292641573414718, + "grad_norm": 0.10870851576328278, + "learning_rate": 0.00029441840674153017, + "loss": 0.47596949338912964, + "memory(GiB)": 78.26, + "step": 686, + "token_acc": 0.863644912544668, + "train_speed(iter/s)": 0.032844 + }, + { + "epoch": 0.13312018601947392, + "grad_norm": 0.11301931738853455, + "learning_rate": 0.0002943923971454177, + "loss": 0.46415457129478455, + "memory(GiB)": 78.26, + "step": 687, + "token_acc": 0.8705835224996662, + "train_speed(iter/s)": 0.032847 + }, + { + "epoch": 0.13331395630480067, + "grad_norm": 0.10821959376335144, + "learning_rate": 0.0002943663282434678, + "loss": 0.43708160519599915, + "memory(GiB)": 78.26, + "step": 688, + "token_acc": 0.8739687603123969, + "train_speed(iter/s)": 0.03285 + }, + { + "epoch": 0.1335077265901274, + "grad_norm": 0.1074993684887886, + "learning_rate": 0.0002943402000463875, + "loss": 0.4149300754070282, + "memory(GiB)": 78.26, + "step": 689, + "token_acc": 0.8803831398155441, + "train_speed(iter/s)": 0.032854 + }, + { + "epoch": 0.13370149687545416, + "grad_norm": 0.11187649518251419, + "learning_rate": 0.0002943140125649086, + "loss": 0.4131820797920227, + "memory(GiB)": 78.26, + "step": 690, + "token_acc": 0.8804750593824228, + "train_speed(iter/s)": 0.032857 + }, + { + "epoch": 0.1338952671607809, + "grad_norm": 0.10465361177921295, + "learning_rate": 0.0002942877658097868, + "loss": 0.45171868801116943, + "memory(GiB)": 78.26, + "step": 691, + "token_acc": 0.8700957481747215, + "train_speed(iter/s)": 0.032861 + }, + { + "epoch": 0.13408903744610765, + "grad_norm": 0.12278591841459274, + "learning_rate": 0.00029426145979180243, + "loss": 0.4629440903663635, + "memory(GiB)": 78.26, + "step": 692, + "token_acc": 0.8690040563923638, + "train_speed(iter/s)": 0.032864 + }, + { + "epoch": 0.1342828077314344, + "grad_norm": 0.10109657794237137, + "learning_rate": 0.00029423509452176005, + "loss": 0.42186468839645386, + "memory(GiB)": 78.26, + "step": 693, + "token_acc": 0.877427268729604, + "train_speed(iter/s)": 0.032868 + }, + { + "epoch": 0.13447657801676113, + "grad_norm": 0.10865118354558945, + "learning_rate": 0.00029420867001048867, + "loss": 0.4467792510986328, + "memory(GiB)": 78.26, + "step": 694, + "token_acc": 0.8724129751527012, + "train_speed(iter/s)": 0.032871 + }, + { + "epoch": 0.13467034830208788, + "grad_norm": 0.1067957952618599, + "learning_rate": 0.0002941821862688414, + "loss": 0.43282240629196167, + "memory(GiB)": 78.26, + "step": 695, + "token_acc": 0.8747978812378032, + "train_speed(iter/s)": 0.032874 + }, + { + "epoch": 0.13486411858741462, + "grad_norm": 0.11119311302900314, + "learning_rate": 0.00029415564330769595, + "loss": 0.4267749488353729, + "memory(GiB)": 78.26, + "step": 696, + "token_acc": 0.8750176528738879, + "train_speed(iter/s)": 0.032877 + }, + { + "epoch": 0.13505788887274137, + "grad_norm": 0.1177472174167633, + "learning_rate": 0.00029412904113795417, + "loss": 0.4727480709552765, + "memory(GiB)": 78.26, + "step": 697, + "token_acc": 0.8640138408304499, + "train_speed(iter/s)": 0.032881 + }, + { + "epoch": 0.1352516591580681, + "grad_norm": 0.1068907380104065, + "learning_rate": 0.0002941023797705423, + "loss": 0.4286806583404541, + "memory(GiB)": 78.26, + "step": 698, + "token_acc": 0.8773055332798717, + "train_speed(iter/s)": 0.032884 + }, + { + "epoch": 0.13544542944339485, + "grad_norm": 0.10560762882232666, + "learning_rate": 0.00029407565921641093, + "loss": 0.43469613790512085, + "memory(GiB)": 78.26, + "step": 699, + "token_acc": 0.8766754544193146, + "train_speed(iter/s)": 0.032888 + }, + { + "epoch": 0.1356391997287216, + "grad_norm": 0.11222469061613083, + "learning_rate": 0.0002940488794865348, + "loss": 0.49300503730773926, + "memory(GiB)": 78.26, + "step": 700, + "token_acc": 0.8590341029442915, + "train_speed(iter/s)": 0.032891 + }, + { + "epoch": 0.13583297001404834, + "grad_norm": 0.10281093418598175, + "learning_rate": 0.0002940220405919131, + "loss": 0.40541547536849976, + "memory(GiB)": 78.26, + "step": 701, + "token_acc": 0.8827319887407821, + "train_speed(iter/s)": 0.032894 + }, + { + "epoch": 0.1360267402993751, + "grad_norm": 0.11664767563343048, + "learning_rate": 0.00029399514254356936, + "loss": 0.47185465693473816, + "memory(GiB)": 78.26, + "step": 702, + "token_acc": 0.8657896737890655, + "train_speed(iter/s)": 0.032897 + }, + { + "epoch": 0.13622051058470183, + "grad_norm": 0.11212712526321411, + "learning_rate": 0.0002939681853525512, + "loss": 0.44248533248901367, + "memory(GiB)": 78.26, + "step": 703, + "token_acc": 0.8741593492114769, + "train_speed(iter/s)": 0.032901 + }, + { + "epoch": 0.13641428087002858, + "grad_norm": 0.10552475601434708, + "learning_rate": 0.0002939411690299308, + "loss": 0.41650229692459106, + "memory(GiB)": 78.26, + "step": 704, + "token_acc": 0.8810408921933085, + "train_speed(iter/s)": 0.032904 + }, + { + "epoch": 0.13660805115535532, + "grad_norm": 0.11409718543291092, + "learning_rate": 0.0002939140935868044, + "loss": 0.42940688133239746, + "memory(GiB)": 78.26, + "step": 705, + "token_acc": 0.8761153646882736, + "train_speed(iter/s)": 0.032908 + }, + { + "epoch": 0.13680182144068206, + "grad_norm": 0.1144266277551651, + "learning_rate": 0.0002938869590342927, + "loss": 0.4688735604286194, + "memory(GiB)": 78.26, + "step": 706, + "token_acc": 0.8650337555096803, + "train_speed(iter/s)": 0.032911 + }, + { + "epoch": 0.1369955917260088, + "grad_norm": 0.11157376319169998, + "learning_rate": 0.0002938597653835405, + "loss": 0.45128777623176575, + "memory(GiB)": 78.26, + "step": 707, + "token_acc": 0.8709400486478752, + "train_speed(iter/s)": 0.032913 + }, + { + "epoch": 0.13718936201133555, + "grad_norm": 0.10916193574666977, + "learning_rate": 0.000293832512645717, + "loss": 0.43759334087371826, + "memory(GiB)": 78.26, + "step": 708, + "token_acc": 0.8740179495065283, + "train_speed(iter/s)": 0.032916 + }, + { + "epoch": 0.1373831322966623, + "grad_norm": 0.12670759856700897, + "learning_rate": 0.00029380520083201563, + "loss": 0.49803271889686584, + "memory(GiB)": 78.26, + "step": 709, + "token_acc": 0.8560126128708614, + "train_speed(iter/s)": 0.03292 + }, + { + "epoch": 0.13757690258198904, + "grad_norm": 0.11388320475816727, + "learning_rate": 0.00029377782995365404, + "loss": 0.43962639570236206, + "memory(GiB)": 78.26, + "step": 710, + "token_acc": 0.8743197278911564, + "train_speed(iter/s)": 0.032923 + }, + { + "epoch": 0.13777067286731579, + "grad_norm": 0.116911880671978, + "learning_rate": 0.0002937504000218743, + "loss": 0.46620821952819824, + "memory(GiB)": 78.26, + "step": 711, + "token_acc": 0.8658806794868288, + "train_speed(iter/s)": 0.032926 + }, + { + "epoch": 0.13796444315264253, + "grad_norm": 0.10047486424446106, + "learning_rate": 0.0002937229110479425, + "loss": 0.411388635635376, + "memory(GiB)": 78.26, + "step": 712, + "token_acc": 0.8812828160820372, + "train_speed(iter/s)": 0.032929 + }, + { + "epoch": 0.13815821343796927, + "grad_norm": 0.10067104548215866, + "learning_rate": 0.00029369536304314916, + "loss": 0.4194115698337555, + "memory(GiB)": 78.26, + "step": 713, + "token_acc": 0.8785104275426338, + "train_speed(iter/s)": 0.032932 + }, + { + "epoch": 0.13835198372329605, + "grad_norm": 0.10838618874549866, + "learning_rate": 0.0002936677560188089, + "loss": 0.46899646520614624, + "memory(GiB)": 78.26, + "step": 714, + "token_acc": 0.8667539476848265, + "train_speed(iter/s)": 0.032935 + }, + { + "epoch": 0.1385457540086228, + "grad_norm": 0.12074261903762817, + "learning_rate": 0.00029364008998626086, + "loss": 0.41880735754966736, + "memory(GiB)": 78.26, + "step": 715, + "token_acc": 0.8823898581649303, + "train_speed(iter/s)": 0.032938 + }, + { + "epoch": 0.13873952429394953, + "grad_norm": 0.11350797116756439, + "learning_rate": 0.00029361236495686806, + "loss": 0.4601813554763794, + "memory(GiB)": 78.26, + "step": 716, + "token_acc": 0.8699107434095424, + "train_speed(iter/s)": 0.032942 + }, + { + "epoch": 0.13893329457927628, + "grad_norm": 0.11661586165428162, + "learning_rate": 0.0002935845809420179, + "loss": 0.4675025939941406, + "memory(GiB)": 78.26, + "step": 717, + "token_acc": 0.8649476228847703, + "train_speed(iter/s)": 0.032945 + }, + { + "epoch": 0.13912706486460302, + "grad_norm": 0.11661852151155472, + "learning_rate": 0.0002935567379531222, + "loss": 0.47289931774139404, + "memory(GiB)": 78.26, + "step": 718, + "token_acc": 0.8682559598494354, + "train_speed(iter/s)": 0.032949 + }, + { + "epoch": 0.13932083514992977, + "grad_norm": 0.11107414215803146, + "learning_rate": 0.0002935288360016166, + "loss": 0.4379945397377014, + "memory(GiB)": 78.26, + "step": 719, + "token_acc": 0.8734506763257683, + "train_speed(iter/s)": 0.032952 + }, + { + "epoch": 0.1395146054352565, + "grad_norm": 0.1156369224190712, + "learning_rate": 0.00029350087509896137, + "loss": 0.4373580515384674, + "memory(GiB)": 78.26, + "step": 720, + "token_acc": 0.8751508606999936, + "train_speed(iter/s)": 0.032954 + }, + { + "epoch": 0.13970837572058326, + "grad_norm": 0.11756598204374313, + "learning_rate": 0.00029347285525664065, + "loss": 0.48482561111450195, + "memory(GiB)": 78.26, + "step": 721, + "token_acc": 0.8630889444773964, + "train_speed(iter/s)": 0.032958 + }, + { + "epoch": 0.13990214600591, + "grad_norm": 0.10152934491634369, + "learning_rate": 0.00029344477648616304, + "loss": 0.41498956084251404, + "memory(GiB)": 78.26, + "step": 722, + "token_acc": 0.881353398445286, + "train_speed(iter/s)": 0.032961 + }, + { + "epoch": 0.14009591629123674, + "grad_norm": 0.11098542809486389, + "learning_rate": 0.0002934166387990612, + "loss": 0.4146006107330322, + "memory(GiB)": 78.26, + "step": 723, + "token_acc": 0.8806968679173152, + "train_speed(iter/s)": 0.032963 + }, + { + "epoch": 0.1402896865765635, + "grad_norm": 0.11475761979818344, + "learning_rate": 0.00029338844220689204, + "loss": 0.4258284270763397, + "memory(GiB)": 78.26, + "step": 724, + "token_acc": 0.8758714403875694, + "train_speed(iter/s)": 0.032967 + }, + { + "epoch": 0.14048345686189023, + "grad_norm": 0.10699902474880219, + "learning_rate": 0.0002933601867212367, + "loss": 0.4143221974372864, + "memory(GiB)": 78.26, + "step": 725, + "token_acc": 0.8792861274064282, + "train_speed(iter/s)": 0.03297 + }, + { + "epoch": 0.14067722714721698, + "grad_norm": 0.11193379759788513, + "learning_rate": 0.0002933318723537004, + "loss": 0.4646386504173279, + "memory(GiB)": 78.26, + "step": 726, + "token_acc": 0.8705041600375577, + "train_speed(iter/s)": 0.032973 + }, + { + "epoch": 0.14087099743254372, + "grad_norm": 0.10847294330596924, + "learning_rate": 0.0002933034991159127, + "loss": 0.431307852268219, + "memory(GiB)": 78.26, + "step": 727, + "token_acc": 0.8776439295122638, + "train_speed(iter/s)": 0.032976 + }, + { + "epoch": 0.14106476771787047, + "grad_norm": 0.11553511023521423, + "learning_rate": 0.0002932750670195272, + "loss": 0.47172775864601135, + "memory(GiB)": 78.26, + "step": 728, + "token_acc": 0.8666254148619003, + "train_speed(iter/s)": 0.032979 + }, + { + "epoch": 0.1412585380031972, + "grad_norm": 0.12142791599035263, + "learning_rate": 0.0002932465760762217, + "loss": 0.4737962484359741, + "memory(GiB)": 78.26, + "step": 729, + "token_acc": 0.8672086720867209, + "train_speed(iter/s)": 0.032982 + }, + { + "epoch": 0.14145230828852395, + "grad_norm": 0.11232297122478485, + "learning_rate": 0.0002932180262976982, + "loss": 0.4628666341304779, + "memory(GiB)": 78.26, + "step": 730, + "token_acc": 0.8675850891410049, + "train_speed(iter/s)": 0.032985 + }, + { + "epoch": 0.1416460785738507, + "grad_norm": 0.11426133662462234, + "learning_rate": 0.0002931894176956829, + "loss": 0.48010843992233276, + "memory(GiB)": 78.26, + "step": 731, + "token_acc": 0.8621309527436702, + "train_speed(iter/s)": 0.032988 + }, + { + "epoch": 0.14183984885917744, + "grad_norm": 0.10444355756044388, + "learning_rate": 0.0002931607502819261, + "loss": 0.4301010072231293, + "memory(GiB)": 78.26, + "step": 732, + "token_acc": 0.8748947870489872, + "train_speed(iter/s)": 0.032991 + }, + { + "epoch": 0.1420336191445042, + "grad_norm": 0.10665851086378098, + "learning_rate": 0.0002931320240682023, + "loss": 0.4206262230873108, + "memory(GiB)": 78.26, + "step": 733, + "token_acc": 0.876729803932444, + "train_speed(iter/s)": 0.032994 + }, + { + "epoch": 0.14222738942983093, + "grad_norm": 0.10180269926786423, + "learning_rate": 0.00029310323906631006, + "loss": 0.41888511180877686, + "memory(GiB)": 78.26, + "step": 734, + "token_acc": 0.8802506609223538, + "train_speed(iter/s)": 0.032997 + }, + { + "epoch": 0.14242115971515767, + "grad_norm": 0.109284408390522, + "learning_rate": 0.00029307439528807223, + "loss": 0.45739686489105225, + "memory(GiB)": 78.26, + "step": 735, + "token_acc": 0.8659565024411895, + "train_speed(iter/s)": 0.033 + }, + { + "epoch": 0.14261493000048442, + "grad_norm": 0.11013077199459076, + "learning_rate": 0.0002930454927453357, + "loss": 0.45072418451309204, + "memory(GiB)": 78.26, + "step": 736, + "token_acc": 0.8717199543645825, + "train_speed(iter/s)": 0.033003 + }, + { + "epoch": 0.14280870028581116, + "grad_norm": 0.11075732111930847, + "learning_rate": 0.00029301653144997154, + "loss": 0.45079749822616577, + "memory(GiB)": 78.26, + "step": 737, + "token_acc": 0.8688188438181866, + "train_speed(iter/s)": 0.033006 + }, + { + "epoch": 0.1430024705711379, + "grad_norm": 0.12135940045118332, + "learning_rate": 0.0002929875114138749, + "loss": 0.4898369312286377, + "memory(GiB)": 78.26, + "step": 738, + "token_acc": 0.8604469273743017, + "train_speed(iter/s)": 0.033009 + }, + { + "epoch": 0.14319624085646465, + "grad_norm": 0.10148437321186066, + "learning_rate": 0.00029295843264896506, + "loss": 0.37936070561408997, + "memory(GiB)": 78.26, + "step": 739, + "token_acc": 0.890900732946855, + "train_speed(iter/s)": 0.033011 + }, + { + "epoch": 0.1433900111417914, + "grad_norm": 0.10243412852287292, + "learning_rate": 0.0002929292951671855, + "loss": 0.42304426431655884, + "memory(GiB)": 78.26, + "step": 740, + "token_acc": 0.878807962695576, + "train_speed(iter/s)": 0.033014 + }, + { + "epoch": 0.14358378142711814, + "grad_norm": 0.10791970789432526, + "learning_rate": 0.0002929000989805038, + "loss": 0.42843425273895264, + "memory(GiB)": 78.26, + "step": 741, + "token_acc": 0.8764322681978526, + "train_speed(iter/s)": 0.033017 + }, + { + "epoch": 0.14377755171244488, + "grad_norm": 0.11472160369157791, + "learning_rate": 0.00029287084410091154, + "loss": 0.4652520716190338, + "memory(GiB)": 78.26, + "step": 742, + "token_acc": 0.8662527909382999, + "train_speed(iter/s)": 0.03302 + }, + { + "epoch": 0.14397132199777163, + "grad_norm": 0.12771129608154297, + "learning_rate": 0.00029284153054042454, + "loss": 0.5056222081184387, + "memory(GiB)": 78.26, + "step": 743, + "token_acc": 0.8580256694660118, + "train_speed(iter/s)": 0.033023 + }, + { + "epoch": 0.1441650922830984, + "grad_norm": 0.10892749577760696, + "learning_rate": 0.0002928121583110826, + "loss": 0.4871826469898224, + "memory(GiB)": 78.26, + "step": 744, + "token_acc": 0.8604575074297244, + "train_speed(iter/s)": 0.033026 + }, + { + "epoch": 0.14435886256842514, + "grad_norm": 0.10514423251152039, + "learning_rate": 0.0002927827274249498, + "loss": 0.4291183352470398, + "memory(GiB)": 78.26, + "step": 745, + "token_acc": 0.8779703110007337, + "train_speed(iter/s)": 0.033029 + }, + { + "epoch": 0.1445526328537519, + "grad_norm": 0.11185097694396973, + "learning_rate": 0.00029275323789411406, + "loss": 0.4657382667064667, + "memory(GiB)": 78.26, + "step": 746, + "token_acc": 0.8690815142887272, + "train_speed(iter/s)": 0.033031 + }, + { + "epoch": 0.14474640313907863, + "grad_norm": 0.1169874519109726, + "learning_rate": 0.00029272368973068765, + "loss": 0.4796285331249237, + "memory(GiB)": 78.26, + "step": 747, + "token_acc": 0.8630539484445522, + "train_speed(iter/s)": 0.033034 + }, + { + "epoch": 0.14494017342440538, + "grad_norm": 0.10806058347225189, + "learning_rate": 0.0002926940829468067, + "loss": 0.44982171058654785, + "memory(GiB)": 78.26, + "step": 748, + "token_acc": 0.8720634405109884, + "train_speed(iter/s)": 0.033037 + }, + { + "epoch": 0.14513394370973212, + "grad_norm": 0.10634516179561615, + "learning_rate": 0.00029266441755463154, + "loss": 0.4241579473018646, + "memory(GiB)": 78.26, + "step": 749, + "token_acc": 0.8774991483898016, + "train_speed(iter/s)": 0.03304 + }, + { + "epoch": 0.14532771399505887, + "grad_norm": 0.10630025714635849, + "learning_rate": 0.00029263469356634656, + "loss": 0.45012885332107544, + "memory(GiB)": 78.26, + "step": 750, + "token_acc": 0.8703159258521028, + "train_speed(iter/s)": 0.033043 + }, + { + "epoch": 0.1455214842803856, + "grad_norm": 0.11359134316444397, + "learning_rate": 0.0002926049109941602, + "loss": 0.47736871242523193, + "memory(GiB)": 78.26, + "step": 751, + "token_acc": 0.8675646281268301, + "train_speed(iter/s)": 0.033046 + }, + { + "epoch": 0.14571525456571235, + "grad_norm": 0.10710503160953522, + "learning_rate": 0.00029257506985030495, + "loss": 0.4275802671909332, + "memory(GiB)": 78.26, + "step": 752, + "token_acc": 0.8778828604612576, + "train_speed(iter/s)": 0.033048 + }, + { + "epoch": 0.1459090248510391, + "grad_norm": 0.10511702299118042, + "learning_rate": 0.00029254517014703737, + "loss": 0.41856449842453003, + "memory(GiB)": 78.26, + "step": 753, + "token_acc": 0.8796367161589878, + "train_speed(iter/s)": 0.033051 + }, + { + "epoch": 0.14610279513636584, + "grad_norm": 0.10652240365743637, + "learning_rate": 0.000292515211896638, + "loss": 0.4258817732334137, + "memory(GiB)": 78.26, + "step": 754, + "token_acc": 0.8783486686751893, + "train_speed(iter/s)": 0.033054 + }, + { + "epoch": 0.1462965654216926, + "grad_norm": 0.10977134853601456, + "learning_rate": 0.00029248519511141166, + "loss": 0.45545950531959534, + "memory(GiB)": 78.26, + "step": 755, + "token_acc": 0.8708563737967758, + "train_speed(iter/s)": 0.033057 + }, + { + "epoch": 0.14649033570701933, + "grad_norm": 0.10239318013191223, + "learning_rate": 0.00029245511980368694, + "loss": 0.40798917412757874, + "memory(GiB)": 78.26, + "step": 756, + "token_acc": 0.8818212167124625, + "train_speed(iter/s)": 0.033059 + }, + { + "epoch": 0.14668410599234608, + "grad_norm": 0.10805483907461166, + "learning_rate": 0.0002924249859858166, + "loss": 0.44618624448776245, + "memory(GiB)": 78.26, + "step": 757, + "token_acc": 0.873049779628478, + "train_speed(iter/s)": 0.033062 + }, + { + "epoch": 0.14687787627767282, + "grad_norm": 0.09752228856086731, + "learning_rate": 0.0002923947936701774, + "loss": 0.4019618034362793, + "memory(GiB)": 78.26, + "step": 758, + "token_acc": 0.8834425762325449, + "train_speed(iter/s)": 0.033065 + }, + { + "epoch": 0.14707164656299956, + "grad_norm": 0.10851308703422546, + "learning_rate": 0.00029236454286917017, + "loss": 0.4336164891719818, + "memory(GiB)": 78.26, + "step": 759, + "token_acc": 0.8760058721183123, + "train_speed(iter/s)": 0.033067 + }, + { + "epoch": 0.1472654168483263, + "grad_norm": 0.10959117859601974, + "learning_rate": 0.00029233423359521966, + "loss": 0.4287548363208771, + "memory(GiB)": 78.26, + "step": 760, + "token_acc": 0.874751174456565, + "train_speed(iter/s)": 0.033069 + }, + { + "epoch": 0.14745918713365305, + "grad_norm": 0.11091278493404388, + "learning_rate": 0.0002923038658607748, + "loss": 0.44881102442741394, + "memory(GiB)": 78.26, + "step": 761, + "token_acc": 0.8713587887777806, + "train_speed(iter/s)": 0.033072 + }, + { + "epoch": 0.1476529574189798, + "grad_norm": 0.11895415931940079, + "learning_rate": 0.0002922734396783083, + "loss": 0.49504998326301575, + "memory(GiB)": 78.26, + "step": 762, + "token_acc": 0.8594216543375924, + "train_speed(iter/s)": 0.033075 + }, + { + "epoch": 0.14784672770430654, + "grad_norm": 0.09505145251750946, + "learning_rate": 0.00029224295506031714, + "loss": 0.38731294870376587, + "memory(GiB)": 78.26, + "step": 763, + "token_acc": 0.8887887484287171, + "train_speed(iter/s)": 0.033077 + }, + { + "epoch": 0.14804049798963329, + "grad_norm": 0.10213866084814072, + "learning_rate": 0.0002922124120193221, + "loss": 0.4288552403450012, + "memory(GiB)": 78.26, + "step": 764, + "token_acc": 0.8762533946104032, + "train_speed(iter/s)": 0.033079 + }, + { + "epoch": 0.14823426827496003, + "grad_norm": 0.11133123189210892, + "learning_rate": 0.00029218181056786806, + "loss": 0.46136656403541565, + "memory(GiB)": 78.26, + "step": 765, + "token_acc": 0.8676832940918797, + "train_speed(iter/s)": 0.033082 + }, + { + "epoch": 0.14842803856028677, + "grad_norm": 0.1136741042137146, + "learning_rate": 0.00029215115071852386, + "loss": 0.44833481311798096, + "memory(GiB)": 78.26, + "step": 766, + "token_acc": 0.8728677891250345, + "train_speed(iter/s)": 0.033085 + }, + { + "epoch": 0.14862180884561352, + "grad_norm": 0.1057792380452156, + "learning_rate": 0.0002921204324838823, + "loss": 0.4067574441432953, + "memory(GiB)": 78.26, + "step": 767, + "token_acc": 0.883631027963543, + "train_speed(iter/s)": 0.033088 + }, + { + "epoch": 0.14881557913094026, + "grad_norm": 0.12142736464738846, + "learning_rate": 0.0002920896558765602, + "loss": 0.4738287031650543, + "memory(GiB)": 78.26, + "step": 768, + "token_acc": 0.8667360749609578, + "train_speed(iter/s)": 0.033091 + }, + { + "epoch": 0.149009349416267, + "grad_norm": 0.12040547281503677, + "learning_rate": 0.0002920588209091983, + "loss": 0.43065720796585083, + "memory(GiB)": 78.26, + "step": 769, + "token_acc": 0.8761208605841793, + "train_speed(iter/s)": 0.033094 + }, + { + "epoch": 0.14920311970159375, + "grad_norm": 0.09921874105930328, + "learning_rate": 0.0002920279275944614, + "loss": 0.43212834000587463, + "memory(GiB)": 78.26, + "step": 770, + "token_acc": 0.8759197564892266, + "train_speed(iter/s)": 0.033096 + }, + { + "epoch": 0.1493968899869205, + "grad_norm": 0.141917884349823, + "learning_rate": 0.0002919969759450382, + "loss": 0.4761424660682678, + "memory(GiB)": 78.26, + "step": 771, + "token_acc": 0.8653158180933573, + "train_speed(iter/s)": 0.033099 + }, + { + "epoch": 0.14959066027224724, + "grad_norm": 0.1117415577173233, + "learning_rate": 0.0002919659659736414, + "loss": 0.4318593144416809, + "memory(GiB)": 78.26, + "step": 772, + "token_acc": 0.8758288466987815, + "train_speed(iter/s)": 0.033101 + }, + { + "epoch": 0.14978443055757398, + "grad_norm": 0.16481183469295502, + "learning_rate": 0.00029193489769300754, + "loss": 0.4566386044025421, + "memory(GiB)": 78.26, + "step": 773, + "token_acc": 0.8704999401985408, + "train_speed(iter/s)": 0.033104 + }, + { + "epoch": 0.14997820084290073, + "grad_norm": 0.10933414846658707, + "learning_rate": 0.0002919037711158973, + "loss": 0.430907666683197, + "memory(GiB)": 78.26, + "step": 774, + "token_acc": 0.8769761425697039, + "train_speed(iter/s)": 0.033107 + }, + { + "epoch": 0.1501719711282275, + "grad_norm": 0.1175195574760437, + "learning_rate": 0.00029187258625509513, + "loss": 0.45036518573760986, + "memory(GiB)": 78.26, + "step": 775, + "token_acc": 0.869928163775235, + "train_speed(iter/s)": 0.03311 + }, + { + "epoch": 0.15036574141355424, + "grad_norm": 0.1023663803935051, + "learning_rate": 0.0002918413431234096, + "loss": 0.39650124311447144, + "memory(GiB)": 78.26, + "step": 776, + "token_acc": 0.8873017168666504, + "train_speed(iter/s)": 0.033112 + }, + { + "epoch": 0.150559511698881, + "grad_norm": 0.12010087072849274, + "learning_rate": 0.000291810041733673, + "loss": 0.4465680420398712, + "memory(GiB)": 78.26, + "step": 777, + "token_acc": 0.8728560775540641, + "train_speed(iter/s)": 0.033115 + }, + { + "epoch": 0.15075328198420773, + "grad_norm": 0.10836753249168396, + "learning_rate": 0.0002917786820987416, + "loss": 0.43890365958213806, + "memory(GiB)": 78.26, + "step": 778, + "token_acc": 0.8746218049034951, + "train_speed(iter/s)": 0.033118 + }, + { + "epoch": 0.15094705226953448, + "grad_norm": 0.11870795488357544, + "learning_rate": 0.0002917472642314958, + "loss": 0.46871399879455566, + "memory(GiB)": 78.26, + "step": 779, + "token_acc": 0.867821888084105, + "train_speed(iter/s)": 0.033121 + }, + { + "epoch": 0.15114082255486122, + "grad_norm": 0.1149909496307373, + "learning_rate": 0.00029171578814483966, + "loss": 0.44199684262275696, + "memory(GiB)": 78.26, + "step": 780, + "token_acc": 0.8727175418713008, + "train_speed(iter/s)": 0.033124 + }, + { + "epoch": 0.15133459284018796, + "grad_norm": 0.1077098697423935, + "learning_rate": 0.0002916842538517013, + "loss": 0.4246280789375305, + "memory(GiB)": 78.26, + "step": 781, + "token_acc": 0.8763708602062753, + "train_speed(iter/s)": 0.033126 + }, + { + "epoch": 0.1515283631255147, + "grad_norm": 0.11270920932292938, + "learning_rate": 0.0002916526613650326, + "loss": 0.4364209771156311, + "memory(GiB)": 78.26, + "step": 782, + "token_acc": 0.8724545624382823, + "train_speed(iter/s)": 0.033129 + }, + { + "epoch": 0.15172213341084145, + "grad_norm": 0.10723229497671127, + "learning_rate": 0.0002916210106978096, + "loss": 0.47758546471595764, + "memory(GiB)": 78.26, + "step": 783, + "token_acc": 0.8667694833461854, + "train_speed(iter/s)": 0.033132 + }, + { + "epoch": 0.1519159036961682, + "grad_norm": 0.12354382872581482, + "learning_rate": 0.000291589301863032, + "loss": 0.4946221709251404, + "memory(GiB)": 78.26, + "step": 784, + "token_acc": 0.8581249637407902, + "train_speed(iter/s)": 0.033134 + }, + { + "epoch": 0.15210967398149494, + "grad_norm": 0.11447127163410187, + "learning_rate": 0.00029155753487372345, + "loss": 0.44242554903030396, + "memory(GiB)": 78.26, + "step": 785, + "token_acc": 0.8728641926006219, + "train_speed(iter/s)": 0.033137 + }, + { + "epoch": 0.15230344426682169, + "grad_norm": 0.10308409482240677, + "learning_rate": 0.0002915257097429315, + "loss": 0.4122265577316284, + "memory(GiB)": 78.26, + "step": 786, + "token_acc": 0.881477327759794, + "train_speed(iter/s)": 0.033139 + }, + { + "epoch": 0.15249721455214843, + "grad_norm": 0.10119541734457016, + "learning_rate": 0.00029149382648372763, + "loss": 0.3953108787536621, + "memory(GiB)": 78.26, + "step": 787, + "token_acc": 0.8823500826124988, + "train_speed(iter/s)": 0.033141 + }, + { + "epoch": 0.15269098483747517, + "grad_norm": 0.10768993198871613, + "learning_rate": 0.0002914618851092072, + "loss": 0.4354395270347595, + "memory(GiB)": 78.26, + "step": 788, + "token_acc": 0.8756917373170133, + "train_speed(iter/s)": 0.033144 + }, + { + "epoch": 0.15288475512280192, + "grad_norm": 0.10865043103694916, + "learning_rate": 0.0002914298856324893, + "loss": 0.40121009945869446, + "memory(GiB)": 78.26, + "step": 789, + "token_acc": 0.886211232187762, + "train_speed(iter/s)": 0.033147 + }, + { + "epoch": 0.15307852540812866, + "grad_norm": 0.1131611317396164, + "learning_rate": 0.00029139782806671696, + "loss": 0.41886383295059204, + "memory(GiB)": 78.26, + "step": 790, + "token_acc": 0.8795520757465404, + "train_speed(iter/s)": 0.03315 + }, + { + "epoch": 0.1532722956934554, + "grad_norm": 0.11826243251562119, + "learning_rate": 0.0002913657124250571, + "loss": 0.4526827037334442, + "memory(GiB)": 78.26, + "step": 791, + "token_acc": 0.8706143245597825, + "train_speed(iter/s)": 0.033152 + }, + { + "epoch": 0.15346606597878215, + "grad_norm": 0.12152021378278732, + "learning_rate": 0.0002913335387207006, + "loss": 0.4480191767215729, + "memory(GiB)": 78.26, + "step": 792, + "token_acc": 0.8714323428470857, + "train_speed(iter/s)": 0.033155 + }, + { + "epoch": 0.1536598362641089, + "grad_norm": 0.10104276239871979, + "learning_rate": 0.0002913013069668619, + "loss": 0.41667938232421875, + "memory(GiB)": 78.26, + "step": 793, + "token_acc": 0.880495535822304, + "train_speed(iter/s)": 0.033158 + }, + { + "epoch": 0.15385360654943564, + "grad_norm": 0.1120065450668335, + "learning_rate": 0.0002912690171767795, + "loss": 0.444894015789032, + "memory(GiB)": 78.26, + "step": 794, + "token_acc": 0.8719184281987115, + "train_speed(iter/s)": 0.033161 + }, + { + "epoch": 0.15404737683476238, + "grad_norm": 0.10677407681941986, + "learning_rate": 0.00029123666936371577, + "loss": 0.4287039041519165, + "memory(GiB)": 78.26, + "step": 795, + "token_acc": 0.8766376159685128, + "train_speed(iter/s)": 0.033163 + }, + { + "epoch": 0.15424114712008913, + "grad_norm": 0.10139796137809753, + "learning_rate": 0.0002912042635409568, + "loss": 0.3969685435295105, + "memory(GiB)": 78.26, + "step": 796, + "token_acc": 0.8835710646968925, + "train_speed(iter/s)": 0.033166 + }, + { + "epoch": 0.15443491740541587, + "grad_norm": 0.11726026982069016, + "learning_rate": 0.0002911717997218123, + "loss": 0.45599818229675293, + "memory(GiB)": 78.26, + "step": 797, + "token_acc": 0.8708311582854823, + "train_speed(iter/s)": 0.033169 + }, + { + "epoch": 0.15462868769074262, + "grad_norm": 0.10516592860221863, + "learning_rate": 0.0002911392779196164, + "loss": 0.39878836274147034, + "memory(GiB)": 78.26, + "step": 798, + "token_acc": 0.8827302498037836, + "train_speed(iter/s)": 0.033171 + }, + { + "epoch": 0.15482245797606936, + "grad_norm": 0.1172914132475853, + "learning_rate": 0.00029110669814772644, + "loss": 0.43762531876564026, + "memory(GiB)": 78.26, + "step": 799, + "token_acc": 0.8754726710209694, + "train_speed(iter/s)": 0.033174 + }, + { + "epoch": 0.1550162282613961, + "grad_norm": 0.1156349629163742, + "learning_rate": 0.0002910740604195238, + "loss": 0.46107926964759827, + "memory(GiB)": 78.26, + "step": 800, + "token_acc": 0.869101935557004, + "train_speed(iter/s)": 0.033176 + }, + { + "epoch": 0.15520999854672285, + "grad_norm": 0.1091204434633255, + "learning_rate": 0.00029104136474841384, + "loss": 0.4546446204185486, + "memory(GiB)": 78.26, + "step": 801, + "token_acc": 0.8690357722725606, + "train_speed(iter/s)": 0.033161 + }, + { + "epoch": 0.1554037688320496, + "grad_norm": 0.10520661622285843, + "learning_rate": 0.00029100861114782537, + "loss": 0.391659677028656, + "memory(GiB)": 78.26, + "step": 802, + "token_acc": 0.883874415497662, + "train_speed(iter/s)": 0.033164 + }, + { + "epoch": 0.15559753911737634, + "grad_norm": 0.10649837553501129, + "learning_rate": 0.0002909757996312113, + "loss": 0.4287815988063812, + "memory(GiB)": 78.26, + "step": 803, + "token_acc": 0.8760036358127556, + "train_speed(iter/s)": 0.033167 + }, + { + "epoch": 0.15579130940270308, + "grad_norm": 0.11375483870506287, + "learning_rate": 0.00029094293021204816, + "loss": 0.43256044387817383, + "memory(GiB)": 78.26, + "step": 804, + "token_acc": 0.8724521560241453, + "train_speed(iter/s)": 0.033169 + }, + { + "epoch": 0.15598507968802985, + "grad_norm": 0.1098484992980957, + "learning_rate": 0.00029091000290383626, + "loss": 0.44678372144699097, + "memory(GiB)": 78.26, + "step": 805, + "token_acc": 0.8710053046965972, + "train_speed(iter/s)": 0.033171 + }, + { + "epoch": 0.1561788499733566, + "grad_norm": 0.12109239399433136, + "learning_rate": 0.0002908770177200998, + "loss": 0.43375393748283386, + "memory(GiB)": 78.26, + "step": 806, + "token_acc": 0.8734656356459201, + "train_speed(iter/s)": 0.033174 + }, + { + "epoch": 0.15637262025868334, + "grad_norm": 0.1153409481048584, + "learning_rate": 0.00029084397467438666, + "loss": 0.4369393587112427, + "memory(GiB)": 78.26, + "step": 807, + "token_acc": 0.8766715060747307, + "train_speed(iter/s)": 0.033176 + }, + { + "epoch": 0.1565663905440101, + "grad_norm": 0.11903059482574463, + "learning_rate": 0.0002908108737802685, + "loss": 0.4392112195491791, + "memory(GiB)": 78.26, + "step": 808, + "token_acc": 0.8715450323167365, + "train_speed(iter/s)": 0.033178 + }, + { + "epoch": 0.15676016082933683, + "grad_norm": 0.10243628919124603, + "learning_rate": 0.0002907777150513407, + "loss": 0.40879738330841064, + "memory(GiB)": 78.26, + "step": 809, + "token_acc": 0.8799474375821288, + "train_speed(iter/s)": 0.033181 + }, + { + "epoch": 0.15695393111466358, + "grad_norm": 0.1134597584605217, + "learning_rate": 0.0002907444985012225, + "loss": 0.4965320825576782, + "memory(GiB)": 78.26, + "step": 810, + "token_acc": 0.8585183351214547, + "train_speed(iter/s)": 0.033183 + }, + { + "epoch": 0.15714770139999032, + "grad_norm": 0.11033570021390915, + "learning_rate": 0.0002907112241435568, + "loss": 0.46451640129089355, + "memory(GiB)": 78.26, + "step": 811, + "token_acc": 0.8680681696762474, + "train_speed(iter/s)": 0.033186 + }, + { + "epoch": 0.15734147168531706, + "grad_norm": 0.10630565881729126, + "learning_rate": 0.0002906778919920103, + "loss": 0.4418585002422333, + "memory(GiB)": 78.26, + "step": 812, + "token_acc": 0.8757123405113723, + "train_speed(iter/s)": 0.033188 + }, + { + "epoch": 0.1575352419706438, + "grad_norm": 0.10460490733385086, + "learning_rate": 0.0002906445020602734, + "loss": 0.4312971532344818, + "memory(GiB)": 78.26, + "step": 813, + "token_acc": 0.8766326100695175, + "train_speed(iter/s)": 0.03319 + }, + { + "epoch": 0.15772901225597055, + "grad_norm": 0.09658509492874146, + "learning_rate": 0.0002906110543620603, + "loss": 0.3654577136039734, + "memory(GiB)": 78.26, + "step": 814, + "token_acc": 0.894104601737202, + "train_speed(iter/s)": 0.033193 + }, + { + "epoch": 0.1579227825412973, + "grad_norm": 0.11015032976865768, + "learning_rate": 0.0002905775489111087, + "loss": 0.4334821403026581, + "memory(GiB)": 78.26, + "step": 815, + "token_acc": 0.8754580788450861, + "train_speed(iter/s)": 0.033195 + }, + { + "epoch": 0.15811655282662404, + "grad_norm": 0.10890354961156845, + "learning_rate": 0.0002905439857211804, + "loss": 0.4261341392993927, + "memory(GiB)": 78.26, + "step": 816, + "token_acc": 0.8757039216681041, + "train_speed(iter/s)": 0.033197 + }, + { + "epoch": 0.15831032311195078, + "grad_norm": 0.10780946165323257, + "learning_rate": 0.00029051036480606053, + "loss": 0.41828641295433044, + "memory(GiB)": 78.26, + "step": 817, + "token_acc": 0.8813302092763193, + "train_speed(iter/s)": 0.0332 + }, + { + "epoch": 0.15850409339727753, + "grad_norm": 0.11241378635168076, + "learning_rate": 0.0002904766861795582, + "loss": 0.4504837095737457, + "memory(GiB)": 78.26, + "step": 818, + "token_acc": 0.8700797107404059, + "train_speed(iter/s)": 0.033202 + }, + { + "epoch": 0.15869786368260427, + "grad_norm": 0.10670538991689682, + "learning_rate": 0.00029044294985550607, + "loss": 0.4240032434463501, + "memory(GiB)": 78.26, + "step": 819, + "token_acc": 0.8797002130335464, + "train_speed(iter/s)": 0.033205 + }, + { + "epoch": 0.15889163396793102, + "grad_norm": 0.09907601028680801, + "learning_rate": 0.00029040915584776063, + "loss": 0.4048020541667938, + "memory(GiB)": 78.26, + "step": 820, + "token_acc": 0.8819687307138449, + "train_speed(iter/s)": 0.033207 + }, + { + "epoch": 0.15908540425325776, + "grad_norm": 0.10909497737884521, + "learning_rate": 0.00029037530417020194, + "loss": 0.4121737480163574, + "memory(GiB)": 78.26, + "step": 821, + "token_acc": 0.882557633753806, + "train_speed(iter/s)": 0.03321 + }, + { + "epoch": 0.1592791745385845, + "grad_norm": 0.12102677673101425, + "learning_rate": 0.00029034139483673373, + "loss": 0.4818951487541199, + "memory(GiB)": 78.26, + "step": 822, + "token_acc": 0.8623975497008026, + "train_speed(iter/s)": 0.033212 + }, + { + "epoch": 0.15947294482391125, + "grad_norm": 0.11021539568901062, + "learning_rate": 0.00029030742786128363, + "loss": 0.42791998386383057, + "memory(GiB)": 78.26, + "step": 823, + "token_acc": 0.8786863654583977, + "train_speed(iter/s)": 0.033214 + }, + { + "epoch": 0.159666715109238, + "grad_norm": 0.12519389390945435, + "learning_rate": 0.0002902734032578027, + "loss": 0.4770694077014923, + "memory(GiB)": 78.26, + "step": 824, + "token_acc": 0.8628657500881326, + "train_speed(iter/s)": 0.033217 + }, + { + "epoch": 0.15986048539456474, + "grad_norm": 0.10490487515926361, + "learning_rate": 0.0002902393210402657, + "loss": 0.3933722674846649, + "memory(GiB)": 78.26, + "step": 825, + "token_acc": 0.8858756744604317, + "train_speed(iter/s)": 0.033219 + }, + { + "epoch": 0.16005425567989148, + "grad_norm": 0.10252831131219864, + "learning_rate": 0.0002902051812226712, + "loss": 0.3960869014263153, + "memory(GiB)": 78.26, + "step": 826, + "token_acc": 0.8850197639086826, + "train_speed(iter/s)": 0.033221 + }, + { + "epoch": 0.16024802596521823, + "grad_norm": 0.11041463166475296, + "learning_rate": 0.0002901709838190413, + "loss": 0.4398559033870697, + "memory(GiB)": 78.26, + "step": 827, + "token_acc": 0.8748830196526983, + "train_speed(iter/s)": 0.033224 + }, + { + "epoch": 0.16044179625054497, + "grad_norm": 0.11490801721811295, + "learning_rate": 0.00029013672884342184, + "loss": 0.47440311312675476, + "memory(GiB)": 78.26, + "step": 828, + "token_acc": 0.865016464809957, + "train_speed(iter/s)": 0.033226 + }, + { + "epoch": 0.16063556653587172, + "grad_norm": 0.11218751221895218, + "learning_rate": 0.00029010241630988217, + "loss": 0.43563181161880493, + "memory(GiB)": 78.26, + "step": 829, + "token_acc": 0.8744653041825095, + "train_speed(iter/s)": 0.033229 + }, + { + "epoch": 0.16082933682119846, + "grad_norm": 0.10691240429878235, + "learning_rate": 0.00029006804623251547, + "loss": 0.4449552297592163, + "memory(GiB)": 78.26, + "step": 830, + "token_acc": 0.869553477182905, + "train_speed(iter/s)": 0.033231 + }, + { + "epoch": 0.1610231071065252, + "grad_norm": 0.09648893773555756, + "learning_rate": 0.00029003361862543834, + "loss": 0.37794414162635803, + "memory(GiB)": 78.26, + "step": 831, + "token_acc": 0.8909843389926163, + "train_speed(iter/s)": 0.033233 + }, + { + "epoch": 0.16121687739185195, + "grad_norm": 0.10781273245811462, + "learning_rate": 0.0002899991335027913, + "loss": 0.4308411180973053, + "memory(GiB)": 78.26, + "step": 832, + "token_acc": 0.877130367419212, + "train_speed(iter/s)": 0.033235 + }, + { + "epoch": 0.1614106476771787, + "grad_norm": 0.10362358391284943, + "learning_rate": 0.0002899645908787381, + "loss": 0.40463730692863464, + "memory(GiB)": 78.26, + "step": 833, + "token_acc": 0.8802580076079166, + "train_speed(iter/s)": 0.033237 + }, + { + "epoch": 0.16160441796250544, + "grad_norm": 0.10180053114891052, + "learning_rate": 0.0002899299907674665, + "loss": 0.4222317337989807, + "memory(GiB)": 78.26, + "step": 834, + "token_acc": 0.8798938964980744, + "train_speed(iter/s)": 0.03324 + }, + { + "epoch": 0.1617981882478322, + "grad_norm": 0.1112997904419899, + "learning_rate": 0.0002898953331831876, + "loss": 0.45217838883399963, + "memory(GiB)": 78.26, + "step": 835, + "token_acc": 0.8695664756612838, + "train_speed(iter/s)": 0.033243 + }, + { + "epoch": 0.16199195853315895, + "grad_norm": 0.10161816328763962, + "learning_rate": 0.0002898606181401362, + "loss": 0.42238953709602356, + "memory(GiB)": 78.26, + "step": 836, + "token_acc": 0.8775941837409121, + "train_speed(iter/s)": 0.033245 + }, + { + "epoch": 0.1621857288184857, + "grad_norm": 0.11536618322134018, + "learning_rate": 0.0002898258456525708, + "loss": 0.4566255509853363, + "memory(GiB)": 78.26, + "step": 837, + "token_acc": 0.8699845787186317, + "train_speed(iter/s)": 0.033247 + }, + { + "epoch": 0.16237949910381244, + "grad_norm": 0.10816803574562073, + "learning_rate": 0.0002897910157347733, + "loss": 0.4683791697025299, + "memory(GiB)": 78.26, + "step": 838, + "token_acc": 0.8670610211706102, + "train_speed(iter/s)": 0.033249 + }, + { + "epoch": 0.16257326938913919, + "grad_norm": 0.11843861639499664, + "learning_rate": 0.00028975612840104935, + "loss": 0.45893388986587524, + "memory(GiB)": 78.26, + "step": 839, + "token_acc": 0.8687269042794136, + "train_speed(iter/s)": 0.033252 + }, + { + "epoch": 0.16276703967446593, + "grad_norm": 0.10663704574108124, + "learning_rate": 0.000289721183665728, + "loss": 0.4239065945148468, + "memory(GiB)": 78.26, + "step": 840, + "token_acc": 0.8774621714892488, + "train_speed(iter/s)": 0.033254 + }, + { + "epoch": 0.16296080995979267, + "grad_norm": 0.11534672975540161, + "learning_rate": 0.00028968618154316206, + "loss": 0.4733141362667084, + "memory(GiB)": 78.26, + "step": 841, + "token_acc": 0.8653584989329564, + "train_speed(iter/s)": 0.033256 + }, + { + "epoch": 0.16315458024511942, + "grad_norm": 0.10526616871356964, + "learning_rate": 0.00028965112204772786, + "loss": 0.4100850820541382, + "memory(GiB)": 78.26, + "step": 842, + "token_acc": 0.8815631620593716, + "train_speed(iter/s)": 0.033258 + }, + { + "epoch": 0.16334835053044616, + "grad_norm": 0.11487496644258499, + "learning_rate": 0.00028961600519382527, + "loss": 0.44554945826530457, + "memory(GiB)": 78.26, + "step": 843, + "token_acc": 0.87236373596275, + "train_speed(iter/s)": 0.033261 + }, + { + "epoch": 0.1635421208157729, + "grad_norm": 0.12236074358224869, + "learning_rate": 0.00028958083099587774, + "loss": 0.45087599754333496, + "memory(GiB)": 78.26, + "step": 844, + "token_acc": 0.8709829274416269, + "train_speed(iter/s)": 0.033263 + }, + { + "epoch": 0.16373589110109965, + "grad_norm": 0.11835192143917084, + "learning_rate": 0.0002895455994683323, + "loss": 0.4500479996204376, + "memory(GiB)": 78.26, + "step": 845, + "token_acc": 0.8728866462050668, + "train_speed(iter/s)": 0.033266 + }, + { + "epoch": 0.1639296613864264, + "grad_norm": 0.11468973010778427, + "learning_rate": 0.0002895103106256593, + "loss": 0.45714688301086426, + "memory(GiB)": 78.26, + "step": 846, + "token_acc": 0.8693362313994315, + "train_speed(iter/s)": 0.033268 + }, + { + "epoch": 0.16412343167175314, + "grad_norm": 0.11863450706005096, + "learning_rate": 0.000289474964482353, + "loss": 0.5020843148231506, + "memory(GiB)": 78.26, + "step": 847, + "token_acc": 0.8581676315127108, + "train_speed(iter/s)": 0.03327 + }, + { + "epoch": 0.16431720195707988, + "grad_norm": 0.09941709041595459, + "learning_rate": 0.0002894395610529309, + "loss": 0.3951787054538727, + "memory(GiB)": 78.26, + "step": 848, + "token_acc": 0.8848054109751909, + "train_speed(iter/s)": 0.033272 + }, + { + "epoch": 0.16451097224240663, + "grad_norm": 0.09850473701953888, + "learning_rate": 0.0002894041003519343, + "loss": 0.37680599093437195, + "memory(GiB)": 78.26, + "step": 849, + "token_acc": 0.8903767059263413, + "train_speed(iter/s)": 0.033274 + }, + { + "epoch": 0.16470474252773337, + "grad_norm": 0.10843576490879059, + "learning_rate": 0.0002893685823939276, + "loss": 0.4342295229434967, + "memory(GiB)": 78.26, + "step": 850, + "token_acc": 0.8752759675488762, + "train_speed(iter/s)": 0.033276 + }, + { + "epoch": 0.16489851281306012, + "grad_norm": 0.09915035963058472, + "learning_rate": 0.00028933300719349923, + "loss": 0.41352295875549316, + "memory(GiB)": 78.26, + "step": 851, + "token_acc": 0.8786153965024339, + "train_speed(iter/s)": 0.033279 + }, + { + "epoch": 0.16509228309838686, + "grad_norm": 0.11959869414567947, + "learning_rate": 0.00028929737476526075, + "loss": 0.506973147392273, + "memory(GiB)": 78.26, + "step": 852, + "token_acc": 0.8568055984935781, + "train_speed(iter/s)": 0.033282 + }, + { + "epoch": 0.1652860533837136, + "grad_norm": 0.10934106260538101, + "learning_rate": 0.00028926168512384743, + "loss": 0.43080487847328186, + "memory(GiB)": 78.26, + "step": 853, + "token_acc": 0.875984682713348, + "train_speed(iter/s)": 0.033284 + }, + { + "epoch": 0.16547982366904035, + "grad_norm": 0.10780926048755646, + "learning_rate": 0.0002892259382839179, + "loss": 0.4089958667755127, + "memory(GiB)": 78.26, + "step": 854, + "token_acc": 0.8801118674476281, + "train_speed(iter/s)": 0.033287 + }, + { + "epoch": 0.1656735939543671, + "grad_norm": 0.1189161166548729, + "learning_rate": 0.0002891901342601543, + "loss": 0.47321808338165283, + "memory(GiB)": 78.26, + "step": 855, + "token_acc": 0.8673148412925364, + "train_speed(iter/s)": 0.033289 + }, + { + "epoch": 0.16586736423969384, + "grad_norm": 0.11358017474412918, + "learning_rate": 0.00028915427306726245, + "loss": 0.4263777732849121, + "memory(GiB)": 78.26, + "step": 856, + "token_acc": 0.8767594964603477, + "train_speed(iter/s)": 0.033291 + }, + { + "epoch": 0.16606113452502058, + "grad_norm": 0.1009831428527832, + "learning_rate": 0.00028911835471997143, + "loss": 0.4132530987262726, + "memory(GiB)": 78.26, + "step": 857, + "token_acc": 0.8810461753266181, + "train_speed(iter/s)": 0.033293 + }, + { + "epoch": 0.16625490481034733, + "grad_norm": 0.10224108397960663, + "learning_rate": 0.0002890823792330339, + "loss": 0.4362914562225342, + "memory(GiB)": 78.26, + "step": 858, + "token_acc": 0.8741152805026449, + "train_speed(iter/s)": 0.033295 + }, + { + "epoch": 0.16644867509567407, + "grad_norm": 0.1064584031701088, + "learning_rate": 0.00028904634662122586, + "loss": 0.44220423698425293, + "memory(GiB)": 78.26, + "step": 859, + "token_acc": 0.8728430508570941, + "train_speed(iter/s)": 0.033297 + }, + { + "epoch": 0.16664244538100081, + "grad_norm": 0.1092744916677475, + "learning_rate": 0.000289010256899347, + "loss": 0.422356516122818, + "memory(GiB)": 78.26, + "step": 860, + "token_acc": 0.8780035309255214, + "train_speed(iter/s)": 0.033299 + }, + { + "epoch": 0.16683621566632756, + "grad_norm": 0.1130421906709671, + "learning_rate": 0.0002889741100822202, + "loss": 0.45889347791671753, + "memory(GiB)": 78.26, + "step": 861, + "token_acc": 0.867090987216672, + "train_speed(iter/s)": 0.033302 + }, + { + "epoch": 0.1670299859516543, + "grad_norm": 0.11241286247968674, + "learning_rate": 0.00028893790618469213, + "loss": 0.47700247168540955, + "memory(GiB)": 78.26, + "step": 862, + "token_acc": 0.862, + "train_speed(iter/s)": 0.033304 + }, + { + "epoch": 0.16722375623698105, + "grad_norm": 0.1090337261557579, + "learning_rate": 0.0002889016452216325, + "loss": 0.42925596237182617, + "memory(GiB)": 78.26, + "step": 863, + "token_acc": 0.8770264194853622, + "train_speed(iter/s)": 0.033306 + }, + { + "epoch": 0.1674175265223078, + "grad_norm": 0.12319561094045639, + "learning_rate": 0.00028886532720793476, + "loss": 0.4747333526611328, + "memory(GiB)": 78.26, + "step": 864, + "token_acc": 0.8648867565967787, + "train_speed(iter/s)": 0.033308 + }, + { + "epoch": 0.16761129680763454, + "grad_norm": 0.11913838982582092, + "learning_rate": 0.0002888289521585157, + "loss": 0.43152928352355957, + "memory(GiB)": 78.26, + "step": 865, + "token_acc": 0.8771402469572392, + "train_speed(iter/s)": 0.033311 + }, + { + "epoch": 0.1678050670929613, + "grad_norm": 0.12217225134372711, + "learning_rate": 0.0002887925200883155, + "loss": 0.468718022108078, + "memory(GiB)": 78.26, + "step": 866, + "token_acc": 0.8652350105007839, + "train_speed(iter/s)": 0.033313 + }, + { + "epoch": 0.16799883737828805, + "grad_norm": 0.10935720801353455, + "learning_rate": 0.0002887560310122978, + "loss": 0.427009254693985, + "memory(GiB)": 78.26, + "step": 867, + "token_acc": 0.8757216164207825, + "train_speed(iter/s)": 0.033315 + }, + { + "epoch": 0.1681926076636148, + "grad_norm": 0.11316697299480438, + "learning_rate": 0.0002887194849454496, + "loss": 0.4437088370323181, + "memory(GiB)": 78.26, + "step": 868, + "token_acc": 0.873464339700426, + "train_speed(iter/s)": 0.033317 + }, + { + "epoch": 0.16838637794894154, + "grad_norm": 0.11642614752054214, + "learning_rate": 0.00028868288190278145, + "loss": 0.4325766861438751, + "memory(GiB)": 78.26, + "step": 869, + "token_acc": 0.8753681468056185, + "train_speed(iter/s)": 0.033319 + }, + { + "epoch": 0.16858014823426828, + "grad_norm": 0.10680645704269409, + "learning_rate": 0.00028864622189932713, + "loss": 0.443330854177475, + "memory(GiB)": 78.26, + "step": 870, + "token_acc": 0.8720809086981933, + "train_speed(iter/s)": 0.033321 + }, + { + "epoch": 0.16877391851959503, + "grad_norm": 0.11195547878742218, + "learning_rate": 0.00028860950495014393, + "loss": 0.44975459575653076, + "memory(GiB)": 78.26, + "step": 871, + "token_acc": 0.8717114605117083, + "train_speed(iter/s)": 0.033323 + }, + { + "epoch": 0.16896768880492177, + "grad_norm": 0.10190173983573914, + "learning_rate": 0.00028857273107031243, + "loss": 0.4224821925163269, + "memory(GiB)": 78.26, + "step": 872, + "token_acc": 0.8783074075926575, + "train_speed(iter/s)": 0.033326 + }, + { + "epoch": 0.16916145909024852, + "grad_norm": 0.10820237547159195, + "learning_rate": 0.0002885359002749367, + "loss": 0.4363636076450348, + "memory(GiB)": 78.26, + "step": 873, + "token_acc": 0.8747307109231629, + "train_speed(iter/s)": 0.033328 + }, + { + "epoch": 0.16935522937557526, + "grad_norm": 0.10944371670484543, + "learning_rate": 0.00028849901257914416, + "loss": 0.4190293550491333, + "memory(GiB)": 78.26, + "step": 874, + "token_acc": 0.87886427298192, + "train_speed(iter/s)": 0.03333 + }, + { + "epoch": 0.169548999660902, + "grad_norm": 0.11039703339338303, + "learning_rate": 0.0002884620679980855, + "loss": 0.44972705841064453, + "memory(GiB)": 78.26, + "step": 875, + "token_acc": 0.8701769399209686, + "train_speed(iter/s)": 0.033332 + }, + { + "epoch": 0.16974276994622875, + "grad_norm": 0.10529112815856934, + "learning_rate": 0.00028842506654693493, + "loss": 0.45059582591056824, + "memory(GiB)": 78.26, + "step": 876, + "token_acc": 0.8691080109913337, + "train_speed(iter/s)": 0.033335 + }, + { + "epoch": 0.1699365402315555, + "grad_norm": 0.10080941766500473, + "learning_rate": 0.00028838800824088984, + "loss": 0.4126817286014557, + "memory(GiB)": 78.26, + "step": 877, + "token_acc": 0.880942601909847, + "train_speed(iter/s)": 0.033336 + }, + { + "epoch": 0.17013031051688224, + "grad_norm": 0.0992860496044159, + "learning_rate": 0.00028835089309517116, + "loss": 0.39974620938301086, + "memory(GiB)": 78.26, + "step": 878, + "token_acc": 0.8827877507919747, + "train_speed(iter/s)": 0.033338 + }, + { + "epoch": 0.17032408080220898, + "grad_norm": 0.11538361012935638, + "learning_rate": 0.0002883137211250231, + "loss": 0.4553724229335785, + "memory(GiB)": 78.26, + "step": 879, + "token_acc": 0.8721838688599108, + "train_speed(iter/s)": 0.03334 + }, + { + "epoch": 0.17051785108753573, + "grad_norm": 0.10386830568313599, + "learning_rate": 0.0002882764923457131, + "loss": 0.40255507826805115, + "memory(GiB)": 78.26, + "step": 880, + "token_acc": 0.8842005576892724, + "train_speed(iter/s)": 0.033343 + }, + { + "epoch": 0.17071162137286247, + "grad_norm": 0.10418614745140076, + "learning_rate": 0.000288239206772532, + "loss": 0.41422078013420105, + "memory(GiB)": 78.26, + "step": 881, + "token_acc": 0.8813669826953899, + "train_speed(iter/s)": 0.033345 + }, + { + "epoch": 0.17090539165818922, + "grad_norm": 0.10407593846321106, + "learning_rate": 0.00028820186442079414, + "loss": 0.40433794260025024, + "memory(GiB)": 78.26, + "step": 882, + "token_acc": 0.8834683588753167, + "train_speed(iter/s)": 0.033347 + }, + { + "epoch": 0.17109916194351596, + "grad_norm": 0.11592178791761398, + "learning_rate": 0.0002881644653058369, + "loss": 0.4496549963951111, + "memory(GiB)": 78.26, + "step": 883, + "token_acc": 0.8705704799275581, + "train_speed(iter/s)": 0.033349 + }, + { + "epoch": 0.1712929322288427, + "grad_norm": 0.10687348991632462, + "learning_rate": 0.0002881270094430212, + "loss": 0.3880995512008667, + "memory(GiB)": 78.26, + "step": 884, + "token_acc": 0.889063373346464, + "train_speed(iter/s)": 0.033351 + }, + { + "epoch": 0.17148670251416945, + "grad_norm": 0.11258858442306519, + "learning_rate": 0.000288089496847731, + "loss": 0.43804460763931274, + "memory(GiB)": 78.26, + "step": 885, + "token_acc": 0.8751539547598306, + "train_speed(iter/s)": 0.033353 + }, + { + "epoch": 0.1716804727994962, + "grad_norm": 0.10251414030790329, + "learning_rate": 0.00028805192753537386, + "loss": 0.39091038703918457, + "memory(GiB)": 78.26, + "step": 886, + "token_acc": 0.884448902027027, + "train_speed(iter/s)": 0.033355 + }, + { + "epoch": 0.17187424308482294, + "grad_norm": 0.10573960095643997, + "learning_rate": 0.0002880143015213805, + "loss": 0.43722304701805115, + "memory(GiB)": 78.26, + "step": 887, + "token_acc": 0.8758301120031717, + "train_speed(iter/s)": 0.033358 + }, + { + "epoch": 0.17206801337014968, + "grad_norm": 0.10792998969554901, + "learning_rate": 0.00028797661882120495, + "loss": 0.40696701407432556, + "memory(GiB)": 78.26, + "step": 888, + "token_acc": 0.8834754878388297, + "train_speed(iter/s)": 0.03336 + }, + { + "epoch": 0.17226178365547642, + "grad_norm": 0.10105016082525253, + "learning_rate": 0.0002879388794503245, + "loss": 0.3817410469055176, + "memory(GiB)": 78.26, + "step": 889, + "token_acc": 0.8874983670803397, + "train_speed(iter/s)": 0.033362 + }, + { + "epoch": 0.17245555394080317, + "grad_norm": 0.1093856692314148, + "learning_rate": 0.0002879010834242396, + "loss": 0.4195503294467926, + "memory(GiB)": 78.26, + "step": 890, + "token_acc": 0.8788049002300327, + "train_speed(iter/s)": 0.033364 + }, + { + "epoch": 0.1726493242261299, + "grad_norm": 0.11232632398605347, + "learning_rate": 0.00028786323075847425, + "loss": 0.45594799518585205, + "memory(GiB)": 78.26, + "step": 891, + "token_acc": 0.8703043956619689, + "train_speed(iter/s)": 0.033366 + }, + { + "epoch": 0.17284309451145666, + "grad_norm": 0.10968980938196182, + "learning_rate": 0.00028782532146857546, + "loss": 0.4293142557144165, + "memory(GiB)": 78.26, + "step": 892, + "token_acc": 0.8755912321954648, + "train_speed(iter/s)": 0.033368 + }, + { + "epoch": 0.1730368647967834, + "grad_norm": 0.12125832587480545, + "learning_rate": 0.0002877873555701137, + "loss": 0.49456584453582764, + "memory(GiB)": 78.26, + "step": 893, + "token_acc": 0.8624862149246416, + "train_speed(iter/s)": 0.03337 + }, + { + "epoch": 0.17323063508211015, + "grad_norm": 0.1106322631239891, + "learning_rate": 0.00028774933307868243, + "loss": 0.4586373269557953, + "memory(GiB)": 78.26, + "step": 894, + "token_acc": 0.869065453654019, + "train_speed(iter/s)": 0.033373 + }, + { + "epoch": 0.1734244053674369, + "grad_norm": 0.09941691905260086, + "learning_rate": 0.00028771125400989863, + "loss": 0.413199782371521, + "memory(GiB)": 78.26, + "step": 895, + "token_acc": 0.8804068187829097, + "train_speed(iter/s)": 0.033375 + }, + { + "epoch": 0.17361817565276366, + "grad_norm": 0.11516234278678894, + "learning_rate": 0.0002876731183794024, + "loss": 0.43939366936683655, + "memory(GiB)": 78.26, + "step": 896, + "token_acc": 0.8730681167716084, + "train_speed(iter/s)": 0.033377 + }, + { + "epoch": 0.1738119459380904, + "grad_norm": 0.10177889466285706, + "learning_rate": 0.000287634926202857, + "loss": 0.37696516513824463, + "memory(GiB)": 78.26, + "step": 897, + "token_acc": 0.8901657575597098, + "train_speed(iter/s)": 0.033379 + }, + { + "epoch": 0.17400571622341715, + "grad_norm": 0.11825387924909592, + "learning_rate": 0.00028759667749594903, + "loss": 0.4378824830055237, + "memory(GiB)": 78.26, + "step": 898, + "token_acc": 0.8729655768399828, + "train_speed(iter/s)": 0.033381 + }, + { + "epoch": 0.1741994865087439, + "grad_norm": 0.1011887937784195, + "learning_rate": 0.0002875583722743882, + "loss": 0.4053630232810974, + "memory(GiB)": 78.26, + "step": 899, + "token_acc": 0.8832929164007658, + "train_speed(iter/s)": 0.033383 + }, + { + "epoch": 0.17439325679407064, + "grad_norm": 0.11091629415750504, + "learning_rate": 0.0002875200105539076, + "loss": 0.43335989117622375, + "memory(GiB)": 78.26, + "step": 900, + "token_acc": 0.8734870853616364, + "train_speed(iter/s)": 0.033385 + }, + { + "epoch": 0.17458702707939738, + "grad_norm": 0.12012141942977905, + "learning_rate": 0.00028748159235026337, + "loss": 0.4563184380531311, + "memory(GiB)": 78.26, + "step": 901, + "token_acc": 0.8695292291774444, + "train_speed(iter/s)": 0.033387 + }, + { + "epoch": 0.17478079736472413, + "grad_norm": 0.1084456518292427, + "learning_rate": 0.00028744311767923487, + "loss": 0.42208635807037354, + "memory(GiB)": 78.26, + "step": 902, + "token_acc": 0.8803865467709915, + "train_speed(iter/s)": 0.033389 + }, + { + "epoch": 0.17497456765005087, + "grad_norm": 0.11079172044992447, + "learning_rate": 0.00028740458655662467, + "loss": 0.4311895966529846, + "memory(GiB)": 78.26, + "step": 903, + "token_acc": 0.877756061809076, + "train_speed(iter/s)": 0.033391 + }, + { + "epoch": 0.17516833793537762, + "grad_norm": 0.10750513523817062, + "learning_rate": 0.00028736599899825856, + "loss": 0.43391120433807373, + "memory(GiB)": 78.26, + "step": 904, + "token_acc": 0.8769369834710744, + "train_speed(iter/s)": 0.033393 + }, + { + "epoch": 0.17536210822070436, + "grad_norm": 0.11065103858709335, + "learning_rate": 0.00028732735501998556, + "loss": 0.4614184498786926, + "memory(GiB)": 78.26, + "step": 905, + "token_acc": 0.8664335104252877, + "train_speed(iter/s)": 0.033395 + }, + { + "epoch": 0.1755558785060311, + "grad_norm": 0.11256072670221329, + "learning_rate": 0.0002872886546376777, + "loss": 0.4341152012348175, + "memory(GiB)": 78.26, + "step": 906, + "token_acc": 0.8766848053578903, + "train_speed(iter/s)": 0.033397 + }, + { + "epoch": 0.17574964879135785, + "grad_norm": 0.10496652871370316, + "learning_rate": 0.00028724989786723027, + "loss": 0.4188949167728424, + "memory(GiB)": 78.26, + "step": 907, + "token_acc": 0.8798962523819606, + "train_speed(iter/s)": 0.033399 + }, + { + "epoch": 0.1759434190766846, + "grad_norm": 0.10520727187395096, + "learning_rate": 0.00028721108472456173, + "loss": 0.40807557106018066, + "memory(GiB)": 78.26, + "step": 908, + "token_acc": 0.8816372301654051, + "train_speed(iter/s)": 0.033401 + }, + { + "epoch": 0.17613718936201134, + "grad_norm": 0.10006900131702423, + "learning_rate": 0.0002871722152256137, + "loss": 0.41225340962409973, + "memory(GiB)": 78.26, + "step": 909, + "token_acc": 0.8827568245850794, + "train_speed(iter/s)": 0.033403 + }, + { + "epoch": 0.17633095964733808, + "grad_norm": 0.10612796992063522, + "learning_rate": 0.0002871332893863509, + "loss": 0.40802818536758423, + "memory(GiB)": 78.26, + "step": 910, + "token_acc": 0.8830624153381265, + "train_speed(iter/s)": 0.033405 + }, + { + "epoch": 0.17652472993266483, + "grad_norm": 0.11015652120113373, + "learning_rate": 0.0002870943072227613, + "loss": 0.4355056881904602, + "memory(GiB)": 78.26, + "step": 911, + "token_acc": 0.8744773544633198, + "train_speed(iter/s)": 0.033407 + }, + { + "epoch": 0.17671850021799157, + "grad_norm": 0.11280830204486847, + "learning_rate": 0.00028705526875085575, + "loss": 0.39818063378334045, + "memory(GiB)": 78.26, + "step": 912, + "token_acc": 0.8824910029729307, + "train_speed(iter/s)": 0.033409 + }, + { + "epoch": 0.17691227050331831, + "grad_norm": 0.10122046619653702, + "learning_rate": 0.00028701617398666857, + "loss": 0.412325382232666, + "memory(GiB)": 78.26, + "step": 913, + "token_acc": 0.8825016812373907, + "train_speed(iter/s)": 0.033411 + }, + { + "epoch": 0.17710604078864506, + "grad_norm": 0.11189986765384674, + "learning_rate": 0.00028697702294625693, + "loss": 0.4302537441253662, + "memory(GiB)": 78.26, + "step": 914, + "token_acc": 0.8747316444826105, + "train_speed(iter/s)": 0.033413 + }, + { + "epoch": 0.1772998110739718, + "grad_norm": 0.1030743420124054, + "learning_rate": 0.0002869378156457013, + "loss": 0.4212334454059601, + "memory(GiB)": 78.26, + "step": 915, + "token_acc": 0.8769243019428814, + "train_speed(iter/s)": 0.033415 + }, + { + "epoch": 0.17749358135929855, + "grad_norm": 0.09644519537687302, + "learning_rate": 0.0002868985521011051, + "loss": 0.3844713568687439, + "memory(GiB)": 78.26, + "step": 916, + "token_acc": 0.8888474264248576, + "train_speed(iter/s)": 0.033416 + }, + { + "epoch": 0.1776873516446253, + "grad_norm": 0.10586938261985779, + "learning_rate": 0.000286859232328595, + "loss": 0.4248504638671875, + "memory(GiB)": 78.26, + "step": 917, + "token_acc": 0.8777870642473915, + "train_speed(iter/s)": 0.033418 + }, + { + "epoch": 0.17788112192995204, + "grad_norm": 0.11170489341020584, + "learning_rate": 0.00028681985634432055, + "loss": 0.41771912574768066, + "memory(GiB)": 78.26, + "step": 918, + "token_acc": 0.8776203050061971, + "train_speed(iter/s)": 0.03342 + }, + { + "epoch": 0.17807489221527878, + "grad_norm": 0.11335260421037674, + "learning_rate": 0.00028678042416445463, + "loss": 0.4187549352645874, + "memory(GiB)": 78.26, + "step": 919, + "token_acc": 0.8776417309627642, + "train_speed(iter/s)": 0.033422 + }, + { + "epoch": 0.17826866250060552, + "grad_norm": 0.10571952164173126, + "learning_rate": 0.0002867409358051931, + "loss": 0.4060910642147064, + "memory(GiB)": 78.26, + "step": 920, + "token_acc": 0.8823606660713348, + "train_speed(iter/s)": 0.033423 + }, + { + "epoch": 0.17846243278593227, + "grad_norm": 0.12325315922498703, + "learning_rate": 0.00028670139128275483, + "loss": 0.44652315974235535, + "memory(GiB)": 78.26, + "step": 921, + "token_acc": 0.8700105204529983, + "train_speed(iter/s)": 0.033425 + }, + { + "epoch": 0.178656203071259, + "grad_norm": 0.1102391704916954, + "learning_rate": 0.0002866617906133819, + "loss": 0.4251522123813629, + "memory(GiB)": 78.26, + "step": 922, + "token_acc": 0.876720608841388, + "train_speed(iter/s)": 0.033427 + }, + { + "epoch": 0.17884997335658576, + "grad_norm": 0.1056000366806984, + "learning_rate": 0.00028662213381333926, + "loss": 0.4017042815685272, + "memory(GiB)": 78.26, + "step": 923, + "token_acc": 0.8835219794682811, + "train_speed(iter/s)": 0.033429 + }, + { + "epoch": 0.1790437436419125, + "grad_norm": 0.11068852990865707, + "learning_rate": 0.00028658242089891513, + "loss": 0.40017062425613403, + "memory(GiB)": 78.26, + "step": 924, + "token_acc": 0.8838785514205683, + "train_speed(iter/s)": 0.033431 + }, + { + "epoch": 0.17923751392723924, + "grad_norm": 0.10683512687683105, + "learning_rate": 0.0002865426518864206, + "loss": 0.4157922565937042, + "memory(GiB)": 78.26, + "step": 925, + "token_acc": 0.8800466330435665, + "train_speed(iter/s)": 0.033433 + }, + { + "epoch": 0.17943128421256602, + "grad_norm": 0.10481857508420944, + "learning_rate": 0.00028650282679218994, + "loss": 0.4214838743209839, + "memory(GiB)": 78.26, + "step": 926, + "token_acc": 0.8785436944843359, + "train_speed(iter/s)": 0.033435 + }, + { + "epoch": 0.17962505449789276, + "grad_norm": 0.10210458934307098, + "learning_rate": 0.0002864629456325803, + "loss": 0.39417165517807007, + "memory(GiB)": 78.26, + "step": 927, + "token_acc": 0.8880719554934686, + "train_speed(iter/s)": 0.033437 + }, + { + "epoch": 0.1798188247832195, + "grad_norm": 0.10829130560159683, + "learning_rate": 0.000286423008423972, + "loss": 0.4183374047279358, + "memory(GiB)": 78.26, + "step": 928, + "token_acc": 0.8781808337845154, + "train_speed(iter/s)": 0.033438 + }, + { + "epoch": 0.18001259506854625, + "grad_norm": 0.10995203256607056, + "learning_rate": 0.00028638301518276826, + "loss": 0.447214275598526, + "memory(GiB)": 78.26, + "step": 929, + "token_acc": 0.8710934284538345, + "train_speed(iter/s)": 0.03344 + }, + { + "epoch": 0.180206365353873, + "grad_norm": 0.10909594595432281, + "learning_rate": 0.00028634296592539547, + "loss": 0.3997655510902405, + "memory(GiB)": 78.26, + "step": 930, + "token_acc": 0.8848353156450137, + "train_speed(iter/s)": 0.033442 + }, + { + "epoch": 0.18040013563919974, + "grad_norm": 0.1183900386095047, + "learning_rate": 0.0002863028606683029, + "loss": 0.4467264413833618, + "memory(GiB)": 78.26, + "step": 931, + "token_acc": 0.8734884573103701, + "train_speed(iter/s)": 0.033444 + }, + { + "epoch": 0.18059390592452648, + "grad_norm": 0.10841910541057587, + "learning_rate": 0.00028626269942796294, + "loss": 0.4616568088531494, + "memory(GiB)": 78.26, + "step": 932, + "token_acc": 0.8696779070709253, + "train_speed(iter/s)": 0.033446 + }, + { + "epoch": 0.18078767620985323, + "grad_norm": 0.11348854750394821, + "learning_rate": 0.0002862224822208707, + "loss": 0.45820578932762146, + "memory(GiB)": 78.26, + "step": 933, + "token_acc": 0.8705489207844856, + "train_speed(iter/s)": 0.033448 + }, + { + "epoch": 0.18098144649517997, + "grad_norm": 0.10200490057468414, + "learning_rate": 0.0002861822090635446, + "loss": 0.40025630593299866, + "memory(GiB)": 78.26, + "step": 934, + "token_acc": 0.8839878318584071, + "train_speed(iter/s)": 0.03345 + }, + { + "epoch": 0.18117521678050671, + "grad_norm": 0.10408560186624527, + "learning_rate": 0.00028614187997252585, + "loss": 0.44248849153518677, + "memory(GiB)": 78.26, + "step": 935, + "token_acc": 0.8738729347996039, + "train_speed(iter/s)": 0.033451 + }, + { + "epoch": 0.18136898706583346, + "grad_norm": 0.1146451011300087, + "learning_rate": 0.0002861014949643787, + "loss": 0.4483628273010254, + "memory(GiB)": 78.26, + "step": 936, + "token_acc": 0.8693443867125927, + "train_speed(iter/s)": 0.033453 + }, + { + "epoch": 0.1815627573511602, + "grad_norm": 0.11855790764093399, + "learning_rate": 0.0002860610540556905, + "loss": 0.41510143876075745, + "memory(GiB)": 78.26, + "step": 937, + "token_acc": 0.8812605670888882, + "train_speed(iter/s)": 0.033455 + }, + { + "epoch": 0.18175652763648695, + "grad_norm": 0.12932439148426056, + "learning_rate": 0.0002860205572630712, + "loss": 0.41384294629096985, + "memory(GiB)": 78.26, + "step": 938, + "token_acc": 0.8795511921458625, + "train_speed(iter/s)": 0.033457 + }, + { + "epoch": 0.1819502979218137, + "grad_norm": 0.1010417714715004, + "learning_rate": 0.00028598000460315404, + "loss": 0.3942815363407135, + "memory(GiB)": 78.26, + "step": 939, + "token_acc": 0.88588022518277, + "train_speed(iter/s)": 0.033459 + }, + { + "epoch": 0.18214406820714044, + "grad_norm": 0.11028870195150375, + "learning_rate": 0.00028593939609259506, + "loss": 0.46208620071411133, + "memory(GiB)": 78.26, + "step": 940, + "token_acc": 0.8662080777943533, + "train_speed(iter/s)": 0.033461 + }, + { + "epoch": 0.18233783849246718, + "grad_norm": 0.10703767091035843, + "learning_rate": 0.0002858987317480733, + "loss": 0.4361118674278259, + "memory(GiB)": 78.26, + "step": 941, + "token_acc": 0.8754664583875726, + "train_speed(iter/s)": 0.033463 + }, + { + "epoch": 0.18253160877779392, + "grad_norm": 0.09738662093877792, + "learning_rate": 0.00028585801158629063, + "loss": 0.4183694124221802, + "memory(GiB)": 78.26, + "step": 942, + "token_acc": 0.8812399810345217, + "train_speed(iter/s)": 0.033465 + }, + { + "epoch": 0.18272537906312067, + "grad_norm": 0.10861563682556152, + "learning_rate": 0.000285817235623972, + "loss": 0.4377916157245636, + "memory(GiB)": 78.26, + "step": 943, + "token_acc": 0.8772842397668782, + "train_speed(iter/s)": 0.033466 + }, + { + "epoch": 0.1829191493484474, + "grad_norm": 0.11255238950252533, + "learning_rate": 0.0002857764038778651, + "loss": 0.4062233865261078, + "memory(GiB)": 78.26, + "step": 944, + "token_acc": 0.8827611918896441, + "train_speed(iter/s)": 0.033468 + }, + { + "epoch": 0.18311291963377416, + "grad_norm": 0.1005673035979271, + "learning_rate": 0.0002857355163647407, + "loss": 0.3768383264541626, + "memory(GiB)": 78.26, + "step": 945, + "token_acc": 0.8898000897352809, + "train_speed(iter/s)": 0.03347 + }, + { + "epoch": 0.1833066899191009, + "grad_norm": 0.1112765297293663, + "learning_rate": 0.00028569457310139237, + "loss": 0.4166128635406494, + "memory(GiB)": 78.26, + "step": 946, + "token_acc": 0.87920341245126, + "train_speed(iter/s)": 0.033472 + }, + { + "epoch": 0.18350046020442765, + "grad_norm": 0.12681414186954498, + "learning_rate": 0.00028565357410463663, + "loss": 0.4559955894947052, + "memory(GiB)": 78.26, + "step": 947, + "token_acc": 0.8697389451251998, + "train_speed(iter/s)": 0.033474 + }, + { + "epoch": 0.1836942304897544, + "grad_norm": 0.1174798235297203, + "learning_rate": 0.0002856125193913128, + "loss": 0.48667585849761963, + "memory(GiB)": 78.26, + "step": 948, + "token_acc": 0.8624453490125132, + "train_speed(iter/s)": 0.033476 + }, + { + "epoch": 0.18388800077508113, + "grad_norm": 0.11614065617322922, + "learning_rate": 0.00028557140897828324, + "loss": 0.43031829595565796, + "memory(GiB)": 78.26, + "step": 949, + "token_acc": 0.8768826126436422, + "train_speed(iter/s)": 0.033478 + }, + { + "epoch": 0.18408177106040788, + "grad_norm": 0.11647061258554459, + "learning_rate": 0.00028553024288243306, + "loss": 0.43689054250717163, + "memory(GiB)": 78.26, + "step": 950, + "token_acc": 0.8749047820567076, + "train_speed(iter/s)": 0.03348 + }, + { + "epoch": 0.18427554134573462, + "grad_norm": 0.12167865037918091, + "learning_rate": 0.0002854890211206703, + "loss": 0.4729604125022888, + "memory(GiB)": 78.26, + "step": 951, + "token_acc": 0.8637253103406103, + "train_speed(iter/s)": 0.033483 + }, + { + "epoch": 0.18446931163106137, + "grad_norm": 0.11439337581396103, + "learning_rate": 0.00028544774370992587, + "loss": 0.4109064042568207, + "memory(GiB)": 78.26, + "step": 952, + "token_acc": 0.8795947901591896, + "train_speed(iter/s)": 0.033484 + }, + { + "epoch": 0.1846630819163881, + "grad_norm": 0.11438382416963577, + "learning_rate": 0.0002854064106671534, + "loss": 0.43977493047714233, + "memory(GiB)": 78.26, + "step": 953, + "token_acc": 0.8736031785448225, + "train_speed(iter/s)": 0.033486 + }, + { + "epoch": 0.18485685220171486, + "grad_norm": 0.10898079723119736, + "learning_rate": 0.0002853650220093296, + "loss": 0.45643150806427, + "memory(GiB)": 78.26, + "step": 954, + "token_acc": 0.8683694024069004, + "train_speed(iter/s)": 0.033488 + }, + { + "epoch": 0.1850506224870416, + "grad_norm": 0.1164807602763176, + "learning_rate": 0.0002853235777534539, + "loss": 0.41862934827804565, + "memory(GiB)": 78.26, + "step": 955, + "token_acc": 0.8782421565296183, + "train_speed(iter/s)": 0.03349 + }, + { + "epoch": 0.18524439277236834, + "grad_norm": 0.11116447299718857, + "learning_rate": 0.00028528207791654847, + "loss": 0.4242367148399353, + "memory(GiB)": 78.26, + "step": 956, + "token_acc": 0.8753245376993589, + "train_speed(iter/s)": 0.033492 + }, + { + "epoch": 0.18543816305769512, + "grad_norm": 0.11318708211183548, + "learning_rate": 0.0002852405225156585, + "loss": 0.43691354990005493, + "memory(GiB)": 78.26, + "step": 957, + "token_acc": 0.8767160750423891, + "train_speed(iter/s)": 0.033493 + }, + { + "epoch": 0.18563193334302186, + "grad_norm": 0.10934760421514511, + "learning_rate": 0.00028519891156785187, + "loss": 0.45028555393218994, + "memory(GiB)": 78.26, + "step": 958, + "token_acc": 0.8715856095936043, + "train_speed(iter/s)": 0.033495 + }, + { + "epoch": 0.1858257036283486, + "grad_norm": 0.11843264102935791, + "learning_rate": 0.0002851572450902193, + "loss": 0.45888209342956543, + "memory(GiB)": 78.26, + "step": 959, + "token_acc": 0.8671850891810159, + "train_speed(iter/s)": 0.033497 + }, + { + "epoch": 0.18601947391367535, + "grad_norm": 0.10771480202674866, + "learning_rate": 0.0002851155230998744, + "loss": 0.44502004981040955, + "memory(GiB)": 78.26, + "step": 960, + "token_acc": 0.8715418593237414, + "train_speed(iter/s)": 0.033499 + }, + { + "epoch": 0.1862132441990021, + "grad_norm": 0.11727321892976761, + "learning_rate": 0.00028507374561395345, + "loss": 0.4620700478553772, + "memory(GiB)": 78.26, + "step": 961, + "token_acc": 0.8692043503148255, + "train_speed(iter/s)": 0.033501 + }, + { + "epoch": 0.18640701448432884, + "grad_norm": 0.11580432206392288, + "learning_rate": 0.0002850319126496156, + "loss": 0.43666505813598633, + "memory(GiB)": 78.26, + "step": 962, + "token_acc": 0.8729514358743046, + "train_speed(iter/s)": 0.033503 + }, + { + "epoch": 0.18660078476965558, + "grad_norm": 0.11996878683567047, + "learning_rate": 0.00028499002422404274, + "loss": 0.49721530079841614, + "memory(GiB)": 78.26, + "step": 963, + "token_acc": 0.8572844400396432, + "train_speed(iter/s)": 0.033504 + }, + { + "epoch": 0.18679455505498233, + "grad_norm": 0.09926389157772064, + "learning_rate": 0.00028494808035443966, + "loss": 0.3834714889526367, + "memory(GiB)": 78.26, + "step": 964, + "token_acc": 0.8917675254643499, + "train_speed(iter/s)": 0.033506 + }, + { + "epoch": 0.18698832534030907, + "grad_norm": 0.11022564768791199, + "learning_rate": 0.00028490608105803374, + "loss": 0.43356478214263916, + "memory(GiB)": 78.26, + "step": 965, + "token_acc": 0.8749029825923051, + "train_speed(iter/s)": 0.033508 + }, + { + "epoch": 0.1871820956256358, + "grad_norm": 0.11110399663448334, + "learning_rate": 0.0002848640263520753, + "loss": 0.4324399530887604, + "memory(GiB)": 78.26, + "step": 966, + "token_acc": 0.8767509301816591, + "train_speed(iter/s)": 0.03351 + }, + { + "epoch": 0.18737586591096256, + "grad_norm": 0.11281298100948334, + "learning_rate": 0.00028482191625383733, + "loss": 0.4322070777416229, + "memory(GiB)": 78.26, + "step": 967, + "token_acc": 0.8765359035058055, + "train_speed(iter/s)": 0.033512 + }, + { + "epoch": 0.1875696361962893, + "grad_norm": 0.1022447943687439, + "learning_rate": 0.0002847797507806155, + "loss": 0.39318740367889404, + "memory(GiB)": 78.26, + "step": 968, + "token_acc": 0.8852789546220203, + "train_speed(iter/s)": 0.033513 + }, + { + "epoch": 0.18776340648161605, + "grad_norm": 0.12078309804201126, + "learning_rate": 0.0002847375299497284, + "loss": 0.4697941243648529, + "memory(GiB)": 78.26, + "step": 969, + "token_acc": 0.8657820386654392, + "train_speed(iter/s)": 0.033515 + }, + { + "epoch": 0.1879571767669428, + "grad_norm": 0.10926394909620285, + "learning_rate": 0.00028469525377851715, + "loss": 0.3991783857345581, + "memory(GiB)": 78.26, + "step": 970, + "token_acc": 0.8852660300136426, + "train_speed(iter/s)": 0.033517 + }, + { + "epoch": 0.18815094705226953, + "grad_norm": 0.11045163869857788, + "learning_rate": 0.0002846529222843458, + "loss": 0.4475860893726349, + "memory(GiB)": 78.26, + "step": 971, + "token_acc": 0.8725651014968219, + "train_speed(iter/s)": 0.033518 + }, + { + "epoch": 0.18834471733759628, + "grad_norm": 0.09906210750341415, + "learning_rate": 0.000284610535484601, + "loss": 0.34970560669898987, + "memory(GiB)": 78.26, + "step": 972, + "token_acc": 0.8972711247963943, + "train_speed(iter/s)": 0.03352 + }, + { + "epoch": 0.18853848762292302, + "grad_norm": 0.11121566593647003, + "learning_rate": 0.00028456809339669214, + "loss": 0.4188978672027588, + "memory(GiB)": 78.26, + "step": 973, + "token_acc": 0.8785746369216949, + "train_speed(iter/s)": 0.033522 + }, + { + "epoch": 0.18873225790824977, + "grad_norm": 0.11328519135713577, + "learning_rate": 0.00028452559603805137, + "loss": 0.44297975301742554, + "memory(GiB)": 78.26, + "step": 974, + "token_acc": 0.8741913622081202, + "train_speed(iter/s)": 0.033524 + }, + { + "epoch": 0.1889260281935765, + "grad_norm": 0.10972630977630615, + "learning_rate": 0.00028448304342613344, + "loss": 0.4079618453979492, + "memory(GiB)": 78.26, + "step": 975, + "token_acc": 0.8813834977772959, + "train_speed(iter/s)": 0.033526 + }, + { + "epoch": 0.18911979847890326, + "grad_norm": 0.0967109352350235, + "learning_rate": 0.00028444043557841585, + "loss": 0.38039693236351013, + "memory(GiB)": 78.26, + "step": 976, + "token_acc": 0.8867412594640688, + "train_speed(iter/s)": 0.033528 + }, + { + "epoch": 0.18931356876423, + "grad_norm": 0.10372382402420044, + "learning_rate": 0.00028439777251239887, + "loss": 0.42939719557762146, + "memory(GiB)": 78.26, + "step": 977, + "token_acc": 0.8766995304010031, + "train_speed(iter/s)": 0.033529 + }, + { + "epoch": 0.18950733904955674, + "grad_norm": 0.10110750049352646, + "learning_rate": 0.00028435505424560527, + "loss": 0.4236353039741516, + "memory(GiB)": 78.26, + "step": 978, + "token_acc": 0.8794206803637589, + "train_speed(iter/s)": 0.033531 + }, + { + "epoch": 0.1897011093348835, + "grad_norm": 0.11021259427070618, + "learning_rate": 0.00028431228079558063, + "loss": 0.4256800711154938, + "memory(GiB)": 78.26, + "step": 979, + "token_acc": 0.8776469925658932, + "train_speed(iter/s)": 0.033533 + }, + { + "epoch": 0.18989487962021023, + "grad_norm": 0.10382870584726334, + "learning_rate": 0.00028426945217989316, + "loss": 0.39924290776252747, + "memory(GiB)": 78.26, + "step": 980, + "token_acc": 0.8837884005314389, + "train_speed(iter/s)": 0.033535 + }, + { + "epoch": 0.19008864990553698, + "grad_norm": 0.11175846308469772, + "learning_rate": 0.00028422656841613377, + "loss": 0.42587774991989136, + "memory(GiB)": 78.26, + "step": 981, + "token_acc": 0.8782795084309802, + "train_speed(iter/s)": 0.033537 + }, + { + "epoch": 0.19028242019086372, + "grad_norm": 0.10329094529151917, + "learning_rate": 0.00028418362952191585, + "loss": 0.40456724166870117, + "memory(GiB)": 78.26, + "step": 982, + "token_acc": 0.881374140565317, + "train_speed(iter/s)": 0.033539 + }, + { + "epoch": 0.19047619047619047, + "grad_norm": 0.11278124898672104, + "learning_rate": 0.0002841406355148757, + "loss": 0.4433564245700836, + "memory(GiB)": 78.26, + "step": 983, + "token_acc": 0.871099352158647, + "train_speed(iter/s)": 0.033541 + }, + { + "epoch": 0.1906699607615172, + "grad_norm": 0.11207697540521622, + "learning_rate": 0.000284097586412672, + "loss": 0.40271443128585815, + "memory(GiB)": 78.26, + "step": 984, + "token_acc": 0.8841104695500599, + "train_speed(iter/s)": 0.033542 + }, + { + "epoch": 0.19086373104684395, + "grad_norm": 0.11175701022148132, + "learning_rate": 0.00028405448223298624, + "loss": 0.451436847448349, + "memory(GiB)": 78.26, + "step": 985, + "token_acc": 0.8711304942705661, + "train_speed(iter/s)": 0.033544 + }, + { + "epoch": 0.1910575013321707, + "grad_norm": 0.10010375082492828, + "learning_rate": 0.0002840113229935224, + "loss": 0.3909650444984436, + "memory(GiB)": 78.26, + "step": 986, + "token_acc": 0.8874252112061217, + "train_speed(iter/s)": 0.033545 + }, + { + "epoch": 0.19125127161749747, + "grad_norm": 0.09799065440893173, + "learning_rate": 0.0002839681087120073, + "loss": 0.38588500022888184, + "memory(GiB)": 78.26, + "step": 987, + "token_acc": 0.8859629435674921, + "train_speed(iter/s)": 0.033547 + }, + { + "epoch": 0.19144504190282421, + "grad_norm": 0.11740557104349136, + "learning_rate": 0.0002839248394061899, + "loss": 0.4740091562271118, + "memory(GiB)": 78.26, + "step": 988, + "token_acc": 0.8610855829982769, + "train_speed(iter/s)": 0.033549 + }, + { + "epoch": 0.19163881218815096, + "grad_norm": 0.11015292257070541, + "learning_rate": 0.0002838815150938424, + "loss": 0.41003599762916565, + "memory(GiB)": 78.26, + "step": 989, + "token_acc": 0.882699868938401, + "train_speed(iter/s)": 0.033551 + }, + { + "epoch": 0.1918325824734777, + "grad_norm": 0.11227616667747498, + "learning_rate": 0.0002838381357927591, + "loss": 0.41413140296936035, + "memory(GiB)": 78.26, + "step": 990, + "token_acc": 0.880181635836862, + "train_speed(iter/s)": 0.033552 + }, + { + "epoch": 0.19202635275880445, + "grad_norm": 0.11800327897071838, + "learning_rate": 0.000283794701520757, + "loss": 0.4505453109741211, + "memory(GiB)": 78.26, + "step": 991, + "token_acc": 0.8734527175620483, + "train_speed(iter/s)": 0.033554 + }, + { + "epoch": 0.1922201230441312, + "grad_norm": 0.11021004617214203, + "learning_rate": 0.00028375121229567583, + "loss": 0.42506149411201477, + "memory(GiB)": 78.26, + "step": 992, + "token_acc": 0.8773052126543056, + "train_speed(iter/s)": 0.033556 + }, + { + "epoch": 0.19241389332945794, + "grad_norm": 0.11306871473789215, + "learning_rate": 0.0002837076681353777, + "loss": 0.4095050096511841, + "memory(GiB)": 78.26, + "step": 993, + "token_acc": 0.8811114926437571, + "train_speed(iter/s)": 0.033557 + }, + { + "epoch": 0.19260766361478468, + "grad_norm": 0.09795724600553513, + "learning_rate": 0.00028366406905774746, + "loss": 0.376755952835083, + "memory(GiB)": 78.26, + "step": 994, + "token_acc": 0.8902818813383374, + "train_speed(iter/s)": 0.033559 + }, + { + "epoch": 0.19280143390011142, + "grad_norm": 0.10513719916343689, + "learning_rate": 0.0002836204150806923, + "loss": 0.39423179626464844, + "memory(GiB)": 78.26, + "step": 995, + "token_acc": 0.8858132362606805, + "train_speed(iter/s)": 0.03356 + }, + { + "epoch": 0.19299520418543817, + "grad_norm": 0.10870835930109024, + "learning_rate": 0.0002835767062221422, + "loss": 0.43109869956970215, + "memory(GiB)": 78.26, + "step": 996, + "token_acc": 0.8794217244073977, + "train_speed(iter/s)": 0.033562 + }, + { + "epoch": 0.1931889744707649, + "grad_norm": 0.11085118353366852, + "learning_rate": 0.0002835329425000495, + "loss": 0.41626110672950745, + "memory(GiB)": 78.26, + "step": 997, + "token_acc": 0.8794915927446048, + "train_speed(iter/s)": 0.033564 + }, + { + "epoch": 0.19338274475609166, + "grad_norm": 0.10917031019926071, + "learning_rate": 0.00028348912393238914, + "loss": 0.442068487405777, + "memory(GiB)": 78.26, + "step": 998, + "token_acc": 0.8718288169021695, + "train_speed(iter/s)": 0.033566 + }, + { + "epoch": 0.1935765150414184, + "grad_norm": 0.11420496553182602, + "learning_rate": 0.00028344525053715857, + "loss": 0.45237767696380615, + "memory(GiB)": 78.26, + "step": 999, + "token_acc": 0.8713443106414352, + "train_speed(iter/s)": 0.033568 + }, + { + "epoch": 0.19377028532674515, + "grad_norm": 0.10354109853506088, + "learning_rate": 0.0002834013223323778, + "loss": 0.40592384338378906, + "memory(GiB)": 78.26, + "step": 1000, + "token_acc": 0.883349086326402, + "train_speed(iter/s)": 0.033569 + }, + { + "epoch": 0.19377028532674515, + "eval_loss": 0.48618289828300476, + "eval_runtime": 1345.1073, + "eval_samples_per_second": 5.017, + "eval_steps_per_second": 5.017, + "eval_token_acc": 0.8788295367774018, + "step": 1000 + }, + { + "epoch": 0.1939640556120719, + "grad_norm": 0.11008410900831223, + "learning_rate": 0.00028335733933608937, + "loss": 0.44370290637016296, + "memory(GiB)": 78.26, + "step": 1001, + "token_acc": 0.8703201902339214, + "train_speed(iter/s)": 0.032109 + }, + { + "epoch": 0.19415782589739863, + "grad_norm": 0.10204144567251205, + "learning_rate": 0.00028331330156635814, + "loss": 0.37651312351226807, + "memory(GiB)": 78.26, + "step": 1002, + "token_acc": 0.8898332699723022, + "train_speed(iter/s)": 0.032111 + }, + { + "epoch": 0.19435159618272538, + "grad_norm": 0.10822474956512451, + "learning_rate": 0.0002832692090412717, + "loss": 0.4068623185157776, + "memory(GiB)": 78.26, + "step": 1003, + "token_acc": 0.8804925222697431, + "train_speed(iter/s)": 0.032114 + }, + { + "epoch": 0.19454536646805212, + "grad_norm": 0.10033664852380753, + "learning_rate": 0.0002832250617789401, + "loss": 0.4149298667907715, + "memory(GiB)": 78.26, + "step": 1004, + "token_acc": 0.8801352320458969, + "train_speed(iter/s)": 0.032117 + }, + { + "epoch": 0.19473913675337887, + "grad_norm": 0.12285728007555008, + "learning_rate": 0.00028318085979749563, + "loss": 0.46761053800582886, + "memory(GiB)": 78.26, + "step": 1005, + "token_acc": 0.8641041659816526, + "train_speed(iter/s)": 0.03212 + }, + { + "epoch": 0.1949329070387056, + "grad_norm": 0.09842801839113235, + "learning_rate": 0.0002831366031150934, + "loss": 0.39952999353408813, + "memory(GiB)": 78.26, + "step": 1006, + "token_acc": 0.8830670627587659, + "train_speed(iter/s)": 0.032123 + }, + { + "epoch": 0.19512667732403235, + "grad_norm": 0.11463743448257446, + "learning_rate": 0.0002830922917499108, + "loss": 0.43408212065696716, + "memory(GiB)": 78.26, + "step": 1007, + "token_acc": 0.8743004464566434, + "train_speed(iter/s)": 0.032126 + }, + { + "epoch": 0.1953204476093591, + "grad_norm": 0.11018949747085571, + "learning_rate": 0.00028304792572014754, + "loss": 0.43823209404945374, + "memory(GiB)": 78.26, + "step": 1008, + "token_acc": 0.8741348099243957, + "train_speed(iter/s)": 0.032128 + }, + { + "epoch": 0.19551421789468584, + "grad_norm": 0.11423125863075256, + "learning_rate": 0.00028300350504402606, + "loss": 0.45217055082321167, + "memory(GiB)": 78.26, + "step": 1009, + "token_acc": 0.8701353400348177, + "train_speed(iter/s)": 0.032131 + }, + { + "epoch": 0.1957079881800126, + "grad_norm": 0.1115386039018631, + "learning_rate": 0.0002829590297397912, + "loss": 0.4279889762401581, + "memory(GiB)": 78.26, + "step": 1010, + "token_acc": 0.8788333144732375, + "train_speed(iter/s)": 0.032134 + }, + { + "epoch": 0.19590175846533933, + "grad_norm": 0.1104380413889885, + "learning_rate": 0.00028291449982570995, + "loss": 0.4566521942615509, + "memory(GiB)": 78.26, + "step": 1011, + "token_acc": 0.8691424216142343, + "train_speed(iter/s)": 0.032137 + }, + { + "epoch": 0.19609552875066608, + "grad_norm": 0.11296399682760239, + "learning_rate": 0.00028286991532007217, + "loss": 0.43526938557624817, + "memory(GiB)": 78.26, + "step": 1012, + "token_acc": 0.8731874876819551, + "train_speed(iter/s)": 0.03214 + }, + { + "epoch": 0.19628929903599282, + "grad_norm": 0.10835513472557068, + "learning_rate": 0.0002828252762411898, + "loss": 0.4160099923610687, + "memory(GiB)": 78.26, + "step": 1013, + "token_acc": 0.880335822929728, + "train_speed(iter/s)": 0.032143 + }, + { + "epoch": 0.19648306932131956, + "grad_norm": 0.09724919497966766, + "learning_rate": 0.00028278058260739733, + "loss": 0.3947051763534546, + "memory(GiB)": 78.26, + "step": 1014, + "token_acc": 0.8845330218523385, + "train_speed(iter/s)": 0.032145 + }, + { + "epoch": 0.1966768396066463, + "grad_norm": 0.11479044705629349, + "learning_rate": 0.0002827358344370516, + "loss": 0.4259167015552521, + "memory(GiB)": 78.26, + "step": 1015, + "token_acc": 0.8770737733851042, + "train_speed(iter/s)": 0.032148 + }, + { + "epoch": 0.19687060989197305, + "grad_norm": 0.11837134510278702, + "learning_rate": 0.000282691031748532, + "loss": 0.4474408030509949, + "memory(GiB)": 78.26, + "step": 1016, + "token_acc": 0.8693976711066432, + "train_speed(iter/s)": 0.032151 + }, + { + "epoch": 0.19706438017729982, + "grad_norm": 0.10522796213626862, + "learning_rate": 0.00028264617456024, + "loss": 0.41682255268096924, + "memory(GiB)": 78.26, + "step": 1017, + "token_acc": 0.8796135557400558, + "train_speed(iter/s)": 0.032154 + }, + { + "epoch": 0.19725815046262657, + "grad_norm": 0.10754600167274475, + "learning_rate": 0.00028260126289059986, + "loss": 0.4248649477958679, + "memory(GiB)": 78.26, + "step": 1018, + "token_acc": 0.8764376037077302, + "train_speed(iter/s)": 0.032156 + }, + { + "epoch": 0.1974519207479533, + "grad_norm": 0.10504762083292007, + "learning_rate": 0.00028255629675805785, + "loss": 0.4158293604850769, + "memory(GiB)": 78.26, + "step": 1019, + "token_acc": 0.8806701766216252, + "train_speed(iter/s)": 0.032159 + }, + { + "epoch": 0.19764569103328006, + "grad_norm": 0.11185994744300842, + "learning_rate": 0.0002825112761810828, + "loss": 0.43085163831710815, + "memory(GiB)": 78.26, + "step": 1020, + "token_acc": 0.8744727730563424, + "train_speed(iter/s)": 0.032162 + }, + { + "epoch": 0.1978394613186068, + "grad_norm": 0.09813162684440613, + "learning_rate": 0.000282466201178166, + "loss": 0.4018121063709259, + "memory(GiB)": 78.26, + "step": 1021, + "token_acc": 0.8820514020879172, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.19803323160393355, + "grad_norm": 0.1113823875784874, + "learning_rate": 0.0002824210717678209, + "loss": 0.40826714038848877, + "memory(GiB)": 78.26, + "step": 1022, + "token_acc": 0.882307549027434, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.1982270018892603, + "grad_norm": 0.1155644953250885, + "learning_rate": 0.00028237588796858323, + "loss": 0.4487013816833496, + "memory(GiB)": 78.26, + "step": 1023, + "token_acc": 0.8709012113617377, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.19842077217458703, + "grad_norm": 0.11795882880687714, + "learning_rate": 0.0002823306497990113, + "loss": 0.43464547395706177, + "memory(GiB)": 78.26, + "step": 1024, + "token_acc": 0.8770835761743793, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.19861454245991378, + "grad_norm": 0.1030697301030159, + "learning_rate": 0.00028228535727768575, + "loss": 0.4153880476951599, + "memory(GiB)": 78.26, + "step": 1025, + "token_acc": 0.8816005247622172, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.19880831274524052, + "grad_norm": 0.10959405452013016, + "learning_rate": 0.00028224001042320923, + "loss": 0.4293935000896454, + "memory(GiB)": 78.26, + "step": 1026, + "token_acc": 0.8751588677065281, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.19900208303056727, + "grad_norm": 0.11512494832277298, + "learning_rate": 0.00028219460925420697, + "loss": 0.46233439445495605, + "memory(GiB)": 78.26, + "step": 1027, + "token_acc": 0.8720349947631076, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.199195853315894, + "grad_norm": 0.11418092250823975, + "learning_rate": 0.00028214915378932653, + "loss": 0.45269933342933655, + "memory(GiB)": 78.26, + "step": 1028, + "token_acc": 0.8701842783140374, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.19938962360122076, + "grad_norm": 0.10788623243570328, + "learning_rate": 0.00028210364404723765, + "loss": 0.4401698708534241, + "memory(GiB)": 78.26, + "step": 1029, + "token_acc": 0.8730818757377401, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.1995833938865475, + "grad_norm": 0.10503195226192474, + "learning_rate": 0.00028205808004663237, + "loss": 0.39039331674575806, + "memory(GiB)": 78.26, + "step": 1030, + "token_acc": 0.8859542777970212, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.19977716417187424, + "grad_norm": 0.11032052338123322, + "learning_rate": 0.0002820124618062251, + "loss": 0.39245864748954773, + "memory(GiB)": 78.26, + "step": 1031, + "token_acc": 0.8865768832322036, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.199970934457201, + "grad_norm": 0.10605579614639282, + "learning_rate": 0.00028196678934475246, + "loss": 0.42015910148620605, + "memory(GiB)": 78.26, + "step": 1032, + "token_acc": 0.8770413805519122, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.20016470474252773, + "grad_norm": 0.11556072533130646, + "learning_rate": 0.00028192106268097334, + "loss": 0.412725567817688, + "memory(GiB)": 78.26, + "step": 1033, + "token_acc": 0.8829517954994298, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.20035847502785448, + "grad_norm": 0.11041781306266785, + "learning_rate": 0.00028187528183366893, + "loss": 0.4078274369239807, + "memory(GiB)": 78.26, + "step": 1034, + "token_acc": 0.8829038467063116, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.20055224531318122, + "grad_norm": 0.11429005116224289, + "learning_rate": 0.0002818294468216426, + "loss": 0.4424319267272949, + "memory(GiB)": 78.26, + "step": 1035, + "token_acc": 0.8738159769761659, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.20074601559850797, + "grad_norm": 0.10818706452846527, + "learning_rate": 0.00028178355766372013, + "loss": 0.41155439615249634, + "memory(GiB)": 78.26, + "step": 1036, + "token_acc": 0.8821931101407084, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.2009397858838347, + "grad_norm": 0.10576290637254715, + "learning_rate": 0.0002817376143787493, + "loss": 0.425853431224823, + "memory(GiB)": 78.26, + "step": 1037, + "token_acc": 0.8778621912804088, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.20113355616916145, + "grad_norm": 0.11154097318649292, + "learning_rate": 0.0002816916169856004, + "loss": 0.39488485455513, + "memory(GiB)": 78.26, + "step": 1038, + "token_acc": 0.8855062677979039, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.2013273264544882, + "grad_norm": 0.10686661303043365, + "learning_rate": 0.00028164556550316563, + "loss": 0.401602178812027, + "memory(GiB)": 78.26, + "step": 1039, + "token_acc": 0.8835590770383853, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.20152109673981494, + "grad_norm": 0.11480426788330078, + "learning_rate": 0.00028159945995035975, + "loss": 0.4522130489349365, + "memory(GiB)": 78.26, + "step": 1040, + "token_acc": 0.8704494740197641, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.2017148670251417, + "grad_norm": 0.10808564722537994, + "learning_rate": 0.0002815533003461193, + "loss": 0.4125955402851105, + "memory(GiB)": 78.26, + "step": 1041, + "token_acc": 0.8798711669505963, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.20190863731046843, + "grad_norm": 0.11643750965595245, + "learning_rate": 0.00028150708670940356, + "loss": 0.44826722145080566, + "memory(GiB)": 78.26, + "step": 1042, + "token_acc": 0.8713785046728972, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.20210240759579517, + "grad_norm": 0.1128215417265892, + "learning_rate": 0.00028146081905919355, + "loss": 0.4168522357940674, + "memory(GiB)": 78.26, + "step": 1043, + "token_acc": 0.8776015581524763, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.20229617788112192, + "grad_norm": 0.10721378773450851, + "learning_rate": 0.00028141449741449264, + "loss": 0.4148525297641754, + "memory(GiB)": 78.26, + "step": 1044, + "token_acc": 0.880571123565519, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.20248994816644866, + "grad_norm": 0.11519747227430344, + "learning_rate": 0.0002813681217943264, + "loss": 0.44546476006507874, + "memory(GiB)": 78.26, + "step": 1045, + "token_acc": 0.8722711825355682, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.2026837184517754, + "grad_norm": 0.10372038185596466, + "learning_rate": 0.00028132169221774256, + "loss": 0.36725738644599915, + "memory(GiB)": 78.26, + "step": 1046, + "token_acc": 0.8941524609236684, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.20287748873710215, + "grad_norm": 0.11429732292890549, + "learning_rate": 0.00028127520870381095, + "loss": 0.45745980739593506, + "memory(GiB)": 78.26, + "step": 1047, + "token_acc": 0.8703554240277405, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.20307125902242892, + "grad_norm": 0.11484638601541519, + "learning_rate": 0.00028122867127162364, + "loss": 0.4351459741592407, + "memory(GiB)": 78.26, + "step": 1048, + "token_acc": 0.8764367816091954, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.20326502930775567, + "grad_norm": 0.10716410726308823, + "learning_rate": 0.0002811820799402948, + "loss": 0.4354075789451599, + "memory(GiB)": 78.26, + "step": 1049, + "token_acc": 0.8755018944749194, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.2034587995930824, + "grad_norm": 0.0956411212682724, + "learning_rate": 0.00028113543472896074, + "loss": 0.37369605898857117, + "memory(GiB)": 78.26, + "step": 1050, + "token_acc": 0.8899866307131041, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.20365256987840916, + "grad_norm": 0.10245929658412933, + "learning_rate": 0.0002810887356567798, + "loss": 0.3949301242828369, + "memory(GiB)": 78.26, + "step": 1051, + "token_acc": 0.8889433444422662, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.2038463401637359, + "grad_norm": 0.11300604790449142, + "learning_rate": 0.0002810419827429327, + "loss": 0.4184969663619995, + "memory(GiB)": 78.26, + "step": 1052, + "token_acc": 0.8773636026580726, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.20404011044906264, + "grad_norm": 0.1060163602232933, + "learning_rate": 0.00028099517600662207, + "loss": 0.41772571206092834, + "memory(GiB)": 78.26, + "step": 1053, + "token_acc": 0.8804525071341215, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.2042338807343894, + "grad_norm": 0.11069151014089584, + "learning_rate": 0.00028094831546707265, + "loss": 0.4313889741897583, + "memory(GiB)": 78.26, + "step": 1054, + "token_acc": 0.8761178045515395, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.20442765101971613, + "grad_norm": 0.11553742736577988, + "learning_rate": 0.00028090140114353133, + "loss": 0.4176057279109955, + "memory(GiB)": 78.26, + "step": 1055, + "token_acc": 0.8819988121591706, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.20462142130504288, + "grad_norm": 0.10594391077756882, + "learning_rate": 0.00028085443305526713, + "loss": 0.394021213054657, + "memory(GiB)": 78.26, + "step": 1056, + "token_acc": 0.885325837540156, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.20481519159036962, + "grad_norm": 0.11002414673566818, + "learning_rate": 0.0002808074112215711, + "loss": 0.41269078850746155, + "memory(GiB)": 78.26, + "step": 1057, + "token_acc": 0.8808142873550017, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.20500896187569637, + "grad_norm": 0.09443158656358719, + "learning_rate": 0.0002807603356617563, + "loss": 0.352535218000412, + "memory(GiB)": 78.26, + "step": 1058, + "token_acc": 0.8964792433000526, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.2052027321610231, + "grad_norm": 0.11314352601766586, + "learning_rate": 0.00028071320639515805, + "loss": 0.4366722106933594, + "memory(GiB)": 78.26, + "step": 1059, + "token_acc": 0.8743527508090615, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.20539650244634985, + "grad_norm": 0.10947109758853912, + "learning_rate": 0.00028066602344113353, + "loss": 0.4176010489463806, + "memory(GiB)": 78.26, + "step": 1060, + "token_acc": 0.8801968582792068, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.2055902727316766, + "grad_norm": 0.10332785546779633, + "learning_rate": 0.000280618786819062, + "loss": 0.39836370944976807, + "memory(GiB)": 78.26, + "step": 1061, + "token_acc": 0.8847858883602182, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.20578404301700334, + "grad_norm": 0.10330932587385178, + "learning_rate": 0.0002805714965483449, + "loss": 0.404694139957428, + "memory(GiB)": 78.26, + "step": 1062, + "token_acc": 0.8835061262959473, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.2059778133023301, + "grad_norm": 0.12757974863052368, + "learning_rate": 0.0002805241526484055, + "loss": 0.4440545439720154, + "memory(GiB)": 78.26, + "step": 1063, + "token_acc": 0.8742229290154692, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.20617158358765683, + "grad_norm": 0.1015448048710823, + "learning_rate": 0.00028047675513868936, + "loss": 0.41053593158721924, + "memory(GiB)": 78.26, + "step": 1064, + "token_acc": 0.8818211780215641, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.20636535387298358, + "grad_norm": 0.10831795632839203, + "learning_rate": 0.00028042930403866383, + "loss": 0.4191955626010895, + "memory(GiB)": 78.26, + "step": 1065, + "token_acc": 0.880605738575983, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.20655912415831032, + "grad_norm": 0.10870091617107391, + "learning_rate": 0.0002803817993678183, + "loss": 0.41718360781669617, + "memory(GiB)": 78.26, + "step": 1066, + "token_acc": 0.8796682921131778, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.20675289444363706, + "grad_norm": 0.10902893543243408, + "learning_rate": 0.00028033424114566434, + "loss": 0.4172331988811493, + "memory(GiB)": 78.26, + "step": 1067, + "token_acc": 0.8794469249603918, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.2069466647289638, + "grad_norm": 0.11219903081655502, + "learning_rate": 0.0002802866293917353, + "loss": 0.42483946681022644, + "memory(GiB)": 78.26, + "step": 1068, + "token_acc": 0.8773620614354346, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.20714043501429055, + "grad_norm": 0.10504312813282013, + "learning_rate": 0.00028023896412558664, + "loss": 0.3886624276638031, + "memory(GiB)": 78.26, + "step": 1069, + "token_acc": 0.8863732842790435, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.2073342052996173, + "grad_norm": 0.11080943793058395, + "learning_rate": 0.00028019124536679573, + "loss": 0.4068402647972107, + "memory(GiB)": 78.26, + "step": 1070, + "token_acc": 0.8815200753561757, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.20752797558494404, + "grad_norm": 0.11574803292751312, + "learning_rate": 0.000280143473134962, + "loss": 0.4340014159679413, + "memory(GiB)": 78.26, + "step": 1071, + "token_acc": 0.8773786767852557, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.20772174587027079, + "grad_norm": 0.10820218175649643, + "learning_rate": 0.00028009564744970676, + "loss": 0.43230772018432617, + "memory(GiB)": 78.26, + "step": 1072, + "token_acc": 0.8758193236979952, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.20791551615559753, + "grad_norm": 0.11416003853082657, + "learning_rate": 0.0002800477683306733, + "loss": 0.4142245352268219, + "memory(GiB)": 78.26, + "step": 1073, + "token_acc": 0.881467683756135, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.20810928644092427, + "grad_norm": 0.10887296497821808, + "learning_rate": 0.0002799998357975269, + "loss": 0.3994034230709076, + "memory(GiB)": 78.26, + "step": 1074, + "token_acc": 0.8853917309454148, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.20830305672625102, + "grad_norm": 0.10443485528230667, + "learning_rate": 0.00027995184986995465, + "loss": 0.41546863317489624, + "memory(GiB)": 78.26, + "step": 1075, + "token_acc": 0.8815385377451952, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.20849682701157776, + "grad_norm": 0.10981526970863342, + "learning_rate": 0.0002799038105676658, + "loss": 0.4088175296783447, + "memory(GiB)": 78.26, + "step": 1076, + "token_acc": 0.8810430263475949, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.2086905972969045, + "grad_norm": 0.10930271446704865, + "learning_rate": 0.0002798557179103912, + "loss": 0.44178467988967896, + "memory(GiB)": 78.26, + "step": 1077, + "token_acc": 0.8711125622398171, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.20888436758223128, + "grad_norm": 0.10522231459617615, + "learning_rate": 0.00027980757191788395, + "loss": 0.4103907644748688, + "memory(GiB)": 78.26, + "step": 1078, + "token_acc": 0.8826355904120181, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.20907813786755802, + "grad_norm": 0.10563495755195618, + "learning_rate": 0.00027975937260991886, + "loss": 0.4192779064178467, + "memory(GiB)": 78.26, + "step": 1079, + "token_acc": 0.8789847870957878, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.20927190815288477, + "grad_norm": 0.10958468914031982, + "learning_rate": 0.00027971112000629264, + "loss": 0.4349307417869568, + "memory(GiB)": 78.26, + "step": 1080, + "token_acc": 0.8740859040094141, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.2094656784382115, + "grad_norm": 0.09944422543048859, + "learning_rate": 0.000279662814126824, + "loss": 0.35245591402053833, + "memory(GiB)": 78.26, + "step": 1081, + "token_acc": 0.896417537322233, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.20965944872353826, + "grad_norm": 0.10272464156150818, + "learning_rate": 0.0002796144549913534, + "loss": 0.4116705358028412, + "memory(GiB)": 78.26, + "step": 1082, + "token_acc": 0.8808783118162493, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.209853219008865, + "grad_norm": 0.11050140857696533, + "learning_rate": 0.0002795660426197432, + "loss": 0.42899516224861145, + "memory(GiB)": 78.26, + "step": 1083, + "token_acc": 0.874714182142658, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.21004698929419174, + "grad_norm": 0.10963544994592667, + "learning_rate": 0.0002795175770318778, + "loss": 0.43252986669540405, + "memory(GiB)": 78.26, + "step": 1084, + "token_acc": 0.8759278350515464, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.2102407595795185, + "grad_norm": 0.10902027040719986, + "learning_rate": 0.0002794690582476632, + "loss": 0.42844176292419434, + "memory(GiB)": 78.26, + "step": 1085, + "token_acc": 0.8774478501489996, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.21043452986484523, + "grad_norm": 0.10531258583068848, + "learning_rate": 0.00027942048628702747, + "loss": 0.39160269498825073, + "memory(GiB)": 78.26, + "step": 1086, + "token_acc": 0.8859537508298296, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.21062830015017198, + "grad_norm": 0.10602930188179016, + "learning_rate": 0.0002793718611699203, + "loss": 0.41201895475387573, + "memory(GiB)": 78.26, + "step": 1087, + "token_acc": 0.8800353219229671, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.21082207043549872, + "grad_norm": 0.11210530251264572, + "learning_rate": 0.0002793231829163134, + "loss": 0.40734055638313293, + "memory(GiB)": 78.26, + "step": 1088, + "token_acc": 0.8804322497961086, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.21101584072082546, + "grad_norm": 0.12339694052934647, + "learning_rate": 0.00027927445154620026, + "loss": 0.4420923888683319, + "memory(GiB)": 78.26, + "step": 1089, + "token_acc": 0.8733834499841585, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.2112096110061522, + "grad_norm": 0.09618587791919708, + "learning_rate": 0.00027922566707959607, + "loss": 0.390455961227417, + "memory(GiB)": 78.26, + "step": 1090, + "token_acc": 0.88821477324435, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.21140338129147895, + "grad_norm": 0.1139514371752739, + "learning_rate": 0.00027917682953653805, + "loss": 0.43686941266059875, + "memory(GiB)": 78.26, + "step": 1091, + "token_acc": 0.8737241340844442, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.2115971515768057, + "grad_norm": 0.10429059714078903, + "learning_rate": 0.000279127938937085, + "loss": 0.41246047616004944, + "memory(GiB)": 78.26, + "step": 1092, + "token_acc": 0.8822545491408587, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.21179092186213244, + "grad_norm": 0.10676047950983047, + "learning_rate": 0.0002790789953013176, + "loss": 0.40249383449554443, + "memory(GiB)": 78.26, + "step": 1093, + "token_acc": 0.8805235195023973, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.21198469214745919, + "grad_norm": 0.10362397134304047, + "learning_rate": 0.0002790299986493384, + "loss": 0.3985450267791748, + "memory(GiB)": 78.26, + "step": 1094, + "token_acc": 0.8856300352798694, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.21217846243278593, + "grad_norm": 0.12481772899627686, + "learning_rate": 0.0002789809490012715, + "loss": 0.48419952392578125, + "memory(GiB)": 78.26, + "step": 1095, + "token_acc": 0.8615846373517498, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.21237223271811267, + "grad_norm": 0.09449519217014313, + "learning_rate": 0.00027893184637726304, + "loss": 0.3890914022922516, + "memory(GiB)": 78.26, + "step": 1096, + "token_acc": 0.8871999258504032, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.21256600300343942, + "grad_norm": 0.1078861802816391, + "learning_rate": 0.00027888269079748073, + "loss": 0.4059605002403259, + "memory(GiB)": 78.26, + "step": 1097, + "token_acc": 0.8837670990726387, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.21275977328876616, + "grad_norm": 0.1090501993894577, + "learning_rate": 0.0002788334822821141, + "loss": 0.4224609136581421, + "memory(GiB)": 78.26, + "step": 1098, + "token_acc": 0.8791557955103201, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.2129535435740929, + "grad_norm": 0.10729729384183884, + "learning_rate": 0.00027878422085137437, + "loss": 0.4196450710296631, + "memory(GiB)": 78.26, + "step": 1099, + "token_acc": 0.8797552019583843, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.21314731385941965, + "grad_norm": 0.11662878841161728, + "learning_rate": 0.00027873490652549464, + "loss": 0.44980597496032715, + "memory(GiB)": 78.26, + "step": 1100, + "token_acc": 0.87146529562982, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.2133410841447464, + "grad_norm": 0.11625700443983078, + "learning_rate": 0.00027868553932472955, + "loss": 0.4741382300853729, + "memory(GiB)": 78.26, + "step": 1101, + "token_acc": 0.8651265929377421, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.21353485443007314, + "grad_norm": 0.10019952803850174, + "learning_rate": 0.0002786361192693555, + "loss": 0.38797110319137573, + "memory(GiB)": 78.26, + "step": 1102, + "token_acc": 0.8859078392670029, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.21372862471539988, + "grad_norm": 0.10606972128152847, + "learning_rate": 0.0002785866463796707, + "loss": 0.41653597354888916, + "memory(GiB)": 78.26, + "step": 1103, + "token_acc": 0.8805738034589087, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.21392239500072663, + "grad_norm": 0.11103309690952301, + "learning_rate": 0.000278537120675995, + "loss": 0.4037442207336426, + "memory(GiB)": 78.26, + "step": 1104, + "token_acc": 0.8811096803090532, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.21411616528605337, + "grad_norm": 0.10646630078554153, + "learning_rate": 0.0002784875421786699, + "loss": 0.38875192403793335, + "memory(GiB)": 78.26, + "step": 1105, + "token_acc": 0.8858713855085408, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.21430993557138012, + "grad_norm": 0.10254091769456863, + "learning_rate": 0.0002784379109080586, + "loss": 0.3985883891582489, + "memory(GiB)": 78.26, + "step": 1106, + "token_acc": 0.8814364477700108, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.21450370585670686, + "grad_norm": 0.11805382370948792, + "learning_rate": 0.00027838822688454605, + "loss": 0.44648706912994385, + "memory(GiB)": 78.26, + "step": 1107, + "token_acc": 0.8738782929855441, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.2146974761420336, + "grad_norm": 0.10933182388544083, + "learning_rate": 0.0002783384901285388, + "loss": 0.43097054958343506, + "memory(GiB)": 78.26, + "step": 1108, + "token_acc": 0.8753957234101638, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.21489124642736038, + "grad_norm": 0.11015936732292175, + "learning_rate": 0.00027828870066046505, + "loss": 0.4059434235095978, + "memory(GiB)": 78.26, + "step": 1109, + "token_acc": 0.8822940702232354, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.21508501671268712, + "grad_norm": 0.09735988825559616, + "learning_rate": 0.00027823885850077474, + "loss": 0.36265531182289124, + "memory(GiB)": 78.26, + "step": 1110, + "token_acc": 0.8935244370144557, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.21527878699801387, + "grad_norm": 0.10875032097101212, + "learning_rate": 0.00027818896366993927, + "loss": 0.41812700033187866, + "memory(GiB)": 78.26, + "step": 1111, + "token_acc": 0.8791208791208791, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.2154725572833406, + "grad_norm": 0.11396840214729309, + "learning_rate": 0.0002781390161884519, + "loss": 0.41851508617401123, + "memory(GiB)": 78.26, + "step": 1112, + "token_acc": 0.8805713529956355, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.21566632756866735, + "grad_norm": 0.09718259423971176, + "learning_rate": 0.00027808901607682734, + "loss": 0.3617076277732849, + "memory(GiB)": 78.26, + "step": 1113, + "token_acc": 0.8916775495666627, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.2158600978539941, + "grad_norm": 0.11616761237382889, + "learning_rate": 0.0002780389633556019, + "loss": 0.40616050362586975, + "memory(GiB)": 78.26, + "step": 1114, + "token_acc": 0.883260254376963, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.21605386813932084, + "grad_norm": 0.09937416017055511, + "learning_rate": 0.0002779888580453338, + "loss": 0.3472048044204712, + "memory(GiB)": 78.26, + "step": 1115, + "token_acc": 0.9003076604876031, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.2162476384246476, + "grad_norm": 0.10610184073448181, + "learning_rate": 0.00027793870016660247, + "loss": 0.43418559432029724, + "memory(GiB)": 78.26, + "step": 1116, + "token_acc": 0.8744638034210666, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.21644140870997433, + "grad_norm": 0.10146701335906982, + "learning_rate": 0.0002778884897400091, + "loss": 0.40924298763275146, + "memory(GiB)": 78.26, + "step": 1117, + "token_acc": 0.8807274502624192, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.21663517899530108, + "grad_norm": 0.11153874546289444, + "learning_rate": 0.0002778382267861765, + "loss": 0.40396177768707275, + "memory(GiB)": 78.26, + "step": 1118, + "token_acc": 0.8842105263157894, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.21682894928062782, + "grad_norm": 0.1122811809182167, + "learning_rate": 0.000277787911325749, + "loss": 0.40342846512794495, + "memory(GiB)": 78.26, + "step": 1119, + "token_acc": 0.882786297835746, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.21702271956595456, + "grad_norm": 0.1134880930185318, + "learning_rate": 0.0002777375433793926, + "loss": 0.4314640164375305, + "memory(GiB)": 78.26, + "step": 1120, + "token_acc": 0.8745123894291522, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.2172164898512813, + "grad_norm": 0.12474801391363144, + "learning_rate": 0.0002776871229677946, + "loss": 0.44416388869285583, + "memory(GiB)": 78.26, + "step": 1121, + "token_acc": 0.8747154810374123, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.21741026013660805, + "grad_norm": 0.1143270805478096, + "learning_rate": 0.0002776366501116642, + "loss": 0.43782278895378113, + "memory(GiB)": 78.26, + "step": 1122, + "token_acc": 0.8753460954479145, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.2176040304219348, + "grad_norm": 0.10256467759609222, + "learning_rate": 0.00027758612483173183, + "loss": 0.3676009774208069, + "memory(GiB)": 78.26, + "step": 1123, + "token_acc": 0.8936442031299904, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.21779780070726154, + "grad_norm": 0.11031196266412735, + "learning_rate": 0.00027753554714874957, + "loss": 0.40533462166786194, + "memory(GiB)": 78.26, + "step": 1124, + "token_acc": 0.8813823163138231, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.21799157099258828, + "grad_norm": 0.09570778906345367, + "learning_rate": 0.00027748491708349117, + "loss": 0.3897078037261963, + "memory(GiB)": 78.26, + "step": 1125, + "token_acc": 0.8868044826994286, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.21818534127791503, + "grad_norm": 0.10430624336004257, + "learning_rate": 0.00027743423465675167, + "loss": 0.3872542083263397, + "memory(GiB)": 78.26, + "step": 1126, + "token_acc": 0.8860606646058733, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.21837911156324177, + "grad_norm": 0.10934372991323471, + "learning_rate": 0.0002773834998893476, + "loss": 0.3712359666824341, + "memory(GiB)": 78.26, + "step": 1127, + "token_acc": 0.8935813061202739, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.21857288184856852, + "grad_norm": 0.11239821463823318, + "learning_rate": 0.0002773327128021173, + "loss": 0.4694061875343323, + "memory(GiB)": 78.26, + "step": 1128, + "token_acc": 0.867498464802099, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.21876665213389526, + "grad_norm": 0.12323271483182907, + "learning_rate": 0.00027728187341592025, + "loss": 0.4466722011566162, + "memory(GiB)": 78.26, + "step": 1129, + "token_acc": 0.8716039707419018, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.218960422419222, + "grad_norm": 0.1258774995803833, + "learning_rate": 0.0002772309817516376, + "loss": 0.4656696021556854, + "memory(GiB)": 78.26, + "step": 1130, + "token_acc": 0.8651681629371566, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.21915419270454875, + "grad_norm": 0.08737502247095108, + "learning_rate": 0.0002771800378301719, + "loss": 0.3400501012802124, + "memory(GiB)": 78.26, + "step": 1131, + "token_acc": 0.9012786337143631, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.2193479629898755, + "grad_norm": 0.10477188974618912, + "learning_rate": 0.0002771290416724472, + "loss": 0.4307701587677002, + "memory(GiB)": 78.26, + "step": 1132, + "token_acc": 0.8758152240789506, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.21954173327520224, + "grad_norm": 0.10766912251710892, + "learning_rate": 0.000277077993299409, + "loss": 0.41765448451042175, + "memory(GiB)": 78.26, + "step": 1133, + "token_acc": 0.8770553935860058, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.21973550356052898, + "grad_norm": 0.1124100387096405, + "learning_rate": 0.00027702689273202425, + "loss": 0.43786174058914185, + "memory(GiB)": 78.26, + "step": 1134, + "token_acc": 0.8744572796884457, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.21992927384585573, + "grad_norm": 0.0979883000254631, + "learning_rate": 0.00027697573999128136, + "loss": 0.3537387549877167, + "memory(GiB)": 78.26, + "step": 1135, + "token_acc": 0.8937490836225014, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.22012304413118247, + "grad_norm": 0.10655289143323898, + "learning_rate": 0.00027692453509819, + "loss": 0.41893625259399414, + "memory(GiB)": 78.26, + "step": 1136, + "token_acc": 0.8789784973184301, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.22031681441650922, + "grad_norm": 0.10611773282289505, + "learning_rate": 0.0002768732780737815, + "loss": 0.39203619956970215, + "memory(GiB)": 78.26, + "step": 1137, + "token_acc": 0.8859876620924084, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.22051058470183596, + "grad_norm": 0.10541475564241409, + "learning_rate": 0.0002768219689391085, + "loss": 0.3815195560455322, + "memory(GiB)": 78.26, + "step": 1138, + "token_acc": 0.8915424198443066, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.22070435498716273, + "grad_norm": 0.11761334538459778, + "learning_rate": 0.000276770607715245, + "loss": 0.4491420090198517, + "memory(GiB)": 78.26, + "step": 1139, + "token_acc": 0.8722042094850799, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.22089812527248948, + "grad_norm": 0.11273287236690521, + "learning_rate": 0.0002767191944232865, + "loss": 0.441051721572876, + "memory(GiB)": 78.26, + "step": 1140, + "token_acc": 0.8757705209438345, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.22109189555781622, + "grad_norm": 0.10666593909263611, + "learning_rate": 0.00027666772908434967, + "loss": 0.41061753034591675, + "memory(GiB)": 78.26, + "step": 1141, + "token_acc": 0.8814865515755317, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.22128566584314296, + "grad_norm": 0.0970299169421196, + "learning_rate": 0.0002766162117195729, + "loss": 0.37377890944480896, + "memory(GiB)": 78.26, + "step": 1142, + "token_acc": 0.8917459199248562, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.2214794361284697, + "grad_norm": 0.11498536169528961, + "learning_rate": 0.0002765646423501156, + "loss": 0.4479539394378662, + "memory(GiB)": 78.26, + "step": 1143, + "token_acc": 0.873067222283145, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.22167320641379645, + "grad_norm": 0.1132684275507927, + "learning_rate": 0.00027651302099715886, + "loss": 0.4535306692123413, + "memory(GiB)": 78.26, + "step": 1144, + "token_acc": 0.873893744313639, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.2218669766991232, + "grad_norm": 0.11660971492528915, + "learning_rate": 0.0002764613476819048, + "loss": 0.42548397183418274, + "memory(GiB)": 78.26, + "step": 1145, + "token_acc": 0.8776913060594131, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.22206074698444994, + "grad_norm": 0.10114238411188126, + "learning_rate": 0.0002764096224255771, + "loss": 0.4015495181083679, + "memory(GiB)": 78.26, + "step": 1146, + "token_acc": 0.8825136612021858, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.22225451726977669, + "grad_norm": 0.10448514670133591, + "learning_rate": 0.0002763578452494207, + "loss": 0.3914845883846283, + "memory(GiB)": 78.26, + "step": 1147, + "token_acc": 0.8844678055190539, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.22244828755510343, + "grad_norm": 0.11461488157510757, + "learning_rate": 0.0002763060161747019, + "loss": 0.4321270287036896, + "memory(GiB)": 78.26, + "step": 1148, + "token_acc": 0.8759273103686238, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.22264205784043017, + "grad_norm": 0.11510949581861496, + "learning_rate": 0.00027625413522270833, + "loss": 0.42202824354171753, + "memory(GiB)": 78.26, + "step": 1149, + "token_acc": 0.8777195685670262, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.22283582812575692, + "grad_norm": 0.11425234377384186, + "learning_rate": 0.0002762022024147488, + "loss": 0.45739829540252686, + "memory(GiB)": 78.26, + "step": 1150, + "token_acc": 0.8678213309024613, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.22302959841108366, + "grad_norm": 0.11414557695388794, + "learning_rate": 0.0002761502177721535, + "loss": 0.4292232096195221, + "memory(GiB)": 78.26, + "step": 1151, + "token_acc": 0.8774583963691377, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.2232233686964104, + "grad_norm": 0.10301478952169418, + "learning_rate": 0.00027609818131627407, + "loss": 0.3989236056804657, + "memory(GiB)": 78.26, + "step": 1152, + "token_acc": 0.8839550828699455, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.22341713898173715, + "grad_norm": 0.10694558918476105, + "learning_rate": 0.0002760460930684831, + "loss": 0.36604946851730347, + "memory(GiB)": 78.26, + "step": 1153, + "token_acc": 0.8924390538257302, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.2236109092670639, + "grad_norm": 0.13084904849529266, + "learning_rate": 0.0002759939530501748, + "loss": 0.42997825145721436, + "memory(GiB)": 78.26, + "step": 1154, + "token_acc": 0.8759590419790543, + "train_speed(iter/s)": 0.032498 + }, + { + "epoch": 0.22380467955239064, + "grad_norm": 0.10484279692173004, + "learning_rate": 0.00027594176128276435, + "loss": 0.42240795493125916, + "memory(GiB)": 78.26, + "step": 1155, + "token_acc": 0.8780322748568454, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.22399844983771738, + "grad_norm": 0.10142414271831512, + "learning_rate": 0.00027588951778768835, + "loss": 0.38500741124153137, + "memory(GiB)": 78.26, + "step": 1156, + "token_acc": 0.8893578852952039, + "train_speed(iter/s)": 0.032503 + }, + { + "epoch": 0.22419222012304413, + "grad_norm": 0.11852286010980606, + "learning_rate": 0.0002758372225864046, + "loss": 0.41841238737106323, + "memory(GiB)": 78.26, + "step": 1157, + "token_acc": 0.8788539120704378, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.22438599040837087, + "grad_norm": 0.10002440959215164, + "learning_rate": 0.0002757848757003922, + "loss": 0.35776591300964355, + "memory(GiB)": 78.26, + "step": 1158, + "token_acc": 0.8962676092945886, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.22457976069369762, + "grad_norm": 0.10413894802331924, + "learning_rate": 0.0002757324771511514, + "loss": 0.38659295439720154, + "memory(GiB)": 78.26, + "step": 1159, + "token_acc": 0.887719821903838, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.22477353097902436, + "grad_norm": 0.09892678260803223, + "learning_rate": 0.0002756800269602036, + "loss": 0.37748774886131287, + "memory(GiB)": 78.26, + "step": 1160, + "token_acc": 0.8910947249007374, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.2249673012643511, + "grad_norm": 0.12362544238567352, + "learning_rate": 0.0002756275251490916, + "loss": 0.43426865339279175, + "memory(GiB)": 78.26, + "step": 1161, + "token_acc": 0.8750506441941496, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.22516107154967785, + "grad_norm": 0.10890624672174454, + "learning_rate": 0.00027557497173937923, + "loss": 0.3891877830028534, + "memory(GiB)": 78.26, + "step": 1162, + "token_acc": 0.8879472436438945, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.2253548418350046, + "grad_norm": 0.11398806422948837, + "learning_rate": 0.00027552236675265174, + "loss": 0.40979334712028503, + "memory(GiB)": 78.26, + "step": 1163, + "token_acc": 0.881280651429629, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.22554861212033134, + "grad_norm": 0.10806816816329956, + "learning_rate": 0.00027546971021051526, + "loss": 0.39391979575157166, + "memory(GiB)": 78.26, + "step": 1164, + "token_acc": 0.8866101829123446, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.22574238240565808, + "grad_norm": 0.10718252509832382, + "learning_rate": 0.00027541700213459726, + "loss": 0.44121599197387695, + "memory(GiB)": 78.26, + "step": 1165, + "token_acc": 0.8726908749968325, + "train_speed(iter/s)": 0.032522 + }, + { + "epoch": 0.22593615269098483, + "grad_norm": 0.1094525083899498, + "learning_rate": 0.00027536424254654643, + "loss": 0.4089512526988983, + "memory(GiB)": 78.26, + "step": 1166, + "token_acc": 0.8825124501342789, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.22612992297631157, + "grad_norm": 0.12114433944225311, + "learning_rate": 0.00027531143146803256, + "loss": 0.4136141538619995, + "memory(GiB)": 78.26, + "step": 1167, + "token_acc": 0.8802897980960939, + "train_speed(iter/s)": 0.032527 + }, + { + "epoch": 0.22632369326163831, + "grad_norm": 0.10031379014253616, + "learning_rate": 0.00027525856892074646, + "loss": 0.37666836380958557, + "memory(GiB)": 78.26, + "step": 1168, + "token_acc": 0.8915674529813404, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.2265174635469651, + "grad_norm": 0.11311759054660797, + "learning_rate": 0.0002752056549264003, + "loss": 0.40700918436050415, + "memory(GiB)": 78.26, + "step": 1169, + "token_acc": 0.8810722623836179, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.22671123383229183, + "grad_norm": 0.11526045948266983, + "learning_rate": 0.0002751526895067273, + "loss": 0.4090433120727539, + "memory(GiB)": 78.26, + "step": 1170, + "token_acc": 0.8799791792230562, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.22690500411761858, + "grad_norm": 0.1078508123755455, + "learning_rate": 0.0002750996726834817, + "loss": 0.3891124427318573, + "memory(GiB)": 78.26, + "step": 1171, + "token_acc": 0.887758734679347, + "train_speed(iter/s)": 0.032536 + }, + { + "epoch": 0.22709877440294532, + "grad_norm": 0.12240707129240036, + "learning_rate": 0.0002750466044784389, + "loss": 0.4698965549468994, + "memory(GiB)": 78.26, + "step": 1172, + "token_acc": 0.8641207547169811, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.22729254468827206, + "grad_norm": 0.11151791363954544, + "learning_rate": 0.00027499348491339564, + "loss": 0.4254417419433594, + "memory(GiB)": 78.26, + "step": 1173, + "token_acc": 0.8755936161930712, + "train_speed(iter/s)": 0.03254 + }, + { + "epoch": 0.2274863149735988, + "grad_norm": 0.1045253574848175, + "learning_rate": 0.0002749403140101693, + "loss": 0.406377375125885, + "memory(GiB)": 78.26, + "step": 1174, + "token_acc": 0.8817293918478969, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.22768008525892555, + "grad_norm": 0.11521026492118835, + "learning_rate": 0.00027488709179059886, + "loss": 0.4493963122367859, + "memory(GiB)": 78.26, + "step": 1175, + "token_acc": 0.8708561714101475, + "train_speed(iter/s)": 0.032545 + }, + { + "epoch": 0.2278738555442523, + "grad_norm": 0.1163213700056076, + "learning_rate": 0.00027483381827654384, + "loss": 0.42551738023757935, + "memory(GiB)": 78.26, + "step": 1176, + "token_acc": 0.8768161718256475, + "train_speed(iter/s)": 0.032547 + }, + { + "epoch": 0.22806762582957904, + "grad_norm": 0.11902674287557602, + "learning_rate": 0.0002747804934898853, + "loss": 0.4208133816719055, + "memory(GiB)": 78.26, + "step": 1177, + "token_acc": 0.8792419003713664, + "train_speed(iter/s)": 0.032549 + }, + { + "epoch": 0.22826139611490578, + "grad_norm": 0.11104600876569748, + "learning_rate": 0.00027472711745252514, + "loss": 0.4281710982322693, + "memory(GiB)": 78.26, + "step": 1178, + "token_acc": 0.8774945561675419, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.22845516640023253, + "grad_norm": 0.12366992980241776, + "learning_rate": 0.00027467369018638625, + "loss": 0.4080888032913208, + "memory(GiB)": 78.26, + "step": 1179, + "token_acc": 0.8825195699186744, + "train_speed(iter/s)": 0.032554 + }, + { + "epoch": 0.22864893668555927, + "grad_norm": 0.1136869415640831, + "learning_rate": 0.00027462021171341264, + "loss": 0.401567280292511, + "memory(GiB)": 78.26, + "step": 1180, + "token_acc": 0.8848314606741573, + "train_speed(iter/s)": 0.032556 + }, + { + "epoch": 0.22884270697088602, + "grad_norm": 0.11184633523225784, + "learning_rate": 0.0002745666820555695, + "loss": 0.43062543869018555, + "memory(GiB)": 78.26, + "step": 1181, + "token_acc": 0.8777496003087271, + "train_speed(iter/s)": 0.032558 + }, + { + "epoch": 0.22903647725621276, + "grad_norm": 0.1049606204032898, + "learning_rate": 0.00027451310123484277, + "loss": 0.41081663966178894, + "memory(GiB)": 78.26, + "step": 1182, + "token_acc": 0.8797746098517307, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.2292302475415395, + "grad_norm": 0.10288522392511368, + "learning_rate": 0.0002744594692732395, + "loss": 0.39744895696640015, + "memory(GiB)": 78.26, + "step": 1183, + "token_acc": 0.8816475626653231, + "train_speed(iter/s)": 0.032562 + }, + { + "epoch": 0.22942401782686625, + "grad_norm": 0.1137736588716507, + "learning_rate": 0.00027440578619278793, + "loss": 0.4427635967731476, + "memory(GiB)": 78.26, + "step": 1184, + "token_acc": 0.8732105788154099, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.229617788112193, + "grad_norm": 0.1281413733959198, + "learning_rate": 0.000274352052015537, + "loss": 0.4403104782104492, + "memory(GiB)": 78.26, + "step": 1185, + "token_acc": 0.873717606594319, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.22981155839751974, + "grad_norm": 0.1134016141295433, + "learning_rate": 0.00027429826676355685, + "loss": 0.361285537481308, + "memory(GiB)": 78.26, + "step": 1186, + "token_acc": 0.8938451254627725, + "train_speed(iter/s)": 0.032569 + }, + { + "epoch": 0.23000532868284648, + "grad_norm": 0.1069924458861351, + "learning_rate": 0.00027424443045893855, + "loss": 0.42521095275878906, + "memory(GiB)": 78.26, + "step": 1187, + "token_acc": 0.8769493235937237, + "train_speed(iter/s)": 0.032571 + }, + { + "epoch": 0.23019909896817323, + "grad_norm": 0.11626293510198593, + "learning_rate": 0.0002741905431237941, + "loss": 0.44659221172332764, + "memory(GiB)": 78.26, + "step": 1188, + "token_acc": 0.8720954699852707, + "train_speed(iter/s)": 0.032573 + }, + { + "epoch": 0.23039286925349997, + "grad_norm": 0.10542602837085724, + "learning_rate": 0.0002741366047802564, + "loss": 0.4164351522922516, + "memory(GiB)": 78.26, + "step": 1189, + "token_acc": 0.8788257817485642, + "train_speed(iter/s)": 0.032575 + }, + { + "epoch": 0.23058663953882672, + "grad_norm": 0.12464678287506104, + "learning_rate": 0.00027408261545047946, + "loss": 0.47106125950813293, + "memory(GiB)": 78.26, + "step": 1190, + "token_acc": 0.85997171145686, + "train_speed(iter/s)": 0.032577 + }, + { + "epoch": 0.23078040982415346, + "grad_norm": 0.10709256678819656, + "learning_rate": 0.00027402857515663814, + "loss": 0.4175183176994324, + "memory(GiB)": 78.26, + "step": 1191, + "token_acc": 0.8793799278364293, + "train_speed(iter/s)": 0.032579 + }, + { + "epoch": 0.2309741801094802, + "grad_norm": 0.10052433609962463, + "learning_rate": 0.0002739744839209282, + "loss": 0.38393762707710266, + "memory(GiB)": 78.26, + "step": 1192, + "token_acc": 0.8859593910996107, + "train_speed(iter/s)": 0.032581 + }, + { + "epoch": 0.23116795039480695, + "grad_norm": 0.11317754536867142, + "learning_rate": 0.0002739203417655664, + "loss": 0.4203610122203827, + "memory(GiB)": 78.26, + "step": 1193, + "token_acc": 0.878258625139809, + "train_speed(iter/s)": 0.032583 + }, + { + "epoch": 0.2313617206801337, + "grad_norm": 0.11243908107280731, + "learning_rate": 0.0002738661487127904, + "loss": 0.3912915885448456, + "memory(GiB)": 78.26, + "step": 1194, + "token_acc": 0.8882206116741347, + "train_speed(iter/s)": 0.032586 + }, + { + "epoch": 0.23155549096546044, + "grad_norm": 0.12153773754835129, + "learning_rate": 0.00027381190478485863, + "loss": 0.41862982511520386, + "memory(GiB)": 78.26, + "step": 1195, + "token_acc": 0.8794540140538524, + "train_speed(iter/s)": 0.032588 + }, + { + "epoch": 0.23174926125078718, + "grad_norm": 0.10597037523984909, + "learning_rate": 0.0002737576100040507, + "loss": 0.38848644495010376, + "memory(GiB)": 78.26, + "step": 1196, + "token_acc": 0.8870572073398096, + "train_speed(iter/s)": 0.03259 + }, + { + "epoch": 0.23194303153611392, + "grad_norm": 0.10839282721281052, + "learning_rate": 0.0002737032643926668, + "loss": 0.3961186110973358, + "memory(GiB)": 78.26, + "step": 1197, + "token_acc": 0.8850163836759011, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.23213680182144067, + "grad_norm": 0.11460833996534348, + "learning_rate": 0.0002736488679730282, + "loss": 0.4457128942012787, + "memory(GiB)": 78.26, + "step": 1198, + "token_acc": 0.8719119445576845, + "train_speed(iter/s)": 0.032594 + }, + { + "epoch": 0.2323305721067674, + "grad_norm": 0.1022963598370552, + "learning_rate": 0.0002735944207674769, + "loss": 0.39515194296836853, + "memory(GiB)": 78.26, + "step": 1199, + "token_acc": 0.8834028679578924, + "train_speed(iter/s)": 0.032596 + }, + { + "epoch": 0.23252434239209419, + "grad_norm": 0.11231876909732819, + "learning_rate": 0.0002735399227983759, + "loss": 0.4120155870914459, + "memory(GiB)": 78.26, + "step": 1200, + "token_acc": 0.8804409194784245, + "train_speed(iter/s)": 0.032598 + }, + { + "epoch": 0.23271811267742093, + "grad_norm": 0.10544200241565704, + "learning_rate": 0.00027348537408810903, + "loss": 0.4136923551559448, + "memory(GiB)": 78.26, + "step": 1201, + "token_acc": 0.8811871376901352, + "train_speed(iter/s)": 0.03259 + }, + { + "epoch": 0.23291188296274767, + "grad_norm": 0.11337179690599442, + "learning_rate": 0.00027343077465908077, + "loss": 0.4338820278644562, + "memory(GiB)": 78.26, + "step": 1202, + "token_acc": 0.8770748279047591, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.23310565324807442, + "grad_norm": 0.11407187581062317, + "learning_rate": 0.00027337612453371665, + "loss": 0.4095255136489868, + "memory(GiB)": 78.26, + "step": 1203, + "token_acc": 0.8804728546409807, + "train_speed(iter/s)": 0.032594 + }, + { + "epoch": 0.23329942353340116, + "grad_norm": 0.11431252956390381, + "learning_rate": 0.00027332142373446297, + "loss": 0.38796931505203247, + "memory(GiB)": 78.26, + "step": 1204, + "token_acc": 0.8878559697561278, + "train_speed(iter/s)": 0.032597 + }, + { + "epoch": 0.2334931938187279, + "grad_norm": 0.12404537945985794, + "learning_rate": 0.00027326667228378673, + "loss": 0.45794570446014404, + "memory(GiB)": 78.26, + "step": 1205, + "token_acc": 0.8690552162153949, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.23368696410405465, + "grad_norm": 0.10816068947315216, + "learning_rate": 0.0002732118702041759, + "loss": 0.42319175601005554, + "memory(GiB)": 78.26, + "step": 1206, + "token_acc": 0.8777571014175469, + "train_speed(iter/s)": 0.0326 + }, + { + "epoch": 0.2338807343893814, + "grad_norm": 0.12030370533466339, + "learning_rate": 0.0002731570175181392, + "loss": 0.44653579592704773, + "memory(GiB)": 78.26, + "step": 1207, + "token_acc": 0.8731073377061239, + "train_speed(iter/s)": 0.032602 + }, + { + "epoch": 0.23407450467470814, + "grad_norm": 0.11633247882127762, + "learning_rate": 0.000273102114248206, + "loss": 0.44103875756263733, + "memory(GiB)": 78.26, + "step": 1208, + "token_acc": 0.8732270799824535, + "train_speed(iter/s)": 0.032605 + }, + { + "epoch": 0.23426827496003488, + "grad_norm": 0.121209517121315, + "learning_rate": 0.0002730471604169266, + "loss": 0.44791626930236816, + "memory(GiB)": 78.26, + "step": 1209, + "token_acc": 0.8704967327820505, + "train_speed(iter/s)": 0.032607 + }, + { + "epoch": 0.23446204524536163, + "grad_norm": 0.10651442408561707, + "learning_rate": 0.00027299215604687204, + "loss": 0.3869745433330536, + "memory(GiB)": 78.26, + "step": 1210, + "token_acc": 0.8861974534880438, + "train_speed(iter/s)": 0.032609 + }, + { + "epoch": 0.23465581553068837, + "grad_norm": 0.11880136281251907, + "learning_rate": 0.000272937101160634, + "loss": 0.42030277848243713, + "memory(GiB)": 78.26, + "step": 1211, + "token_acc": 0.8782967352155222, + "train_speed(iter/s)": 0.032611 + }, + { + "epoch": 0.23484958581601512, + "grad_norm": 0.1096733883023262, + "learning_rate": 0.0002728819957808252, + "loss": 0.40627193450927734, + "memory(GiB)": 78.26, + "step": 1212, + "token_acc": 0.8838417514030116, + "train_speed(iter/s)": 0.032613 + }, + { + "epoch": 0.23504335610134186, + "grad_norm": 0.10282113403081894, + "learning_rate": 0.0002728268399300786, + "loss": 0.3663085699081421, + "memory(GiB)": 78.26, + "step": 1213, + "token_acc": 0.8932632961260669, + "train_speed(iter/s)": 0.032616 + }, + { + "epoch": 0.2352371263866686, + "grad_norm": 0.11431027203798294, + "learning_rate": 0.00027277163363104845, + "loss": 0.42960235476493835, + "memory(GiB)": 78.26, + "step": 1214, + "token_acc": 0.8752838699642246, + "train_speed(iter/s)": 0.032618 + }, + { + "epoch": 0.23543089667199535, + "grad_norm": 0.11704136431217194, + "learning_rate": 0.0002727163769064094, + "loss": 0.42778557538986206, + "memory(GiB)": 78.26, + "step": 1215, + "token_acc": 0.8777720177409135, + "train_speed(iter/s)": 0.03262 + }, + { + "epoch": 0.2356246669573221, + "grad_norm": 0.10846978425979614, + "learning_rate": 0.00027266106977885674, + "loss": 0.40238186717033386, + "memory(GiB)": 78.26, + "step": 1216, + "token_acc": 0.88480611332412, + "train_speed(iter/s)": 0.032622 + }, + { + "epoch": 0.23581843724264884, + "grad_norm": 0.1148129478096962, + "learning_rate": 0.0002726057122711067, + "loss": 0.4382418096065521, + "memory(GiB)": 78.26, + "step": 1217, + "token_acc": 0.8733262608595923, + "train_speed(iter/s)": 0.032624 + }, + { + "epoch": 0.23601220752797558, + "grad_norm": 0.11113447695970535, + "learning_rate": 0.00027255030440589614, + "loss": 0.41040360927581787, + "memory(GiB)": 78.26, + "step": 1218, + "token_acc": 0.8821096468708997, + "train_speed(iter/s)": 0.032626 + }, + { + "epoch": 0.23620597781330233, + "grad_norm": 0.10677137225866318, + "learning_rate": 0.0002724948462059825, + "loss": 0.4053942859172821, + "memory(GiB)": 78.26, + "step": 1219, + "token_acc": 0.8811227442521177, + "train_speed(iter/s)": 0.032628 + }, + { + "epoch": 0.23639974809862907, + "grad_norm": 0.11822210252285004, + "learning_rate": 0.00027243933769414394, + "loss": 0.4305747449398041, + "memory(GiB)": 78.26, + "step": 1220, + "token_acc": 0.8739931621950513, + "train_speed(iter/s)": 0.03263 + }, + { + "epoch": 0.23659351838395581, + "grad_norm": 0.13333040475845337, + "learning_rate": 0.00027238377889317935, + "loss": 0.42261943221092224, + "memory(GiB)": 78.26, + "step": 1221, + "token_acc": 0.8800861924217832, + "train_speed(iter/s)": 0.032632 + }, + { + "epoch": 0.23678728866928256, + "grad_norm": 0.10652173310518265, + "learning_rate": 0.0002723281698259081, + "loss": 0.40445369482040405, + "memory(GiB)": 78.26, + "step": 1222, + "token_acc": 0.8841292322492718, + "train_speed(iter/s)": 0.032634 + }, + { + "epoch": 0.2369810589546093, + "grad_norm": 0.10241284221410751, + "learning_rate": 0.0002722725105151705, + "loss": 0.3897974491119385, + "memory(GiB)": 78.26, + "step": 1223, + "token_acc": 0.887528428624319, + "train_speed(iter/s)": 0.032636 + }, + { + "epoch": 0.23717482923993605, + "grad_norm": 0.11149775236845016, + "learning_rate": 0.00027221680098382726, + "loss": 0.4146060049533844, + "memory(GiB)": 78.26, + "step": 1224, + "token_acc": 0.8800704465401659, + "train_speed(iter/s)": 0.032638 + }, + { + "epoch": 0.2373685995252628, + "grad_norm": 0.11865018308162689, + "learning_rate": 0.00027216104125475974, + "loss": 0.390518456697464, + "memory(GiB)": 78.26, + "step": 1225, + "token_acc": 0.8875045692701352, + "train_speed(iter/s)": 0.032641 + }, + { + "epoch": 0.23756236981058954, + "grad_norm": 0.10814682394266129, + "learning_rate": 0.00027210523135086996, + "loss": 0.39619240164756775, + "memory(GiB)": 78.26, + "step": 1226, + "token_acc": 0.8837669838825017, + "train_speed(iter/s)": 0.032643 + }, + { + "epoch": 0.23775614009591628, + "grad_norm": 0.10125034302473068, + "learning_rate": 0.0002720493712950805, + "loss": 0.3593364953994751, + "memory(GiB)": 78.26, + "step": 1227, + "token_acc": 0.8945650484430858, + "train_speed(iter/s)": 0.032644 + }, + { + "epoch": 0.23794991038124302, + "grad_norm": 0.11461735516786575, + "learning_rate": 0.0002719934611103348, + "loss": 0.41169360280036926, + "memory(GiB)": 78.26, + "step": 1228, + "token_acc": 0.8817771660166704, + "train_speed(iter/s)": 0.032647 + }, + { + "epoch": 0.23814368066656977, + "grad_norm": 0.1014741063117981, + "learning_rate": 0.00027193750081959644, + "loss": 0.39811670780181885, + "memory(GiB)": 78.26, + "step": 1229, + "token_acc": 0.883717022349185, + "train_speed(iter/s)": 0.032649 + }, + { + "epoch": 0.23833745095189654, + "grad_norm": 0.10858450084924698, + "learning_rate": 0.00027188149044584997, + "loss": 0.42627233266830444, + "memory(GiB)": 78.26, + "step": 1230, + "token_acc": 0.8777344179399802, + "train_speed(iter/s)": 0.032651 + }, + { + "epoch": 0.23853122123722328, + "grad_norm": 0.10094203799962997, + "learning_rate": 0.0002718254300121002, + "loss": 0.3662244379520416, + "memory(GiB)": 78.26, + "step": 1231, + "token_acc": 0.8925039872408294, + "train_speed(iter/s)": 0.032653 + }, + { + "epoch": 0.23872499152255003, + "grad_norm": 0.10092142224311829, + "learning_rate": 0.0002717693195413728, + "loss": 0.3618488907814026, + "memory(GiB)": 78.26, + "step": 1232, + "token_acc": 0.8951329653788259, + "train_speed(iter/s)": 0.032655 + }, + { + "epoch": 0.23891876180787677, + "grad_norm": 0.10050500929355621, + "learning_rate": 0.0002717131590567138, + "loss": 0.37352171540260315, + "memory(GiB)": 78.26, + "step": 1233, + "token_acc": 0.8890915338461132, + "train_speed(iter/s)": 0.032657 + }, + { + "epoch": 0.23911253209320352, + "grad_norm": 0.10266852378845215, + "learning_rate": 0.0002716569485811898, + "loss": 0.38168448209762573, + "memory(GiB)": 78.26, + "step": 1234, + "token_acc": 0.8879904318660461, + "train_speed(iter/s)": 0.032659 + }, + { + "epoch": 0.23930630237853026, + "grad_norm": 0.10961468517780304, + "learning_rate": 0.00027160068813788797, + "loss": 0.423623651266098, + "memory(GiB)": 78.26, + "step": 1235, + "token_acc": 0.8771537798836959, + "train_speed(iter/s)": 0.032661 + }, + { + "epoch": 0.239500072663857, + "grad_norm": 0.10763482749462128, + "learning_rate": 0.000271544377749916, + "loss": 0.3990626335144043, + "memory(GiB)": 78.26, + "step": 1236, + "token_acc": 0.8839777513770386, + "train_speed(iter/s)": 0.032663 + }, + { + "epoch": 0.23969384294918375, + "grad_norm": 0.11288584768772125, + "learning_rate": 0.0002714880174404021, + "loss": 0.43142029643058777, + "memory(GiB)": 78.26, + "step": 1237, + "token_acc": 0.8738743873247464, + "train_speed(iter/s)": 0.032665 + }, + { + "epoch": 0.2398876132345105, + "grad_norm": 0.1059926375746727, + "learning_rate": 0.00027143160723249485, + "loss": 0.39879652857780457, + "memory(GiB)": 78.26, + "step": 1238, + "token_acc": 0.8833674819098997, + "train_speed(iter/s)": 0.032667 + }, + { + "epoch": 0.24008138351983724, + "grad_norm": 0.10575394332408905, + "learning_rate": 0.00027137514714936357, + "loss": 0.3970308303833008, + "memory(GiB)": 78.26, + "step": 1239, + "token_acc": 0.8835697867955933, + "train_speed(iter/s)": 0.032668 + }, + { + "epoch": 0.24027515380516398, + "grad_norm": 0.11206567287445068, + "learning_rate": 0.00027131863721419785, + "loss": 0.4004877507686615, + "memory(GiB)": 78.26, + "step": 1240, + "token_acc": 0.8835185939591457, + "train_speed(iter/s)": 0.032671 + }, + { + "epoch": 0.24046892409049073, + "grad_norm": 0.11048437654972076, + "learning_rate": 0.00027126207745020785, + "loss": 0.41594791412353516, + "memory(GiB)": 78.26, + "step": 1241, + "token_acc": 0.8785890073831009, + "train_speed(iter/s)": 0.032672 + }, + { + "epoch": 0.24066269437581747, + "grad_norm": 0.11345624923706055, + "learning_rate": 0.0002712054678806242, + "loss": 0.4346695840358734, + "memory(GiB)": 78.26, + "step": 1242, + "token_acc": 0.8738188213551137, + "train_speed(iter/s)": 0.032674 + }, + { + "epoch": 0.24085646466114422, + "grad_norm": 0.11145740747451782, + "learning_rate": 0.00027114880852869807, + "loss": 0.4504337012767792, + "memory(GiB)": 78.26, + "step": 1243, + "token_acc": 0.8700323658937762, + "train_speed(iter/s)": 0.032676 + }, + { + "epoch": 0.24105023494647096, + "grad_norm": 0.11080954968929291, + "learning_rate": 0.0002710920994177008, + "loss": 0.43637946248054504, + "memory(GiB)": 78.26, + "step": 1244, + "token_acc": 0.8750756533700138, + "train_speed(iter/s)": 0.032678 + }, + { + "epoch": 0.2412440052317977, + "grad_norm": 0.10086048394441605, + "learning_rate": 0.00027103534057092447, + "loss": 0.3988358974456787, + "memory(GiB)": 78.26, + "step": 1245, + "token_acc": 0.8830572217461677, + "train_speed(iter/s)": 0.03268 + }, + { + "epoch": 0.24143777551712445, + "grad_norm": 0.11646781861782074, + "learning_rate": 0.0002709785320116814, + "loss": 0.44504404067993164, + "memory(GiB)": 78.26, + "step": 1246, + "token_acc": 0.8725327939193331, + "train_speed(iter/s)": 0.032682 + }, + { + "epoch": 0.2416315458024512, + "grad_norm": 0.11032546311616898, + "learning_rate": 0.0002709216737633044, + "loss": 0.44403916597366333, + "memory(GiB)": 78.26, + "step": 1247, + "token_acc": 0.872105901587469, + "train_speed(iter/s)": 0.032684 + }, + { + "epoch": 0.24182531608777794, + "grad_norm": 0.09166669845581055, + "learning_rate": 0.0002708647658491467, + "loss": 0.3495952785015106, + "memory(GiB)": 78.26, + "step": 1248, + "token_acc": 0.8977107887579329, + "train_speed(iter/s)": 0.032685 + }, + { + "epoch": 0.24201908637310468, + "grad_norm": 0.10630014538764954, + "learning_rate": 0.0002708078082925819, + "loss": 0.4074488878250122, + "memory(GiB)": 78.26, + "step": 1249, + "token_acc": 0.8815855361990353, + "train_speed(iter/s)": 0.032687 + }, + { + "epoch": 0.24221285665843142, + "grad_norm": 0.09900429099798203, + "learning_rate": 0.000270750801117004, + "loss": 0.39275825023651123, + "memory(GiB)": 78.26, + "step": 1250, + "token_acc": 0.8847475750400977, + "train_speed(iter/s)": 0.032689 + }, + { + "epoch": 0.24240662694375817, + "grad_norm": 0.10974457859992981, + "learning_rate": 0.0002706937443458274, + "loss": 0.4224117398262024, + "memory(GiB)": 78.26, + "step": 1251, + "token_acc": 0.8794166810767191, + "train_speed(iter/s)": 0.032691 + }, + { + "epoch": 0.2426003972290849, + "grad_norm": 0.10894999653100967, + "learning_rate": 0.0002706366380024868, + "loss": 0.37922725081443787, + "memory(GiB)": 78.26, + "step": 1252, + "token_acc": 0.8908649728803882, + "train_speed(iter/s)": 0.032693 + }, + { + "epoch": 0.24279416751441166, + "grad_norm": 0.12082328647375107, + "learning_rate": 0.00027057948211043736, + "loss": 0.43291229009628296, + "memory(GiB)": 78.26, + "step": 1253, + "token_acc": 0.8723226076593552, + "train_speed(iter/s)": 0.032695 + }, + { + "epoch": 0.2429879377997384, + "grad_norm": 0.10953840613365173, + "learning_rate": 0.00027052227669315454, + "loss": 0.38156622648239136, + "memory(GiB)": 78.26, + "step": 1254, + "token_acc": 0.8904028611119321, + "train_speed(iter/s)": 0.032697 + }, + { + "epoch": 0.24318170808506515, + "grad_norm": 0.11070944368839264, + "learning_rate": 0.00027046502177413415, + "loss": 0.4315045475959778, + "memory(GiB)": 78.26, + "step": 1255, + "token_acc": 0.8770893778724934, + "train_speed(iter/s)": 0.032699 + }, + { + "epoch": 0.2433754783703919, + "grad_norm": 0.10312207043170929, + "learning_rate": 0.0002704077173768922, + "loss": 0.38498249650001526, + "memory(GiB)": 78.26, + "step": 1256, + "token_acc": 0.8877153677921801, + "train_speed(iter/s)": 0.032701 + }, + { + "epoch": 0.24356924865571863, + "grad_norm": 0.11712785810232162, + "learning_rate": 0.0002703503635249653, + "loss": 0.4142095446586609, + "memory(GiB)": 78.26, + "step": 1257, + "token_acc": 0.8779902972174204, + "train_speed(iter/s)": 0.032703 + }, + { + "epoch": 0.24376301894104538, + "grad_norm": 0.11441401392221451, + "learning_rate": 0.0002702929602419102, + "loss": 0.40991705656051636, + "memory(GiB)": 78.26, + "step": 1258, + "token_acc": 0.8792493116817441, + "train_speed(iter/s)": 0.032705 + }, + { + "epoch": 0.24395678922637212, + "grad_norm": 0.10743826627731323, + "learning_rate": 0.0002702355075513039, + "loss": 0.3729992210865021, + "memory(GiB)": 78.26, + "step": 1259, + "token_acc": 0.8912204989885367, + "train_speed(iter/s)": 0.032707 + }, + { + "epoch": 0.2441505595116989, + "grad_norm": 0.12141770869493484, + "learning_rate": 0.0002701780054767438, + "loss": 0.456862211227417, + "memory(GiB)": 78.26, + "step": 1260, + "token_acc": 0.868301950047494, + "train_speed(iter/s)": 0.032708 + }, + { + "epoch": 0.24434432979702564, + "grad_norm": 0.11354810744524002, + "learning_rate": 0.0002701204540418475, + "loss": 0.43698757886886597, + "memory(GiB)": 78.26, + "step": 1261, + "token_acc": 0.875366250678242, + "train_speed(iter/s)": 0.03271 + }, + { + "epoch": 0.24453810008235238, + "grad_norm": 0.10827391594648361, + "learning_rate": 0.000270062853270253, + "loss": 0.39581573009490967, + "memory(GiB)": 78.26, + "step": 1262, + "token_acc": 0.8848191899087549, + "train_speed(iter/s)": 0.032712 + }, + { + "epoch": 0.24473187036767913, + "grad_norm": 0.10404475033283234, + "learning_rate": 0.0002700052031856184, + "loss": 0.3672696352005005, + "memory(GiB)": 78.26, + "step": 1263, + "token_acc": 0.8917072443605286, + "train_speed(iter/s)": 0.032714 + }, + { + "epoch": 0.24492564065300587, + "grad_norm": 0.12273856997489929, + "learning_rate": 0.00026994750381162223, + "loss": 0.4346576929092407, + "memory(GiB)": 78.26, + "step": 1264, + "token_acc": 0.8755994537428144, + "train_speed(iter/s)": 0.032716 + }, + { + "epoch": 0.24511941093833262, + "grad_norm": 0.11384225636720657, + "learning_rate": 0.00026988975517196315, + "loss": 0.4470018148422241, + "memory(GiB)": 78.26, + "step": 1265, + "token_acc": 0.8724487077005063, + "train_speed(iter/s)": 0.032718 + }, + { + "epoch": 0.24531318122365936, + "grad_norm": 0.11222903430461884, + "learning_rate": 0.00026983195729036004, + "loss": 0.4389076232910156, + "memory(GiB)": 78.26, + "step": 1266, + "token_acc": 0.8733030464991983, + "train_speed(iter/s)": 0.032719 + }, + { + "epoch": 0.2455069515089861, + "grad_norm": 0.09804686903953552, + "learning_rate": 0.00026977411019055207, + "loss": 0.34668290615081787, + "memory(GiB)": 78.26, + "step": 1267, + "token_acc": 0.8978196899835492, + "train_speed(iter/s)": 0.032721 + }, + { + "epoch": 0.24570072179431285, + "grad_norm": 0.10429113358259201, + "learning_rate": 0.00026971621389629855, + "loss": 0.4104643762111664, + "memory(GiB)": 78.26, + "step": 1268, + "token_acc": 0.8796510205745658, + "train_speed(iter/s)": 0.032723 + }, + { + "epoch": 0.2458944920796396, + "grad_norm": 0.10256537050008774, + "learning_rate": 0.0002696582684313791, + "loss": 0.37179747223854065, + "memory(GiB)": 78.26, + "step": 1269, + "token_acc": 0.8922330825188111, + "train_speed(iter/s)": 0.032724 + }, + { + "epoch": 0.24608826236496634, + "grad_norm": 0.11051318049430847, + "learning_rate": 0.0002696002738195935, + "loss": 0.42477959394454956, + "memory(GiB)": 78.26, + "step": 1270, + "token_acc": 0.8768773913540061, + "train_speed(iter/s)": 0.032726 + }, + { + "epoch": 0.24628203265029308, + "grad_norm": 0.10435964167118073, + "learning_rate": 0.00026954223008476163, + "loss": 0.3826453983783722, + "memory(GiB)": 78.26, + "step": 1271, + "token_acc": 0.8870310249713184, + "train_speed(iter/s)": 0.032728 + }, + { + "epoch": 0.24647580293561983, + "grad_norm": 0.11433306336402893, + "learning_rate": 0.0002694841372507236, + "loss": 0.4138341248035431, + "memory(GiB)": 78.26, + "step": 1272, + "token_acc": 0.8797960325850382, + "train_speed(iter/s)": 0.03273 + }, + { + "epoch": 0.24666957322094657, + "grad_norm": 0.11614301800727844, + "learning_rate": 0.00026942599534133984, + "loss": 0.44977250695228577, + "memory(GiB)": 78.26, + "step": 1273, + "token_acc": 0.8709602418042316, + "train_speed(iter/s)": 0.032732 + }, + { + "epoch": 0.24686334350627331, + "grad_norm": 0.18777108192443848, + "learning_rate": 0.0002693678043804906, + "loss": 0.3745490312576294, + "memory(GiB)": 78.26, + "step": 1274, + "token_acc": 0.8912466843501327, + "train_speed(iter/s)": 0.032734 + }, + { + "epoch": 0.24705711379160006, + "grad_norm": 0.10248465836048126, + "learning_rate": 0.0002693095643920766, + "loss": 0.37728846073150635, + "memory(GiB)": 78.26, + "step": 1275, + "token_acc": 0.8897580936334095, + "train_speed(iter/s)": 0.032736 + }, + { + "epoch": 0.2472508840769268, + "grad_norm": 0.17319722473621368, + "learning_rate": 0.0002692512754000185, + "loss": 0.42109817266464233, + "memory(GiB)": 78.26, + "step": 1276, + "token_acc": 0.8798434724156143, + "train_speed(iter/s)": 0.032737 + }, + { + "epoch": 0.24744465436225355, + "grad_norm": 0.1109052449464798, + "learning_rate": 0.0002691929374282572, + "loss": 0.3932892084121704, + "memory(GiB)": 78.26, + "step": 1277, + "token_acc": 0.886698000389354, + "train_speed(iter/s)": 0.032739 + }, + { + "epoch": 0.2476384246475803, + "grad_norm": 0.10127067565917969, + "learning_rate": 0.00026913455050075374, + "loss": 0.3878341615200043, + "memory(GiB)": 78.26, + "step": 1278, + "token_acc": 0.8887858173572459, + "train_speed(iter/s)": 0.032741 + }, + { + "epoch": 0.24783219493290704, + "grad_norm": 0.11656392365694046, + "learning_rate": 0.00026907611464148905, + "loss": 0.4242454171180725, + "memory(GiB)": 78.26, + "step": 1279, + "token_acc": 0.8786354490579843, + "train_speed(iter/s)": 0.032743 + }, + { + "epoch": 0.24802596521823378, + "grad_norm": 0.10642395168542862, + "learning_rate": 0.00026901762987446436, + "loss": 0.42408353090286255, + "memory(GiB)": 78.26, + "step": 1280, + "token_acc": 0.8807520778430975, + "train_speed(iter/s)": 0.032745 + }, + { + "epoch": 0.24821973550356052, + "grad_norm": 0.11042629927396774, + "learning_rate": 0.000268959096223701, + "loss": 0.3927229344844818, + "memory(GiB)": 78.26, + "step": 1281, + "token_acc": 0.8880031570639305, + "train_speed(iter/s)": 0.032747 + }, + { + "epoch": 0.24841350578888727, + "grad_norm": 0.09721720218658447, + "learning_rate": 0.0002689005137132402, + "loss": 0.3534315526485443, + "memory(GiB)": 78.26, + "step": 1282, + "token_acc": 0.8957349486957596, + "train_speed(iter/s)": 0.032748 + }, + { + "epoch": 0.248607276074214, + "grad_norm": 0.10490331053733826, + "learning_rate": 0.0002688418823671435, + "loss": 0.39838045835494995, + "memory(GiB)": 78.26, + "step": 1283, + "token_acc": 0.8837405682220588, + "train_speed(iter/s)": 0.03275 + }, + { + "epoch": 0.24880104635954076, + "grad_norm": 0.11232082545757294, + "learning_rate": 0.0002687832022094923, + "loss": 0.4016090929508209, + "memory(GiB)": 78.26, + "step": 1284, + "token_acc": 0.8843071140346028, + "train_speed(iter/s)": 0.032752 + }, + { + "epoch": 0.2489948166448675, + "grad_norm": 0.09907688200473785, + "learning_rate": 0.0002687244732643881, + "loss": 0.3738039433956146, + "memory(GiB)": 78.26, + "step": 1285, + "token_acc": 0.8901385820445873, + "train_speed(iter/s)": 0.032754 + }, + { + "epoch": 0.24918858693019424, + "grad_norm": 0.11846373975276947, + "learning_rate": 0.0002686656955559525, + "loss": 0.45066556334495544, + "memory(GiB)": 78.26, + "step": 1286, + "token_acc": 0.8704005115386851, + "train_speed(iter/s)": 0.032756 + }, + { + "epoch": 0.249382357215521, + "grad_norm": 0.10774929076433182, + "learning_rate": 0.00026860686910832704, + "loss": 0.40347960591316223, + "memory(GiB)": 78.26, + "step": 1287, + "token_acc": 0.8835657036827049, + "train_speed(iter/s)": 0.032758 + }, + { + "epoch": 0.24957612750084773, + "grad_norm": 0.10784945636987686, + "learning_rate": 0.0002685479939456734, + "loss": 0.41915833950042725, + "memory(GiB)": 78.26, + "step": 1288, + "token_acc": 0.8762773629622079, + "train_speed(iter/s)": 0.032759 + }, + { + "epoch": 0.24976989778617448, + "grad_norm": 0.10360375046730042, + "learning_rate": 0.000268489070092173, + "loss": 0.3886149823665619, + "memory(GiB)": 78.26, + "step": 1289, + "token_acc": 0.8865310852948481, + "train_speed(iter/s)": 0.032761 + }, + { + "epoch": 0.24996366807150122, + "grad_norm": 0.10862737894058228, + "learning_rate": 0.00026843009757202777, + "loss": 0.4151816666126251, + "memory(GiB)": 78.26, + "step": 1290, + "token_acc": 0.8797664608766098, + "train_speed(iter/s)": 0.032763 + }, + { + "epoch": 0.25015743835682797, + "grad_norm": 0.11150000244379044, + "learning_rate": 0.00026837107640945905, + "loss": 0.433391273021698, + "memory(GiB)": 78.26, + "step": 1291, + "token_acc": 0.8773802907537953, + "train_speed(iter/s)": 0.032765 + }, + { + "epoch": 0.2503512086421547, + "grad_norm": 0.10859289765357971, + "learning_rate": 0.0002683120066287085, + "loss": 0.41375094652175903, + "memory(GiB)": 78.26, + "step": 1292, + "token_acc": 0.8798327262916498, + "train_speed(iter/s)": 0.032767 + }, + { + "epoch": 0.25054497892748145, + "grad_norm": 0.11398376524448395, + "learning_rate": 0.0002682528882540376, + "loss": 0.4081695079803467, + "memory(GiB)": 78.26, + "step": 1293, + "token_acc": 0.8846480067854113, + "train_speed(iter/s)": 0.032769 + }, + { + "epoch": 0.2507387492128082, + "grad_norm": 0.10699091851711273, + "learning_rate": 0.000268193721309728, + "loss": 0.40067243576049805, + "memory(GiB)": 78.26, + "step": 1294, + "token_acc": 0.8816360201176189, + "train_speed(iter/s)": 0.03277 + }, + { + "epoch": 0.25093251949813494, + "grad_norm": 0.10412931442260742, + "learning_rate": 0.00026813450582008103, + "loss": 0.40070998668670654, + "memory(GiB)": 78.26, + "step": 1295, + "token_acc": 0.8844565031409646, + "train_speed(iter/s)": 0.032772 + }, + { + "epoch": 0.2511262897834617, + "grad_norm": 0.11724124103784561, + "learning_rate": 0.00026807524180941814, + "loss": 0.462046355009079, + "memory(GiB)": 78.26, + "step": 1296, + "token_acc": 0.866198113456813, + "train_speed(iter/s)": 0.032774 + }, + { + "epoch": 0.25132006006878843, + "grad_norm": 0.10965701937675476, + "learning_rate": 0.0002680159293020806, + "loss": 0.4059637784957886, + "memory(GiB)": 78.26, + "step": 1297, + "token_acc": 0.8829141864372988, + "train_speed(iter/s)": 0.032776 + }, + { + "epoch": 0.2515138303541152, + "grad_norm": 0.10186024010181427, + "learning_rate": 0.0002679565683224297, + "loss": 0.3853279948234558, + "memory(GiB)": 78.26, + "step": 1298, + "token_acc": 0.8870169740948418, + "train_speed(iter/s)": 0.032778 + }, + { + "epoch": 0.2517076006394419, + "grad_norm": 0.11461776494979858, + "learning_rate": 0.00026789715889484657, + "loss": 0.39657965302467346, + "memory(GiB)": 78.26, + "step": 1299, + "token_acc": 0.8847986900967808, + "train_speed(iter/s)": 0.03278 + }, + { + "epoch": 0.25190137092476866, + "grad_norm": 0.12204183638095856, + "learning_rate": 0.0002678377010437323, + "loss": 0.4581944942474365, + "memory(GiB)": 78.26, + "step": 1300, + "token_acc": 0.8694807389051589, + "train_speed(iter/s)": 0.032781 + }, + { + "epoch": 0.2520951412100954, + "grad_norm": 0.10922391712665558, + "learning_rate": 0.00026777819479350775, + "loss": 0.39435988664627075, + "memory(GiB)": 78.26, + "step": 1301, + "token_acc": 0.8852810715217581, + "train_speed(iter/s)": 0.032783 + }, + { + "epoch": 0.25228891149542215, + "grad_norm": 0.10757733881473541, + "learning_rate": 0.00026771864016861377, + "loss": 0.38533589243888855, + "memory(GiB)": 78.26, + "step": 1302, + "token_acc": 0.8885692617484767, + "train_speed(iter/s)": 0.032785 + }, + { + "epoch": 0.2524826817807489, + "grad_norm": 0.11346130073070526, + "learning_rate": 0.0002676590371935111, + "loss": 0.4119528830051422, + "memory(GiB)": 78.26, + "step": 1303, + "token_acc": 0.8788996980878899, + "train_speed(iter/s)": 0.032787 + }, + { + "epoch": 0.25267645206607564, + "grad_norm": 0.11655943840742111, + "learning_rate": 0.0002675993858926802, + "loss": 0.4352225661277771, + "memory(GiB)": 78.26, + "step": 1304, + "token_acc": 0.8748795761078998, + "train_speed(iter/s)": 0.032788 + }, + { + "epoch": 0.2528702223514024, + "grad_norm": 0.10136115550994873, + "learning_rate": 0.00026753968629062146, + "loss": 0.39623382687568665, + "memory(GiB)": 78.26, + "step": 1305, + "token_acc": 0.882677549344216, + "train_speed(iter/s)": 0.03279 + }, + { + "epoch": 0.2530639926367292, + "grad_norm": 0.10475181043148041, + "learning_rate": 0.0002674799384118552, + "loss": 0.3807075023651123, + "memory(GiB)": 78.26, + "step": 1306, + "token_acc": 0.8890237979601748, + "train_speed(iter/s)": 0.032792 + }, + { + "epoch": 0.25325776292205593, + "grad_norm": 0.11299892514944077, + "learning_rate": 0.0002674201422809214, + "loss": 0.4387453496456146, + "memory(GiB)": 78.26, + "step": 1307, + "token_acc": 0.8713827248539375, + "train_speed(iter/s)": 0.032794 + }, + { + "epoch": 0.2534515332073827, + "grad_norm": 0.11837562918663025, + "learning_rate": 0.00026736029792238003, + "loss": 0.46790987253189087, + "memory(GiB)": 78.26, + "step": 1308, + "token_acc": 0.8648062202398891, + "train_speed(iter/s)": 0.032796 + }, + { + "epoch": 0.2536453034927094, + "grad_norm": 0.10069328546524048, + "learning_rate": 0.0002673004053608106, + "loss": 0.3791685700416565, + "memory(GiB)": 78.26, + "step": 1309, + "token_acc": 0.8905825121616565, + "train_speed(iter/s)": 0.032797 + }, + { + "epoch": 0.25383907377803616, + "grad_norm": 0.12146264314651489, + "learning_rate": 0.0002672404646208128, + "loss": 0.4696671664714813, + "memory(GiB)": 78.26, + "step": 1310, + "token_acc": 0.8655859144344838, + "train_speed(iter/s)": 0.032799 + }, + { + "epoch": 0.2540328440633629, + "grad_norm": 0.1213066577911377, + "learning_rate": 0.00026718047572700575, + "loss": 0.40812620520591736, + "memory(GiB)": 78.26, + "step": 1311, + "token_acc": 0.8800956738768719, + "train_speed(iter/s)": 0.032801 + }, + { + "epoch": 0.25422661434868965, + "grad_norm": 0.1058630719780922, + "learning_rate": 0.0002671204387040286, + "loss": 0.4007977247238159, + "memory(GiB)": 78.26, + "step": 1312, + "token_acc": 0.8834997096479827, + "train_speed(iter/s)": 0.032802 + }, + { + "epoch": 0.2544203846340164, + "grad_norm": 0.09740731865167618, + "learning_rate": 0.00026706035357654007, + "loss": 0.34724316000938416, + "memory(GiB)": 78.26, + "step": 1313, + "token_acc": 0.8995111614130565, + "train_speed(iter/s)": 0.032804 + }, + { + "epoch": 0.25461415491934314, + "grad_norm": 0.12260702252388, + "learning_rate": 0.00026700022036921884, + "loss": 0.47982773184776306, + "memory(GiB)": 78.26, + "step": 1314, + "token_acc": 0.8605628010809541, + "train_speed(iter/s)": 0.032806 + }, + { + "epoch": 0.2548079252046699, + "grad_norm": 0.09923295676708221, + "learning_rate": 0.00026694003910676315, + "loss": 0.3682483732700348, + "memory(GiB)": 78.26, + "step": 1315, + "token_acc": 0.8914272901666708, + "train_speed(iter/s)": 0.032808 + }, + { + "epoch": 0.2550016954899966, + "grad_norm": 0.11524137109518051, + "learning_rate": 0.0002668798098138911, + "loss": 0.4038434326648712, + "memory(GiB)": 78.26, + "step": 1316, + "token_acc": 0.8844845437065523, + "train_speed(iter/s)": 0.03281 + }, + { + "epoch": 0.25519546577532337, + "grad_norm": 0.11129481345415115, + "learning_rate": 0.00026681953251534053, + "loss": 0.4236079454421997, + "memory(GiB)": 78.26, + "step": 1317, + "token_acc": 0.8786745197919983, + "train_speed(iter/s)": 0.032811 + }, + { + "epoch": 0.2553892360606501, + "grad_norm": 0.10136806964874268, + "learning_rate": 0.00026675920723586886, + "loss": 0.3650326430797577, + "memory(GiB)": 78.26, + "step": 1318, + "token_acc": 0.8948705179282869, + "train_speed(iter/s)": 0.032813 + }, + { + "epoch": 0.25558300634597686, + "grad_norm": 0.10744194686412811, + "learning_rate": 0.0002666988340002533, + "loss": 0.4144882559776306, + "memory(GiB)": 78.26, + "step": 1319, + "token_acc": 0.8820147315987494, + "train_speed(iter/s)": 0.032815 + }, + { + "epoch": 0.2557767766313036, + "grad_norm": 0.10830198973417282, + "learning_rate": 0.00026663841283329086, + "loss": 0.4078242778778076, + "memory(GiB)": 78.26, + "step": 1320, + "token_acc": 0.8824059014869888, + "train_speed(iter/s)": 0.032817 + }, + { + "epoch": 0.25597054691663035, + "grad_norm": 0.09699589014053345, + "learning_rate": 0.000266577943759798, + "loss": 0.34918874502182007, + "memory(GiB)": 78.26, + "step": 1321, + "token_acc": 0.897431914673294, + "train_speed(iter/s)": 0.032818 + }, + { + "epoch": 0.2561643172019571, + "grad_norm": 0.11712899804115295, + "learning_rate": 0.00026651742680461115, + "loss": 0.42532727122306824, + "memory(GiB)": 78.26, + "step": 1322, + "token_acc": 0.8809470377019749, + "train_speed(iter/s)": 0.03282 + }, + { + "epoch": 0.25635808748728384, + "grad_norm": 0.12428336590528488, + "learning_rate": 0.0002664568619925862, + "loss": 0.44392460584640503, + "memory(GiB)": 78.26, + "step": 1323, + "token_acc": 0.8718965574699237, + "train_speed(iter/s)": 0.032822 + }, + { + "epoch": 0.2565518577726106, + "grad_norm": 0.1330966353416443, + "learning_rate": 0.00026639624934859853, + "loss": 0.46010035276412964, + "memory(GiB)": 78.26, + "step": 1324, + "token_acc": 0.86755315416462, + "train_speed(iter/s)": 0.032824 + }, + { + "epoch": 0.2567456280579373, + "grad_norm": 0.1076708659529686, + "learning_rate": 0.0002663355888975437, + "loss": 0.3726995587348938, + "memory(GiB)": 78.26, + "step": 1325, + "token_acc": 0.8889115964031298, + "train_speed(iter/s)": 0.032826 + }, + { + "epoch": 0.25693939834326407, + "grad_norm": 0.12054485827684402, + "learning_rate": 0.0002662748806643364, + "loss": 0.45460277795791626, + "memory(GiB)": 78.26, + "step": 1326, + "token_acc": 0.8696869355809753, + "train_speed(iter/s)": 0.032828 + }, + { + "epoch": 0.2571331686285908, + "grad_norm": 0.1159156784415245, + "learning_rate": 0.00026621412467391125, + "loss": 0.413094162940979, + "memory(GiB)": 78.26, + "step": 1327, + "token_acc": 0.8797178700263824, + "train_speed(iter/s)": 0.03283 + }, + { + "epoch": 0.25732693891391756, + "grad_norm": 0.10506505519151688, + "learning_rate": 0.00026615332095122223, + "loss": 0.3767690360546112, + "memory(GiB)": 78.26, + "step": 1328, + "token_acc": 0.8906741666898722, + "train_speed(iter/s)": 0.032831 + }, + { + "epoch": 0.2575207091992443, + "grad_norm": 0.10869252681732178, + "learning_rate": 0.00026609246952124323, + "loss": 0.4055717885494232, + "memory(GiB)": 78.26, + "step": 1329, + "token_acc": 0.8826902784786897, + "train_speed(iter/s)": 0.032833 + }, + { + "epoch": 0.25771447948457105, + "grad_norm": 0.10867463052272797, + "learning_rate": 0.00026603157040896736, + "loss": 0.41014426946640015, + "memory(GiB)": 78.26, + "step": 1330, + "token_acc": 0.8821821358824135, + "train_speed(iter/s)": 0.032835 + }, + { + "epoch": 0.2579082497698978, + "grad_norm": 0.11635489761829376, + "learning_rate": 0.0002659706236394077, + "loss": 0.41088828444480896, + "memory(GiB)": 78.26, + "step": 1331, + "token_acc": 0.8814593374412788, + "train_speed(iter/s)": 0.032837 + }, + { + "epoch": 0.25810202005522453, + "grad_norm": 0.11617472767829895, + "learning_rate": 0.00026590962923759664, + "loss": 0.4540368616580963, + "memory(GiB)": 78.26, + "step": 1332, + "token_acc": 0.8705997580226997, + "train_speed(iter/s)": 0.032838 + }, + { + "epoch": 0.2582957903405513, + "grad_norm": 0.10581759363412857, + "learning_rate": 0.0002658485872285863, + "loss": 0.40073099732398987, + "memory(GiB)": 78.26, + "step": 1333, + "token_acc": 0.8806860367272187, + "train_speed(iter/s)": 0.03284 + }, + { + "epoch": 0.258489560625878, + "grad_norm": 0.10285675525665283, + "learning_rate": 0.0002657874976374481, + "loss": 0.39908578991889954, + "memory(GiB)": 78.26, + "step": 1334, + "token_acc": 0.8866406011983345, + "train_speed(iter/s)": 0.032842 + }, + { + "epoch": 0.25868333091120477, + "grad_norm": 0.1177554577589035, + "learning_rate": 0.00026572636048927334, + "loss": 0.4447701573371887, + "memory(GiB)": 78.26, + "step": 1335, + "token_acc": 0.8721481689350358, + "train_speed(iter/s)": 0.032844 + }, + { + "epoch": 0.2588771011965315, + "grad_norm": 0.10632134228944778, + "learning_rate": 0.00026566517580917267, + "loss": 0.405487984418869, + "memory(GiB)": 78.26, + "step": 1336, + "token_acc": 0.881686990206509, + "train_speed(iter/s)": 0.032845 + }, + { + "epoch": 0.25907087148185826, + "grad_norm": 0.10160496830940247, + "learning_rate": 0.00026560394362227624, + "loss": 0.3819845914840698, + "memory(GiB)": 78.26, + "step": 1337, + "token_acc": 0.887969021231139, + "train_speed(iter/s)": 0.032847 + }, + { + "epoch": 0.259264641767185, + "grad_norm": 0.10578127205371857, + "learning_rate": 0.0002655426639537337, + "loss": 0.40268826484680176, + "memory(GiB)": 78.26, + "step": 1338, + "token_acc": 0.8844663787785317, + "train_speed(iter/s)": 0.032849 + }, + { + "epoch": 0.25945841205251174, + "grad_norm": 0.11469519138336182, + "learning_rate": 0.0002654813368287144, + "loss": 0.4174302816390991, + "memory(GiB)": 78.26, + "step": 1339, + "token_acc": 0.8773725181492171, + "train_speed(iter/s)": 0.03285 + }, + { + "epoch": 0.2596521823378385, + "grad_norm": 0.1754118949174881, + "learning_rate": 0.0002654199622724069, + "loss": 0.4385494291782379, + "memory(GiB)": 78.26, + "step": 1340, + "token_acc": 0.8754544961198242, + "train_speed(iter/s)": 0.032852 + }, + { + "epoch": 0.25984595262316523, + "grad_norm": 0.12439852207899094, + "learning_rate": 0.00026535854031001953, + "loss": 0.46192917227745056, + "memory(GiB)": 78.26, + "step": 1341, + "token_acc": 0.869569399467761, + "train_speed(iter/s)": 0.032854 + }, + { + "epoch": 0.260039722908492, + "grad_norm": 0.10508039593696594, + "learning_rate": 0.00026529707096677977, + "loss": 0.3968399167060852, + "memory(GiB)": 78.26, + "step": 1342, + "token_acc": 0.8842156942967024, + "train_speed(iter/s)": 0.032856 + }, + { + "epoch": 0.2602334931938187, + "grad_norm": 0.10867384821176529, + "learning_rate": 0.0002652355542679349, + "loss": 0.41717660427093506, + "memory(GiB)": 78.26, + "step": 1343, + "token_acc": 0.8766409013494039, + "train_speed(iter/s)": 0.032858 + }, + { + "epoch": 0.26042726347914547, + "grad_norm": 0.1032068282365799, + "learning_rate": 0.0002651739902387513, + "loss": 0.4130229353904724, + "memory(GiB)": 78.26, + "step": 1344, + "token_acc": 0.8800225915765693, + "train_speed(iter/s)": 0.032859 + }, + { + "epoch": 0.2606210337644722, + "grad_norm": 0.11675221472978592, + "learning_rate": 0.00026511237890451504, + "loss": 0.45590946078300476, + "memory(GiB)": 78.26, + "step": 1345, + "token_acc": 0.8688484994867459, + "train_speed(iter/s)": 0.032861 + }, + { + "epoch": 0.26081480404979895, + "grad_norm": 0.10524659603834152, + "learning_rate": 0.00026505072029053167, + "loss": 0.39327630400657654, + "memory(GiB)": 78.26, + "step": 1346, + "token_acc": 0.8866813620279599, + "train_speed(iter/s)": 0.032863 + }, + { + "epoch": 0.2610085743351257, + "grad_norm": 0.11350347101688385, + "learning_rate": 0.0002649890144221259, + "loss": 0.40386056900024414, + "memory(GiB)": 78.26, + "step": 1347, + "token_acc": 0.8825792875394559, + "train_speed(iter/s)": 0.032865 + }, + { + "epoch": 0.26120234462045244, + "grad_norm": 0.10424939543008804, + "learning_rate": 0.000264927261324642, + "loss": 0.37450239062309265, + "memory(GiB)": 78.26, + "step": 1348, + "token_acc": 0.8905393229501867, + "train_speed(iter/s)": 0.032866 + }, + { + "epoch": 0.2613961149057792, + "grad_norm": 0.11633609235286713, + "learning_rate": 0.00026486546102344374, + "loss": 0.40495017170906067, + "memory(GiB)": 78.26, + "step": 1349, + "token_acc": 0.8839027845893956, + "train_speed(iter/s)": 0.032868 + }, + { + "epoch": 0.26158988519110593, + "grad_norm": 0.12242694199085236, + "learning_rate": 0.000264803613543914, + "loss": 0.42412567138671875, + "memory(GiB)": 78.26, + "step": 1350, + "token_acc": 0.8782494429526367, + "train_speed(iter/s)": 0.03287 + }, + { + "epoch": 0.2617836554764327, + "grad_norm": 0.10657955706119537, + "learning_rate": 0.00026474171891145536, + "loss": 0.39834100008010864, + "memory(GiB)": 78.26, + "step": 1351, + "token_acc": 0.8844101811062164, + "train_speed(iter/s)": 0.032872 + }, + { + "epoch": 0.2619774257617594, + "grad_norm": 0.12258057296276093, + "learning_rate": 0.0002646797771514895, + "loss": 0.44253265857696533, + "memory(GiB)": 78.26, + "step": 1352, + "token_acc": 0.8755840383886855, + "train_speed(iter/s)": 0.032873 + }, + { + "epoch": 0.26217119604708616, + "grad_norm": 0.10096532851457596, + "learning_rate": 0.0002646177882894576, + "loss": 0.34528648853302, + "memory(GiB)": 78.26, + "step": 1353, + "token_acc": 0.8981727315320943, + "train_speed(iter/s)": 0.032875 + }, + { + "epoch": 0.2623649663324129, + "grad_norm": 0.10675135254859924, + "learning_rate": 0.0002645557523508202, + "loss": 0.3865097761154175, + "memory(GiB)": 78.26, + "step": 1354, + "token_acc": 0.8894203275279439, + "train_speed(iter/s)": 0.032877 + }, + { + "epoch": 0.26255873661773965, + "grad_norm": 0.10857495665550232, + "learning_rate": 0.00026449366936105696, + "loss": 0.4091225862503052, + "memory(GiB)": 78.26, + "step": 1355, + "token_acc": 0.8810944625407167, + "train_speed(iter/s)": 0.032878 + }, + { + "epoch": 0.2627525069030664, + "grad_norm": 0.11212481558322906, + "learning_rate": 0.0002644315393456672, + "loss": 0.4342299699783325, + "memory(GiB)": 78.26, + "step": 1356, + "token_acc": 0.8745918208676722, + "train_speed(iter/s)": 0.03288 + }, + { + "epoch": 0.26294627718839314, + "grad_norm": 0.10644607990980148, + "learning_rate": 0.00026436936233016937, + "loss": 0.3997873067855835, + "memory(GiB)": 78.26, + "step": 1357, + "token_acc": 0.8845487023221603, + "train_speed(iter/s)": 0.032882 + }, + { + "epoch": 0.2631400474737199, + "grad_norm": 0.1112193912267685, + "learning_rate": 0.0002643071383401012, + "loss": 0.3876175284385681, + "memory(GiB)": 78.26, + "step": 1358, + "token_acc": 0.8877623875481936, + "train_speed(iter/s)": 0.032883 + }, + { + "epoch": 0.26333381775904663, + "grad_norm": 0.10909376293420792, + "learning_rate": 0.00026424486740101973, + "loss": 0.3942829370498657, + "memory(GiB)": 78.26, + "step": 1359, + "token_acc": 0.886093114358897, + "train_speed(iter/s)": 0.032885 + }, + { + "epoch": 0.2635275880443734, + "grad_norm": 0.10721178352832794, + "learning_rate": 0.00026418254953850136, + "loss": 0.4055170714855194, + "memory(GiB)": 78.26, + "step": 1360, + "token_acc": 0.8812773628637651, + "train_speed(iter/s)": 0.032887 + }, + { + "epoch": 0.2637213583297001, + "grad_norm": 0.11268987506628036, + "learning_rate": 0.00026412018477814164, + "loss": 0.4035116136074066, + "memory(GiB)": 78.26, + "step": 1361, + "token_acc": 0.884084150849012, + "train_speed(iter/s)": 0.032888 + }, + { + "epoch": 0.26391512861502686, + "grad_norm": 0.0998779758810997, + "learning_rate": 0.0002640577731455556, + "loss": 0.37671032547950745, + "memory(GiB)": 78.26, + "step": 1362, + "token_acc": 0.8892367801327339, + "train_speed(iter/s)": 0.03289 + }, + { + "epoch": 0.2641088989003536, + "grad_norm": 0.109964519739151, + "learning_rate": 0.0002639953146663772, + "loss": 0.38640278577804565, + "memory(GiB)": 78.26, + "step": 1363, + "token_acc": 0.8855807365439093, + "train_speed(iter/s)": 0.032892 + }, + { + "epoch": 0.26430266918568035, + "grad_norm": 0.10154106467962265, + "learning_rate": 0.0002639328093662599, + "loss": 0.3642141819000244, + "memory(GiB)": 78.26, + "step": 1364, + "token_acc": 0.894092631464166, + "train_speed(iter/s)": 0.032893 + }, + { + "epoch": 0.2644964394710071, + "grad_norm": 0.10464335232973099, + "learning_rate": 0.00026387025727087635, + "loss": 0.3907454311847687, + "memory(GiB)": 78.26, + "step": 1365, + "token_acc": 0.8862689656913484, + "train_speed(iter/s)": 0.032895 + }, + { + "epoch": 0.26469020975633384, + "grad_norm": 0.10311836749315262, + "learning_rate": 0.00026380765840591834, + "loss": 0.3700196146965027, + "memory(GiB)": 78.26, + "step": 1366, + "token_acc": 0.8920064072753578, + "train_speed(iter/s)": 0.032896 + }, + { + "epoch": 0.26488398004166064, + "grad_norm": 0.11652354896068573, + "learning_rate": 0.00026374501279709684, + "loss": 0.4107242822647095, + "memory(GiB)": 78.26, + "step": 1367, + "token_acc": 0.8811259676284307, + "train_speed(iter/s)": 0.032898 + }, + { + "epoch": 0.2650777503269874, + "grad_norm": 0.11389324814081192, + "learning_rate": 0.0002636823204701421, + "loss": 0.4210531711578369, + "memory(GiB)": 78.26, + "step": 1368, + "token_acc": 0.8800777400635059, + "train_speed(iter/s)": 0.032899 + }, + { + "epoch": 0.2652715206123141, + "grad_norm": 0.100397489964962, + "learning_rate": 0.00026361958145080367, + "loss": 0.36525481939315796, + "memory(GiB)": 78.26, + "step": 1369, + "token_acc": 0.8945630471509752, + "train_speed(iter/s)": 0.032901 + }, + { + "epoch": 0.26546529089764087, + "grad_norm": 0.11270000785589218, + "learning_rate": 0.00026355679576485003, + "loss": 0.44845372438430786, + "memory(GiB)": 78.26, + "step": 1370, + "token_acc": 0.8699417372881356, + "train_speed(iter/s)": 0.032902 + }, + { + "epoch": 0.2656590611829676, + "grad_norm": 0.10077559947967529, + "learning_rate": 0.00026349396343806897, + "loss": 0.3875451385974884, + "memory(GiB)": 78.26, + "step": 1371, + "token_acc": 0.8859615802040374, + "train_speed(iter/s)": 0.032904 + }, + { + "epoch": 0.26585283146829436, + "grad_norm": 0.10320444405078888, + "learning_rate": 0.0002634310844962674, + "loss": 0.3602311909198761, + "memory(GiB)": 78.26, + "step": 1372, + "token_acc": 0.8927622360728608, + "train_speed(iter/s)": 0.032906 + }, + { + "epoch": 0.2660466017536211, + "grad_norm": 0.10504072159528732, + "learning_rate": 0.0002633681589652715, + "loss": 0.38470515608787537, + "memory(GiB)": 78.26, + "step": 1373, + "token_acc": 0.8860445526020218, + "train_speed(iter/s)": 0.032907 + }, + { + "epoch": 0.26624037203894785, + "grad_norm": 0.10138935595750809, + "learning_rate": 0.00026330518687092626, + "loss": 0.3703993260860443, + "memory(GiB)": 78.26, + "step": 1374, + "token_acc": 0.8889246858813908, + "train_speed(iter/s)": 0.032909 + }, + { + "epoch": 0.2664341423242746, + "grad_norm": 0.09620644897222519, + "learning_rate": 0.0002632421682390962, + "loss": 0.37472549080848694, + "memory(GiB)": 78.26, + "step": 1375, + "token_acc": 0.8922912631003824, + "train_speed(iter/s)": 0.03291 + }, + { + "epoch": 0.26662791260960134, + "grad_norm": 0.10792719572782516, + "learning_rate": 0.00026317910309566476, + "loss": 0.3973167836666107, + "memory(GiB)": 78.26, + "step": 1376, + "token_acc": 0.8853990475942648, + "train_speed(iter/s)": 0.032912 + }, + { + "epoch": 0.2668216828949281, + "grad_norm": 0.10119035840034485, + "learning_rate": 0.00026311599146653443, + "loss": 0.3766027092933655, + "memory(GiB)": 78.26, + "step": 1377, + "token_acc": 0.8914306730415594, + "train_speed(iter/s)": 0.032914 + }, + { + "epoch": 0.2670154531802548, + "grad_norm": 0.10936938971281052, + "learning_rate": 0.00026305283337762684, + "loss": 0.385044664144516, + "memory(GiB)": 78.26, + "step": 1378, + "token_acc": 0.8883062401678028, + "train_speed(iter/s)": 0.032915 + }, + { + "epoch": 0.26720922346558157, + "grad_norm": 0.10729166865348816, + "learning_rate": 0.0002629896288548827, + "loss": 0.3800238370895386, + "memory(GiB)": 78.26, + "step": 1379, + "token_acc": 0.8888697301491508, + "train_speed(iter/s)": 0.032917 + }, + { + "epoch": 0.2674029937509083, + "grad_norm": 0.11573578417301178, + "learning_rate": 0.0002629263779242619, + "loss": 0.4426755905151367, + "memory(GiB)": 78.26, + "step": 1380, + "token_acc": 0.8741497664796414, + "train_speed(iter/s)": 0.032919 + }, + { + "epoch": 0.26759676403623506, + "grad_norm": 0.09716112166643143, + "learning_rate": 0.00026286308061174315, + "loss": 0.36622488498687744, + "memory(GiB)": 78.26, + "step": 1381, + "token_acc": 0.8954976071882019, + "train_speed(iter/s)": 0.032921 + }, + { + "epoch": 0.2677905343215618, + "grad_norm": 0.11388426274061203, + "learning_rate": 0.0002627997369433246, + "loss": 0.40867888927459717, + "memory(GiB)": 78.26, + "step": 1382, + "token_acc": 0.8815565203408908, + "train_speed(iter/s)": 0.032922 + }, + { + "epoch": 0.26798430460688855, + "grad_norm": 0.10194810479879379, + "learning_rate": 0.0002627363469450229, + "loss": 0.3862283229827881, + "memory(GiB)": 78.26, + "step": 1383, + "token_acc": 0.8868069456729203, + "train_speed(iter/s)": 0.032924 + }, + { + "epoch": 0.2681780748922153, + "grad_norm": 0.11335241794586182, + "learning_rate": 0.0002626729106428742, + "loss": 0.3988998532295227, + "memory(GiB)": 78.26, + "step": 1384, + "token_acc": 0.8836738942922578, + "train_speed(iter/s)": 0.032926 + }, + { + "epoch": 0.26837184517754203, + "grad_norm": 0.10034345090389252, + "learning_rate": 0.0002626094280629335, + "loss": 0.3520752489566803, + "memory(GiB)": 78.26, + "step": 1385, + "token_acc": 0.8951549706812225, + "train_speed(iter/s)": 0.032928 + }, + { + "epoch": 0.2685656154628688, + "grad_norm": 0.10252499580383301, + "learning_rate": 0.0002625458992312747, + "loss": 0.37120020389556885, + "memory(GiB)": 78.26, + "step": 1386, + "token_acc": 0.892410527781646, + "train_speed(iter/s)": 0.032929 + }, + { + "epoch": 0.2687593857481955, + "grad_norm": 0.11847876012325287, + "learning_rate": 0.0002624823241739909, + "loss": 0.4186937212944031, + "memory(GiB)": 78.26, + "step": 1387, + "token_acc": 0.8775456306121251, + "train_speed(iter/s)": 0.032931 + }, + { + "epoch": 0.26895315603352227, + "grad_norm": 0.11340092122554779, + "learning_rate": 0.000262418702917194, + "loss": 0.41182029247283936, + "memory(GiB)": 78.26, + "step": 1388, + "token_acc": 0.8774313059586292, + "train_speed(iter/s)": 0.032932 + }, + { + "epoch": 0.269146926318849, + "grad_norm": 0.11647920310497284, + "learning_rate": 0.0002623550354870151, + "loss": 0.41684606671333313, + "memory(GiB)": 78.26, + "step": 1389, + "token_acc": 0.8813554314750806, + "train_speed(iter/s)": 0.032934 + }, + { + "epoch": 0.26934069660417576, + "grad_norm": 0.10434102267026901, + "learning_rate": 0.00026229132190960395, + "loss": 0.35181209444999695, + "memory(GiB)": 78.26, + "step": 1390, + "token_acc": 0.8950941151016876, + "train_speed(iter/s)": 0.032935 + }, + { + "epoch": 0.2695344668895025, + "grad_norm": 0.11331314593553543, + "learning_rate": 0.0002622275622111295, + "loss": 0.3922780156135559, + "memory(GiB)": 78.26, + "step": 1391, + "token_acc": 0.8852905742642418, + "train_speed(iter/s)": 0.032937 + }, + { + "epoch": 0.26972823717482924, + "grad_norm": 0.10961979627609253, + "learning_rate": 0.00026216375641777964, + "loss": 0.40969350934028625, + "memory(GiB)": 78.26, + "step": 1392, + "token_acc": 0.8799139553643452, + "train_speed(iter/s)": 0.032939 + }, + { + "epoch": 0.269922007460156, + "grad_norm": 0.10636827349662781, + "learning_rate": 0.000262099904555761, + "loss": 0.39670512080192566, + "memory(GiB)": 78.26, + "step": 1393, + "token_acc": 0.8852959931980019, + "train_speed(iter/s)": 0.03294 + }, + { + "epoch": 0.27011577774548273, + "grad_norm": 0.11483940482139587, + "learning_rate": 0.00026203600665129935, + "loss": 0.40846797823905945, + "memory(GiB)": 78.26, + "step": 1394, + "token_acc": 0.8796793825144723, + "train_speed(iter/s)": 0.032942 + }, + { + "epoch": 0.2703095480308095, + "grad_norm": 0.10763109475374222, + "learning_rate": 0.0002619720627306393, + "loss": 0.4060547351837158, + "memory(GiB)": 78.26, + "step": 1395, + "token_acc": 0.8830100404773169, + "train_speed(iter/s)": 0.032943 + }, + { + "epoch": 0.2705033183161362, + "grad_norm": 0.11100554466247559, + "learning_rate": 0.00026190807282004414, + "loss": 0.4114495813846588, + "memory(GiB)": 78.26, + "step": 1396, + "token_acc": 0.8800386130606068, + "train_speed(iter/s)": 0.032945 + }, + { + "epoch": 0.27069708860146297, + "grad_norm": 0.1212446540594101, + "learning_rate": 0.0002618440369457965, + "loss": 0.45782333612442017, + "memory(GiB)": 78.26, + "step": 1397, + "token_acc": 0.868626656635222, + "train_speed(iter/s)": 0.032946 + }, + { + "epoch": 0.2708908588867897, + "grad_norm": 0.10512025654315948, + "learning_rate": 0.0002617799551341975, + "loss": 0.40602394938468933, + "memory(GiB)": 78.26, + "step": 1398, + "token_acc": 0.8847831850941259, + "train_speed(iter/s)": 0.032948 + }, + { + "epoch": 0.27108462917211645, + "grad_norm": 0.1056622564792633, + "learning_rate": 0.00026171582741156725, + "loss": 0.3937338590621948, + "memory(GiB)": 78.26, + "step": 1399, + "token_acc": 0.8871733032741762, + "train_speed(iter/s)": 0.032949 + }, + { + "epoch": 0.2712783994574432, + "grad_norm": 0.10223392397165298, + "learning_rate": 0.0002616516538042448, + "loss": 0.4050544500350952, + "memory(GiB)": 78.26, + "step": 1400, + "token_acc": 0.883415804468436, + "train_speed(iter/s)": 0.032951 + }, + { + "epoch": 0.27147216974276994, + "grad_norm": 0.10160177946090698, + "learning_rate": 0.0002615874343385879, + "loss": 0.38988304138183594, + "memory(GiB)": 78.26, + "step": 1401, + "token_acc": 0.8859642291617994, + "train_speed(iter/s)": 0.032943 + }, + { + "epoch": 0.2716659400280967, + "grad_norm": 0.12478124350309372, + "learning_rate": 0.00026152316904097327, + "loss": 0.4281242787837982, + "memory(GiB)": 78.26, + "step": 1402, + "token_acc": 0.875607687959299, + "train_speed(iter/s)": 0.032945 + }, + { + "epoch": 0.27185971031342343, + "grad_norm": 0.10745180398225784, + "learning_rate": 0.00026145885793779633, + "loss": 0.3870825171470642, + "memory(GiB)": 78.26, + "step": 1403, + "token_acc": 0.8886628235560077, + "train_speed(iter/s)": 0.032946 + }, + { + "epoch": 0.2720534805987502, + "grad_norm": 0.10077864676713943, + "learning_rate": 0.0002613945010554715, + "loss": 0.37229087948799133, + "memory(GiB)": 78.26, + "step": 1404, + "token_acc": 0.8895456714202407, + "train_speed(iter/s)": 0.032948 + }, + { + "epoch": 0.2722472508840769, + "grad_norm": 0.1082373782992363, + "learning_rate": 0.00026133009842043174, + "loss": 0.418215274810791, + "memory(GiB)": 78.26, + "step": 1405, + "token_acc": 0.8777365051766379, + "train_speed(iter/s)": 0.03295 + }, + { + "epoch": 0.27244102116940366, + "grad_norm": 0.1040397435426712, + "learning_rate": 0.00026126565005912903, + "loss": 0.397979736328125, + "memory(GiB)": 78.26, + "step": 1406, + "token_acc": 0.8846828451006571, + "train_speed(iter/s)": 0.032951 + }, + { + "epoch": 0.2726347914547304, + "grad_norm": 0.10922178626060486, + "learning_rate": 0.000261201155998034, + "loss": 0.41443806886672974, + "memory(GiB)": 78.26, + "step": 1407, + "token_acc": 0.8808320476671809, + "train_speed(iter/s)": 0.032953 + }, + { + "epoch": 0.27282856174005715, + "grad_norm": 0.11000542342662811, + "learning_rate": 0.0002611366162636361, + "loss": 0.4330993890762329, + "memory(GiB)": 78.26, + "step": 1408, + "token_acc": 0.8738227795410791, + "train_speed(iter/s)": 0.032954 + }, + { + "epoch": 0.2730223320253839, + "grad_norm": 0.1046384945511818, + "learning_rate": 0.00026107203088244357, + "loss": 0.37256211042404175, + "memory(GiB)": 78.26, + "step": 1409, + "token_acc": 0.8898919399606564, + "train_speed(iter/s)": 0.032956 + }, + { + "epoch": 0.27321610231071064, + "grad_norm": 0.10583927482366562, + "learning_rate": 0.0002610073998809833, + "loss": 0.38221731781959534, + "memory(GiB)": 78.26, + "step": 1410, + "token_acc": 0.8893005757282106, + "train_speed(iter/s)": 0.032958 + }, + { + "epoch": 0.2734098725960374, + "grad_norm": 0.11508861929178238, + "learning_rate": 0.0002609427232858011, + "loss": 0.4040870666503906, + "memory(GiB)": 78.26, + "step": 1411, + "token_acc": 0.8844921965991148, + "train_speed(iter/s)": 0.032959 + }, + { + "epoch": 0.27360364288136413, + "grad_norm": 0.10771467536687851, + "learning_rate": 0.0002608780011234612, + "loss": 0.3735302686691284, + "memory(GiB)": 78.26, + "step": 1412, + "token_acc": 0.8907560765756076, + "train_speed(iter/s)": 0.032961 + }, + { + "epoch": 0.2737974131666909, + "grad_norm": 0.11977894604206085, + "learning_rate": 0.0002608132334205469, + "loss": 0.43817439675331116, + "memory(GiB)": 78.26, + "step": 1413, + "token_acc": 0.8712272477356252, + "train_speed(iter/s)": 0.032963 + }, + { + "epoch": 0.2739911834520176, + "grad_norm": 0.11429792642593384, + "learning_rate": 0.00026074842020365994, + "loss": 0.41938716173171997, + "memory(GiB)": 78.26, + "step": 1414, + "token_acc": 0.8812332907125421, + "train_speed(iter/s)": 0.032964 + }, + { + "epoch": 0.27418495373734436, + "grad_norm": 0.12319884449243546, + "learning_rate": 0.00026068356149942085, + "loss": 0.4341338574886322, + "memory(GiB)": 78.26, + "step": 1415, + "token_acc": 0.8744184650478735, + "train_speed(iter/s)": 0.032966 + }, + { + "epoch": 0.2743787240226711, + "grad_norm": 0.10541116446256638, + "learning_rate": 0.00026061865733446887, + "loss": 0.39338019490242004, + "memory(GiB)": 78.26, + "step": 1416, + "token_acc": 0.8870288248337029, + "train_speed(iter/s)": 0.032967 + }, + { + "epoch": 0.27457249430799785, + "grad_norm": 0.10803982615470886, + "learning_rate": 0.00026055370773546193, + "loss": 0.3971567749977112, + "memory(GiB)": 78.26, + "step": 1417, + "token_acc": 0.8830211291457155, + "train_speed(iter/s)": 0.032969 + }, + { + "epoch": 0.2747662645933246, + "grad_norm": 0.11194406449794769, + "learning_rate": 0.00026048871272907657, + "loss": 0.386283278465271, + "memory(GiB)": 78.26, + "step": 1418, + "token_acc": 0.8865788499180024, + "train_speed(iter/s)": 0.032971 + }, + { + "epoch": 0.27496003487865134, + "grad_norm": 0.09532847255468369, + "learning_rate": 0.00026042367234200783, + "loss": 0.365975558757782, + "memory(GiB)": 78.26, + "step": 1419, + "token_acc": 0.8936005233952846, + "train_speed(iter/s)": 0.032972 + }, + { + "epoch": 0.2751538051639781, + "grad_norm": 0.10494247823953629, + "learning_rate": 0.0002603585866009697, + "loss": 0.4001488983631134, + "memory(GiB)": 78.26, + "step": 1420, + "token_acc": 0.8836253515033528, + "train_speed(iter/s)": 0.032974 + }, + { + "epoch": 0.2753475754493048, + "grad_norm": 0.11904102563858032, + "learning_rate": 0.00026029345553269466, + "loss": 0.4139590263366699, + "memory(GiB)": 78.26, + "step": 1421, + "token_acc": 0.881683852450303, + "train_speed(iter/s)": 0.032975 + }, + { + "epoch": 0.27554134573463157, + "grad_norm": 0.09683738648891449, + "learning_rate": 0.00026022827916393366, + "loss": 0.3371363878250122, + "memory(GiB)": 78.26, + "step": 1422, + "token_acc": 0.8991735972207601, + "train_speed(iter/s)": 0.032977 + }, + { + "epoch": 0.2757351160199583, + "grad_norm": 0.09993797540664673, + "learning_rate": 0.0002601630575214565, + "loss": 0.3939957916736603, + "memory(GiB)": 78.26, + "step": 1423, + "token_acc": 0.8834102564102564, + "train_speed(iter/s)": 0.032978 + }, + { + "epoch": 0.27592888630528506, + "grad_norm": 0.11129175126552582, + "learning_rate": 0.0002600977906320514, + "loss": 0.41668498516082764, + "memory(GiB)": 78.26, + "step": 1424, + "token_acc": 0.8775945117501095, + "train_speed(iter/s)": 0.03298 + }, + { + "epoch": 0.2761226565906118, + "grad_norm": 0.10565146058797836, + "learning_rate": 0.00026003247852252525, + "loss": 0.37214991450309753, + "memory(GiB)": 78.26, + "step": 1425, + "token_acc": 0.8892408742152987, + "train_speed(iter/s)": 0.032981 + }, + { + "epoch": 0.27631642687593855, + "grad_norm": 0.11600112915039062, + "learning_rate": 0.0002599671212197035, + "loss": 0.42271527647972107, + "memory(GiB)": 78.26, + "step": 1426, + "token_acc": 0.8791572967215056, + "train_speed(iter/s)": 0.032983 + }, + { + "epoch": 0.27651019716126535, + "grad_norm": 0.1128147765994072, + "learning_rate": 0.0002599017187504301, + "loss": 0.4270755648612976, + "memory(GiB)": 78.26, + "step": 1427, + "token_acc": 0.875413155465563, + "train_speed(iter/s)": 0.032985 + }, + { + "epoch": 0.2767039674465921, + "grad_norm": 0.09815947711467743, + "learning_rate": 0.0002598362711415677, + "loss": 0.35846811532974243, + "memory(GiB)": 78.26, + "step": 1428, + "token_acc": 0.8941823402029246, + "train_speed(iter/s)": 0.032986 + }, + { + "epoch": 0.27689773773191884, + "grad_norm": 0.11663207411766052, + "learning_rate": 0.0002597707784199973, + "loss": 0.41208210587501526, + "memory(GiB)": 78.26, + "step": 1429, + "token_acc": 0.8810604466620577, + "train_speed(iter/s)": 0.032988 + }, + { + "epoch": 0.2770915080172456, + "grad_norm": 0.10307564586400986, + "learning_rate": 0.0002597052406126185, + "loss": 0.37274155020713806, + "memory(GiB)": 78.26, + "step": 1430, + "token_acc": 0.8910128388017119, + "train_speed(iter/s)": 0.032989 + }, + { + "epoch": 0.2772852783025723, + "grad_norm": 0.11798793822526932, + "learning_rate": 0.0002596396577463495, + "loss": 0.4077189266681671, + "memory(GiB)": 78.26, + "step": 1431, + "token_acc": 0.8813992194674013, + "train_speed(iter/s)": 0.032991 + }, + { + "epoch": 0.27747904858789907, + "grad_norm": 0.11641109734773636, + "learning_rate": 0.00025957402984812695, + "loss": 0.40781551599502563, + "memory(GiB)": 78.26, + "step": 1432, + "token_acc": 0.8810986109516703, + "train_speed(iter/s)": 0.032992 + }, + { + "epoch": 0.2776728188732258, + "grad_norm": 0.11191576719284058, + "learning_rate": 0.000259508356944906, + "loss": 0.4027900695800781, + "memory(GiB)": 78.26, + "step": 1433, + "token_acc": 0.8829726161088467, + "train_speed(iter/s)": 0.032994 + }, + { + "epoch": 0.27786658915855256, + "grad_norm": 0.11352302134037018, + "learning_rate": 0.0002594426390636602, + "loss": 0.423457533121109, + "memory(GiB)": 78.26, + "step": 1434, + "token_acc": 0.8779633306084316, + "train_speed(iter/s)": 0.032996 + }, + { + "epoch": 0.2780603594438793, + "grad_norm": 0.10580118000507355, + "learning_rate": 0.00025937687623138174, + "loss": 0.41249287128448486, + "memory(GiB)": 78.26, + "step": 1435, + "token_acc": 0.8809974249829208, + "train_speed(iter/s)": 0.032997 + }, + { + "epoch": 0.27825412972920605, + "grad_norm": 0.10057955980300903, + "learning_rate": 0.00025931106847508115, + "loss": 0.37016963958740234, + "memory(GiB)": 78.26, + "step": 1436, + "token_acc": 0.8921429696110849, + "train_speed(iter/s)": 0.032999 + }, + { + "epoch": 0.2784479000145328, + "grad_norm": 0.11049242317676544, + "learning_rate": 0.0002592452158217873, + "loss": 0.39204567670822144, + "memory(GiB)": 78.26, + "step": 1437, + "token_acc": 0.8855197058366565, + "train_speed(iter/s)": 0.033 + }, + { + "epoch": 0.27864167029985953, + "grad_norm": 0.1143956407904625, + "learning_rate": 0.00025917931829854795, + "loss": 0.41014882922172546, + "memory(GiB)": 78.26, + "step": 1438, + "token_acc": 0.8808566548416765, + "train_speed(iter/s)": 0.033002 + }, + { + "epoch": 0.2788354405851863, + "grad_norm": 0.10722561925649643, + "learning_rate": 0.00025911337593242874, + "loss": 0.4017128646373749, + "memory(GiB)": 78.26, + "step": 1439, + "token_acc": 0.8847192571825407, + "train_speed(iter/s)": 0.033004 + }, + { + "epoch": 0.279029210870513, + "grad_norm": 0.10540410131216049, + "learning_rate": 0.0002590473887505141, + "loss": 0.3711482286453247, + "memory(GiB)": 78.26, + "step": 1440, + "token_acc": 0.8929875288413797, + "train_speed(iter/s)": 0.033005 + }, + { + "epoch": 0.27922298115583977, + "grad_norm": 0.10699062794446945, + "learning_rate": 0.0002589813567799066, + "loss": 0.37242835760116577, + "memory(GiB)": 78.26, + "step": 1441, + "token_acc": 0.8892943052514933, + "train_speed(iter/s)": 0.033007 + }, + { + "epoch": 0.2794167514411665, + "grad_norm": 0.11198758333921432, + "learning_rate": 0.0002589152800477275, + "loss": 0.38732579350471497, + "memory(GiB)": 78.26, + "step": 1442, + "token_acc": 0.8864687236455637, + "train_speed(iter/s)": 0.033008 + }, + { + "epoch": 0.27961052172649326, + "grad_norm": 0.1112254187464714, + "learning_rate": 0.00025884915858111614, + "loss": 0.41368624567985535, + "memory(GiB)": 78.26, + "step": 1443, + "token_acc": 0.8818168540966846, + "train_speed(iter/s)": 0.033009 + }, + { + "epoch": 0.27980429201182, + "grad_norm": 0.09880448877811432, + "learning_rate": 0.00025878299240723055, + "loss": 0.3723769783973694, + "memory(GiB)": 78.26, + "step": 1444, + "token_acc": 0.8905043044032355, + "train_speed(iter/s)": 0.033011 + }, + { + "epoch": 0.27999806229714674, + "grad_norm": 0.10194198787212372, + "learning_rate": 0.0002587167815532468, + "loss": 0.3568089008331299, + "memory(GiB)": 78.26, + "step": 1445, + "token_acc": 0.8941702819956616, + "train_speed(iter/s)": 0.033012 + }, + { + "epoch": 0.2801918325824735, + "grad_norm": 0.10794655978679657, + "learning_rate": 0.00025865052604635955, + "loss": 0.3879980146884918, + "memory(GiB)": 78.26, + "step": 1446, + "token_acc": 0.8870534199744252, + "train_speed(iter/s)": 0.033014 + }, + { + "epoch": 0.28038560286780023, + "grad_norm": 0.10190257430076599, + "learning_rate": 0.0002585842259137817, + "loss": 0.385148823261261, + "memory(GiB)": 78.26, + "step": 1447, + "token_acc": 0.8866731047802994, + "train_speed(iter/s)": 0.033015 + }, + { + "epoch": 0.280579373153127, + "grad_norm": 0.11218508332967758, + "learning_rate": 0.0002585178811827445, + "loss": 0.41854822635650635, + "memory(GiB)": 78.26, + "step": 1448, + "token_acc": 0.8806162104733414, + "train_speed(iter/s)": 0.033017 + }, + { + "epoch": 0.2807731434384537, + "grad_norm": 0.10860633105039597, + "learning_rate": 0.00025845149188049747, + "loss": 0.3808492124080658, + "memory(GiB)": 78.26, + "step": 1449, + "token_acc": 0.8890830143110403, + "train_speed(iter/s)": 0.033018 + }, + { + "epoch": 0.28096691372378046, + "grad_norm": 0.10633924603462219, + "learning_rate": 0.0002583850580343086, + "loss": 0.3995111584663391, + "memory(GiB)": 78.26, + "step": 1450, + "token_acc": 0.8848618846379894, + "train_speed(iter/s)": 0.03302 + }, + { + "epoch": 0.2811606840091072, + "grad_norm": 0.1051037460565567, + "learning_rate": 0.00025831857967146394, + "loss": 0.3739997148513794, + "memory(GiB)": 78.26, + "step": 1451, + "token_acc": 0.8915456874466268, + "train_speed(iter/s)": 0.033021 + }, + { + "epoch": 0.28135445429443395, + "grad_norm": 0.1118805930018425, + "learning_rate": 0.0002582520568192679, + "loss": 0.4328954517841339, + "memory(GiB)": 78.26, + "step": 1452, + "token_acc": 0.874195172358973, + "train_speed(iter/s)": 0.033023 + }, + { + "epoch": 0.2815482245797607, + "grad_norm": 0.10707706958055496, + "learning_rate": 0.0002581854895050434, + "loss": 0.4252171814441681, + "memory(GiB)": 78.26, + "step": 1453, + "token_acc": 0.8767298393819819, + "train_speed(iter/s)": 0.033024 + }, + { + "epoch": 0.28174199486508744, + "grad_norm": 0.1086982861161232, + "learning_rate": 0.0002581188777561313, + "loss": 0.41112756729125977, + "memory(GiB)": 78.26, + "step": 1454, + "token_acc": 0.8803484635202252, + "train_speed(iter/s)": 0.033026 + }, + { + "epoch": 0.2819357651504142, + "grad_norm": 0.10656489431858063, + "learning_rate": 0.00025805222159989077, + "loss": 0.3699858486652374, + "memory(GiB)": 78.26, + "step": 1455, + "token_acc": 0.8922368486911256, + "train_speed(iter/s)": 0.033027 + }, + { + "epoch": 0.28212953543574093, + "grad_norm": 0.10233739018440247, + "learning_rate": 0.00025798552106369937, + "loss": 0.3772289752960205, + "memory(GiB)": 78.26, + "step": 1456, + "token_acc": 0.8884009801305224, + "train_speed(iter/s)": 0.033028 + }, + { + "epoch": 0.2823233057210677, + "grad_norm": 0.10542133450508118, + "learning_rate": 0.00025791877617495275, + "loss": 0.37354397773742676, + "memory(GiB)": 78.26, + "step": 1457, + "token_acc": 0.8912464826222526, + "train_speed(iter/s)": 0.03303 + }, + { + "epoch": 0.2825170760063944, + "grad_norm": 0.11344679445028305, + "learning_rate": 0.0002578519869610649, + "loss": 0.4273817241191864, + "memory(GiB)": 78.26, + "step": 1458, + "token_acc": 0.8761668213330422, + "train_speed(iter/s)": 0.033031 + }, + { + "epoch": 0.28271084629172116, + "grad_norm": 0.10357240587472916, + "learning_rate": 0.000257785153449468, + "loss": 0.3900695741176605, + "memory(GiB)": 78.26, + "step": 1459, + "token_acc": 0.884189494968947, + "train_speed(iter/s)": 0.033032 + }, + { + "epoch": 0.2829046165770479, + "grad_norm": 0.11411286890506744, + "learning_rate": 0.00025771827566761215, + "loss": 0.4180591106414795, + "memory(GiB)": 78.26, + "step": 1460, + "token_acc": 0.8776129660386003, + "train_speed(iter/s)": 0.033034 + }, + { + "epoch": 0.28309838686237465, + "grad_norm": 0.10684863477945328, + "learning_rate": 0.00025765135364296606, + "loss": 0.42027878761291504, + "memory(GiB)": 78.26, + "step": 1461, + "token_acc": 0.8778430934841382, + "train_speed(iter/s)": 0.033035 + }, + { + "epoch": 0.2832921571477014, + "grad_norm": 0.10738670825958252, + "learning_rate": 0.0002575843874030163, + "loss": 0.36749184131622314, + "memory(GiB)": 78.26, + "step": 1462, + "token_acc": 0.889355581127733, + "train_speed(iter/s)": 0.033037 + }, + { + "epoch": 0.28348592743302814, + "grad_norm": 0.10112845152616501, + "learning_rate": 0.0002575173769752677, + "loss": 0.3824685513973236, + "memory(GiB)": 78.26, + "step": 1463, + "token_acc": 0.8880091942280679, + "train_speed(iter/s)": 0.033038 + }, + { + "epoch": 0.2836796977183549, + "grad_norm": 0.11604847013950348, + "learning_rate": 0.00025745032238724325, + "loss": 0.4152447283267975, + "memory(GiB)": 78.26, + "step": 1464, + "token_acc": 0.8827828574982894, + "train_speed(iter/s)": 0.033039 + }, + { + "epoch": 0.28387346800368163, + "grad_norm": 0.10715403407812119, + "learning_rate": 0.0002573832236664842, + "loss": 0.3738415837287903, + "memory(GiB)": 78.26, + "step": 1465, + "token_acc": 0.8922089917371876, + "train_speed(iter/s)": 0.033041 + }, + { + "epoch": 0.2840672382890084, + "grad_norm": 0.10670117288827896, + "learning_rate": 0.0002573160808405496, + "loss": 0.40999048948287964, + "memory(GiB)": 78.26, + "step": 1466, + "token_acc": 0.8812486533074768, + "train_speed(iter/s)": 0.033042 + }, + { + "epoch": 0.2842610085743351, + "grad_norm": 0.11364596337080002, + "learning_rate": 0.00025724889393701687, + "loss": 0.38645139336586, + "memory(GiB)": 78.26, + "step": 1467, + "token_acc": 0.8858692377222055, + "train_speed(iter/s)": 0.033044 + }, + { + "epoch": 0.28445477885966186, + "grad_norm": 0.10818950086832047, + "learning_rate": 0.00025718166298348163, + "loss": 0.43197697401046753, + "memory(GiB)": 78.26, + "step": 1468, + "token_acc": 0.8740517566040781, + "train_speed(iter/s)": 0.033045 + }, + { + "epoch": 0.2846485491449886, + "grad_norm": 0.10857094824314117, + "learning_rate": 0.00025711438800755725, + "loss": 0.3983537256717682, + "memory(GiB)": 78.26, + "step": 1469, + "token_acc": 0.8840075154730327, + "train_speed(iter/s)": 0.033047 + }, + { + "epoch": 0.28484231943031535, + "grad_norm": 0.10339067131280899, + "learning_rate": 0.00025704706903687544, + "loss": 0.3655926585197449, + "memory(GiB)": 78.26, + "step": 1470, + "token_acc": 0.8920360215324245, + "train_speed(iter/s)": 0.033048 + }, + { + "epoch": 0.2850360897156421, + "grad_norm": 0.09809229522943497, + "learning_rate": 0.0002569797060990859, + "loss": 0.36431875824928284, + "memory(GiB)": 78.26, + "step": 1471, + "token_acc": 0.8942954390742002, + "train_speed(iter/s)": 0.033049 + }, + { + "epoch": 0.28522986000096884, + "grad_norm": 0.12293403595685959, + "learning_rate": 0.0002569122992218564, + "loss": 0.4425593316555023, + "memory(GiB)": 78.26, + "step": 1472, + "token_acc": 0.872041270483512, + "train_speed(iter/s)": 0.033051 + }, + { + "epoch": 0.2854236302862956, + "grad_norm": 0.11606067419052124, + "learning_rate": 0.00025684484843287284, + "loss": 0.4095402956008911, + "memory(GiB)": 78.26, + "step": 1473, + "token_acc": 0.8805144353865623, + "train_speed(iter/s)": 0.033052 + }, + { + "epoch": 0.2856174005716223, + "grad_norm": 0.09995657950639725, + "learning_rate": 0.00025677735375983894, + "loss": 0.38141509890556335, + "memory(GiB)": 78.26, + "step": 1474, + "token_acc": 0.886579869804707, + "train_speed(iter/s)": 0.033053 + }, + { + "epoch": 0.28581117085694907, + "grad_norm": 0.09991477429866791, + "learning_rate": 0.00025670981523047664, + "loss": 0.3756733238697052, + "memory(GiB)": 78.26, + "step": 1475, + "token_acc": 0.8895817295355402, + "train_speed(iter/s)": 0.033055 + }, + { + "epoch": 0.2860049411422758, + "grad_norm": 0.10868589580059052, + "learning_rate": 0.00025664223287252586, + "loss": 0.40122172236442566, + "memory(GiB)": 78.26, + "step": 1476, + "token_acc": 0.8827901370963692, + "train_speed(iter/s)": 0.033056 + }, + { + "epoch": 0.28619871142760256, + "grad_norm": 0.10444658994674683, + "learning_rate": 0.0002565746067137444, + "loss": 0.37232351303100586, + "memory(GiB)": 78.26, + "step": 1477, + "token_acc": 0.8921252470612712, + "train_speed(iter/s)": 0.033058 + }, + { + "epoch": 0.2863924817129293, + "grad_norm": 0.10679329186677933, + "learning_rate": 0.0002565069367819082, + "loss": 0.36880356073379517, + "memory(GiB)": 78.26, + "step": 1478, + "token_acc": 0.8917536644521776, + "train_speed(iter/s)": 0.033059 + }, + { + "epoch": 0.28658625199825605, + "grad_norm": 0.09651319682598114, + "learning_rate": 0.0002564392231048111, + "loss": 0.36866456270217896, + "memory(GiB)": 78.26, + "step": 1479, + "token_acc": 0.8893619056730125, + "train_speed(iter/s)": 0.033061 + }, + { + "epoch": 0.2867800222835828, + "grad_norm": 0.10870502889156342, + "learning_rate": 0.000256371465710265, + "loss": 0.3834042549133301, + "memory(GiB)": 78.26, + "step": 1480, + "token_acc": 0.8883792468895053, + "train_speed(iter/s)": 0.033062 + }, + { + "epoch": 0.28697379256890954, + "grad_norm": 0.11162568628787994, + "learning_rate": 0.0002563036646260996, + "loss": 0.4060906171798706, + "memory(GiB)": 78.26, + "step": 1481, + "token_acc": 0.8793504766127245, + "train_speed(iter/s)": 0.033064 + }, + { + "epoch": 0.2871675628542363, + "grad_norm": 0.10246509313583374, + "learning_rate": 0.00025623581988016257, + "loss": 0.37621062994003296, + "memory(GiB)": 78.26, + "step": 1482, + "token_acc": 0.8903611537529288, + "train_speed(iter/s)": 0.033065 + }, + { + "epoch": 0.287361333139563, + "grad_norm": 0.10581538081169128, + "learning_rate": 0.0002561679315003197, + "loss": 0.40835872292518616, + "memory(GiB)": 78.26, + "step": 1483, + "token_acc": 0.8827440663756468, + "train_speed(iter/s)": 0.033066 + }, + { + "epoch": 0.28755510342488977, + "grad_norm": 0.11528300493955612, + "learning_rate": 0.0002560999995144545, + "loss": 0.43208250403404236, + "memory(GiB)": 78.26, + "step": 1484, + "token_acc": 0.8761382138334439, + "train_speed(iter/s)": 0.033068 + }, + { + "epoch": 0.2877488737102165, + "grad_norm": 0.11191680282354355, + "learning_rate": 0.00025603202395046857, + "loss": 0.40369465947151184, + "memory(GiB)": 78.26, + "step": 1485, + "token_acc": 0.8817523721627779, + "train_speed(iter/s)": 0.033069 + }, + { + "epoch": 0.28794264399554326, + "grad_norm": 0.0994502380490303, + "learning_rate": 0.00025596400483628113, + "loss": 0.3518386781215668, + "memory(GiB)": 78.26, + "step": 1486, + "token_acc": 0.8963413044045166, + "train_speed(iter/s)": 0.033071 + }, + { + "epoch": 0.28813641428087, + "grad_norm": 0.10107074677944183, + "learning_rate": 0.00025589594219982957, + "loss": 0.36090126633644104, + "memory(GiB)": 78.26, + "step": 1487, + "token_acc": 0.8948038903451824, + "train_speed(iter/s)": 0.033072 + }, + { + "epoch": 0.2883301845661968, + "grad_norm": 0.10665128380060196, + "learning_rate": 0.000255827836069069, + "loss": 0.381521075963974, + "memory(GiB)": 78.26, + "step": 1488, + "token_acc": 0.8875178437492439, + "train_speed(iter/s)": 0.033073 + }, + { + "epoch": 0.28852395485152355, + "grad_norm": 0.11443497240543365, + "learning_rate": 0.00025575968647197246, + "loss": 0.4305599629878998, + "memory(GiB)": 78.26, + "step": 1489, + "token_acc": 0.8745530313214142, + "train_speed(iter/s)": 0.033075 + }, + { + "epoch": 0.2887177251368503, + "grad_norm": 0.10215198248624802, + "learning_rate": 0.0002556914934365308, + "loss": 0.38611581921577454, + "memory(GiB)": 78.26, + "step": 1490, + "token_acc": 0.8878939990051101, + "train_speed(iter/s)": 0.033076 + }, + { + "epoch": 0.28891149542217703, + "grad_norm": 0.11031733453273773, + "learning_rate": 0.00025562325699075275, + "loss": 0.3934246003627777, + "memory(GiB)": 78.26, + "step": 1491, + "token_acc": 0.8847335423197492, + "train_speed(iter/s)": 0.033078 + }, + { + "epoch": 0.2891052657075038, + "grad_norm": 0.107778400182724, + "learning_rate": 0.00025555497716266487, + "loss": 0.4159546196460724, + "memory(GiB)": 78.26, + "step": 1492, + "token_acc": 0.8804967649961618, + "train_speed(iter/s)": 0.033079 + }, + { + "epoch": 0.2892990359928305, + "grad_norm": 0.11043369024991989, + "learning_rate": 0.00025548665398031145, + "loss": 0.42264997959136963, + "memory(GiB)": 78.26, + "step": 1493, + "token_acc": 0.8791478474270945, + "train_speed(iter/s)": 0.033081 + }, + { + "epoch": 0.28949280627815727, + "grad_norm": 0.11627716571092606, + "learning_rate": 0.0002554182874717547, + "loss": 0.40543925762176514, + "memory(GiB)": 78.26, + "step": 1494, + "token_acc": 0.8830718414533444, + "train_speed(iter/s)": 0.033082 + }, + { + "epoch": 0.289686576563484, + "grad_norm": 0.11875671148300171, + "learning_rate": 0.00025534987766507466, + "loss": 0.42699331045150757, + "memory(GiB)": 78.26, + "step": 1495, + "token_acc": 0.8727887840821125, + "train_speed(iter/s)": 0.033084 + }, + { + "epoch": 0.28988034684881075, + "grad_norm": 0.1101786196231842, + "learning_rate": 0.00025528142458836896, + "loss": 0.36886993050575256, + "memory(GiB)": 78.26, + "step": 1496, + "token_acc": 0.8915269892151803, + "train_speed(iter/s)": 0.033085 + }, + { + "epoch": 0.2900741171341375, + "grad_norm": 0.1435171663761139, + "learning_rate": 0.0002552129282697532, + "loss": 0.38396596908569336, + "memory(GiB)": 78.26, + "step": 1497, + "token_acc": 0.8871555969652603, + "train_speed(iter/s)": 0.033087 + }, + { + "epoch": 0.29026788741946424, + "grad_norm": 0.10523149371147156, + "learning_rate": 0.0002551443887373605, + "loss": 0.36478835344314575, + "memory(GiB)": 78.26, + "step": 1498, + "token_acc": 0.8914113351325608, + "train_speed(iter/s)": 0.033088 + }, + { + "epoch": 0.290461657704791, + "grad_norm": 0.1243639886379242, + "learning_rate": 0.00025507580601934215, + "loss": 0.4351167678833008, + "memory(GiB)": 78.26, + "step": 1499, + "token_acc": 0.874244422464007, + "train_speed(iter/s)": 0.033089 + }, + { + "epoch": 0.29065542799011773, + "grad_norm": 0.12400247901678085, + "learning_rate": 0.0002550071801438667, + "loss": 0.4268084466457367, + "memory(GiB)": 78.26, + "step": 1500, + "token_acc": 0.8754112107334064, + "train_speed(iter/s)": 0.033091 + }, + { + "epoch": 0.29065542799011773, + "eval_loss": 0.46029698848724365, + "eval_runtime": 1344.1819, + "eval_samples_per_second": 5.021, + "eval_steps_per_second": 5.021, + "eval_token_acc": 0.8845504800698284, + "step": 1500 + }, + { + "epoch": 0.2908491982754445, + "grad_norm": 0.11321935057640076, + "learning_rate": 0.0002549385111391207, + "loss": 0.4121960699558258, + "memory(GiB)": 78.26, + "step": 1501, + "token_acc": 0.881211333653493, + "train_speed(iter/s)": 0.03214 + }, + { + "epoch": 0.2910429685607712, + "grad_norm": 0.11633153259754181, + "learning_rate": 0.0002548697990333084, + "loss": 0.4356468617916107, + "memory(GiB)": 78.26, + "step": 1502, + "token_acc": 0.8736777331731473, + "train_speed(iter/s)": 0.032142 + }, + { + "epoch": 0.29123673884609796, + "grad_norm": 0.10829450935125351, + "learning_rate": 0.00025480104385465166, + "loss": 0.4199885129928589, + "memory(GiB)": 78.26, + "step": 1503, + "token_acc": 0.8805436914067747, + "train_speed(iter/s)": 0.032144 + }, + { + "epoch": 0.2914305091314247, + "grad_norm": 0.11650849878787994, + "learning_rate": 0.0002547322456313901, + "loss": 0.4100668430328369, + "memory(GiB)": 78.26, + "step": 1504, + "token_acc": 0.8807525592548188, + "train_speed(iter/s)": 0.032146 + }, + { + "epoch": 0.29162427941675145, + "grad_norm": 0.10901083797216415, + "learning_rate": 0.000254663404391781, + "loss": 0.4044414460659027, + "memory(GiB)": 78.26, + "step": 1505, + "token_acc": 0.883741382734262, + "train_speed(iter/s)": 0.032148 + }, + { + "epoch": 0.2918180497020782, + "grad_norm": 0.10262013971805573, + "learning_rate": 0.00025459452016409926, + "loss": 0.40257662534713745, + "memory(GiB)": 78.26, + "step": 1506, + "token_acc": 0.8821285962936418, + "train_speed(iter/s)": 0.03215 + }, + { + "epoch": 0.29201181998740494, + "grad_norm": 0.10731082409620285, + "learning_rate": 0.0002545255929766376, + "loss": 0.4021913409233093, + "memory(GiB)": 78.26, + "step": 1507, + "token_acc": 0.8832824363320539, + "train_speed(iter/s)": 0.032152 + }, + { + "epoch": 0.2922055902727317, + "grad_norm": 0.11936552822589874, + "learning_rate": 0.00025445662285770613, + "loss": 0.4097455143928528, + "memory(GiB)": 78.26, + "step": 1508, + "token_acc": 0.8790452832446013, + "train_speed(iter/s)": 0.032154 + }, + { + "epoch": 0.29239936055805843, + "grad_norm": 0.11192671209573746, + "learning_rate": 0.00025438760983563285, + "loss": 0.40953508019447327, + "memory(GiB)": 78.26, + "step": 1509, + "token_acc": 0.8821702104868513, + "train_speed(iter/s)": 0.032156 + }, + { + "epoch": 0.2925931308433852, + "grad_norm": 0.10715434700250626, + "learning_rate": 0.0002543185539387632, + "loss": 0.3857203722000122, + "memory(GiB)": 78.26, + "step": 1510, + "token_acc": 0.8867008985879332, + "train_speed(iter/s)": 0.032158 + }, + { + "epoch": 0.2927869011287119, + "grad_norm": 0.1083342507481575, + "learning_rate": 0.0002542494551954602, + "loss": 0.3879879117012024, + "memory(GiB)": 78.26, + "step": 1511, + "token_acc": 0.886165023879687, + "train_speed(iter/s)": 0.032159 + }, + { + "epoch": 0.29298067141403866, + "grad_norm": 0.12331650406122208, + "learning_rate": 0.0002541803136341048, + "loss": 0.4193812608718872, + "memory(GiB)": 78.26, + "step": 1512, + "token_acc": 0.8781190284970506, + "train_speed(iter/s)": 0.032161 + }, + { + "epoch": 0.2931744416993654, + "grad_norm": 0.10431553423404694, + "learning_rate": 0.0002541111292830951, + "loss": 0.3808317184448242, + "memory(GiB)": 78.26, + "step": 1513, + "token_acc": 0.888267724649629, + "train_speed(iter/s)": 0.032163 + }, + { + "epoch": 0.29336821198469215, + "grad_norm": 0.09962674230337143, + "learning_rate": 0.00025404190217084697, + "loss": 0.3735466003417969, + "memory(GiB)": 78.26, + "step": 1514, + "token_acc": 0.8892632578004508, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.2935619822700189, + "grad_norm": 0.1109800636768341, + "learning_rate": 0.000253972632325794, + "loss": 0.42662137746810913, + "memory(GiB)": 78.26, + "step": 1515, + "token_acc": 0.8756379486330658, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.29375575255534564, + "grad_norm": 0.10075878351926804, + "learning_rate": 0.000253903319776387, + "loss": 0.3623042702674866, + "memory(GiB)": 78.26, + "step": 1516, + "token_acc": 0.8929071827001877, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.2939495228406724, + "grad_norm": 0.10288105905056, + "learning_rate": 0.0002538339645510946, + "loss": 0.38466522097587585, + "memory(GiB)": 78.26, + "step": 1517, + "token_acc": 0.8891691358327007, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.29414329312599913, + "grad_norm": 0.11288794130086899, + "learning_rate": 0.00025376456667840284, + "loss": 0.39476266503334045, + "memory(GiB)": 78.26, + "step": 1518, + "token_acc": 0.8875907973596265, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.29433706341132587, + "grad_norm": 0.12219920009374619, + "learning_rate": 0.0002536951261868153, + "loss": 0.40511706471443176, + "memory(GiB)": 78.26, + "step": 1519, + "token_acc": 0.8838210765731614, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.2945308336966526, + "grad_norm": 0.1097761020064354, + "learning_rate": 0.000253625643104853, + "loss": 0.4082501530647278, + "memory(GiB)": 78.26, + "step": 1520, + "token_acc": 0.8827916295636687, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.29472460398197936, + "grad_norm": 0.21266750991344452, + "learning_rate": 0.0002535561174610546, + "loss": 0.4297519028186798, + "memory(GiB)": 78.26, + "step": 1521, + "token_acc": 0.8791513393088441, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.2949183742673061, + "grad_norm": 0.10368236899375916, + "learning_rate": 0.00025348654928397614, + "loss": 0.3687269389629364, + "memory(GiB)": 78.26, + "step": 1522, + "token_acc": 0.8922836811893236, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.29511214455263285, + "grad_norm": 0.11547058075666428, + "learning_rate": 0.000253416938602191, + "loss": 0.38897940516471863, + "memory(GiB)": 78.26, + "step": 1523, + "token_acc": 0.8873057637889094, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.2953059148379596, + "grad_norm": 0.40387094020843506, + "learning_rate": 0.0002533472854442903, + "loss": 0.4147917926311493, + "memory(GiB)": 78.26, + "step": 1524, + "token_acc": 0.8777598930968507, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.29549968512328634, + "grad_norm": 0.10571064054965973, + "learning_rate": 0.0002532775898388824, + "loss": 0.38508111238479614, + "memory(GiB)": 78.26, + "step": 1525, + "token_acc": 0.8872821274090757, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.2956934554086131, + "grad_norm": 0.1151953935623169, + "learning_rate": 0.0002532078518145931, + "loss": 0.4073837697505951, + "memory(GiB)": 78.26, + "step": 1526, + "token_acc": 0.8840981728275945, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.2958872256939398, + "grad_norm": 0.11016645282506943, + "learning_rate": 0.0002531380714000659, + "loss": 0.39761802554130554, + "memory(GiB)": 78.26, + "step": 1527, + "token_acc": 0.8864857603439011, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.29608099597926657, + "grad_norm": 0.10478914529085159, + "learning_rate": 0.00025306824862396127, + "loss": 0.3668254613876343, + "memory(GiB)": 78.26, + "step": 1528, + "token_acc": 0.8923076923076924, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.2962747662645933, + "grad_norm": 0.3135347068309784, + "learning_rate": 0.0002529983835149574, + "loss": 0.3972938656806946, + "memory(GiB)": 78.26, + "step": 1529, + "token_acc": 0.8850560993077107, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.29646853654992006, + "grad_norm": 0.11302391439676285, + "learning_rate": 0.00025292847610174974, + "loss": 0.4002307057380676, + "memory(GiB)": 78.26, + "step": 1530, + "token_acc": 0.8843288710519805, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.2966623068352468, + "grad_norm": 0.1242542490363121, + "learning_rate": 0.0002528585264130511, + "loss": 0.40723949670791626, + "memory(GiB)": 78.26, + "step": 1531, + "token_acc": 0.8828371991110276, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.29685607712057355, + "grad_norm": 0.11407161504030228, + "learning_rate": 0.00025278853447759184, + "loss": 0.38927173614501953, + "memory(GiB)": 78.26, + "step": 1532, + "token_acc": 0.888841623906628, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.2970498474059003, + "grad_norm": 0.10127895325422287, + "learning_rate": 0.0002527185003241194, + "loss": 0.38897159695625305, + "memory(GiB)": 78.26, + "step": 1533, + "token_acc": 0.884356180097505, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.29724361769122704, + "grad_norm": 0.11514552682638168, + "learning_rate": 0.0002526484239813987, + "loss": 0.41849443316459656, + "memory(GiB)": 78.26, + "step": 1534, + "token_acc": 0.8769607696674535, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.2974373879765538, + "grad_norm": 0.10013052076101303, + "learning_rate": 0.00025257830547821205, + "loss": 0.35412314534187317, + "memory(GiB)": 78.26, + "step": 1535, + "token_acc": 0.8960811384876806, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.2976311582618805, + "grad_norm": 0.107215017080307, + "learning_rate": 0.0002525081448433589, + "loss": 0.4127185642719269, + "memory(GiB)": 78.26, + "step": 1536, + "token_acc": 0.8802447552447552, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.29782492854720727, + "grad_norm": 0.10350769758224487, + "learning_rate": 0.00025243794210565623, + "loss": 0.35154351592063904, + "memory(GiB)": 78.26, + "step": 1537, + "token_acc": 0.8971347925653725, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.298018698832534, + "grad_norm": 0.11700846254825592, + "learning_rate": 0.00025236769729393806, + "loss": 0.42805176973342896, + "memory(GiB)": 78.26, + "step": 1538, + "token_acc": 0.8778269617706237, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.29821246911786076, + "grad_norm": 0.11825554072856903, + "learning_rate": 0.0002522974104370559, + "loss": 0.42821887135505676, + "memory(GiB)": 78.26, + "step": 1539, + "token_acc": 0.8769199262748311, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.2984062394031875, + "grad_norm": 0.11428213119506836, + "learning_rate": 0.0002522270815638784, + "loss": 0.4251292645931244, + "memory(GiB)": 78.26, + "step": 1540, + "token_acc": 0.8747427227286092, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.29860000968851425, + "grad_norm": 0.11575201153755188, + "learning_rate": 0.00025215671070329164, + "loss": 0.4403133988380432, + "memory(GiB)": 78.26, + "step": 1541, + "token_acc": 0.872397366589948, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.298793779973841, + "grad_norm": 0.10235166549682617, + "learning_rate": 0.0002520862978841987, + "loss": 0.3714492619037628, + "memory(GiB)": 78.26, + "step": 1542, + "token_acc": 0.8913319238900634, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.29898755025916773, + "grad_norm": 0.11813291907310486, + "learning_rate": 0.00025201584313552, + "loss": 0.4298678934574127, + "memory(GiB)": 78.26, + "step": 1543, + "token_acc": 0.875, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.2991813205444945, + "grad_norm": 0.1107359528541565, + "learning_rate": 0.0002519453464861933, + "loss": 0.3796873390674591, + "memory(GiB)": 78.26, + "step": 1544, + "token_acc": 0.8897960202731381, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.2993750908298212, + "grad_norm": 0.12202285975217819, + "learning_rate": 0.0002518748079651734, + "loss": 0.3539576232433319, + "memory(GiB)": 78.26, + "step": 1545, + "token_acc": 0.8958181376124483, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.29956886111514797, + "grad_norm": 0.10455404222011566, + "learning_rate": 0.00025180422760143244, + "loss": 0.3648805618286133, + "memory(GiB)": 78.26, + "step": 1546, + "token_acc": 0.8921280583557064, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.2997626314004747, + "grad_norm": 0.10837128758430481, + "learning_rate": 0.0002517336054239596, + "loss": 0.4049074351787567, + "memory(GiB)": 78.26, + "step": 1547, + "token_acc": 0.8826463706255946, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.29995640168580145, + "grad_norm": 0.11739413440227509, + "learning_rate": 0.00025166294146176124, + "loss": 0.38583841919898987, + "memory(GiB)": 78.26, + "step": 1548, + "token_acc": 0.8861551627309925, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.30015017197112825, + "grad_norm": 0.10927794128656387, + "learning_rate": 0.00025159223574386114, + "loss": 0.40299391746520996, + "memory(GiB)": 78.26, + "step": 1549, + "token_acc": 0.8808586662200263, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.300343942256455, + "grad_norm": 0.10551624745130539, + "learning_rate": 0.0002515214882992999, + "loss": 0.3709675073623657, + "memory(GiB)": 78.26, + "step": 1550, + "token_acc": 0.8917359439947495, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.30053771254178174, + "grad_norm": 0.11021958291530609, + "learning_rate": 0.00025145069915713536, + "loss": 0.3905988335609436, + "memory(GiB)": 78.26, + "step": 1551, + "token_acc": 0.8831299218774511, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.3007314828271085, + "grad_norm": 0.09675493836402893, + "learning_rate": 0.0002513798683464427, + "loss": 0.3553582429885864, + "memory(GiB)": 78.26, + "step": 1552, + "token_acc": 0.895078622611465, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.30092525311243523, + "grad_norm": 0.11715512722730637, + "learning_rate": 0.0002513089958963139, + "loss": 0.38959836959838867, + "memory(GiB)": 78.26, + "step": 1553, + "token_acc": 0.8870248193528119, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.301119023397762, + "grad_norm": 0.10786168277263641, + "learning_rate": 0.00025123808183585817, + "loss": 0.3897908329963684, + "memory(GiB)": 78.26, + "step": 1554, + "token_acc": 0.8833446083074009, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.3013127936830887, + "grad_norm": 0.11172870546579361, + "learning_rate": 0.00025116712619420185, + "loss": 0.37313905358314514, + "memory(GiB)": 78.26, + "step": 1555, + "token_acc": 0.8884811242923387, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.30150656396841546, + "grad_norm": 0.11459864675998688, + "learning_rate": 0.0002510961290004884, + "loss": 0.41675081849098206, + "memory(GiB)": 78.26, + "step": 1556, + "token_acc": 0.8796063237082496, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.3017003342537422, + "grad_norm": 0.12527529895305634, + "learning_rate": 0.00025102509028387813, + "loss": 0.3714950680732727, + "memory(GiB)": 78.26, + "step": 1557, + "token_acc": 0.8910757252156046, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.30189410453906895, + "grad_norm": 0.11035227030515671, + "learning_rate": 0.00025095401007354867, + "loss": 0.39393147826194763, + "memory(GiB)": 78.26, + "step": 1558, + "token_acc": 0.8863604634929498, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.3020878748243957, + "grad_norm": 0.10444723814725876, + "learning_rate": 0.0002508828883986945, + "loss": 0.3709162771701813, + "memory(GiB)": 78.26, + "step": 1559, + "token_acc": 0.8904844563673663, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.30228164510972244, + "grad_norm": 0.10122967511415482, + "learning_rate": 0.0002508117252885273, + "loss": 0.34817183017730713, + "memory(GiB)": 78.26, + "step": 1560, + "token_acc": 0.8977317721590587, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.3024754153950492, + "grad_norm": 0.10534863919019699, + "learning_rate": 0.00025074052077227556, + "loss": 0.4088999927043915, + "memory(GiB)": 78.26, + "step": 1561, + "token_acc": 0.8811859443631039, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.30266918568037593, + "grad_norm": 0.0965069830417633, + "learning_rate": 0.000250669274879185, + "loss": 0.3669951558113098, + "memory(GiB)": 78.26, + "step": 1562, + "token_acc": 0.8907134896627016, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.3028629559657027, + "grad_norm": 0.10454553365707397, + "learning_rate": 0.0002505979876385181, + "loss": 0.40625712275505066, + "memory(GiB)": 78.26, + "step": 1563, + "token_acc": 0.8799307565696124, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.3030567262510294, + "grad_norm": 0.10287556052207947, + "learning_rate": 0.0002505266590795545, + "loss": 0.415050208568573, + "memory(GiB)": 78.26, + "step": 1564, + "token_acc": 0.8815440689198144, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.30325049653635616, + "grad_norm": 0.11195293813943863, + "learning_rate": 0.00025045528923159073, + "loss": 0.4277609586715698, + "memory(GiB)": 78.26, + "step": 1565, + "token_acc": 0.8775790135451792, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.3034442668216829, + "grad_norm": 0.10438180714845657, + "learning_rate": 0.0002503838781239404, + "loss": 0.3552014231681824, + "memory(GiB)": 78.26, + "step": 1566, + "token_acc": 0.896136847274513, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.30363803710700965, + "grad_norm": 0.12397732585668564, + "learning_rate": 0.0002503124257859339, + "loss": 0.46164411306381226, + "memory(GiB)": 78.26, + "step": 1567, + "token_acc": 0.8641721234798877, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.3038318073923364, + "grad_norm": 0.11469985544681549, + "learning_rate": 0.0002502409322469186, + "loss": 0.39702335000038147, + "memory(GiB)": 78.26, + "step": 1568, + "token_acc": 0.883688332266813, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.30402557767766314, + "grad_norm": 0.11066870391368866, + "learning_rate": 0.00025016939753625886, + "loss": 0.40544670820236206, + "memory(GiB)": 78.26, + "step": 1569, + "token_acc": 0.8816060016671298, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.3042193479629899, + "grad_norm": 0.09953310340642929, + "learning_rate": 0.0002500978216833359, + "loss": 0.39714691042900085, + "memory(GiB)": 78.26, + "step": 1570, + "token_acc": 0.8836974458757173, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.3044131182483166, + "grad_norm": 0.11722380667924881, + "learning_rate": 0.00025002620471754785, + "loss": 0.41713088750839233, + "memory(GiB)": 78.26, + "step": 1571, + "token_acc": 0.8797300061362242, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.30460688853364337, + "grad_norm": 0.11423590779304504, + "learning_rate": 0.00024995454666830967, + "loss": 0.40586379170417786, + "memory(GiB)": 78.26, + "step": 1572, + "token_acc": 0.8844951044863364, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.3048006588189701, + "grad_norm": 0.11461616307497025, + "learning_rate": 0.00024988284756505334, + "loss": 0.3949568271636963, + "memory(GiB)": 78.26, + "step": 1573, + "token_acc": 0.8844877454711524, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.30499442910429686, + "grad_norm": 0.09611064195632935, + "learning_rate": 0.0002498111074372276, + "loss": 0.3817123770713806, + "memory(GiB)": 78.26, + "step": 1574, + "token_acc": 0.8891472188103176, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.3051881993896236, + "grad_norm": 0.12749864161014557, + "learning_rate": 0.0002497393263142979, + "loss": 0.43550729751586914, + "memory(GiB)": 78.26, + "step": 1575, + "token_acc": 0.876248012718601, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.30538196967495035, + "grad_norm": 0.10742782801389694, + "learning_rate": 0.00024966750422574684, + "loss": 0.3822196125984192, + "memory(GiB)": 78.26, + "step": 1576, + "token_acc": 0.8897888795986622, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.3055757399602771, + "grad_norm": 0.1094922348856926, + "learning_rate": 0.0002495956412010736, + "loss": 0.3977155089378357, + "memory(GiB)": 78.26, + "step": 1577, + "token_acc": 0.8843427182499494, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.30576951024560384, + "grad_norm": 0.11330767720937729, + "learning_rate": 0.0002495237372697943, + "loss": 0.3944382071495056, + "memory(GiB)": 78.26, + "step": 1578, + "token_acc": 0.8850218853421586, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.3059632805309306, + "grad_norm": 0.10812770575284958, + "learning_rate": 0.0002494517924614418, + "loss": 0.39382147789001465, + "memory(GiB)": 78.26, + "step": 1579, + "token_acc": 0.8852722927933593, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.3061570508162573, + "grad_norm": 0.12108810991048813, + "learning_rate": 0.00024937980680556576, + "loss": 0.4625246226787567, + "memory(GiB)": 78.26, + "step": 1580, + "token_acc": 0.8664573098687892, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.30635082110158407, + "grad_norm": 0.11010700464248657, + "learning_rate": 0.00024930778033173265, + "loss": 0.40388697385787964, + "memory(GiB)": 78.26, + "step": 1581, + "token_acc": 0.8823420361501623, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.3065445913869108, + "grad_norm": 0.1091642677783966, + "learning_rate": 0.0002492357130695256, + "loss": 0.4263598322868347, + "memory(GiB)": 78.26, + "step": 1582, + "token_acc": 0.8781599433828733, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.30673836167223756, + "grad_norm": 0.10464166849851608, + "learning_rate": 0.0002491636050485447, + "loss": 0.37317129969596863, + "memory(GiB)": 78.26, + "step": 1583, + "token_acc": 0.890936937421071, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.3069321319575643, + "grad_norm": 0.10919937491416931, + "learning_rate": 0.00024909145629840645, + "loss": 0.39577916264533997, + "memory(GiB)": 78.26, + "step": 1584, + "token_acc": 0.8861203036791901, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.30712590224289105, + "grad_norm": 0.10316384583711624, + "learning_rate": 0.0002490192668487445, + "loss": 0.3726585805416107, + "memory(GiB)": 78.26, + "step": 1585, + "token_acc": 0.8914358661264361, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.3073196725282178, + "grad_norm": 0.11321194469928741, + "learning_rate": 0.00024894703672920894, + "loss": 0.4326629340648651, + "memory(GiB)": 78.26, + "step": 1586, + "token_acc": 0.8764373258299288, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.30751344281354454, + "grad_norm": 0.10429013520479202, + "learning_rate": 0.0002488747659694665, + "loss": 0.36875998973846436, + "memory(GiB)": 78.26, + "step": 1587, + "token_acc": 0.8898948094499052, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.3077072130988713, + "grad_norm": 0.10509074479341507, + "learning_rate": 0.0002488024545992009, + "loss": 0.36970990896224976, + "memory(GiB)": 78.26, + "step": 1588, + "token_acc": 0.8925745257452574, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.307900983384198, + "grad_norm": 0.09186790883541107, + "learning_rate": 0.0002487301026481122, + "loss": 0.3301558494567871, + "memory(GiB)": 78.26, + "step": 1589, + "token_acc": 0.9027383654937571, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.30809475366952477, + "grad_norm": 0.11293933540582657, + "learning_rate": 0.00024865771014591733, + "loss": 0.38940760493278503, + "memory(GiB)": 78.26, + "step": 1590, + "token_acc": 0.8861825562725317, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.3082885239548515, + "grad_norm": 0.11680306494235992, + "learning_rate": 0.0002485852771223499, + "loss": 0.41928166151046753, + "memory(GiB)": 78.26, + "step": 1591, + "token_acc": 0.8788892413276046, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.30848229424017826, + "grad_norm": 0.10015156865119934, + "learning_rate": 0.00024851280360716014, + "loss": 0.3776377737522125, + "memory(GiB)": 78.26, + "step": 1592, + "token_acc": 0.8896725440806046, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.308676064525505, + "grad_norm": 0.11279778182506561, + "learning_rate": 0.00024844028963011476, + "loss": 0.39112144708633423, + "memory(GiB)": 78.26, + "step": 1593, + "token_acc": 0.8870161362751848, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.30886983481083174, + "grad_norm": 0.09899583458900452, + "learning_rate": 0.0002483677352209972, + "loss": 0.35154473781585693, + "memory(GiB)": 78.26, + "step": 1594, + "token_acc": 0.897304444024716, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.3090636050961585, + "grad_norm": 0.11332813650369644, + "learning_rate": 0.0002482951404096076, + "loss": 0.40011727809906006, + "memory(GiB)": 78.26, + "step": 1595, + "token_acc": 0.8834134615384616, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.30925737538148523, + "grad_norm": 0.10168785601854324, + "learning_rate": 0.00024822250522576247, + "loss": 0.4107932150363922, + "memory(GiB)": 78.26, + "step": 1596, + "token_acc": 0.8791540056914098, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.309451145666812, + "grad_norm": 0.10856655985116959, + "learning_rate": 0.0002481498296992951, + "loss": 0.40053847432136536, + "memory(GiB)": 78.26, + "step": 1597, + "token_acc": 0.8820065490768003, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.3096449159521387, + "grad_norm": 0.10789895802736282, + "learning_rate": 0.0002480771138600553, + "loss": 0.3848547041416168, + "memory(GiB)": 78.26, + "step": 1598, + "token_acc": 0.8867153284671533, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.30983868623746547, + "grad_norm": 0.11236843466758728, + "learning_rate": 0.00024800435773790946, + "loss": 0.40924182534217834, + "memory(GiB)": 78.26, + "step": 1599, + "token_acc": 0.8796616904126696, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.3100324565227922, + "grad_norm": 0.10669440031051636, + "learning_rate": 0.00024793156136274037, + "loss": 0.38883164525032043, + "memory(GiB)": 78.26, + "step": 1600, + "token_acc": 0.8854082720253724, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.31022622680811895, + "grad_norm": 0.10899162292480469, + "learning_rate": 0.0002478587247644475, + "loss": 0.3585772216320038, + "memory(GiB)": 78.26, + "step": 1601, + "token_acc": 0.8958569414886266, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.3104199970934457, + "grad_norm": 0.11266548186540604, + "learning_rate": 0.00024778584797294684, + "loss": 0.3955519199371338, + "memory(GiB)": 78.26, + "step": 1602, + "token_acc": 0.8861248676036985, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.31061376737877244, + "grad_norm": 0.1098904013633728, + "learning_rate": 0.0002477129310181708, + "loss": 0.42345693707466125, + "memory(GiB)": 78.26, + "step": 1603, + "token_acc": 0.8796804389928986, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.3108075376640992, + "grad_norm": 0.10730142146348953, + "learning_rate": 0.0002476399739300683, + "loss": 0.40332454442977905, + "memory(GiB)": 78.26, + "step": 1604, + "token_acc": 0.8845008085794536, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.31100130794942593, + "grad_norm": 0.10026847571134567, + "learning_rate": 0.0002475669767386049, + "loss": 0.3628842830657959, + "memory(GiB)": 78.26, + "step": 1605, + "token_acc": 0.8923505698501966, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.3111950782347527, + "grad_norm": 0.12212540209293365, + "learning_rate": 0.00024749393947376234, + "loss": 0.38043132424354553, + "memory(GiB)": 78.26, + "step": 1606, + "token_acc": 0.8881142442463197, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.3113888485200794, + "grad_norm": 0.10906349122524261, + "learning_rate": 0.00024742086216553914, + "loss": 0.4101215898990631, + "memory(GiB)": 78.26, + "step": 1607, + "token_acc": 0.8820161592380544, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.31158261880540616, + "grad_norm": 0.10822609812021255, + "learning_rate": 0.00024734774484395, + "loss": 0.40345677733421326, + "memory(GiB)": 78.26, + "step": 1608, + "token_acc": 0.8825836216839678, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.31177638909073296, + "grad_norm": 0.13473811745643616, + "learning_rate": 0.00024727458753902624, + "loss": 0.492877721786499, + "memory(GiB)": 78.26, + "step": 1609, + "token_acc": 0.8599673445153629, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.3119701593760597, + "grad_norm": 0.10517790168523788, + "learning_rate": 0.0002472013902808155, + "loss": 0.375487744808197, + "memory(GiB)": 78.26, + "step": 1610, + "token_acc": 0.8903864596563075, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.31216392966138645, + "grad_norm": 0.11642046272754669, + "learning_rate": 0.00024712815309938186, + "loss": 0.39327579736709595, + "memory(GiB)": 78.26, + "step": 1611, + "token_acc": 0.8876873924797247, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.3123576999467132, + "grad_norm": 0.10896303504705429, + "learning_rate": 0.00024705487602480583, + "loss": 0.38862502574920654, + "memory(GiB)": 78.26, + "step": 1612, + "token_acc": 0.88512, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.31255147023203994, + "grad_norm": 0.10455754399299622, + "learning_rate": 0.0002469815590871842, + "loss": 0.3765313923358917, + "memory(GiB)": 78.26, + "step": 1613, + "token_acc": 0.890468422279189, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.3127452405173667, + "grad_norm": 0.10820908099412918, + "learning_rate": 0.00024690820231663036, + "loss": 0.40285009145736694, + "memory(GiB)": 78.26, + "step": 1614, + "token_acc": 0.8830803366074086, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.31293901080269343, + "grad_norm": 0.10197239369153976, + "learning_rate": 0.0002468348057432737, + "loss": 0.3643296957015991, + "memory(GiB)": 78.26, + "step": 1615, + "token_acc": 0.8937665096266442, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.3131327810880202, + "grad_norm": 0.12070825695991516, + "learning_rate": 0.00024676136939726036, + "loss": 0.4440290331840515, + "memory(GiB)": 78.26, + "step": 1616, + "token_acc": 0.8726066239878395, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.3133265513733469, + "grad_norm": 0.10100740194320679, + "learning_rate": 0.0002466878933087525, + "loss": 0.38823482394218445, + "memory(GiB)": 78.26, + "step": 1617, + "token_acc": 0.8885969521807672, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.31352032165867366, + "grad_norm": 0.10464068502187729, + "learning_rate": 0.00024661437750792865, + "loss": 0.3649863004684448, + "memory(GiB)": 78.26, + "step": 1618, + "token_acc": 0.8917689623982427, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.3137140919440004, + "grad_norm": 0.09842365980148315, + "learning_rate": 0.00024654082202498395, + "loss": 0.37648287415504456, + "memory(GiB)": 78.26, + "step": 1619, + "token_acc": 0.8893629913904108, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.31390786222932715, + "grad_norm": 0.11136578768491745, + "learning_rate": 0.00024646722689012946, + "loss": 0.3890235722064972, + "memory(GiB)": 78.26, + "step": 1620, + "token_acc": 0.8880463144161774, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.3141016325146539, + "grad_norm": 0.12551820278167725, + "learning_rate": 0.0002463935921335927, + "loss": 0.39017459750175476, + "memory(GiB)": 78.26, + "step": 1621, + "token_acc": 0.8850291533218363, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.31429540279998064, + "grad_norm": 0.12295407056808472, + "learning_rate": 0.00024631991778561747, + "loss": 0.4268419146537781, + "memory(GiB)": 78.26, + "step": 1622, + "token_acc": 0.8766729419818586, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.3144891730853074, + "grad_norm": 0.10500724613666534, + "learning_rate": 0.00024624620387646377, + "loss": 0.3578689992427826, + "memory(GiB)": 78.26, + "step": 1623, + "token_acc": 0.8940465211599549, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.3146829433706341, + "grad_norm": 0.1127677783370018, + "learning_rate": 0.0002461724504364079, + "loss": 0.41033974289894104, + "memory(GiB)": 78.26, + "step": 1624, + "token_acc": 0.8811105837683911, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.31487671365596087, + "grad_norm": 0.10852906107902527, + "learning_rate": 0.0002460986574957424, + "loss": 0.4147697687149048, + "memory(GiB)": 78.26, + "step": 1625, + "token_acc": 0.8803296877476621, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.3150704839412876, + "grad_norm": 0.10529500991106033, + "learning_rate": 0.000246024825084776, + "loss": 0.3946833312511444, + "memory(GiB)": 78.26, + "step": 1626, + "token_acc": 0.8831530219494435, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.31526425422661436, + "grad_norm": 0.10391051322221756, + "learning_rate": 0.00024595095323383365, + "loss": 0.3614901900291443, + "memory(GiB)": 78.26, + "step": 1627, + "token_acc": 0.8932863813825308, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.3154580245119411, + "grad_norm": 0.11637852340936661, + "learning_rate": 0.00024587704197325655, + "loss": 0.43050843477249146, + "memory(GiB)": 78.26, + "step": 1628, + "token_acc": 0.8762489252571971, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.31565179479726785, + "grad_norm": 0.11949366331100464, + "learning_rate": 0.0002458030913334019, + "loss": 0.4221411347389221, + "memory(GiB)": 78.26, + "step": 1629, + "token_acc": 0.8803364945335391, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.3158455650825946, + "grad_norm": 0.10639101266860962, + "learning_rate": 0.0002457291013446434, + "loss": 0.387704074382782, + "memory(GiB)": 78.26, + "step": 1630, + "token_acc": 0.8859933917540583, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.31603933536792134, + "grad_norm": 0.12497800588607788, + "learning_rate": 0.00024565507203737054, + "loss": 0.4306849539279938, + "memory(GiB)": 78.26, + "step": 1631, + "token_acc": 0.8744147105336599, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.3162331056532481, + "grad_norm": 0.10723750293254852, + "learning_rate": 0.0002455810034419893, + "loss": 0.41962188482284546, + "memory(GiB)": 78.26, + "step": 1632, + "token_acc": 0.8786517987789495, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.3164268759385748, + "grad_norm": 0.10330141335725784, + "learning_rate": 0.0002455068955889216, + "loss": 0.39232727885246277, + "memory(GiB)": 78.26, + "step": 1633, + "token_acc": 0.8855892466395748, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.31662064622390157, + "grad_norm": 0.09860360622406006, + "learning_rate": 0.0002454327485086055, + "loss": 0.37398630380630493, + "memory(GiB)": 78.26, + "step": 1634, + "token_acc": 0.8935310637039957, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.3168144165092283, + "grad_norm": 0.10215026140213013, + "learning_rate": 0.00024535856223149524, + "loss": 0.3739304840564728, + "memory(GiB)": 78.26, + "step": 1635, + "token_acc": 0.889224391616981, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.31700818679455506, + "grad_norm": 0.10608868300914764, + "learning_rate": 0.000245284336788061, + "loss": 0.40002018213272095, + "memory(GiB)": 78.26, + "step": 1636, + "token_acc": 0.8821046707934721, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.3172019570798818, + "grad_norm": 0.10536207258701324, + "learning_rate": 0.0002452100722087893, + "loss": 0.38114506006240845, + "memory(GiB)": 78.26, + "step": 1637, + "token_acc": 0.8877112648882257, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.31739572736520855, + "grad_norm": 0.10534202307462692, + "learning_rate": 0.00024513576852418256, + "loss": 0.3625592887401581, + "memory(GiB)": 78.26, + "step": 1638, + "token_acc": 0.8946749986647439, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.3175894976505353, + "grad_norm": 0.10395082831382751, + "learning_rate": 0.0002450614257647593, + "loss": 0.40679264068603516, + "memory(GiB)": 78.26, + "step": 1639, + "token_acc": 0.882569104812343, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.31778326793586203, + "grad_norm": 0.11221817880868912, + "learning_rate": 0.00024498704396105404, + "loss": 0.4055024981498718, + "memory(GiB)": 78.26, + "step": 1640, + "token_acc": 0.882479675964494, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.3179770382211888, + "grad_norm": 0.09974303096532822, + "learning_rate": 0.00024491262314361745, + "loss": 0.38338178396224976, + "memory(GiB)": 78.26, + "step": 1641, + "token_acc": 0.8880624860078108, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.3181708085065155, + "grad_norm": 0.10050232708454132, + "learning_rate": 0.0002448381633430161, + "loss": 0.35944464802742004, + "memory(GiB)": 78.26, + "step": 1642, + "token_acc": 0.8940914437298231, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.31836457879184227, + "grad_norm": 0.11217102408409119, + "learning_rate": 0.0002447636645898327, + "loss": 0.39601394534111023, + "memory(GiB)": 78.26, + "step": 1643, + "token_acc": 0.8860522531810413, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.318558349077169, + "grad_norm": 0.11227677762508392, + "learning_rate": 0.00024468912691466587, + "loss": 0.39023369550704956, + "memory(GiB)": 78.26, + "step": 1644, + "token_acc": 0.8874575239640919, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.31875211936249576, + "grad_norm": 0.10062558948993683, + "learning_rate": 0.00024461455034813017, + "loss": 0.37629178166389465, + "memory(GiB)": 78.26, + "step": 1645, + "token_acc": 0.8884143616228832, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.3189458896478225, + "grad_norm": 0.12126388400793076, + "learning_rate": 0.0002445399349208563, + "loss": 0.4465317726135254, + "memory(GiB)": 78.26, + "step": 1646, + "token_acc": 0.8706858890345129, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.31913965993314924, + "grad_norm": 0.10243180394172668, + "learning_rate": 0.00024446528066349074, + "loss": 0.3463560938835144, + "memory(GiB)": 78.26, + "step": 1647, + "token_acc": 0.8977993900931345, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.319333430218476, + "grad_norm": 0.09814903885126114, + "learning_rate": 0.00024439058760669603, + "loss": 0.3848741054534912, + "memory(GiB)": 78.26, + "step": 1648, + "token_acc": 0.8868541204061775, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.31952720050380273, + "grad_norm": 0.10367216914892197, + "learning_rate": 0.00024431585578115064, + "loss": 0.38090386986732483, + "memory(GiB)": 78.26, + "step": 1649, + "token_acc": 0.8876089324618737, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.3197209707891295, + "grad_norm": 0.11322217434644699, + "learning_rate": 0.00024424108521754886, + "loss": 0.3961893916130066, + "memory(GiB)": 78.26, + "step": 1650, + "token_acc": 0.8858755383929897, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.3199147410744562, + "grad_norm": 0.11258938163518906, + "learning_rate": 0.00024416627594660105, + "loss": 0.40742677450180054, + "memory(GiB)": 78.26, + "step": 1651, + "token_acc": 0.8804218539370391, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.32010851135978297, + "grad_norm": 0.10289296507835388, + "learning_rate": 0.00024409142799903342, + "loss": 0.37639862298965454, + "memory(GiB)": 78.26, + "step": 1652, + "token_acc": 0.8907713884992987, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.3203022816451097, + "grad_norm": 0.10312007367610931, + "learning_rate": 0.00024401654140558795, + "loss": 0.39015206694602966, + "memory(GiB)": 78.26, + "step": 1653, + "token_acc": 0.8851129761136217, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.32049605193043645, + "grad_norm": 0.09907987713813782, + "learning_rate": 0.00024394161619702257, + "loss": 0.35583794116973877, + "memory(GiB)": 78.26, + "step": 1654, + "token_acc": 0.8962283436398017, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.3206898222157632, + "grad_norm": 0.11065588891506195, + "learning_rate": 0.00024386665240411115, + "loss": 0.3847392499446869, + "memory(GiB)": 78.26, + "step": 1655, + "token_acc": 0.886949811533553, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.32088359250108994, + "grad_norm": 0.10860442370176315, + "learning_rate": 0.0002437916500576433, + "loss": 0.4008861184120178, + "memory(GiB)": 78.33, + "step": 1656, + "token_acc": 0.8821946594228897, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.3210773627864167, + "grad_norm": 0.11284346878528595, + "learning_rate": 0.0002437166091884244, + "loss": 0.3918432593345642, + "memory(GiB)": 78.33, + "step": 1657, + "token_acc": 0.8856465073739568, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.32127113307174343, + "grad_norm": 0.11324102431535721, + "learning_rate": 0.00024364152982727592, + "loss": 0.4203750789165497, + "memory(GiB)": 78.33, + "step": 1658, + "token_acc": 0.8781059802334023, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.3214649033570702, + "grad_norm": 0.1033095270395279, + "learning_rate": 0.0002435664120050349, + "loss": 0.3659226894378662, + "memory(GiB)": 78.33, + "step": 1659, + "token_acc": 0.8937110992198721, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.3216586736423969, + "grad_norm": 0.11475743353366852, + "learning_rate": 0.0002434912557525542, + "loss": 0.4008634686470032, + "memory(GiB)": 78.33, + "step": 1660, + "token_acc": 0.8813870157718026, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.32185244392772366, + "grad_norm": 0.11086868494749069, + "learning_rate": 0.0002434160611007026, + "loss": 0.4097922742366791, + "memory(GiB)": 78.33, + "step": 1661, + "token_acc": 0.8791088493742418, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.3220462142130504, + "grad_norm": 0.09878856688737869, + "learning_rate": 0.0002433408280803645, + "loss": 0.3229445517063141, + "memory(GiB)": 78.33, + "step": 1662, + "token_acc": 0.9044224128327949, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.32223998449837715, + "grad_norm": 0.1650267392396927, + "learning_rate": 0.00024326555672244012, + "loss": 0.3968316912651062, + "memory(GiB)": 78.33, + "step": 1663, + "token_acc": 0.8846646732165742, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.3224337547837039, + "grad_norm": 0.11585035920143127, + "learning_rate": 0.0002431902470578455, + "loss": 0.4106610417366028, + "memory(GiB)": 78.33, + "step": 1664, + "token_acc": 0.8812659238112779, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.32262752506903064, + "grad_norm": 0.10727370530366898, + "learning_rate": 0.00024311489911751224, + "loss": 0.3810875117778778, + "memory(GiB)": 78.33, + "step": 1665, + "token_acc": 0.8862208987547374, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.3228212953543574, + "grad_norm": 0.10232355445623398, + "learning_rate": 0.00024303951293238785, + "loss": 0.40213391184806824, + "memory(GiB)": 78.33, + "step": 1666, + "token_acc": 0.8846539618856569, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.32301506563968413, + "grad_norm": 0.10427332669496536, + "learning_rate": 0.00024296408853343544, + "loss": 0.381198525428772, + "memory(GiB)": 78.33, + "step": 1667, + "token_acc": 0.886510858088634, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.3232088359250109, + "grad_norm": 0.11577033996582031, + "learning_rate": 0.0002428886259516338, + "loss": 0.38930296897888184, + "memory(GiB)": 78.33, + "step": 1668, + "token_acc": 0.8849813571961223, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.3234026062103376, + "grad_norm": 0.10479265451431274, + "learning_rate": 0.0002428131252179775, + "loss": 0.38501253724098206, + "memory(GiB)": 78.33, + "step": 1669, + "token_acc": 0.8847304574878965, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.3235963764956644, + "grad_norm": 0.10768333077430725, + "learning_rate": 0.00024273758636347663, + "loss": 0.4145703613758087, + "memory(GiB)": 78.33, + "step": 1670, + "token_acc": 0.8764765532929537, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.32379014678099116, + "grad_norm": 0.1042892262339592, + "learning_rate": 0.00024266200941915712, + "loss": 0.3692252039909363, + "memory(GiB)": 78.33, + "step": 1671, + "token_acc": 0.8922046134447549, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.3239839170663179, + "grad_norm": 0.11078062653541565, + "learning_rate": 0.00024258639441606042, + "loss": 0.4250616133213043, + "memory(GiB)": 78.33, + "step": 1672, + "token_acc": 0.876742678050362, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.32417768735164465, + "grad_norm": 0.10978075116872787, + "learning_rate": 0.00024251074138524365, + "loss": 0.40720629692077637, + "memory(GiB)": 78.33, + "step": 1673, + "token_acc": 0.8835114016903205, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.3243714576369714, + "grad_norm": 0.09793038666248322, + "learning_rate": 0.00024243505035777954, + "loss": 0.34907984733581543, + "memory(GiB)": 78.33, + "step": 1674, + "token_acc": 0.8983231559561888, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.32456522792229814, + "grad_norm": 0.11205164343118668, + "learning_rate": 0.0002423593213647564, + "loss": 0.3938961625099182, + "memory(GiB)": 78.33, + "step": 1675, + "token_acc": 0.8874208948595537, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.3247589982076249, + "grad_norm": 0.12075439840555191, + "learning_rate": 0.0002422835544372782, + "loss": 0.42155468463897705, + "memory(GiB)": 78.33, + "step": 1676, + "token_acc": 0.8779719830355995, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.3249527684929516, + "grad_norm": 0.10721995681524277, + "learning_rate": 0.0002422077496064644, + "loss": 0.4283701479434967, + "memory(GiB)": 78.33, + "step": 1677, + "token_acc": 0.874123831775701, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.32514653877827837, + "grad_norm": 0.12848171591758728, + "learning_rate": 0.00024213190690345018, + "loss": 0.4088367521762848, + "memory(GiB)": 78.33, + "step": 1678, + "token_acc": 0.8799862555836692, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.3253403090636051, + "grad_norm": 0.10981763154268265, + "learning_rate": 0.00024205602635938604, + "loss": 0.41480377316474915, + "memory(GiB)": 78.33, + "step": 1679, + "token_acc": 0.878927367125194, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.32553407934893186, + "grad_norm": 0.10942694544792175, + "learning_rate": 0.0002419801080054383, + "loss": 0.39324793219566345, + "memory(GiB)": 78.33, + "step": 1680, + "token_acc": 0.8848377444002208, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.3257278496342586, + "grad_norm": 0.10424201935529709, + "learning_rate": 0.00024190415187278855, + "loss": 0.38931453227996826, + "memory(GiB)": 78.33, + "step": 1681, + "token_acc": 0.8873810462300805, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.32592161991958535, + "grad_norm": 0.10016170144081116, + "learning_rate": 0.0002418281579926341, + "loss": 0.3769393861293793, + "memory(GiB)": 78.33, + "step": 1682, + "token_acc": 0.888585472419443, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.3261153902049121, + "grad_norm": 0.10119081288576126, + "learning_rate": 0.0002417521263961876, + "loss": 0.34021657705307007, + "memory(GiB)": 78.33, + "step": 1683, + "token_acc": 0.8990787809976002, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.32630916049023884, + "grad_norm": 0.10256388038396835, + "learning_rate": 0.00024167605711467738, + "loss": 0.3520572781562805, + "memory(GiB)": 78.33, + "step": 1684, + "token_acc": 0.8965411006861784, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.3265029307755656, + "grad_norm": 0.10342499613761902, + "learning_rate": 0.00024159995017934702, + "loss": 0.37822040915489197, + "memory(GiB)": 78.33, + "step": 1685, + "token_acc": 0.8892546779308175, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.3266967010608923, + "grad_norm": 0.10906314849853516, + "learning_rate": 0.00024152380562145575, + "loss": 0.3807571232318878, + "memory(GiB)": 78.33, + "step": 1686, + "token_acc": 0.8904570071765867, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.32689047134621907, + "grad_norm": 0.10030427575111389, + "learning_rate": 0.00024144762347227822, + "loss": 0.35491418838500977, + "memory(GiB)": 78.33, + "step": 1687, + "token_acc": 0.8963004898220396, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.3270842416315458, + "grad_norm": 0.10785649716854095, + "learning_rate": 0.0002413714037631044, + "loss": 0.4118516743183136, + "memory(GiB)": 78.33, + "step": 1688, + "token_acc": 0.8789999213774667, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.32727801191687256, + "grad_norm": 0.10498232394456863, + "learning_rate": 0.00024129514652523976, + "loss": 0.35566800832748413, + "memory(GiB)": 78.33, + "step": 1689, + "token_acc": 0.8963449018301123, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.3274717822021993, + "grad_norm": 0.09992900490760803, + "learning_rate": 0.0002412188517900053, + "loss": 0.3673107326030731, + "memory(GiB)": 78.33, + "step": 1690, + "token_acc": 0.891633487007544, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.32766555248752605, + "grad_norm": 0.10761499404907227, + "learning_rate": 0.00024114251958873726, + "loss": 0.39024174213409424, + "memory(GiB)": 78.33, + "step": 1691, + "token_acc": 0.886815954076521, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.3278593227728528, + "grad_norm": 0.10518249124288559, + "learning_rate": 0.00024106614995278731, + "loss": 0.38077983260154724, + "memory(GiB)": 78.33, + "step": 1692, + "token_acc": 0.8875674808915496, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.32805309305817953, + "grad_norm": 0.10248047858476639, + "learning_rate": 0.00024098974291352255, + "loss": 0.36225420236587524, + "memory(GiB)": 78.33, + "step": 1693, + "token_acc": 0.8952485416562797, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.3282468633435063, + "grad_norm": 0.10516565293073654, + "learning_rate": 0.00024091329850232536, + "loss": 0.391397625207901, + "memory(GiB)": 78.33, + "step": 1694, + "token_acc": 0.8858848043629254, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.328440633628833, + "grad_norm": 0.1099853664636612, + "learning_rate": 0.00024083681675059356, + "loss": 0.38383805751800537, + "memory(GiB)": 78.33, + "step": 1695, + "token_acc": 0.8891934663426214, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.32863440391415977, + "grad_norm": 0.11349144577980042, + "learning_rate": 0.00024076029768974025, + "loss": 0.4004535675048828, + "memory(GiB)": 78.33, + "step": 1696, + "token_acc": 0.883281280229955, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.3288281741994865, + "grad_norm": 0.11258336901664734, + "learning_rate": 0.00024068374135119384, + "loss": 0.4041019678115845, + "memory(GiB)": 78.33, + "step": 1697, + "token_acc": 0.8841508871328463, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.32902194448481326, + "grad_norm": 0.10647819191217422, + "learning_rate": 0.00024060714776639813, + "loss": 0.36608096957206726, + "memory(GiB)": 78.33, + "step": 1698, + "token_acc": 0.8930285460648598, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.32921571477014, + "grad_norm": 0.10232997685670853, + "learning_rate": 0.00024053051696681208, + "loss": 0.37514373660087585, + "memory(GiB)": 78.33, + "step": 1699, + "token_acc": 0.8901580589433514, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.32940948505546674, + "grad_norm": 0.10426058620214462, + "learning_rate": 0.00024045384898391007, + "loss": 0.3706841468811035, + "memory(GiB)": 78.33, + "step": 1700, + "token_acc": 0.8918139787870547, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.3296032553407935, + "grad_norm": 0.11431006342172623, + "learning_rate": 0.0002403771438491817, + "loss": 0.40189433097839355, + "memory(GiB)": 78.33, + "step": 1701, + "token_acc": 0.8844837336666964, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.32979702562612023, + "grad_norm": 0.11498620361089706, + "learning_rate": 0.00024030040159413185, + "loss": 0.4026452898979187, + "memory(GiB)": 78.33, + "step": 1702, + "token_acc": 0.8820736746475984, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.329990795911447, + "grad_norm": 0.10887457430362701, + "learning_rate": 0.0002402236222502805, + "loss": 0.3983825445175171, + "memory(GiB)": 78.33, + "step": 1703, + "token_acc": 0.8844022811427336, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.3301845661967737, + "grad_norm": 0.10820619016885757, + "learning_rate": 0.00024014680584916322, + "loss": 0.37078577280044556, + "memory(GiB)": 78.33, + "step": 1704, + "token_acc": 0.8899442436468578, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.33037833648210047, + "grad_norm": 0.12457577884197235, + "learning_rate": 0.00024006995242233038, + "loss": 0.4524819552898407, + "memory(GiB)": 78.33, + "step": 1705, + "token_acc": 0.867928674577761, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.3305721067674272, + "grad_norm": 0.11018163710832596, + "learning_rate": 0.0002399930620013478, + "loss": 0.39005085825920105, + "memory(GiB)": 78.33, + "step": 1706, + "token_acc": 0.8860519838801287, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.33076587705275395, + "grad_norm": 0.10681940615177155, + "learning_rate": 0.00023991613461779644, + "loss": 0.38101130723953247, + "memory(GiB)": 78.33, + "step": 1707, + "token_acc": 0.890979347101932, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.3309596473380807, + "grad_norm": 0.10825340449810028, + "learning_rate": 0.00023983917030327248, + "loss": 0.36492252349853516, + "memory(GiB)": 78.33, + "step": 1708, + "token_acc": 0.8953067555147058, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.33115341762340744, + "grad_norm": 0.0985020250082016, + "learning_rate": 0.00023976216908938719, + "loss": 0.36632204055786133, + "memory(GiB)": 78.33, + "step": 1709, + "token_acc": 0.8902666632577404, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.3313471879087342, + "grad_norm": 0.10576551407575607, + "learning_rate": 0.00023968513100776703, + "loss": 0.3914228081703186, + "memory(GiB)": 78.33, + "step": 1710, + "token_acc": 0.8861784230125749, + "train_speed(iter/s)": 0.03249 + }, + { + "epoch": 0.33154095819406093, + "grad_norm": 0.10542413592338562, + "learning_rate": 0.00023960805609005365, + "loss": 0.40728819370269775, + "memory(GiB)": 78.33, + "step": 1711, + "token_acc": 0.8833345786943635, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.3317347284793877, + "grad_norm": 0.10389473289251328, + "learning_rate": 0.0002395309443679038, + "loss": 0.3726048171520233, + "memory(GiB)": 78.33, + "step": 1712, + "token_acc": 0.8894928520954504, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.3319284987647144, + "grad_norm": 0.11621604114770889, + "learning_rate": 0.0002394537958729893, + "loss": 0.39150145649909973, + "memory(GiB)": 78.33, + "step": 1713, + "token_acc": 0.8860767814882987, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.33212226905004116, + "grad_norm": 0.13983365893363953, + "learning_rate": 0.00023937661063699707, + "loss": 0.4037204384803772, + "memory(GiB)": 78.33, + "step": 1714, + "token_acc": 0.882203057624461, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.3323160393353679, + "grad_norm": 0.11795634776353836, + "learning_rate": 0.00023929938869162928, + "loss": 0.43276962637901306, + "memory(GiB)": 78.33, + "step": 1715, + "token_acc": 0.875131748448296, + "train_speed(iter/s)": 0.032498 + }, + { + "epoch": 0.33250980962069465, + "grad_norm": 0.10593996942043304, + "learning_rate": 0.00023922213006860292, + "loss": 0.3847067654132843, + "memory(GiB)": 78.33, + "step": 1716, + "token_acc": 0.888629105839416, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.3327035799060214, + "grad_norm": 0.11202581226825714, + "learning_rate": 0.00023914483479965025, + "loss": 0.4195789396762848, + "memory(GiB)": 78.33, + "step": 1717, + "token_acc": 0.8793484873624247, + "train_speed(iter/s)": 0.0325 + }, + { + "epoch": 0.33289735019134814, + "grad_norm": 0.11114699393510818, + "learning_rate": 0.00023906750291651858, + "loss": 0.3933086395263672, + "memory(GiB)": 78.33, + "step": 1718, + "token_acc": 0.8864894491641545, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.3330911204766749, + "grad_norm": 0.11813364177942276, + "learning_rate": 0.00023899013445097007, + "loss": 0.39278751611709595, + "memory(GiB)": 78.33, + "step": 1719, + "token_acc": 0.8872432671919998, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.33328489076200163, + "grad_norm": 0.1009705662727356, + "learning_rate": 0.0002389127294347821, + "loss": 0.38593360781669617, + "memory(GiB)": 78.33, + "step": 1720, + "token_acc": 0.8872761447292852, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.3334786610473284, + "grad_norm": 0.10921531915664673, + "learning_rate": 0.000238835287899747, + "loss": 0.38367462158203125, + "memory(GiB)": 78.33, + "step": 1721, + "token_acc": 0.8886438230274386, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.3336724313326551, + "grad_norm": 0.1057095155119896, + "learning_rate": 0.00023875780987767204, + "loss": 0.3982135057449341, + "memory(GiB)": 78.33, + "step": 1722, + "token_acc": 0.8859919028340081, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.33386620161798186, + "grad_norm": 0.10099222511053085, + "learning_rate": 0.0002386802954003795, + "loss": 0.37414735555648804, + "memory(GiB)": 78.33, + "step": 1723, + "token_acc": 0.8888457355911139, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.3340599719033086, + "grad_norm": 0.10116644203662872, + "learning_rate": 0.0002386027444997068, + "loss": 0.4056088328361511, + "memory(GiB)": 78.33, + "step": 1724, + "token_acc": 0.8838143450628663, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.33425374218863535, + "grad_norm": 0.0998440757393837, + "learning_rate": 0.000238525157207506, + "loss": 0.3904368281364441, + "memory(GiB)": 78.33, + "step": 1725, + "token_acc": 0.8871362252283732, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.3344475124739621, + "grad_norm": 0.1074642464518547, + "learning_rate": 0.0002384475335556444, + "loss": 0.3967469334602356, + "memory(GiB)": 78.33, + "step": 1726, + "token_acc": 0.8839466666666667, + "train_speed(iter/s)": 0.032514 + }, + { + "epoch": 0.33464128275928884, + "grad_norm": 0.11831879615783691, + "learning_rate": 0.00023836987357600414, + "loss": 0.45349621772766113, + "memory(GiB)": 78.33, + "step": 1727, + "token_acc": 0.8706680645349176, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.3348350530446156, + "grad_norm": 0.11764407902956009, + "learning_rate": 0.00023829217730048219, + "loss": 0.4561113119125366, + "memory(GiB)": 78.33, + "step": 1728, + "token_acc": 0.8694065038177228, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.3350288233299423, + "grad_norm": 0.1027785986661911, + "learning_rate": 0.00023821444476099048, + "loss": 0.38838091492652893, + "memory(GiB)": 78.33, + "step": 1729, + "token_acc": 0.8851640132492787, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.33522259361526907, + "grad_norm": 0.1072501540184021, + "learning_rate": 0.0002381366759894559, + "loss": 0.3992714285850525, + "memory(GiB)": 78.33, + "step": 1730, + "token_acc": 0.8841534590542398, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.33541636390059587, + "grad_norm": 0.09161798655986786, + "learning_rate": 0.00023805887101782018, + "loss": 0.3421012759208679, + "memory(GiB)": 78.33, + "step": 1731, + "token_acc": 0.8976631008677287, + "train_speed(iter/s)": 0.032521 + }, + { + "epoch": 0.3356101341859226, + "grad_norm": 0.12624742090702057, + "learning_rate": 0.00023798102987803994, + "loss": 0.47877341508865356, + "memory(GiB)": 78.33, + "step": 1732, + "token_acc": 0.8629468940234756, + "train_speed(iter/s)": 0.032523 + }, + { + "epoch": 0.33580390447124936, + "grad_norm": 0.11039040237665176, + "learning_rate": 0.00023790315260208654, + "loss": 0.373019278049469, + "memory(GiB)": 78.33, + "step": 1733, + "token_acc": 0.8893133633557658, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.3359976747565761, + "grad_norm": 0.11160407960414886, + "learning_rate": 0.0002378252392219463, + "loss": 0.3813738524913788, + "memory(GiB)": 78.33, + "step": 1734, + "token_acc": 0.887247745813654, + "train_speed(iter/s)": 0.032526 + }, + { + "epoch": 0.33619144504190285, + "grad_norm": 0.11238150298595428, + "learning_rate": 0.0002377472897696204, + "loss": 0.41692203283309937, + "memory(GiB)": 78.33, + "step": 1735, + "token_acc": 0.8769641013877738, + "train_speed(iter/s)": 0.032528 + }, + { + "epoch": 0.3363852153272296, + "grad_norm": 0.1071707084774971, + "learning_rate": 0.00023766930427712471, + "loss": 0.39206287264823914, + "memory(GiB)": 78.33, + "step": 1736, + "token_acc": 0.8842453737762436, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.33657898561255634, + "grad_norm": 0.10300777852535248, + "learning_rate": 0.00023759128277649, + "loss": 0.3877497613430023, + "memory(GiB)": 78.33, + "step": 1737, + "token_acc": 0.8867694590155248, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.3367727558978831, + "grad_norm": 0.1066984236240387, + "learning_rate": 0.0002375132252997618, + "loss": 0.3994975686073303, + "memory(GiB)": 78.33, + "step": 1738, + "token_acc": 0.8828811405205282, + "train_speed(iter/s)": 0.032532 + }, + { + "epoch": 0.3369665261832098, + "grad_norm": 0.100070521235466, + "learning_rate": 0.00023743513187900037, + "loss": 0.36340388655662537, + "memory(GiB)": 78.33, + "step": 1739, + "token_acc": 0.8943034295679132, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.33716029646853657, + "grad_norm": 0.10749375075101852, + "learning_rate": 0.00023735700254628078, + "loss": 0.3817083537578583, + "memory(GiB)": 78.33, + "step": 1740, + "token_acc": 0.8865700752298691, + "train_speed(iter/s)": 0.032535 + }, + { + "epoch": 0.3373540667538633, + "grad_norm": 0.1144825890660286, + "learning_rate": 0.00023727883733369292, + "loss": 0.3933083415031433, + "memory(GiB)": 78.33, + "step": 1741, + "token_acc": 0.8860413411849537, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.33754783703919006, + "grad_norm": 0.11627618223428726, + "learning_rate": 0.00023720063627334124, + "loss": 0.38191571831703186, + "memory(GiB)": 78.33, + "step": 1742, + "token_acc": 0.8882073834046985, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.3377416073245168, + "grad_norm": 0.10807470977306366, + "learning_rate": 0.00023712239939734512, + "loss": 0.3824315667152405, + "memory(GiB)": 78.33, + "step": 1743, + "token_acc": 0.8895115896074527, + "train_speed(iter/s)": 0.032539 + }, + { + "epoch": 0.33793537760984355, + "grad_norm": 0.11028466373682022, + "learning_rate": 0.00023704412673783852, + "loss": 0.4183183014392853, + "memory(GiB)": 78.33, + "step": 1744, + "token_acc": 0.8795684778318642, + "train_speed(iter/s)": 0.032541 + }, + { + "epoch": 0.3381291478951703, + "grad_norm": 0.10906746983528137, + "learning_rate": 0.00023696581832697002, + "loss": 0.38075122237205505, + "memory(GiB)": 78.33, + "step": 1745, + "token_acc": 0.8904931905328574, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.33832291818049703, + "grad_norm": 0.10776349902153015, + "learning_rate": 0.00023688747419690312, + "loss": 0.39934486150741577, + "memory(GiB)": 78.33, + "step": 1746, + "token_acc": 0.8850526197309921, + "train_speed(iter/s)": 0.032544 + }, + { + "epoch": 0.3385166884658238, + "grad_norm": 0.12053883075714111, + "learning_rate": 0.00023680909437981583, + "loss": 0.4140859544277191, + "memory(GiB)": 78.33, + "step": 1747, + "token_acc": 0.8816291216714198, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.3387104587511505, + "grad_norm": 0.12190855294466019, + "learning_rate": 0.00023673067890790078, + "loss": 0.46971094608306885, + "memory(GiB)": 78.33, + "step": 1748, + "token_acc": 0.8653503654117791, + "train_speed(iter/s)": 0.032547 + }, + { + "epoch": 0.33890422903647727, + "grad_norm": 0.10787336528301239, + "learning_rate": 0.00023665222781336538, + "loss": 0.3823373317718506, + "memory(GiB)": 78.33, + "step": 1749, + "token_acc": 0.8879386276459724, + "train_speed(iter/s)": 0.032549 + }, + { + "epoch": 0.339097999321804, + "grad_norm": 0.10566065460443497, + "learning_rate": 0.0002365737411284316, + "loss": 0.36630237102508545, + "memory(GiB)": 78.33, + "step": 1750, + "token_acc": 0.8929974516882565, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.33929176960713076, + "grad_norm": 0.11304379254579544, + "learning_rate": 0.000236495218885336, + "loss": 0.41917577385902405, + "memory(GiB)": 78.33, + "step": 1751, + "token_acc": 0.8782786241558034, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.3394855398924575, + "grad_norm": 0.1184999868273735, + "learning_rate": 0.00023641666111632977, + "loss": 0.436492919921875, + "memory(GiB)": 78.33, + "step": 1752, + "token_acc": 0.8726875365530827, + "train_speed(iter/s)": 0.032553 + }, + { + "epoch": 0.33967931017778424, + "grad_norm": 0.11077001690864563, + "learning_rate": 0.00023633806785367873, + "loss": 0.35720402002334595, + "memory(GiB)": 78.33, + "step": 1753, + "token_acc": 0.8934273097826086, + "train_speed(iter/s)": 0.032554 + }, + { + "epoch": 0.339873080463111, + "grad_norm": 0.10461857914924622, + "learning_rate": 0.00023625943912966322, + "loss": 0.366897851228714, + "memory(GiB)": 78.33, + "step": 1754, + "token_acc": 0.8917701004031939, + "train_speed(iter/s)": 0.032556 + }, + { + "epoch": 0.34006685074843773, + "grad_norm": 0.1258544921875, + "learning_rate": 0.0002361807749765782, + "loss": 0.43176522850990295, + "memory(GiB)": 78.33, + "step": 1755, + "token_acc": 0.875682894257618, + "train_speed(iter/s)": 0.032557 + }, + { + "epoch": 0.3402606210337645, + "grad_norm": 0.10822499543428421, + "learning_rate": 0.0002361020754267331, + "loss": 0.33935534954071045, + "memory(GiB)": 78.33, + "step": 1756, + "token_acc": 0.9004844907278499, + "train_speed(iter/s)": 0.032559 + }, + { + "epoch": 0.3404543913190912, + "grad_norm": 0.1091342493891716, + "learning_rate": 0.00023602334051245195, + "loss": 0.38166943192481995, + "memory(GiB)": 78.33, + "step": 1757, + "token_acc": 0.8881519274376417, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.34064816160441796, + "grad_norm": 0.0942879170179367, + "learning_rate": 0.00023594457026607335, + "loss": 0.3533993363380432, + "memory(GiB)": 78.33, + "step": 1758, + "token_acc": 0.8947904639000204, + "train_speed(iter/s)": 0.032562 + }, + { + "epoch": 0.3408419318897447, + "grad_norm": 0.10075850039720535, + "learning_rate": 0.00023586576471995035, + "loss": 0.35853201150894165, + "memory(GiB)": 78.33, + "step": 1759, + "token_acc": 0.8928562603474264, + "train_speed(iter/s)": 0.032563 + }, + { + "epoch": 0.34103570217507145, + "grad_norm": 0.10983631759881973, + "learning_rate": 0.00023578692390645043, + "loss": 0.38611915707588196, + "memory(GiB)": 78.33, + "step": 1760, + "token_acc": 0.8869917407878017, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.3412294724603982, + "grad_norm": 0.10747679322957993, + "learning_rate": 0.00023570804785795572, + "loss": 0.3835248649120331, + "memory(GiB)": 78.33, + "step": 1761, + "token_acc": 0.8901472855851544, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.34142324274572494, + "grad_norm": 0.11863560974597931, + "learning_rate": 0.00023562913660686263, + "loss": 0.4406958222389221, + "memory(GiB)": 78.33, + "step": 1762, + "token_acc": 0.8729787360066339, + "train_speed(iter/s)": 0.032567 + }, + { + "epoch": 0.3416170130310517, + "grad_norm": 0.11419124901294708, + "learning_rate": 0.00023555019018558224, + "loss": 0.39518317580223083, + "memory(GiB)": 78.33, + "step": 1763, + "token_acc": 0.88392173041787, + "train_speed(iter/s)": 0.032569 + }, + { + "epoch": 0.34181078331637843, + "grad_norm": 0.11686435341835022, + "learning_rate": 0.0002354712086265399, + "loss": 0.43183597922325134, + "memory(GiB)": 78.33, + "step": 1764, + "token_acc": 0.8757511860832894, + "train_speed(iter/s)": 0.03257 + }, + { + "epoch": 0.3420045536017052, + "grad_norm": 0.10107649117708206, + "learning_rate": 0.0002353921919621755, + "loss": 0.39429807662963867, + "memory(GiB)": 78.33, + "step": 1765, + "token_acc": 0.8837560239497639, + "train_speed(iter/s)": 0.032572 + }, + { + "epoch": 0.3421983238870319, + "grad_norm": 0.10357075929641724, + "learning_rate": 0.00023531314022494324, + "loss": 0.3501340448856354, + "memory(GiB)": 78.33, + "step": 1766, + "token_acc": 0.8969428540563897, + "train_speed(iter/s)": 0.032573 + }, + { + "epoch": 0.34239209417235866, + "grad_norm": 0.0919223427772522, + "learning_rate": 0.0002352340534473119, + "loss": 0.3332497477531433, + "memory(GiB)": 78.33, + "step": 1767, + "token_acc": 0.9006399880469158, + "train_speed(iter/s)": 0.032574 + }, + { + "epoch": 0.3425858644576854, + "grad_norm": 0.10394643247127533, + "learning_rate": 0.00023515493166176442, + "loss": 0.36848586797714233, + "memory(GiB)": 78.33, + "step": 1768, + "token_acc": 0.8926898258640502, + "train_speed(iter/s)": 0.032576 + }, + { + "epoch": 0.34277963474301215, + "grad_norm": 0.10993330925703049, + "learning_rate": 0.00023507577490079832, + "loss": 0.41358712315559387, + "memory(GiB)": 78.33, + "step": 1769, + "token_acc": 0.8809283276450512, + "train_speed(iter/s)": 0.032577 + }, + { + "epoch": 0.3429734050283389, + "grad_norm": 0.10172808170318604, + "learning_rate": 0.00023499658319692542, + "loss": 0.3734714686870575, + "memory(GiB)": 78.33, + "step": 1770, + "token_acc": 0.8903681089061836, + "train_speed(iter/s)": 0.032579 + }, + { + "epoch": 0.34316717531366564, + "grad_norm": 0.10930664837360382, + "learning_rate": 0.00023491735658267182, + "loss": 0.37374699115753174, + "memory(GiB)": 78.33, + "step": 1771, + "token_acc": 0.8907253336311352, + "train_speed(iter/s)": 0.03258 + }, + { + "epoch": 0.3433609455989924, + "grad_norm": 0.1160641685128212, + "learning_rate": 0.0002348380950905781, + "loss": 0.3713539242744446, + "memory(GiB)": 78.33, + "step": 1772, + "token_acc": 0.8907073847185395, + "train_speed(iter/s)": 0.032582 + }, + { + "epoch": 0.34355471588431913, + "grad_norm": 0.11093118786811829, + "learning_rate": 0.000234758798753199, + "loss": 0.40102115273475647, + "memory(GiB)": 78.33, + "step": 1773, + "token_acc": 0.8830998248686515, + "train_speed(iter/s)": 0.032583 + }, + { + "epoch": 0.3437484861696459, + "grad_norm": 0.10876762866973877, + "learning_rate": 0.00023467946760310368, + "loss": 0.3644455671310425, + "memory(GiB)": 78.33, + "step": 1774, + "token_acc": 0.8928700486448923, + "train_speed(iter/s)": 0.032585 + }, + { + "epoch": 0.3439422564549726, + "grad_norm": 0.1010998785495758, + "learning_rate": 0.00023460010167287564, + "loss": 0.37988942861557007, + "memory(GiB)": 78.33, + "step": 1775, + "token_acc": 0.8903876473385219, + "train_speed(iter/s)": 0.032586 + }, + { + "epoch": 0.34413602674029936, + "grad_norm": 0.10760042816400528, + "learning_rate": 0.00023452070099511249, + "loss": 0.39640262722969055, + "memory(GiB)": 78.33, + "step": 1776, + "token_acc": 0.8831872960402909, + "train_speed(iter/s)": 0.032587 + }, + { + "epoch": 0.3443297970256261, + "grad_norm": 0.10678626596927643, + "learning_rate": 0.00023444126560242634, + "loss": 0.39018261432647705, + "memory(GiB)": 78.33, + "step": 1777, + "token_acc": 0.8859104540432472, + "train_speed(iter/s)": 0.032589 + }, + { + "epoch": 0.34452356731095285, + "grad_norm": 0.1159418597817421, + "learning_rate": 0.00023436179552744333, + "loss": 0.39666786789894104, + "memory(GiB)": 78.33, + "step": 1778, + "token_acc": 0.8834541062801933, + "train_speed(iter/s)": 0.03259 + }, + { + "epoch": 0.3447173375962796, + "grad_norm": 0.10813209414482117, + "learning_rate": 0.00023428229080280403, + "loss": 0.4084324836730957, + "memory(GiB)": 78.33, + "step": 1779, + "token_acc": 0.8810328586468665, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.34491110788160634, + "grad_norm": 0.1118760034441948, + "learning_rate": 0.00023420275146116318, + "loss": 0.4295273721218109, + "memory(GiB)": 78.33, + "step": 1780, + "token_acc": 0.876175413371675, + "train_speed(iter/s)": 0.032593 + }, + { + "epoch": 0.3451048781669331, + "grad_norm": 0.10425037890672684, + "learning_rate": 0.00023412317753518968, + "loss": 0.38702112436294556, + "memory(GiB)": 78.33, + "step": 1781, + "token_acc": 0.88680838382835, + "train_speed(iter/s)": 0.032595 + }, + { + "epoch": 0.3452986484522598, + "grad_norm": 0.11203489452600479, + "learning_rate": 0.0002340435690575666, + "loss": 0.36375969648361206, + "memory(GiB)": 78.33, + "step": 1782, + "token_acc": 0.8949717774449948, + "train_speed(iter/s)": 0.032596 + }, + { + "epoch": 0.34549241873758657, + "grad_norm": 0.10534288734197617, + "learning_rate": 0.00023396392606099144, + "loss": 0.37565791606903076, + "memory(GiB)": 78.33, + "step": 1783, + "token_acc": 0.8892402497364794, + "train_speed(iter/s)": 0.032597 + }, + { + "epoch": 0.3456861890229133, + "grad_norm": 0.10718753933906555, + "learning_rate": 0.00023388424857817566, + "loss": 0.37750717997550964, + "memory(GiB)": 78.33, + "step": 1784, + "token_acc": 0.8892705765022024, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.34587995930824006, + "grad_norm": 0.11478148400783539, + "learning_rate": 0.00023380453664184492, + "loss": 0.42205026745796204, + "memory(GiB)": 78.33, + "step": 1785, + "token_acc": 0.8783005541671194, + "train_speed(iter/s)": 0.0326 + }, + { + "epoch": 0.3460737295935668, + "grad_norm": 0.10276792198419571, + "learning_rate": 0.00023372479028473908, + "loss": 0.3618147671222687, + "memory(GiB)": 78.33, + "step": 1786, + "token_acc": 0.8970774091627172, + "train_speed(iter/s)": 0.032601 + }, + { + "epoch": 0.34626749987889355, + "grad_norm": 0.11334466934204102, + "learning_rate": 0.0002336450095396121, + "loss": 0.3954538404941559, + "memory(GiB)": 78.33, + "step": 1787, + "token_acc": 0.8844070906454381, + "train_speed(iter/s)": 0.032603 + }, + { + "epoch": 0.3464612701642203, + "grad_norm": 0.12592172622680664, + "learning_rate": 0.00023356519443923205, + "loss": 0.45259174704551697, + "memory(GiB)": 78.33, + "step": 1788, + "token_acc": 0.8711917422583673, + "train_speed(iter/s)": 0.032604 + }, + { + "epoch": 0.34665504044954704, + "grad_norm": 0.09646034240722656, + "learning_rate": 0.00023348534501638115, + "loss": 0.35588333010673523, + "memory(GiB)": 78.33, + "step": 1789, + "token_acc": 0.8962561097320675, + "train_speed(iter/s)": 0.032606 + }, + { + "epoch": 0.3468488107348738, + "grad_norm": 0.10078699886798859, + "learning_rate": 0.00023340546130385574, + "loss": 0.3622387647628784, + "memory(GiB)": 78.33, + "step": 1790, + "token_acc": 0.8921391399616544, + "train_speed(iter/s)": 0.032607 + }, + { + "epoch": 0.3470425810202005, + "grad_norm": 0.1054060235619545, + "learning_rate": 0.00023332554333446617, + "loss": 0.39420005679130554, + "memory(GiB)": 78.33, + "step": 1791, + "token_acc": 0.8858397062477569, + "train_speed(iter/s)": 0.032609 + }, + { + "epoch": 0.3472363513055273, + "grad_norm": 0.10570289194583893, + "learning_rate": 0.0002332455911410369, + "loss": 0.37943026423454285, + "memory(GiB)": 78.33, + "step": 1792, + "token_acc": 0.8889375085464242, + "train_speed(iter/s)": 0.03261 + }, + { + "epoch": 0.34743012159085407, + "grad_norm": 0.11169130355119705, + "learning_rate": 0.00023316560475640646, + "loss": 0.4090282917022705, + "memory(GiB)": 78.33, + "step": 1793, + "token_acc": 0.8791112570459148, + "train_speed(iter/s)": 0.032612 + }, + { + "epoch": 0.3476238918761808, + "grad_norm": 0.11017318069934845, + "learning_rate": 0.00023308558421342743, + "loss": 0.38704198598861694, + "memory(GiB)": 78.33, + "step": 1794, + "token_acc": 0.8879233394591757, + "train_speed(iter/s)": 0.032613 + }, + { + "epoch": 0.34781766216150756, + "grad_norm": 0.1121833398938179, + "learning_rate": 0.0002330055295449663, + "loss": 0.3916083872318268, + "memory(GiB)": 78.33, + "step": 1795, + "token_acc": 0.8842932088924959, + "train_speed(iter/s)": 0.032615 + }, + { + "epoch": 0.3480114324468343, + "grad_norm": 0.10421619564294815, + "learning_rate": 0.00023292544078390377, + "loss": 0.3865489065647125, + "memory(GiB)": 78.33, + "step": 1796, + "token_acc": 0.8867510896758053, + "train_speed(iter/s)": 0.032616 + }, + { + "epoch": 0.34820520273216105, + "grad_norm": 0.10179581493139267, + "learning_rate": 0.00023284531796313444, + "loss": 0.36844325065612793, + "memory(GiB)": 78.33, + "step": 1797, + "token_acc": 0.8937036571572201, + "train_speed(iter/s)": 0.032617 + }, + { + "epoch": 0.3483989730174878, + "grad_norm": 0.12043815106153488, + "learning_rate": 0.0002327651611155669, + "loss": 0.3939439058303833, + "memory(GiB)": 78.33, + "step": 1798, + "token_acc": 0.8850653819683414, + "train_speed(iter/s)": 0.032619 + }, + { + "epoch": 0.34859274330281453, + "grad_norm": 0.10927697271108627, + "learning_rate": 0.00023268497027412364, + "loss": 0.4002224802970886, + "memory(GiB)": 78.33, + "step": 1799, + "token_acc": 0.8828930991564099, + "train_speed(iter/s)": 0.03262 + }, + { + "epoch": 0.3487865135881413, + "grad_norm": 0.09782009571790695, + "learning_rate": 0.0002326047454717413, + "loss": 0.3829875886440277, + "memory(GiB)": 78.33, + "step": 1800, + "token_acc": 0.8896661604720058, + "train_speed(iter/s)": 0.032621 + }, + { + "epoch": 0.348980283873468, + "grad_norm": 0.09550854563713074, + "learning_rate": 0.0002325244867413703, + "loss": 0.34558382630348206, + "memory(GiB)": 78.33, + "step": 1801, + "token_acc": 0.897237290273748, + "train_speed(iter/s)": 0.032615 + }, + { + "epoch": 0.34917405415879477, + "grad_norm": 0.10129474103450775, + "learning_rate": 0.00023244419411597508, + "loss": 0.3574253022670746, + "memory(GiB)": 78.33, + "step": 1802, + "token_acc": 0.8967041694242224, + "train_speed(iter/s)": 0.032617 + }, + { + "epoch": 0.3493678244441215, + "grad_norm": 0.11037204414606094, + "learning_rate": 0.00023236386762853398, + "loss": 0.37563031911849976, + "memory(GiB)": 78.33, + "step": 1803, + "token_acc": 0.889018691588785, + "train_speed(iter/s)": 0.032618 + }, + { + "epoch": 0.34956159472944826, + "grad_norm": 0.10526623576879501, + "learning_rate": 0.00023228350731203923, + "loss": 0.37744981050491333, + "memory(GiB)": 78.33, + "step": 1804, + "token_acc": 0.8889050705114202, + "train_speed(iter/s)": 0.032619 + }, + { + "epoch": 0.349755365014775, + "grad_norm": 0.11370985209941864, + "learning_rate": 0.000232203113199497, + "loss": 0.38519221544265747, + "memory(GiB)": 78.33, + "step": 1805, + "token_acc": 0.8884900839518189, + "train_speed(iter/s)": 0.032621 + }, + { + "epoch": 0.34994913530010174, + "grad_norm": 0.11202775686979294, + "learning_rate": 0.00023212268532392733, + "loss": 0.40250468254089355, + "memory(GiB)": 78.33, + "step": 1806, + "token_acc": 0.882124123298167, + "train_speed(iter/s)": 0.032622 + }, + { + "epoch": 0.3501429055854285, + "grad_norm": 0.10065510869026184, + "learning_rate": 0.00023204222371836405, + "loss": 0.3592451810836792, + "memory(GiB)": 78.33, + "step": 1807, + "token_acc": 0.8944679954711814, + "train_speed(iter/s)": 0.032623 + }, + { + "epoch": 0.35033667587075523, + "grad_norm": 0.11026205122470856, + "learning_rate": 0.00023196172841585488, + "loss": 0.41680392622947693, + "memory(GiB)": 78.33, + "step": 1808, + "token_acc": 0.8787629092507141, + "train_speed(iter/s)": 0.032625 + }, + { + "epoch": 0.350530446156082, + "grad_norm": 0.10042080283164978, + "learning_rate": 0.00023188119944946147, + "loss": 0.36419713497161865, + "memory(GiB)": 78.33, + "step": 1809, + "token_acc": 0.8931367912134112, + "train_speed(iter/s)": 0.032626 + }, + { + "epoch": 0.3507242164414087, + "grad_norm": 0.12159177660942078, + "learning_rate": 0.00023180063685225924, + "loss": 0.3895268142223358, + "memory(GiB)": 78.33, + "step": 1810, + "token_acc": 0.884988553363131, + "train_speed(iter/s)": 0.032628 + }, + { + "epoch": 0.35091798672673546, + "grad_norm": 0.09737487882375717, + "learning_rate": 0.0002317200406573374, + "loss": 0.3568708002567291, + "memory(GiB)": 78.33, + "step": 1811, + "token_acc": 0.8948162336884952, + "train_speed(iter/s)": 0.032629 + }, + { + "epoch": 0.3511117570120622, + "grad_norm": 0.105777308344841, + "learning_rate": 0.00023163941089779892, + "loss": 0.3754398226737976, + "memory(GiB)": 78.33, + "step": 1812, + "token_acc": 0.8915336122076238, + "train_speed(iter/s)": 0.03263 + }, + { + "epoch": 0.35130552729738895, + "grad_norm": 0.10280515253543854, + "learning_rate": 0.00023155874760676069, + "loss": 0.3664350211620331, + "memory(GiB)": 78.33, + "step": 1813, + "token_acc": 0.8921140983953261, + "train_speed(iter/s)": 0.032632 + }, + { + "epoch": 0.3514992975827157, + "grad_norm": 0.10777980089187622, + "learning_rate": 0.00023147805081735325, + "loss": 0.3817192614078522, + "memory(GiB)": 78.33, + "step": 1814, + "token_acc": 0.889120756816917, + "train_speed(iter/s)": 0.032633 + }, + { + "epoch": 0.35169306786804244, + "grad_norm": 0.10182831436395645, + "learning_rate": 0.0002313973205627209, + "loss": 0.37246525287628174, + "memory(GiB)": 78.33, + "step": 1815, + "token_acc": 0.8915448019143845, + "train_speed(iter/s)": 0.032635 + }, + { + "epoch": 0.3518868381533692, + "grad_norm": 0.10677202045917511, + "learning_rate": 0.00023131655687602174, + "loss": 0.3982056677341461, + "memory(GiB)": 78.33, + "step": 1816, + "token_acc": 0.8849781395220943, + "train_speed(iter/s)": 0.032636 + }, + { + "epoch": 0.35208060843869593, + "grad_norm": 0.10674361884593964, + "learning_rate": 0.00023123575979042767, + "loss": 0.38551852107048035, + "memory(GiB)": 78.33, + "step": 1817, + "token_acc": 0.8879332677063364, + "train_speed(iter/s)": 0.032637 + }, + { + "epoch": 0.3522743787240227, + "grad_norm": 0.10658242553472519, + "learning_rate": 0.00023115492933912412, + "loss": 0.416787713766098, + "memory(GiB)": 78.33, + "step": 1818, + "token_acc": 0.8809510899034811, + "train_speed(iter/s)": 0.032639 + }, + { + "epoch": 0.3524681490093494, + "grad_norm": 0.10621768981218338, + "learning_rate": 0.00023107406555531042, + "loss": 0.37745460867881775, + "memory(GiB)": 78.33, + "step": 1819, + "token_acc": 0.8909349857578331, + "train_speed(iter/s)": 0.03264 + }, + { + "epoch": 0.35266191929467616, + "grad_norm": 0.11966609209775925, + "learning_rate": 0.00023099316847219944, + "loss": 0.43290236592292786, + "memory(GiB)": 78.33, + "step": 1820, + "token_acc": 0.8743744060817231, + "train_speed(iter/s)": 0.032642 + }, + { + "epoch": 0.3528556895800029, + "grad_norm": 0.11253046244382858, + "learning_rate": 0.00023091223812301778, + "loss": 0.4067782759666443, + "memory(GiB)": 78.33, + "step": 1821, + "token_acc": 0.8815392109108622, + "train_speed(iter/s)": 0.032643 + }, + { + "epoch": 0.35304945986532965, + "grad_norm": 0.09745576977729797, + "learning_rate": 0.00023083127454100573, + "loss": 0.36597940325737, + "memory(GiB)": 78.33, + "step": 1822, + "token_acc": 0.8929222319310948, + "train_speed(iter/s)": 0.032645 + }, + { + "epoch": 0.3532432301506564, + "grad_norm": 0.10735499858856201, + "learning_rate": 0.00023075027775941722, + "loss": 0.3935870826244354, + "memory(GiB)": 78.33, + "step": 1823, + "token_acc": 0.8852148867679449, + "train_speed(iter/s)": 0.032646 + }, + { + "epoch": 0.35343700043598314, + "grad_norm": 0.10999494791030884, + "learning_rate": 0.00023066924781151976, + "loss": 0.3992076814174652, + "memory(GiB)": 78.33, + "step": 1824, + "token_acc": 0.8823376247649356, + "train_speed(iter/s)": 0.032648 + }, + { + "epoch": 0.3536307707213099, + "grad_norm": 0.10191714018583298, + "learning_rate": 0.00023058818473059456, + "loss": 0.38821935653686523, + "memory(GiB)": 78.33, + "step": 1825, + "token_acc": 0.8841636697432286, + "train_speed(iter/s)": 0.032649 + }, + { + "epoch": 0.35382454100663663, + "grad_norm": 0.10946159809827805, + "learning_rate": 0.00023050708854993645, + "loss": 0.3874565660953522, + "memory(GiB)": 78.33, + "step": 1826, + "token_acc": 0.8877764842840512, + "train_speed(iter/s)": 0.03265 + }, + { + "epoch": 0.3540183112919634, + "grad_norm": 0.10883501917123795, + "learning_rate": 0.00023042595930285374, + "loss": 0.36587873101234436, + "memory(GiB)": 78.33, + "step": 1827, + "token_acc": 0.8928685827436592, + "train_speed(iter/s)": 0.032652 + }, + { + "epoch": 0.3542120815772901, + "grad_norm": 0.1008007600903511, + "learning_rate": 0.0002303447970226684, + "loss": 0.3552745282649994, + "memory(GiB)": 78.33, + "step": 1828, + "token_acc": 0.8956754006614093, + "train_speed(iter/s)": 0.032653 + }, + { + "epoch": 0.35440585186261686, + "grad_norm": 0.1125805526971817, + "learning_rate": 0.00023026360174271593, + "loss": 0.4129788279533386, + "memory(GiB)": 78.33, + "step": 1829, + "token_acc": 0.8825399481301766, + "train_speed(iter/s)": 0.032655 + }, + { + "epoch": 0.3545996221479436, + "grad_norm": 0.11501467972993851, + "learning_rate": 0.00023018237349634553, + "loss": 0.41495800018310547, + "memory(GiB)": 78.33, + "step": 1830, + "token_acc": 0.8798725839542166, + "train_speed(iter/s)": 0.032656 + }, + { + "epoch": 0.35479339243327035, + "grad_norm": 0.10721118748188019, + "learning_rate": 0.00023010111231691973, + "loss": 0.36478832364082336, + "memory(GiB)": 78.33, + "step": 1831, + "token_acc": 0.891846109675567, + "train_speed(iter/s)": 0.032657 + }, + { + "epoch": 0.3549871627185971, + "grad_norm": 0.11395079642534256, + "learning_rate": 0.00023001981823781472, + "loss": 0.3958333730697632, + "memory(GiB)": 78.33, + "step": 1832, + "token_acc": 0.8860460306674152, + "train_speed(iter/s)": 0.032659 + }, + { + "epoch": 0.35518093300392384, + "grad_norm": 0.11087855696678162, + "learning_rate": 0.00022993849129242014, + "loss": 0.4031826853752136, + "memory(GiB)": 78.33, + "step": 1833, + "token_acc": 0.882696344865269, + "train_speed(iter/s)": 0.03266 + }, + { + "epoch": 0.3553747032892506, + "grad_norm": 0.1058662161231041, + "learning_rate": 0.00022985713151413913, + "loss": 0.37713879346847534, + "memory(GiB)": 78.33, + "step": 1834, + "token_acc": 0.8884106402887314, + "train_speed(iter/s)": 0.032661 + }, + { + "epoch": 0.3555684735745773, + "grad_norm": 0.10403905063867569, + "learning_rate": 0.00022977573893638836, + "loss": 0.3826426863670349, + "memory(GiB)": 78.33, + "step": 1835, + "token_acc": 0.8869446715726356, + "train_speed(iter/s)": 0.032663 + }, + { + "epoch": 0.35576224385990407, + "grad_norm": 0.09630994498729706, + "learning_rate": 0.00022969431359259797, + "loss": 0.35036543011665344, + "memory(GiB)": 78.33, + "step": 1836, + "token_acc": 0.8947007008903202, + "train_speed(iter/s)": 0.032664 + }, + { + "epoch": 0.3559560141452308, + "grad_norm": 0.12186194956302643, + "learning_rate": 0.0002296128555162115, + "loss": 0.4338938593864441, + "memory(GiB)": 78.33, + "step": 1837, + "token_acc": 0.8740203761755486, + "train_speed(iter/s)": 0.032665 + }, + { + "epoch": 0.35614978443055756, + "grad_norm": 0.11005796492099762, + "learning_rate": 0.000229531364740686, + "loss": 0.37460899353027344, + "memory(GiB)": 78.33, + "step": 1838, + "token_acc": 0.8911450261367517, + "train_speed(iter/s)": 0.032667 + }, + { + "epoch": 0.3563435547158843, + "grad_norm": 0.10339885205030441, + "learning_rate": 0.00022944984129949196, + "loss": 0.3677971661090851, + "memory(GiB)": 78.33, + "step": 1839, + "token_acc": 0.8935775621157461, + "train_speed(iter/s)": 0.032668 + }, + { + "epoch": 0.35653732500121105, + "grad_norm": 0.10451404750347137, + "learning_rate": 0.00022936828522611316, + "loss": 0.3537764847278595, + "memory(GiB)": 78.33, + "step": 1840, + "token_acc": 0.8961053422788898, + "train_speed(iter/s)": 0.032669 + }, + { + "epoch": 0.3567310952865378, + "grad_norm": 0.09972133487462997, + "learning_rate": 0.00022928669655404688, + "loss": 0.39150598645210266, + "memory(GiB)": 78.33, + "step": 1841, + "token_acc": 0.885431667414245, + "train_speed(iter/s)": 0.032671 + }, + { + "epoch": 0.35692486557186454, + "grad_norm": 0.10803169012069702, + "learning_rate": 0.0002292050753168038, + "loss": 0.3714596629142761, + "memory(GiB)": 78.33, + "step": 1842, + "token_acc": 0.8923362209736792, + "train_speed(iter/s)": 0.032672 + }, + { + "epoch": 0.3571186358571913, + "grad_norm": 0.1168799102306366, + "learning_rate": 0.00022912342154790804, + "loss": 0.421195387840271, + "memory(GiB)": 78.33, + "step": 1843, + "token_acc": 0.8766641213771124, + "train_speed(iter/s)": 0.032674 + }, + { + "epoch": 0.357312406142518, + "grad_norm": 0.10394692420959473, + "learning_rate": 0.00022904173528089686, + "loss": 0.4081256091594696, + "memory(GiB)": 78.33, + "step": 1844, + "token_acc": 0.8812434789587736, + "train_speed(iter/s)": 0.032675 + }, + { + "epoch": 0.35750617642784477, + "grad_norm": 0.0959138348698616, + "learning_rate": 0.00022896001654932105, + "loss": 0.3497539162635803, + "memory(GiB)": 78.33, + "step": 1845, + "token_acc": 0.8946740342957947, + "train_speed(iter/s)": 0.032677 + }, + { + "epoch": 0.3576999467131715, + "grad_norm": 0.10984820127487183, + "learning_rate": 0.0002288782653867448, + "loss": 0.4027239680290222, + "memory(GiB)": 78.33, + "step": 1846, + "token_acc": 0.8789472180150316, + "train_speed(iter/s)": 0.032678 + }, + { + "epoch": 0.35789371699849826, + "grad_norm": 0.11082874238491058, + "learning_rate": 0.0002287964818267453, + "loss": 0.3869192600250244, + "memory(GiB)": 78.33, + "step": 1847, + "token_acc": 0.8861776989530293, + "train_speed(iter/s)": 0.032679 + }, + { + "epoch": 0.358087487283825, + "grad_norm": 0.10880383849143982, + "learning_rate": 0.0002287146659029134, + "loss": 0.39860600233078003, + "memory(GiB)": 78.33, + "step": 1848, + "token_acc": 0.8812581533555588, + "train_speed(iter/s)": 0.032681 + }, + { + "epoch": 0.35828125756915175, + "grad_norm": 0.11083029955625534, + "learning_rate": 0.00022863281764885315, + "loss": 0.3671538233757019, + "memory(GiB)": 78.33, + "step": 1849, + "token_acc": 0.8913858792713418, + "train_speed(iter/s)": 0.032682 + }, + { + "epoch": 0.3584750278544785, + "grad_norm": 0.1035129502415657, + "learning_rate": 0.00022855093709818168, + "loss": 0.35645681619644165, + "memory(GiB)": 78.33, + "step": 1850, + "token_acc": 0.8927785347447856, + "train_speed(iter/s)": 0.032683 + }, + { + "epoch": 0.35866879813980523, + "grad_norm": 0.11510751396417618, + "learning_rate": 0.00022846902428452957, + "loss": 0.4069887101650238, + "memory(GiB)": 78.33, + "step": 1851, + "token_acc": 0.8827348409542743, + "train_speed(iter/s)": 0.032685 + }, + { + "epoch": 0.35886256842513203, + "grad_norm": 0.11083865165710449, + "learning_rate": 0.00022838707924154072, + "loss": 0.3828306794166565, + "memory(GiB)": 78.33, + "step": 1852, + "token_acc": 0.8873923603444469, + "train_speed(iter/s)": 0.032686 + }, + { + "epoch": 0.3590563387104588, + "grad_norm": 0.11098314076662064, + "learning_rate": 0.00022830510200287204, + "loss": 0.4004696309566498, + "memory(GiB)": 78.33, + "step": 1853, + "token_acc": 0.8826161182080414, + "train_speed(iter/s)": 0.032688 + }, + { + "epoch": 0.3592501089957855, + "grad_norm": 0.1142255887389183, + "learning_rate": 0.00022822309260219382, + "loss": 0.39147108793258667, + "memory(GiB)": 78.33, + "step": 1854, + "token_acc": 0.8852366362257191, + "train_speed(iter/s)": 0.032689 + }, + { + "epoch": 0.35944387928111227, + "grad_norm": 0.10014226287603378, + "learning_rate": 0.00022814105107318952, + "loss": 0.35710304975509644, + "memory(GiB)": 78.33, + "step": 1855, + "token_acc": 0.8953069555241322, + "train_speed(iter/s)": 0.03269 + }, + { + "epoch": 0.359637649566439, + "grad_norm": 0.10325758904218674, + "learning_rate": 0.00022805897744955587, + "loss": 0.3418915569782257, + "memory(GiB)": 78.33, + "step": 1856, + "token_acc": 0.899773435287454, + "train_speed(iter/s)": 0.032691 + }, + { + "epoch": 0.35983141985176575, + "grad_norm": 0.10661393404006958, + "learning_rate": 0.00022797687176500257, + "loss": 0.36435335874557495, + "memory(GiB)": 78.33, + "step": 1857, + "token_acc": 0.8942655906926346, + "train_speed(iter/s)": 0.032693 + }, + { + "epoch": 0.3600251901370925, + "grad_norm": 0.11469036340713501, + "learning_rate": 0.0002278947340532528, + "loss": 0.415936678647995, + "memory(GiB)": 78.33, + "step": 1858, + "token_acc": 0.8799646174259177, + "train_speed(iter/s)": 0.032694 + }, + { + "epoch": 0.36021896042241924, + "grad_norm": 0.11351092904806137, + "learning_rate": 0.0002278125643480426, + "loss": 0.4251920282840729, + "memory(GiB)": 78.33, + "step": 1859, + "token_acc": 0.8783027287232604, + "train_speed(iter/s)": 0.032696 + }, + { + "epoch": 0.360412730707746, + "grad_norm": 0.12002420425415039, + "learning_rate": 0.00022773036268312135, + "loss": 0.40197598934173584, + "memory(GiB)": 78.33, + "step": 1860, + "token_acc": 0.884971045690204, + "train_speed(iter/s)": 0.032697 + }, + { + "epoch": 0.36060650099307273, + "grad_norm": 0.11968737095594406, + "learning_rate": 0.00022764812909225143, + "loss": 0.444561243057251, + "memory(GiB)": 78.33, + "step": 1861, + "token_acc": 0.8720333353463373, + "train_speed(iter/s)": 0.032698 + }, + { + "epoch": 0.3608002712783995, + "grad_norm": 0.12199822068214417, + "learning_rate": 0.00022756586360920834, + "loss": 0.4252670705318451, + "memory(GiB)": 78.33, + "step": 1862, + "token_acc": 0.8757837872600405, + "train_speed(iter/s)": 0.0327 + }, + { + "epoch": 0.3609940415637262, + "grad_norm": 0.11111042648553848, + "learning_rate": 0.00022748356626778085, + "loss": 0.38725805282592773, + "memory(GiB)": 78.33, + "step": 1863, + "token_acc": 0.88624693011833, + "train_speed(iter/s)": 0.032701 + }, + { + "epoch": 0.36118781184905296, + "grad_norm": 0.09851629287004471, + "learning_rate": 0.00022740123710177063, + "loss": 0.37134605646133423, + "memory(GiB)": 78.33, + "step": 1864, + "token_acc": 0.8892687066811259, + "train_speed(iter/s)": 0.032702 + }, + { + "epoch": 0.3613815821343797, + "grad_norm": 0.10534118860960007, + "learning_rate": 0.0002273188761449925, + "loss": 0.36640486121177673, + "memory(GiB)": 78.33, + "step": 1865, + "token_acc": 0.8921657754010696, + "train_speed(iter/s)": 0.032703 + }, + { + "epoch": 0.36157535241970645, + "grad_norm": 0.10792740434408188, + "learning_rate": 0.00022723648343127428, + "loss": 0.3896613121032715, + "memory(GiB)": 78.33, + "step": 1866, + "token_acc": 0.8864207221350079, + "train_speed(iter/s)": 0.032705 + }, + { + "epoch": 0.3617691227050332, + "grad_norm": 0.11643590033054352, + "learning_rate": 0.0002271540589944569, + "loss": 0.3730244040489197, + "memory(GiB)": 78.33, + "step": 1867, + "token_acc": 0.8894352346164653, + "train_speed(iter/s)": 0.032706 + }, + { + "epoch": 0.36196289299035994, + "grad_norm": 0.11337530612945557, + "learning_rate": 0.00022707160286839425, + "loss": 0.40268588066101074, + "memory(GiB)": 78.33, + "step": 1868, + "token_acc": 0.8809530219410419, + "train_speed(iter/s)": 0.032707 + }, + { + "epoch": 0.3621566632756867, + "grad_norm": 0.11132289469242096, + "learning_rate": 0.00022698911508695335, + "loss": 0.4088488221168518, + "memory(GiB)": 78.33, + "step": 1869, + "token_acc": 0.8813299380751201, + "train_speed(iter/s)": 0.032709 + }, + { + "epoch": 0.36235043356101343, + "grad_norm": 0.10401839762926102, + "learning_rate": 0.00022690659568401405, + "loss": 0.3792577087879181, + "memory(GiB)": 78.33, + "step": 1870, + "token_acc": 0.8880251617357506, + "train_speed(iter/s)": 0.03271 + }, + { + "epoch": 0.3625442038463402, + "grad_norm": 0.10719176381826401, + "learning_rate": 0.0002268240446934694, + "loss": 0.3922528028488159, + "memory(GiB)": 78.33, + "step": 1871, + "token_acc": 0.8880327346157934, + "train_speed(iter/s)": 0.032711 + }, + { + "epoch": 0.3627379741316669, + "grad_norm": 0.10341445356607437, + "learning_rate": 0.00022674146214922522, + "loss": 0.3721281886100769, + "memory(GiB)": 78.33, + "step": 1872, + "token_acc": 0.8919034280378818, + "train_speed(iter/s)": 0.032713 + }, + { + "epoch": 0.36293174441699366, + "grad_norm": 0.09941703081130981, + "learning_rate": 0.00022665884808520045, + "loss": 0.35231590270996094, + "memory(GiB)": 78.33, + "step": 1873, + "token_acc": 0.895152484700744, + "train_speed(iter/s)": 0.032714 + }, + { + "epoch": 0.3631255147023204, + "grad_norm": 0.12776227295398712, + "learning_rate": 0.00022657620253532681, + "loss": 0.42608171701431274, + "memory(GiB)": 78.33, + "step": 1874, + "token_acc": 0.8764504054897068, + "train_speed(iter/s)": 0.032715 + }, + { + "epoch": 0.36331928498764715, + "grad_norm": 0.10132501274347305, + "learning_rate": 0.00022649352553354913, + "loss": 0.3703892230987549, + "memory(GiB)": 78.33, + "step": 1875, + "token_acc": 0.8915755231679516, + "train_speed(iter/s)": 0.032717 + }, + { + "epoch": 0.3635130552729739, + "grad_norm": 0.11589387059211731, + "learning_rate": 0.00022641081711382508, + "loss": 0.4376241862773895, + "memory(GiB)": 78.33, + "step": 1876, + "token_acc": 0.8734218545929473, + "train_speed(iter/s)": 0.032718 + }, + { + "epoch": 0.36370682555830064, + "grad_norm": 0.10825785249471664, + "learning_rate": 0.00022632807731012519, + "loss": 0.3789633810520172, + "memory(GiB)": 78.33, + "step": 1877, + "token_acc": 0.8895960154952961, + "train_speed(iter/s)": 0.032719 + }, + { + "epoch": 0.3639005958436274, + "grad_norm": 0.10502450913190842, + "learning_rate": 0.00022624530615643291, + "loss": 0.3538724482059479, + "memory(GiB)": 78.33, + "step": 1878, + "token_acc": 0.8984881209503239, + "train_speed(iter/s)": 0.03272 + }, + { + "epoch": 0.36409436612895413, + "grad_norm": 0.10411059856414795, + "learning_rate": 0.00022616250368674465, + "loss": 0.35804876685142517, + "memory(GiB)": 78.33, + "step": 1879, + "token_acc": 0.8947292874777516, + "train_speed(iter/s)": 0.032722 + }, + { + "epoch": 0.36428813641428087, + "grad_norm": 0.09905564039945602, + "learning_rate": 0.00022607966993506954, + "loss": 0.36274391412734985, + "memory(GiB)": 78.33, + "step": 1880, + "token_acc": 0.8931178818261462, + "train_speed(iter/s)": 0.032723 + }, + { + "epoch": 0.3644819066996076, + "grad_norm": 0.11155527085065842, + "learning_rate": 0.0002259968049354296, + "loss": 0.4275517165660858, + "memory(GiB)": 78.33, + "step": 1881, + "token_acc": 0.8751891922192948, + "train_speed(iter/s)": 0.032724 + }, + { + "epoch": 0.36467567698493436, + "grad_norm": 0.12305615842342377, + "learning_rate": 0.00022591390872185978, + "loss": 0.4187135696411133, + "memory(GiB)": 78.33, + "step": 1882, + "token_acc": 0.8782666539960088, + "train_speed(iter/s)": 0.032726 + }, + { + "epoch": 0.3648694472702611, + "grad_norm": 0.2306802123785019, + "learning_rate": 0.00022583098132840783, + "loss": 0.40811386704444885, + "memory(GiB)": 78.33, + "step": 1883, + "token_acc": 0.8822494609011463, + "train_speed(iter/s)": 0.032727 + }, + { + "epoch": 0.36506321755558785, + "grad_norm": 0.11205735057592392, + "learning_rate": 0.00022574802278913409, + "loss": 0.3602101504802704, + "memory(GiB)": 78.33, + "step": 1884, + "token_acc": 0.8919279519679787, + "train_speed(iter/s)": 0.032728 + }, + { + "epoch": 0.3652569878409146, + "grad_norm": 0.10754162818193436, + "learning_rate": 0.00022566503313811202, + "loss": 0.41668108105659485, + "memory(GiB)": 78.33, + "step": 1885, + "token_acc": 0.8790157211209843, + "train_speed(iter/s)": 0.032729 + }, + { + "epoch": 0.36545075812624134, + "grad_norm": 0.13130834698677063, + "learning_rate": 0.00022558201240942765, + "loss": 0.3803432881832123, + "memory(GiB)": 78.33, + "step": 1886, + "token_acc": 0.8907192443382269, + "train_speed(iter/s)": 0.03273 + }, + { + "epoch": 0.3656445284115681, + "grad_norm": 0.12778477370738983, + "learning_rate": 0.00022549896063717978, + "loss": 0.45700541138648987, + "memory(GiB)": 78.33, + "step": 1887, + "token_acc": 0.8673952641165756, + "train_speed(iter/s)": 0.032732 + }, + { + "epoch": 0.3658382986968948, + "grad_norm": 0.10660150647163391, + "learning_rate": 0.00022541587785548006, + "loss": 0.388899028301239, + "memory(GiB)": 78.33, + "step": 1888, + "token_acc": 0.8842618950793005, + "train_speed(iter/s)": 0.032733 + }, + { + "epoch": 0.36603206898222157, + "grad_norm": 0.10151253640651703, + "learning_rate": 0.0002253327640984528, + "loss": 0.3622681200504303, + "memory(GiB)": 78.33, + "step": 1889, + "token_acc": 0.8935134049603374, + "train_speed(iter/s)": 0.032734 + }, + { + "epoch": 0.3662258392675483, + "grad_norm": 0.12292792648077011, + "learning_rate": 0.00022524961940023505, + "loss": 0.42959490418434143, + "memory(GiB)": 78.33, + "step": 1890, + "token_acc": 0.8763984746777599, + "train_speed(iter/s)": 0.032736 + }, + { + "epoch": 0.36641960955287506, + "grad_norm": 0.11249762028455734, + "learning_rate": 0.00022516644379497658, + "loss": 0.3739752471446991, + "memory(GiB)": 78.33, + "step": 1891, + "token_acc": 0.8892815758980301, + "train_speed(iter/s)": 0.032737 + }, + { + "epoch": 0.3666133798382018, + "grad_norm": 0.14134949445724487, + "learning_rate": 0.00022508323731683984, + "loss": 0.37695708870887756, + "memory(GiB)": 78.33, + "step": 1892, + "token_acc": 0.8897990726429675, + "train_speed(iter/s)": 0.032738 + }, + { + "epoch": 0.36680715012352855, + "grad_norm": 0.10767961293458939, + "learning_rate": 0.000225, + "loss": 0.3695901930332184, + "memory(GiB)": 78.33, + "step": 1893, + "token_acc": 0.8925884180704907, + "train_speed(iter/s)": 0.032739 + }, + { + "epoch": 0.3670009204088553, + "grad_norm": 0.10858670622110367, + "learning_rate": 0.00022491673187864482, + "loss": 0.39885541796684265, + "memory(GiB)": 78.33, + "step": 1894, + "token_acc": 0.8855477140227064, + "train_speed(iter/s)": 0.032741 + }, + { + "epoch": 0.36719469069418204, + "grad_norm": 0.12462179362773895, + "learning_rate": 0.00022483343298697472, + "loss": 0.38520297408103943, + "memory(GiB)": 78.33, + "step": 1895, + "token_acc": 0.8870938651413259, + "train_speed(iter/s)": 0.032742 + }, + { + "epoch": 0.3673884609795088, + "grad_norm": 0.09840144217014313, + "learning_rate": 0.00022475010335920288, + "loss": 0.35938745737075806, + "memory(GiB)": 78.33, + "step": 1896, + "token_acc": 0.893057469002165, + "train_speed(iter/s)": 0.032743 + }, + { + "epoch": 0.3675822312648355, + "grad_norm": 0.11222008615732193, + "learning_rate": 0.00022466674302955495, + "loss": 0.4017634689807892, + "memory(GiB)": 78.33, + "step": 1897, + "token_acc": 0.8824197671116161, + "train_speed(iter/s)": 0.032745 + }, + { + "epoch": 0.36777600155016227, + "grad_norm": 0.0982629731297493, + "learning_rate": 0.00022458335203226932, + "loss": 0.3599035441875458, + "memory(GiB)": 78.33, + "step": 1898, + "token_acc": 0.8942642891179495, + "train_speed(iter/s)": 0.032746 + }, + { + "epoch": 0.367969771835489, + "grad_norm": 0.09826286882162094, + "learning_rate": 0.00022449993040159685, + "loss": 0.35435935854911804, + "memory(GiB)": 78.33, + "step": 1899, + "token_acc": 0.8933514973118933, + "train_speed(iter/s)": 0.032747 + }, + { + "epoch": 0.36816354212081576, + "grad_norm": 0.11281174421310425, + "learning_rate": 0.0002244164781718011, + "loss": 0.3880941569805145, + "memory(GiB)": 78.33, + "step": 1900, + "token_acc": 0.889101803692551, + "train_speed(iter/s)": 0.032748 + }, + { + "epoch": 0.3683573124061425, + "grad_norm": 0.1215512827038765, + "learning_rate": 0.0002243329953771581, + "loss": 0.43437880277633667, + "memory(GiB)": 78.33, + "step": 1901, + "token_acc": 0.875211292666291, + "train_speed(iter/s)": 0.03275 + }, + { + "epoch": 0.36855108269146925, + "grad_norm": 0.11155978590250015, + "learning_rate": 0.0002242494820519565, + "loss": 0.37402305006980896, + "memory(GiB)": 78.33, + "step": 1902, + "token_acc": 0.8897708186595872, + "train_speed(iter/s)": 0.032751 + }, + { + "epoch": 0.368744852976796, + "grad_norm": 0.10142076760530472, + "learning_rate": 0.00022416593823049746, + "loss": 0.3670305609703064, + "memory(GiB)": 78.33, + "step": 1903, + "token_acc": 0.8930082952429321, + "train_speed(iter/s)": 0.032752 + }, + { + "epoch": 0.36893862326212273, + "grad_norm": 0.12013176828622818, + "learning_rate": 0.00022408236394709464, + "loss": 0.42799264192581177, + "memory(GiB)": 78.33, + "step": 1904, + "token_acc": 0.8759844248616682, + "train_speed(iter/s)": 0.032753 + }, + { + "epoch": 0.3691323935474495, + "grad_norm": 0.11704428493976593, + "learning_rate": 0.0002239987592360743, + "loss": 0.3853393793106079, + "memory(GiB)": 78.33, + "step": 1905, + "token_acc": 0.8859077310659739, + "train_speed(iter/s)": 0.032755 + }, + { + "epoch": 0.3693261638327762, + "grad_norm": 0.09620506316423416, + "learning_rate": 0.00022391512413177516, + "loss": 0.34337371587753296, + "memory(GiB)": 78.33, + "step": 1906, + "token_acc": 0.8966349706853229, + "train_speed(iter/s)": 0.032756 + }, + { + "epoch": 0.36951993411810297, + "grad_norm": 0.1018129214644432, + "learning_rate": 0.00022383145866854834, + "loss": 0.3677298128604889, + "memory(GiB)": 78.33, + "step": 1907, + "token_acc": 0.8905968102763946, + "train_speed(iter/s)": 0.032757 + }, + { + "epoch": 0.3697137044034297, + "grad_norm": 0.10136358439922333, + "learning_rate": 0.00022374776288075745, + "loss": 0.3641367554664612, + "memory(GiB)": 78.33, + "step": 1908, + "token_acc": 0.8920720537349888, + "train_speed(iter/s)": 0.032758 + }, + { + "epoch": 0.36990747468875645, + "grad_norm": 0.10752905905246735, + "learning_rate": 0.00022366403680277875, + "loss": 0.373902291059494, + "memory(GiB)": 78.33, + "step": 1909, + "token_acc": 0.888663967611336, + "train_speed(iter/s)": 0.032759 + }, + { + "epoch": 0.3701012449740832, + "grad_norm": 0.09833654016256332, + "learning_rate": 0.00022358028046900067, + "loss": 0.3543311059474945, + "memory(GiB)": 78.33, + "step": 1910, + "token_acc": 0.89600593545196, + "train_speed(iter/s)": 0.03276 + }, + { + "epoch": 0.37029501525940994, + "grad_norm": 0.10382269322872162, + "learning_rate": 0.00022349649391382423, + "loss": 0.39178794622421265, + "memory(GiB)": 78.33, + "step": 1911, + "token_acc": 0.8857472274074888, + "train_speed(iter/s)": 0.032762 + }, + { + "epoch": 0.3704887855447367, + "grad_norm": 0.11607307940721512, + "learning_rate": 0.0002234126771716628, + "loss": 0.39549145102500916, + "memory(GiB)": 78.33, + "step": 1912, + "token_acc": 0.8873361227336123, + "train_speed(iter/s)": 0.032763 + }, + { + "epoch": 0.3706825558300635, + "grad_norm": 0.098874531686306, + "learning_rate": 0.0002233288302769422, + "loss": 0.37269628047943115, + "memory(GiB)": 78.33, + "step": 1913, + "token_acc": 0.8919546544147023, + "train_speed(iter/s)": 0.032764 + }, + { + "epoch": 0.37087632611539023, + "grad_norm": 0.10173005610704422, + "learning_rate": 0.00022324495326410057, + "loss": 0.37610965967178345, + "memory(GiB)": 78.33, + "step": 1914, + "token_acc": 0.8908233494774407, + "train_speed(iter/s)": 0.032765 + }, + { + "epoch": 0.371070096400717, + "grad_norm": 0.12058980017900467, + "learning_rate": 0.00022316104616758848, + "loss": 0.43350180983543396, + "memory(GiB)": 78.33, + "step": 1915, + "token_acc": 0.8748074872645422, + "train_speed(iter/s)": 0.032767 + }, + { + "epoch": 0.3712638666860437, + "grad_norm": 0.1139712780714035, + "learning_rate": 0.0002230771090218688, + "loss": 0.3701034486293793, + "memory(GiB)": 78.33, + "step": 1916, + "token_acc": 0.8913546669752908, + "train_speed(iter/s)": 0.032768 + }, + { + "epoch": 0.37145763697137046, + "grad_norm": 0.11131946742534637, + "learning_rate": 0.00022299314186141676, + "loss": 0.37333056330680847, + "memory(GiB)": 78.33, + "step": 1917, + "token_acc": 0.8892822835185947, + "train_speed(iter/s)": 0.032769 + }, + { + "epoch": 0.3716514072566972, + "grad_norm": 0.12218397855758667, + "learning_rate": 0.00022290914472072, + "loss": 0.4149776101112366, + "memory(GiB)": 78.33, + "step": 1918, + "token_acc": 0.8801127922136988, + "train_speed(iter/s)": 0.03277 + }, + { + "epoch": 0.37184517754202395, + "grad_norm": 0.10744242370128632, + "learning_rate": 0.00022282511763427838, + "loss": 0.3630349338054657, + "memory(GiB)": 78.33, + "step": 1919, + "token_acc": 0.8921786701935794, + "train_speed(iter/s)": 0.032772 + }, + { + "epoch": 0.3720389478273507, + "grad_norm": 0.10889404267072678, + "learning_rate": 0.00022274106063660404, + "loss": 0.3649406433105469, + "memory(GiB)": 78.33, + "step": 1920, + "token_acc": 0.8912664400768435, + "train_speed(iter/s)": 0.032773 + }, + { + "epoch": 0.37223271811267744, + "grad_norm": 0.11465780436992645, + "learning_rate": 0.00022265697376222141, + "loss": 0.3944970965385437, + "memory(GiB)": 78.33, + "step": 1921, + "token_acc": 0.8845351473922902, + "train_speed(iter/s)": 0.032774 + }, + { + "epoch": 0.3724264883980042, + "grad_norm": 0.11644507199525833, + "learning_rate": 0.00022257285704566735, + "loss": 0.41171303391456604, + "memory(GiB)": 78.33, + "step": 1922, + "token_acc": 0.8799286944436051, + "train_speed(iter/s)": 0.032775 + }, + { + "epoch": 0.37262025868333093, + "grad_norm": 0.10672671347856522, + "learning_rate": 0.00022248871052149078, + "loss": 0.37590137124061584, + "memory(GiB)": 78.33, + "step": 1923, + "token_acc": 0.8882163998831992, + "train_speed(iter/s)": 0.032777 + }, + { + "epoch": 0.3728140289686577, + "grad_norm": 0.11380830407142639, + "learning_rate": 0.00022240453422425294, + "loss": 0.41342946887016296, + "memory(GiB)": 78.33, + "step": 1924, + "token_acc": 0.8813163889880054, + "train_speed(iter/s)": 0.032778 + }, + { + "epoch": 0.3730077992539844, + "grad_norm": 0.10751762241125107, + "learning_rate": 0.00022232032818852732, + "loss": 0.37841910123825073, + "memory(GiB)": 78.33, + "step": 1925, + "token_acc": 0.887600209372159, + "train_speed(iter/s)": 0.032779 + }, + { + "epoch": 0.37320156953931116, + "grad_norm": 0.11165986955165863, + "learning_rate": 0.0002222360924488996, + "loss": 0.39904457330703735, + "memory(GiB)": 78.33, + "step": 1926, + "token_acc": 0.8864683010230814, + "train_speed(iter/s)": 0.03278 + }, + { + "epoch": 0.3733953398246379, + "grad_norm": 0.10432970523834229, + "learning_rate": 0.00022215182703996765, + "loss": 0.35338613390922546, + "memory(GiB)": 78.33, + "step": 1927, + "token_acc": 0.8975270862625093, + "train_speed(iter/s)": 0.032782 + }, + { + "epoch": 0.37358911010996465, + "grad_norm": 0.09983038157224655, + "learning_rate": 0.00022206753199634148, + "loss": 0.36258718371391296, + "memory(GiB)": 78.33, + "step": 1928, + "token_acc": 0.8939695669934641, + "train_speed(iter/s)": 0.032783 + }, + { + "epoch": 0.3737828803952914, + "grad_norm": 0.10431524366140366, + "learning_rate": 0.00022198320735264344, + "loss": 0.3931850492954254, + "memory(GiB)": 78.33, + "step": 1929, + "token_acc": 0.8851347908067064, + "train_speed(iter/s)": 0.032784 + }, + { + "epoch": 0.37397665068061814, + "grad_norm": 0.11230375617742538, + "learning_rate": 0.00022189885314350787, + "loss": 0.391851007938385, + "memory(GiB)": 78.33, + "step": 1930, + "token_acc": 0.8859056476850633, + "train_speed(iter/s)": 0.032785 + }, + { + "epoch": 0.3741704209659449, + "grad_norm": 0.1120762825012207, + "learning_rate": 0.00022181446940358135, + "loss": 0.41800612211227417, + "memory(GiB)": 78.33, + "step": 1931, + "token_acc": 0.877407731234931, + "train_speed(iter/s)": 0.032787 + }, + { + "epoch": 0.3743641912512716, + "grad_norm": 0.1048278734087944, + "learning_rate": 0.00022173005616752252, + "loss": 0.3735467791557312, + "memory(GiB)": 78.33, + "step": 1932, + "token_acc": 0.8902271252433485, + "train_speed(iter/s)": 0.032788 + }, + { + "epoch": 0.37455796153659837, + "grad_norm": 0.12189824134111404, + "learning_rate": 0.00022164561347000212, + "loss": 0.3999292254447937, + "memory(GiB)": 78.33, + "step": 1933, + "token_acc": 0.8858239307268434, + "train_speed(iter/s)": 0.032789 + }, + { + "epoch": 0.3747517318219251, + "grad_norm": 0.10540164262056351, + "learning_rate": 0.00022156114134570305, + "loss": 0.39689502120018005, + "memory(GiB)": 78.33, + "step": 1934, + "token_acc": 0.8854811041222644, + "train_speed(iter/s)": 0.03279 + }, + { + "epoch": 0.37494550210725186, + "grad_norm": 0.10333568602800369, + "learning_rate": 0.00022147663982932038, + "loss": 0.3748435080051422, + "memory(GiB)": 78.33, + "step": 1935, + "token_acc": 0.8905851118145615, + "train_speed(iter/s)": 0.032791 + }, + { + "epoch": 0.3751392723925786, + "grad_norm": 0.1070760041475296, + "learning_rate": 0.00022139210895556104, + "loss": 0.37235966324806213, + "memory(GiB)": 78.33, + "step": 1936, + "token_acc": 0.8913205876656614, + "train_speed(iter/s)": 0.032792 + }, + { + "epoch": 0.37533304267790535, + "grad_norm": 0.10744938999414444, + "learning_rate": 0.00022130754875914415, + "loss": 0.36817583441734314, + "memory(GiB)": 78.33, + "step": 1937, + "token_acc": 0.8937919693024531, + "train_speed(iter/s)": 0.032794 + }, + { + "epoch": 0.3755268129632321, + "grad_norm": 0.09824454039335251, + "learning_rate": 0.0002212229592748009, + "loss": 0.34159597754478455, + "memory(GiB)": 78.33, + "step": 1938, + "token_acc": 0.8967411946420885, + "train_speed(iter/s)": 0.032795 + }, + { + "epoch": 0.37572058324855884, + "grad_norm": 0.10819046944379807, + "learning_rate": 0.00022113834053727444, + "loss": 0.3992760181427002, + "memory(GiB)": 78.33, + "step": 1939, + "token_acc": 0.8845097429519071, + "train_speed(iter/s)": 0.032796 + }, + { + "epoch": 0.3759143535338856, + "grad_norm": 0.09654027968645096, + "learning_rate": 0.00022105369258131998, + "loss": 0.340084046125412, + "memory(GiB)": 78.33, + "step": 1940, + "token_acc": 0.9001615206348463, + "train_speed(iter/s)": 0.032797 + }, + { + "epoch": 0.3761081238192123, + "grad_norm": 0.09880183637142181, + "learning_rate": 0.00022096901544170467, + "loss": 0.36598512530326843, + "memory(GiB)": 78.33, + "step": 1941, + "token_acc": 0.8919111291880625, + "train_speed(iter/s)": 0.032798 + }, + { + "epoch": 0.37630189410453907, + "grad_norm": 0.10682184994220734, + "learning_rate": 0.0002208843091532077, + "loss": 0.39188694953918457, + "memory(GiB)": 78.33, + "step": 1942, + "token_acc": 0.8864231527226587, + "train_speed(iter/s)": 0.0328 + }, + { + "epoch": 0.3764956643898658, + "grad_norm": 0.11017350852489471, + "learning_rate": 0.00022079957375062021, + "loss": 0.40813326835632324, + "memory(GiB)": 78.33, + "step": 1943, + "token_acc": 0.8828356812531744, + "train_speed(iter/s)": 0.032801 + }, + { + "epoch": 0.37668943467519256, + "grad_norm": 0.11248282343149185, + "learning_rate": 0.00022071480926874536, + "loss": 0.41364431381225586, + "memory(GiB)": 78.33, + "step": 1944, + "token_acc": 0.8817496143077539, + "train_speed(iter/s)": 0.032802 + }, + { + "epoch": 0.3768832049605193, + "grad_norm": 0.09806734323501587, + "learning_rate": 0.00022063001574239814, + "loss": 0.36278462409973145, + "memory(GiB)": 78.33, + "step": 1945, + "token_acc": 0.8927522524764797, + "train_speed(iter/s)": 0.032803 + }, + { + "epoch": 0.37707697524584605, + "grad_norm": 0.10731010138988495, + "learning_rate": 0.00022054519320640557, + "loss": 0.39958709478378296, + "memory(GiB)": 78.33, + "step": 1946, + "token_acc": 0.882800608828006, + "train_speed(iter/s)": 0.032804 + }, + { + "epoch": 0.3772707455311728, + "grad_norm": 0.1078319326043129, + "learning_rate": 0.0002204603416956065, + "loss": 0.39028337597846985, + "memory(GiB)": 78.33, + "step": 1947, + "token_acc": 0.8870954252738982, + "train_speed(iter/s)": 0.032805 + }, + { + "epoch": 0.37746451581649954, + "grad_norm": 0.10992579162120819, + "learning_rate": 0.00022037546124485178, + "loss": 0.3915446102619171, + "memory(GiB)": 78.33, + "step": 1948, + "token_acc": 0.8861552284559077, + "train_speed(iter/s)": 0.032807 + }, + { + "epoch": 0.3776582861018263, + "grad_norm": 0.10269248485565186, + "learning_rate": 0.00022029055188900405, + "loss": 0.35439544916152954, + "memory(GiB)": 78.33, + "step": 1949, + "token_acc": 0.894891822424076, + "train_speed(iter/s)": 0.032808 + }, + { + "epoch": 0.377852056387153, + "grad_norm": 0.11859910935163498, + "learning_rate": 0.00022020561366293789, + "loss": 0.36357784271240234, + "memory(GiB)": 78.33, + "step": 1950, + "token_acc": 0.8931177855959062, + "train_speed(iter/s)": 0.032809 + }, + { + "epoch": 0.37804582667247977, + "grad_norm": 0.11494617909193039, + "learning_rate": 0.0002201206466015397, + "loss": 0.4254799485206604, + "memory(GiB)": 78.33, + "step": 1951, + "token_acc": 0.8750034286968209, + "train_speed(iter/s)": 0.03281 + }, + { + "epoch": 0.3782395969578065, + "grad_norm": 0.09963639825582504, + "learning_rate": 0.00022003565073970774, + "loss": 0.3571300506591797, + "memory(GiB)": 78.33, + "step": 1952, + "token_acc": 0.8945331269019273, + "train_speed(iter/s)": 0.032812 + }, + { + "epoch": 0.37843336724313326, + "grad_norm": 0.10854914784431458, + "learning_rate": 0.0002199506261123521, + "loss": 0.3844650089740753, + "memory(GiB)": 78.33, + "step": 1953, + "token_acc": 0.8875416461849364, + "train_speed(iter/s)": 0.032813 + }, + { + "epoch": 0.37862713752846, + "grad_norm": 0.11779830604791641, + "learning_rate": 0.00021986557275439464, + "loss": 0.40272995829582214, + "memory(GiB)": 78.33, + "step": 1954, + "token_acc": 0.8847069242264903, + "train_speed(iter/s)": 0.032814 + }, + { + "epoch": 0.37882090781378674, + "grad_norm": 0.09850434213876724, + "learning_rate": 0.00021978049070076912, + "loss": 0.3412163257598877, + "memory(GiB)": 78.33, + "step": 1955, + "token_acc": 0.8973165531228873, + "train_speed(iter/s)": 0.032816 + }, + { + "epoch": 0.3790146780991135, + "grad_norm": 0.10487658530473709, + "learning_rate": 0.00021969537998642097, + "loss": 0.37174132466316223, + "memory(GiB)": 78.33, + "step": 1956, + "token_acc": 0.8908394592093765, + "train_speed(iter/s)": 0.032817 + }, + { + "epoch": 0.37920844838444023, + "grad_norm": 0.10610850155353546, + "learning_rate": 0.00021961024064630745, + "loss": 0.3750998079776764, + "memory(GiB)": 78.33, + "step": 1957, + "token_acc": 0.8877445652173913, + "train_speed(iter/s)": 0.032818 + }, + { + "epoch": 0.379402218669767, + "grad_norm": 0.11280324310064316, + "learning_rate": 0.00021952507271539762, + "loss": 0.40227746963500977, + "memory(GiB)": 78.33, + "step": 1958, + "token_acc": 0.8827386807356404, + "train_speed(iter/s)": 0.032819 + }, + { + "epoch": 0.3795959889550937, + "grad_norm": 0.11210876703262329, + "learning_rate": 0.00021943987622867223, + "loss": 0.3853500187397003, + "memory(GiB)": 78.33, + "step": 1959, + "token_acc": 0.8886800763470856, + "train_speed(iter/s)": 0.03282 + }, + { + "epoch": 0.37978975924042047, + "grad_norm": 0.10147024691104889, + "learning_rate": 0.00021935465122112377, + "loss": 0.34372827410697937, + "memory(GiB)": 78.33, + "step": 1960, + "token_acc": 0.8980276961812841, + "train_speed(iter/s)": 0.032821 + }, + { + "epoch": 0.3799835295257472, + "grad_norm": 0.1177581325173378, + "learning_rate": 0.00021926939772775637, + "loss": 0.4162188172340393, + "memory(GiB)": 78.33, + "step": 1961, + "token_acc": 0.8789121688698469, + "train_speed(iter/s)": 0.032823 + }, + { + "epoch": 0.38017729981107395, + "grad_norm": 0.10330435633659363, + "learning_rate": 0.00021918411578358601, + "loss": 0.3811953067779541, + "memory(GiB)": 78.33, + "step": 1962, + "token_acc": 0.8878933276780744, + "train_speed(iter/s)": 0.032824 + }, + { + "epoch": 0.3803710700964007, + "grad_norm": 0.1050238385796547, + "learning_rate": 0.0002190988054236402, + "loss": 0.39693543314933777, + "memory(GiB)": 78.33, + "step": 1963, + "token_acc": 0.8836019246925838, + "train_speed(iter/s)": 0.032825 + }, + { + "epoch": 0.38056484038172744, + "grad_norm": 0.1056143268942833, + "learning_rate": 0.0002190134666829583, + "loss": 0.3753167390823364, + "memory(GiB)": 78.33, + "step": 1964, + "token_acc": 0.8911448414921026, + "train_speed(iter/s)": 0.032826 + }, + { + "epoch": 0.3807586106670542, + "grad_norm": 0.1035081148147583, + "learning_rate": 0.0002189280995965912, + "loss": 0.3400823771953583, + "memory(GiB)": 78.33, + "step": 1965, + "token_acc": 0.8973672105419812, + "train_speed(iter/s)": 0.032828 + }, + { + "epoch": 0.38095238095238093, + "grad_norm": 0.11154508590698242, + "learning_rate": 0.00021884270419960137, + "loss": 0.37742310762405396, + "memory(GiB)": 78.33, + "step": 1966, + "token_acc": 0.8898611151892416, + "train_speed(iter/s)": 0.032829 + }, + { + "epoch": 0.3811461512377077, + "grad_norm": 0.10220933705568314, + "learning_rate": 0.00021875728052706304, + "loss": 0.3489319086074829, + "memory(GiB)": 78.33, + "step": 1967, + "token_acc": 0.8957410562180579, + "train_speed(iter/s)": 0.03283 + }, + { + "epoch": 0.3813399215230344, + "grad_norm": 0.12063480913639069, + "learning_rate": 0.00021867182861406206, + "loss": 0.40946558117866516, + "memory(GiB)": 78.33, + "step": 1968, + "token_acc": 0.8786677692548975, + "train_speed(iter/s)": 0.032831 + }, + { + "epoch": 0.38153369180836116, + "grad_norm": 0.1056455671787262, + "learning_rate": 0.00021858634849569576, + "loss": 0.38398033380508423, + "memory(GiB)": 78.33, + "step": 1969, + "token_acc": 0.8870220312216843, + "train_speed(iter/s)": 0.032832 + }, + { + "epoch": 0.3817274620936879, + "grad_norm": 0.10487000644207001, + "learning_rate": 0.00021850084020707316, + "loss": 0.39127257466316223, + "memory(GiB)": 78.33, + "step": 1970, + "token_acc": 0.8856672733165466, + "train_speed(iter/s)": 0.032833 + }, + { + "epoch": 0.38192123237901465, + "grad_norm": 0.10356573760509491, + "learning_rate": 0.0002184153037833148, + "loss": 0.39030513167381287, + "memory(GiB)": 78.33, + "step": 1971, + "token_acc": 0.8840334008097166, + "train_speed(iter/s)": 0.032835 + }, + { + "epoch": 0.3821150026643414, + "grad_norm": 0.10742707550525665, + "learning_rate": 0.0002183297392595528, + "loss": 0.3961334824562073, + "memory(GiB)": 78.33, + "step": 1972, + "token_acc": 0.8832247032291641, + "train_speed(iter/s)": 0.032836 + }, + { + "epoch": 0.38230877294966814, + "grad_norm": 0.10838883370161057, + "learning_rate": 0.00021824414667093075, + "loss": 0.3667401075363159, + "memory(GiB)": 78.33, + "step": 1973, + "token_acc": 0.8917944849551588, + "train_speed(iter/s)": 0.032837 + }, + { + "epoch": 0.38250254323499494, + "grad_norm": 0.1165713295340538, + "learning_rate": 0.00021815852605260386, + "loss": 0.3943071663379669, + "memory(GiB)": 78.33, + "step": 1974, + "token_acc": 0.8838169344539811, + "train_speed(iter/s)": 0.032838 + }, + { + "epoch": 0.3826963135203217, + "grad_norm": 0.10990098863840103, + "learning_rate": 0.0002180728774397389, + "loss": 0.4022403061389923, + "memory(GiB)": 78.33, + "step": 1975, + "token_acc": 0.8827039398397908, + "train_speed(iter/s)": 0.032839 + }, + { + "epoch": 0.38289008380564843, + "grad_norm": 0.12068414688110352, + "learning_rate": 0.00021798720086751395, + "loss": 0.431316614151001, + "memory(GiB)": 78.33, + "step": 1976, + "token_acc": 0.8732160741111668, + "train_speed(iter/s)": 0.032841 + }, + { + "epoch": 0.3830838540909752, + "grad_norm": 0.09716933965682983, + "learning_rate": 0.0002179014963711187, + "loss": 0.35583171248435974, + "memory(GiB)": 78.33, + "step": 1977, + "token_acc": 0.8953317742009447, + "train_speed(iter/s)": 0.032842 + }, + { + "epoch": 0.3832776243763019, + "grad_norm": 0.11288797110319138, + "learning_rate": 0.00021781576398575433, + "loss": 0.392235666513443, + "memory(GiB)": 78.33, + "step": 1978, + "token_acc": 0.885842526497945, + "train_speed(iter/s)": 0.032843 + }, + { + "epoch": 0.38347139466162866, + "grad_norm": 0.10276081413030624, + "learning_rate": 0.0002177300037466334, + "loss": 0.37976735830307007, + "memory(GiB)": 78.33, + "step": 1979, + "token_acc": 0.8875089992800576, + "train_speed(iter/s)": 0.032844 + }, + { + "epoch": 0.3836651649469554, + "grad_norm": 0.11369860172271729, + "learning_rate": 0.00021764421568897993, + "loss": 0.3621234893798828, + "memory(GiB)": 78.33, + "step": 1980, + "token_acc": 0.890562048175558, + "train_speed(iter/s)": 0.032845 + }, + { + "epoch": 0.38385893523228215, + "grad_norm": 0.10219123214483261, + "learning_rate": 0.00021755839984802944, + "loss": 0.3366636335849762, + "memory(GiB)": 78.33, + "step": 1981, + "token_acc": 0.900472891235016, + "train_speed(iter/s)": 0.032847 + }, + { + "epoch": 0.3840527055176089, + "grad_norm": 0.10489702969789505, + "learning_rate": 0.0002174725562590288, + "loss": 0.37687474489212036, + "memory(GiB)": 78.33, + "step": 1982, + "token_acc": 0.8895873939660143, + "train_speed(iter/s)": 0.032848 + }, + { + "epoch": 0.38424647580293564, + "grad_norm": 0.09669921547174454, + "learning_rate": 0.00021738668495723616, + "loss": 0.3477326035499573, + "memory(GiB)": 78.33, + "step": 1983, + "token_acc": 0.8987814906182253, + "train_speed(iter/s)": 0.032849 + }, + { + "epoch": 0.3844402460882624, + "grad_norm": 0.11556685715913773, + "learning_rate": 0.0002173007859779213, + "loss": 0.4000082015991211, + "memory(GiB)": 78.33, + "step": 1984, + "token_acc": 0.8822343958445688, + "train_speed(iter/s)": 0.03285 + }, + { + "epoch": 0.3846340163735891, + "grad_norm": 0.10442518442869186, + "learning_rate": 0.00021721485935636523, + "loss": 0.3522685170173645, + "memory(GiB)": 78.33, + "step": 1985, + "token_acc": 0.8955117718187637, + "train_speed(iter/s)": 0.032851 + }, + { + "epoch": 0.38482778665891587, + "grad_norm": 0.11232437193393707, + "learning_rate": 0.00021712890512786027, + "loss": 0.39102703332901, + "memory(GiB)": 78.33, + "step": 1986, + "token_acc": 0.8859773174722014, + "train_speed(iter/s)": 0.032852 + }, + { + "epoch": 0.3850215569442426, + "grad_norm": 0.1146976500749588, + "learning_rate": 0.00021704292332771013, + "loss": 0.4080618619918823, + "memory(GiB)": 78.33, + "step": 1987, + "token_acc": 0.8815463453091564, + "train_speed(iter/s)": 0.032853 + }, + { + "epoch": 0.38521532722956936, + "grad_norm": 0.10926152765750885, + "learning_rate": 0.00021695691399122987, + "loss": 0.3856127858161926, + "memory(GiB)": 78.33, + "step": 1988, + "token_acc": 0.8882630577068502, + "train_speed(iter/s)": 0.032855 + }, + { + "epoch": 0.3854090975148961, + "grad_norm": 0.11085885018110275, + "learning_rate": 0.00021687087715374585, + "loss": 0.37670475244522095, + "memory(GiB)": 78.33, + "step": 1989, + "token_acc": 0.8898128898128899, + "train_speed(iter/s)": 0.032856 + }, + { + "epoch": 0.38560286780022285, + "grad_norm": 0.12100964039564133, + "learning_rate": 0.00021678481285059567, + "loss": 0.43839654326438904, + "memory(GiB)": 78.33, + "step": 1990, + "token_acc": 0.8723092599668832, + "train_speed(iter/s)": 0.032857 + }, + { + "epoch": 0.3857966380855496, + "grad_norm": 0.12123683094978333, + "learning_rate": 0.00021669872111712828, + "loss": 0.44955089688301086, + "memory(GiB)": 78.33, + "step": 1991, + "token_acc": 0.871977240398293, + "train_speed(iter/s)": 0.032858 + }, + { + "epoch": 0.38599040837087634, + "grad_norm": 0.11107171326875687, + "learning_rate": 0.0002166126019887039, + "loss": 0.4144379794597626, + "memory(GiB)": 78.33, + "step": 1992, + "token_acc": 0.8786039878072653, + "train_speed(iter/s)": 0.032859 + }, + { + "epoch": 0.3861841786562031, + "grad_norm": 0.10761483013629913, + "learning_rate": 0.00021652645550069392, + "loss": 0.3946702480316162, + "memory(GiB)": 78.33, + "step": 1993, + "token_acc": 0.8829077659918314, + "train_speed(iter/s)": 0.03286 + }, + { + "epoch": 0.3863779489415298, + "grad_norm": 0.10240019857883453, + "learning_rate": 0.000216440281688481, + "loss": 0.36723431944847107, + "memory(GiB)": 78.33, + "step": 1994, + "token_acc": 0.8909840895698291, + "train_speed(iter/s)": 0.032862 + }, + { + "epoch": 0.38657171922685657, + "grad_norm": 0.10443782061338425, + "learning_rate": 0.00021635408058745908, + "loss": 0.3627747893333435, + "memory(GiB)": 78.33, + "step": 1995, + "token_acc": 0.8937532210377844, + "train_speed(iter/s)": 0.032863 + }, + { + "epoch": 0.3867654895121833, + "grad_norm": 0.10117712616920471, + "learning_rate": 0.00021626785223303327, + "loss": 0.36782556772232056, + "memory(GiB)": 78.33, + "step": 1996, + "token_acc": 0.8936070197430273, + "train_speed(iter/s)": 0.032864 + }, + { + "epoch": 0.38695925979751006, + "grad_norm": 0.10845784842967987, + "learning_rate": 0.00021618159666061983, + "loss": 0.4153798818588257, + "memory(GiB)": 78.33, + "step": 1997, + "token_acc": 0.8783169270276054, + "train_speed(iter/s)": 0.032865 + }, + { + "epoch": 0.3871530300828368, + "grad_norm": 0.11158988624811172, + "learning_rate": 0.00021609531390564635, + "loss": 0.3699086904525757, + "memory(GiB)": 78.33, + "step": 1998, + "token_acc": 0.8905605642615397, + "train_speed(iter/s)": 0.032866 + }, + { + "epoch": 0.38734680036816355, + "grad_norm": 0.1030752882361412, + "learning_rate": 0.0002160090040035513, + "loss": 0.3778277039527893, + "memory(GiB)": 78.33, + "step": 1999, + "token_acc": 0.8885486675143369, + "train_speed(iter/s)": 0.032867 + }, + { + "epoch": 0.3875405706534903, + "grad_norm": 0.10194329917430878, + "learning_rate": 0.00021592266698978462, + "loss": 0.3767800033092499, + "memory(GiB)": 78.33, + "step": 2000, + "token_acc": 0.8890578277836907, + "train_speed(iter/s)": 0.032868 + }, + { + "epoch": 0.3875405706534903, + "eval_loss": 0.44034290313720703, + "eval_runtime": 1345.0564, + "eval_samples_per_second": 5.018, + "eval_steps_per_second": 5.018, + "eval_token_acc": 0.8890313859070251, + "step": 2000 + }, + { + "epoch": 0.38773434093881703, + "grad_norm": 0.12286636233329773, + "learning_rate": 0.00021583630289980724, + "loss": 0.41180068254470825, + "memory(GiB)": 78.33, + "step": 2001, + "token_acc": 0.8807242798353909, + "train_speed(iter/s)": 0.032152 + }, + { + "epoch": 0.3879281112241438, + "grad_norm": 0.10262475907802582, + "learning_rate": 0.00021574991176909113, + "loss": 0.37820303440093994, + "memory(GiB)": 78.33, + "step": 2002, + "token_acc": 0.8894734153940447, + "train_speed(iter/s)": 0.032154 + }, + { + "epoch": 0.3881218815094705, + "grad_norm": 0.10098009556531906, + "learning_rate": 0.00021566349363311949, + "loss": 0.3635365962982178, + "memory(GiB)": 78.33, + "step": 2003, + "token_acc": 0.8932263226068625, + "train_speed(iter/s)": 0.032155 + }, + { + "epoch": 0.38831565179479727, + "grad_norm": 0.11192551255226135, + "learning_rate": 0.00021557704852738654, + "loss": 0.39867013692855835, + "memory(GiB)": 78.33, + "step": 2004, + "token_acc": 0.8858497030607584, + "train_speed(iter/s)": 0.032157 + }, + { + "epoch": 0.388509422080124, + "grad_norm": 0.0996481254696846, + "learning_rate": 0.00021549057648739768, + "loss": 0.3613511025905609, + "memory(GiB)": 78.33, + "step": 2005, + "token_acc": 0.8933584214808787, + "train_speed(iter/s)": 0.032158 + }, + { + "epoch": 0.38870319236545076, + "grad_norm": 0.12326376140117645, + "learning_rate": 0.00021540407754866924, + "loss": 0.4421766996383667, + "memory(GiB)": 78.33, + "step": 2006, + "token_acc": 0.8737238044062332, + "train_speed(iter/s)": 0.03216 + }, + { + "epoch": 0.3888969626507775, + "grad_norm": 0.09973074495792389, + "learning_rate": 0.00021531755174672868, + "loss": 0.34252458810806274, + "memory(GiB)": 78.33, + "step": 2007, + "token_acc": 0.8975569907844689, + "train_speed(iter/s)": 0.032161 + }, + { + "epoch": 0.38909073293610424, + "grad_norm": 0.09278019517660141, + "learning_rate": 0.00021523099911711447, + "loss": 0.3389296233654022, + "memory(GiB)": 78.33, + "step": 2008, + "token_acc": 0.89790143300206, + "train_speed(iter/s)": 0.032162 + }, + { + "epoch": 0.389284503221431, + "grad_norm": 0.11313237994909286, + "learning_rate": 0.00021514441969537607, + "loss": 0.43270501494407654, + "memory(GiB)": 78.33, + "step": 2009, + "token_acc": 0.8762101038587737, + "train_speed(iter/s)": 0.032164 + }, + { + "epoch": 0.38947827350675773, + "grad_norm": 0.10002917051315308, + "learning_rate": 0.00021505781351707402, + "loss": 0.3512948751449585, + "memory(GiB)": 78.33, + "step": 2010, + "token_acc": 0.8962549078828148, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.3896720437920845, + "grad_norm": 0.11693061888217926, + "learning_rate": 0.0002149711806177798, + "loss": 0.40658366680145264, + "memory(GiB)": 78.33, + "step": 2011, + "token_acc": 0.8811087609929867, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.3898658140774112, + "grad_norm": 0.10790550708770752, + "learning_rate": 0.00021488452103307585, + "loss": 0.41037797927856445, + "memory(GiB)": 78.33, + "step": 2012, + "token_acc": 0.88296488946684, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.39005958436273797, + "grad_norm": 0.11255151033401489, + "learning_rate": 0.0002147978347985556, + "loss": 0.37893742322921753, + "memory(GiB)": 78.33, + "step": 2013, + "token_acc": 0.889498343046509, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.3902533546480647, + "grad_norm": 0.10197418183088303, + "learning_rate": 0.0002147111219498234, + "loss": 0.3498515188694, + "memory(GiB)": 78.33, + "step": 2014, + "token_acc": 0.8973286219081272, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.39044712493339145, + "grad_norm": 0.09706109017133713, + "learning_rate": 0.00021462438252249457, + "loss": 0.340999037027359, + "memory(GiB)": 78.33, + "step": 2015, + "token_acc": 0.896882369710006, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.3906408952187182, + "grad_norm": 0.10438670963048935, + "learning_rate": 0.00021453761655219528, + "loss": 0.389445424079895, + "memory(GiB)": 78.33, + "step": 2016, + "token_acc": 0.8829340326399674, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.39083466550404494, + "grad_norm": 0.11134763062000275, + "learning_rate": 0.00021445082407456272, + "loss": 0.43515315651893616, + "memory(GiB)": 78.33, + "step": 2017, + "token_acc": 0.8730318643677499, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.3910284357893717, + "grad_norm": 0.09228318929672241, + "learning_rate": 0.00021436400512524483, + "loss": 0.33381155133247375, + "memory(GiB)": 78.33, + "step": 2018, + "token_acc": 0.902964766542538, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.39122220607469843, + "grad_norm": 0.10420046001672745, + "learning_rate": 0.00021427715973990056, + "loss": 0.3501606285572052, + "memory(GiB)": 78.33, + "step": 2019, + "token_acc": 0.8966936364399899, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.3914159763600252, + "grad_norm": 0.09859726577997208, + "learning_rate": 0.00021419028795419953, + "loss": 0.3734872341156006, + "memory(GiB)": 78.33, + "step": 2020, + "token_acc": 0.8910265718019457, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.3916097466453519, + "grad_norm": 0.11447544395923615, + "learning_rate": 0.00021410338980382238, + "loss": 0.38576456904411316, + "memory(GiB)": 78.33, + "step": 2021, + "token_acc": 0.887767163988371, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.39180351693067866, + "grad_norm": 0.10994785279035568, + "learning_rate": 0.00021401646532446053, + "loss": 0.40534737706184387, + "memory(GiB)": 78.33, + "step": 2022, + "token_acc": 0.8793884919720366, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.3919972872160054, + "grad_norm": 0.10715804994106293, + "learning_rate": 0.00021392951455181619, + "loss": 0.3926943242549896, + "memory(GiB)": 78.33, + "step": 2023, + "token_acc": 0.8848688300385429, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.39219105750133215, + "grad_norm": 0.10480938851833344, + "learning_rate": 0.00021384253752160235, + "loss": 0.3569415807723999, + "memory(GiB)": 78.33, + "step": 2024, + "token_acc": 0.8943907463159563, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.3923848277866589, + "grad_norm": 0.09940195083618164, + "learning_rate": 0.00021375553426954285, + "loss": 0.3529174029827118, + "memory(GiB)": 78.33, + "step": 2025, + "token_acc": 0.8969288159844653, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.39257859807198564, + "grad_norm": 0.11303528398275375, + "learning_rate": 0.00021366850483137226, + "loss": 0.40097349882125854, + "memory(GiB)": 78.33, + "step": 2026, + "token_acc": 0.8815832710978342, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.3927723683573124, + "grad_norm": 0.0959119200706482, + "learning_rate": 0.00021358144924283584, + "loss": 0.35317641496658325, + "memory(GiB)": 78.33, + "step": 2027, + "token_acc": 0.8957997557997558, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.39296613864263913, + "grad_norm": 0.11829999089241028, + "learning_rate": 0.0002134943675396898, + "loss": 0.38520050048828125, + "memory(GiB)": 78.33, + "step": 2028, + "token_acc": 0.8890619591554171, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.3931599089279659, + "grad_norm": 0.11327025294303894, + "learning_rate": 0.0002134072597577008, + "loss": 0.39436087012290955, + "memory(GiB)": 78.33, + "step": 2029, + "token_acc": 0.8814638027048528, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.3933536792132926, + "grad_norm": 0.09940643608570099, + "learning_rate": 0.0002133201259326464, + "loss": 0.3501843810081482, + "memory(GiB)": 78.33, + "step": 2030, + "token_acc": 0.8952846160250895, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.39354744949861936, + "grad_norm": 0.09761599451303482, + "learning_rate": 0.0002132329661003148, + "loss": 0.3546067178249359, + "memory(GiB)": 78.33, + "step": 2031, + "token_acc": 0.8960130106943965, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.3937412197839461, + "grad_norm": 0.10584171116352081, + "learning_rate": 0.00021314578029650493, + "loss": 0.39139533042907715, + "memory(GiB)": 78.33, + "step": 2032, + "token_acc": 0.8860886829913964, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.39393499006927285, + "grad_norm": 0.1090681329369545, + "learning_rate": 0.00021305856855702624, + "loss": 0.37363752722740173, + "memory(GiB)": 78.33, + "step": 2033, + "token_acc": 0.8886917688801884, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.39412876035459965, + "grad_norm": 0.10559553653001785, + "learning_rate": 0.00021297133091769904, + "loss": 0.36771360039711, + "memory(GiB)": 78.33, + "step": 2034, + "token_acc": 0.8921826625386997, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.3943225306399264, + "grad_norm": 0.10535501688718796, + "learning_rate": 0.00021288406741435412, + "loss": 0.36465299129486084, + "memory(GiB)": 78.33, + "step": 2035, + "token_acc": 0.8928280358598207, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.39451630092525314, + "grad_norm": 0.1058509349822998, + "learning_rate": 0.0002127967780828329, + "loss": 0.3727305829524994, + "memory(GiB)": 78.33, + "step": 2036, + "token_acc": 0.8917116094237866, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.3947100712105799, + "grad_norm": 0.10819867253303528, + "learning_rate": 0.00021270946295898755, + "loss": 0.36687490344047546, + "memory(GiB)": 78.33, + "step": 2037, + "token_acc": 0.8913552022967728, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.3949038414959066, + "grad_norm": 0.10262859612703323, + "learning_rate": 0.0002126221220786807, + "loss": 0.3651004731655121, + "memory(GiB)": 78.33, + "step": 2038, + "token_acc": 0.892148337595908, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.39509761178123337, + "grad_norm": 0.10185900330543518, + "learning_rate": 0.0002125347554777856, + "loss": 0.3557824492454529, + "memory(GiB)": 78.33, + "step": 2039, + "token_acc": 0.8953365688963917, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.3952913820665601, + "grad_norm": 0.09542527794837952, + "learning_rate": 0.000212447363192186, + "loss": 0.36669260263442993, + "memory(GiB)": 78.33, + "step": 2040, + "token_acc": 0.8931179156718667, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.39548515235188686, + "grad_norm": 0.10032487660646439, + "learning_rate": 0.00021235994525777637, + "loss": 0.3561350107192993, + "memory(GiB)": 78.33, + "step": 2041, + "token_acc": 0.8934586347077768, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.3956789226372136, + "grad_norm": 0.1082734614610672, + "learning_rate": 0.0002122725017104615, + "loss": 0.34290170669555664, + "memory(GiB)": 78.33, + "step": 2042, + "token_acc": 0.8980645927333175, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.39587269292254035, + "grad_norm": 0.10429167002439499, + "learning_rate": 0.00021218503258615688, + "loss": 0.38119545578956604, + "memory(GiB)": 78.33, + "step": 2043, + "token_acc": 0.8884871605078859, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.3960664632078671, + "grad_norm": 0.10330045223236084, + "learning_rate": 0.00021209753792078836, + "loss": 0.36754027009010315, + "memory(GiB)": 78.33, + "step": 2044, + "token_acc": 0.8926996316089867, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.39626023349319384, + "grad_norm": 0.10568582266569138, + "learning_rate": 0.00021201001775029244, + "loss": 0.37445148825645447, + "memory(GiB)": 78.33, + "step": 2045, + "token_acc": 0.890682963949396, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.3964540037785206, + "grad_norm": 0.11252961307764053, + "learning_rate": 0.00021192247211061595, + "loss": 0.40704840421676636, + "memory(GiB)": 78.33, + "step": 2046, + "token_acc": 0.8822551520536559, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.3966477740638473, + "grad_norm": 0.10527341812849045, + "learning_rate": 0.0002118349010377162, + "loss": 0.4128043055534363, + "memory(GiB)": 78.33, + "step": 2047, + "token_acc": 0.8777996488801404, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.39684154434917407, + "grad_norm": 0.1078762486577034, + "learning_rate": 0.00021174730456756106, + "loss": 0.3727009892463684, + "memory(GiB)": 78.33, + "step": 2048, + "token_acc": 0.8923324669454176, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.3970353146345008, + "grad_norm": 0.10819050669670105, + "learning_rate": 0.00021165968273612875, + "loss": 0.37524309754371643, + "memory(GiB)": 78.33, + "step": 2049, + "token_acc": 0.8889318457969734, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.39722908491982756, + "grad_norm": 0.10360529273748398, + "learning_rate": 0.0002115720355794078, + "loss": 0.37673884630203247, + "memory(GiB)": 78.33, + "step": 2050, + "token_acc": 0.8891292318527089, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.3974228552051543, + "grad_norm": 0.10623595118522644, + "learning_rate": 0.00021148436313339739, + "loss": 0.3622683882713318, + "memory(GiB)": 78.33, + "step": 2051, + "token_acc": 0.8929465428046949, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.39761662549048105, + "grad_norm": 0.1145068109035492, + "learning_rate": 0.0002113966654341069, + "loss": 0.37567758560180664, + "memory(GiB)": 78.33, + "step": 2052, + "token_acc": 0.8903638151425762, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.3978103957758078, + "grad_norm": 0.11284542083740234, + "learning_rate": 0.00021130894251755608, + "loss": 0.3825456202030182, + "memory(GiB)": 78.33, + "step": 2053, + "token_acc": 0.8858395490519843, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.39800416606113453, + "grad_norm": 0.11392559856176376, + "learning_rate": 0.00021122119441977516, + "loss": 0.38240864872932434, + "memory(GiB)": 78.33, + "step": 2054, + "token_acc": 0.8883792048929664, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.3981979363464613, + "grad_norm": 0.11121159791946411, + "learning_rate": 0.00021113342117680463, + "loss": 0.4033408761024475, + "memory(GiB)": 78.33, + "step": 2055, + "token_acc": 0.8815422034050986, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.398391706631788, + "grad_norm": 0.11232481151819229, + "learning_rate": 0.00021104562282469523, + "loss": 0.3962811529636383, + "memory(GiB)": 78.33, + "step": 2056, + "token_acc": 0.8822988914577035, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.39858547691711477, + "grad_norm": 0.1073499396443367, + "learning_rate": 0.00021095779939950827, + "loss": 0.36865997314453125, + "memory(GiB)": 78.33, + "step": 2057, + "token_acc": 0.890293265087614, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.3987792472024415, + "grad_norm": 0.10009787231683731, + "learning_rate": 0.00021086995093731506, + "loss": 0.36048176884651184, + "memory(GiB)": 78.33, + "step": 2058, + "token_acc": 0.8924960221731766, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.39897301748776826, + "grad_norm": 0.10518264770507812, + "learning_rate": 0.00021078207747419737, + "loss": 0.375562846660614, + "memory(GiB)": 78.33, + "step": 2059, + "token_acc": 0.8906748031710573, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.399166787773095, + "grad_norm": 0.11405156552791595, + "learning_rate": 0.00021069417904624713, + "loss": 0.39197370409965515, + "memory(GiB)": 78.33, + "step": 2060, + "token_acc": 0.8833886430678466, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.39936055805842174, + "grad_norm": 0.11229176819324493, + "learning_rate": 0.00021060625568956672, + "loss": 0.38142168521881104, + "memory(GiB)": 78.33, + "step": 2061, + "token_acc": 0.8872789019385878, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.3995543283437485, + "grad_norm": 0.10720710456371307, + "learning_rate": 0.0002105183074402685, + "loss": 0.3908834755420685, + "memory(GiB)": 78.33, + "step": 2062, + "token_acc": 0.8867632578421719, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.39974809862907523, + "grad_norm": 0.11731129139661789, + "learning_rate": 0.00021043033433447523, + "loss": 0.3812180757522583, + "memory(GiB)": 78.33, + "step": 2063, + "token_acc": 0.8876133286235723, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.399941868914402, + "grad_norm": 0.1061178669333458, + "learning_rate": 0.00021034233640831985, + "loss": 0.35581403970718384, + "memory(GiB)": 78.33, + "step": 2064, + "token_acc": 0.8961584794538884, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.4001356391997287, + "grad_norm": 0.12066232413053513, + "learning_rate": 0.0002102543136979454, + "loss": 0.407576322555542, + "memory(GiB)": 78.33, + "step": 2065, + "token_acc": 0.8834347797630735, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.40032940948505547, + "grad_norm": 0.10802853107452393, + "learning_rate": 0.00021016626623950523, + "loss": 0.3750949800014496, + "memory(GiB)": 78.33, + "step": 2066, + "token_acc": 0.8900487525093204, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.4005231797703822, + "grad_norm": 0.10871551930904388, + "learning_rate": 0.00021007819406916283, + "loss": 0.3510313928127289, + "memory(GiB)": 78.33, + "step": 2067, + "token_acc": 0.8955925167910985, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.40071695005570895, + "grad_norm": 0.10944053530693054, + "learning_rate": 0.0002099900972230917, + "loss": 0.3768633306026459, + "memory(GiB)": 78.33, + "step": 2068, + "token_acc": 0.8907095472471034, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.4009107203410357, + "grad_norm": 0.11056249588727951, + "learning_rate": 0.0002099019757374757, + "loss": 0.38695505261421204, + "memory(GiB)": 78.33, + "step": 2069, + "token_acc": 0.8846898656182149, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.40110449062636244, + "grad_norm": 0.10126353055238724, + "learning_rate": 0.00020981382964850858, + "loss": 0.3784712851047516, + "memory(GiB)": 78.33, + "step": 2070, + "token_acc": 0.8864102038725541, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.4012982609116892, + "grad_norm": 0.11109844595193863, + "learning_rate": 0.00020972565899239441, + "loss": 0.39917057752609253, + "memory(GiB)": 78.33, + "step": 2071, + "token_acc": 0.884891448825875, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.40149203119701593, + "grad_norm": 0.10675584524869919, + "learning_rate": 0.0002096374638053472, + "loss": 0.3992624580860138, + "memory(GiB)": 78.33, + "step": 2072, + "token_acc": 0.8833933488667656, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.4016858014823427, + "grad_norm": 0.09866248071193695, + "learning_rate": 0.000209549244123591, + "loss": 0.3732259273529053, + "memory(GiB)": 78.33, + "step": 2073, + "token_acc": 0.891376350844958, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.4018795717676694, + "grad_norm": 0.11382555216550827, + "learning_rate": 0.00020946099998336019, + "loss": 0.40181201696395874, + "memory(GiB)": 78.33, + "step": 2074, + "token_acc": 0.8830451706345614, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.40207334205299616, + "grad_norm": 0.09255445003509521, + "learning_rate": 0.0002093727314208989, + "loss": 0.317599356174469, + "memory(GiB)": 78.33, + "step": 2075, + "token_acc": 0.9049172687019342, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.4022671123383229, + "grad_norm": 0.12287932634353638, + "learning_rate": 0.00020928443847246134, + "loss": 0.41253185272216797, + "memory(GiB)": 78.33, + "step": 2076, + "token_acc": 0.8792145844223732, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.40246088262364965, + "grad_norm": 0.1004108339548111, + "learning_rate": 0.0002091961211743119, + "loss": 0.32616835832595825, + "memory(GiB)": 78.33, + "step": 2077, + "token_acc": 0.9020430729022693, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.4026546529089764, + "grad_norm": 0.10672204196453094, + "learning_rate": 0.00020910777956272485, + "loss": 0.37214604020118713, + "memory(GiB)": 78.33, + "step": 2078, + "token_acc": 0.8916562179414111, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.40284842319430314, + "grad_norm": 0.10667048394680023, + "learning_rate": 0.00020901941367398446, + "loss": 0.3868388533592224, + "memory(GiB)": 78.33, + "step": 2079, + "token_acc": 0.886404833836858, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.4030421934796299, + "grad_norm": 0.1087040826678276, + "learning_rate": 0.0002089310235443849, + "loss": 0.3963964879512787, + "memory(GiB)": 78.33, + "step": 2080, + "token_acc": 0.8864254703328509, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.40323596376495663, + "grad_norm": 0.10210565477609634, + "learning_rate": 0.0002088426092102305, + "loss": 0.38553428649902344, + "memory(GiB)": 78.33, + "step": 2081, + "token_acc": 0.8849096532638776, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.4034297340502834, + "grad_norm": 0.10996957868337631, + "learning_rate": 0.0002087541707078353, + "loss": 0.40852048993110657, + "memory(GiB)": 78.33, + "step": 2082, + "token_acc": 0.8819868995633188, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.4036235043356101, + "grad_norm": 0.1148100420832634, + "learning_rate": 0.00020866570807352337, + "loss": 0.40155094861984253, + "memory(GiB)": 78.33, + "step": 2083, + "token_acc": 0.8844750224349387, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.40381727462093686, + "grad_norm": 0.09997189044952393, + "learning_rate": 0.0002085772213436288, + "loss": 0.35030126571655273, + "memory(GiB)": 78.33, + "step": 2084, + "token_acc": 0.8974117047048323, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.4040110449062636, + "grad_norm": 0.10798052698373795, + "learning_rate": 0.00020848871055449537, + "loss": 0.3880666196346283, + "memory(GiB)": 78.33, + "step": 2085, + "token_acc": 0.8854523021703774, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.40420481519159035, + "grad_norm": 0.11830330640077591, + "learning_rate": 0.00020840017574247683, + "loss": 0.446191668510437, + "memory(GiB)": 78.33, + "step": 2086, + "token_acc": 0.8699704224566103, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.4043985854769171, + "grad_norm": 0.0969855859875679, + "learning_rate": 0.00020831161694393683, + "loss": 0.36765626072883606, + "memory(GiB)": 78.33, + "step": 2087, + "token_acc": 0.892250186892599, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.40459235576224384, + "grad_norm": 0.10226907581090927, + "learning_rate": 0.00020822303419524893, + "loss": 0.36062684655189514, + "memory(GiB)": 78.33, + "step": 2088, + "token_acc": 0.893473640557773, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.4047861260475706, + "grad_norm": 0.12443238496780396, + "learning_rate": 0.0002081344275327963, + "loss": 0.4231499135494232, + "memory(GiB)": 78.33, + "step": 2089, + "token_acc": 0.8769953656024717, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.4049798963328973, + "grad_norm": 0.11125290393829346, + "learning_rate": 0.00020804579699297218, + "loss": 0.38650333881378174, + "memory(GiB)": 78.33, + "step": 2090, + "token_acc": 0.8851398261545724, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.40517366661822407, + "grad_norm": 0.0958833247423172, + "learning_rate": 0.00020795714261217949, + "loss": 0.34758636355400085, + "memory(GiB)": 78.33, + "step": 2091, + "token_acc": 0.8987275662330263, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.4053674369035508, + "grad_norm": 0.10977054387331009, + "learning_rate": 0.00020786846442683095, + "loss": 0.3909391760826111, + "memory(GiB)": 78.33, + "step": 2092, + "token_acc": 0.886636506003132, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.40556120718887756, + "grad_norm": 0.10875298827886581, + "learning_rate": 0.00020777976247334906, + "loss": 0.3958970308303833, + "memory(GiB)": 78.33, + "step": 2093, + "token_acc": 0.8842546456415968, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.4057549774742043, + "grad_norm": 0.11530545353889465, + "learning_rate": 0.00020769103678816616, + "loss": 0.3912769556045532, + "memory(GiB)": 78.33, + "step": 2094, + "token_acc": 0.8857514450867052, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.4059487477595311, + "grad_norm": 0.11220169067382812, + "learning_rate": 0.00020760228740772423, + "loss": 0.39823243021965027, + "memory(GiB)": 78.33, + "step": 2095, + "token_acc": 0.8826204060355064, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.40614251804485785, + "grad_norm": 0.10903418809175491, + "learning_rate": 0.00020751351436847497, + "loss": 0.4057612121105194, + "memory(GiB)": 78.33, + "step": 2096, + "token_acc": 0.883694474539545, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.4063362883301846, + "grad_norm": 0.11058089882135391, + "learning_rate": 0.00020742471770687998, + "loss": 0.40834489464759827, + "memory(GiB)": 78.33, + "step": 2097, + "token_acc": 0.8802559666034989, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.40653005861551134, + "grad_norm": 0.10095040500164032, + "learning_rate": 0.00020733589745941034, + "loss": 0.3721862733364105, + "memory(GiB)": 78.33, + "step": 2098, + "token_acc": 0.8911490351522632, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.4067238289008381, + "grad_norm": 0.12033317983150482, + "learning_rate": 0.00020724705366254693, + "loss": 0.43703311681747437, + "memory(GiB)": 78.33, + "step": 2099, + "token_acc": 0.874896265560166, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.4069175991861648, + "grad_norm": 0.10075613856315613, + "learning_rate": 0.0002071581863527803, + "loss": 0.33554336428642273, + "memory(GiB)": 78.33, + "step": 2100, + "token_acc": 0.900580013797685, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.40711136947149157, + "grad_norm": 0.10525397956371307, + "learning_rate": 0.00020706929556661068, + "loss": 0.37453317642211914, + "memory(GiB)": 78.33, + "step": 2101, + "token_acc": 0.8898639629661186, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.4073051397568183, + "grad_norm": 0.10155075043439865, + "learning_rate": 0.00020698038134054782, + "loss": 0.38334396481513977, + "memory(GiB)": 78.33, + "step": 2102, + "token_acc": 0.885774579297657, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.40749891004214506, + "grad_norm": 0.11583290249109268, + "learning_rate": 0.00020689144371111118, + "loss": 0.40256598591804504, + "memory(GiB)": 78.33, + "step": 2103, + "token_acc": 0.8832213494842434, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.4076926803274718, + "grad_norm": 0.10212710499763489, + "learning_rate": 0.00020680248271482993, + "loss": 0.3900299668312073, + "memory(GiB)": 78.33, + "step": 2104, + "token_acc": 0.8837178537411366, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.40788645061279855, + "grad_norm": 0.1136767715215683, + "learning_rate": 0.0002067134983882427, + "loss": 0.40916168689727783, + "memory(GiB)": 78.33, + "step": 2105, + "token_acc": 0.8802716981132076, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.4080802208981253, + "grad_norm": 0.1174011304974556, + "learning_rate": 0.00020662449076789768, + "loss": 0.3446325957775116, + "memory(GiB)": 78.33, + "step": 2106, + "token_acc": 0.8981224992305324, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.40827399118345203, + "grad_norm": 0.10661471635103226, + "learning_rate": 0.00020653545989035278, + "loss": 0.3691784143447876, + "memory(GiB)": 78.33, + "step": 2107, + "token_acc": 0.8892190390760778, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.4084677614687788, + "grad_norm": 0.10399375855922699, + "learning_rate": 0.00020644640579217533, + "loss": 0.3859068751335144, + "memory(GiB)": 78.33, + "step": 2108, + "token_acc": 0.8871030059500812, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.4086615317541055, + "grad_norm": 0.10391335189342499, + "learning_rate": 0.0002063573285099422, + "loss": 0.3771337866783142, + "memory(GiB)": 78.33, + "step": 2109, + "token_acc": 0.8897867506583363, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.40885530203943227, + "grad_norm": 0.11487588286399841, + "learning_rate": 0.00020626822808023993, + "loss": 0.3902420699596405, + "memory(GiB)": 78.33, + "step": 2110, + "token_acc": 0.8852734179451692, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.409049072324759, + "grad_norm": 0.11222491413354874, + "learning_rate": 0.00020617910453966438, + "loss": 0.3725607693195343, + "memory(GiB)": 78.33, + "step": 2111, + "token_acc": 0.8908452798762715, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.40924284261008576, + "grad_norm": 0.10845697671175003, + "learning_rate": 0.00020608995792482102, + "loss": 0.36426377296447754, + "memory(GiB)": 78.33, + "step": 2112, + "token_acc": 0.8920298507462686, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.4094366128954125, + "grad_norm": 0.11745316535234451, + "learning_rate": 0.00020600078827232469, + "loss": 0.44262993335723877, + "memory(GiB)": 78.33, + "step": 2113, + "token_acc": 0.8721979182380449, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.40963038318073924, + "grad_norm": 0.10488930344581604, + "learning_rate": 0.00020591159561879991, + "loss": 0.3662855923175812, + "memory(GiB)": 78.33, + "step": 2114, + "token_acc": 0.8902525438621355, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.409824153466066, + "grad_norm": 0.10289178043603897, + "learning_rate": 0.00020582238000088033, + "loss": 0.37403732538223267, + "memory(GiB)": 78.33, + "step": 2115, + "token_acc": 0.8886182232818493, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.41001792375139273, + "grad_norm": 0.10993564128875732, + "learning_rate": 0.0002057331414552093, + "loss": 0.400392085313797, + "memory(GiB)": 78.33, + "step": 2116, + "token_acc": 0.882896215297761, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.4102116940367195, + "grad_norm": 0.10764973610639572, + "learning_rate": 0.00020564388001843945, + "loss": 0.3797903060913086, + "memory(GiB)": 78.33, + "step": 2117, + "token_acc": 0.8884940778341793, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.4104054643220462, + "grad_norm": 0.11092360317707062, + "learning_rate": 0.00020555459572723294, + "loss": 0.3895651698112488, + "memory(GiB)": 78.33, + "step": 2118, + "token_acc": 0.8856181915590213, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.41059923460737296, + "grad_norm": 0.11945401877164841, + "learning_rate": 0.00020546528861826107, + "loss": 0.40205317735671997, + "memory(GiB)": 78.33, + "step": 2119, + "token_acc": 0.8840717802127998, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.4107930048926997, + "grad_norm": 0.1141401007771492, + "learning_rate": 0.0002053759587282048, + "loss": 0.42232364416122437, + "memory(GiB)": 78.33, + "step": 2120, + "token_acc": 0.8797556077277742, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.41098677517802645, + "grad_norm": 0.10188201814889908, + "learning_rate": 0.00020528660609375426, + "loss": 0.36200839281082153, + "memory(GiB)": 78.33, + "step": 2121, + "token_acc": 0.8925009859710407, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.4111805454633532, + "grad_norm": 0.09827426820993423, + "learning_rate": 0.000205197230751609, + "loss": 0.3723769187927246, + "memory(GiB)": 78.33, + "step": 2122, + "token_acc": 0.8900832517140059, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.41137431574867994, + "grad_norm": 0.1049884557723999, + "learning_rate": 0.00020510783273847778, + "loss": 0.37664487957954407, + "memory(GiB)": 78.33, + "step": 2123, + "token_acc": 0.8893548130258234, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.4115680860340067, + "grad_norm": 0.10434847325086594, + "learning_rate": 0.00020501841209107896, + "loss": 0.3857555091381073, + "memory(GiB)": 78.33, + "step": 2124, + "token_acc": 0.8891336270190896, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.41176185631933343, + "grad_norm": 0.10164911299943924, + "learning_rate": 0.00020492896884613987, + "loss": 0.3675006628036499, + "memory(GiB)": 78.33, + "step": 2125, + "token_acc": 0.8910501783714814, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.4119556266046602, + "grad_norm": 0.10134454071521759, + "learning_rate": 0.00020483950304039724, + "loss": 0.3476465940475464, + "memory(GiB)": 78.33, + "step": 2126, + "token_acc": 0.896970502258836, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.4121493968899869, + "grad_norm": 0.10433205217123032, + "learning_rate": 0.00020475001471059712, + "loss": 0.3762246072292328, + "memory(GiB)": 78.33, + "step": 2127, + "token_acc": 0.8882155392268263, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.41234316717531366, + "grad_norm": 0.10764237493276596, + "learning_rate": 0.0002046605038934948, + "loss": 0.3889080286026001, + "memory(GiB)": 78.33, + "step": 2128, + "token_acc": 0.8841791697411931, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.4125369374606404, + "grad_norm": 0.10777831077575684, + "learning_rate": 0.00020457097062585473, + "loss": 0.38341259956359863, + "memory(GiB)": 78.33, + "step": 2129, + "token_acc": 0.8884803921568627, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.41273070774596715, + "grad_norm": 0.15417581796646118, + "learning_rate": 0.00020448141494445066, + "loss": 0.37628644704818726, + "memory(GiB)": 78.33, + "step": 2130, + "token_acc": 0.8896055119537908, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.4129244780312939, + "grad_norm": 0.1092265397310257, + "learning_rate": 0.00020439183688606547, + "loss": 0.367519348859787, + "memory(GiB)": 78.33, + "step": 2131, + "token_acc": 0.8922348315610759, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.41311824831662064, + "grad_norm": 0.10159304738044739, + "learning_rate": 0.0002043022364874913, + "loss": 0.33829817175865173, + "memory(GiB)": 78.33, + "step": 2132, + "token_acc": 0.8995331695331695, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.4133120186019474, + "grad_norm": 0.1057777926325798, + "learning_rate": 0.00020421261378552948, + "loss": 0.36473432183265686, + "memory(GiB)": 78.33, + "step": 2133, + "token_acc": 0.8916222307734594, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.41350578888727413, + "grad_norm": 0.10533451288938522, + "learning_rate": 0.00020412296881699039, + "loss": 0.3586021065711975, + "memory(GiB)": 78.33, + "step": 2134, + "token_acc": 0.8953077991205739, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.4136995591726009, + "grad_norm": 0.10619760304689407, + "learning_rate": 0.00020403330161869373, + "loss": 0.35481902956962585, + "memory(GiB)": 78.33, + "step": 2135, + "token_acc": 0.8953304521977443, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.4138933294579276, + "grad_norm": 0.11591517180204391, + "learning_rate": 0.0002039436122274681, + "loss": 0.40828073024749756, + "memory(GiB)": 78.33, + "step": 2136, + "token_acc": 0.8834233806329039, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.41408709974325436, + "grad_norm": 0.11356504261493683, + "learning_rate": 0.00020385390068015146, + "loss": 0.3991680443286896, + "memory(GiB)": 78.33, + "step": 2137, + "token_acc": 0.8845186434119241, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.4142808700285811, + "grad_norm": 0.11383282393217087, + "learning_rate": 0.00020376416701359067, + "loss": 0.41105103492736816, + "memory(GiB)": 78.33, + "step": 2138, + "token_acc": 0.8808600337268128, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.41447464031390785, + "grad_norm": 0.09987689554691315, + "learning_rate": 0.00020367441126464177, + "loss": 0.3558513820171356, + "memory(GiB)": 78.33, + "step": 2139, + "token_acc": 0.8945576407506702, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.4146684105992346, + "grad_norm": 0.11728887259960175, + "learning_rate": 0.00020358463347016988, + "loss": 0.41319242119789124, + "memory(GiB)": 78.33, + "step": 2140, + "token_acc": 0.880601774015885, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.41486218088456134, + "grad_norm": 0.10223786532878876, + "learning_rate": 0.0002034948336670492, + "loss": 0.3577488362789154, + "memory(GiB)": 78.33, + "step": 2141, + "token_acc": 0.892794648592376, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.4150559511698881, + "grad_norm": 0.10343615710735321, + "learning_rate": 0.00020340501189216285, + "loss": 0.36109933257102966, + "memory(GiB)": 78.33, + "step": 2142, + "token_acc": 0.8946854716704895, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.4152497214552148, + "grad_norm": 0.10884758830070496, + "learning_rate": 0.000203315168182403, + "loss": 0.38284680247306824, + "memory(GiB)": 78.33, + "step": 2143, + "token_acc": 0.8858148616687751, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.41544349174054157, + "grad_norm": 0.10763585567474365, + "learning_rate": 0.00020322530257467104, + "loss": 0.3719366788864136, + "memory(GiB)": 78.33, + "step": 2144, + "token_acc": 0.8914604948124502, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.4156372620258683, + "grad_norm": 0.10268665105104446, + "learning_rate": 0.00020313541510587707, + "loss": 0.3508341312408447, + "memory(GiB)": 78.33, + "step": 2145, + "token_acc": 0.8950760245695297, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.41583103231119506, + "grad_norm": 0.10579564422369003, + "learning_rate": 0.00020304550581294026, + "loss": 0.39762815833091736, + "memory(GiB)": 78.33, + "step": 2146, + "token_acc": 0.8849706763321503, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.4160248025965218, + "grad_norm": 0.09803339838981628, + "learning_rate": 0.00020295557473278886, + "loss": 0.33986738324165344, + "memory(GiB)": 78.33, + "step": 2147, + "token_acc": 0.8999627421758569, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.41621857288184855, + "grad_norm": 0.1106083020567894, + "learning_rate": 0.00020286562190235998, + "loss": 0.3605062961578369, + "memory(GiB)": 78.33, + "step": 2148, + "token_acc": 0.8942120900468518, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.4164123431671753, + "grad_norm": 0.11620527505874634, + "learning_rate": 0.00020277564735859957, + "loss": 0.42198172211647034, + "memory(GiB)": 78.33, + "step": 2149, + "token_acc": 0.8788444418918531, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.41660611345250204, + "grad_norm": 0.0997370034456253, + "learning_rate": 0.0002026856511384627, + "loss": 0.33660730719566345, + "memory(GiB)": 78.33, + "step": 2150, + "token_acc": 0.9009054193173719, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.4167998837378288, + "grad_norm": 0.10104697942733765, + "learning_rate": 0.00020259563327891316, + "loss": 0.3682084381580353, + "memory(GiB)": 78.33, + "step": 2151, + "token_acc": 0.8905543542362886, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.4169936540231555, + "grad_norm": 0.10348115861415863, + "learning_rate": 0.00020250559381692373, + "loss": 0.35385698080062866, + "memory(GiB)": 78.33, + "step": 2152, + "token_acc": 0.8960956832554303, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.41718742430848227, + "grad_norm": 0.10488869249820709, + "learning_rate": 0.00020241553278947604, + "loss": 0.36309656500816345, + "memory(GiB)": 78.33, + "step": 2153, + "token_acc": 0.8921550221268607, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.417381194593809, + "grad_norm": 0.11151447147130966, + "learning_rate": 0.00020232545023356058, + "loss": 0.37561148405075073, + "memory(GiB)": 78.33, + "step": 2154, + "token_acc": 0.889549997111663, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.41757496487913576, + "grad_norm": 0.11074910312891006, + "learning_rate": 0.0002022353461861767, + "loss": 0.3915635049343109, + "memory(GiB)": 78.33, + "step": 2155, + "token_acc": 0.8837005402042029, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.41776873516446256, + "grad_norm": 0.10543368011713028, + "learning_rate": 0.00020214522068433247, + "loss": 0.34513017535209656, + "memory(GiB)": 78.33, + "step": 2156, + "token_acc": 0.8980891719745223, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.4179625054497893, + "grad_norm": 0.0990806296467781, + "learning_rate": 0.00020205507376504494, + "loss": 0.34623822569847107, + "memory(GiB)": 78.33, + "step": 2157, + "token_acc": 0.8984733024327651, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.41815627573511605, + "grad_norm": 0.0954173356294632, + "learning_rate": 0.00020196490546533987, + "loss": 0.3334534168243408, + "memory(GiB)": 78.33, + "step": 2158, + "token_acc": 0.9018397113381105, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.4183500460204428, + "grad_norm": 0.10484931617975235, + "learning_rate": 0.0002018747158222517, + "loss": 0.33561134338378906, + "memory(GiB)": 78.33, + "step": 2159, + "token_acc": 0.9003498385360603, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.41854381630576953, + "grad_norm": 0.10665947943925858, + "learning_rate": 0.00020178450487282385, + "loss": 0.3735751509666443, + "memory(GiB)": 78.33, + "step": 2160, + "token_acc": 0.892504140620826, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.4187375865910963, + "grad_norm": 0.10096178948879242, + "learning_rate": 0.00020169427265410837, + "loss": 0.3531975746154785, + "memory(GiB)": 78.33, + "step": 2161, + "token_acc": 0.8952592753308743, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.418931356876423, + "grad_norm": 0.11227719485759735, + "learning_rate": 0.00020160401920316597, + "loss": 0.3715410828590393, + "memory(GiB)": 78.33, + "step": 2162, + "token_acc": 0.8928775907477384, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.41912512716174977, + "grad_norm": 0.10699176788330078, + "learning_rate": 0.0002015137445570663, + "loss": 0.3732505738735199, + "memory(GiB)": 78.33, + "step": 2163, + "token_acc": 0.8906586310804994, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.4193188974470765, + "grad_norm": 0.0976463332772255, + "learning_rate": 0.0002014234487528875, + "loss": 0.37260702252388, + "memory(GiB)": 78.33, + "step": 2164, + "token_acc": 0.8918560748130923, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.41951266773240325, + "grad_norm": 0.10294061154127121, + "learning_rate": 0.00020133313182771646, + "loss": 0.37537047266960144, + "memory(GiB)": 78.33, + "step": 2165, + "token_acc": 0.8896890512075307, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.41970643801773, + "grad_norm": 0.10132710635662079, + "learning_rate": 0.00020124279381864883, + "loss": 0.35512280464172363, + "memory(GiB)": 78.33, + "step": 2166, + "token_acc": 0.8946123521681998, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.41990020830305674, + "grad_norm": 0.1205829605460167, + "learning_rate": 0.00020115243476278883, + "loss": 0.4342115521430969, + "memory(GiB)": 78.33, + "step": 2167, + "token_acc": 0.8729766390354182, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.4200939785883835, + "grad_norm": 0.10426265001296997, + "learning_rate": 0.00020106205469724937, + "loss": 0.36752408742904663, + "memory(GiB)": 78.33, + "step": 2168, + "token_acc": 0.8942200862982198, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.42028774887371023, + "grad_norm": 0.10518850386142731, + "learning_rate": 0.00020097165365915188, + "loss": 0.36646583676338196, + "memory(GiB)": 78.33, + "step": 2169, + "token_acc": 0.8950598184818482, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.420481519159037, + "grad_norm": 0.097692109644413, + "learning_rate": 0.00020088123168562663, + "loss": 0.31673404574394226, + "memory(GiB)": 78.33, + "step": 2170, + "token_acc": 0.9040827884466869, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.4206752894443637, + "grad_norm": 0.10072191804647446, + "learning_rate": 0.00020079078881381232, + "loss": 0.3458814322948456, + "memory(GiB)": 78.33, + "step": 2171, + "token_acc": 0.8972006834012354, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.42086905972969046, + "grad_norm": 0.10148821771144867, + "learning_rate": 0.00020070032508085617, + "loss": 0.362411230802536, + "memory(GiB)": 78.33, + "step": 2172, + "token_acc": 0.893191234333064, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.4210628300150172, + "grad_norm": 0.09655621647834778, + "learning_rate": 0.0002006098405239142, + "loss": 0.36690768599510193, + "memory(GiB)": 78.33, + "step": 2173, + "token_acc": 0.8898027945694182, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.42125660030034395, + "grad_norm": 0.10610742121934891, + "learning_rate": 0.00020051933518015077, + "loss": 0.388561487197876, + "memory(GiB)": 78.33, + "step": 2174, + "token_acc": 0.8857004153406333, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.4214503705856707, + "grad_norm": 0.11180911213159561, + "learning_rate": 0.00020042880908673888, + "loss": 0.38997843861579895, + "memory(GiB)": 78.33, + "step": 2175, + "token_acc": 0.8855060034305318, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.42164414087099744, + "grad_norm": 0.11040447652339935, + "learning_rate": 0.00020033826228085997, + "loss": 0.3600209355354309, + "memory(GiB)": 78.33, + "step": 2176, + "token_acc": 0.8954944743553415, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.4218379111563242, + "grad_norm": 0.11115849018096924, + "learning_rate": 0.0002002476947997042, + "loss": 0.3961622714996338, + "memory(GiB)": 78.33, + "step": 2177, + "token_acc": 0.8841036617111124, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.42203168144165093, + "grad_norm": 0.10502826422452927, + "learning_rate": 0.0002001571066804699, + "loss": 0.38119229674339294, + "memory(GiB)": 78.33, + "step": 2178, + "token_acc": 0.8862857463902466, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.4222254517269777, + "grad_norm": 0.11460091918706894, + "learning_rate": 0.00020006649796036412, + "loss": 0.4165613353252411, + "memory(GiB)": 78.33, + "step": 2179, + "token_acc": 0.8790522573127372, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.4224192220123044, + "grad_norm": 0.09722710400819778, + "learning_rate": 0.0001999758686766023, + "loss": 0.3486678898334503, + "memory(GiB)": 78.33, + "step": 2180, + "token_acc": 0.8953782314223028, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.42261299229763116, + "grad_norm": 0.10160472989082336, + "learning_rate": 0.0001998852188664083, + "loss": 0.3687483072280884, + "memory(GiB)": 78.33, + "step": 2181, + "token_acc": 0.8916645094486151, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.4228067625829579, + "grad_norm": 0.11015883833169937, + "learning_rate": 0.00019979454856701442, + "loss": 0.36141785979270935, + "memory(GiB)": 78.33, + "step": 2182, + "token_acc": 0.895195164429874, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.42300053286828465, + "grad_norm": 0.10014893114566803, + "learning_rate": 0.00019970385781566146, + "loss": 0.3551277220249176, + "memory(GiB)": 78.33, + "step": 2183, + "token_acc": 0.8938043448633031, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.4231943031536114, + "grad_norm": 0.08740631490945816, + "learning_rate": 0.00019961314664959849, + "loss": 0.3233758211135864, + "memory(GiB)": 78.33, + "step": 2184, + "token_acc": 0.9035294117647059, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.42338807343893814, + "grad_norm": 0.10040253400802612, + "learning_rate": 0.00019952241510608302, + "loss": 0.34599989652633667, + "memory(GiB)": 78.33, + "step": 2185, + "token_acc": 0.8977024740700782, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.4235818437242649, + "grad_norm": 0.10159041732549667, + "learning_rate": 0.00019943166322238095, + "loss": 0.34493720531463623, + "memory(GiB)": 78.33, + "step": 2186, + "token_acc": 0.8977762408895037, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.42377561400959163, + "grad_norm": 0.11545497179031372, + "learning_rate": 0.00019934089103576652, + "loss": 0.3942737877368927, + "memory(GiB)": 78.33, + "step": 2187, + "token_acc": 0.8845215505557061, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.42396938429491837, + "grad_norm": 0.10246077924966812, + "learning_rate": 0.00019925009858352233, + "loss": 0.37554022669792175, + "memory(GiB)": 78.33, + "step": 2188, + "token_acc": 0.8895408954606556, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.4241631545802451, + "grad_norm": 0.12075801938772202, + "learning_rate": 0.00019915928590293918, + "loss": 0.40932536125183105, + "memory(GiB)": 78.33, + "step": 2189, + "token_acc": 0.8816431763492609, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.42435692486557186, + "grad_norm": 0.12044385075569153, + "learning_rate": 0.00019906845303131643, + "loss": 0.4146193861961365, + "memory(GiB)": 78.33, + "step": 2190, + "token_acc": 0.8788535074955163, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.4245506951508986, + "grad_norm": 0.10225894302129745, + "learning_rate": 0.00019897760000596145, + "loss": 0.3517759144306183, + "memory(GiB)": 78.33, + "step": 2191, + "token_acc": 0.8963014924651073, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.42474446543622535, + "grad_norm": 0.10850612074136734, + "learning_rate": 0.00019888672686419005, + "loss": 0.34386181831359863, + "memory(GiB)": 78.33, + "step": 2192, + "token_acc": 0.898326711713018, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.4249382357215521, + "grad_norm": 0.10484492033720016, + "learning_rate": 0.0001987958336433263, + "loss": 0.36306411027908325, + "memory(GiB)": 78.33, + "step": 2193, + "token_acc": 0.8922604656216891, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.42513200600687884, + "grad_norm": 0.09952452033758163, + "learning_rate": 0.00019870492038070252, + "loss": 0.3434096574783325, + "memory(GiB)": 78.33, + "step": 2194, + "token_acc": 0.8980537301407457, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.4253257762922056, + "grad_norm": 0.11556005477905273, + "learning_rate": 0.00019861398711365917, + "loss": 0.3726232945919037, + "memory(GiB)": 78.33, + "step": 2195, + "token_acc": 0.890572481838881, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.4255195465775323, + "grad_norm": 0.0965765044093132, + "learning_rate": 0.00019852303387954496, + "loss": 0.34968358278274536, + "memory(GiB)": 78.33, + "step": 2196, + "token_acc": 0.8960782916458824, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.42571331686285907, + "grad_norm": 0.11270653456449509, + "learning_rate": 0.00019843206071571692, + "loss": 0.37671494483947754, + "memory(GiB)": 78.33, + "step": 2197, + "token_acc": 0.8884828552286984, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.4259070871481858, + "grad_norm": 0.10049509257078171, + "learning_rate": 0.0001983410676595401, + "loss": 0.36629050970077515, + "memory(GiB)": 78.33, + "step": 2198, + "token_acc": 0.893262368452885, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.42610085743351256, + "grad_norm": 0.10621780902147293, + "learning_rate": 0.0001982500547483878, + "loss": 0.3855476677417755, + "memory(GiB)": 78.33, + "step": 2199, + "token_acc": 0.8878023674729799, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.4262946277188393, + "grad_norm": 0.10796981304883957, + "learning_rate": 0.00019815902201964153, + "loss": 0.34390783309936523, + "memory(GiB)": 78.33, + "step": 2200, + "token_acc": 0.8990232532326988, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.42648839800416605, + "grad_norm": 0.11975818127393723, + "learning_rate": 0.00019806796951069087, + "loss": 0.3976423442363739, + "memory(GiB)": 78.33, + "step": 2201, + "token_acc": 0.885062679353572, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.4266821682894928, + "grad_norm": 0.10373754054307938, + "learning_rate": 0.00019797689725893337, + "loss": 0.3727421462535858, + "memory(GiB)": 78.33, + "step": 2202, + "token_acc": 0.8891960318280459, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.42687593857481954, + "grad_norm": 0.10288981348276138, + "learning_rate": 0.00019788580530177507, + "loss": 0.35971593856811523, + "memory(GiB)": 78.33, + "step": 2203, + "token_acc": 0.8920195439739413, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.4270697088601463, + "grad_norm": 0.10778756439685822, + "learning_rate": 0.0001977946936766298, + "loss": 0.38950392603874207, + "memory(GiB)": 78.33, + "step": 2204, + "token_acc": 0.8854846286340623, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.427263479145473, + "grad_norm": 0.1137860119342804, + "learning_rate": 0.0001977035624209195, + "loss": 0.3468484580516815, + "memory(GiB)": 78.33, + "step": 2205, + "token_acc": 0.8975911374468634, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.42745724943079977, + "grad_norm": 0.11108643561601639, + "learning_rate": 0.00019761241157207428, + "loss": 0.3628512918949127, + "memory(GiB)": 78.33, + "step": 2206, + "token_acc": 0.8934348239771646, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.4276510197161265, + "grad_norm": 0.09750779718160629, + "learning_rate": 0.00019752124116753224, + "loss": 0.3502158522605896, + "memory(GiB)": 78.33, + "step": 2207, + "token_acc": 0.8962943407094514, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.42784479000145326, + "grad_norm": 0.09157220274209976, + "learning_rate": 0.0001974300512447395, + "loss": 0.33457863330841064, + "memory(GiB)": 78.33, + "step": 2208, + "token_acc": 0.900385138196647, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.42803856028678, + "grad_norm": 0.11325064301490784, + "learning_rate": 0.0001973388418411502, + "loss": 0.37056490778923035, + "memory(GiB)": 78.33, + "step": 2209, + "token_acc": 0.8906734699794161, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.42823233057210675, + "grad_norm": 0.11709318310022354, + "learning_rate": 0.00019724761299422654, + "loss": 0.40491345524787903, + "memory(GiB)": 78.33, + "step": 2210, + "token_acc": 0.8828931933278018, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.4284261008574335, + "grad_norm": 0.10216681659221649, + "learning_rate": 0.00019715636474143864, + "loss": 0.38466590642929077, + "memory(GiB)": 78.33, + "step": 2211, + "token_acc": 0.8875497597803707, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.42861987114276023, + "grad_norm": 0.1109510064125061, + "learning_rate": 0.00019706509712026456, + "loss": 0.38734912872314453, + "memory(GiB)": 78.33, + "step": 2212, + "token_acc": 0.8852111060119321, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.428813641428087, + "grad_norm": 0.10640700161457062, + "learning_rate": 0.00019697381016819043, + "loss": 0.3618239462375641, + "memory(GiB)": 78.33, + "step": 2213, + "token_acc": 0.8929901199097012, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.4290074117134137, + "grad_norm": 0.13030792772769928, + "learning_rate": 0.00019688250392271026, + "loss": 0.3900720775127411, + "memory(GiB)": 78.33, + "step": 2214, + "token_acc": 0.8874647759294609, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.42920118199874047, + "grad_norm": 0.09482412040233612, + "learning_rate": 0.00019679117842132592, + "loss": 0.35719773173332214, + "memory(GiB)": 78.33, + "step": 2215, + "token_acc": 0.8954372137062723, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.4293949522840672, + "grad_norm": 0.10285639762878418, + "learning_rate": 0.00019669983370154722, + "loss": 0.3917454481124878, + "memory(GiB)": 78.33, + "step": 2216, + "token_acc": 0.8865480649188514, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.429588722569394, + "grad_norm": 0.11368583887815475, + "learning_rate": 0.000196608469800892, + "loss": 0.42137694358825684, + "memory(GiB)": 78.33, + "step": 2217, + "token_acc": 0.8776351492381549, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.42978249285472075, + "grad_norm": 0.10096923261880875, + "learning_rate": 0.0001965170867568858, + "loss": 0.3742446005344391, + "memory(GiB)": 78.33, + "step": 2218, + "token_acc": 0.8895205325753609, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.4299762631400475, + "grad_norm": 0.10487914085388184, + "learning_rate": 0.00019642568460706214, + "loss": 0.36482223868370056, + "memory(GiB)": 78.33, + "step": 2219, + "token_acc": 0.8928659320226409, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.43017003342537424, + "grad_norm": 0.2587502598762512, + "learning_rate": 0.00019633426338896227, + "loss": 0.36105775833129883, + "memory(GiB)": 78.33, + "step": 2220, + "token_acc": 0.8932460356156114, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.430363803710701, + "grad_norm": 0.10027684271335602, + "learning_rate": 0.0001962428231401354, + "loss": 0.3383048176765442, + "memory(GiB)": 78.33, + "step": 2221, + "token_acc": 0.8997930908043268, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.43055757399602773, + "grad_norm": 0.11608999222517014, + "learning_rate": 0.00019615136389813847, + "loss": 0.38931986689567566, + "memory(GiB)": 78.33, + "step": 2222, + "token_acc": 0.8857191859135605, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.4307513442813545, + "grad_norm": 0.10588336735963821, + "learning_rate": 0.00019605988570053622, + "loss": 0.354093462228775, + "memory(GiB)": 78.33, + "step": 2223, + "token_acc": 0.8957208040448884, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.4309451145666812, + "grad_norm": 0.16060945391654968, + "learning_rate": 0.0001959683885849013, + "loss": 0.3510138690471649, + "memory(GiB)": 78.33, + "step": 2224, + "token_acc": 0.8974735682241519, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.43113888485200796, + "grad_norm": 0.1098427101969719, + "learning_rate": 0.00019587687258881391, + "loss": 0.3766689896583557, + "memory(GiB)": 78.33, + "step": 2225, + "token_acc": 0.8884950490521164, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.4313326551373347, + "grad_norm": 0.10366590321063995, + "learning_rate": 0.00019578533774986217, + "loss": 0.3672398626804352, + "memory(GiB)": 78.33, + "step": 2226, + "token_acc": 0.8924895345973898, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.43152642542266145, + "grad_norm": 0.11767855286598206, + "learning_rate": 0.00019569378410564197, + "loss": 0.3826836347579956, + "memory(GiB)": 78.33, + "step": 2227, + "token_acc": 0.8886111359104841, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.4317201957079882, + "grad_norm": 0.0977967232465744, + "learning_rate": 0.0001956022116937568, + "loss": 0.3510884940624237, + "memory(GiB)": 78.33, + "step": 2228, + "token_acc": 0.8963877167735799, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.43191396599331494, + "grad_norm": 0.09899823367595673, + "learning_rate": 0.00019551062055181786, + "loss": 0.3398961126804352, + "memory(GiB)": 78.33, + "step": 2229, + "token_acc": 0.8998167084699981, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.4321077362786417, + "grad_norm": 0.1056177020072937, + "learning_rate": 0.0001954190107174442, + "loss": 0.38561806082725525, + "memory(GiB)": 78.33, + "step": 2230, + "token_acc": 0.8863317429132753, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.43230150656396843, + "grad_norm": 0.09486104547977448, + "learning_rate": 0.00019532738222826233, + "loss": 0.3347775936126709, + "memory(GiB)": 78.33, + "step": 2231, + "token_acc": 0.9010725841379837, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.4324952768492952, + "grad_norm": 0.1015876904129982, + "learning_rate": 0.0001952357351219066, + "loss": 0.3641873002052307, + "memory(GiB)": 78.33, + "step": 2232, + "token_acc": 0.8932900972811175, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.4326890471346219, + "grad_norm": 0.10654337704181671, + "learning_rate": 0.00019514406943601896, + "loss": 0.37854257225990295, + "memory(GiB)": 78.33, + "step": 2233, + "token_acc": 0.8883228206372311, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.43288281741994866, + "grad_norm": 0.09729617834091187, + "learning_rate": 0.00019505238520824893, + "loss": 0.342031866312027, + "memory(GiB)": 78.33, + "step": 2234, + "token_acc": 0.8982541675900618, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.4330765877052754, + "grad_norm": 0.11875928193330765, + "learning_rate": 0.00019496068247625361, + "loss": 0.41712290048599243, + "memory(GiB)": 78.33, + "step": 2235, + "token_acc": 0.8765791742219315, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.43327035799060215, + "grad_norm": 0.1124706119298935, + "learning_rate": 0.00019486896127769794, + "loss": 0.3886880576610565, + "memory(GiB)": 78.33, + "step": 2236, + "token_acc": 0.8862263618402484, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.4334641282759289, + "grad_norm": 0.10713918507099152, + "learning_rate": 0.00019477722165025418, + "loss": 0.3783111572265625, + "memory(GiB)": 78.33, + "step": 2237, + "token_acc": 0.8903876189123953, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.43365789856125564, + "grad_norm": 0.09784865379333496, + "learning_rate": 0.00019468546363160224, + "loss": 0.3712804913520813, + "memory(GiB)": 78.33, + "step": 2238, + "token_acc": 0.8909272229695255, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.4338516688465824, + "grad_norm": 0.11485416442155838, + "learning_rate": 0.0001945936872594297, + "loss": 0.40341106057167053, + "memory(GiB)": 78.33, + "step": 2239, + "token_acc": 0.8822438849351115, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.4340454391319091, + "grad_norm": 0.10090679675340652, + "learning_rate": 0.00019450189257143148, + "loss": 0.3727151155471802, + "memory(GiB)": 78.33, + "step": 2240, + "token_acc": 0.8921047498293182, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.43423920941723587, + "grad_norm": 0.11711379885673523, + "learning_rate": 0.0001944100796053102, + "loss": 0.38696539402008057, + "memory(GiB)": 78.33, + "step": 2241, + "token_acc": 0.8871316037592384, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.4344329797025626, + "grad_norm": 0.12041328847408295, + "learning_rate": 0.00019431824839877582, + "loss": 0.41463562846183777, + "memory(GiB)": 78.33, + "step": 2242, + "token_acc": 0.881887840766859, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.43462674998788936, + "grad_norm": 0.1023503914475441, + "learning_rate": 0.00019422639898954603, + "loss": 0.3793856203556061, + "memory(GiB)": 78.33, + "step": 2243, + "token_acc": 0.887026578933734, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.4348205202732161, + "grad_norm": 0.12375747412443161, + "learning_rate": 0.00019413453141534575, + "loss": 0.41560643911361694, + "memory(GiB)": 78.33, + "step": 2244, + "token_acc": 0.878722458568798, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.43501429055854285, + "grad_norm": 0.09916850179433823, + "learning_rate": 0.00019404264571390743, + "loss": 0.360500693321228, + "memory(GiB)": 78.33, + "step": 2245, + "token_acc": 0.8931444381820977, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.4352080608438696, + "grad_norm": 0.10854557156562805, + "learning_rate": 0.00019395074192297106, + "loss": 0.39239490032196045, + "memory(GiB)": 78.33, + "step": 2246, + "token_acc": 0.8853895766150005, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.43540183112919634, + "grad_norm": 0.10064675658941269, + "learning_rate": 0.000193858820080284, + "loss": 0.3592490553855896, + "memory(GiB)": 78.33, + "step": 2247, + "token_acc": 0.8954478346456692, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.4355956014145231, + "grad_norm": 0.11303189396858215, + "learning_rate": 0.00019376688022360099, + "loss": 0.35404136776924133, + "memory(GiB)": 78.33, + "step": 2248, + "token_acc": 0.8956532831584654, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.4357893716998498, + "grad_norm": 0.1282435804605484, + "learning_rate": 0.00019367492239068417, + "loss": 0.37942934036254883, + "memory(GiB)": 78.33, + "step": 2249, + "token_acc": 0.889295596034125, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.43598314198517657, + "grad_norm": 0.1114896610379219, + "learning_rate": 0.0001935829466193032, + "loss": 0.3755391538143158, + "memory(GiB)": 78.33, + "step": 2250, + "token_acc": 0.8880601168939605, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.4361769122705033, + "grad_norm": 0.10226267576217651, + "learning_rate": 0.00019349095294723487, + "loss": 0.37153515219688416, + "memory(GiB)": 78.33, + "step": 2251, + "token_acc": 0.8909354937328035, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.43637068255583006, + "grad_norm": 0.10146588832139969, + "learning_rate": 0.00019339894141226355, + "loss": 0.36199501156806946, + "memory(GiB)": 78.33, + "step": 2252, + "token_acc": 0.8924786237263836, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.4365644528411568, + "grad_norm": 0.11089852452278137, + "learning_rate": 0.00019330691205218082, + "loss": 0.38684314489364624, + "memory(GiB)": 78.33, + "step": 2253, + "token_acc": 0.8873638079006635, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.43675822312648355, + "grad_norm": 0.10390076041221619, + "learning_rate": 0.00019321486490478563, + "loss": 0.37561658024787903, + "memory(GiB)": 78.33, + "step": 2254, + "token_acc": 0.8915121559575482, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.4369519934118103, + "grad_norm": 0.09438452124595642, + "learning_rate": 0.00019312280000788416, + "loss": 0.34584975242614746, + "memory(GiB)": 78.33, + "step": 2255, + "token_acc": 0.8961015827777495, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.43714576369713704, + "grad_norm": 0.09714444726705551, + "learning_rate": 0.00019303071739928997, + "loss": 0.3477434515953064, + "memory(GiB)": 78.33, + "step": 2256, + "token_acc": 0.8965367189699747, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.4373395339824638, + "grad_norm": 0.10147438943386078, + "learning_rate": 0.00019293861711682393, + "loss": 0.352212131023407, + "memory(GiB)": 78.33, + "step": 2257, + "token_acc": 0.895200889829359, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.4375333042677905, + "grad_norm": 0.10559714585542679, + "learning_rate": 0.00019284649919831394, + "loss": 0.37872010469436646, + "memory(GiB)": 78.33, + "step": 2258, + "token_acc": 0.8881790060998526, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.43772707455311727, + "grad_norm": 0.10056591033935547, + "learning_rate": 0.00019275436368159548, + "loss": 0.3722417652606964, + "memory(GiB)": 78.33, + "step": 2259, + "token_acc": 0.8918299445471349, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.437920844838444, + "grad_norm": 0.10316530615091324, + "learning_rate": 0.00019266221060451096, + "loss": 0.3741195797920227, + "memory(GiB)": 78.33, + "step": 2260, + "token_acc": 0.8905550295555158, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.43811461512377076, + "grad_norm": 0.1018475815653801, + "learning_rate": 0.00019257004000491017, + "loss": 0.36181291937828064, + "memory(GiB)": 78.33, + "step": 2261, + "token_acc": 0.8936356535939978, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.4383083854090975, + "grad_norm": 0.10764992982149124, + "learning_rate": 0.00019247785192065003, + "loss": 0.39701730012893677, + "memory(GiB)": 78.33, + "step": 2262, + "token_acc": 0.8833886405397252, + "train_speed(iter/s)": 0.032483 + }, + { + "epoch": 0.43850215569442424, + "grad_norm": 0.09933136403560638, + "learning_rate": 0.00019238564638959473, + "loss": 0.331234335899353, + "memory(GiB)": 78.33, + "step": 2263, + "token_acc": 0.901470844428959, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.438695925979751, + "grad_norm": 0.10954437404870987, + "learning_rate": 0.00019229342344961547, + "loss": 0.38815680146217346, + "memory(GiB)": 78.33, + "step": 2264, + "token_acc": 0.8864250798765668, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.43888969626507773, + "grad_norm": 0.09880734980106354, + "learning_rate": 0.00019220118313859074, + "loss": 0.3450104892253876, + "memory(GiB)": 78.33, + "step": 2265, + "token_acc": 0.8972709063595393, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.4390834665504045, + "grad_norm": 0.09824536740779877, + "learning_rate": 0.0001921089254944061, + "loss": 0.3457093834877014, + "memory(GiB)": 78.33, + "step": 2266, + "token_acc": 0.898, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.4392772368357312, + "grad_norm": 0.11076337844133377, + "learning_rate": 0.00019201665055495427, + "loss": 0.4086833894252777, + "memory(GiB)": 78.33, + "step": 2267, + "token_acc": 0.8818113491168733, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.43947100712105797, + "grad_norm": 0.10491563379764557, + "learning_rate": 0.00019192435835813502, + "loss": 0.36850589513778687, + "memory(GiB)": 78.33, + "step": 2268, + "token_acc": 0.8915903890160183, + "train_speed(iter/s)": 0.032489 + }, + { + "epoch": 0.4396647774063847, + "grad_norm": 0.10932449251413345, + "learning_rate": 0.00019183204894185522, + "loss": 0.3876577317714691, + "memory(GiB)": 78.33, + "step": 2269, + "token_acc": 0.8863226177709189, + "train_speed(iter/s)": 0.03249 + }, + { + "epoch": 0.43985854769171145, + "grad_norm": 0.11134737730026245, + "learning_rate": 0.00019173972234402887, + "loss": 0.40485280752182007, + "memory(GiB)": 78.33, + "step": 2270, + "token_acc": 0.8811312687068596, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.4400523179770382, + "grad_norm": 0.1066315695643425, + "learning_rate": 0.00019164737860257692, + "loss": 0.38198620080947876, + "memory(GiB)": 78.33, + "step": 2271, + "token_acc": 0.8884394005212859, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.44024608826236494, + "grad_norm": 0.10852167755365372, + "learning_rate": 0.00019155501775542752, + "loss": 0.3655702471733093, + "memory(GiB)": 78.33, + "step": 2272, + "token_acc": 0.8904522077437578, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.4404398585476917, + "grad_norm": 0.10043393820524216, + "learning_rate": 0.00019146263984051574, + "loss": 0.3396553099155426, + "memory(GiB)": 78.33, + "step": 2273, + "token_acc": 0.8992605233219567, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.44063362883301843, + "grad_norm": 0.09665820002555847, + "learning_rate": 0.00019137024489578354, + "loss": 0.3522893190383911, + "memory(GiB)": 78.33, + "step": 2274, + "token_acc": 0.8970611596505162, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.4408273991183452, + "grad_norm": 0.10279867798089981, + "learning_rate": 0.00019127783295918015, + "loss": 0.38115227222442627, + "memory(GiB)": 78.33, + "step": 2275, + "token_acc": 0.8877062684693637, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.4410211694036719, + "grad_norm": 0.10720198601484299, + "learning_rate": 0.00019118540406866158, + "loss": 0.3606134355068207, + "memory(GiB)": 78.33, + "step": 2276, + "token_acc": 0.8946840521564694, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.4412149396889987, + "grad_norm": 0.10232347249984741, + "learning_rate": 0.00019109295826219086, + "loss": 0.3502247929573059, + "memory(GiB)": 78.33, + "step": 2277, + "token_acc": 0.8963676036452317, + "train_speed(iter/s)": 0.0325 + }, + { + "epoch": 0.44140870997432546, + "grad_norm": 0.10972965508699417, + "learning_rate": 0.00019100049557773798, + "loss": 0.3618561327457428, + "memory(GiB)": 78.33, + "step": 2278, + "token_acc": 0.8917501192179303, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.4416024802596522, + "grad_norm": 0.10983320325613022, + "learning_rate": 0.00019090801605327982, + "loss": 0.37896856665611267, + "memory(GiB)": 78.33, + "step": 2279, + "token_acc": 0.8861007817761251, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.44179625054497895, + "grad_norm": 0.10652820765972137, + "learning_rate": 0.00019081551972680025, + "loss": 0.3781920075416565, + "memory(GiB)": 78.33, + "step": 2280, + "token_acc": 0.8892166502785972, + "train_speed(iter/s)": 0.032503 + }, + { + "epoch": 0.4419900208303057, + "grad_norm": 0.10808536410331726, + "learning_rate": 0.00019072300663628997, + "loss": 0.3892320990562439, + "memory(GiB)": 78.33, + "step": 2281, + "token_acc": 0.8866169049621531, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.44218379111563244, + "grad_norm": 0.09705408662557602, + "learning_rate": 0.00019063047681974656, + "loss": 0.33386996388435364, + "memory(GiB)": 78.33, + "step": 2282, + "token_acc": 0.9006132756132756, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.4423775614009592, + "grad_norm": 0.10078884661197662, + "learning_rate": 0.0001905379303151746, + "loss": 0.33445224165916443, + "memory(GiB)": 78.33, + "step": 2283, + "token_acc": 0.8993649371524941, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.44257133168628593, + "grad_norm": 0.11122056096792221, + "learning_rate": 0.0001904453671605853, + "loss": 0.37664756178855896, + "memory(GiB)": 78.33, + "step": 2284, + "token_acc": 0.888987135970792, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.4427651019716127, + "grad_norm": 0.1016409620642662, + "learning_rate": 0.00019035278739399692, + "loss": 0.35757166147232056, + "memory(GiB)": 78.33, + "step": 2285, + "token_acc": 0.8957771559399644, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.4429588722569394, + "grad_norm": 0.09880708903074265, + "learning_rate": 0.00019026019105343445, + "loss": 0.3487710952758789, + "memory(GiB)": 78.33, + "step": 2286, + "token_acc": 0.8957860078103066, + "train_speed(iter/s)": 0.03251 + }, + { + "epoch": 0.44315264254226616, + "grad_norm": 0.1038837805390358, + "learning_rate": 0.00019016757817692966, + "loss": 0.35564103722572327, + "memory(GiB)": 78.33, + "step": 2287, + "token_acc": 0.893324717649042, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.4433464128275929, + "grad_norm": 0.11149082332849503, + "learning_rate": 0.0001900749488025212, + "loss": 0.3836996853351593, + "memory(GiB)": 78.33, + "step": 2288, + "token_acc": 0.8873978009585566, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.44354018311291965, + "grad_norm": 0.10695455968379974, + "learning_rate": 0.00018998230296825438, + "loss": 0.3890923857688904, + "memory(GiB)": 78.33, + "step": 2289, + "token_acc": 0.8848062223414241, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.4437339533982464, + "grad_norm": 0.10037211328744888, + "learning_rate": 0.00018988964071218136, + "loss": 0.34572556614875793, + "memory(GiB)": 78.33, + "step": 2290, + "token_acc": 0.8985781478077526, + "train_speed(iter/s)": 0.032514 + }, + { + "epoch": 0.44392772368357314, + "grad_norm": 0.09722217172384262, + "learning_rate": 0.000189796962072361, + "loss": 0.3216269314289093, + "memory(GiB)": 78.33, + "step": 2291, + "token_acc": 0.906508290027221, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.4441214939688999, + "grad_norm": 0.1133374273777008, + "learning_rate": 0.000189704267086859, + "loss": 0.3567155599594116, + "memory(GiB)": 78.33, + "step": 2292, + "token_acc": 0.8946020433729573, + "train_speed(iter/s)": 0.032516 + }, + { + "epoch": 0.4443152642542266, + "grad_norm": 0.10639848560094833, + "learning_rate": 0.00018961155579374757, + "loss": 0.3713446855545044, + "memory(GiB)": 78.33, + "step": 2293, + "token_acc": 0.8898174505758534, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.44450903453955337, + "grad_norm": 0.1109897717833519, + "learning_rate": 0.0001895188282311058, + "loss": 0.3658873438835144, + "memory(GiB)": 78.33, + "step": 2294, + "token_acc": 0.8911519867265497, + "train_speed(iter/s)": 0.032519 + }, + { + "epoch": 0.4447028048248801, + "grad_norm": 0.1037331074476242, + "learning_rate": 0.00018942608443701936, + "loss": 0.34084218740463257, + "memory(GiB)": 78.33, + "step": 2295, + "token_acc": 0.898319518564778, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.44489657511020686, + "grad_norm": 0.11509410291910172, + "learning_rate": 0.00018933332444958062, + "loss": 0.38112330436706543, + "memory(GiB)": 78.33, + "step": 2296, + "token_acc": 0.8888923143323982, + "train_speed(iter/s)": 0.032521 + }, + { + "epoch": 0.4450903453955336, + "grad_norm": 0.09891083091497421, + "learning_rate": 0.00018924054830688858, + "loss": 0.33499619364738464, + "memory(GiB)": 78.33, + "step": 2297, + "token_acc": 0.9014303407076423, + "train_speed(iter/s)": 0.032522 + }, + { + "epoch": 0.44528411568086035, + "grad_norm": 0.11420368403196335, + "learning_rate": 0.0001891477560470489, + "loss": 0.3879055976867676, + "memory(GiB)": 78.33, + "step": 2298, + "token_acc": 0.8863246853022871, + "train_speed(iter/s)": 0.032523 + }, + { + "epoch": 0.4454778859661871, + "grad_norm": 0.10501628369092941, + "learning_rate": 0.0001890549477081739, + "loss": 0.3551484942436218, + "memory(GiB)": 78.33, + "step": 2299, + "token_acc": 0.8968862632448985, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.44567165625151384, + "grad_norm": 0.10620047897100449, + "learning_rate": 0.00018896212332838243, + "loss": 0.36860981583595276, + "memory(GiB)": 78.33, + "step": 2300, + "token_acc": 0.8929374124117507, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.4458654265368406, + "grad_norm": 0.10044432431459427, + "learning_rate": 0.0001888692829458, + "loss": 0.3472827672958374, + "memory(GiB)": 78.33, + "step": 2301, + "token_acc": 0.8963743001866169, + "train_speed(iter/s)": 0.032526 + }, + { + "epoch": 0.4460591968221673, + "grad_norm": 0.13254094123840332, + "learning_rate": 0.00018877642659855852, + "loss": 0.4520479440689087, + "memory(GiB)": 78.33, + "step": 2302, + "token_acc": 0.8687972508591065, + "train_speed(iter/s)": 0.032527 + }, + { + "epoch": 0.44625296710749407, + "grad_norm": 0.10516183078289032, + "learning_rate": 0.00018868355432479674, + "loss": 0.37986326217651367, + "memory(GiB)": 78.33, + "step": 2303, + "token_acc": 0.8892758400680562, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.4464467373928208, + "grad_norm": 0.12673194706439972, + "learning_rate": 0.00018859066616265966, + "loss": 0.4348051846027374, + "memory(GiB)": 78.33, + "step": 2304, + "token_acc": 0.8738275643379285, + "train_speed(iter/s)": 0.03253 + }, + { + "epoch": 0.44664050767814756, + "grad_norm": 0.09944983571767807, + "learning_rate": 0.00018849776215029907, + "loss": 0.3587363064289093, + "memory(GiB)": 78.33, + "step": 2305, + "token_acc": 0.8943836362735872, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.4468342779634743, + "grad_norm": 0.09945741295814514, + "learning_rate": 0.0001884048423258731, + "loss": 0.35953009128570557, + "memory(GiB)": 78.33, + "step": 2306, + "token_acc": 0.8941004794663331, + "train_speed(iter/s)": 0.032532 + }, + { + "epoch": 0.44702804824880105, + "grad_norm": 0.10500568151473999, + "learning_rate": 0.00018831190672754638, + "loss": 0.38044261932373047, + "memory(GiB)": 78.33, + "step": 2307, + "token_acc": 0.8867819141508644, + "train_speed(iter/s)": 0.032533 + }, + { + "epoch": 0.4472218185341278, + "grad_norm": 0.09857631474733353, + "learning_rate": 0.00018821895539349008, + "loss": 0.36684414744377136, + "memory(GiB)": 78.33, + "step": 2308, + "token_acc": 0.8926224492510708, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.44741558881945453, + "grad_norm": 0.10511371493339539, + "learning_rate": 0.00018812598836188182, + "loss": 0.36754751205444336, + "memory(GiB)": 78.33, + "step": 2309, + "token_acc": 0.8913974514883942, + "train_speed(iter/s)": 0.032535 + }, + { + "epoch": 0.4476093591047813, + "grad_norm": 0.10627889633178711, + "learning_rate": 0.0001880330056709057, + "loss": 0.3721667230129242, + "memory(GiB)": 78.33, + "step": 2310, + "token_acc": 0.8893509074742403, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.447803129390108, + "grad_norm": 0.09434963017702103, + "learning_rate": 0.00018794000735875208, + "loss": 0.3414641320705414, + "memory(GiB)": 78.33, + "step": 2311, + "token_acc": 0.9010579609772407, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.44799689967543477, + "grad_norm": 0.10588974505662918, + "learning_rate": 0.00018784699346361802, + "loss": 0.36833858489990234, + "memory(GiB)": 78.33, + "step": 2312, + "token_acc": 0.8927507447864945, + "train_speed(iter/s)": 0.032539 + }, + { + "epoch": 0.4481906699607615, + "grad_norm": 0.10945326089859009, + "learning_rate": 0.00018775396402370673, + "loss": 0.3979440927505493, + "memory(GiB)": 78.33, + "step": 2313, + "token_acc": 0.8817913102618431, + "train_speed(iter/s)": 0.03254 + }, + { + "epoch": 0.44838444024608826, + "grad_norm": 0.10148966312408447, + "learning_rate": 0.00018766091907722795, + "loss": 0.35080429911613464, + "memory(GiB)": 78.33, + "step": 2314, + "token_acc": 0.8943991907576, + "train_speed(iter/s)": 0.032541 + }, + { + "epoch": 0.448578210531415, + "grad_norm": 0.10452762246131897, + "learning_rate": 0.00018756785866239767, + "loss": 0.36064577102661133, + "memory(GiB)": 78.33, + "step": 2315, + "token_acc": 0.8935257756351483, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.44877198081674174, + "grad_norm": 0.10478675365447998, + "learning_rate": 0.00018747478281743842, + "loss": 0.36270397901535034, + "memory(GiB)": 78.33, + "step": 2316, + "token_acc": 0.8914383656318249, + "train_speed(iter/s)": 0.032543 + }, + { + "epoch": 0.4489657511020685, + "grad_norm": 0.10872726887464523, + "learning_rate": 0.0001873816915805788, + "loss": 0.3819129467010498, + "memory(GiB)": 78.33, + "step": 2317, + "token_acc": 0.8880903215425461, + "train_speed(iter/s)": 0.032544 + }, + { + "epoch": 0.44915952138739523, + "grad_norm": 0.09838545322418213, + "learning_rate": 0.00018728858499005398, + "loss": 0.34659987688064575, + "memory(GiB)": 78.33, + "step": 2318, + "token_acc": 0.89670946219167, + "train_speed(iter/s)": 0.032545 + }, + { + "epoch": 0.449353291672722, + "grad_norm": 0.11515597254037857, + "learning_rate": 0.00018719546308410538, + "loss": 0.389648973941803, + "memory(GiB)": 78.33, + "step": 2319, + "token_acc": 0.8872989067638433, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.4495470619580487, + "grad_norm": 0.10723573714494705, + "learning_rate": 0.00018710232590098057, + "loss": 0.3719337582588196, + "memory(GiB)": 78.33, + "step": 2320, + "token_acc": 0.8903108357787197, + "train_speed(iter/s)": 0.032548 + }, + { + "epoch": 0.44974083224337547, + "grad_norm": 0.10834118723869324, + "learning_rate": 0.00018700917347893358, + "loss": 0.3716052770614624, + "memory(GiB)": 78.33, + "step": 2321, + "token_acc": 0.889080622347949, + "train_speed(iter/s)": 0.032549 + }, + { + "epoch": 0.4499346025287022, + "grad_norm": 0.10417478531599045, + "learning_rate": 0.0001869160058562245, + "loss": 0.35477250814437866, + "memory(GiB)": 78.33, + "step": 2322, + "token_acc": 0.8943120009258188, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.45012837281402895, + "grad_norm": 0.10129362344741821, + "learning_rate": 0.00018682282307111987, + "loss": 0.3582378327846527, + "memory(GiB)": 78.33, + "step": 2323, + "token_acc": 0.8946028116311099, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.4503221430993557, + "grad_norm": 0.09624646604061127, + "learning_rate": 0.0001867296251618923, + "loss": 0.3303111493587494, + "memory(GiB)": 78.33, + "step": 2324, + "token_acc": 0.9006094476429071, + "train_speed(iter/s)": 0.032552 + }, + { + "epoch": 0.45051591338468244, + "grad_norm": 0.10416833311319351, + "learning_rate": 0.00018663641216682075, + "loss": 0.3703954517841339, + "memory(GiB)": 78.33, + "step": 2325, + "token_acc": 0.8913434299411637, + "train_speed(iter/s)": 0.032553 + }, + { + "epoch": 0.4507096836700092, + "grad_norm": 0.09914438426494598, + "learning_rate": 0.0001865431841241903, + "loss": 0.3545830547809601, + "memory(GiB)": 78.33, + "step": 2326, + "token_acc": 0.8953699331461094, + "train_speed(iter/s)": 0.032554 + }, + { + "epoch": 0.45090345395533593, + "grad_norm": 0.10621084272861481, + "learning_rate": 0.00018644994107229216, + "loss": 0.37236636877059937, + "memory(GiB)": 78.33, + "step": 2327, + "token_acc": 0.8887367838522704, + "train_speed(iter/s)": 0.032555 + }, + { + "epoch": 0.4510972242406627, + "grad_norm": 0.09897017478942871, + "learning_rate": 0.0001863566830494237, + "loss": 0.33905407786369324, + "memory(GiB)": 78.33, + "step": 2328, + "token_acc": 0.8993225346373268, + "train_speed(iter/s)": 0.032557 + }, + { + "epoch": 0.4512909945259894, + "grad_norm": 0.09863407909870148, + "learning_rate": 0.00018626341009388866, + "loss": 0.3825795352458954, + "memory(GiB)": 78.33, + "step": 2329, + "token_acc": 0.8874984754238322, + "train_speed(iter/s)": 0.032558 + }, + { + "epoch": 0.45148476481131616, + "grad_norm": 0.09864397346973419, + "learning_rate": 0.00018617012224399662, + "loss": 0.3443533778190613, + "memory(GiB)": 78.33, + "step": 2330, + "token_acc": 0.9003264309386758, + "train_speed(iter/s)": 0.032559 + }, + { + "epoch": 0.4516785350966429, + "grad_norm": 0.09809895604848862, + "learning_rate": 0.00018607681953806341, + "loss": 0.34165775775909424, + "memory(GiB)": 78.33, + "step": 2331, + "token_acc": 0.8970808010361662, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.45187230538196965, + "grad_norm": 0.10424879193305969, + "learning_rate": 0.00018598350201441108, + "loss": 0.33717483282089233, + "memory(GiB)": 78.33, + "step": 2332, + "token_acc": 0.8975676890237052, + "train_speed(iter/s)": 0.032561 + }, + { + "epoch": 0.4520660756672964, + "grad_norm": 0.09718424081802368, + "learning_rate": 0.00018589016971136752, + "loss": 0.32958295941352844, + "memory(GiB)": 78.33, + "step": 2333, + "token_acc": 0.9014591294853342, + "train_speed(iter/s)": 0.032562 + }, + { + "epoch": 0.45225984595262314, + "grad_norm": 0.1105201467871666, + "learning_rate": 0.00018579682266726686, + "loss": 0.38120901584625244, + "memory(GiB)": 78.33, + "step": 2334, + "token_acc": 0.8889513793200416, + "train_speed(iter/s)": 0.032563 + }, + { + "epoch": 0.4524536162379499, + "grad_norm": 0.1020718514919281, + "learning_rate": 0.00018570346092044917, + "loss": 0.3501797616481781, + "memory(GiB)": 78.33, + "step": 2335, + "token_acc": 0.8964525407478428, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.45264738652327663, + "grad_norm": 0.10678357630968094, + "learning_rate": 0.00018561008450926076, + "loss": 0.3577033579349518, + "memory(GiB)": 78.33, + "step": 2336, + "token_acc": 0.8953428424127857, + "train_speed(iter/s)": 0.032565 + }, + { + "epoch": 0.4528411568086034, + "grad_norm": 0.09743531793355942, + "learning_rate": 0.0001855166934720537, + "loss": 0.33748358488082886, + "memory(GiB)": 78.33, + "step": 2337, + "token_acc": 0.8984052721601771, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.4530349270939302, + "grad_norm": 0.10229934751987457, + "learning_rate": 0.00018542328784718632, + "loss": 0.3752382695674896, + "memory(GiB)": 78.33, + "step": 2338, + "token_acc": 0.891382828441375, + "train_speed(iter/s)": 0.032567 + }, + { + "epoch": 0.4532286973792569, + "grad_norm": 0.11242741346359253, + "learning_rate": 0.00018532986767302276, + "loss": 0.4054182767868042, + "memory(GiB)": 78.33, + "step": 2339, + "token_acc": 0.8786372906672795, + "train_speed(iter/s)": 0.032568 + }, + { + "epoch": 0.45342246766458366, + "grad_norm": 0.09873122721910477, + "learning_rate": 0.0001852364329879332, + "loss": 0.3532402813434601, + "memory(GiB)": 78.33, + "step": 2340, + "token_acc": 0.8958922821738706, + "train_speed(iter/s)": 0.032569 + }, + { + "epoch": 0.4536162379499104, + "grad_norm": 0.10772307217121124, + "learning_rate": 0.00018514298383029372, + "loss": 0.37972134351730347, + "memory(GiB)": 78.33, + "step": 2341, + "token_acc": 0.8879170528266914, + "train_speed(iter/s)": 0.03257 + }, + { + "epoch": 0.45381000823523715, + "grad_norm": 0.11007421463727951, + "learning_rate": 0.00018504952023848647, + "loss": 0.37062400579452515, + "memory(GiB)": 78.33, + "step": 2342, + "token_acc": 0.8900929211930756, + "train_speed(iter/s)": 0.032571 + }, + { + "epoch": 0.4540037785205639, + "grad_norm": 0.09810297191143036, + "learning_rate": 0.00018495604225089946, + "loss": 0.33862578868865967, + "memory(GiB)": 78.33, + "step": 2343, + "token_acc": 0.9004052456801936, + "train_speed(iter/s)": 0.032572 + }, + { + "epoch": 0.45419754880589064, + "grad_norm": 0.09834323078393936, + "learning_rate": 0.00018486254990592656, + "loss": 0.34658533334732056, + "memory(GiB)": 78.33, + "step": 2344, + "token_acc": 0.8964155452144384, + "train_speed(iter/s)": 0.032573 + }, + { + "epoch": 0.4543913190912174, + "grad_norm": 0.11022671312093735, + "learning_rate": 0.00018476904324196764, + "loss": 0.4045500159263611, + "memory(GiB)": 78.33, + "step": 2345, + "token_acc": 0.8820878509132359, + "train_speed(iter/s)": 0.032574 + }, + { + "epoch": 0.4545850893765441, + "grad_norm": 0.10049665719270706, + "learning_rate": 0.0001846755222974284, + "loss": 0.3371380865573883, + "memory(GiB)": 78.33, + "step": 2346, + "token_acc": 0.9003629250212349, + "train_speed(iter/s)": 0.032576 + }, + { + "epoch": 0.45477885966187087, + "grad_norm": 0.09852764755487442, + "learning_rate": 0.0001845819871107204, + "loss": 0.36543235182762146, + "memory(GiB)": 78.33, + "step": 2347, + "token_acc": 0.8905981201226657, + "train_speed(iter/s)": 0.032577 + }, + { + "epoch": 0.4549726299471976, + "grad_norm": 0.11310271173715591, + "learning_rate": 0.00018448843772026098, + "loss": 0.3698401153087616, + "memory(GiB)": 78.33, + "step": 2348, + "token_acc": 0.8884804666140106, + "train_speed(iter/s)": 0.032578 + }, + { + "epoch": 0.45516640023252436, + "grad_norm": 0.1051260381937027, + "learning_rate": 0.0001843948741644735, + "loss": 0.3575577437877655, + "memory(GiB)": 78.33, + "step": 2349, + "token_acc": 0.8939034259509037, + "train_speed(iter/s)": 0.032579 + }, + { + "epoch": 0.4553601705178511, + "grad_norm": 0.10609371960163116, + "learning_rate": 0.00018430129648178693, + "loss": 0.3754083216190338, + "memory(GiB)": 78.33, + "step": 2350, + "token_acc": 0.8901177784276713, + "train_speed(iter/s)": 0.03258 + }, + { + "epoch": 0.45555394080317785, + "grad_norm": 0.10359574854373932, + "learning_rate": 0.0001842077047106362, + "loss": 0.37402597069740295, + "memory(GiB)": 78.33, + "step": 2351, + "token_acc": 0.8886889138857643, + "train_speed(iter/s)": 0.032581 + }, + { + "epoch": 0.4557477110885046, + "grad_norm": 0.10701092332601547, + "learning_rate": 0.00018411409888946197, + "loss": 0.3984612822532654, + "memory(GiB)": 78.33, + "step": 2352, + "token_acc": 0.8840575810574465, + "train_speed(iter/s)": 0.032582 + }, + { + "epoch": 0.45594148137383134, + "grad_norm": 0.09848210960626602, + "learning_rate": 0.00018402047905671063, + "loss": 0.3315700590610504, + "memory(GiB)": 78.33, + "step": 2353, + "token_acc": 0.9021433150310767, + "train_speed(iter/s)": 0.032583 + }, + { + "epoch": 0.4561352516591581, + "grad_norm": 0.10119795799255371, + "learning_rate": 0.0001839268452508344, + "loss": 0.3340100347995758, + "memory(GiB)": 78.33, + "step": 2354, + "token_acc": 0.9002782696686061, + "train_speed(iter/s)": 0.032584 + }, + { + "epoch": 0.4563290219444848, + "grad_norm": 0.10151761025190353, + "learning_rate": 0.00018383319751029114, + "loss": 0.3325027823448181, + "memory(GiB)": 78.33, + "step": 2355, + "token_acc": 0.902809093079392, + "train_speed(iter/s)": 0.032585 + }, + { + "epoch": 0.45652279222981157, + "grad_norm": 0.11158560961484909, + "learning_rate": 0.00018373953587354452, + "loss": 0.38792747259140015, + "memory(GiB)": 78.33, + "step": 2356, + "token_acc": 0.8858068315665489, + "train_speed(iter/s)": 0.032586 + }, + { + "epoch": 0.4567165625151383, + "grad_norm": 0.0991615429520607, + "learning_rate": 0.00018364586037906391, + "loss": 0.35615038871765137, + "memory(GiB)": 78.33, + "step": 2357, + "token_acc": 0.8954778266800505, + "train_speed(iter/s)": 0.032587 + }, + { + "epoch": 0.45691033280046506, + "grad_norm": 0.10101715475320816, + "learning_rate": 0.00018355217106532436, + "loss": 0.36160457134246826, + "memory(GiB)": 78.33, + "step": 2358, + "token_acc": 0.8935985113941181, + "train_speed(iter/s)": 0.032588 + }, + { + "epoch": 0.4571041030857918, + "grad_norm": 0.10320735722780228, + "learning_rate": 0.0001834584679708066, + "loss": 0.3799397945404053, + "memory(GiB)": 78.33, + "step": 2359, + "token_acc": 0.8880186336993524, + "train_speed(iter/s)": 0.032589 + }, + { + "epoch": 0.45729787337111855, + "grad_norm": 0.1062033548951149, + "learning_rate": 0.00018336475113399692, + "loss": 0.3934144079685211, + "memory(GiB)": 78.33, + "step": 2360, + "token_acc": 0.8855768736176123, + "train_speed(iter/s)": 0.03259 + }, + { + "epoch": 0.4574916436564453, + "grad_norm": 0.10630171746015549, + "learning_rate": 0.00018327102059338744, + "loss": 0.3696288466453552, + "memory(GiB)": 78.33, + "step": 2361, + "token_acc": 0.8901493818220828, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.45768541394177203, + "grad_norm": 0.10573367774486542, + "learning_rate": 0.00018317727638747576, + "loss": 0.3717585504055023, + "memory(GiB)": 78.33, + "step": 2362, + "token_acc": 0.8903601270722656, + "train_speed(iter/s)": 0.032593 + }, + { + "epoch": 0.4578791842270988, + "grad_norm": 0.10207115113735199, + "learning_rate": 0.0001830835185547652, + "loss": 0.3541256785392761, + "memory(GiB)": 78.33, + "step": 2363, + "token_acc": 0.896280064694527, + "train_speed(iter/s)": 0.032594 + }, + { + "epoch": 0.4580729545124255, + "grad_norm": 0.10319995135068893, + "learning_rate": 0.0001829897471337645, + "loss": 0.3481341004371643, + "memory(GiB)": 78.33, + "step": 2364, + "token_acc": 0.8952120676258607, + "train_speed(iter/s)": 0.032595 + }, + { + "epoch": 0.45826672479775227, + "grad_norm": 0.10591937601566315, + "learning_rate": 0.00018289596216298823, + "loss": 0.36794811487197876, + "memory(GiB)": 78.33, + "step": 2365, + "token_acc": 0.8902500987974934, + "train_speed(iter/s)": 0.032596 + }, + { + "epoch": 0.458460495083079, + "grad_norm": 0.1113501638174057, + "learning_rate": 0.00018280216368095638, + "loss": 0.37933623790740967, + "memory(GiB)": 78.33, + "step": 2366, + "token_acc": 0.8892398270522216, + "train_speed(iter/s)": 0.032597 + }, + { + "epoch": 0.45865426536840576, + "grad_norm": 0.10290928184986115, + "learning_rate": 0.00018270835172619443, + "loss": 0.3481866717338562, + "memory(GiB)": 78.33, + "step": 2367, + "token_acc": 0.8950770760815515, + "train_speed(iter/s)": 0.032598 + }, + { + "epoch": 0.4588480356537325, + "grad_norm": 0.11763869225978851, + "learning_rate": 0.00018261452633723356, + "loss": 0.41069701313972473, + "memory(GiB)": 78.33, + "step": 2368, + "token_acc": 0.8784230338208672, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.45904180593905924, + "grad_norm": 0.1066950112581253, + "learning_rate": 0.00018252068755261029, + "loss": 0.3848106861114502, + "memory(GiB)": 78.33, + "step": 2369, + "token_acc": 0.8873462694725335, + "train_speed(iter/s)": 0.0326 + }, + { + "epoch": 0.459235576224386, + "grad_norm": 0.10410353541374207, + "learning_rate": 0.00018242683541086678, + "loss": 0.36915817856788635, + "memory(GiB)": 78.33, + "step": 2370, + "token_acc": 0.8926594658498576, + "train_speed(iter/s)": 0.032601 + }, + { + "epoch": 0.45942934650971273, + "grad_norm": 0.10082308948040009, + "learning_rate": 0.00018233296995055065, + "loss": 0.370003342628479, + "memory(GiB)": 78.33, + "step": 2371, + "token_acc": 0.8912730226099714, + "train_speed(iter/s)": 0.032602 + }, + { + "epoch": 0.4596231167950395, + "grad_norm": 0.10870834439992905, + "learning_rate": 0.00018223909121021495, + "loss": 0.3969360589981079, + "memory(GiB)": 78.33, + "step": 2372, + "token_acc": 0.8850561134081512, + "train_speed(iter/s)": 0.032603 + }, + { + "epoch": 0.4598168870803662, + "grad_norm": 0.09935883432626724, + "learning_rate": 0.00018214519922841817, + "loss": 0.3541335165500641, + "memory(GiB)": 78.33, + "step": 2373, + "token_acc": 0.8943371776597123, + "train_speed(iter/s)": 0.032604 + }, + { + "epoch": 0.46001065736569297, + "grad_norm": 0.12180610001087189, + "learning_rate": 0.00018205129404372431, + "loss": 0.4192396402359009, + "memory(GiB)": 78.33, + "step": 2374, + "token_acc": 0.8759435110786462, + "train_speed(iter/s)": 0.032605 + }, + { + "epoch": 0.4602044276510197, + "grad_norm": 0.11153632402420044, + "learning_rate": 0.00018195737569470273, + "loss": 0.37499576807022095, + "memory(GiB)": 78.33, + "step": 2375, + "token_acc": 0.889001271529303, + "train_speed(iter/s)": 0.032607 + }, + { + "epoch": 0.46039819793634645, + "grad_norm": 0.10651109367609024, + "learning_rate": 0.0001818634442199282, + "loss": 0.3690948486328125, + "memory(GiB)": 78.33, + "step": 2376, + "token_acc": 0.8935253398571071, + "train_speed(iter/s)": 0.032608 + }, + { + "epoch": 0.4605919682216732, + "grad_norm": 0.09841850399971008, + "learning_rate": 0.00018176949965798093, + "loss": 0.3518884778022766, + "memory(GiB)": 78.33, + "step": 2377, + "token_acc": 0.8960413453472955, + "train_speed(iter/s)": 0.032609 + }, + { + "epoch": 0.46078573850699994, + "grad_norm": 0.1050775870680809, + "learning_rate": 0.0001816755420474465, + "loss": 0.3738871216773987, + "memory(GiB)": 78.33, + "step": 2378, + "token_acc": 0.8914807588995391, + "train_speed(iter/s)": 0.03261 + }, + { + "epoch": 0.4609795087923267, + "grad_norm": 0.10667918622493744, + "learning_rate": 0.0001815815714269158, + "loss": 0.38406267762184143, + "memory(GiB)": 78.33, + "step": 2379, + "token_acc": 0.8884490352348994, + "train_speed(iter/s)": 0.032611 + }, + { + "epoch": 0.46117327907765343, + "grad_norm": 0.11324939876794815, + "learning_rate": 0.00018148758783498504, + "loss": 0.38455072045326233, + "memory(GiB)": 78.33, + "step": 2380, + "token_acc": 0.8882405081991431, + "train_speed(iter/s)": 0.032612 + }, + { + "epoch": 0.4613670493629802, + "grad_norm": 0.10635250061750412, + "learning_rate": 0.00018139359131025588, + "loss": 0.3805847764015198, + "memory(GiB)": 78.33, + "step": 2381, + "token_acc": 0.8862944162436548, + "train_speed(iter/s)": 0.032613 + }, + { + "epoch": 0.4615608196483069, + "grad_norm": 0.10868912190198898, + "learning_rate": 0.00018129958189133522, + "loss": 0.3691996932029724, + "memory(GiB)": 78.33, + "step": 2382, + "token_acc": 0.8913490258405722, + "train_speed(iter/s)": 0.032614 + }, + { + "epoch": 0.46175458993363366, + "grad_norm": 0.09865929931402206, + "learning_rate": 0.00018120555961683514, + "loss": 0.36495789885520935, + "memory(GiB)": 78.33, + "step": 2383, + "token_acc": 0.8911807418711695, + "train_speed(iter/s)": 0.032615 + }, + { + "epoch": 0.4619483602189604, + "grad_norm": 0.1034301221370697, + "learning_rate": 0.00018111152452537327, + "loss": 0.3717435598373413, + "memory(GiB)": 78.33, + "step": 2384, + "token_acc": 0.8909826100745283, + "train_speed(iter/s)": 0.032616 + }, + { + "epoch": 0.46214213050428715, + "grad_norm": 0.10854472219944, + "learning_rate": 0.00018101747665557225, + "loss": 0.3835892081260681, + "memory(GiB)": 78.33, + "step": 2385, + "token_acc": 0.8888919333625602, + "train_speed(iter/s)": 0.032617 + }, + { + "epoch": 0.4623359007896139, + "grad_norm": 0.11397820711135864, + "learning_rate": 0.00018092341604606014, + "loss": 0.39861786365509033, + "memory(GiB)": 78.33, + "step": 2386, + "token_acc": 0.8822021941317055, + "train_speed(iter/s)": 0.032618 + }, + { + "epoch": 0.46252967107494064, + "grad_norm": 0.1112833321094513, + "learning_rate": 0.00018082934273547008, + "loss": 0.40602025389671326, + "memory(GiB)": 78.33, + "step": 2387, + "token_acc": 0.8811936155447606, + "train_speed(iter/s)": 0.032619 + }, + { + "epoch": 0.4627234413602674, + "grad_norm": 0.11048243939876556, + "learning_rate": 0.00018073525676244053, + "loss": 0.3989701271057129, + "memory(GiB)": 78.33, + "step": 2388, + "token_acc": 0.8846206164812145, + "train_speed(iter/s)": 0.03262 + }, + { + "epoch": 0.46291721164559413, + "grad_norm": 0.12440559267997742, + "learning_rate": 0.00018064115816561515, + "loss": 0.397320419549942, + "memory(GiB)": 78.33, + "step": 2389, + "token_acc": 0.8848505094371137, + "train_speed(iter/s)": 0.032621 + }, + { + "epoch": 0.4631109819309209, + "grad_norm": 0.11104737967252731, + "learning_rate": 0.00018054704698364273, + "loss": 0.39772453904151917, + "memory(GiB)": 78.33, + "step": 2390, + "token_acc": 0.8835264012326995, + "train_speed(iter/s)": 0.032622 + }, + { + "epoch": 0.4633047522162476, + "grad_norm": 0.1087903156876564, + "learning_rate": 0.00018045292325517736, + "loss": 0.39890480041503906, + "memory(GiB)": 78.33, + "step": 2391, + "token_acc": 0.8840932546508243, + "train_speed(iter/s)": 0.032623 + }, + { + "epoch": 0.46349852250157436, + "grad_norm": 0.10867461562156677, + "learning_rate": 0.00018035878701887803, + "loss": 0.3494797348976135, + "memory(GiB)": 78.33, + "step": 2392, + "token_acc": 0.897205366586873, + "train_speed(iter/s)": 0.032624 + }, + { + "epoch": 0.4636922927869011, + "grad_norm": 0.11319278180599213, + "learning_rate": 0.00018026463831340915, + "loss": 0.36147987842559814, + "memory(GiB)": 78.33, + "step": 2393, + "token_acc": 0.8932517509038962, + "train_speed(iter/s)": 0.032625 + }, + { + "epoch": 0.46388606307222785, + "grad_norm": 0.1191987693309784, + "learning_rate": 0.00018017047717744006, + "loss": 0.4088186025619507, + "memory(GiB)": 78.33, + "step": 2394, + "token_acc": 0.88241711618886, + "train_speed(iter/s)": 0.032626 + }, + { + "epoch": 0.4640798333575546, + "grad_norm": 0.10697056353092194, + "learning_rate": 0.00018007630364964524, + "loss": 0.3530442714691162, + "memory(GiB)": 78.33, + "step": 2395, + "token_acc": 0.8948714966856611, + "train_speed(iter/s)": 0.032628 + }, + { + "epoch": 0.46427360364288134, + "grad_norm": 0.1151200458407402, + "learning_rate": 0.00017998211776870435, + "loss": 0.4034122824668884, + "memory(GiB)": 78.33, + "step": 2396, + "token_acc": 0.8807363035709694, + "train_speed(iter/s)": 0.032629 + }, + { + "epoch": 0.4644673739282081, + "grad_norm": 0.11261675506830215, + "learning_rate": 0.00017988791957330205, + "loss": 0.3917164206504822, + "memory(GiB)": 78.33, + "step": 2397, + "token_acc": 0.8845557080623023, + "train_speed(iter/s)": 0.03263 + }, + { + "epoch": 0.4646611442135348, + "grad_norm": 0.10412374138832092, + "learning_rate": 0.00017979370910212807, + "loss": 0.34287336468696594, + "memory(GiB)": 78.33, + "step": 2398, + "token_acc": 0.8996082907468417, + "train_speed(iter/s)": 0.032631 + }, + { + "epoch": 0.4648549144988616, + "grad_norm": 0.1128680557012558, + "learning_rate": 0.00017969948639387715, + "loss": 0.35632070899009705, + "memory(GiB)": 78.33, + "step": 2399, + "token_acc": 0.8946635868277659, + "train_speed(iter/s)": 0.032632 + }, + { + "epoch": 0.46504868478418837, + "grad_norm": 0.10247381776571274, + "learning_rate": 0.00017960525148724916, + "loss": 0.3603074848651886, + "memory(GiB)": 78.33, + "step": 2400, + "token_acc": 0.891370611730082, + "train_speed(iter/s)": 0.032633 + }, + { + "epoch": 0.4652424550695151, + "grad_norm": 0.10234736651182175, + "learning_rate": 0.00017951100442094878, + "loss": 0.3643084764480591, + "memory(GiB)": 78.33, + "step": 2401, + "token_acc": 0.892455605758117, + "train_speed(iter/s)": 0.032628 + }, + { + "epoch": 0.46543622535484186, + "grad_norm": 0.11007906496524811, + "learning_rate": 0.00017941674523368594, + "loss": 0.36536821722984314, + "memory(GiB)": 78.33, + "step": 2402, + "token_acc": 0.8915501381254806, + "train_speed(iter/s)": 0.032629 + }, + { + "epoch": 0.4656299956401686, + "grad_norm": 0.0991489514708519, + "learning_rate": 0.00017932247396417538, + "loss": 0.32952260971069336, + "memory(GiB)": 78.33, + "step": 2403, + "token_acc": 0.9006100723689086, + "train_speed(iter/s)": 0.032631 + }, + { + "epoch": 0.46582376592549535, + "grad_norm": 0.09857763350009918, + "learning_rate": 0.00017922819065113683, + "loss": 0.36439892649650574, + "memory(GiB)": 78.33, + "step": 2404, + "token_acc": 0.8917956246540396, + "train_speed(iter/s)": 0.032631 + }, + { + "epoch": 0.4660175362108221, + "grad_norm": 0.11544130742549896, + "learning_rate": 0.000179133895333295, + "loss": 0.39752769470214844, + "memory(GiB)": 78.33, + "step": 2405, + "token_acc": 0.8802836879432624, + "train_speed(iter/s)": 0.032633 + }, + { + "epoch": 0.46621130649614884, + "grad_norm": 0.11335242539644241, + "learning_rate": 0.0001790395880493795, + "loss": 0.3888709843158722, + "memory(GiB)": 78.33, + "step": 2406, + "token_acc": 0.8840124504810413, + "train_speed(iter/s)": 0.032634 + }, + { + "epoch": 0.4664050767814756, + "grad_norm": 0.1116030365228653, + "learning_rate": 0.00017894526883812485, + "loss": 0.3878939747810364, + "memory(GiB)": 78.33, + "step": 2407, + "token_acc": 0.885872988604478, + "train_speed(iter/s)": 0.032635 + }, + { + "epoch": 0.4665988470668023, + "grad_norm": 0.10574699193239212, + "learning_rate": 0.00017885093773827048, + "loss": 0.36204928159713745, + "memory(GiB)": 78.33, + "step": 2408, + "token_acc": 0.8934056241568953, + "train_speed(iter/s)": 0.032636 + }, + { + "epoch": 0.46679261735212907, + "grad_norm": 0.10200771689414978, + "learning_rate": 0.00017875659478856076, + "loss": 0.3507916033267975, + "memory(GiB)": 78.33, + "step": 2409, + "token_acc": 0.8951724847051565, + "train_speed(iter/s)": 0.032636 + }, + { + "epoch": 0.4669863876374558, + "grad_norm": 0.10833270847797394, + "learning_rate": 0.00017866224002774478, + "loss": 0.38488560914993286, + "memory(GiB)": 78.33, + "step": 2410, + "token_acc": 0.8863565847742081, + "train_speed(iter/s)": 0.032637 + }, + { + "epoch": 0.46718015792278256, + "grad_norm": 0.1008668914437294, + "learning_rate": 0.00017856787349457672, + "loss": 0.37106162309646606, + "memory(GiB)": 78.33, + "step": 2411, + "token_acc": 0.8896542726679713, + "train_speed(iter/s)": 0.032638 + }, + { + "epoch": 0.4673739282081093, + "grad_norm": 0.11065692454576492, + "learning_rate": 0.0001784734952278153, + "loss": 0.3777768313884735, + "memory(GiB)": 78.33, + "step": 2412, + "token_acc": 0.8877977553825012, + "train_speed(iter/s)": 0.03264 + }, + { + "epoch": 0.46756769849343605, + "grad_norm": 0.10646090656518936, + "learning_rate": 0.00017837910526622436, + "loss": 0.3846604824066162, + "memory(GiB)": 78.33, + "step": 2413, + "token_acc": 0.8871676430824695, + "train_speed(iter/s)": 0.032641 + }, + { + "epoch": 0.4677614687787628, + "grad_norm": 0.09328921884298325, + "learning_rate": 0.00017828470364857226, + "loss": 0.32804617285728455, + "memory(GiB)": 78.33, + "step": 2414, + "token_acc": 0.9045698603985205, + "train_speed(iter/s)": 0.032641 + }, + { + "epoch": 0.46795523906408953, + "grad_norm": 0.09627443552017212, + "learning_rate": 0.00017819029041363232, + "loss": 0.33626145124435425, + "memory(GiB)": 78.33, + "step": 2415, + "token_acc": 0.8980459016393443, + "train_speed(iter/s)": 0.032642 + }, + { + "epoch": 0.4681490093494163, + "grad_norm": 0.10673625767230988, + "learning_rate": 0.00017809586560018262, + "loss": 0.38183170557022095, + "memory(GiB)": 78.33, + "step": 2416, + "token_acc": 0.8858918947761593, + "train_speed(iter/s)": 0.032643 + }, + { + "epoch": 0.468342779634743, + "grad_norm": 0.1077309101819992, + "learning_rate": 0.00017800142924700592, + "loss": 0.3792920410633087, + "memory(GiB)": 78.33, + "step": 2417, + "token_acc": 0.8886526780784098, + "train_speed(iter/s)": 0.032644 + }, + { + "epoch": 0.46853654992006977, + "grad_norm": 0.11391132324934006, + "learning_rate": 0.00017790698139288983, + "loss": 0.3999425768852234, + "memory(GiB)": 78.33, + "step": 2418, + "token_acc": 0.8836907644413697, + "train_speed(iter/s)": 0.032645 + }, + { + "epoch": 0.4687303202053965, + "grad_norm": 0.10699284076690674, + "learning_rate": 0.0001778125220766266, + "loss": 0.3917970061302185, + "memory(GiB)": 78.33, + "step": 2419, + "token_acc": 0.8875225537820958, + "train_speed(iter/s)": 0.032646 + }, + { + "epoch": 0.46892409049072326, + "grad_norm": 0.10588902235031128, + "learning_rate": 0.00017771805133701322, + "loss": 0.3656570315361023, + "memory(GiB)": 78.33, + "step": 2420, + "token_acc": 0.8920440225153323, + "train_speed(iter/s)": 0.032648 + }, + { + "epoch": 0.46911786077605, + "grad_norm": 0.09696738421916962, + "learning_rate": 0.00017762356921285127, + "loss": 0.35896116495132446, + "memory(GiB)": 78.33, + "step": 2421, + "token_acc": 0.8952547723573225, + "train_speed(iter/s)": 0.032648 + }, + { + "epoch": 0.46931163106137674, + "grad_norm": 0.09918226301670074, + "learning_rate": 0.00017752907574294726, + "loss": 0.36196303367614746, + "memory(GiB)": 78.33, + "step": 2422, + "token_acc": 0.8924877517691889, + "train_speed(iter/s)": 0.032649 + }, + { + "epoch": 0.4695054013467035, + "grad_norm": 0.09119024872779846, + "learning_rate": 0.000177434570966112, + "loss": 0.34092551469802856, + "memory(GiB)": 78.33, + "step": 2423, + "token_acc": 0.898368601754819, + "train_speed(iter/s)": 0.03265 + }, + { + "epoch": 0.46969917163203023, + "grad_norm": 0.10667548328638077, + "learning_rate": 0.00017734005492116135, + "loss": 0.4041289687156677, + "memory(GiB)": 78.33, + "step": 2424, + "token_acc": 0.8804930885083557, + "train_speed(iter/s)": 0.032651 + }, + { + "epoch": 0.469892941917357, + "grad_norm": 0.10172632336616516, + "learning_rate": 0.00017724552764691545, + "loss": 0.3615379333496094, + "memory(GiB)": 78.33, + "step": 2425, + "token_acc": 0.8944812914528845, + "train_speed(iter/s)": 0.032652 + }, + { + "epoch": 0.4700867122026837, + "grad_norm": 0.10548295080661774, + "learning_rate": 0.00017715098918219926, + "loss": 0.36227190494537354, + "memory(GiB)": 78.33, + "step": 2426, + "token_acc": 0.8919847828938738, + "train_speed(iter/s)": 0.032653 + }, + { + "epoch": 0.47028048248801047, + "grad_norm": 0.10230764001607895, + "learning_rate": 0.0001770564395658422, + "loss": 0.3501635789871216, + "memory(GiB)": 78.33, + "step": 2427, + "token_acc": 0.8956358685880147, + "train_speed(iter/s)": 0.032654 + }, + { + "epoch": 0.4704742527733372, + "grad_norm": 0.10815402865409851, + "learning_rate": 0.00017696187883667837, + "loss": 0.3826008439064026, + "memory(GiB)": 78.33, + "step": 2428, + "token_acc": 0.8863275727763192, + "train_speed(iter/s)": 0.032655 + }, + { + "epoch": 0.47066802305866395, + "grad_norm": 0.12040967494249344, + "learning_rate": 0.00017686730703354641, + "loss": 0.43001601099967957, + "memory(GiB)": 78.33, + "step": 2429, + "token_acc": 0.8732980332829047, + "train_speed(iter/s)": 0.032656 + }, + { + "epoch": 0.4708617933439907, + "grad_norm": 0.10938671231269836, + "learning_rate": 0.00017677272419528952, + "loss": 0.39348623156547546, + "memory(GiB)": 78.33, + "step": 2430, + "token_acc": 0.8847729835272117, + "train_speed(iter/s)": 0.032657 + }, + { + "epoch": 0.47105556362931744, + "grad_norm": 0.09924609214067459, + "learning_rate": 0.00017667813036075538, + "loss": 0.35848769545555115, + "memory(GiB)": 78.33, + "step": 2431, + "token_acc": 0.8955514503969244, + "train_speed(iter/s)": 0.032658 + }, + { + "epoch": 0.4712493339146442, + "grad_norm": 0.09913709759712219, + "learning_rate": 0.00017658352556879623, + "loss": 0.3611469864845276, + "memory(GiB)": 78.33, + "step": 2432, + "token_acc": 0.8925030260887481, + "train_speed(iter/s)": 0.032659 + }, + { + "epoch": 0.47144310419997093, + "grad_norm": 0.10867290198802948, + "learning_rate": 0.00017648890985826881, + "loss": 0.34665271639823914, + "memory(GiB)": 78.33, + "step": 2433, + "token_acc": 0.8978396543446951, + "train_speed(iter/s)": 0.03266 + }, + { + "epoch": 0.4716368744852977, + "grad_norm": 0.10434415936470032, + "learning_rate": 0.00017639428326803432, + "loss": 0.3893589973449707, + "memory(GiB)": 78.33, + "step": 2434, + "token_acc": 0.8848601637972565, + "train_speed(iter/s)": 0.032661 + }, + { + "epoch": 0.4718306447706244, + "grad_norm": 0.10091494768857956, + "learning_rate": 0.00017629964583695847, + "loss": 0.3660696744918823, + "memory(GiB)": 78.33, + "step": 2435, + "token_acc": 0.893289756957954, + "train_speed(iter/s)": 0.032662 + }, + { + "epoch": 0.47202441505595116, + "grad_norm": 0.11700880527496338, + "learning_rate": 0.00017620499760391133, + "loss": 0.3919360041618347, + "memory(GiB)": 78.33, + "step": 2436, + "token_acc": 0.8846436236304077, + "train_speed(iter/s)": 0.032663 + }, + { + "epoch": 0.4722181853412779, + "grad_norm": 0.11263095587491989, + "learning_rate": 0.00017611033860776752, + "loss": 0.36513423919677734, + "memory(GiB)": 78.33, + "step": 2437, + "token_acc": 0.8920323843097422, + "train_speed(iter/s)": 0.032664 + }, + { + "epoch": 0.47241195562660465, + "grad_norm": 0.09740516543388367, + "learning_rate": 0.0001760156688874061, + "loss": 0.3375285565853119, + "memory(GiB)": 78.33, + "step": 2438, + "token_acc": 0.899163848916149, + "train_speed(iter/s)": 0.032665 + }, + { + "epoch": 0.4726057259119314, + "grad_norm": 0.10824833065271378, + "learning_rate": 0.00017592098848171037, + "loss": 0.36911553144454956, + "memory(GiB)": 78.33, + "step": 2439, + "token_acc": 0.8915403549682601, + "train_speed(iter/s)": 0.032666 + }, + { + "epoch": 0.47279949619725814, + "grad_norm": 0.1003868579864502, + "learning_rate": 0.00017582629742956816, + "loss": 0.3386095464229584, + "memory(GiB)": 78.33, + "step": 2440, + "token_acc": 0.8985514852524404, + "train_speed(iter/s)": 0.032667 + }, + { + "epoch": 0.4729932664825849, + "grad_norm": 0.13465555012226105, + "learning_rate": 0.00017573159576987155, + "loss": 0.4634130001068115, + "memory(GiB)": 78.33, + "step": 2441, + "token_acc": 0.8691293051762041, + "train_speed(iter/s)": 0.032668 + }, + { + "epoch": 0.47318703676791163, + "grad_norm": 0.11581727117300034, + "learning_rate": 0.0001756368835415172, + "loss": 0.3709140717983246, + "memory(GiB)": 78.33, + "step": 2442, + "token_acc": 0.8906245185938318, + "train_speed(iter/s)": 0.032669 + }, + { + "epoch": 0.4733808070532384, + "grad_norm": 0.09655480831861496, + "learning_rate": 0.00017554216078340582, + "loss": 0.3356662094593048, + "memory(GiB)": 78.33, + "step": 2443, + "token_acc": 0.901089996601929, + "train_speed(iter/s)": 0.03267 + }, + { + "epoch": 0.4735745773385651, + "grad_norm": 0.10045115649700165, + "learning_rate": 0.00017544742753444268, + "loss": 0.33761128783226013, + "memory(GiB)": 78.33, + "step": 2444, + "token_acc": 0.8995756718528995, + "train_speed(iter/s)": 0.032671 + }, + { + "epoch": 0.47376834762389186, + "grad_norm": 0.10537243634462357, + "learning_rate": 0.0001753526838335373, + "loss": 0.3550539016723633, + "memory(GiB)": 78.33, + "step": 2445, + "token_acc": 0.8979015012132363, + "train_speed(iter/s)": 0.032672 + }, + { + "epoch": 0.4739621179092186, + "grad_norm": 0.09614920616149902, + "learning_rate": 0.0001752579297196034, + "loss": 0.3335365056991577, + "memory(GiB)": 78.33, + "step": 2446, + "token_acc": 0.9002796136248093, + "train_speed(iter/s)": 0.032673 + }, + { + "epoch": 0.47415588819454535, + "grad_norm": 0.10471241921186447, + "learning_rate": 0.00017516316523155903, + "loss": 0.3631875216960907, + "memory(GiB)": 78.33, + "step": 2447, + "token_acc": 0.8914798436857261, + "train_speed(iter/s)": 0.032674 + }, + { + "epoch": 0.4743496584798721, + "grad_norm": 0.10309155285358429, + "learning_rate": 0.00017506839040832653, + "loss": 0.36359232664108276, + "memory(GiB)": 78.33, + "step": 2448, + "token_acc": 0.8915206063477025, + "train_speed(iter/s)": 0.032675 + }, + { + "epoch": 0.47454342876519884, + "grad_norm": 0.09676847606897354, + "learning_rate": 0.00017497360528883252, + "loss": 0.33657437562942505, + "memory(GiB)": 78.33, + "step": 2449, + "token_acc": 0.8991928296794235, + "train_speed(iter/s)": 0.032676 + }, + { + "epoch": 0.4747371990505256, + "grad_norm": 0.1178392842411995, + "learning_rate": 0.0001748788099120077, + "loss": 0.3946092426776886, + "memory(GiB)": 78.33, + "step": 2450, + "token_acc": 0.8864816204051013, + "train_speed(iter/s)": 0.032677 + }, + { + "epoch": 0.4749309693358523, + "grad_norm": 0.10294267535209656, + "learning_rate": 0.00017478400431678715, + "loss": 0.35818547010421753, + "memory(GiB)": 78.33, + "step": 2451, + "token_acc": 0.8954294409377818, + "train_speed(iter/s)": 0.032678 + }, + { + "epoch": 0.47512473962117907, + "grad_norm": 0.09975744038820267, + "learning_rate": 0.00017468918854211007, + "loss": 0.3438222408294678, + "memory(GiB)": 78.33, + "step": 2452, + "token_acc": 0.899183906851024, + "train_speed(iter/s)": 0.032679 + }, + { + "epoch": 0.4753185099065058, + "grad_norm": 0.12452986091375351, + "learning_rate": 0.00017459436262691987, + "loss": 0.41207653284072876, + "memory(GiB)": 78.33, + "step": 2453, + "token_acc": 0.8813101862650361, + "train_speed(iter/s)": 0.03268 + }, + { + "epoch": 0.47551228019183256, + "grad_norm": 0.09743805229663849, + "learning_rate": 0.00017449952661016395, + "loss": 0.3469730019569397, + "memory(GiB)": 78.33, + "step": 2454, + "token_acc": 0.8969273247713859, + "train_speed(iter/s)": 0.032681 + }, + { + "epoch": 0.4757060504771593, + "grad_norm": 0.10268343985080719, + "learning_rate": 0.0001744046805307942, + "loss": 0.33460330963134766, + "memory(GiB)": 78.33, + "step": 2455, + "token_acc": 0.900157210232957, + "train_speed(iter/s)": 0.032682 + }, + { + "epoch": 0.47589982076248605, + "grad_norm": 0.1134437769651413, + "learning_rate": 0.00017430982442776636, + "loss": 0.3615866005420685, + "memory(GiB)": 78.33, + "step": 2456, + "token_acc": 0.8943463421872682, + "train_speed(iter/s)": 0.032683 + }, + { + "epoch": 0.4760935910478128, + "grad_norm": 0.10063590854406357, + "learning_rate": 0.0001742149583400404, + "loss": 0.35712823271751404, + "memory(GiB)": 78.33, + "step": 2457, + "token_acc": 0.8925671118305679, + "train_speed(iter/s)": 0.032684 + }, + { + "epoch": 0.47628736133313954, + "grad_norm": 0.11840621381998062, + "learning_rate": 0.0001741200823065804, + "loss": 0.3843688666820526, + "memory(GiB)": 78.33, + "step": 2458, + "token_acc": 0.8865816184002235, + "train_speed(iter/s)": 0.032685 + }, + { + "epoch": 0.47648113161846634, + "grad_norm": 0.11133860051631927, + "learning_rate": 0.00017402519636635445, + "loss": 0.36577725410461426, + "memory(GiB)": 78.33, + "step": 2459, + "token_acc": 0.8931690359777295, + "train_speed(iter/s)": 0.032686 + }, + { + "epoch": 0.4766749019037931, + "grad_norm": 0.10411540418863297, + "learning_rate": 0.00017393030055833477, + "loss": 0.40943437814712524, + "memory(GiB)": 78.33, + "step": 2460, + "token_acc": 0.8801027690550957, + "train_speed(iter/s)": 0.032688 + }, + { + "epoch": 0.4768686721891198, + "grad_norm": 0.11033840477466583, + "learning_rate": 0.00017383539492149755, + "loss": 0.39954739809036255, + "memory(GiB)": 78.33, + "step": 2461, + "token_acc": 0.8828463277143869, + "train_speed(iter/s)": 0.032688 + }, + { + "epoch": 0.47706244247444657, + "grad_norm": 0.10192213207483292, + "learning_rate": 0.00017374047949482324, + "loss": 0.36716240644454956, + "memory(GiB)": 78.33, + "step": 2462, + "token_acc": 0.8911974494758457, + "train_speed(iter/s)": 0.032689 + }, + { + "epoch": 0.4772562127597733, + "grad_norm": 0.10164597630500793, + "learning_rate": 0.000173645554317296, + "loss": 0.37092214822769165, + "memory(GiB)": 78.33, + "step": 2463, + "token_acc": 0.8932100329364074, + "train_speed(iter/s)": 0.03269 + }, + { + "epoch": 0.47744998304510006, + "grad_norm": 0.10854342579841614, + "learning_rate": 0.0001735506194279043, + "loss": 0.37675389647483826, + "memory(GiB)": 78.33, + "step": 2464, + "token_acc": 0.8900420837124658, + "train_speed(iter/s)": 0.032691 + }, + { + "epoch": 0.4776437533304268, + "grad_norm": 0.1078154519200325, + "learning_rate": 0.00017345567486564033, + "loss": 0.3637319505214691, + "memory(GiB)": 78.33, + "step": 2465, + "token_acc": 0.8951967543337022, + "train_speed(iter/s)": 0.032692 + }, + { + "epoch": 0.47783752361575355, + "grad_norm": 0.11544948071241379, + "learning_rate": 0.00017336072066950043, + "loss": 0.42091140151023865, + "memory(GiB)": 78.33, + "step": 2466, + "token_acc": 0.8770483917519161, + "train_speed(iter/s)": 0.032693 + }, + { + "epoch": 0.4780312939010803, + "grad_norm": 0.10689322650432587, + "learning_rate": 0.00017326575687848483, + "loss": 0.36379310488700867, + "memory(GiB)": 78.33, + "step": 2467, + "token_acc": 0.8929775576862291, + "train_speed(iter/s)": 0.032694 + }, + { + "epoch": 0.47822506418640703, + "grad_norm": 0.1052493080496788, + "learning_rate": 0.00017317078353159767, + "loss": 0.36684519052505493, + "memory(GiB)": 78.33, + "step": 2468, + "token_acc": 0.8944831280128549, + "train_speed(iter/s)": 0.032695 + }, + { + "epoch": 0.4784188344717338, + "grad_norm": 0.11039218306541443, + "learning_rate": 0.00017307580066784706, + "loss": 0.36911848187446594, + "memory(GiB)": 78.33, + "step": 2469, + "token_acc": 0.891165015793186, + "train_speed(iter/s)": 0.032696 + }, + { + "epoch": 0.4786126047570605, + "grad_norm": 0.09582363814115524, + "learning_rate": 0.00017298080832624512, + "loss": 0.32416924834251404, + "memory(GiB)": 78.33, + "step": 2470, + "token_acc": 0.9032258064516129, + "train_speed(iter/s)": 0.032697 + }, + { + "epoch": 0.47880637504238727, + "grad_norm": 0.09783096611499786, + "learning_rate": 0.00017288580654580766, + "loss": 0.3526180386543274, + "memory(GiB)": 78.33, + "step": 2471, + "token_acc": 0.8959444254195, + "train_speed(iter/s)": 0.032698 + }, + { + "epoch": 0.479000145327714, + "grad_norm": 0.11712806671857834, + "learning_rate": 0.00017279079536555448, + "loss": 0.416570246219635, + "memory(GiB)": 78.33, + "step": 2472, + "token_acc": 0.8787166033411736, + "train_speed(iter/s)": 0.032699 + }, + { + "epoch": 0.47919391561304076, + "grad_norm": 0.10478947311639786, + "learning_rate": 0.00017269577482450927, + "loss": 0.3899994194507599, + "memory(GiB)": 78.33, + "step": 2473, + "token_acc": 0.8852699947849998, + "train_speed(iter/s)": 0.0327 + }, + { + "epoch": 0.4793876858983675, + "grad_norm": 0.10049055516719818, + "learning_rate": 0.0001726007449616994, + "loss": 0.32086169719696045, + "memory(GiB)": 78.33, + "step": 2474, + "token_acc": 0.9046260601387818, + "train_speed(iter/s)": 0.032701 + }, + { + "epoch": 0.47958145618369424, + "grad_norm": 0.09648868441581726, + "learning_rate": 0.00017250570581615632, + "loss": 0.3393422067165375, + "memory(GiB)": 78.33, + "step": 2475, + "token_acc": 0.897726723095526, + "train_speed(iter/s)": 0.032702 + }, + { + "epoch": 0.479775226469021, + "grad_norm": 0.1213340163230896, + "learning_rate": 0.00017241065742691508, + "loss": 0.3405624032020569, + "memory(GiB)": 78.33, + "step": 2476, + "token_acc": 0.8992950060470193, + "train_speed(iter/s)": 0.032703 + }, + { + "epoch": 0.47996899675434773, + "grad_norm": 0.10845254361629486, + "learning_rate": 0.00017231559983301467, + "loss": 0.3818724751472473, + "memory(GiB)": 78.33, + "step": 2477, + "token_acc": 0.8866860090264346, + "train_speed(iter/s)": 0.032704 + }, + { + "epoch": 0.4801627670396745, + "grad_norm": 0.1041831448674202, + "learning_rate": 0.00017222053307349775, + "loss": 0.36897408962249756, + "memory(GiB)": 78.33, + "step": 2478, + "token_acc": 0.8889274835675733, + "train_speed(iter/s)": 0.032705 + }, + { + "epoch": 0.4803565373250012, + "grad_norm": 0.11223351210355759, + "learning_rate": 0.00017212545718741084, + "loss": 0.39438849687576294, + "memory(GiB)": 78.33, + "step": 2479, + "token_acc": 0.8818676281855405, + "train_speed(iter/s)": 0.032706 + }, + { + "epoch": 0.48055030761032796, + "grad_norm": 0.11003533750772476, + "learning_rate": 0.0001720303722138041, + "loss": 0.3478088080883026, + "memory(GiB)": 78.33, + "step": 2480, + "token_acc": 0.8965669378188336, + "train_speed(iter/s)": 0.032707 + }, + { + "epoch": 0.4807440778956547, + "grad_norm": 0.11993599683046341, + "learning_rate": 0.0001719352781917315, + "loss": 0.35039135813713074, + "memory(GiB)": 78.33, + "step": 2481, + "token_acc": 0.896633438940834, + "train_speed(iter/s)": 0.032708 + }, + { + "epoch": 0.48093784818098145, + "grad_norm": 0.10672726482152939, + "learning_rate": 0.00017184017516025076, + "loss": 0.3407592177391052, + "memory(GiB)": 78.33, + "step": 2482, + "token_acc": 0.8988511857298958, + "train_speed(iter/s)": 0.032709 + }, + { + "epoch": 0.4811316184663082, + "grad_norm": 0.1108117550611496, + "learning_rate": 0.00017174506315842316, + "loss": 0.3430328965187073, + "memory(GiB)": 78.33, + "step": 2483, + "token_acc": 0.9010159105568695, + "train_speed(iter/s)": 0.03271 + }, + { + "epoch": 0.48132538875163494, + "grad_norm": 0.10908929258584976, + "learning_rate": 0.00017164994222531384, + "loss": 0.365764319896698, + "memory(GiB)": 78.33, + "step": 2484, + "token_acc": 0.8944871455075537, + "train_speed(iter/s)": 0.032711 + }, + { + "epoch": 0.4815191590369617, + "grad_norm": 0.10668904334306717, + "learning_rate": 0.0001715548123999915, + "loss": 0.36919450759887695, + "memory(GiB)": 78.33, + "step": 2485, + "token_acc": 0.8912290460418194, + "train_speed(iter/s)": 0.032712 + }, + { + "epoch": 0.48171292932228843, + "grad_norm": 0.10080403089523315, + "learning_rate": 0.0001714596737215285, + "loss": 0.33419883251190186, + "memory(GiB)": 78.33, + "step": 2486, + "token_acc": 0.899380222317015, + "train_speed(iter/s)": 0.032713 + }, + { + "epoch": 0.4819066996076152, + "grad_norm": 0.10038257390260696, + "learning_rate": 0.00017136452622900083, + "loss": 0.3448052406311035, + "memory(GiB)": 78.33, + "step": 2487, + "token_acc": 0.898629288314102, + "train_speed(iter/s)": 0.032714 + }, + { + "epoch": 0.4821004698929419, + "grad_norm": 0.10354321449995041, + "learning_rate": 0.0001712693699614882, + "loss": 0.35112708806991577, + "memory(GiB)": 78.33, + "step": 2488, + "token_acc": 0.8958072674031679, + "train_speed(iter/s)": 0.032715 + }, + { + "epoch": 0.48229424017826866, + "grad_norm": 0.10565353184938431, + "learning_rate": 0.00017117420495807372, + "loss": 0.34346991777420044, + "memory(GiB)": 78.33, + "step": 2489, + "token_acc": 0.8978623685413809, + "train_speed(iter/s)": 0.032716 + }, + { + "epoch": 0.4824880104635954, + "grad_norm": 0.10667932033538818, + "learning_rate": 0.00017107903125784433, + "loss": 0.39348533749580383, + "memory(GiB)": 78.33, + "step": 2490, + "token_acc": 0.8839731259234479, + "train_speed(iter/s)": 0.032717 + }, + { + "epoch": 0.48268178074892215, + "grad_norm": 0.10690039396286011, + "learning_rate": 0.00017098384889989044, + "loss": 0.36393576860427856, + "memory(GiB)": 78.33, + "step": 2491, + "token_acc": 0.891016713091922, + "train_speed(iter/s)": 0.032718 + }, + { + "epoch": 0.4828755510342489, + "grad_norm": 0.10752350091934204, + "learning_rate": 0.0001708886579233059, + "loss": 0.3595428764820099, + "memory(GiB)": 78.33, + "step": 2492, + "token_acc": 0.8925832633641197, + "train_speed(iter/s)": 0.032719 + }, + { + "epoch": 0.48306932131957564, + "grad_norm": 0.10408618301153183, + "learning_rate": 0.00017079345836718828, + "loss": 0.35838645696640015, + "memory(GiB)": 78.33, + "step": 2493, + "token_acc": 0.893934211276438, + "train_speed(iter/s)": 0.03272 + }, + { + "epoch": 0.4832630916049024, + "grad_norm": 0.10144350677728653, + "learning_rate": 0.0001706982502706385, + "loss": 0.3558150827884674, + "memory(GiB)": 78.33, + "step": 2494, + "token_acc": 0.8957832584387083, + "train_speed(iter/s)": 0.032721 + }, + { + "epoch": 0.48345686189022913, + "grad_norm": 0.10692603886127472, + "learning_rate": 0.00017060303367276121, + "loss": 0.4040507376194, + "memory(GiB)": 78.33, + "step": 2495, + "token_acc": 0.8803178194638112, + "train_speed(iter/s)": 0.032722 + }, + { + "epoch": 0.4836506321755559, + "grad_norm": 0.10149496793746948, + "learning_rate": 0.00017050780861266432, + "loss": 0.36347493529319763, + "memory(GiB)": 78.33, + "step": 2496, + "token_acc": 0.8917304707027652, + "train_speed(iter/s)": 0.032723 + }, + { + "epoch": 0.4838444024608826, + "grad_norm": 0.11008831858634949, + "learning_rate": 0.00017041257512945943, + "loss": 0.36779606342315674, + "memory(GiB)": 78.33, + "step": 2497, + "token_acc": 0.8886516076638493, + "train_speed(iter/s)": 0.032724 + }, + { + "epoch": 0.48403817274620936, + "grad_norm": 0.09817710518836975, + "learning_rate": 0.00017031733326226142, + "loss": 0.3289712071418762, + "memory(GiB)": 78.33, + "step": 2498, + "token_acc": 0.9038888272495285, + "train_speed(iter/s)": 0.032725 + }, + { + "epoch": 0.4842319430315361, + "grad_norm": 0.11836884170770645, + "learning_rate": 0.00017022208305018867, + "loss": 0.4116940498352051, + "memory(GiB)": 78.33, + "step": 2499, + "token_acc": 0.88121387283237, + "train_speed(iter/s)": 0.032726 + }, + { + "epoch": 0.48442571331686285, + "grad_norm": 0.11396266520023346, + "learning_rate": 0.00017012682453236303, + "loss": 0.3659469485282898, + "memory(GiB)": 78.33, + "step": 2500, + "token_acc": 0.8918359785092324, + "train_speed(iter/s)": 0.032727 + }, + { + "epoch": 0.48442571331686285, + "eval_loss": 0.4225236177444458, + "eval_runtime": 1345.6225, + "eval_samples_per_second": 5.016, + "eval_steps_per_second": 5.016, + "eval_token_acc": 0.8928025735859316, + "step": 2500 + }, + { + "epoch": 0.4846194836021896, + "grad_norm": 0.10215523093938828, + "learning_rate": 0.00017003155774790966, + "loss": 0.37822431325912476, + "memory(GiB)": 78.33, + "step": 2501, + "token_acc": 0.8875518780506904, + "train_speed(iter/s)": 0.032161 + }, + { + "epoch": 0.48481325388751634, + "grad_norm": 0.09499707072973251, + "learning_rate": 0.00016993628273595732, + "loss": 0.3357214629650116, + "memory(GiB)": 78.33, + "step": 2502, + "token_acc": 0.8995496714816547, + "train_speed(iter/s)": 0.032162 + }, + { + "epoch": 0.4850070241728431, + "grad_norm": 0.1082739308476448, + "learning_rate": 0.00016984099953563792, + "loss": 0.3810504078865051, + "memory(GiB)": 78.33, + "step": 2503, + "token_acc": 0.8882280116266552, + "train_speed(iter/s)": 0.032163 + }, + { + "epoch": 0.4852007944581698, + "grad_norm": 0.1032496765255928, + "learning_rate": 0.0001697457081860869, + "loss": 0.36440324783325195, + "memory(GiB)": 78.33, + "step": 2504, + "token_acc": 0.8939489607997896, + "train_speed(iter/s)": 0.032164 + }, + { + "epoch": 0.48539456474349657, + "grad_norm": 0.08897780627012253, + "learning_rate": 0.00016965040872644294, + "loss": 0.3292064964771271, + "memory(GiB)": 78.33, + "step": 2505, + "token_acc": 0.9021104037673323, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.4855883350288233, + "grad_norm": 0.10932856798171997, + "learning_rate": 0.0001695551011958481, + "loss": 0.39690276980400085, + "memory(GiB)": 78.33, + "step": 2506, + "token_acc": 0.8848341960945107, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.48578210531415006, + "grad_norm": 0.11414424329996109, + "learning_rate": 0.0001694597856334477, + "loss": 0.39790499210357666, + "memory(GiB)": 78.33, + "step": 2507, + "token_acc": 0.8816454951776053, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.4859758755994768, + "grad_norm": 0.09871453046798706, + "learning_rate": 0.00016936446207839042, + "loss": 0.3503097593784332, + "memory(GiB)": 78.33, + "step": 2508, + "token_acc": 0.8961086541229425, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.48616964588480355, + "grad_norm": 0.09798671305179596, + "learning_rate": 0.0001692691305698282, + "loss": 0.35509318113327026, + "memory(GiB)": 78.33, + "step": 2509, + "token_acc": 0.8953029405960135, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.4863634161701303, + "grad_norm": 0.10052474588155746, + "learning_rate": 0.00016917379114691635, + "loss": 0.34173664450645447, + "memory(GiB)": 78.33, + "step": 2510, + "token_acc": 0.8989416623644811, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.48655718645545704, + "grad_norm": 0.11249368637800217, + "learning_rate": 0.00016907844384881325, + "loss": 0.41095811128616333, + "memory(GiB)": 78.33, + "step": 2511, + "token_acc": 0.8812950699043415, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.4867509567407838, + "grad_norm": 0.10713987797498703, + "learning_rate": 0.00016898308871468059, + "loss": 0.3698621988296509, + "memory(GiB)": 78.33, + "step": 2512, + "token_acc": 0.8919433163888336, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.4869447270261105, + "grad_norm": 0.09921532869338989, + "learning_rate": 0.00016888772578368326, + "loss": 0.36754289269447327, + "memory(GiB)": 78.33, + "step": 2513, + "token_acc": 0.8912722283784497, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.48713849731143727, + "grad_norm": 0.10043738037347794, + "learning_rate": 0.00016879235509498943, + "loss": 0.3569088280200958, + "memory(GiB)": 78.33, + "step": 2514, + "token_acc": 0.8948838737949167, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.487332267596764, + "grad_norm": 0.1124173253774643, + "learning_rate": 0.00016869697668777043, + "loss": 0.37260574102401733, + "memory(GiB)": 78.33, + "step": 2515, + "token_acc": 0.8900623953736113, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.48752603788209076, + "grad_norm": 0.10620686411857605, + "learning_rate": 0.00016860159060120062, + "loss": 0.36633095145225525, + "memory(GiB)": 78.33, + "step": 2516, + "token_acc": 0.8924985397602425, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.4877198081674175, + "grad_norm": 0.10621728748083115, + "learning_rate": 0.00016850619687445778, + "loss": 0.36821886897087097, + "memory(GiB)": 78.33, + "step": 2517, + "token_acc": 0.8912531612453126, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.48791357845274425, + "grad_norm": 0.1054278165102005, + "learning_rate": 0.0001684107955467226, + "loss": 0.3753064274787903, + "memory(GiB)": 78.33, + "step": 2518, + "token_acc": 0.891217046851523, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.488107348738071, + "grad_norm": 0.09800657629966736, + "learning_rate": 0.00016831538665717895, + "loss": 0.35065239667892456, + "memory(GiB)": 78.33, + "step": 2519, + "token_acc": 0.8958839022878686, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.4883011190233978, + "grad_norm": 0.12331446260213852, + "learning_rate": 0.00016821997024501386, + "loss": 0.4164249897003174, + "memory(GiB)": 78.33, + "step": 2520, + "token_acc": 0.8785413567155202, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.48849488930872453, + "grad_norm": 0.10607205331325531, + "learning_rate": 0.00016812454634941739, + "loss": 0.34385666251182556, + "memory(GiB)": 78.33, + "step": 2521, + "token_acc": 0.8990350297422339, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.4886886595940513, + "grad_norm": 0.10245993733406067, + "learning_rate": 0.00016802911500958268, + "loss": 0.38447538018226624, + "memory(GiB)": 78.33, + "step": 2522, + "token_acc": 0.8873564827779333, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.488882429879378, + "grad_norm": 0.10764322429895401, + "learning_rate": 0.00016793367626470598, + "loss": 0.36548304557800293, + "memory(GiB)": 78.33, + "step": 2523, + "token_acc": 0.8924651924651925, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.48907620016470477, + "grad_norm": 0.11456014215946198, + "learning_rate": 0.0001678382301539866, + "loss": 0.3953639566898346, + "memory(GiB)": 78.33, + "step": 2524, + "token_acc": 0.8825286212045794, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.4892699704500315, + "grad_norm": 0.10977276414632797, + "learning_rate": 0.00016774277671662672, + "loss": 0.38997790217399597, + "memory(GiB)": 78.33, + "step": 2525, + "token_acc": 0.8871661125759487, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.48946374073535825, + "grad_norm": 0.10588563233613968, + "learning_rate": 0.00016764731599183173, + "loss": 0.3698723614215851, + "memory(GiB)": 78.33, + "step": 2526, + "token_acc": 0.8911557154232681, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.489657511020685, + "grad_norm": 0.1090141013264656, + "learning_rate": 0.00016755184801880976, + "loss": 0.3647971749305725, + "memory(GiB)": 78.33, + "step": 2527, + "token_acc": 0.8910625354128768, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.48985128130601174, + "grad_norm": 0.11149538308382034, + "learning_rate": 0.00016745637283677227, + "loss": 0.4022625982761383, + "memory(GiB)": 78.33, + "step": 2528, + "token_acc": 0.8828381516777448, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.4900450515913385, + "grad_norm": 0.102491594851017, + "learning_rate": 0.0001673608904849333, + "loss": 0.3276543915271759, + "memory(GiB)": 78.33, + "step": 2529, + "token_acc": 0.9035930180828514, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.49023882187666523, + "grad_norm": 0.09906461089849472, + "learning_rate": 0.00016726540100251013, + "loss": 0.33269554376602173, + "memory(GiB)": 78.33, + "step": 2530, + "token_acc": 0.9001250312578144, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.490432592161992, + "grad_norm": 0.10517586022615433, + "learning_rate": 0.00016716990442872286, + "loss": 0.3707316517829895, + "memory(GiB)": 78.33, + "step": 2531, + "token_acc": 0.8911390265885534, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.4906263624473187, + "grad_norm": 0.10482536256313324, + "learning_rate": 0.00016707440080279448, + "loss": 0.37238699197769165, + "memory(GiB)": 78.33, + "step": 2532, + "token_acc": 0.8896209510682288, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.49082013273264546, + "grad_norm": 0.10107911378145218, + "learning_rate": 0.00016697889016395085, + "loss": 0.34942498803138733, + "memory(GiB)": 78.33, + "step": 2533, + "token_acc": 0.8957952468007313, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.4910139030179722, + "grad_norm": 0.11440771073102951, + "learning_rate": 0.00016688337255142078, + "loss": 0.39366450905799866, + "memory(GiB)": 78.33, + "step": 2534, + "token_acc": 0.8840879915976193, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.49120767330329895, + "grad_norm": 0.11480151861906052, + "learning_rate": 0.00016678784800443593, + "loss": 0.3709234595298767, + "memory(GiB)": 78.33, + "step": 2535, + "token_acc": 0.8920728858433266, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.4914014435886257, + "grad_norm": 0.10538162291049957, + "learning_rate": 0.00016669231656223082, + "loss": 0.3718525171279907, + "memory(GiB)": 78.33, + "step": 2536, + "token_acc": 0.8928541556305238, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.49159521387395244, + "grad_norm": 0.10045890510082245, + "learning_rate": 0.00016659677826404273, + "loss": 0.3458371162414551, + "memory(GiB)": 78.33, + "step": 2537, + "token_acc": 0.8953016402557687, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.4917889841592792, + "grad_norm": 0.09948378801345825, + "learning_rate": 0.00016650123314911188, + "loss": 0.3648657500743866, + "memory(GiB)": 78.33, + "step": 2538, + "token_acc": 0.8927988345515284, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.49198275444460593, + "grad_norm": 0.11414239555597305, + "learning_rate": 0.00016640568125668117, + "loss": 0.39501774311065674, + "memory(GiB)": 78.33, + "step": 2539, + "token_acc": 0.8854694665701183, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.4921765247299327, + "grad_norm": 0.11848455667495728, + "learning_rate": 0.00016631012262599632, + "loss": 0.4026211202144623, + "memory(GiB)": 78.33, + "step": 2540, + "token_acc": 0.8802395209580839, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.4923702950152594, + "grad_norm": 0.10458105802536011, + "learning_rate": 0.0001662145572963058, + "loss": 0.37393152713775635, + "memory(GiB)": 78.33, + "step": 2541, + "token_acc": 0.8897102626590847, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.49256406530058616, + "grad_norm": 0.11251001805067062, + "learning_rate": 0.0001661189853068609, + "loss": 0.39233535528182983, + "memory(GiB)": 78.33, + "step": 2542, + "token_acc": 0.8865367607200936, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.4927578355859129, + "grad_norm": 0.11127537488937378, + "learning_rate": 0.00016602340669691563, + "loss": 0.38980281352996826, + "memory(GiB)": 78.33, + "step": 2543, + "token_acc": 0.8849459159053324, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.49295160587123965, + "grad_norm": 0.11086184531450272, + "learning_rate": 0.00016592782150572666, + "loss": 0.3862883448600769, + "memory(GiB)": 78.33, + "step": 2544, + "token_acc": 0.8859040848435406, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.4931453761565664, + "grad_norm": 0.10376356542110443, + "learning_rate": 0.00016583222977255337, + "loss": 0.36207854747772217, + "memory(GiB)": 78.33, + "step": 2545, + "token_acc": 0.8943563260789377, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.49333914644189314, + "grad_norm": 0.10361557453870773, + "learning_rate": 0.00016573663153665792, + "loss": 0.36922216415405273, + "memory(GiB)": 78.33, + "step": 2546, + "token_acc": 0.8913299533978594, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.4935329167272199, + "grad_norm": 0.09568388015031815, + "learning_rate": 0.000165641026837305, + "loss": 0.32921454310417175, + "memory(GiB)": 78.33, + "step": 2547, + "token_acc": 0.9026607486174076, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.49372668701254663, + "grad_norm": 0.1072535291314125, + "learning_rate": 0.00016554541571376212, + "loss": 0.34089717268943787, + "memory(GiB)": 78.33, + "step": 2548, + "token_acc": 0.899007279947055, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.49392045729787337, + "grad_norm": 0.11042487621307373, + "learning_rate": 0.00016544979820529924, + "loss": 0.381551057100296, + "memory(GiB)": 78.33, + "step": 2549, + "token_acc": 0.8886517557338797, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.4941142275832001, + "grad_norm": 0.10860154032707214, + "learning_rate": 0.0001653541743511891, + "loss": 0.3577283024787903, + "memory(GiB)": 78.33, + "step": 2550, + "token_acc": 0.8940223463687151, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.49430799786852686, + "grad_norm": 0.10476289689540863, + "learning_rate": 0.00016525854419070698, + "loss": 0.34515032172203064, + "memory(GiB)": 78.33, + "step": 2551, + "token_acc": 0.896808724928779, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.4945017681538536, + "grad_norm": 0.11404412239789963, + "learning_rate": 0.00016516290776313075, + "loss": 0.3582231402397156, + "memory(GiB)": 78.33, + "step": 2552, + "token_acc": 0.8928893430305944, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.49469553843918035, + "grad_norm": 0.11500284075737, + "learning_rate": 0.00016506726510774085, + "loss": 0.3722653388977051, + "memory(GiB)": 78.33, + "step": 2553, + "token_acc": 0.8909366240293843, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.4948893087245071, + "grad_norm": 0.10306849330663681, + "learning_rate": 0.00016497161626382028, + "loss": 0.361613392829895, + "memory(GiB)": 78.33, + "step": 2554, + "token_acc": 0.8950341710758377, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.49508307900983384, + "grad_norm": 0.10094691812992096, + "learning_rate": 0.0001648759612706546, + "loss": 0.364359587430954, + "memory(GiB)": 78.33, + "step": 2555, + "token_acc": 0.8946505999798327, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.4952768492951606, + "grad_norm": 0.11370456963777542, + "learning_rate": 0.00016478030016753195, + "loss": 0.3835192620754242, + "memory(GiB)": 78.33, + "step": 2556, + "token_acc": 0.88659125721692, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.4954706195804873, + "grad_norm": 0.09802607446908951, + "learning_rate": 0.00016468463299374283, + "loss": 0.3576071858406067, + "memory(GiB)": 78.33, + "step": 2557, + "token_acc": 0.8939546925566343, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.49566438986581407, + "grad_norm": 0.10389211028814316, + "learning_rate": 0.00016458895978858034, + "loss": 0.33778145909309387, + "memory(GiB)": 78.33, + "step": 2558, + "token_acc": 0.901243754650792, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.4958581601511408, + "grad_norm": 0.10826588422060013, + "learning_rate": 0.00016449328059134008, + "loss": 0.39494338631629944, + "memory(GiB)": 78.33, + "step": 2559, + "token_acc": 0.8841801579743431, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.49605193043646756, + "grad_norm": 0.1128348559141159, + "learning_rate": 0.00016439759544132, + "loss": 0.35508641600608826, + "memory(GiB)": 78.33, + "step": 2560, + "token_acc": 0.8955709517264593, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.4962457007217943, + "grad_norm": 0.10237224400043488, + "learning_rate": 0.00016430190437782057, + "loss": 0.3304956555366516, + "memory(GiB)": 78.33, + "step": 2561, + "token_acc": 0.9007643970512228, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.49643947100712105, + "grad_norm": 0.10696551948785782, + "learning_rate": 0.00016420620744014473, + "loss": 0.34542012214660645, + "memory(GiB)": 78.33, + "step": 2562, + "token_acc": 0.8984604105571847, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.4966332412924478, + "grad_norm": 0.10525023192167282, + "learning_rate": 0.00016411050466759775, + "loss": 0.37965142726898193, + "memory(GiB)": 78.33, + "step": 2563, + "token_acc": 0.8895881006864989, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.49682701157777454, + "grad_norm": 0.10651940107345581, + "learning_rate": 0.00016401479609948736, + "loss": 0.36009615659713745, + "memory(GiB)": 78.33, + "step": 2564, + "token_acc": 0.8930840596007872, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.4970207818631013, + "grad_norm": 0.10184381902217865, + "learning_rate": 0.00016391908177512362, + "loss": 0.3449605405330658, + "memory(GiB)": 78.33, + "step": 2565, + "token_acc": 0.8950802590120822, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.497214552148428, + "grad_norm": 0.10290346294641495, + "learning_rate": 0.00016382336173381899, + "loss": 0.3701861798763275, + "memory(GiB)": 78.33, + "step": 2566, + "token_acc": 0.8891216519527159, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.49740832243375477, + "grad_norm": 0.10077276080846786, + "learning_rate": 0.00016372763601488818, + "loss": 0.36543571949005127, + "memory(GiB)": 78.33, + "step": 2567, + "token_acc": 0.891035628960216, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.4976020927190815, + "grad_norm": 0.10975835472345352, + "learning_rate": 0.00016363190465764837, + "loss": 0.3935272693634033, + "memory(GiB)": 78.33, + "step": 2568, + "token_acc": 0.882976987106251, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.49779586300440826, + "grad_norm": 0.10267551988363266, + "learning_rate": 0.0001635361677014191, + "loss": 0.3506276607513428, + "memory(GiB)": 78.33, + "step": 2569, + "token_acc": 0.8953292213611682, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.497989633289735, + "grad_norm": 0.10422598570585251, + "learning_rate": 0.00016344042518552198, + "loss": 0.3643769323825836, + "memory(GiB)": 78.33, + "step": 2570, + "token_acc": 0.8916066426570628, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.49818340357506175, + "grad_norm": 0.110415019094944, + "learning_rate": 0.00016334467714928112, + "loss": 0.38099807500839233, + "memory(GiB)": 78.33, + "step": 2571, + "token_acc": 0.8894837676823347, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.4983771738603885, + "grad_norm": 0.0991300493478775, + "learning_rate": 0.00016324892363202273, + "loss": 0.33743083477020264, + "memory(GiB)": 78.33, + "step": 2572, + "token_acc": 0.8997278841710735, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.49857094414571523, + "grad_norm": 0.11681834608316422, + "learning_rate": 0.00016315316467307544, + "loss": 0.41430431604385376, + "memory(GiB)": 78.33, + "step": 2573, + "token_acc": 0.8792227548003788, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.498764714431042, + "grad_norm": 0.10748863965272903, + "learning_rate": 0.0001630574003117699, + "loss": 0.393510639667511, + "memory(GiB)": 78.33, + "step": 2574, + "token_acc": 0.8857627401373405, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.4989584847163687, + "grad_norm": 0.10948104411363602, + "learning_rate": 0.00016296163058743919, + "loss": 0.35196927189826965, + "memory(GiB)": 78.33, + "step": 2575, + "token_acc": 0.8924734374072535, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.49915225500169547, + "grad_norm": 0.1006598174571991, + "learning_rate": 0.00016286585553941857, + "loss": 0.32736673951148987, + "memory(GiB)": 78.33, + "step": 2576, + "token_acc": 0.9026124709933379, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.4993460252870222, + "grad_norm": 0.09987809509038925, + "learning_rate": 0.00016277007520704533, + "loss": 0.3588752746582031, + "memory(GiB)": 78.33, + "step": 2577, + "token_acc": 0.8945117224048975, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.49953979557234895, + "grad_norm": 0.10553018003702164, + "learning_rate": 0.00016267428962965906, + "loss": 0.3588329255580902, + "memory(GiB)": 78.33, + "step": 2578, + "token_acc": 0.8936772386426634, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.4997335658576757, + "grad_norm": 0.10368996113538742, + "learning_rate": 0.00016257849884660148, + "loss": 0.36948418617248535, + "memory(GiB)": 78.33, + "step": 2579, + "token_acc": 0.8906377490590371, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.49992733614300244, + "grad_norm": 0.1034737378358841, + "learning_rate": 0.00016248270289721646, + "loss": 0.3439171612262726, + "memory(GiB)": 78.33, + "step": 2580, + "token_acc": 0.8986508719973676, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.5001211064283292, + "grad_norm": 0.10310948640108109, + "learning_rate": 0.00016238690182084986, + "loss": 0.336532324552536, + "memory(GiB)": 78.33, + "step": 2581, + "token_acc": 0.9005147686293942, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.5003148767136559, + "grad_norm": 0.1009165570139885, + "learning_rate": 0.0001622910956568498, + "loss": 0.33198171854019165, + "memory(GiB)": 78.33, + "step": 2582, + "token_acc": 0.9027183088253284, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.5005086469989827, + "grad_norm": 0.11531993001699448, + "learning_rate": 0.00016219528444456658, + "loss": 0.38337087631225586, + "memory(GiB)": 78.33, + "step": 2583, + "token_acc": 0.8874007793332371, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.5007024172843094, + "grad_norm": 0.11452654004096985, + "learning_rate": 0.0001620994682233523, + "loss": 0.3833463191986084, + "memory(GiB)": 78.33, + "step": 2584, + "token_acc": 0.887452540227807, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.5008961875696362, + "grad_norm": 0.09829486906528473, + "learning_rate": 0.00016200364703256132, + "loss": 0.3433375358581543, + "memory(GiB)": 78.33, + "step": 2585, + "token_acc": 0.898637268412188, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.5010899578549629, + "grad_norm": 0.11422152817249298, + "learning_rate": 0.00016190782091154993, + "loss": 0.3638113737106323, + "memory(GiB)": 78.33, + "step": 2586, + "token_acc": 0.8920692223941321, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.5012837281402897, + "grad_norm": 0.1095975786447525, + "learning_rate": 0.00016181198989967648, + "loss": 0.375558078289032, + "memory(GiB)": 78.33, + "step": 2587, + "token_acc": 0.8875113896788801, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.5014774984256164, + "grad_norm": 0.12371022254228592, + "learning_rate": 0.0001617161540363014, + "loss": 0.3656739592552185, + "memory(GiB)": 78.33, + "step": 2588, + "token_acc": 0.890730205842746, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.5016712687109431, + "grad_norm": 0.1015966534614563, + "learning_rate": 0.00016162031336078707, + "loss": 0.35260072350502014, + "memory(GiB)": 78.33, + "step": 2589, + "token_acc": 0.8963511941792216, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.5018650389962699, + "grad_norm": 0.09963703155517578, + "learning_rate": 0.00016152446791249775, + "loss": 0.3328312933444977, + "memory(GiB)": 78.33, + "step": 2590, + "token_acc": 0.9016321007428681, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.5020588092815966, + "grad_norm": 0.11252304911613464, + "learning_rate": 0.00016142861773079983, + "loss": 0.3886357545852661, + "memory(GiB)": 78.33, + "step": 2591, + "token_acc": 0.8855965345045425, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.5022525795669234, + "grad_norm": 0.10422030091285706, + "learning_rate": 0.00016133276285506152, + "loss": 0.3719256520271301, + "memory(GiB)": 78.33, + "step": 2592, + "token_acc": 0.8892891253922772, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.5024463498522501, + "grad_norm": 0.11819025874137878, + "learning_rate": 0.00016123690332465294, + "loss": 0.3973425626754761, + "memory(GiB)": 78.33, + "step": 2593, + "token_acc": 0.8866967825205125, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.5026401201375769, + "grad_norm": 0.11547715216875076, + "learning_rate": 0.00016114103917894617, + "loss": 0.3757579028606415, + "memory(GiB)": 78.33, + "step": 2594, + "token_acc": 0.8914766343788905, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.5028338904229036, + "grad_norm": 0.1119546890258789, + "learning_rate": 0.0001610451704573153, + "loss": 0.365306556224823, + "memory(GiB)": 78.33, + "step": 2595, + "token_acc": 0.8929053225410841, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.5030276607082304, + "grad_norm": 0.10307694226503372, + "learning_rate": 0.00016094929719913612, + "loss": 0.34190264344215393, + "memory(GiB)": 78.33, + "step": 2596, + "token_acc": 0.8982385908726982, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.5032214309935571, + "grad_norm": 0.10450293868780136, + "learning_rate": 0.00016085341944378634, + "loss": 0.37576237320899963, + "memory(GiB)": 78.33, + "step": 2597, + "token_acc": 0.8903388098419935, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.5034152012788838, + "grad_norm": 0.11997717618942261, + "learning_rate": 0.00016075753723064558, + "loss": 0.3967727720737457, + "memory(GiB)": 78.33, + "step": 2598, + "token_acc": 0.8815336184366833, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.5036089715642106, + "grad_norm": 0.10347646474838257, + "learning_rate": 0.00016066165059909523, + "loss": 0.3663371801376343, + "memory(GiB)": 78.33, + "step": 2599, + "token_acc": 0.8920181267202313, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.5038027418495373, + "grad_norm": 0.10675547271966934, + "learning_rate": 0.00016056575958851843, + "loss": 0.3652355670928955, + "memory(GiB)": 78.33, + "step": 2600, + "token_acc": 0.890871055842873, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.5039965121348641, + "grad_norm": 0.11141776293516159, + "learning_rate": 0.0001604698642383003, + "loss": 0.37232065200805664, + "memory(GiB)": 78.33, + "step": 2601, + "token_acc": 0.8899046440325167, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.5041902824201908, + "grad_norm": 0.08926242589950562, + "learning_rate": 0.00016037396458782759, + "loss": 0.3233528435230255, + "memory(GiB)": 78.33, + "step": 2602, + "token_acc": 0.9036984098624263, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.5043840527055176, + "grad_norm": 0.09639449417591095, + "learning_rate": 0.00016027806067648884, + "loss": 0.31952396035194397, + "memory(GiB)": 78.33, + "step": 2603, + "token_acc": 0.9034606910634639, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.5045778229908443, + "grad_norm": 0.09628106653690338, + "learning_rate": 0.0001601821525436744, + "loss": 0.3303004503250122, + "memory(GiB)": 78.33, + "step": 2604, + "token_acc": 0.9007667907185853, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.504771593276171, + "grad_norm": 0.10307995975017548, + "learning_rate": 0.0001600862402287763, + "loss": 0.370724618434906, + "memory(GiB)": 78.33, + "step": 2605, + "token_acc": 0.8890643615105422, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.5049653635614978, + "grad_norm": 0.11356647312641144, + "learning_rate": 0.00015999032377118834, + "loss": 0.3665652275085449, + "memory(GiB)": 78.33, + "step": 2606, + "token_acc": 0.8920088410173396, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.5051591338468245, + "grad_norm": 0.11644966900348663, + "learning_rate": 0.0001598944032103059, + "loss": 0.38838085532188416, + "memory(GiB)": 78.33, + "step": 2607, + "token_acc": 0.8869162270049968, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.5053529041321513, + "grad_norm": 0.10509419441223145, + "learning_rate": 0.0001597984785855262, + "loss": 0.37085598707199097, + "memory(GiB)": 78.33, + "step": 2608, + "token_acc": 0.8929580496730619, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.505546674417478, + "grad_norm": 0.10611365735530853, + "learning_rate": 0.0001597025499362481, + "loss": 0.3378239870071411, + "memory(GiB)": 78.33, + "step": 2609, + "token_acc": 0.9013872771389387, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.5057404447028048, + "grad_norm": 0.11018887907266617, + "learning_rate": 0.000159606617301872, + "loss": 0.3307594954967499, + "memory(GiB)": 78.33, + "step": 2610, + "token_acc": 0.9017131424284106, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.5059342149881316, + "grad_norm": 0.1073288694024086, + "learning_rate": 0.00015951068072180002, + "loss": 0.38456863164901733, + "memory(GiB)": 78.33, + "step": 2611, + "token_acc": 0.8856738792250035, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.5061279852734584, + "grad_norm": 0.11159303784370422, + "learning_rate": 0.0001594147402354359, + "loss": 0.3925580680370331, + "memory(GiB)": 78.33, + "step": 2612, + "token_acc": 0.8856811904125457, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.5063217555587851, + "grad_norm": 0.09800157696008682, + "learning_rate": 0.00015931879588218503, + "loss": 0.3453458845615387, + "memory(GiB)": 78.33, + "step": 2613, + "token_acc": 0.8966416230180362, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.5065155258441119, + "grad_norm": 0.09460814297199249, + "learning_rate": 0.00015922284770145424, + "loss": 0.32773369550704956, + "memory(GiB)": 78.33, + "step": 2614, + "token_acc": 0.9022055463357145, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.5067092961294386, + "grad_norm": 0.10572027415037155, + "learning_rate": 0.00015912689573265208, + "loss": 0.37495627999305725, + "memory(GiB)": 78.33, + "step": 2615, + "token_acc": 0.8893242702918832, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.5069030664147653, + "grad_norm": 0.10272035747766495, + "learning_rate": 0.00015903094001518857, + "loss": 0.3710392713546753, + "memory(GiB)": 78.33, + "step": 2616, + "token_acc": 0.8908351579605881, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.5070968367000921, + "grad_norm": 0.11143457889556885, + "learning_rate": 0.0001589349805884754, + "loss": 0.38444697856903076, + "memory(GiB)": 78.33, + "step": 2617, + "token_acc": 0.8872893621420449, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.5072906069854188, + "grad_norm": 0.10727585107088089, + "learning_rate": 0.00015883901749192555, + "loss": 0.3834106922149658, + "memory(GiB)": 78.33, + "step": 2618, + "token_acc": 0.888004011444415, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.5074843772707456, + "grad_norm": 0.10124249756336212, + "learning_rate": 0.00015874305076495372, + "loss": 0.37049931287765503, + "memory(GiB)": 78.33, + "step": 2619, + "token_acc": 0.8904576168763391, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.5076781475560723, + "grad_norm": 0.1076463907957077, + "learning_rate": 0.00015864708044697597, + "loss": 0.35899150371551514, + "memory(GiB)": 78.33, + "step": 2620, + "token_acc": 0.8948525469168901, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.5078719178413991, + "grad_norm": 0.09248903393745422, + "learning_rate": 0.00015855110657740998, + "loss": 0.3522607386112213, + "memory(GiB)": 78.33, + "step": 2621, + "token_acc": 0.8958730306269461, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.5080656881267258, + "grad_norm": 0.10546861588954926, + "learning_rate": 0.00015845512919567467, + "loss": 0.3607703447341919, + "memory(GiB)": 78.33, + "step": 2622, + "token_acc": 0.8937468225724453, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.5082594584120526, + "grad_norm": 0.08962032198905945, + "learning_rate": 0.00015835914834119066, + "loss": 0.3159025311470032, + "memory(GiB)": 78.33, + "step": 2623, + "token_acc": 0.9060619623054353, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.5084532286973793, + "grad_norm": 0.10019563883543015, + "learning_rate": 0.0001582631640533798, + "loss": 0.34771329164505005, + "memory(GiB)": 78.33, + "step": 2624, + "token_acc": 0.8975572054242686, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.508646998982706, + "grad_norm": 0.11215566098690033, + "learning_rate": 0.00015816717637166545, + "loss": 0.38069719076156616, + "memory(GiB)": 78.33, + "step": 2625, + "token_acc": 0.8880416751549722, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.5088407692680328, + "grad_norm": 0.10857607424259186, + "learning_rate": 0.00015807118533547228, + "loss": 0.36432355642318726, + "memory(GiB)": 78.33, + "step": 2626, + "token_acc": 0.8919505825043285, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.5090345395533595, + "grad_norm": 0.10065297037363052, + "learning_rate": 0.00015797519098422638, + "loss": 0.3655664920806885, + "memory(GiB)": 78.33, + "step": 2627, + "token_acc": 0.8914216996258685, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.5092283098386863, + "grad_norm": 0.10752709954977036, + "learning_rate": 0.00015787919335735523, + "loss": 0.35718318819999695, + "memory(GiB)": 78.33, + "step": 2628, + "token_acc": 0.8937131050767414, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.509422080124013, + "grad_norm": 0.10011646896600723, + "learning_rate": 0.0001577831924942877, + "loss": 0.3524722158908844, + "memory(GiB)": 78.33, + "step": 2629, + "token_acc": 0.8952115870315963, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.5096158504093398, + "grad_norm": 0.09924966841936111, + "learning_rate": 0.00015768718843445386, + "loss": 0.35439997911453247, + "memory(GiB)": 78.33, + "step": 2630, + "token_acc": 0.8962517059855722, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.5098096206946665, + "grad_norm": 0.10966690629720688, + "learning_rate": 0.00015759118121728516, + "loss": 0.4074428677558899, + "memory(GiB)": 78.33, + "step": 2631, + "token_acc": 0.8822711142654365, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.5100033909799933, + "grad_norm": 0.11511880904436111, + "learning_rate": 0.00015749517088221434, + "loss": 0.4096870720386505, + "memory(GiB)": 78.33, + "step": 2632, + "token_acc": 0.8811336465830012, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.51019716126532, + "grad_norm": 0.10656805336475372, + "learning_rate": 0.00015739915746867546, + "loss": 0.3948022127151489, + "memory(GiB)": 78.33, + "step": 2633, + "token_acc": 0.8854893199651264, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.5103909315506467, + "grad_norm": 0.09918279200792313, + "learning_rate": 0.00015730314101610376, + "loss": 0.32525914907455444, + "memory(GiB)": 78.33, + "step": 2634, + "token_acc": 0.9005807402270954, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.5105847018359735, + "grad_norm": 0.09818840026855469, + "learning_rate": 0.00015720712156393579, + "loss": 0.33824896812438965, + "memory(GiB)": 78.33, + "step": 2635, + "token_acc": 0.9008060904612629, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.5107784721213002, + "grad_norm": 0.09556617587804794, + "learning_rate": 0.00015711109915160932, + "loss": 0.3609238564968109, + "memory(GiB)": 78.33, + "step": 2636, + "token_acc": 0.8941985496374093, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.510972242406627, + "grad_norm": 0.10253620892763138, + "learning_rate": 0.00015701507381856342, + "loss": 0.35230863094329834, + "memory(GiB)": 78.33, + "step": 2637, + "token_acc": 0.8980842250604073, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.5111660126919537, + "grad_norm": 0.10715737193822861, + "learning_rate": 0.00015691904560423818, + "loss": 0.37759995460510254, + "memory(GiB)": 78.33, + "step": 2638, + "token_acc": 0.8882215743440234, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.5113597829772805, + "grad_norm": 0.1301286518573761, + "learning_rate": 0.00015682301454807496, + "loss": 0.3717860281467438, + "memory(GiB)": 78.33, + "step": 2639, + "token_acc": 0.8927836337553795, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.5115535532626072, + "grad_norm": 0.09518956393003464, + "learning_rate": 0.00015672698068951632, + "loss": 0.3172317445278168, + "memory(GiB)": 78.33, + "step": 2640, + "token_acc": 0.9052939066263517, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.511747323547934, + "grad_norm": 0.09442702680826187, + "learning_rate": 0.00015663094406800592, + "loss": 0.3069186210632324, + "memory(GiB)": 78.33, + "step": 2641, + "token_acc": 0.9095811612439332, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.5119410938332607, + "grad_norm": 0.11501803249120712, + "learning_rate": 0.00015653490472298864, + "loss": 0.3687857389450073, + "memory(GiB)": 78.33, + "step": 2642, + "token_acc": 0.8907549189100189, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.5121348641185874, + "grad_norm": 0.0969555526971817, + "learning_rate": 0.00015643886269391043, + "loss": 0.3279804587364197, + "memory(GiB)": 78.33, + "step": 2643, + "token_acc": 0.9027313266443702, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.5123286344039142, + "grad_norm": 0.10562612861394882, + "learning_rate": 0.00015634281802021826, + "loss": 0.345773845911026, + "memory(GiB)": 78.33, + "step": 2644, + "token_acc": 0.8951558127530089, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.5125224046892409, + "grad_norm": 0.10318602621555328, + "learning_rate": 0.0001562467707413603, + "loss": 0.38389700651168823, + "memory(GiB)": 78.33, + "step": 2645, + "token_acc": 0.8879924999383219, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.5127161749745677, + "grad_norm": 0.0975809097290039, + "learning_rate": 0.00015615072089678574, + "loss": 0.3352702260017395, + "memory(GiB)": 78.33, + "step": 2646, + "token_acc": 0.8997019155590305, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.5129099452598944, + "grad_norm": 0.102164626121521, + "learning_rate": 0.00015605466852594481, + "loss": 0.36365604400634766, + "memory(GiB)": 78.33, + "step": 2647, + "token_acc": 0.8929846070017708, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.5131037155452212, + "grad_norm": 0.09489741176366806, + "learning_rate": 0.00015595861366828883, + "loss": 0.32492002844810486, + "memory(GiB)": 78.33, + "step": 2648, + "token_acc": 0.9037146465248159, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.5132974858305479, + "grad_norm": 0.0985584408044815, + "learning_rate": 0.00015586255636327012, + "loss": 0.3407394289970398, + "memory(GiB)": 78.33, + "step": 2649, + "token_acc": 0.9006668376506797, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.5134912561158747, + "grad_norm": 0.0985412672162056, + "learning_rate": 0.00015576649665034197, + "loss": 0.3170143961906433, + "memory(GiB)": 78.33, + "step": 2650, + "token_acc": 0.907279489904357, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.5136850264012014, + "grad_norm": 0.10823136568069458, + "learning_rate": 0.00015567043456895868, + "loss": 0.37427809834480286, + "memory(GiB)": 78.33, + "step": 2651, + "token_acc": 0.8882963136611381, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.5138787966865281, + "grad_norm": 0.09839842468500137, + "learning_rate": 0.0001555743701585756, + "loss": 0.3554091155529022, + "memory(GiB)": 78.33, + "step": 2652, + "token_acc": 0.8943349139006738, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.5140725669718549, + "grad_norm": 0.10772431641817093, + "learning_rate": 0.00015547830345864885, + "loss": 0.3907697796821594, + "memory(GiB)": 78.33, + "step": 2653, + "token_acc": 0.8854761904761905, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.5142663372571816, + "grad_norm": 0.10258005559444427, + "learning_rate": 0.00015538223450863565, + "loss": 0.3574678897857666, + "memory(GiB)": 78.33, + "step": 2654, + "token_acc": 0.893127167301461, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.5144601075425084, + "grad_norm": 0.10088451206684113, + "learning_rate": 0.0001552861633479941, + "loss": 0.37425774335861206, + "memory(GiB)": 78.33, + "step": 2655, + "token_acc": 0.8893967324057473, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.5146538778278351, + "grad_norm": 0.10441171377897263, + "learning_rate": 0.00015519009001618327, + "loss": 0.3345174193382263, + "memory(GiB)": 78.33, + "step": 2656, + "token_acc": 0.8989620545132345, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.5148476481131619, + "grad_norm": 0.10342314839363098, + "learning_rate": 0.000155094014552663, + "loss": 0.3678382635116577, + "memory(GiB)": 78.33, + "step": 2657, + "token_acc": 0.8913637207329038, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.5150414183984886, + "grad_norm": 0.10229925811290741, + "learning_rate": 0.00015499793699689406, + "loss": 0.3740781247615814, + "memory(GiB)": 78.33, + "step": 2658, + "token_acc": 0.8912131626660894, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.5152351886838153, + "grad_norm": 0.10311522334814072, + "learning_rate": 0.0001549018573883381, + "loss": 0.3588406443595886, + "memory(GiB)": 78.33, + "step": 2659, + "token_acc": 0.8927260150055872, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.5154289589691421, + "grad_norm": 0.1111614927649498, + "learning_rate": 0.00015480577576645758, + "loss": 0.3687216639518738, + "memory(GiB)": 78.33, + "step": 2660, + "token_acc": 0.8915793809579852, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.5156227292544688, + "grad_norm": 0.10557236522436142, + "learning_rate": 0.00015470969217071582, + "loss": 0.39404910802841187, + "memory(GiB)": 78.33, + "step": 2661, + "token_acc": 0.884908754848619, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.5158164995397956, + "grad_norm": 0.11680735647678375, + "learning_rate": 0.00015461360664057692, + "loss": 0.391072154045105, + "memory(GiB)": 78.33, + "step": 2662, + "token_acc": 0.8852517451358978, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.5160102698251223, + "grad_norm": 0.1172829270362854, + "learning_rate": 0.00015451751921550583, + "loss": 0.38999128341674805, + "memory(GiB)": 78.33, + "step": 2663, + "token_acc": 0.8861199122593224, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.5162040401104491, + "grad_norm": 0.09996732324361801, + "learning_rate": 0.0001544214299349682, + "loss": 0.3332677483558655, + "memory(GiB)": 78.33, + "step": 2664, + "token_acc": 0.900718860279702, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.5163978103957758, + "grad_norm": 0.11197924613952637, + "learning_rate": 0.00015432533883843048, + "loss": 0.35336822271347046, + "memory(GiB)": 78.33, + "step": 2665, + "token_acc": 0.8947896063077465, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.5165915806811026, + "grad_norm": 0.10348919034004211, + "learning_rate": 0.0001542292459653599, + "loss": 0.3489812910556793, + "memory(GiB)": 78.33, + "step": 2666, + "token_acc": 0.8955829903978052, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.5167853509664293, + "grad_norm": 0.10220210254192352, + "learning_rate": 0.00015413315135522432, + "loss": 0.3711531162261963, + "memory(GiB)": 78.33, + "step": 2667, + "token_acc": 0.8892863670783613, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.516979121251756, + "grad_norm": 0.1041957437992096, + "learning_rate": 0.00015403705504749238, + "loss": 0.3776172697544098, + "memory(GiB)": 78.33, + "step": 2668, + "token_acc": 0.8862548629964455, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.5171728915370828, + "grad_norm": 0.10291523486375809, + "learning_rate": 0.0001539409570816335, + "loss": 0.36820337176322937, + "memory(GiB)": 78.33, + "step": 2669, + "token_acc": 0.8891734392557556, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.5173666618224095, + "grad_norm": 0.10148876905441284, + "learning_rate": 0.00015384485749711768, + "loss": 0.34953948855400085, + "memory(GiB)": 78.33, + "step": 2670, + "token_acc": 0.8953482075165169, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.5175604321077363, + "grad_norm": 0.09922674298286438, + "learning_rate": 0.0001537487563334155, + "loss": 0.306792676448822, + "memory(GiB)": 78.33, + "step": 2671, + "token_acc": 0.9090061848682538, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.517754202393063, + "grad_norm": 0.10890819132328033, + "learning_rate": 0.00015365265362999846, + "loss": 0.3788120746612549, + "memory(GiB)": 78.33, + "step": 2672, + "token_acc": 0.8892348255357343, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.5179479726783898, + "grad_norm": 0.09601421654224396, + "learning_rate": 0.00015355654942633833, + "loss": 0.3305688500404358, + "memory(GiB)": 78.33, + "step": 2673, + "token_acc": 0.9015699037955296, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.5181417429637165, + "grad_norm": 0.09569491446018219, + "learning_rate": 0.00015346044376190782, + "loss": 0.31505027413368225, + "memory(GiB)": 78.33, + "step": 2674, + "token_acc": 0.9047790339157246, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.5183355132490433, + "grad_norm": 0.11235766857862473, + "learning_rate": 0.00015336433667618004, + "loss": 0.368362158536911, + "memory(GiB)": 78.33, + "step": 2675, + "token_acc": 0.8934327846364883, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.51852928353437, + "grad_norm": 0.09810005128383636, + "learning_rate": 0.00015326822820862883, + "loss": 0.3768079876899719, + "memory(GiB)": 78.33, + "step": 2676, + "token_acc": 0.8896894012167788, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.5187230538196967, + "grad_norm": 0.1080244854092598, + "learning_rate": 0.00015317211839872846, + "loss": 0.3397291898727417, + "memory(GiB)": 78.33, + "step": 2677, + "token_acc": 0.90013633265167, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.5189168241050235, + "grad_norm": 0.10235600918531418, + "learning_rate": 0.00015307600728595383, + "loss": 0.3567368686199188, + "memory(GiB)": 78.33, + "step": 2678, + "token_acc": 0.89522800645682, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.5191105943903502, + "grad_norm": 0.10997829586267471, + "learning_rate": 0.00015297989490978037, + "loss": 0.3763918876647949, + "memory(GiB)": 78.33, + "step": 2679, + "token_acc": 0.8877150980098758, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.519304364675677, + "grad_norm": 0.11425752192735672, + "learning_rate": 0.00015288378130968395, + "loss": 0.350836843252182, + "memory(GiB)": 78.33, + "step": 2680, + "token_acc": 0.8964464422016792, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.5194981349610037, + "grad_norm": 0.10013525933027267, + "learning_rate": 0.00015278766652514103, + "loss": 0.332501083612442, + "memory(GiB)": 78.33, + "step": 2681, + "token_acc": 0.8996237243171544, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.5196919052463305, + "grad_norm": 0.09917152673006058, + "learning_rate": 0.00015269155059562863, + "loss": 0.35868263244628906, + "memory(GiB)": 78.33, + "step": 2682, + "token_acc": 0.8944104296421408, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.5198856755316572, + "grad_norm": 0.10261884331703186, + "learning_rate": 0.00015259543356062406, + "loss": 0.3666459321975708, + "memory(GiB)": 78.33, + "step": 2683, + "token_acc": 0.8901406178581991, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.520079445816984, + "grad_norm": 0.11286422610282898, + "learning_rate": 0.00015249931545960517, + "loss": 0.38164597749710083, + "memory(GiB)": 78.33, + "step": 2684, + "token_acc": 0.8868490627692125, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.5202732161023107, + "grad_norm": 0.10428116470575333, + "learning_rate": 0.0001524031963320503, + "loss": 0.34657108783721924, + "memory(GiB)": 78.33, + "step": 2685, + "token_acc": 0.8960804399396162, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.5204669863876374, + "grad_norm": 0.10740246623754501, + "learning_rate": 0.00015230707621743809, + "loss": 0.3531382381916046, + "memory(GiB)": 78.33, + "step": 2686, + "token_acc": 0.8949211908931699, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.5206607566729642, + "grad_norm": 0.10802320390939713, + "learning_rate": 0.00015221095515524768, + "loss": 0.38364243507385254, + "memory(GiB)": 78.33, + "step": 2687, + "token_acc": 0.8869632473001232, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.5208545269582909, + "grad_norm": 0.10873539000749588, + "learning_rate": 0.00015211483318495854, + "loss": 0.37823835015296936, + "memory(GiB)": 78.33, + "step": 2688, + "token_acc": 0.8876921812052947, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.5210482972436177, + "grad_norm": 0.09489670395851135, + "learning_rate": 0.00015201871034605064, + "loss": 0.32838284969329834, + "memory(GiB)": 78.33, + "step": 2689, + "token_acc": 0.903539469079638, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.5212420675289444, + "grad_norm": 0.09505011886358261, + "learning_rate": 0.00015192258667800414, + "loss": 0.3134308457374573, + "memory(GiB)": 78.33, + "step": 2690, + "token_acc": 0.9066839378238342, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.5214358378142712, + "grad_norm": 0.10631779581308365, + "learning_rate": 0.00015182646222029964, + "loss": 0.39203402400016785, + "memory(GiB)": 78.33, + "step": 2691, + "token_acc": 0.88480466768138, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.5216296080995979, + "grad_norm": 0.11067686975002289, + "learning_rate": 0.00015173033701241804, + "loss": 0.38351666927337646, + "memory(GiB)": 78.33, + "step": 2692, + "token_acc": 0.8888694485755111, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.5218233783849247, + "grad_norm": 0.11114225536584854, + "learning_rate": 0.00015163421109384048, + "loss": 0.3793320655822754, + "memory(GiB)": 78.33, + "step": 2693, + "token_acc": 0.8876760563380282, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.5220171486702514, + "grad_norm": 0.10015545785427094, + "learning_rate": 0.0001515380845040485, + "loss": 0.31924664974212646, + "memory(GiB)": 78.33, + "step": 2694, + "token_acc": 0.9028615097223401, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.5222109189555781, + "grad_norm": 0.11381295323371887, + "learning_rate": 0.00015144195728252396, + "loss": 0.36840251088142395, + "memory(GiB)": 78.33, + "step": 2695, + "token_acc": 0.889958889452047, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.5224046892409049, + "grad_norm": 0.09790168702602386, + "learning_rate": 0.00015134582946874875, + "loss": 0.3553347587585449, + "memory(GiB)": 78.33, + "step": 2696, + "token_acc": 0.8931691146224018, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.5225984595262316, + "grad_norm": 0.10989172011613846, + "learning_rate": 0.00015124970110220526, + "loss": 0.37042251229286194, + "memory(GiB)": 78.33, + "step": 2697, + "token_acc": 0.8891240509008175, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.5227922298115584, + "grad_norm": 0.11018986999988556, + "learning_rate": 0.00015115357222237596, + "loss": 0.34254854917526245, + "memory(GiB)": 78.33, + "step": 2698, + "token_acc": 0.8972499929196522, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.5229860000968851, + "grad_norm": 0.10905101150274277, + "learning_rate": 0.00015105744286874354, + "loss": 0.37283697724342346, + "memory(GiB)": 78.33, + "step": 2699, + "token_acc": 0.8891810881122045, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.5231797703822119, + "grad_norm": 0.10779301077127457, + "learning_rate": 0.00015096131308079086, + "loss": 0.35086601972579956, + "memory(GiB)": 78.33, + "step": 2700, + "token_acc": 0.894660338178582, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.5233735406675386, + "grad_norm": 0.10483751446008682, + "learning_rate": 0.00015086518289800108, + "loss": 0.3449743688106537, + "memory(GiB)": 78.33, + "step": 2701, + "token_acc": 0.8989076300800705, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.5235673109528654, + "grad_norm": 0.11302126944065094, + "learning_rate": 0.00015076905235985748, + "loss": 0.4011099338531494, + "memory(GiB)": 78.33, + "step": 2702, + "token_acc": 0.8809768411165576, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.5237610812381921, + "grad_norm": 0.09251462668180466, + "learning_rate": 0.0001506729215058434, + "loss": 0.2971772253513336, + "memory(GiB)": 78.33, + "step": 2703, + "token_acc": 0.9121730788649904, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.5239548515235188, + "grad_norm": 0.1044599711894989, + "learning_rate": 0.0001505767903754424, + "loss": 0.3731986880302429, + "memory(GiB)": 78.33, + "step": 2704, + "token_acc": 0.8906576333350313, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.5241486218088456, + "grad_norm": 0.11340048164129257, + "learning_rate": 0.00015048065900813805, + "loss": 0.4006243944168091, + "memory(GiB)": 78.33, + "step": 2705, + "token_acc": 0.8819369331891173, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.5243423920941723, + "grad_norm": 0.1074516549706459, + "learning_rate": 0.00015038452744341416, + "loss": 0.3777906894683838, + "memory(GiB)": 78.33, + "step": 2706, + "token_acc": 0.8883070102521474, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.5245361623794991, + "grad_norm": 0.10201266407966614, + "learning_rate": 0.00015028839572075447, + "loss": 0.36712780594825745, + "memory(GiB)": 78.33, + "step": 2707, + "token_acc": 0.8929159371815307, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.5247299326648258, + "grad_norm": 0.1035100668668747, + "learning_rate": 0.0001501922638796429, + "loss": 0.36527976393699646, + "memory(GiB)": 78.33, + "step": 2708, + "token_acc": 0.8913647477217159, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.5249237029501526, + "grad_norm": 0.1076916977763176, + "learning_rate": 0.00015009613195956343, + "loss": 0.38062483072280884, + "memory(GiB)": 78.33, + "step": 2709, + "token_acc": 0.8895733585435028, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.5251174732354793, + "grad_norm": 0.09896742552518845, + "learning_rate": 0.00015, + "loss": 0.3540569841861725, + "memory(GiB)": 78.33, + "step": 2710, + "token_acc": 0.8951074201119336, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.525311243520806, + "grad_norm": 0.09472720324993134, + "learning_rate": 0.00014990386804043652, + "loss": 0.3247393071651459, + "memory(GiB)": 78.33, + "step": 2711, + "token_acc": 0.901641010008637, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.5255050138061328, + "grad_norm": 0.09038470685482025, + "learning_rate": 0.0001498077361203571, + "loss": 0.3078896105289459, + "memory(GiB)": 78.33, + "step": 2712, + "token_acc": 0.9055924034928663, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.5256987840914595, + "grad_norm": 0.10283378511667252, + "learning_rate": 0.00014971160427924553, + "loss": 0.3739337623119354, + "memory(GiB)": 78.33, + "step": 2713, + "token_acc": 0.8916592017517667, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.5258925543767863, + "grad_norm": 0.10884794592857361, + "learning_rate": 0.00014961547255658587, + "loss": 0.3748420178890228, + "memory(GiB)": 78.33, + "step": 2714, + "token_acc": 0.890541823727448, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.526086324662113, + "grad_norm": 0.10610167682170868, + "learning_rate": 0.00014951934099186195, + "loss": 0.39279741048812866, + "memory(GiB)": 78.33, + "step": 2715, + "token_acc": 0.8841656649546498, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.5262800949474398, + "grad_norm": 0.10813671350479126, + "learning_rate": 0.00014942320962455766, + "loss": 0.36780744791030884, + "memory(GiB)": 78.33, + "step": 2716, + "token_acc": 0.8913100322496296, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.5264738652327665, + "grad_norm": 0.103311687707901, + "learning_rate": 0.0001493270784941566, + "loss": 0.37279364466667175, + "memory(GiB)": 78.33, + "step": 2717, + "token_acc": 0.8909463356782836, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.5266676355180933, + "grad_norm": 0.10846543312072754, + "learning_rate": 0.00014923094764014247, + "loss": 0.36826059222221375, + "memory(GiB)": 78.33, + "step": 2718, + "token_acc": 0.8907717095394189, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.52686140580342, + "grad_norm": 0.09876382350921631, + "learning_rate": 0.0001491348171019989, + "loss": 0.358868807554245, + "memory(GiB)": 78.33, + "step": 2719, + "token_acc": 0.8912382783074005, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.5270551760887467, + "grad_norm": 0.09218183159828186, + "learning_rate": 0.00014903868691920911, + "loss": 0.31816765666007996, + "memory(GiB)": 78.33, + "step": 2720, + "token_acc": 0.9072353603603603, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.5272489463740735, + "grad_norm": 0.10111497342586517, + "learning_rate": 0.0001489425571312565, + "loss": 0.3605380952358246, + "memory(GiB)": 78.33, + "step": 2721, + "token_acc": 0.8904896257271978, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.5274427166594002, + "grad_norm": 0.10805071890354156, + "learning_rate": 0.00014884642777762404, + "loss": 0.35685086250305176, + "memory(GiB)": 78.33, + "step": 2722, + "token_acc": 0.8938120928038679, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.527636486944727, + "grad_norm": 0.10611846297979355, + "learning_rate": 0.00014875029889779476, + "loss": 0.366268128156662, + "memory(GiB)": 78.33, + "step": 2723, + "token_acc": 0.8927133114965365, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.5278302572300537, + "grad_norm": 0.10951778292655945, + "learning_rate": 0.00014865417053125122, + "loss": 0.35670673847198486, + "memory(GiB)": 78.33, + "step": 2724, + "token_acc": 0.8942021014849336, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.5280240275153805, + "grad_norm": 0.10806987434625626, + "learning_rate": 0.000148558042717476, + "loss": 0.34922337532043457, + "memory(GiB)": 78.33, + "step": 2725, + "token_acc": 0.8949587478767289, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.5282177978007072, + "grad_norm": 0.09839235991239548, + "learning_rate": 0.0001484619154959515, + "loss": 0.34449273347854614, + "memory(GiB)": 78.33, + "step": 2726, + "token_acc": 0.8982513498786348, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.528411568086034, + "grad_norm": 0.10562780499458313, + "learning_rate": 0.00014836578890615952, + "loss": 0.34781414270401, + "memory(GiB)": 78.33, + "step": 2727, + "token_acc": 0.8980476002925787, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.5286053383713607, + "grad_norm": 0.11507020145654678, + "learning_rate": 0.00014826966298758202, + "loss": 0.3725394010543823, + "memory(GiB)": 78.33, + "step": 2728, + "token_acc": 0.8897241588360109, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.5287991086566874, + "grad_norm": 0.1054215356707573, + "learning_rate": 0.00014817353777970036, + "loss": 0.3395271897315979, + "memory(GiB)": 78.33, + "step": 2729, + "token_acc": 0.8995790406558103, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.5289928789420142, + "grad_norm": 0.11207542568445206, + "learning_rate": 0.00014807741332199584, + "loss": 0.3861173093318939, + "memory(GiB)": 78.33, + "step": 2730, + "token_acc": 0.8875055334218681, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.5291866492273409, + "grad_norm": 0.11059516668319702, + "learning_rate": 0.00014798128965394936, + "loss": 0.3895573019981384, + "memory(GiB)": 78.33, + "step": 2731, + "token_acc": 0.8851554949115925, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.5293804195126677, + "grad_norm": 0.1085188090801239, + "learning_rate": 0.0001478851668150414, + "loss": 0.37232252955436707, + "memory(GiB)": 78.33, + "step": 2732, + "token_acc": 0.8892735392827885, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.5295741897979945, + "grad_norm": 0.09967105835676193, + "learning_rate": 0.00014778904484475235, + "loss": 0.3428942561149597, + "memory(GiB)": 78.33, + "step": 2733, + "token_acc": 0.8976353126642144, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.5297679600833213, + "grad_norm": 0.11610176414251328, + "learning_rate": 0.0001476929237825619, + "loss": 0.40362459421157837, + "memory(GiB)": 78.33, + "step": 2734, + "token_acc": 0.8841030058763807, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.529961730368648, + "grad_norm": 0.10144146531820297, + "learning_rate": 0.00014759680366794974, + "loss": 0.3451736271381378, + "memory(GiB)": 78.33, + "step": 2735, + "token_acc": 0.8984730993684548, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.5301555006539748, + "grad_norm": 0.097495436668396, + "learning_rate": 0.0001475006845403948, + "loss": 0.3148418366909027, + "memory(GiB)": 78.33, + "step": 2736, + "token_acc": 0.9057284734309409, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.5303492709393015, + "grad_norm": 0.10190415382385254, + "learning_rate": 0.00014740456643937591, + "loss": 0.35173338651657104, + "memory(GiB)": 78.33, + "step": 2737, + "token_acc": 0.895157419172667, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.5305430412246283, + "grad_norm": 0.118934765458107, + "learning_rate": 0.00014730844940437138, + "loss": 0.3918536901473999, + "memory(GiB)": 78.33, + "step": 2738, + "token_acc": 0.8861999064400436, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.530736811509955, + "grad_norm": 0.10452469438314438, + "learning_rate": 0.00014721233347485892, + "loss": 0.36255908012390137, + "memory(GiB)": 78.33, + "step": 2739, + "token_acc": 0.8927415372798595, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.5309305817952817, + "grad_norm": 0.10265407711267471, + "learning_rate": 0.00014711621869031608, + "loss": 0.3520941436290741, + "memory(GiB)": 78.33, + "step": 2740, + "token_acc": 0.8986634006070611, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.5311243520806085, + "grad_norm": 0.10262811183929443, + "learning_rate": 0.00014702010509021963, + "loss": 0.33090177178382874, + "memory(GiB)": 78.33, + "step": 2741, + "token_acc": 0.9019104647847163, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.5313181223659352, + "grad_norm": 0.10777027159929276, + "learning_rate": 0.0001469239927140462, + "loss": 0.3813689649105072, + "memory(GiB)": 78.33, + "step": 2742, + "token_acc": 0.8873789612676056, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.531511892651262, + "grad_norm": 0.10505778342485428, + "learning_rate": 0.00014682788160127154, + "loss": 0.37121495604515076, + "memory(GiB)": 78.33, + "step": 2743, + "token_acc": 0.8890379187973286, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.5317056629365887, + "grad_norm": 0.11017142981290817, + "learning_rate": 0.00014673177179137114, + "loss": 0.3495400547981262, + "memory(GiB)": 78.33, + "step": 2744, + "token_acc": 0.8959156433529097, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.5318994332219155, + "grad_norm": 0.11223854124546051, + "learning_rate": 0.00014663566332381994, + "loss": 0.3930763900279999, + "memory(GiB)": 78.33, + "step": 2745, + "token_acc": 0.8840929833038063, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.5320932035072422, + "grad_norm": 0.11058208346366882, + "learning_rate": 0.00014653955623809215, + "loss": 0.3463561534881592, + "memory(GiB)": 78.33, + "step": 2746, + "token_acc": 0.8972069472423368, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.532286973792569, + "grad_norm": 0.10111848264932632, + "learning_rate": 0.00014644345057366167, + "loss": 0.33341148495674133, + "memory(GiB)": 78.33, + "step": 2747, + "token_acc": 0.9006800088456435, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.5324807440778957, + "grad_norm": 0.10276893526315689, + "learning_rate": 0.00014634734637000154, + "loss": 0.35466858744621277, + "memory(GiB)": 78.33, + "step": 2748, + "token_acc": 0.8964071405257475, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.5326745143632224, + "grad_norm": 0.09825938940048218, + "learning_rate": 0.0001462512436665845, + "loss": 0.35709887742996216, + "memory(GiB)": 78.33, + "step": 2749, + "token_acc": 0.8944281524926686, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.5328682846485492, + "grad_norm": 0.1046825423836708, + "learning_rate": 0.00014615514250288232, + "loss": 0.3530852496623993, + "memory(GiB)": 78.33, + "step": 2750, + "token_acc": 0.8973124966203428, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.5330620549338759, + "grad_norm": 0.09290501475334167, + "learning_rate": 0.00014605904291836643, + "loss": 0.3255302309989929, + "memory(GiB)": 78.33, + "step": 2751, + "token_acc": 0.9017647611001596, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.5332558252192027, + "grad_norm": 0.09813160449266434, + "learning_rate": 0.0001459629449525076, + "loss": 0.35900381207466125, + "memory(GiB)": 78.33, + "step": 2752, + "token_acc": 0.8953540595838115, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.5334495955045294, + "grad_norm": 0.10881511121988297, + "learning_rate": 0.0001458668486447757, + "loss": 0.3696974217891693, + "memory(GiB)": 78.33, + "step": 2753, + "token_acc": 0.8900663072442213, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.5336433657898562, + "grad_norm": 0.09696649760007858, + "learning_rate": 0.00014577075403464013, + "loss": 0.3419326841831207, + "memory(GiB)": 78.33, + "step": 2754, + "token_acc": 0.8965491278704982, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.5338371360751829, + "grad_norm": 0.10864214599132538, + "learning_rate": 0.0001456746611615695, + "loss": 0.3853573799133301, + "memory(GiB)": 78.33, + "step": 2755, + "token_acc": 0.8867099400407064, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.5340309063605096, + "grad_norm": 0.105714350938797, + "learning_rate": 0.00014557857006503182, + "loss": 0.3608066737651825, + "memory(GiB)": 78.33, + "step": 2756, + "token_acc": 0.8907904278462654, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.5342246766458364, + "grad_norm": 0.10220605880022049, + "learning_rate": 0.00014548248078449417, + "loss": 0.35340073704719543, + "memory(GiB)": 78.33, + "step": 2757, + "token_acc": 0.8937097622685939, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.5344184469311631, + "grad_norm": 0.12155355513095856, + "learning_rate": 0.00014538639335942303, + "loss": 0.3872877061367035, + "memory(GiB)": 78.33, + "step": 2758, + "token_acc": 0.8883419182369433, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.5346122172164899, + "grad_norm": 0.11298363655805588, + "learning_rate": 0.0001452903078292842, + "loss": 0.36656370759010315, + "memory(GiB)": 78.33, + "step": 2759, + "token_acc": 0.8930003250623036, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.5348059875018166, + "grad_norm": 0.1053992435336113, + "learning_rate": 0.00014519422423354243, + "loss": 0.3747759163379669, + "memory(GiB)": 78.33, + "step": 2760, + "token_acc": 0.8891952102478419, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.5349997577871434, + "grad_norm": 0.09907464683055878, + "learning_rate": 0.00014509814261166193, + "loss": 0.33399906754493713, + "memory(GiB)": 78.33, + "step": 2761, + "token_acc": 0.8976413830072366, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.5351935280724701, + "grad_norm": 0.0989702120423317, + "learning_rate": 0.00014500206300310594, + "loss": 0.3469410538673401, + "memory(GiB)": 78.33, + "step": 2762, + "token_acc": 0.8955082378865913, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.5353872983577969, + "grad_norm": 0.09886979311704636, + "learning_rate": 0.00014490598544733695, + "loss": 0.3301083445549011, + "memory(GiB)": 78.33, + "step": 2763, + "token_acc": 0.903747359636999, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.5355810686431236, + "grad_norm": 0.11633176356554031, + "learning_rate": 0.00014480990998381674, + "loss": 0.36927592754364014, + "memory(GiB)": 78.33, + "step": 2764, + "token_acc": 0.8909205359551572, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.5357748389284503, + "grad_norm": 0.10796020925045013, + "learning_rate": 0.00014471383665200585, + "loss": 0.3573460876941681, + "memory(GiB)": 78.33, + "step": 2765, + "token_acc": 0.8945837063563116, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.5359686092137771, + "grad_norm": 0.10915590077638626, + "learning_rate": 0.00014461776549136435, + "loss": 0.3306170701980591, + "memory(GiB)": 78.33, + "step": 2766, + "token_acc": 0.9005574136008918, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.5361623794991038, + "grad_norm": 0.09537974745035172, + "learning_rate": 0.00014452169654135115, + "loss": 0.3284014165401459, + "memory(GiB)": 78.33, + "step": 2767, + "token_acc": 0.9025765702541481, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.5363561497844306, + "grad_norm": 0.09599710255861282, + "learning_rate": 0.00014442562984142446, + "loss": 0.34253832697868347, + "memory(GiB)": 78.33, + "step": 2768, + "token_acc": 0.8958434143036872, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.5365499200697573, + "grad_norm": 0.10483718663454056, + "learning_rate": 0.0001443295654310413, + "loss": 0.3458617329597473, + "memory(GiB)": 78.33, + "step": 2769, + "token_acc": 0.8977225020990764, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.5367436903550841, + "grad_norm": 0.10079243779182434, + "learning_rate": 0.000144233503349658, + "loss": 0.3476545810699463, + "memory(GiB)": 78.33, + "step": 2770, + "token_acc": 0.8959448042804844, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.5369374606404108, + "grad_norm": 0.10120805352926254, + "learning_rate": 0.00014413744363672988, + "loss": 0.35417628288269043, + "memory(GiB)": 78.33, + "step": 2771, + "token_acc": 0.8963836812952706, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.5371312309257376, + "grad_norm": 0.10923577845096588, + "learning_rate": 0.00014404138633171114, + "loss": 0.388058602809906, + "memory(GiB)": 78.33, + "step": 2772, + "token_acc": 0.8873768455115426, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.5373250012110643, + "grad_norm": 0.10782677680253983, + "learning_rate": 0.00014394533147405519, + "loss": 0.3875938057899475, + "memory(GiB)": 78.33, + "step": 2773, + "token_acc": 0.8839462734913853, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.537518771496391, + "grad_norm": 0.10002797842025757, + "learning_rate": 0.00014384927910321424, + "loss": 0.35020288825035095, + "memory(GiB)": 78.33, + "step": 2774, + "token_acc": 0.8941337890860073, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.5377125417817178, + "grad_norm": 0.09954270720481873, + "learning_rate": 0.0001437532292586397, + "loss": 0.3441551625728607, + "memory(GiB)": 78.33, + "step": 2775, + "token_acc": 0.9002784888463811, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.5379063120670445, + "grad_norm": 0.10037088394165039, + "learning_rate": 0.00014365718197978172, + "loss": 0.34260034561157227, + "memory(GiB)": 78.33, + "step": 2776, + "token_acc": 0.89792182924082, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.5381000823523713, + "grad_norm": 0.1107875406742096, + "learning_rate": 0.00014356113730608954, + "loss": 0.3863231837749481, + "memory(GiB)": 78.33, + "step": 2777, + "token_acc": 0.8852114077738917, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.538293852637698, + "grad_norm": 0.10534848272800446, + "learning_rate": 0.00014346509527701133, + "loss": 0.38511624932289124, + "memory(GiB)": 78.33, + "step": 2778, + "token_acc": 0.8850161787561467, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.5384876229230248, + "grad_norm": 0.10484083741903305, + "learning_rate": 0.00014336905593199405, + "loss": 0.3411652445793152, + "memory(GiB)": 78.33, + "step": 2779, + "token_acc": 0.8983436180578109, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.5386813932083515, + "grad_norm": 0.09589672088623047, + "learning_rate": 0.00014327301931048368, + "loss": 0.32667702436447144, + "memory(GiB)": 78.33, + "step": 2780, + "token_acc": 0.902627403397424, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.5388751634936783, + "grad_norm": 0.11629176884889603, + "learning_rate": 0.00014317698545192504, + "loss": 0.39405229687690735, + "memory(GiB)": 78.33, + "step": 2781, + "token_acc": 0.8830865833285327, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.539068933779005, + "grad_norm": 0.11103569716215134, + "learning_rate": 0.00014308095439576188, + "loss": 0.37723052501678467, + "memory(GiB)": 78.33, + "step": 2782, + "token_acc": 0.8878711653086893, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.5392627040643317, + "grad_norm": 0.10809724777936935, + "learning_rate": 0.00014298492618143658, + "loss": 0.3773004710674286, + "memory(GiB)": 78.33, + "step": 2783, + "token_acc": 0.8881794280195311, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.5394564743496585, + "grad_norm": 0.10347483307123184, + "learning_rate": 0.0001428889008483906, + "loss": 0.3722970187664032, + "memory(GiB)": 78.33, + "step": 2784, + "token_acc": 0.8895179639134346, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.5396502446349852, + "grad_norm": 0.10195406526327133, + "learning_rate": 0.00014279287843606422, + "loss": 0.3863418996334076, + "memory(GiB)": 78.33, + "step": 2785, + "token_acc": 0.8859166207897503, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.539844014920312, + "grad_norm": 0.10171383619308472, + "learning_rate": 0.00014269685898389624, + "loss": 0.3722241222858429, + "memory(GiB)": 78.33, + "step": 2786, + "token_acc": 0.889989379306938, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.5400377852056387, + "grad_norm": 0.10024231672286987, + "learning_rate": 0.00014260084253132457, + "loss": 0.3747716546058655, + "memory(GiB)": 78.33, + "step": 2787, + "token_acc": 0.8894973834732871, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.5402315554909655, + "grad_norm": 0.09642113000154495, + "learning_rate": 0.00014250482911778563, + "loss": 0.34401756525039673, + "memory(GiB)": 78.33, + "step": 2788, + "token_acc": 0.8965368793647757, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.5404253257762922, + "grad_norm": 0.09063483029603958, + "learning_rate": 0.00014240881878271487, + "loss": 0.31945639848709106, + "memory(GiB)": 78.33, + "step": 2789, + "token_acc": 0.9047063862187114, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.540619096061619, + "grad_norm": 0.1115802749991417, + "learning_rate": 0.00014231281156554615, + "loss": 0.37289944291114807, + "memory(GiB)": 78.33, + "step": 2790, + "token_acc": 0.8908885411232829, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.5408128663469457, + "grad_norm": 0.1019621267914772, + "learning_rate": 0.00014221680750571228, + "loss": 0.3510439693927765, + "memory(GiB)": 78.33, + "step": 2791, + "token_acc": 0.8948518823287389, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.5410066366322724, + "grad_norm": 0.09841447323560715, + "learning_rate": 0.00014212080664264477, + "loss": 0.3427790403366089, + "memory(GiB)": 78.33, + "step": 2792, + "token_acc": 0.8993469501181047, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.5412004069175992, + "grad_norm": 0.10309161990880966, + "learning_rate": 0.00014202480901577362, + "loss": 0.34060966968536377, + "memory(GiB)": 78.33, + "step": 2793, + "token_acc": 0.900046502721777, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.5413941772029259, + "grad_norm": 0.10474463552236557, + "learning_rate": 0.00014192881466452775, + "loss": 0.3322206437587738, + "memory(GiB)": 78.33, + "step": 2794, + "token_acc": 0.8990165235382479, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.5415879474882527, + "grad_norm": 0.11492858082056046, + "learning_rate": 0.00014183282362833455, + "loss": 0.37814396619796753, + "memory(GiB)": 78.33, + "step": 2795, + "token_acc": 0.8911760892116183, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.5417817177735794, + "grad_norm": 0.11693891882896423, + "learning_rate": 0.00014173683594662014, + "loss": 0.3709845244884491, + "memory(GiB)": 78.33, + "step": 2796, + "token_acc": 0.8902887474818534, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.5419754880589062, + "grad_norm": 0.1105792298913002, + "learning_rate": 0.00014164085165880932, + "loss": 0.3588966131210327, + "memory(GiB)": 78.33, + "step": 2797, + "token_acc": 0.8919163847269213, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.5421692583442329, + "grad_norm": 0.10854232311248779, + "learning_rate": 0.00014154487080432528, + "loss": 0.3848278820514679, + "memory(GiB)": 78.33, + "step": 2798, + "token_acc": 0.8873765907664213, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.5423630286295597, + "grad_norm": 0.09642491489648819, + "learning_rate": 0.00014144889342259002, + "loss": 0.3380579352378845, + "memory(GiB)": 78.33, + "step": 2799, + "token_acc": 0.8996538924558587, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.5425567989148864, + "grad_norm": 0.10592179000377655, + "learning_rate": 0.000141352919553024, + "loss": 0.37832123041152954, + "memory(GiB)": 78.33, + "step": 2800, + "token_acc": 0.8886095633018012, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.5427505692002131, + "grad_norm": 0.10104614496231079, + "learning_rate": 0.0001412569492350463, + "loss": 0.36123543977737427, + "memory(GiB)": 78.33, + "step": 2801, + "token_acc": 0.8938950807577793, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.5429443394855399, + "grad_norm": 0.10246588289737701, + "learning_rate": 0.00014116098250807445, + "loss": 0.35974156856536865, + "memory(GiB)": 78.33, + "step": 2802, + "token_acc": 0.893833531441162, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.5431381097708666, + "grad_norm": 0.10312814265489578, + "learning_rate": 0.00014106501941152459, + "loss": 0.357723206281662, + "memory(GiB)": 78.33, + "step": 2803, + "token_acc": 0.8940801971556195, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.5433318800561934, + "grad_norm": 0.10023822635412216, + "learning_rate": 0.0001409690599848114, + "loss": 0.33914467692375183, + "memory(GiB)": 78.33, + "step": 2804, + "token_acc": 0.8970815359216062, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.5435256503415201, + "grad_norm": 0.10362284630537033, + "learning_rate": 0.0001408731042673479, + "loss": 0.3780994415283203, + "memory(GiB)": 78.33, + "step": 2805, + "token_acc": 0.8886399282993502, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.5437194206268469, + "grad_norm": 0.10058917850255966, + "learning_rate": 0.00014077715229854576, + "loss": 0.34822142124176025, + "memory(GiB)": 78.33, + "step": 2806, + "token_acc": 0.8962744785862123, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.5439131909121736, + "grad_norm": 0.10602036118507385, + "learning_rate": 0.00014068120411781497, + "loss": 0.3826134502887726, + "memory(GiB)": 78.33, + "step": 2807, + "token_acc": 0.8868417471764866, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.5441069611975003, + "grad_norm": 0.09917270392179489, + "learning_rate": 0.0001405852597645641, + "loss": 0.35776597261428833, + "memory(GiB)": 78.33, + "step": 2808, + "token_acc": 0.8961601781909743, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.5443007314828271, + "grad_norm": 0.20605452358722687, + "learning_rate": 0.00014048931927819995, + "loss": 0.3961770832538605, + "memory(GiB)": 78.33, + "step": 2809, + "token_acc": 0.8819182185180362, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.5444945017681538, + "grad_norm": 0.11399701237678528, + "learning_rate": 0.00014039338269812796, + "loss": 0.39141345024108887, + "memory(GiB)": 78.33, + "step": 2810, + "token_acc": 0.8836162513606542, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.5446882720534806, + "grad_norm": 0.10171796381473541, + "learning_rate": 0.0001402974500637519, + "loss": 0.3601566255092621, + "memory(GiB)": 78.33, + "step": 2811, + "token_acc": 0.8961741354505248, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.5448820423388073, + "grad_norm": 0.09617502242326736, + "learning_rate": 0.00014020152141447375, + "loss": 0.34712108969688416, + "memory(GiB)": 78.33, + "step": 2812, + "token_acc": 0.8963927855711423, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.5450758126241341, + "grad_norm": 0.10345807671546936, + "learning_rate": 0.00014010559678969407, + "loss": 0.36251404881477356, + "memory(GiB)": 78.33, + "step": 2813, + "token_acc": 0.8929795640683466, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.5452695829094608, + "grad_norm": 0.09582247585058212, + "learning_rate": 0.00014000967622881166, + "loss": 0.3179134130477905, + "memory(GiB)": 78.33, + "step": 2814, + "token_acc": 0.9051344993351744, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.5454633531947876, + "grad_norm": 0.125186488032341, + "learning_rate": 0.0001399137597712237, + "loss": 0.36605405807495117, + "memory(GiB)": 78.33, + "step": 2815, + "token_acc": 0.8923891831614162, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.5456571234801143, + "grad_norm": 0.10813166946172714, + "learning_rate": 0.00013981784745632558, + "loss": 0.36997631192207336, + "memory(GiB)": 78.33, + "step": 2816, + "token_acc": 0.8912589239073655, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.545850893765441, + "grad_norm": 0.10874950140714645, + "learning_rate": 0.00013972193932351113, + "loss": 0.36081942915916443, + "memory(GiB)": 78.33, + "step": 2817, + "token_acc": 0.89447387926538, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.5460446640507678, + "grad_norm": 0.09806004166603088, + "learning_rate": 0.00013962603541217244, + "loss": 0.34919509291648865, + "memory(GiB)": 78.33, + "step": 2818, + "token_acc": 0.8967576382560315, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.5462384343360945, + "grad_norm": 0.1022680252790451, + "learning_rate": 0.0001395301357616997, + "loss": 0.3764371871948242, + "memory(GiB)": 78.33, + "step": 2819, + "token_acc": 0.8876494023904382, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.5464322046214213, + "grad_norm": 0.10649038106203079, + "learning_rate": 0.00013943424041148154, + "loss": 0.38199445605278015, + "memory(GiB)": 78.33, + "step": 2820, + "token_acc": 0.8861415110903026, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.546625974906748, + "grad_norm": 0.10255075246095657, + "learning_rate": 0.00013933834940090475, + "loss": 0.36467188596725464, + "memory(GiB)": 78.33, + "step": 2821, + "token_acc": 0.8910845821691643, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.5468197451920748, + "grad_norm": 0.10376548022031784, + "learning_rate": 0.00013924246276935442, + "loss": 0.32367387413978577, + "memory(GiB)": 78.33, + "step": 2822, + "token_acc": 0.9025662838508436, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.5470135154774015, + "grad_norm": 0.10719739645719528, + "learning_rate": 0.00013914658055621363, + "loss": 0.3771483302116394, + "memory(GiB)": 78.33, + "step": 2823, + "token_acc": 0.8899454274847032, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.5472072857627283, + "grad_norm": 0.11074833571910858, + "learning_rate": 0.00013905070280086386, + "loss": 0.36725014448165894, + "memory(GiB)": 78.33, + "step": 2824, + "token_acc": 0.8935861944063543, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.547401056048055, + "grad_norm": 0.10274602472782135, + "learning_rate": 0.0001389548295426847, + "loss": 0.3463117480278015, + "memory(GiB)": 78.33, + "step": 2825, + "token_acc": 0.8964165043036476, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.5475948263333817, + "grad_norm": 0.10460248589515686, + "learning_rate": 0.0001388589608210538, + "loss": 0.35207241773605347, + "memory(GiB)": 78.33, + "step": 2826, + "token_acc": 0.8935823860501049, + "train_speed(iter/s)": 0.032483 + }, + { + "epoch": 0.5477885966187085, + "grad_norm": 0.11349425464868546, + "learning_rate": 0.0001387630966753471, + "loss": 0.377665638923645, + "memory(GiB)": 78.33, + "step": 2827, + "token_acc": 0.8892741984547059, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.5479823669040352, + "grad_norm": 0.11117535084486008, + "learning_rate": 0.0001386672371449385, + "loss": 0.38577699661254883, + "memory(GiB)": 78.33, + "step": 2828, + "token_acc": 0.887010551652719, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.548176137189362, + "grad_norm": 0.09501176327466965, + "learning_rate": 0.0001385713822692001, + "loss": 0.3158635199069977, + "memory(GiB)": 78.33, + "step": 2829, + "token_acc": 0.9051193667815651, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.5483699074746887, + "grad_norm": 0.0978478193283081, + "learning_rate": 0.00013847553208750222, + "loss": 0.33727461099624634, + "memory(GiB)": 78.33, + "step": 2830, + "token_acc": 0.9000374953130859, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.5485636777600155, + "grad_norm": 0.11235343664884567, + "learning_rate": 0.0001383796866392129, + "loss": 0.3979712128639221, + "memory(GiB)": 78.33, + "step": 2831, + "token_acc": 0.8826383993927235, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.5487574480453422, + "grad_norm": 0.11587338894605637, + "learning_rate": 0.0001382838459636986, + "loss": 0.4109783470630646, + "memory(GiB)": 78.33, + "step": 2832, + "token_acc": 0.8784734879286655, + "train_speed(iter/s)": 0.032489 + }, + { + "epoch": 0.548951218330669, + "grad_norm": 0.09932563453912735, + "learning_rate": 0.0001381880101003235, + "loss": 0.3608021140098572, + "memory(GiB)": 78.33, + "step": 2833, + "token_acc": 0.892873446614253, + "train_speed(iter/s)": 0.03249 + }, + { + "epoch": 0.5491449886159957, + "grad_norm": 0.1027938574552536, + "learning_rate": 0.00013809217908845008, + "loss": 0.3469353914260864, + "memory(GiB)": 78.33, + "step": 2834, + "token_acc": 0.8969847914789889, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.5493387589013224, + "grad_norm": 0.1079033836722374, + "learning_rate": 0.00013799635296743868, + "loss": 0.3682827055454254, + "memory(GiB)": 78.33, + "step": 2835, + "token_acc": 0.8886550135749367, + "train_speed(iter/s)": 0.032492 + }, + { + "epoch": 0.5495325291866492, + "grad_norm": 0.10848791897296906, + "learning_rate": 0.00013790053177664766, + "loss": 0.3711949288845062, + "memory(GiB)": 78.33, + "step": 2836, + "token_acc": 0.8927511279981002, + "train_speed(iter/s)": 0.032492 + }, + { + "epoch": 0.5497262994719759, + "grad_norm": 0.10043694078922272, + "learning_rate": 0.00013780471555543343, + "loss": 0.3552500009536743, + "memory(GiB)": 78.33, + "step": 2837, + "token_acc": 0.8943481138318994, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.5499200697573027, + "grad_norm": 0.10964163392782211, + "learning_rate": 0.00013770890434315012, + "loss": 0.3622366786003113, + "memory(GiB)": 78.33, + "step": 2838, + "token_acc": 0.8923598464684952, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.5501138400426294, + "grad_norm": 0.1045740619301796, + "learning_rate": 0.00013761309817915014, + "loss": 0.3492569923400879, + "memory(GiB)": 78.33, + "step": 2839, + "token_acc": 0.8954375260019415, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.5503076103279562, + "grad_norm": 0.11920612305402756, + "learning_rate": 0.00013751729710278354, + "loss": 0.32998332381248474, + "memory(GiB)": 78.33, + "step": 2840, + "token_acc": 0.9013660163219938, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.5505013806132829, + "grad_norm": 0.09879645705223083, + "learning_rate": 0.00013742150115339852, + "loss": 0.31799447536468506, + "memory(GiB)": 78.33, + "step": 2841, + "token_acc": 0.9049382374886009, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.5506951508986097, + "grad_norm": 0.10987204313278198, + "learning_rate": 0.0001373257103703409, + "loss": 0.3859724998474121, + "memory(GiB)": 78.33, + "step": 2842, + "token_acc": 0.885055264529223, + "train_speed(iter/s)": 0.032498 + }, + { + "epoch": 0.5508889211839364, + "grad_norm": 0.1112445592880249, + "learning_rate": 0.00013722992479295461, + "loss": 0.3733959496021271, + "memory(GiB)": 78.33, + "step": 2843, + "token_acc": 0.8924345643599857, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.5510826914692631, + "grad_norm": 0.10867438465356827, + "learning_rate": 0.00013713414446058143, + "loss": 0.3374147415161133, + "memory(GiB)": 78.33, + "step": 2844, + "token_acc": 0.8994350282485876, + "train_speed(iter/s)": 0.0325 + }, + { + "epoch": 0.5512764617545899, + "grad_norm": 0.10243318974971771, + "learning_rate": 0.00013703836941256073, + "loss": 0.36659640073776245, + "memory(GiB)": 78.33, + "step": 2845, + "token_acc": 0.8927052150622341, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.5514702320399166, + "grad_norm": 0.10131695866584778, + "learning_rate": 0.00013694259968823007, + "loss": 0.3265146017074585, + "memory(GiB)": 78.33, + "step": 2846, + "token_acc": 0.9022528879828551, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.5516640023252434, + "grad_norm": 0.1028154119849205, + "learning_rate": 0.00013684683532692456, + "loss": 0.34832775592803955, + "memory(GiB)": 78.33, + "step": 2847, + "token_acc": 0.8964531768308241, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.5518577726105701, + "grad_norm": 0.10362169146537781, + "learning_rate": 0.00013675107636797727, + "loss": 0.35685330629348755, + "memory(GiB)": 78.33, + "step": 2848, + "token_acc": 0.8953478775207021, + "train_speed(iter/s)": 0.032503 + }, + { + "epoch": 0.5520515428958969, + "grad_norm": 0.10931023210287094, + "learning_rate": 0.00013665532285071885, + "loss": 0.3599531650543213, + "memory(GiB)": 78.33, + "step": 2849, + "token_acc": 0.894084200338737, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.5522453131812236, + "grad_norm": 0.10313741117715836, + "learning_rate": 0.00013655957481447796, + "loss": 0.3336341977119446, + "memory(GiB)": 78.33, + "step": 2850, + "token_acc": 0.8995257854179016, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.5524390834665504, + "grad_norm": 0.10181530565023422, + "learning_rate": 0.00013646383229858088, + "loss": 0.3731972575187683, + "memory(GiB)": 78.33, + "step": 2851, + "token_acc": 0.8911425098754178, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.5526328537518771, + "grad_norm": 0.09568587690591812, + "learning_rate": 0.00013636809534235155, + "loss": 0.3068424463272095, + "memory(GiB)": 78.33, + "step": 2852, + "token_acc": 0.9089695605517314, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.5528266240372038, + "grad_norm": 0.10480746626853943, + "learning_rate": 0.00013627236398511183, + "loss": 0.3546777069568634, + "memory(GiB)": 78.33, + "step": 2853, + "token_acc": 0.8944366727644464, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.5530203943225307, + "grad_norm": 0.1180429756641388, + "learning_rate": 0.00013617663826618102, + "loss": 0.40899646282196045, + "memory(GiB)": 78.33, + "step": 2854, + "token_acc": 0.8810652619256658, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.5532141646078574, + "grad_norm": 0.09670621156692505, + "learning_rate": 0.0001360809182248764, + "loss": 0.33177006244659424, + "memory(GiB)": 78.33, + "step": 2855, + "token_acc": 0.9025397480900268, + "train_speed(iter/s)": 0.03251 + }, + { + "epoch": 0.5534079348931842, + "grad_norm": 0.10695379227399826, + "learning_rate": 0.00013598520390051264, + "loss": 0.34610581398010254, + "memory(GiB)": 78.33, + "step": 2856, + "token_acc": 0.898416321871879, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.5536017051785109, + "grad_norm": 0.10109010338783264, + "learning_rate": 0.00013588949533240222, + "loss": 0.3390766978263855, + "memory(GiB)": 78.33, + "step": 2857, + "token_acc": 0.89864415955117, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.5537954754638377, + "grad_norm": 0.101463183760643, + "learning_rate": 0.00013579379255985528, + "loss": 0.3392132818698883, + "memory(GiB)": 78.33, + "step": 2858, + "token_acc": 0.898758677643294, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.5539892457491644, + "grad_norm": 0.09950132668018341, + "learning_rate": 0.00013569809562217943, + "loss": 0.32112976908683777, + "memory(GiB)": 78.33, + "step": 2859, + "token_acc": 0.902511359616902, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.5541830160344912, + "grad_norm": 0.10303032398223877, + "learning_rate": 0.00013560240455868003, + "loss": 0.3673378825187683, + "memory(GiB)": 78.33, + "step": 2860, + "token_acc": 0.8928259417236241, + "train_speed(iter/s)": 0.032514 + }, + { + "epoch": 0.5543767863198179, + "grad_norm": 0.11683917790651321, + "learning_rate": 0.00013550671940865992, + "loss": 0.3810235261917114, + "memory(GiB)": 78.33, + "step": 2861, + "token_acc": 0.8879626045791527, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.5545705566051446, + "grad_norm": 0.11016968637704849, + "learning_rate": 0.0001354110402114196, + "loss": 0.3756512999534607, + "memory(GiB)": 78.33, + "step": 2862, + "token_acc": 0.8894170776635569, + "train_speed(iter/s)": 0.032516 + }, + { + "epoch": 0.5547643268904714, + "grad_norm": 0.09833884984254837, + "learning_rate": 0.00013531536700625715, + "loss": 0.3510299026966095, + "memory(GiB)": 78.33, + "step": 2863, + "token_acc": 0.8975372743295384, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.5549580971757981, + "grad_norm": 0.09793855249881744, + "learning_rate": 0.00013521969983246803, + "loss": 0.36319395899772644, + "memory(GiB)": 78.33, + "step": 2864, + "token_acc": 0.892717529189427, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.5551518674611249, + "grad_norm": 0.11692684143781662, + "learning_rate": 0.0001351240387293454, + "loss": 0.407484233379364, + "memory(GiB)": 78.33, + "step": 2865, + "token_acc": 0.8815019139833371, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.5553456377464516, + "grad_norm": 0.10228422284126282, + "learning_rate": 0.0001350283837361797, + "loss": 0.340445339679718, + "memory(GiB)": 78.33, + "step": 2866, + "token_acc": 0.8981877995519129, + "train_speed(iter/s)": 0.032519 + }, + { + "epoch": 0.5555394080317784, + "grad_norm": 0.09636260569095612, + "learning_rate": 0.00013493273489225915, + "loss": 0.32771408557891846, + "memory(GiB)": 78.33, + "step": 2867, + "token_acc": 0.9005267118133935, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.5557331783171051, + "grad_norm": 0.10914786905050278, + "learning_rate": 0.00013483709223686922, + "loss": 0.35096076130867004, + "memory(GiB)": 78.33, + "step": 2868, + "token_acc": 0.8962214601291438, + "train_speed(iter/s)": 0.032521 + }, + { + "epoch": 0.5559269486024319, + "grad_norm": 0.10578683018684387, + "learning_rate": 0.00013474145580929297, + "loss": 0.36036306619644165, + "memory(GiB)": 78.33, + "step": 2869, + "token_acc": 0.8910474829632886, + "train_speed(iter/s)": 0.032522 + }, + { + "epoch": 0.5561207188877586, + "grad_norm": 0.10106171667575836, + "learning_rate": 0.00013464582564881087, + "loss": 0.34388551115989685, + "memory(GiB)": 78.33, + "step": 2870, + "token_acc": 0.8960813087667808, + "train_speed(iter/s)": 0.032523 + }, + { + "epoch": 0.5563144891730853, + "grad_norm": 0.11406324058771133, + "learning_rate": 0.00013455020179470073, + "loss": 0.3458818197250366, + "memory(GiB)": 78.33, + "step": 2871, + "token_acc": 0.8983081299595748, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.5565082594584121, + "grad_norm": 0.09773491322994232, + "learning_rate": 0.00013445458428623788, + "loss": 0.33812999725341797, + "memory(GiB)": 78.33, + "step": 2872, + "token_acc": 0.899073884568154, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.5567020297437388, + "grad_norm": 0.10276000201702118, + "learning_rate": 0.000134358973162695, + "loss": 0.3428000807762146, + "memory(GiB)": 78.33, + "step": 2873, + "token_acc": 0.8970639124763755, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.5568958000290656, + "grad_norm": 0.10354767739772797, + "learning_rate": 0.00013426336846334208, + "loss": 0.359587162733078, + "memory(GiB)": 78.33, + "step": 2874, + "token_acc": 0.8922890471140736, + "train_speed(iter/s)": 0.032526 + }, + { + "epoch": 0.5570895703143923, + "grad_norm": 0.12495241314172745, + "learning_rate": 0.0001341677702274466, + "loss": 0.3915575444698334, + "memory(GiB)": 78.33, + "step": 2875, + "token_acc": 0.8831632312720966, + "train_speed(iter/s)": 0.032527 + }, + { + "epoch": 0.5572833405997191, + "grad_norm": 0.09519120305776596, + "learning_rate": 0.00013407217849427332, + "loss": 0.3396569788455963, + "memory(GiB)": 78.33, + "step": 2876, + "token_acc": 0.8975312241823395, + "train_speed(iter/s)": 0.032528 + }, + { + "epoch": 0.5574771108850458, + "grad_norm": 0.10415952652692795, + "learning_rate": 0.0001339765933030844, + "loss": 0.36318159103393555, + "memory(GiB)": 78.33, + "step": 2877, + "token_acc": 0.8913055970578094, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.5576708811703726, + "grad_norm": 0.09890494495630264, + "learning_rate": 0.00013388101469313907, + "loss": 0.34472253918647766, + "memory(GiB)": 78.33, + "step": 2878, + "token_acc": 0.8975010936132983, + "train_speed(iter/s)": 0.03253 + }, + { + "epoch": 0.5578646514556993, + "grad_norm": 0.10054649412631989, + "learning_rate": 0.0001337854427036942, + "loss": 0.3560715615749359, + "memory(GiB)": 78.33, + "step": 2879, + "token_acc": 0.8977050131798505, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.558058421741026, + "grad_norm": 0.10295595228672028, + "learning_rate": 0.00013368987737400368, + "loss": 0.3403087556362152, + "memory(GiB)": 78.33, + "step": 2880, + "token_acc": 0.8993411629905471, + "train_speed(iter/s)": 0.032532 + }, + { + "epoch": 0.5582521920263528, + "grad_norm": 0.11277662217617035, + "learning_rate": 0.00013359431874331886, + "loss": 0.38854244351387024, + "memory(GiB)": 78.33, + "step": 2881, + "token_acc": 0.8866336122301415, + "train_speed(iter/s)": 0.032533 + }, + { + "epoch": 0.5584459623116795, + "grad_norm": 0.09544237703084946, + "learning_rate": 0.0001334987668508881, + "loss": 0.35088953375816345, + "memory(GiB)": 78.33, + "step": 2882, + "token_acc": 0.8954308027790456, + "train_speed(iter/s)": 0.032533 + }, + { + "epoch": 0.5586397325970063, + "grad_norm": 0.09790968149900436, + "learning_rate": 0.0001334032217359572, + "loss": 0.3613382875919342, + "memory(GiB)": 78.33, + "step": 2883, + "token_acc": 0.8920747907163267, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.558833502882333, + "grad_norm": 0.10290265083312988, + "learning_rate": 0.00013330768343776918, + "loss": 0.35918739438056946, + "memory(GiB)": 78.33, + "step": 2884, + "token_acc": 0.8945180422371265, + "train_speed(iter/s)": 0.032535 + }, + { + "epoch": 0.5590272731676598, + "grad_norm": 0.11126357316970825, + "learning_rate": 0.00013321215199556404, + "loss": 0.34764158725738525, + "memory(GiB)": 78.33, + "step": 2885, + "token_acc": 0.8981644381890688, + "train_speed(iter/s)": 0.032536 + }, + { + "epoch": 0.5592210434529865, + "grad_norm": 0.10342656821012497, + "learning_rate": 0.0001331166274485792, + "loss": 0.3714340329170227, + "memory(GiB)": 78.33, + "step": 2886, + "token_acc": 0.8889918774544479, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.5594148137383133, + "grad_norm": 0.09674359858036041, + "learning_rate": 0.00013302110983604912, + "loss": 0.3426961302757263, + "memory(GiB)": 78.33, + "step": 2887, + "token_acc": 0.8992525579932754, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.55960858402364, + "grad_norm": 0.1152225211262703, + "learning_rate": 0.00013292559919720554, + "loss": 0.3851836621761322, + "memory(GiB)": 78.33, + "step": 2888, + "token_acc": 0.8864522728725784, + "train_speed(iter/s)": 0.032539 + }, + { + "epoch": 0.5598023543089667, + "grad_norm": 0.2307002991437912, + "learning_rate": 0.00013283009557127712, + "loss": 0.3549976050853729, + "memory(GiB)": 78.33, + "step": 2889, + "token_acc": 0.8931888544891641, + "train_speed(iter/s)": 0.03254 + }, + { + "epoch": 0.5599961245942935, + "grad_norm": 0.10933719575405121, + "learning_rate": 0.0001327345989974898, + "loss": 0.40044257044792175, + "memory(GiB)": 78.33, + "step": 2890, + "token_acc": 0.8821779976652399, + "train_speed(iter/s)": 0.03254 + }, + { + "epoch": 0.5601898948796202, + "grad_norm": 0.08904779702425003, + "learning_rate": 0.00013263910951506668, + "loss": 0.31354546546936035, + "memory(GiB)": 78.33, + "step": 2891, + "token_acc": 0.9041093058828393, + "train_speed(iter/s)": 0.032541 + }, + { + "epoch": 0.560383665164947, + "grad_norm": 0.10197526961565018, + "learning_rate": 0.00013254362716322776, + "loss": 0.3503738045692444, + "memory(GiB)": 78.33, + "step": 2892, + "token_acc": 0.8949555782295996, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.5605774354502737, + "grad_norm": 0.11662586033344269, + "learning_rate": 0.00013244815198119024, + "loss": 0.3930249810218811, + "memory(GiB)": 78.33, + "step": 2893, + "token_acc": 0.8859180035650623, + "train_speed(iter/s)": 0.032543 + }, + { + "epoch": 0.5607712057356005, + "grad_norm": 0.11080929636955261, + "learning_rate": 0.0001323526840081683, + "loss": 0.3820610046386719, + "memory(GiB)": 78.33, + "step": 2894, + "token_acc": 0.8872899535216303, + "train_speed(iter/s)": 0.032544 + }, + { + "epoch": 0.5609649760209272, + "grad_norm": 0.10336665064096451, + "learning_rate": 0.00013225722328337323, + "loss": 0.34890395402908325, + "memory(GiB)": 78.33, + "step": 2895, + "token_acc": 0.8951938666153719, + "train_speed(iter/s)": 0.032545 + }, + { + "epoch": 0.561158746306254, + "grad_norm": 0.09736684709787369, + "learning_rate": 0.0001321617698460134, + "loss": 0.33721378445625305, + "memory(GiB)": 78.33, + "step": 2896, + "token_acc": 0.9000335025642348, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.5613525165915807, + "grad_norm": 0.10685203224420547, + "learning_rate": 0.00013206632373529396, + "loss": 0.35752072930336, + "memory(GiB)": 78.33, + "step": 2897, + "token_acc": 0.8925185941373779, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.5615462868769074, + "grad_norm": 0.10666251182556152, + "learning_rate": 0.00013197088499041732, + "loss": 0.3803097903728485, + "memory(GiB)": 78.33, + "step": 2898, + "token_acc": 0.8869661896345288, + "train_speed(iter/s)": 0.032547 + }, + { + "epoch": 0.5617400571622342, + "grad_norm": 0.09380005300045013, + "learning_rate": 0.00013187545365058261, + "loss": 0.3471141755580902, + "memory(GiB)": 78.33, + "step": 2899, + "token_acc": 0.8968915295993044, + "train_speed(iter/s)": 0.032548 + }, + { + "epoch": 0.5619338274475609, + "grad_norm": 0.10400458425283432, + "learning_rate": 0.00013178002975498614, + "loss": 0.3575308322906494, + "memory(GiB)": 78.33, + "step": 2900, + "token_acc": 0.8943859748659613, + "train_speed(iter/s)": 0.032549 + }, + { + "epoch": 0.5621275977328877, + "grad_norm": 0.09898320585489273, + "learning_rate": 0.00013168461334282103, + "loss": 0.34250903129577637, + "memory(GiB)": 78.33, + "step": 2901, + "token_acc": 0.895111494593793, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.5623213680182144, + "grad_norm": 0.09259682148694992, + "learning_rate": 0.00013158920445327738, + "loss": 0.30667221546173096, + "memory(GiB)": 78.33, + "step": 2902, + "token_acc": 0.9075316927665921, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.5625151383035412, + "grad_norm": 0.10204022377729416, + "learning_rate": 0.0001314938031255422, + "loss": 0.35288378596305847, + "memory(GiB)": 78.33, + "step": 2903, + "token_acc": 0.8955367449018854, + "train_speed(iter/s)": 0.032552 + }, + { + "epoch": 0.5627089085888679, + "grad_norm": 0.0911250039935112, + "learning_rate": 0.00013139840939879933, + "loss": 0.3128979206085205, + "memory(GiB)": 78.33, + "step": 2904, + "token_acc": 0.9082222013523666, + "train_speed(iter/s)": 0.032553 + }, + { + "epoch": 0.5629026788741947, + "grad_norm": 0.10293769836425781, + "learning_rate": 0.00013130302331222963, + "loss": 0.34590229392051697, + "memory(GiB)": 78.33, + "step": 2905, + "token_acc": 0.8969505783385909, + "train_speed(iter/s)": 0.032553 + }, + { + "epoch": 0.5630964491595214, + "grad_norm": 0.09894620627164841, + "learning_rate": 0.00013120764490501057, + "loss": 0.33227023482322693, + "memory(GiB)": 78.33, + "step": 2906, + "token_acc": 0.9007358424899319, + "train_speed(iter/s)": 0.032554 + }, + { + "epoch": 0.5632902194448481, + "grad_norm": 0.10376887768507004, + "learning_rate": 0.00013111227421631674, + "loss": 0.36455827951431274, + "memory(GiB)": 78.33, + "step": 2907, + "token_acc": 0.8945593638331069, + "train_speed(iter/s)": 0.032555 + }, + { + "epoch": 0.5634839897301749, + "grad_norm": 0.10269268602132797, + "learning_rate": 0.00013101691128531942, + "loss": 0.3566417694091797, + "memory(GiB)": 78.33, + "step": 2908, + "token_acc": 0.8924794993526112, + "train_speed(iter/s)": 0.032556 + }, + { + "epoch": 0.5636777600155016, + "grad_norm": 0.11318562924861908, + "learning_rate": 0.00013092155615118672, + "loss": 0.3609810471534729, + "memory(GiB)": 78.33, + "step": 2909, + "token_acc": 0.8935661992803078, + "train_speed(iter/s)": 0.032557 + }, + { + "epoch": 0.5638715303008284, + "grad_norm": 0.09679584205150604, + "learning_rate": 0.00013082620885308363, + "loss": 0.33762603998184204, + "memory(GiB)": 78.33, + "step": 2910, + "token_acc": 0.8995445957075723, + "train_speed(iter/s)": 0.032558 + }, + { + "epoch": 0.5640653005861551, + "grad_norm": 0.11616901308298111, + "learning_rate": 0.00013073086943017173, + "loss": 0.351492315530777, + "memory(GiB)": 78.33, + "step": 2911, + "token_acc": 0.8948040342142218, + "train_speed(iter/s)": 0.032559 + }, + { + "epoch": 0.5642590708714819, + "grad_norm": 0.10330884903669357, + "learning_rate": 0.00013063553792160958, + "loss": 0.3321197032928467, + "memory(GiB)": 78.33, + "step": 2912, + "token_acc": 0.8990494397535977, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.5644528411568086, + "grad_norm": 0.10223756730556488, + "learning_rate": 0.0001305402143665523, + "loss": 0.32255592942237854, + "memory(GiB)": 78.33, + "step": 2913, + "token_acc": 0.9044650149741356, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.5646466114421353, + "grad_norm": 0.09594756364822388, + "learning_rate": 0.00013044489880415194, + "loss": 0.3452211320400238, + "memory(GiB)": 78.33, + "step": 2914, + "token_acc": 0.8991926235148401, + "train_speed(iter/s)": 0.032561 + }, + { + "epoch": 0.5648403817274621, + "grad_norm": 0.09111212939023972, + "learning_rate": 0.00013034959127355703, + "loss": 0.3073510229587555, + "memory(GiB)": 78.33, + "step": 2915, + "token_acc": 0.9079986434765758, + "train_speed(iter/s)": 0.032562 + }, + { + "epoch": 0.5650341520127888, + "grad_norm": 0.10147152841091156, + "learning_rate": 0.00013025429181391304, + "loss": 0.3540181815624237, + "memory(GiB)": 78.33, + "step": 2916, + "token_acc": 0.8969390771546049, + "train_speed(iter/s)": 0.032563 + }, + { + "epoch": 0.5652279222981156, + "grad_norm": 0.10001610219478607, + "learning_rate": 0.00013015900046436205, + "loss": 0.35491663217544556, + "memory(GiB)": 78.33, + "step": 2917, + "token_acc": 0.8945139415897574, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.5654216925834423, + "grad_norm": 0.09321217238903046, + "learning_rate": 0.00013006371726404265, + "loss": 0.3310143053531647, + "memory(GiB)": 78.33, + "step": 2918, + "token_acc": 0.9035609732687047, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.5656154628687691, + "grad_norm": 0.10879889130592346, + "learning_rate": 0.0001299684422520903, + "loss": 0.3868524730205536, + "memory(GiB)": 78.33, + "step": 2919, + "token_acc": 0.8842917053674463, + "train_speed(iter/s)": 0.032565 + }, + { + "epoch": 0.5658092331540958, + "grad_norm": 0.09358043968677521, + "learning_rate": 0.00012987317546763697, + "loss": 0.3301950991153717, + "memory(GiB)": 78.33, + "step": 2920, + "token_acc": 0.9018064033885635, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.5660030034394226, + "grad_norm": 0.09514044970273972, + "learning_rate": 0.00012977791694981136, + "loss": 0.34315773844718933, + "memory(GiB)": 78.33, + "step": 2921, + "token_acc": 0.8958858102434929, + "train_speed(iter/s)": 0.032567 + }, + { + "epoch": 0.5661967737247493, + "grad_norm": 0.10747494548559189, + "learning_rate": 0.00012968266673773858, + "loss": 0.3809034824371338, + "memory(GiB)": 78.33, + "step": 2922, + "token_acc": 0.8869167528719987, + "train_speed(iter/s)": 0.032568 + }, + { + "epoch": 0.566390544010076, + "grad_norm": 0.09282265603542328, + "learning_rate": 0.00012958742487054054, + "loss": 0.3388338088989258, + "memory(GiB)": 78.33, + "step": 2923, + "token_acc": 0.9004572462527813, + "train_speed(iter/s)": 0.032569 + }, + { + "epoch": 0.5665843142954028, + "grad_norm": 0.11763904243707657, + "learning_rate": 0.00012949219138733565, + "loss": 0.403276652097702, + "memory(GiB)": 78.33, + "step": 2924, + "token_acc": 0.8811169562025936, + "train_speed(iter/s)": 0.03257 + }, + { + "epoch": 0.5667780845807295, + "grad_norm": 0.09826403111219406, + "learning_rate": 0.00012939696632723876, + "loss": 0.34657537937164307, + "memory(GiB)": 78.33, + "step": 2925, + "token_acc": 0.8960861857357153, + "train_speed(iter/s)": 0.03257 + }, + { + "epoch": 0.5669718548660563, + "grad_norm": 0.09996719658374786, + "learning_rate": 0.00012930174972936148, + "loss": 0.34664636850357056, + "memory(GiB)": 78.33, + "step": 2926, + "token_acc": 0.8960221082560519, + "train_speed(iter/s)": 0.032571 + }, + { + "epoch": 0.567165625151383, + "grad_norm": 0.09884995222091675, + "learning_rate": 0.00012920654163281172, + "loss": 0.3174511790275574, + "memory(GiB)": 78.33, + "step": 2927, + "token_acc": 0.9049272486772487, + "train_speed(iter/s)": 0.032572 + }, + { + "epoch": 0.5673593954367098, + "grad_norm": 0.10128315538167953, + "learning_rate": 0.00012911134207669412, + "loss": 0.33141183853149414, + "memory(GiB)": 78.33, + "step": 2928, + "token_acc": 0.9011783988470342, + "train_speed(iter/s)": 0.032573 + }, + { + "epoch": 0.5675531657220365, + "grad_norm": 0.11448942869901657, + "learning_rate": 0.00012901615110010956, + "loss": 0.3866661489009857, + "memory(GiB)": 78.33, + "step": 2929, + "token_acc": 0.8871186120469788, + "train_speed(iter/s)": 0.032574 + }, + { + "epoch": 0.5677469360073633, + "grad_norm": 0.09554385393857956, + "learning_rate": 0.00012892096874215562, + "loss": 0.31373119354248047, + "memory(GiB)": 78.33, + "step": 2930, + "token_acc": 0.9060102797220859, + "train_speed(iter/s)": 0.032575 + }, + { + "epoch": 0.56794070629269, + "grad_norm": 0.10202641785144806, + "learning_rate": 0.00012882579504192628, + "loss": 0.3446533977985382, + "memory(GiB)": 78.33, + "step": 2931, + "token_acc": 0.8982907151647751, + "train_speed(iter/s)": 0.032576 + }, + { + "epoch": 0.5681344765780167, + "grad_norm": 0.10661419481039047, + "learning_rate": 0.00012873063003851184, + "loss": 0.35394954681396484, + "memory(GiB)": 78.33, + "step": 2932, + "token_acc": 0.8950952106174265, + "train_speed(iter/s)": 0.032577 + }, + { + "epoch": 0.5683282468633435, + "grad_norm": 0.10493983328342438, + "learning_rate": 0.00012863547377099918, + "loss": 0.34970593452453613, + "memory(GiB)": 78.33, + "step": 2933, + "token_acc": 0.8950512907225792, + "train_speed(iter/s)": 0.032578 + }, + { + "epoch": 0.5685220171486702, + "grad_norm": 0.11756689846515656, + "learning_rate": 0.0001285403262784715, + "loss": 0.3975431025028229, + "memory(GiB)": 78.33, + "step": 2934, + "token_acc": 0.8840085287846482, + "train_speed(iter/s)": 0.032578 + }, + { + "epoch": 0.568715787433997, + "grad_norm": 0.10590403527021408, + "learning_rate": 0.00012844518760000848, + "loss": 0.37834692001342773, + "memory(GiB)": 78.33, + "step": 2935, + "token_acc": 0.8883588043448963, + "train_speed(iter/s)": 0.032579 + }, + { + "epoch": 0.5689095577193237, + "grad_norm": 0.0980941578745842, + "learning_rate": 0.0001283500577746862, + "loss": 0.32845622301101685, + "memory(GiB)": 78.33, + "step": 2936, + "token_acc": 0.9010894350139347, + "train_speed(iter/s)": 0.03258 + }, + { + "epoch": 0.5691033280046505, + "grad_norm": 0.09987200796604156, + "learning_rate": 0.00012825493684157682, + "loss": 0.334673672914505, + "memory(GiB)": 78.33, + "step": 2937, + "token_acc": 0.8996542412235318, + "train_speed(iter/s)": 0.032581 + }, + { + "epoch": 0.5692970982899772, + "grad_norm": 0.1141529381275177, + "learning_rate": 0.0001281598248397493, + "loss": 0.39329400658607483, + "memory(GiB)": 78.33, + "step": 2938, + "token_acc": 0.886091163126108, + "train_speed(iter/s)": 0.032582 + }, + { + "epoch": 0.569490868575304, + "grad_norm": 0.09881948679685593, + "learning_rate": 0.0001280647218082685, + "loss": 0.3378741145133972, + "memory(GiB)": 78.33, + "step": 2939, + "token_acc": 0.8991566664118831, + "train_speed(iter/s)": 0.032583 + }, + { + "epoch": 0.5696846388606307, + "grad_norm": 0.10220920294523239, + "learning_rate": 0.00012796962778619593, + "loss": 0.35935983061790466, + "memory(GiB)": 78.33, + "step": 2940, + "token_acc": 0.8916394328504156, + "train_speed(iter/s)": 0.032583 + }, + { + "epoch": 0.5698784091459574, + "grad_norm": 0.10239671915769577, + "learning_rate": 0.00012787454281258916, + "loss": 0.34669652581214905, + "memory(GiB)": 78.33, + "step": 2941, + "token_acc": 0.8974839774985307, + "train_speed(iter/s)": 0.032584 + }, + { + "epoch": 0.5700721794312842, + "grad_norm": 0.10328203439712524, + "learning_rate": 0.0001277794669265022, + "loss": 0.33819228410720825, + "memory(GiB)": 78.33, + "step": 2942, + "token_acc": 0.8988887064104669, + "train_speed(iter/s)": 0.032585 + }, + { + "epoch": 0.5702659497166109, + "grad_norm": 0.10673517733812332, + "learning_rate": 0.00012768440016698533, + "loss": 0.36250045895576477, + "memory(GiB)": 78.33, + "step": 2943, + "token_acc": 0.8937628879783616, + "train_speed(iter/s)": 0.032586 + }, + { + "epoch": 0.5704597200019377, + "grad_norm": 0.10135802626609802, + "learning_rate": 0.0001275893425730849, + "loss": 0.3382996618747711, + "memory(GiB)": 78.33, + "step": 2944, + "token_acc": 0.8998493193390834, + "train_speed(iter/s)": 0.032587 + }, + { + "epoch": 0.5706534902872644, + "grad_norm": 0.09103412926197052, + "learning_rate": 0.00012749429418384368, + "loss": 0.3121355175971985, + "memory(GiB)": 78.33, + "step": 2945, + "token_acc": 0.907722643769968, + "train_speed(iter/s)": 0.032588 + }, + { + "epoch": 0.5708472605725912, + "grad_norm": 0.10437119007110596, + "learning_rate": 0.00012739925503830058, + "loss": 0.3552001118659973, + "memory(GiB)": 78.33, + "step": 2946, + "token_acc": 0.8937973264809601, + "train_speed(iter/s)": 0.032589 + }, + { + "epoch": 0.5710410308579179, + "grad_norm": 0.09624162316322327, + "learning_rate": 0.00012730422517549076, + "loss": 0.34696635603904724, + "memory(GiB)": 78.33, + "step": 2947, + "token_acc": 0.89641196508939, + "train_speed(iter/s)": 0.032589 + }, + { + "epoch": 0.5712348011432447, + "grad_norm": 0.0997808501124382, + "learning_rate": 0.0001272092046344455, + "loss": 0.33023878931999207, + "memory(GiB)": 78.33, + "step": 2948, + "token_acc": 0.9004624871531346, + "train_speed(iter/s)": 0.03259 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.10305418819189072, + "learning_rate": 0.0001271141934541923, + "loss": 0.3459644019603729, + "memory(GiB)": 78.33, + "step": 2949, + "token_acc": 0.8978806469604016, + "train_speed(iter/s)": 0.032591 + }, + { + "epoch": 0.5716223417138981, + "grad_norm": 0.09685331583023071, + "learning_rate": 0.00012701919167375488, + "loss": 0.34739017486572266, + "memory(GiB)": 78.33, + "step": 2950, + "token_acc": 0.8971735019041944, + "train_speed(iter/s)": 0.032592 + }, + { + "epoch": 0.5718161119992249, + "grad_norm": 0.09449600428342819, + "learning_rate": 0.00012692419933215288, + "loss": 0.3246384859085083, + "memory(GiB)": 78.33, + "step": 2951, + "token_acc": 0.9017080904162831, + "train_speed(iter/s)": 0.032593 + }, + { + "epoch": 0.5720098822845516, + "grad_norm": 0.10447492450475693, + "learning_rate": 0.00012682921646840233, + "loss": 0.37390848994255066, + "memory(GiB)": 78.33, + "step": 2952, + "token_acc": 0.8906833284795214, + "train_speed(iter/s)": 0.032594 + }, + { + "epoch": 0.5722036525698784, + "grad_norm": 0.11614307761192322, + "learning_rate": 0.00012673424312151517, + "loss": 0.40665432810783386, + "memory(GiB)": 78.33, + "step": 2953, + "token_acc": 0.8813434035031336, + "train_speed(iter/s)": 0.032594 + }, + { + "epoch": 0.5723974228552051, + "grad_norm": 0.09071079641580582, + "learning_rate": 0.0001266392793304996, + "loss": 0.33611175417900085, + "memory(GiB)": 78.33, + "step": 2954, + "token_acc": 0.8994031782555547, + "train_speed(iter/s)": 0.032595 + }, + { + "epoch": 0.5725911931405319, + "grad_norm": 0.10135752707719803, + "learning_rate": 0.00012654432513435965, + "loss": 0.36446985602378845, + "memory(GiB)": 78.33, + "step": 2955, + "token_acc": 0.8912203779100115, + "train_speed(iter/s)": 0.032596 + }, + { + "epoch": 0.5727849634258586, + "grad_norm": 0.10277879238128662, + "learning_rate": 0.00012644938057209567, + "loss": 0.3852466940879822, + "memory(GiB)": 78.33, + "step": 2956, + "token_acc": 0.8869683281547821, + "train_speed(iter/s)": 0.032597 + }, + { + "epoch": 0.5729787337111854, + "grad_norm": 0.10757278650999069, + "learning_rate": 0.00012635444568270398, + "loss": 0.35662007331848145, + "memory(GiB)": 78.33, + "step": 2957, + "token_acc": 0.893989887382211, + "train_speed(iter/s)": 0.032598 + }, + { + "epoch": 0.5731725039965121, + "grad_norm": 0.11290912330150604, + "learning_rate": 0.00012625952050517673, + "loss": 0.365123450756073, + "memory(GiB)": 78.33, + "step": 2958, + "token_acc": 0.890861820997898, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.5733662742818388, + "grad_norm": 0.11363532394170761, + "learning_rate": 0.00012616460507850242, + "loss": 0.3688386082649231, + "memory(GiB)": 78.33, + "step": 2959, + "token_acc": 0.8900687757909216, + "train_speed(iter/s)": 0.032599 + }, + { + "epoch": 0.5735600445671656, + "grad_norm": 0.10175785422325134, + "learning_rate": 0.00012606969944166523, + "loss": 0.3480740785598755, + "memory(GiB)": 78.33, + "step": 2960, + "token_acc": 0.8961689531507941, + "train_speed(iter/s)": 0.0326 + }, + { + "epoch": 0.5737538148524923, + "grad_norm": 0.10226535052061081, + "learning_rate": 0.00012597480363364558, + "loss": 0.36537787318229675, + "memory(GiB)": 78.33, + "step": 2961, + "token_acc": 0.8930911178973194, + "train_speed(iter/s)": 0.032601 + }, + { + "epoch": 0.5739475851378191, + "grad_norm": 0.10284057259559631, + "learning_rate": 0.0001258799176934196, + "loss": 0.3630596399307251, + "memory(GiB)": 78.33, + "step": 2962, + "token_acc": 0.8929182958930716, + "train_speed(iter/s)": 0.032602 + }, + { + "epoch": 0.5741413554231458, + "grad_norm": 0.10394224524497986, + "learning_rate": 0.00012578504165995953, + "loss": 0.3422529995441437, + "memory(GiB)": 78.33, + "step": 2963, + "token_acc": 0.8979251265505924, + "train_speed(iter/s)": 0.032603 + }, + { + "epoch": 0.5743351257084726, + "grad_norm": 0.10181540995836258, + "learning_rate": 0.00012569017557223362, + "loss": 0.35082393884658813, + "memory(GiB)": 78.33, + "step": 2964, + "token_acc": 0.8960905890389365, + "train_speed(iter/s)": 0.032604 + }, + { + "epoch": 0.5745288959937993, + "grad_norm": 0.10336862504482269, + "learning_rate": 0.00012559531946920578, + "loss": 0.36020544171333313, + "memory(GiB)": 78.33, + "step": 2965, + "token_acc": 0.8921231485743684, + "train_speed(iter/s)": 0.032604 + }, + { + "epoch": 0.574722666279126, + "grad_norm": 0.10156133770942688, + "learning_rate": 0.00012550047338983603, + "loss": 0.36286041140556335, + "memory(GiB)": 78.33, + "step": 2966, + "token_acc": 0.8922610015174507, + "train_speed(iter/s)": 0.032605 + }, + { + "epoch": 0.5749164365644528, + "grad_norm": 0.09440826624631882, + "learning_rate": 0.00012540563737308016, + "loss": 0.3350200057029724, + "memory(GiB)": 78.33, + "step": 2967, + "token_acc": 0.8994114241605558, + "train_speed(iter/s)": 0.032606 + }, + { + "epoch": 0.5751102068497795, + "grad_norm": 0.117142453789711, + "learning_rate": 0.00012531081145788987, + "loss": 0.4059712290763855, + "memory(GiB)": 78.33, + "step": 2968, + "token_acc": 0.8823188237682337, + "train_speed(iter/s)": 0.032607 + }, + { + "epoch": 0.5753039771351063, + "grad_norm": 0.10292242467403412, + "learning_rate": 0.00012521599568321283, + "loss": 0.3511313199996948, + "memory(GiB)": 78.33, + "step": 2969, + "token_acc": 0.8962899832210296, + "train_speed(iter/s)": 0.032608 + }, + { + "epoch": 0.575497747420433, + "grad_norm": 0.10700348764657974, + "learning_rate": 0.00012512119008799226, + "loss": 0.3738468885421753, + "memory(GiB)": 78.33, + "step": 2970, + "token_acc": 0.8902907594626264, + "train_speed(iter/s)": 0.032609 + }, + { + "epoch": 0.5756915177057598, + "grad_norm": 0.10004039853811264, + "learning_rate": 0.0001250263947111675, + "loss": 0.3366740942001343, + "memory(GiB)": 78.33, + "step": 2971, + "token_acc": 0.9001073537305422, + "train_speed(iter/s)": 0.032609 + }, + { + "epoch": 0.5758852879910865, + "grad_norm": 0.10136931389570236, + "learning_rate": 0.00012493160959167347, + "loss": 0.35105177760124207, + "memory(GiB)": 78.33, + "step": 2972, + "token_acc": 0.8961308889362609, + "train_speed(iter/s)": 0.03261 + }, + { + "epoch": 0.5760790582764133, + "grad_norm": 0.10934333503246307, + "learning_rate": 0.000124836834768441, + "loss": 0.37874335050582886, + "memory(GiB)": 78.33, + "step": 2973, + "token_acc": 0.8891520475888849, + "train_speed(iter/s)": 0.032611 + }, + { + "epoch": 0.57627282856174, + "grad_norm": 0.09537496417760849, + "learning_rate": 0.0001247420702803966, + "loss": 0.3463536500930786, + "memory(GiB)": 78.33, + "step": 2974, + "token_acc": 0.8957062728096603, + "train_speed(iter/s)": 0.032612 + }, + { + "epoch": 0.5764665988470667, + "grad_norm": 0.11320365220308304, + "learning_rate": 0.00012464731616646267, + "loss": 0.3580982983112335, + "memory(GiB)": 78.33, + "step": 2975, + "token_acc": 0.8927235580700927, + "train_speed(iter/s)": 0.032613 + }, + { + "epoch": 0.5766603691323936, + "grad_norm": 0.1089281439781189, + "learning_rate": 0.0001245525724655573, + "loss": 0.3534199297428131, + "memory(GiB)": 78.33, + "step": 2976, + "token_acc": 0.8954800657857427, + "train_speed(iter/s)": 0.032614 + }, + { + "epoch": 0.5768541394177203, + "grad_norm": 0.11043395847082138, + "learning_rate": 0.00012445783921659416, + "loss": 0.3937772214412689, + "memory(GiB)": 78.33, + "step": 2977, + "token_acc": 0.8828118422898835, + "train_speed(iter/s)": 0.032614 + }, + { + "epoch": 0.5770479097030471, + "grad_norm": 0.09332658350467682, + "learning_rate": 0.00012436311645848286, + "loss": 0.3024745285511017, + "memory(GiB)": 78.33, + "step": 2978, + "token_acc": 0.9083586811047573, + "train_speed(iter/s)": 0.032615 + }, + { + "epoch": 0.5772416799883738, + "grad_norm": 0.10384024679660797, + "learning_rate": 0.00012426840423012845, + "loss": 0.3482080101966858, + "memory(GiB)": 78.33, + "step": 2979, + "token_acc": 0.8957580870528065, + "train_speed(iter/s)": 0.032616 + }, + { + "epoch": 0.5774354502737006, + "grad_norm": 0.10794834792613983, + "learning_rate": 0.0001241737025704319, + "loss": 0.35951748490333557, + "memory(GiB)": 78.33, + "step": 2980, + "token_acc": 0.894366402850384, + "train_speed(iter/s)": 0.032617 + }, + { + "epoch": 0.5776292205590273, + "grad_norm": 0.13303066790103912, + "learning_rate": 0.00012407901151828963, + "loss": 0.3524818420410156, + "memory(GiB)": 78.33, + "step": 2981, + "token_acc": 0.8969385499557914, + "train_speed(iter/s)": 0.032618 + }, + { + "epoch": 0.5778229908443541, + "grad_norm": 0.11768164485692978, + "learning_rate": 0.00012398433111259386, + "loss": 0.3718627691268921, + "memory(GiB)": 78.33, + "step": 2982, + "token_acc": 0.8906458313955291, + "train_speed(iter/s)": 0.032619 + }, + { + "epoch": 0.5780167611296808, + "grad_norm": 0.10189583152532578, + "learning_rate": 0.00012388966139223245, + "loss": 0.3596772253513336, + "memory(GiB)": 78.33, + "step": 2983, + "token_acc": 0.8941813261163735, + "train_speed(iter/s)": 0.03262 + }, + { + "epoch": 0.5782105314150076, + "grad_norm": 0.10249454528093338, + "learning_rate": 0.00012379500239608865, + "loss": 0.3092750906944275, + "memory(GiB)": 78.33, + "step": 2984, + "token_acc": 0.9063748245708734, + "train_speed(iter/s)": 0.03262 + }, + { + "epoch": 0.5784043017003343, + "grad_norm": 0.12180888652801514, + "learning_rate": 0.00012370035416304153, + "loss": 0.3937700092792511, + "memory(GiB)": 78.33, + "step": 2985, + "token_acc": 0.8859492919528534, + "train_speed(iter/s)": 0.032621 + }, + { + "epoch": 0.578598071985661, + "grad_norm": 0.0898217260837555, + "learning_rate": 0.00012360571673196565, + "loss": 0.3232322931289673, + "memory(GiB)": 78.33, + "step": 2986, + "token_acc": 0.9051626763526832, + "train_speed(iter/s)": 0.032622 + }, + { + "epoch": 0.5787918422709878, + "grad_norm": 0.09598486870527267, + "learning_rate": 0.0001235110901417312, + "loss": 0.32468363642692566, + "memory(GiB)": 78.33, + "step": 2987, + "token_acc": 0.901153603034134, + "train_speed(iter/s)": 0.032623 + }, + { + "epoch": 0.5789856125563145, + "grad_norm": 0.10317398607730865, + "learning_rate": 0.00012341647443120374, + "loss": 0.33831608295440674, + "memory(GiB)": 78.33, + "step": 2988, + "token_acc": 0.900511402902557, + "train_speed(iter/s)": 0.032624 + }, + { + "epoch": 0.5791793828416413, + "grad_norm": 0.10351257771253586, + "learning_rate": 0.0001233218696392446, + "loss": 0.3458552956581116, + "memory(GiB)": 78.33, + "step": 2989, + "token_acc": 0.8962962962962963, + "train_speed(iter/s)": 0.032624 + }, + { + "epoch": 0.579373153126968, + "grad_norm": 0.09751173853874207, + "learning_rate": 0.00012322727580471048, + "loss": 0.338079035282135, + "memory(GiB)": 78.33, + "step": 2990, + "token_acc": 0.8986410108266975, + "train_speed(iter/s)": 0.032625 + }, + { + "epoch": 0.5795669234122948, + "grad_norm": 0.10420162230730057, + "learning_rate": 0.00012313269296645356, + "loss": 0.3703789710998535, + "memory(GiB)": 78.33, + "step": 2991, + "token_acc": 0.8920427978106005, + "train_speed(iter/s)": 0.032626 + }, + { + "epoch": 0.5797606936976215, + "grad_norm": 0.10459905862808228, + "learning_rate": 0.00012303812116332163, + "loss": 0.35847145318984985, + "memory(GiB)": 78.33, + "step": 2992, + "token_acc": 0.893757727364214, + "train_speed(iter/s)": 0.032627 + }, + { + "epoch": 0.5799544639829483, + "grad_norm": 0.09815093129873276, + "learning_rate": 0.0001229435604341578, + "loss": 0.3507353663444519, + "memory(GiB)": 78.33, + "step": 2993, + "token_acc": 0.8970371099517669, + "train_speed(iter/s)": 0.032628 + }, + { + "epoch": 0.580148234268275, + "grad_norm": 0.10171345621347427, + "learning_rate": 0.00012284901081780077, + "loss": 0.34216856956481934, + "memory(GiB)": 78.33, + "step": 2994, + "token_acc": 0.8977114312267658, + "train_speed(iter/s)": 0.032629 + }, + { + "epoch": 0.5803420045536017, + "grad_norm": 0.09991803765296936, + "learning_rate": 0.00012275447235308453, + "loss": 0.33179599046707153, + "memory(GiB)": 78.33, + "step": 2995, + "token_acc": 0.9008316831683169, + "train_speed(iter/s)": 0.032629 + }, + { + "epoch": 0.5805357748389285, + "grad_norm": 0.12053931504487991, + "learning_rate": 0.00012265994507883863, + "loss": 0.3269277811050415, + "memory(GiB)": 78.33, + "step": 2996, + "token_acc": 0.9013863084178934, + "train_speed(iter/s)": 0.03263 + }, + { + "epoch": 0.5807295451242552, + "grad_norm": 0.10479523986577988, + "learning_rate": 0.00012256542903388797, + "loss": 0.3800506591796875, + "memory(GiB)": 78.33, + "step": 2997, + "token_acc": 0.8882348086815394, + "train_speed(iter/s)": 0.032631 + }, + { + "epoch": 0.580923315409582, + "grad_norm": 0.10834317654371262, + "learning_rate": 0.00012247092425705274, + "loss": 0.3906557559967041, + "memory(GiB)": 78.33, + "step": 2998, + "token_acc": 0.8865690162121335, + "train_speed(iter/s)": 0.032632 + }, + { + "epoch": 0.5811170856949087, + "grad_norm": 0.09533150494098663, + "learning_rate": 0.0001223764307871487, + "loss": 0.32510894536972046, + "memory(GiB)": 78.33, + "step": 2999, + "token_acc": 0.9044620191368873, + "train_speed(iter/s)": 0.032633 + }, + { + "epoch": 0.5813108559802355, + "grad_norm": 0.11682204157114029, + "learning_rate": 0.00012228194866298678, + "loss": 0.3705749809741974, + "memory(GiB)": 78.33, + "step": 3000, + "token_acc": 0.8889822334235664, + "train_speed(iter/s)": 0.032633 + }, + { + "epoch": 0.5813108559802355, + "eval_loss": 0.40707629919052124, + "eval_runtime": 1344.136, + "eval_samples_per_second": 5.021, + "eval_steps_per_second": 5.021, + "eval_token_acc": 0.8961095095372348, + "step": 3000 + }, + { + "epoch": 0.5815046262655622, + "grad_norm": 0.1150721088051796, + "learning_rate": 0.00012218747792337335, + "loss": 0.3748778998851776, + "memory(GiB)": 78.33, + "step": 3001, + "token_acc": 0.8879601402956698, + "train_speed(iter/s)": 0.03216 + }, + { + "epoch": 0.581698396550889, + "grad_norm": 0.10332752019166946, + "learning_rate": 0.00012209301860711017, + "loss": 0.34594491124153137, + "memory(GiB)": 78.33, + "step": 3002, + "token_acc": 0.8974166376049575, + "train_speed(iter/s)": 0.032161 + }, + { + "epoch": 0.5818921668362157, + "grad_norm": 0.10606271028518677, + "learning_rate": 0.00012199857075299403, + "loss": 0.33615994453430176, + "memory(GiB)": 78.33, + "step": 3003, + "token_acc": 0.900671290493301, + "train_speed(iter/s)": 0.032162 + }, + { + "epoch": 0.5820859371215424, + "grad_norm": 0.11355333030223846, + "learning_rate": 0.00012190413439981741, + "loss": 0.3375508785247803, + "memory(GiB)": 78.33, + "step": 3004, + "token_acc": 0.8986658671863289, + "train_speed(iter/s)": 0.032163 + }, + { + "epoch": 0.5822797074068692, + "grad_norm": 0.0899207592010498, + "learning_rate": 0.00012180970958636769, + "loss": 0.32201769948005676, + "memory(GiB)": 78.33, + "step": 3005, + "token_acc": 0.9033656775887123, + "train_speed(iter/s)": 0.032164 + }, + { + "epoch": 0.5824734776921959, + "grad_norm": 0.10488248616456985, + "learning_rate": 0.00012171529635142777, + "loss": 0.3354138433933258, + "memory(GiB)": 78.33, + "step": 3006, + "token_acc": 0.8999153020892151, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.5826672479775227, + "grad_norm": 0.11078818142414093, + "learning_rate": 0.00012162089473377564, + "loss": 0.39330416917800903, + "memory(GiB)": 78.33, + "step": 3007, + "token_acc": 0.8855033101341029, + "train_speed(iter/s)": 0.032166 + }, + { + "epoch": 0.5828610182628494, + "grad_norm": 0.09363757818937302, + "learning_rate": 0.00012152650477218462, + "loss": 0.34882262349128723, + "memory(GiB)": 78.33, + "step": 3008, + "token_acc": 0.8959220498015157, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.5830547885481762, + "grad_norm": 0.11309264600276947, + "learning_rate": 0.00012143212650542327, + "loss": 0.38344091176986694, + "memory(GiB)": 78.33, + "step": 3009, + "token_acc": 0.8874630723781388, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.5832485588335029, + "grad_norm": 0.1011514663696289, + "learning_rate": 0.00012133775997225515, + "loss": 0.3377688527107239, + "memory(GiB)": 78.33, + "step": 3010, + "token_acc": 0.8985852981969487, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.5834423291188297, + "grad_norm": 0.09630289673805237, + "learning_rate": 0.00012124340521143926, + "loss": 0.3347204327583313, + "memory(GiB)": 78.33, + "step": 3011, + "token_acc": 0.9005485624724494, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.5836360994041564, + "grad_norm": 0.09772694110870361, + "learning_rate": 0.0001211490622617295, + "loss": 0.33570656180381775, + "memory(GiB)": 78.33, + "step": 3012, + "token_acc": 0.9012391298992437, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.5838298696894831, + "grad_norm": 0.11302408576011658, + "learning_rate": 0.00012105473116187517, + "loss": 0.36947059631347656, + "memory(GiB)": 78.33, + "step": 3013, + "token_acc": 0.8906702025072324, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.5840236399748099, + "grad_norm": 0.09968356788158417, + "learning_rate": 0.0001209604119506205, + "loss": 0.3836635947227478, + "memory(GiB)": 78.33, + "step": 3014, + "token_acc": 0.8870716703312993, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.5842174102601366, + "grad_norm": 0.1118093952536583, + "learning_rate": 0.00012086610466670495, + "loss": 0.3891493082046509, + "memory(GiB)": 78.33, + "step": 3015, + "token_acc": 0.8857653605512787, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.5844111805454634, + "grad_norm": 0.10725325345993042, + "learning_rate": 0.00012077180934886317, + "loss": 0.3714507520198822, + "memory(GiB)": 78.33, + "step": 3016, + "token_acc": 0.8898680855780956, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.5846049508307901, + "grad_norm": 0.10335146635770798, + "learning_rate": 0.00012067752603582458, + "loss": 0.32297301292419434, + "memory(GiB)": 78.33, + "step": 3017, + "token_acc": 0.9023583923365265, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.5847987211161169, + "grad_norm": 0.09637671709060669, + "learning_rate": 0.00012058325476631404, + "loss": 0.33302274346351624, + "memory(GiB)": 78.33, + "step": 3018, + "token_acc": 0.9011041882171811, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.5849924914014436, + "grad_norm": 0.10183451324701309, + "learning_rate": 0.0001204889955790512, + "loss": 0.35752299427986145, + "memory(GiB)": 78.33, + "step": 3019, + "token_acc": 0.8940680214357722, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.5851862616867703, + "grad_norm": 0.09284891933202744, + "learning_rate": 0.00012039474851275087, + "loss": 0.31136441230773926, + "memory(GiB)": 78.33, + "step": 3020, + "token_acc": 0.906412598570867, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.5853800319720971, + "grad_norm": 0.12801574170589447, + "learning_rate": 0.00012030051360612282, + "loss": 0.36925309896469116, + "memory(GiB)": 78.33, + "step": 3021, + "token_acc": 0.890402707664066, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.5855738022574238, + "grad_norm": 0.10945330560207367, + "learning_rate": 0.0001202062908978719, + "loss": 0.37747853994369507, + "memory(GiB)": 78.33, + "step": 3022, + "token_acc": 0.8870110767568217, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.5857675725427506, + "grad_norm": 0.10129844397306442, + "learning_rate": 0.00012011208042669797, + "loss": 0.3541242778301239, + "memory(GiB)": 78.33, + "step": 3023, + "token_acc": 0.893968527045489, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.5859613428280773, + "grad_norm": 0.10958780348300934, + "learning_rate": 0.00012001788223129563, + "loss": 0.37422338128089905, + "memory(GiB)": 78.33, + "step": 3024, + "token_acc": 0.8888448160126929, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.5861551131134041, + "grad_norm": 0.09564699977636337, + "learning_rate": 0.00011992369635035475, + "loss": 0.3554079532623291, + "memory(GiB)": 78.33, + "step": 3025, + "token_acc": 0.8962656629191224, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.5863488833987308, + "grad_norm": 0.11680517345666885, + "learning_rate": 0.00011982952282255994, + "loss": 0.37712615728378296, + "memory(GiB)": 78.33, + "step": 3026, + "token_acc": 0.8882045642012648, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.5865426536840576, + "grad_norm": 0.1049395278096199, + "learning_rate": 0.00011973536168659089, + "loss": 0.36447393894195557, + "memory(GiB)": 78.33, + "step": 3027, + "token_acc": 0.8916041342205862, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.5867364239693843, + "grad_norm": 0.1006232500076294, + "learning_rate": 0.00011964121298112194, + "loss": 0.34536200761795044, + "memory(GiB)": 78.33, + "step": 3028, + "token_acc": 0.897648835202761, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.586930194254711, + "grad_norm": 0.12608642876148224, + "learning_rate": 0.00011954707674482263, + "loss": 0.3553128242492676, + "memory(GiB)": 78.33, + "step": 3029, + "token_acc": 0.8939438229119608, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.5871239645400378, + "grad_norm": 0.10169053822755814, + "learning_rate": 0.00011945295301635724, + "loss": 0.3616080582141876, + "memory(GiB)": 78.33, + "step": 3030, + "token_acc": 0.892506928160307, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.5873177348253645, + "grad_norm": 0.09629808366298676, + "learning_rate": 0.00011935884183438483, + "loss": 0.33636754751205444, + "memory(GiB)": 78.33, + "step": 3031, + "token_acc": 0.9002651087436792, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.5875115051106913, + "grad_norm": 0.10230912268161774, + "learning_rate": 0.00011926474323755947, + "loss": 0.352095365524292, + "memory(GiB)": 78.33, + "step": 3032, + "token_acc": 0.8956463675213675, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.587705275396018, + "grad_norm": 0.09654974937438965, + "learning_rate": 0.00011917065726452991, + "loss": 0.33314049243927, + "memory(GiB)": 78.33, + "step": 3033, + "token_acc": 0.9005795098091339, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.5878990456813448, + "grad_norm": 0.10360792279243469, + "learning_rate": 0.00011907658395393982, + "loss": 0.35966095328330994, + "memory(GiB)": 78.33, + "step": 3034, + "token_acc": 0.8929588157145568, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.5880928159666715, + "grad_norm": 0.09588982164859772, + "learning_rate": 0.00011898252334442771, + "loss": 0.3092660903930664, + "memory(GiB)": 78.33, + "step": 3035, + "token_acc": 0.9049737273763518, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.5882865862519983, + "grad_norm": 0.11765430122613907, + "learning_rate": 0.00011888847547462669, + "loss": 0.3882688879966736, + "memory(GiB)": 78.33, + "step": 3036, + "token_acc": 0.8858762254901961, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.588480356537325, + "grad_norm": 0.1067943125963211, + "learning_rate": 0.00011879444038316485, + "loss": 0.34796375036239624, + "memory(GiB)": 78.33, + "step": 3037, + "token_acc": 0.8972947233713282, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.5886741268226517, + "grad_norm": 0.12499988824129105, + "learning_rate": 0.0001187004181086648, + "loss": 0.34971049427986145, + "memory(GiB)": 78.33, + "step": 3038, + "token_acc": 0.8963723439392272, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.5888678971079785, + "grad_norm": 0.10188789665699005, + "learning_rate": 0.0001186064086897441, + "loss": 0.355500727891922, + "memory(GiB)": 78.33, + "step": 3039, + "token_acc": 0.8935444389469024, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.5890616673933052, + "grad_norm": 0.10140416026115417, + "learning_rate": 0.00011851241216501492, + "loss": 0.3354438245296478, + "memory(GiB)": 78.33, + "step": 3040, + "token_acc": 0.8993545502808126, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.589255437678632, + "grad_norm": 0.10051760822534561, + "learning_rate": 0.00011841842857308416, + "loss": 0.35278022289276123, + "memory(GiB)": 78.33, + "step": 3041, + "token_acc": 0.8940331066341589, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.5894492079639587, + "grad_norm": 0.11390508711338043, + "learning_rate": 0.00011832445795255348, + "loss": 0.36132004857063293, + "memory(GiB)": 78.33, + "step": 3042, + "token_acc": 0.891473121085595, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.5896429782492855, + "grad_norm": 0.1137322410941124, + "learning_rate": 0.00011823050034201902, + "loss": 0.38379916548728943, + "memory(GiB)": 78.33, + "step": 3043, + "token_acc": 0.8879169618381741, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.5898367485346122, + "grad_norm": 0.09486231952905655, + "learning_rate": 0.00011813655578007181, + "loss": 0.3457268476486206, + "memory(GiB)": 78.33, + "step": 3044, + "token_acc": 0.8960932211584953, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.590030518819939, + "grad_norm": 0.09855726361274719, + "learning_rate": 0.00011804262430529727, + "loss": 0.33907079696655273, + "memory(GiB)": 78.33, + "step": 3045, + "token_acc": 0.8978774011737636, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.5902242891052657, + "grad_norm": 0.10438596457242966, + "learning_rate": 0.0001179487059562757, + "loss": 0.32949021458625793, + "memory(GiB)": 78.33, + "step": 3046, + "token_acc": 0.9006081617063784, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.5904180593905924, + "grad_norm": 0.10170579701662064, + "learning_rate": 0.0001178548007715818, + "loss": 0.3779696226119995, + "memory(GiB)": 78.33, + "step": 3047, + "token_acc": 0.887333483856401, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.5906118296759192, + "grad_norm": 0.10463224351406097, + "learning_rate": 0.000117760908789785, + "loss": 0.37798070907592773, + "memory(GiB)": 78.33, + "step": 3048, + "token_acc": 0.8907643133516514, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.5908055999612459, + "grad_norm": 0.09821716696023941, + "learning_rate": 0.00011766703004944934, + "loss": 0.3446789085865021, + "memory(GiB)": 78.33, + "step": 3049, + "token_acc": 0.8958164505672609, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.5909993702465727, + "grad_norm": 0.12338458746671677, + "learning_rate": 0.00011757316458913317, + "loss": 0.3765670657157898, + "memory(GiB)": 78.33, + "step": 3050, + "token_acc": 0.8903645271623579, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.5911931405318994, + "grad_norm": 0.10490836948156357, + "learning_rate": 0.00011747931244738973, + "loss": 0.3376193344593048, + "memory(GiB)": 78.33, + "step": 3051, + "token_acc": 0.8987225491603272, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.5913869108172262, + "grad_norm": 0.10726680606603622, + "learning_rate": 0.00011738547366276645, + "loss": 0.36437898874282837, + "memory(GiB)": 78.33, + "step": 3052, + "token_acc": 0.8919469627617164, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.5915806811025529, + "grad_norm": 0.10234256833791733, + "learning_rate": 0.00011729164827380557, + "loss": 0.335938960313797, + "memory(GiB)": 78.33, + "step": 3053, + "token_acc": 0.8999602227525855, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.5917744513878797, + "grad_norm": 0.09306668490171432, + "learning_rate": 0.00011719783631904362, + "loss": 0.3333013653755188, + "memory(GiB)": 78.33, + "step": 3054, + "token_acc": 0.8992164035860263, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.5919682216732064, + "grad_norm": 0.10699167847633362, + "learning_rate": 0.00011710403783701172, + "loss": 0.36375150084495544, + "memory(GiB)": 78.33, + "step": 3055, + "token_acc": 0.8928638271703965, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.5921619919585331, + "grad_norm": 0.1067671999335289, + "learning_rate": 0.0001170102528662355, + "loss": 0.37255626916885376, + "memory(GiB)": 78.33, + "step": 3056, + "token_acc": 0.8900267627942542, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.5923557622438599, + "grad_norm": 0.11016660928726196, + "learning_rate": 0.00011691648144523482, + "loss": 0.3919016420841217, + "memory(GiB)": 78.33, + "step": 3057, + "token_acc": 0.8844034568760513, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.5925495325291866, + "grad_norm": 0.10009429603815079, + "learning_rate": 0.00011682272361252423, + "loss": 0.32291102409362793, + "memory(GiB)": 78.33, + "step": 3058, + "token_acc": 0.9015109432285302, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.5927433028145134, + "grad_norm": 0.1145942285656929, + "learning_rate": 0.00011672897940661254, + "loss": 0.3807049095630646, + "memory(GiB)": 78.33, + "step": 3059, + "token_acc": 0.8866095219750233, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.5929370730998401, + "grad_norm": 0.12313009053468704, + "learning_rate": 0.00011663524886600309, + "loss": 0.40620920062065125, + "memory(GiB)": 78.33, + "step": 3060, + "token_acc": 0.8780752287360217, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.5931308433851669, + "grad_norm": 0.09408815950155258, + "learning_rate": 0.00011654153202919341, + "loss": 0.3425576388835907, + "memory(GiB)": 78.33, + "step": 3061, + "token_acc": 0.8985101178563487, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.5933246136704936, + "grad_norm": 0.09209151566028595, + "learning_rate": 0.00011644782893467559, + "loss": 0.2937813997268677, + "memory(GiB)": 78.33, + "step": 3062, + "token_acc": 0.9103508585965656, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.5935183839558204, + "grad_norm": 0.10529588907957077, + "learning_rate": 0.00011635413962093607, + "loss": 0.33743685483932495, + "memory(GiB)": 78.33, + "step": 3063, + "token_acc": 0.9011610773855131, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.5937121542411471, + "grad_norm": 0.10583112388849258, + "learning_rate": 0.00011626046412645546, + "loss": 0.384732186794281, + "memory(GiB)": 78.33, + "step": 3064, + "token_acc": 0.886413967142195, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.5939059245264738, + "grad_norm": 0.10110019147396088, + "learning_rate": 0.00011616680248970887, + "loss": 0.35833442211151123, + "memory(GiB)": 78.33, + "step": 3065, + "token_acc": 0.8952516405918607, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.5940996948118006, + "grad_norm": 0.10245722532272339, + "learning_rate": 0.0001160731547491656, + "loss": 0.39178815484046936, + "memory(GiB)": 78.33, + "step": 3066, + "token_acc": 0.8862267136788917, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.5942934650971273, + "grad_norm": 0.11041680723428726, + "learning_rate": 0.00011597952094328933, + "loss": 0.36065673828125, + "memory(GiB)": 78.33, + "step": 3067, + "token_acc": 0.8902277926345974, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.5944872353824541, + "grad_norm": 0.1108374372124672, + "learning_rate": 0.00011588590111053803, + "loss": 0.3488714396953583, + "memory(GiB)": 78.33, + "step": 3068, + "token_acc": 0.8970215739365883, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.5946810056677808, + "grad_norm": 0.09806838631629944, + "learning_rate": 0.00011579229528936375, + "loss": 0.3352605104446411, + "memory(GiB)": 78.33, + "step": 3069, + "token_acc": 0.9009188114555908, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.5948747759531076, + "grad_norm": 0.0958922728896141, + "learning_rate": 0.00011569870351821308, + "loss": 0.3334873616695404, + "memory(GiB)": 78.33, + "step": 3070, + "token_acc": 0.8991558523371551, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.5950685462384343, + "grad_norm": 0.08912528306245804, + "learning_rate": 0.00011560512583552649, + "loss": 0.2977232336997986, + "memory(GiB)": 78.33, + "step": 3071, + "token_acc": 0.9110933022487736, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.595262316523761, + "grad_norm": 0.11086632311344147, + "learning_rate": 0.000115511562279739, + "loss": 0.385816752910614, + "memory(GiB)": 78.33, + "step": 3072, + "token_acc": 0.8883094428021641, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.5954560868090878, + "grad_norm": 0.09204145520925522, + "learning_rate": 0.0001154180128892796, + "loss": 0.31663045287132263, + "memory(GiB)": 78.33, + "step": 3073, + "token_acc": 0.9048406026202965, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.5956498570944145, + "grad_norm": 0.11149836331605911, + "learning_rate": 0.00011532447770257153, + "loss": 0.4004219174385071, + "memory(GiB)": 78.33, + "step": 3074, + "token_acc": 0.883167884804189, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.5958436273797413, + "grad_norm": 0.10164281725883484, + "learning_rate": 0.00011523095675803232, + "loss": 0.3479783535003662, + "memory(GiB)": 78.33, + "step": 3075, + "token_acc": 0.8950982001203567, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.596037397665068, + "grad_norm": 0.09977372735738754, + "learning_rate": 0.00011513745009407339, + "loss": 0.34006303548812866, + "memory(GiB)": 78.33, + "step": 3076, + "token_acc": 0.8972231350640815, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.5962311679503948, + "grad_norm": 0.09573015570640564, + "learning_rate": 0.00011504395774910056, + "loss": 0.32083189487457275, + "memory(GiB)": 78.33, + "step": 3077, + "token_acc": 0.9044013992586017, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.5964249382357215, + "grad_norm": 0.10882751643657684, + "learning_rate": 0.00011495047976151352, + "loss": 0.35326480865478516, + "memory(GiB)": 78.33, + "step": 3078, + "token_acc": 0.8945277761309065, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.5966187085210483, + "grad_norm": 0.10612433403730392, + "learning_rate": 0.00011485701616970628, + "loss": 0.37883704900741577, + "memory(GiB)": 78.33, + "step": 3079, + "token_acc": 0.8884327457135958, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.596812478806375, + "grad_norm": 0.114121213555336, + "learning_rate": 0.00011476356701206683, + "loss": 0.3798381984233856, + "memory(GiB)": 78.33, + "step": 3080, + "token_acc": 0.887470047620492, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.5970062490917017, + "grad_norm": 0.09286189824342728, + "learning_rate": 0.00011467013232697721, + "loss": 0.30042147636413574, + "memory(GiB)": 78.33, + "step": 3081, + "token_acc": 0.9096791788565478, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.5972000193770285, + "grad_norm": 0.103480763733387, + "learning_rate": 0.00011457671215281367, + "loss": 0.35093623399734497, + "memory(GiB)": 78.33, + "step": 3082, + "token_acc": 0.8954995525533328, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.5973937896623552, + "grad_norm": 0.0931040421128273, + "learning_rate": 0.00011448330652794625, + "loss": 0.3155452013015747, + "memory(GiB)": 78.33, + "step": 3083, + "token_acc": 0.9038670039754246, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.597587559947682, + "grad_norm": 0.10366553068161011, + "learning_rate": 0.00011438991549073928, + "loss": 0.3622359037399292, + "memory(GiB)": 78.33, + "step": 3084, + "token_acc": 0.8940353881278539, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.5977813302330087, + "grad_norm": 0.09329306334257126, + "learning_rate": 0.00011429653907955083, + "loss": 0.3334829807281494, + "memory(GiB)": 78.33, + "step": 3085, + "token_acc": 0.9010285939268665, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.5979751005183355, + "grad_norm": 0.09546185284852982, + "learning_rate": 0.00011420317733273319, + "loss": 0.32923221588134766, + "memory(GiB)": 78.33, + "step": 3086, + "token_acc": 0.9020269413227124, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.5981688708036622, + "grad_norm": 0.09393859654664993, + "learning_rate": 0.00011410983028863249, + "loss": 0.315865159034729, + "memory(GiB)": 78.33, + "step": 3087, + "token_acc": 0.9042561262423184, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.598362641088989, + "grad_norm": 0.10992894321680069, + "learning_rate": 0.0001140164979855889, + "loss": 0.39187929034233093, + "memory(GiB)": 78.33, + "step": 3088, + "token_acc": 0.8844834915686552, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.5985564113743157, + "grad_norm": 0.10544363409280777, + "learning_rate": 0.00011392318046193656, + "loss": 0.3612055778503418, + "memory(GiB)": 78.33, + "step": 3089, + "token_acc": 0.8903434157031227, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.5987501816596424, + "grad_norm": 0.10547579824924469, + "learning_rate": 0.00011382987775600336, + "loss": 0.37269455194473267, + "memory(GiB)": 78.33, + "step": 3090, + "token_acc": 0.8905209503713244, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.5989439519449692, + "grad_norm": 0.10508310049772263, + "learning_rate": 0.00011373658990611134, + "loss": 0.36013132333755493, + "memory(GiB)": 78.33, + "step": 3091, + "token_acc": 0.8923727863707689, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.5991377222302959, + "grad_norm": 0.10769172757863998, + "learning_rate": 0.00011364331695057627, + "loss": 0.36448052525520325, + "memory(GiB)": 78.33, + "step": 3092, + "token_acc": 0.8934169278996865, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.5993314925156227, + "grad_norm": 0.10621456056833267, + "learning_rate": 0.00011355005892770788, + "loss": 0.35971012711524963, + "memory(GiB)": 78.33, + "step": 3093, + "token_acc": 0.8954783003862758, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.5995252628009494, + "grad_norm": 0.10547253489494324, + "learning_rate": 0.00011345681587580971, + "loss": 0.38230541348457336, + "memory(GiB)": 78.33, + "step": 3094, + "token_acc": 0.8890573803785643, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.5997190330862762, + "grad_norm": 0.11314172297716141, + "learning_rate": 0.00011336358783317918, + "loss": 0.3828127086162567, + "memory(GiB)": 78.33, + "step": 3095, + "token_acc": 0.8876047830374754, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.5999128033716029, + "grad_norm": 0.1083948016166687, + "learning_rate": 0.00011327037483810767, + "loss": 0.34444913268089294, + "memory(GiB)": 78.33, + "step": 3096, + "token_acc": 0.8984524686809138, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.6001065736569298, + "grad_norm": 0.1084941178560257, + "learning_rate": 0.00011317717692888012, + "loss": 0.35603395104408264, + "memory(GiB)": 78.33, + "step": 3097, + "token_acc": 0.8943460416891673, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.6003003439422565, + "grad_norm": 0.10329198837280273, + "learning_rate": 0.0001130839941437755, + "loss": 0.3441343903541565, + "memory(GiB)": 78.33, + "step": 3098, + "token_acc": 0.8977043317759704, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.6004941142275833, + "grad_norm": 0.10437928140163422, + "learning_rate": 0.00011299082652106642, + "loss": 0.32754242420196533, + "memory(GiB)": 78.33, + "step": 3099, + "token_acc": 0.9024354801209021, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.60068788451291, + "grad_norm": 0.10441198199987411, + "learning_rate": 0.00011289767409901936, + "loss": 0.34080564975738525, + "memory(GiB)": 78.33, + "step": 3100, + "token_acc": 0.9004198514371837, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.6008816547982367, + "grad_norm": 0.09256397187709808, + "learning_rate": 0.00011280453691589461, + "loss": 0.34055426716804504, + "memory(GiB)": 78.33, + "step": 3101, + "token_acc": 0.8970888269749687, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.6010754250835635, + "grad_norm": 0.10573244839906693, + "learning_rate": 0.00011271141500994595, + "loss": 0.37066394090652466, + "memory(GiB)": 78.33, + "step": 3102, + "token_acc": 0.890913167778463, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.6012691953688902, + "grad_norm": 0.1301407516002655, + "learning_rate": 0.0001126183084194212, + "loss": 0.34675294160842896, + "memory(GiB)": 78.33, + "step": 3103, + "token_acc": 0.8968794248992484, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.601462965654217, + "grad_norm": 0.09512253850698471, + "learning_rate": 0.00011252521718256159, + "loss": 0.34905996918678284, + "memory(GiB)": 78.33, + "step": 3104, + "token_acc": 0.8967058594888738, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.6016567359395437, + "grad_norm": 0.11211273819208145, + "learning_rate": 0.0001124321413376023, + "loss": 0.3654071092605591, + "memory(GiB)": 78.33, + "step": 3105, + "token_acc": 0.8934378629500581, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.6018505062248705, + "grad_norm": 0.10533998906612396, + "learning_rate": 0.00011233908092277203, + "loss": 0.3517376780509949, + "memory(GiB)": 78.33, + "step": 3106, + "token_acc": 0.8942503737936659, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.6020442765101972, + "grad_norm": 0.11167991906404495, + "learning_rate": 0.00011224603597629322, + "loss": 0.3245575428009033, + "memory(GiB)": 78.33, + "step": 3107, + "token_acc": 0.9026755252289459, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.602238046795524, + "grad_norm": 0.11581658571958542, + "learning_rate": 0.00011215300653638199, + "loss": 0.36745405197143555, + "memory(GiB)": 78.33, + "step": 3108, + "token_acc": 0.892201612120035, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.6024318170808507, + "grad_norm": 0.10512839257717133, + "learning_rate": 0.00011205999264124786, + "loss": 0.3423347771167755, + "memory(GiB)": 78.33, + "step": 3109, + "token_acc": 0.8962487920583326, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.6026255873661774, + "grad_norm": 0.10966593772172928, + "learning_rate": 0.00011196699432909435, + "loss": 0.3704321086406708, + "memory(GiB)": 78.33, + "step": 3110, + "token_acc": 0.8911972524843059, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.6028193576515042, + "grad_norm": 0.09262005239725113, + "learning_rate": 0.00011187401163811816, + "loss": 0.31054481863975525, + "memory(GiB)": 78.33, + "step": 3111, + "token_acc": 0.9069608679751311, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.6030131279368309, + "grad_norm": 0.11650776118040085, + "learning_rate": 0.00011178104460650993, + "loss": 0.40273237228393555, + "memory(GiB)": 78.33, + "step": 3112, + "token_acc": 0.8809349247778527, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.6032068982221577, + "grad_norm": 0.10926227271556854, + "learning_rate": 0.00011168809327245361, + "loss": 0.3614426851272583, + "memory(GiB)": 78.33, + "step": 3113, + "token_acc": 0.8933805533961015, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.6034006685074844, + "grad_norm": 0.09592621773481369, + "learning_rate": 0.00011159515767412688, + "loss": 0.3322632312774658, + "memory(GiB)": 78.33, + "step": 3114, + "token_acc": 0.9001508923461158, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.6035944387928112, + "grad_norm": 0.09553615748882294, + "learning_rate": 0.00011150223784970092, + "loss": 0.31129273772239685, + "memory(GiB)": 78.33, + "step": 3115, + "token_acc": 0.9059551739629611, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.6037882090781379, + "grad_norm": 0.113133504986763, + "learning_rate": 0.0001114093338373403, + "loss": 0.38860899209976196, + "memory(GiB)": 78.33, + "step": 3116, + "token_acc": 0.885463585843814, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.6039819793634646, + "grad_norm": 0.09849842637777328, + "learning_rate": 0.0001113164456752033, + "loss": 0.3431742489337921, + "memory(GiB)": 78.33, + "step": 3117, + "token_acc": 0.8975163928200713, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.6041757496487914, + "grad_norm": 0.10167428106069565, + "learning_rate": 0.00011122357340144148, + "loss": 0.37376028299331665, + "memory(GiB)": 78.33, + "step": 3118, + "token_acc": 0.8888616757830571, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.6043695199341181, + "grad_norm": 0.11035677045583725, + "learning_rate": 0.00011113071705420004, + "loss": 0.36835113167762756, + "memory(GiB)": 78.33, + "step": 3119, + "token_acc": 0.8914904111178809, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.6045632902194449, + "grad_norm": 0.10013226419687271, + "learning_rate": 0.00011103787667161753, + "loss": 0.3682772219181061, + "memory(GiB)": 78.33, + "step": 3120, + "token_acc": 0.8906798812862472, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.6047570605047716, + "grad_norm": 0.10240922123193741, + "learning_rate": 0.00011094505229182605, + "loss": 0.33320745825767517, + "memory(GiB)": 78.33, + "step": 3121, + "token_acc": 0.8995880980499467, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.6049508307900984, + "grad_norm": 0.10636462271213531, + "learning_rate": 0.00011085224395295109, + "loss": 0.3461098074913025, + "memory(GiB)": 78.33, + "step": 3122, + "token_acc": 0.8970420099088271, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.6051446010754251, + "grad_norm": 0.15055952966213226, + "learning_rate": 0.00011075945169311141, + "loss": 0.4095023572444916, + "memory(GiB)": 78.33, + "step": 3123, + "token_acc": 0.8795589223532286, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.6053383713607519, + "grad_norm": 0.10108061134815216, + "learning_rate": 0.00011066667555041942, + "loss": 0.33705589175224304, + "memory(GiB)": 78.33, + "step": 3124, + "token_acc": 0.8990451697627971, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.6055321416460786, + "grad_norm": 0.10497518628835678, + "learning_rate": 0.00011057391556298065, + "loss": 0.3509282171726227, + "memory(GiB)": 78.33, + "step": 3125, + "token_acc": 0.8944882780464354, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.6057259119314053, + "grad_norm": 0.0899055078625679, + "learning_rate": 0.0001104811717688942, + "loss": 0.3112187385559082, + "memory(GiB)": 78.33, + "step": 3126, + "token_acc": 0.9061983165066279, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.6059196822167321, + "grad_norm": 0.10332870483398438, + "learning_rate": 0.00011038844420625239, + "loss": 0.40198662877082825, + "memory(GiB)": 78.33, + "step": 3127, + "token_acc": 0.8823683727275112, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.6061134525020588, + "grad_norm": 0.1093793734908104, + "learning_rate": 0.00011029573291314094, + "loss": 0.36452385783195496, + "memory(GiB)": 78.33, + "step": 3128, + "token_acc": 0.8916964258920266, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.6063072227873856, + "grad_norm": 0.10296124219894409, + "learning_rate": 0.00011020303792763896, + "loss": 0.31651031970977783, + "memory(GiB)": 78.33, + "step": 3129, + "token_acc": 0.9070627097315436, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.6065009930727123, + "grad_norm": 0.10293302685022354, + "learning_rate": 0.00011011035928781861, + "loss": 0.34327608346939087, + "memory(GiB)": 78.33, + "step": 3130, + "token_acc": 0.896756412974348, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.6066947633580391, + "grad_norm": 0.1089029610157013, + "learning_rate": 0.00011001769703174564, + "loss": 0.3363596796989441, + "memory(GiB)": 78.33, + "step": 3131, + "token_acc": 0.9004472523691865, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.6068885336433658, + "grad_norm": 0.10041207820177078, + "learning_rate": 0.0001099250511974788, + "loss": 0.3575053811073303, + "memory(GiB)": 78.33, + "step": 3132, + "token_acc": 0.8955265421790395, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.6070823039286926, + "grad_norm": 0.11202222853899002, + "learning_rate": 0.00010983242182307032, + "loss": 0.363272488117218, + "memory(GiB)": 78.33, + "step": 3133, + "token_acc": 0.8899773926149209, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.6072760742140193, + "grad_norm": 0.10849933326244354, + "learning_rate": 0.00010973980894656555, + "loss": 0.3696746826171875, + "memory(GiB)": 78.33, + "step": 3134, + "token_acc": 0.8896282494898412, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.607469844499346, + "grad_norm": 0.10315241664648056, + "learning_rate": 0.00010964721260600305, + "loss": 0.34507814049720764, + "memory(GiB)": 78.33, + "step": 3135, + "token_acc": 0.897193022987178, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.6076636147846728, + "grad_norm": 0.11026407033205032, + "learning_rate": 0.00010955463283941472, + "loss": 0.3476986289024353, + "memory(GiB)": 78.33, + "step": 3136, + "token_acc": 0.8966703526231724, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.6078573850699995, + "grad_norm": 0.09877464920282364, + "learning_rate": 0.00010946206968482542, + "loss": 0.33894383907318115, + "memory(GiB)": 78.33, + "step": 3137, + "token_acc": 0.8988343465809174, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.6080511553553263, + "grad_norm": 0.09642918407917023, + "learning_rate": 0.00010936952318025344, + "loss": 0.3351293206214905, + "memory(GiB)": 78.33, + "step": 3138, + "token_acc": 0.8997663427327279, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.608244925640653, + "grad_norm": 0.10105688869953156, + "learning_rate": 0.00010927699336371003, + "loss": 0.32609879970550537, + "memory(GiB)": 78.33, + "step": 3139, + "token_acc": 0.9029436501261564, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.6084386959259798, + "grad_norm": 0.10417565703392029, + "learning_rate": 0.00010918448027319971, + "loss": 0.35680675506591797, + "memory(GiB)": 78.33, + "step": 3140, + "token_acc": 0.8950784207679827, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.6086324662113065, + "grad_norm": 0.10269538313150406, + "learning_rate": 0.00010909198394672018, + "loss": 0.36641865968704224, + "memory(GiB)": 78.33, + "step": 3141, + "token_acc": 0.8919290565190549, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.6088262364966333, + "grad_norm": 0.10089493542909622, + "learning_rate": 0.000108999504422262, + "loss": 0.37600627541542053, + "memory(GiB)": 78.33, + "step": 3142, + "token_acc": 0.8877864583333334, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.60902000678196, + "grad_norm": 0.10720504820346832, + "learning_rate": 0.00010890704173780916, + "loss": 0.35292261838912964, + "memory(GiB)": 78.33, + "step": 3143, + "token_acc": 0.8938151494093121, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.6092137770672867, + "grad_norm": 0.09673202037811279, + "learning_rate": 0.00010881459593133842, + "loss": 0.32286348938941956, + "memory(GiB)": 78.33, + "step": 3144, + "token_acc": 0.9035548686244204, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.6094075473526135, + "grad_norm": 0.10169202834367752, + "learning_rate": 0.00010872216704081986, + "loss": 0.345810204744339, + "memory(GiB)": 78.33, + "step": 3145, + "token_acc": 0.8962894744659993, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.6096013176379402, + "grad_norm": 0.09147030860185623, + "learning_rate": 0.00010862975510421642, + "loss": 0.30978208780288696, + "memory(GiB)": 78.33, + "step": 3146, + "token_acc": 0.9076807434741014, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.609795087923267, + "grad_norm": 0.09909648448228836, + "learning_rate": 0.00010853736015948425, + "loss": 0.339855819940567, + "memory(GiB)": 78.33, + "step": 3147, + "token_acc": 0.8987607001405391, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.6099888582085937, + "grad_norm": 0.21334044635295868, + "learning_rate": 0.00010844498224457246, + "loss": 0.34740781784057617, + "memory(GiB)": 78.33, + "step": 3148, + "token_acc": 0.8966986427186447, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.6101826284939205, + "grad_norm": 0.11788555234670639, + "learning_rate": 0.00010835262139742303, + "loss": 0.36575233936309814, + "memory(GiB)": 78.33, + "step": 3149, + "token_acc": 0.8924587929111414, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.6103763987792472, + "grad_norm": 0.09829849749803543, + "learning_rate": 0.00010826027765597116, + "loss": 0.3394019901752472, + "memory(GiB)": 78.33, + "step": 3150, + "token_acc": 0.8985713198162762, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.610570169064574, + "grad_norm": 0.1058797761797905, + "learning_rate": 0.00010816795105814479, + "loss": 0.3209468424320221, + "memory(GiB)": 78.33, + "step": 3151, + "token_acc": 0.9035917319430982, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.6107639393499007, + "grad_norm": 0.09663695096969604, + "learning_rate": 0.000108075641641865, + "loss": 0.3185003399848938, + "memory(GiB)": 78.33, + "step": 3152, + "token_acc": 0.904547132985177, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.6109577096352274, + "grad_norm": 0.09900177270174026, + "learning_rate": 0.00010798334944504572, + "loss": 0.3523489832878113, + "memory(GiB)": 78.33, + "step": 3153, + "token_acc": 0.8945492208011177, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.6111514799205542, + "grad_norm": 0.11104562878608704, + "learning_rate": 0.00010789107450559386, + "loss": 0.37791165709495544, + "memory(GiB)": 78.33, + "step": 3154, + "token_acc": 0.8904507301368287, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.6113452502058809, + "grad_norm": 0.11416266113519669, + "learning_rate": 0.00010779881686140927, + "loss": 0.34979140758514404, + "memory(GiB)": 78.33, + "step": 3155, + "token_acc": 0.895737220889463, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.6115390204912077, + "grad_norm": 0.10298382490873337, + "learning_rate": 0.00010770657655038453, + "loss": 0.339542031288147, + "memory(GiB)": 78.33, + "step": 3156, + "token_acc": 0.8982814065053508, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.6117327907765344, + "grad_norm": 0.09547599405050278, + "learning_rate": 0.00010761435361040531, + "loss": 0.36547231674194336, + "memory(GiB)": 78.33, + "step": 3157, + "token_acc": 0.8895893839578759, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.6119265610618612, + "grad_norm": 0.10469914972782135, + "learning_rate": 0.00010752214807934996, + "loss": 0.31865012645721436, + "memory(GiB)": 78.33, + "step": 3158, + "token_acc": 0.9054883979435876, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.6121203313471879, + "grad_norm": 0.10441134870052338, + "learning_rate": 0.00010742995999508987, + "loss": 0.36503875255584717, + "memory(GiB)": 78.33, + "step": 3159, + "token_acc": 0.8934720034758051, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.6123141016325147, + "grad_norm": 0.10514453798532486, + "learning_rate": 0.00010733778939548905, + "loss": 0.3756176829338074, + "memory(GiB)": 78.33, + "step": 3160, + "token_acc": 0.8889959795120339, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.6125078719178414, + "grad_norm": 0.10831741243600845, + "learning_rate": 0.00010724563631840451, + "loss": 0.39807701110839844, + "memory(GiB)": 78.33, + "step": 3161, + "token_acc": 0.8838846894490271, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.6127016422031681, + "grad_norm": 0.10723893344402313, + "learning_rate": 0.00010715350080168606, + "loss": 0.38960257172584534, + "memory(GiB)": 78.33, + "step": 3162, + "token_acc": 0.8852941176470588, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.6128954124884949, + "grad_norm": 0.10730850696563721, + "learning_rate": 0.00010706138288317609, + "loss": 0.3630613386631012, + "memory(GiB)": 78.33, + "step": 3163, + "token_acc": 0.8937855052504734, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.6130891827738216, + "grad_norm": 0.09366155415773392, + "learning_rate": 0.00010696928260070999, + "loss": 0.3480615019798279, + "memory(GiB)": 78.33, + "step": 3164, + "token_acc": 0.8973462201951934, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.6132829530591484, + "grad_norm": 0.10169912874698639, + "learning_rate": 0.00010687719999211583, + "loss": 0.36037591099739075, + "memory(GiB)": 78.33, + "step": 3165, + "token_acc": 0.892155500597438, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.6134767233444751, + "grad_norm": 0.10275442153215408, + "learning_rate": 0.00010678513509521435, + "loss": 0.3725361227989197, + "memory(GiB)": 78.33, + "step": 3166, + "token_acc": 0.8887706641060908, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.6136704936298019, + "grad_norm": 0.10723954439163208, + "learning_rate": 0.00010669308794781914, + "loss": 0.3526693880558014, + "memory(GiB)": 78.33, + "step": 3167, + "token_acc": 0.8959228785459904, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.6138642639151286, + "grad_norm": 0.1202649399638176, + "learning_rate": 0.0001066010585877364, + "loss": 0.4182667136192322, + "memory(GiB)": 78.33, + "step": 3168, + "token_acc": 0.8773098963363596, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.6140580342004553, + "grad_norm": 0.1005726233124733, + "learning_rate": 0.00010650904705276513, + "loss": 0.33930516242980957, + "memory(GiB)": 78.33, + "step": 3169, + "token_acc": 0.8973083132625784, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.6142518044857821, + "grad_norm": 0.10162294656038284, + "learning_rate": 0.0001064170533806968, + "loss": 0.3408072292804718, + "memory(GiB)": 78.33, + "step": 3170, + "token_acc": 0.8978023358030893, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.6144455747711088, + "grad_norm": 0.0949673056602478, + "learning_rate": 0.00010632507760931581, + "loss": 0.3130902349948883, + "memory(GiB)": 78.33, + "step": 3171, + "token_acc": 0.9057638586329737, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.6146393450564356, + "grad_norm": 0.10987738519906998, + "learning_rate": 0.000106233119776399, + "loss": 0.35283100605010986, + "memory(GiB)": 78.33, + "step": 3172, + "token_acc": 0.8905293376688052, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.6148331153417623, + "grad_norm": 0.10509679466485977, + "learning_rate": 0.00010614117991971598, + "loss": 0.36292973160743713, + "memory(GiB)": 78.33, + "step": 3173, + "token_acc": 0.8947354138398914, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.6150268856270891, + "grad_norm": 0.1026267558336258, + "learning_rate": 0.00010604925807702895, + "loss": 0.3576149046421051, + "memory(GiB)": 78.33, + "step": 3174, + "token_acc": 0.8960543506663182, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.6152206559124158, + "grad_norm": 0.1011863574385643, + "learning_rate": 0.00010595735428609256, + "loss": 0.3429381251335144, + "memory(GiB)": 78.33, + "step": 3175, + "token_acc": 0.8972618182729163, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.6154144261977426, + "grad_norm": 0.09697016328573227, + "learning_rate": 0.0001058654685846543, + "loss": 0.3507676422595978, + "memory(GiB)": 78.33, + "step": 3176, + "token_acc": 0.8970009395870896, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.6156081964830693, + "grad_norm": 0.0992831438779831, + "learning_rate": 0.00010577360101045396, + "loss": 0.3351018726825714, + "memory(GiB)": 78.33, + "step": 3177, + "token_acc": 0.8992732671719934, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.615801966768396, + "grad_norm": 0.1062702089548111, + "learning_rate": 0.00010568175160122414, + "loss": 0.38858115673065186, + "memory(GiB)": 78.33, + "step": 3178, + "token_acc": 0.8845210155749946, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.6159957370537228, + "grad_norm": 0.09542369842529297, + "learning_rate": 0.00010558992039468979, + "loss": 0.32729557156562805, + "memory(GiB)": 78.33, + "step": 3179, + "token_acc": 0.9012937964120754, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.6161895073390495, + "grad_norm": 0.09666220843791962, + "learning_rate": 0.00010549810742856847, + "loss": 0.3226274847984314, + "memory(GiB)": 78.33, + "step": 3180, + "token_acc": 0.9038066656136471, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.6163832776243763, + "grad_norm": 0.12249480187892914, + "learning_rate": 0.0001054063127405703, + "loss": 0.4036695063114166, + "memory(GiB)": 78.33, + "step": 3181, + "token_acc": 0.8821568293927539, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.616577047909703, + "grad_norm": 0.10205438733100891, + "learning_rate": 0.00010531453636839771, + "loss": 0.3525814116001129, + "memory(GiB)": 78.33, + "step": 3182, + "token_acc": 0.8946187098204389, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.6167708181950298, + "grad_norm": 0.10525421798229218, + "learning_rate": 0.00010522277834974585, + "loss": 0.3661832809448242, + "memory(GiB)": 78.33, + "step": 3183, + "token_acc": 0.8905501755879536, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.6169645884803565, + "grad_norm": 0.09824883937835693, + "learning_rate": 0.00010513103872230206, + "loss": 0.3395775854587555, + "memory(GiB)": 78.33, + "step": 3184, + "token_acc": 0.8979004582756048, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.6171583587656833, + "grad_norm": 0.10357562452554703, + "learning_rate": 0.00010503931752374637, + "loss": 0.330016553401947, + "memory(GiB)": 78.33, + "step": 3185, + "token_acc": 0.903437815975733, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.61735212905101, + "grad_norm": 0.09775417298078537, + "learning_rate": 0.00010494761479175107, + "loss": 0.3158339262008667, + "memory(GiB)": 78.33, + "step": 3186, + "token_acc": 0.9043734015345268, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.6175458993363367, + "grad_norm": 0.09946906566619873, + "learning_rate": 0.000104855930563981, + "loss": 0.3278699517250061, + "memory(GiB)": 78.33, + "step": 3187, + "token_acc": 0.9031324520176881, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.6177396696216635, + "grad_norm": 0.09458330273628235, + "learning_rate": 0.00010476426487809338, + "loss": 0.3177909255027771, + "memory(GiB)": 78.33, + "step": 3188, + "token_acc": 0.9047277202338222, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.6179334399069902, + "grad_norm": 0.09574276953935623, + "learning_rate": 0.00010467261777173763, + "loss": 0.3308386206626892, + "memory(GiB)": 78.33, + "step": 3189, + "token_acc": 0.9015635322616163, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.618127210192317, + "grad_norm": 0.12292765825986862, + "learning_rate": 0.00010458098928255584, + "loss": 0.3934580385684967, + "memory(GiB)": 78.33, + "step": 3190, + "token_acc": 0.8843327802796871, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.6183209804776437, + "grad_norm": 0.10484007745981216, + "learning_rate": 0.00010448937944818211, + "loss": 0.33495286107063293, + "memory(GiB)": 78.33, + "step": 3191, + "token_acc": 0.9003335885376791, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.6185147507629705, + "grad_norm": 0.09170569479465485, + "learning_rate": 0.00010439778830624321, + "loss": 0.32464128732681274, + "memory(GiB)": 78.33, + "step": 3192, + "token_acc": 0.9023982419262373, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.6187085210482972, + "grad_norm": 0.10959605127573013, + "learning_rate": 0.00010430621589435801, + "loss": 0.36965322494506836, + "memory(GiB)": 78.33, + "step": 3193, + "token_acc": 0.8880881729546418, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.618902291333624, + "grad_norm": 0.10908178985118866, + "learning_rate": 0.00010421466225013776, + "loss": 0.39024633169174194, + "memory(GiB)": 78.33, + "step": 3194, + "token_acc": 0.8856543263021954, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.6190960616189507, + "grad_norm": 0.10703379660844803, + "learning_rate": 0.0001041231274111861, + "loss": 0.3596927523612976, + "memory(GiB)": 78.33, + "step": 3195, + "token_acc": 0.8922619047619048, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.6192898319042774, + "grad_norm": 0.10117658227682114, + "learning_rate": 0.00010403161141509872, + "loss": 0.34861043095588684, + "memory(GiB)": 78.33, + "step": 3196, + "token_acc": 0.89493396959272, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.6194836021896042, + "grad_norm": 0.09983016550540924, + "learning_rate": 0.0001039401142994638, + "loss": 0.34936952590942383, + "memory(GiB)": 78.33, + "step": 3197, + "token_acc": 0.8953481619141646, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.6196773724749309, + "grad_norm": 0.08768882602453232, + "learning_rate": 0.00010384863610186155, + "loss": 0.3130777180194855, + "memory(GiB)": 78.33, + "step": 3198, + "token_acc": 0.9038072289156627, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.6198711427602577, + "grad_norm": 0.09570645540952682, + "learning_rate": 0.00010375717685986459, + "loss": 0.31712067127227783, + "memory(GiB)": 78.33, + "step": 3199, + "token_acc": 0.9068876881917851, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.6200649130455844, + "grad_norm": 0.09237085282802582, + "learning_rate": 0.0001036657366110377, + "loss": 0.33153122663497925, + "memory(GiB)": 78.33, + "step": 3200, + "token_acc": 0.9015142538922724, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.6202586833309112, + "grad_norm": 0.11354105174541473, + "learning_rate": 0.00010357431539293784, + "loss": 0.3915308713912964, + "memory(GiB)": 78.33, + "step": 3201, + "token_acc": 0.8843476653936183, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.6204524536162379, + "grad_norm": 0.09536038339138031, + "learning_rate": 0.00010348291324311418, + "loss": 0.3190258741378784, + "memory(GiB)": 78.33, + "step": 3202, + "token_acc": 0.9054259501965924, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.6206462239015647, + "grad_norm": 0.10331059992313385, + "learning_rate": 0.00010339153019910797, + "loss": 0.34562522172927856, + "memory(GiB)": 78.33, + "step": 3203, + "token_acc": 0.8966327964544374, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.6208399941868914, + "grad_norm": 0.1046903133392334, + "learning_rate": 0.00010330016629845274, + "loss": 0.3687852919101715, + "memory(GiB)": 78.33, + "step": 3204, + "token_acc": 0.8896271888020478, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.6210337644722181, + "grad_norm": 0.12125846743583679, + "learning_rate": 0.00010320882157867408, + "loss": 0.32853639125823975, + "memory(GiB)": 78.33, + "step": 3205, + "token_acc": 0.9024209282514908, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.6212275347575449, + "grad_norm": 0.09862146526575089, + "learning_rate": 0.00010311749607728976, + "loss": 0.3248175084590912, + "memory(GiB)": 78.33, + "step": 3206, + "token_acc": 0.9051060112761058, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.6214213050428716, + "grad_norm": 0.10406461358070374, + "learning_rate": 0.00010302618983180955, + "loss": 0.3480234742164612, + "memory(GiB)": 78.33, + "step": 3207, + "token_acc": 0.8964719652212685, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.6216150753281984, + "grad_norm": 0.0927383154630661, + "learning_rate": 0.00010293490287973539, + "loss": 0.31758391857147217, + "memory(GiB)": 78.33, + "step": 3208, + "token_acc": 0.9062631046646439, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.6218088456135251, + "grad_norm": 0.10273189842700958, + "learning_rate": 0.00010284363525856138, + "loss": 0.34117552638053894, + "memory(GiB)": 78.33, + "step": 3209, + "token_acc": 0.897176918619091, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.6220026158988519, + "grad_norm": 0.10961279273033142, + "learning_rate": 0.00010275238700577344, + "loss": 0.3611040711402893, + "memory(GiB)": 78.33, + "step": 3210, + "token_acc": 0.8908705991531813, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.6221963861841786, + "grad_norm": 0.11966651678085327, + "learning_rate": 0.00010266115815884978, + "loss": 0.3134154975414276, + "memory(GiB)": 78.33, + "step": 3211, + "token_acc": 0.9054380748603662, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.6223901564695054, + "grad_norm": 0.10242405533790588, + "learning_rate": 0.00010256994875526048, + "loss": 0.3505587577819824, + "memory(GiB)": 78.33, + "step": 3212, + "token_acc": 0.8964926844453668, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.6225839267548321, + "grad_norm": 0.09929542243480682, + "learning_rate": 0.0001024787588324677, + "loss": 0.3694930672645569, + "memory(GiB)": 78.33, + "step": 3213, + "token_acc": 0.8909321660752674, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.6227776970401588, + "grad_norm": 0.11470238864421844, + "learning_rate": 0.00010238758842792571, + "loss": 0.34870725870132446, + "memory(GiB)": 78.33, + "step": 3214, + "token_acc": 0.8959076482318906, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.6229714673254856, + "grad_norm": 0.10671926289796829, + "learning_rate": 0.00010229643757908047, + "loss": 0.34584519267082214, + "memory(GiB)": 78.33, + "step": 3215, + "token_acc": 0.8956783512316554, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.6231652376108123, + "grad_norm": 0.10204023122787476, + "learning_rate": 0.00010220530632337022, + "loss": 0.33685633540153503, + "memory(GiB)": 78.33, + "step": 3216, + "token_acc": 0.8971209005303604, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.6233590078961391, + "grad_norm": 0.09859669208526611, + "learning_rate": 0.0001021141946982249, + "loss": 0.33895328640937805, + "memory(GiB)": 78.33, + "step": 3217, + "token_acc": 0.8996550051210177, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.6235527781814659, + "grad_norm": 0.09662448614835739, + "learning_rate": 0.00010202310274106659, + "loss": 0.3390568494796753, + "memory(GiB)": 78.33, + "step": 3218, + "token_acc": 0.8996795034847637, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.6237465484667927, + "grad_norm": 0.12932774424552917, + "learning_rate": 0.00010193203048930914, + "loss": 0.3782866597175598, + "memory(GiB)": 78.33, + "step": 3219, + "token_acc": 0.8888921265808031, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.6239403187521194, + "grad_norm": 0.09401915222406387, + "learning_rate": 0.0001018409779803584, + "loss": 0.29952648282051086, + "memory(GiB)": 78.33, + "step": 3220, + "token_acc": 0.9101730793176441, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.6241340890374462, + "grad_norm": 0.09950324892997742, + "learning_rate": 0.00010174994525161215, + "loss": 0.32064443826675415, + "memory(GiB)": 78.33, + "step": 3221, + "token_acc": 0.9026459405633293, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.6243278593227729, + "grad_norm": 0.11111953109502792, + "learning_rate": 0.00010165893234045988, + "loss": 0.3619105815887451, + "memory(GiB)": 78.33, + "step": 3222, + "token_acc": 0.8926043878273178, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.6245216296080996, + "grad_norm": 0.10427284240722656, + "learning_rate": 0.0001015679392842831, + "loss": 0.3563496768474579, + "memory(GiB)": 78.33, + "step": 3223, + "token_acc": 0.8932768896563249, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.6247153998934264, + "grad_norm": 0.10940185189247131, + "learning_rate": 0.00010147696612045502, + "loss": 0.35510045289993286, + "memory(GiB)": 78.33, + "step": 3224, + "token_acc": 0.892183207930252, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.6249091701787531, + "grad_norm": 0.10383673757314682, + "learning_rate": 0.00010138601288634085, + "loss": 0.34664323925971985, + "memory(GiB)": 78.33, + "step": 3225, + "token_acc": 0.897200460505995, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.6251029404640799, + "grad_norm": 0.1071397215127945, + "learning_rate": 0.00010129507961929748, + "loss": 0.3815678358078003, + "memory(GiB)": 78.33, + "step": 3226, + "token_acc": 0.8863624317591562, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.6252967107494066, + "grad_norm": 0.09263280034065247, + "learning_rate": 0.00010120416635667364, + "loss": 0.30259522795677185, + "memory(GiB)": 78.33, + "step": 3227, + "token_acc": 0.9077235976497765, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.6254904810347334, + "grad_norm": 0.12750935554504395, + "learning_rate": 0.00010111327313580994, + "loss": 0.3879462480545044, + "memory(GiB)": 78.33, + "step": 3228, + "token_acc": 0.8852969484232577, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.6256842513200601, + "grad_norm": 0.09431038796901703, + "learning_rate": 0.00010102239999403857, + "loss": 0.3204158842563629, + "memory(GiB)": 78.33, + "step": 3229, + "token_acc": 0.9041399892865342, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.6258780216053869, + "grad_norm": 0.1006726399064064, + "learning_rate": 0.00010093154696868362, + "loss": 0.34341204166412354, + "memory(GiB)": 78.33, + "step": 3230, + "token_acc": 0.8983400146801422, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.6260717918907136, + "grad_norm": 0.09969964623451233, + "learning_rate": 0.0001008407140970608, + "loss": 0.318872332572937, + "memory(GiB)": 78.33, + "step": 3231, + "token_acc": 0.9055216821471309, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.6262655621760403, + "grad_norm": 0.1142285168170929, + "learning_rate": 0.00010074990141647767, + "loss": 0.33352869749069214, + "memory(GiB)": 78.33, + "step": 3232, + "token_acc": 0.9006948304613674, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.6264593324613671, + "grad_norm": 0.10268845409154892, + "learning_rate": 0.00010065910896423346, + "loss": 0.33974435925483704, + "memory(GiB)": 78.33, + "step": 3233, + "token_acc": 0.8981594097137761, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.6266531027466938, + "grad_norm": 0.11217531561851501, + "learning_rate": 0.000100568336777619, + "loss": 0.40757495164871216, + "memory(GiB)": 78.33, + "step": 3234, + "token_acc": 0.8796624013068336, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.6268468730320206, + "grad_norm": 0.11023017019033432, + "learning_rate": 0.00010047758489391698, + "loss": 0.37037602066993713, + "memory(GiB)": 78.33, + "step": 3235, + "token_acc": 0.8889917278880083, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.6270406433173473, + "grad_norm": 0.09839659184217453, + "learning_rate": 0.00010038685335040149, + "loss": 0.330047607421875, + "memory(GiB)": 78.33, + "step": 3236, + "token_acc": 0.9030212027856163, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.6272344136026741, + "grad_norm": 0.10475650429725647, + "learning_rate": 0.00010029614218433851, + "loss": 0.3539055585861206, + "memory(GiB)": 78.33, + "step": 3237, + "token_acc": 0.8929911208269236, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.6274281838880008, + "grad_norm": 0.11595990508794785, + "learning_rate": 0.00010020545143298555, + "loss": 0.3532840311527252, + "memory(GiB)": 78.33, + "step": 3238, + "token_acc": 0.8959765685216587, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.6276219541733276, + "grad_norm": 0.0995573177933693, + "learning_rate": 0.0001001147811335917, + "loss": 0.3350241482257843, + "memory(GiB)": 78.33, + "step": 3239, + "token_acc": 0.900135550084373, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.6278157244586543, + "grad_norm": 0.10098245739936829, + "learning_rate": 0.0001000241313233977, + "loss": 0.3346031606197357, + "memory(GiB)": 78.33, + "step": 3240, + "token_acc": 0.9002930289944479, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.628009494743981, + "grad_norm": 0.09921301156282425, + "learning_rate": 9.993350203963586e-05, + "loss": 0.329167902469635, + "memory(GiB)": 78.33, + "step": 3241, + "token_acc": 0.9022172464638741, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.6282032650293078, + "grad_norm": 0.09442011266946793, + "learning_rate": 9.984289331953012e-05, + "loss": 0.32895606756210327, + "memory(GiB)": 78.33, + "step": 3242, + "token_acc": 0.9014620960805996, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.6283970353146345, + "grad_norm": 0.09743113070726395, + "learning_rate": 9.975230520029581e-05, + "loss": 0.33065280318260193, + "memory(GiB)": 78.33, + "step": 3243, + "token_acc": 0.9017754207977865, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.6285908055999613, + "grad_norm": 0.10759622603654861, + "learning_rate": 9.966173771913999e-05, + "loss": 0.3607182502746582, + "memory(GiB)": 78.33, + "step": 3244, + "token_acc": 0.8951261723983921, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.628784575885288, + "grad_norm": 0.10882527381181717, + "learning_rate": 9.957119091326111e-05, + "loss": 0.3616001307964325, + "memory(GiB)": 78.33, + "step": 3245, + "token_acc": 0.8927234113899291, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.6289783461706148, + "grad_norm": 0.10181569308042526, + "learning_rate": 9.948066481984919e-05, + "loss": 0.32742297649383545, + "memory(GiB)": 78.33, + "step": 3246, + "token_acc": 0.9032007815193656, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.6291721164559415, + "grad_norm": 0.09965640306472778, + "learning_rate": 9.939015947608579e-05, + "loss": 0.32857993245124817, + "memory(GiB)": 78.33, + "step": 3247, + "token_acc": 0.901941986907338, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.6293658867412683, + "grad_norm": 0.11179827153682709, + "learning_rate": 9.929967491914378e-05, + "loss": 0.3481506407260895, + "memory(GiB)": 78.33, + "step": 3248, + "token_acc": 0.8953111091968875, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.629559657026595, + "grad_norm": 0.10431916266679764, + "learning_rate": 9.920921118618772e-05, + "loss": 0.36535415053367615, + "memory(GiB)": 78.33, + "step": 3249, + "token_acc": 0.8926919518963923, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.6297534273119217, + "grad_norm": 0.11817894130945206, + "learning_rate": 9.911876831437334e-05, + "loss": 0.4042690098285675, + "memory(GiB)": 78.33, + "step": 3250, + "token_acc": 0.8816427447580516, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.6299471975972485, + "grad_norm": 0.10380962491035461, + "learning_rate": 9.902834634084809e-05, + "loss": 0.33863088488578796, + "memory(GiB)": 78.33, + "step": 3251, + "token_acc": 0.8970420766782492, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.6301409678825752, + "grad_norm": 0.10345399379730225, + "learning_rate": 9.893794530275065e-05, + "loss": 0.33550775051116943, + "memory(GiB)": 78.33, + "step": 3252, + "token_acc": 0.8967549151027703, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.630334738167902, + "grad_norm": 0.10701797902584076, + "learning_rate": 9.884756523721115e-05, + "loss": 0.33787888288497925, + "memory(GiB)": 78.33, + "step": 3253, + "token_acc": 0.8981328701693443, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.6305285084532287, + "grad_norm": 0.11159101128578186, + "learning_rate": 9.875720618135118e-05, + "loss": 0.36240842938423157, + "memory(GiB)": 78.33, + "step": 3254, + "token_acc": 0.8929006465315301, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.6307222787385555, + "grad_norm": 0.10191599279642105, + "learning_rate": 9.866686817228351e-05, + "loss": 0.35280781984329224, + "memory(GiB)": 78.33, + "step": 3255, + "token_acc": 0.8953659778617565, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.6309160490238822, + "grad_norm": 0.10553169250488281, + "learning_rate": 9.857655124711252e-05, + "loss": 0.3603316843509674, + "memory(GiB)": 78.33, + "step": 3256, + "token_acc": 0.8932977749043693, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.631109819309209, + "grad_norm": 0.12384941428899765, + "learning_rate": 9.84862554429337e-05, + "loss": 0.33838897943496704, + "memory(GiB)": 78.33, + "step": 3257, + "token_acc": 0.8999357294705097, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.6313035895945357, + "grad_norm": 0.09418811649084091, + "learning_rate": 9.839598079683399e-05, + "loss": 0.3201935887336731, + "memory(GiB)": 78.33, + "step": 3258, + "token_acc": 0.9044349822983446, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.6314973598798624, + "grad_norm": 0.08840276300907135, + "learning_rate": 9.830572734589162e-05, + "loss": 0.295772910118103, + "memory(GiB)": 78.33, + "step": 3259, + "token_acc": 0.9113684822640047, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.6316911301651892, + "grad_norm": 0.10140841454267502, + "learning_rate": 9.82154951271761e-05, + "loss": 0.3301541805267334, + "memory(GiB)": 78.33, + "step": 3260, + "token_acc": 0.9020520414639306, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.6318849004505159, + "grad_norm": 0.11607446521520615, + "learning_rate": 9.81252841777483e-05, + "loss": 0.376188725233078, + "memory(GiB)": 78.33, + "step": 3261, + "token_acc": 0.8894073426006934, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.6320786707358427, + "grad_norm": 0.09963465481996536, + "learning_rate": 9.803509453466015e-05, + "loss": 0.3333686590194702, + "memory(GiB)": 78.33, + "step": 3262, + "token_acc": 0.8974518502488639, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.6322724410211694, + "grad_norm": 0.0981069952249527, + "learning_rate": 9.794492623495509e-05, + "loss": 0.3355132043361664, + "memory(GiB)": 78.33, + "step": 3263, + "token_acc": 0.8998677114073471, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.6324662113064962, + "grad_norm": 0.11496897041797638, + "learning_rate": 9.785477931566753e-05, + "loss": 0.3889937400817871, + "memory(GiB)": 78.33, + "step": 3264, + "token_acc": 0.884316427783903, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.6326599815918229, + "grad_norm": 0.10167025029659271, + "learning_rate": 9.77646538138233e-05, + "loss": 0.33570176362991333, + "memory(GiB)": 78.33, + "step": 3265, + "token_acc": 0.8996981339187706, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.6328537518771497, + "grad_norm": 0.1109309270977974, + "learning_rate": 9.767454976643939e-05, + "loss": 0.3936588168144226, + "memory(GiB)": 78.33, + "step": 3266, + "token_acc": 0.8842383328441444, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.6330475221624764, + "grad_norm": 0.1110692024230957, + "learning_rate": 9.758446721052394e-05, + "loss": 0.35908806324005127, + "memory(GiB)": 78.33, + "step": 3267, + "token_acc": 0.8943084600760456, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.6332412924478031, + "grad_norm": 0.10611660033464432, + "learning_rate": 9.749440618307628e-05, + "loss": 0.3543277680873871, + "memory(GiB)": 78.33, + "step": 3268, + "token_acc": 0.8957095521023766, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.6334350627331299, + "grad_norm": 0.0958552435040474, + "learning_rate": 9.740436672108685e-05, + "loss": 0.3312506675720215, + "memory(GiB)": 78.33, + "step": 3269, + "token_acc": 0.901442661563374, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.6336288330184566, + "grad_norm": 0.1073455959558487, + "learning_rate": 9.731434886153735e-05, + "loss": 0.37120985984802246, + "memory(GiB)": 78.33, + "step": 3270, + "token_acc": 0.8908422837761447, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.6338226033037834, + "grad_norm": 0.10486573725938797, + "learning_rate": 9.722435264140043e-05, + "loss": 0.34987497329711914, + "memory(GiB)": 78.33, + "step": 3271, + "token_acc": 0.8957631675770527, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.6340163735891101, + "grad_norm": 0.0972420871257782, + "learning_rate": 9.713437809764002e-05, + "loss": 0.3343609869480133, + "memory(GiB)": 78.33, + "step": 3272, + "token_acc": 0.9014839885446498, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.6342101438744369, + "grad_norm": 0.09882809221744537, + "learning_rate": 9.704442526721112e-05, + "loss": 0.31326884031295776, + "memory(GiB)": 78.33, + "step": 3273, + "token_acc": 0.9058581477465455, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.6344039141597636, + "grad_norm": 0.15750029683113098, + "learning_rate": 9.69544941870597e-05, + "loss": 0.32955771684646606, + "memory(GiB)": 78.33, + "step": 3274, + "token_acc": 0.9013154082401345, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.6345976844450903, + "grad_norm": 0.10274738818407059, + "learning_rate": 9.686458489412296e-05, + "loss": 0.37514811754226685, + "memory(GiB)": 78.33, + "step": 3275, + "token_acc": 0.8886818439601394, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.6347914547304171, + "grad_norm": 0.10508158802986145, + "learning_rate": 9.677469742532896e-05, + "loss": 0.34448888897895813, + "memory(GiB)": 78.33, + "step": 3276, + "token_acc": 0.8970773975620502, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.6349852250157438, + "grad_norm": 0.1056324765086174, + "learning_rate": 9.668483181759696e-05, + "loss": 0.35540270805358887, + "memory(GiB)": 78.33, + "step": 3277, + "token_acc": 0.8941543700340522, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.6351789953010706, + "grad_norm": 0.1182514950633049, + "learning_rate": 9.659498810783716e-05, + "loss": 0.38734346628189087, + "memory(GiB)": 78.33, + "step": 3278, + "token_acc": 0.8861613506065693, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.6353727655863973, + "grad_norm": 0.09534314274787903, + "learning_rate": 9.65051663329508e-05, + "loss": 0.3013858199119568, + "memory(GiB)": 78.33, + "step": 3279, + "token_acc": 0.9102011699243829, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.6355665358717241, + "grad_norm": 0.11075269430875778, + "learning_rate": 9.641536652983008e-05, + "loss": 0.3610069453716278, + "memory(GiB)": 78.33, + "step": 3280, + "token_acc": 0.8919886003799873, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.6357603061570508, + "grad_norm": 0.1012001633644104, + "learning_rate": 9.63255887353582e-05, + "loss": 0.32832014560699463, + "memory(GiB)": 78.33, + "step": 3281, + "token_acc": 0.9021164021164021, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.6359540764423776, + "grad_norm": 0.09684737026691437, + "learning_rate": 9.623583298640937e-05, + "loss": 0.35295921564102173, + "memory(GiB)": 78.33, + "step": 3282, + "token_acc": 0.8938139656941098, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.6361478467277043, + "grad_norm": 0.12095388025045395, + "learning_rate": 9.614609931984854e-05, + "loss": 0.3737364709377289, + "memory(GiB)": 78.33, + "step": 3283, + "token_acc": 0.8893832689984383, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.636341617013031, + "grad_norm": 0.10381918400526047, + "learning_rate": 9.60563877725319e-05, + "loss": 0.3494938313961029, + "memory(GiB)": 78.33, + "step": 3284, + "token_acc": 0.8952139037433156, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.6365353872983578, + "grad_norm": 0.10965927690267563, + "learning_rate": 9.596669838130627e-05, + "loss": 0.36877620220184326, + "memory(GiB)": 78.33, + "step": 3285, + "token_acc": 0.8932712103153104, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.6367291575836845, + "grad_norm": 0.09375525265932083, + "learning_rate": 9.587703118300955e-05, + "loss": 0.33250147104263306, + "memory(GiB)": 78.33, + "step": 3286, + "token_acc": 0.9001272429821601, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.6369229278690113, + "grad_norm": 0.11146512627601624, + "learning_rate": 9.578738621447052e-05, + "loss": 0.3969725966453552, + "memory(GiB)": 78.33, + "step": 3287, + "token_acc": 0.8843091971199565, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.637116698154338, + "grad_norm": 0.10300832241773605, + "learning_rate": 9.569776351250867e-05, + "loss": 0.3498913645744324, + "memory(GiB)": 78.33, + "step": 3288, + "token_acc": 0.896644596566042, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.6373104684396648, + "grad_norm": 0.10484420508146286, + "learning_rate": 9.560816311393456e-05, + "loss": 0.3171748220920563, + "memory(GiB)": 78.33, + "step": 3289, + "token_acc": 0.9038370440549502, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.6375042387249915, + "grad_norm": 0.11446377635002136, + "learning_rate": 9.551858505554935e-05, + "loss": 0.3828757703304291, + "memory(GiB)": 78.33, + "step": 3290, + "token_acc": 0.8870528771384136, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.6376980090103183, + "grad_norm": 0.10568659007549286, + "learning_rate": 9.542902937414528e-05, + "loss": 0.3619765043258667, + "memory(GiB)": 78.33, + "step": 3291, + "token_acc": 0.8938068279293189, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.637891779295645, + "grad_norm": 0.09850284457206726, + "learning_rate": 9.533949610650519e-05, + "loss": 0.3398081660270691, + "memory(GiB)": 78.33, + "step": 3292, + "token_acc": 0.8959392848759878, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.6380855495809717, + "grad_norm": 0.10148289799690247, + "learning_rate": 9.524998528940282e-05, + "loss": 0.3508596420288086, + "memory(GiB)": 78.33, + "step": 3293, + "token_acc": 0.8957725382847403, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.6382793198662985, + "grad_norm": 0.09729644656181335, + "learning_rate": 9.516049695960277e-05, + "loss": 0.33748623728752136, + "memory(GiB)": 78.33, + "step": 3294, + "token_acc": 0.8995384056843525, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.6384730901516252, + "grad_norm": 0.11033184826374054, + "learning_rate": 9.507103115386013e-05, + "loss": 0.36528030037879944, + "memory(GiB)": 78.33, + "step": 3295, + "token_acc": 0.8912418790604698, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.638666860436952, + "grad_norm": 0.09952208399772644, + "learning_rate": 9.498158790892105e-05, + "loss": 0.3313583433628082, + "memory(GiB)": 78.33, + "step": 3296, + "token_acc": 0.9018691588785047, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.6388606307222787, + "grad_norm": 0.11369100958108902, + "learning_rate": 9.489216726152218e-05, + "loss": 0.3779648542404175, + "memory(GiB)": 78.33, + "step": 3297, + "token_acc": 0.8898235347901619, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.6390544010076055, + "grad_norm": 0.10706917941570282, + "learning_rate": 9.480276924839101e-05, + "loss": 0.3606652319431305, + "memory(GiB)": 78.33, + "step": 3298, + "token_acc": 0.8924252794889346, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.6392481712929322, + "grad_norm": 0.107958123087883, + "learning_rate": 9.471339390624573e-05, + "loss": 0.36460253596305847, + "memory(GiB)": 78.33, + "step": 3299, + "token_acc": 0.8916521320743369, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.639441941578259, + "grad_norm": 0.10166801512241364, + "learning_rate": 9.462404127179517e-05, + "loss": 0.34821823239326477, + "memory(GiB)": 78.33, + "step": 3300, + "token_acc": 0.8970175808932179, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.6396357118635857, + "grad_norm": 0.11107532680034637, + "learning_rate": 9.453471138173893e-05, + "loss": 0.3458663523197174, + "memory(GiB)": 78.33, + "step": 3301, + "token_acc": 0.8959231235833094, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.6398294821489124, + "grad_norm": 0.11017818003892899, + "learning_rate": 9.444540427276707e-05, + "loss": 0.36541783809661865, + "memory(GiB)": 78.33, + "step": 3302, + "token_acc": 0.8919458960959115, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.6400232524342392, + "grad_norm": 0.09530369192361832, + "learning_rate": 9.435611998156055e-05, + "loss": 0.31924229860305786, + "memory(GiB)": 78.33, + "step": 3303, + "token_acc": 0.9019935658738, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.6402170227195659, + "grad_norm": 0.11861889809370041, + "learning_rate": 9.42668585447907e-05, + "loss": 0.3566751480102539, + "memory(GiB)": 78.33, + "step": 3304, + "token_acc": 0.8942750741652464, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.6404107930048927, + "grad_norm": 0.09427324682474136, + "learning_rate": 9.417761999911965e-05, + "loss": 0.35689249634742737, + "memory(GiB)": 78.33, + "step": 3305, + "token_acc": 0.8942868072275602, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.6406045632902194, + "grad_norm": 0.09987188875675201, + "learning_rate": 9.40884043812001e-05, + "loss": 0.3596974313259125, + "memory(GiB)": 78.33, + "step": 3306, + "token_acc": 0.8958737616178123, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.6407983335755462, + "grad_norm": 0.0936884880065918, + "learning_rate": 9.399921172767525e-05, + "loss": 0.32182028889656067, + "memory(GiB)": 78.33, + "step": 3307, + "token_acc": 0.9035014440174715, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.6409921038608729, + "grad_norm": 0.10136570781469345, + "learning_rate": 9.3910042075179e-05, + "loss": 0.34020885825157166, + "memory(GiB)": 78.33, + "step": 3308, + "token_acc": 0.8990686593025774, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.6411858741461997, + "grad_norm": 0.09547661989927292, + "learning_rate": 9.38208954603356e-05, + "loss": 0.320523738861084, + "memory(GiB)": 78.33, + "step": 3309, + "token_acc": 0.9048575160868935, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.6413796444315264, + "grad_norm": 0.09538991749286652, + "learning_rate": 9.373177191976007e-05, + "loss": 0.3119708299636841, + "memory(GiB)": 78.33, + "step": 3310, + "token_acc": 0.906541066892464, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.6415734147168531, + "grad_norm": 0.09219719469547272, + "learning_rate": 9.36426714900578e-05, + "loss": 0.3281041383743286, + "memory(GiB)": 78.33, + "step": 3311, + "token_acc": 0.9008394285994796, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.6417671850021799, + "grad_norm": 0.09361285716295242, + "learning_rate": 9.355359420782467e-05, + "loss": 0.3183348476886749, + "memory(GiB)": 78.33, + "step": 3312, + "token_acc": 0.9031101372046116, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.6419609552875066, + "grad_norm": 0.10026978701353073, + "learning_rate": 9.346454010964722e-05, + "loss": 0.3270958364009857, + "memory(GiB)": 78.33, + "step": 3313, + "token_acc": 0.90237444958475, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.6421547255728334, + "grad_norm": 0.10751143842935562, + "learning_rate": 9.337550923210228e-05, + "loss": 0.3549875020980835, + "memory(GiB)": 78.33, + "step": 3314, + "token_acc": 0.8923901567449037, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.6423484958581601, + "grad_norm": 0.10847034305334091, + "learning_rate": 9.328650161175735e-05, + "loss": 0.3592216372489929, + "memory(GiB)": 78.33, + "step": 3315, + "token_acc": 0.8951582706875199, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.6425422661434869, + "grad_norm": 0.09726880490779877, + "learning_rate": 9.319751728517007e-05, + "loss": 0.31801801919937134, + "memory(GiB)": 78.33, + "step": 3316, + "token_acc": 0.9054611953118507, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.6427360364288136, + "grad_norm": 0.10297710448503494, + "learning_rate": 9.31085562888888e-05, + "loss": 0.3436722159385681, + "memory(GiB)": 78.33, + "step": 3317, + "token_acc": 0.897666406481307, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.6429298067141404, + "grad_norm": 0.1005515605211258, + "learning_rate": 9.301961865945217e-05, + "loss": 0.3575522005558014, + "memory(GiB)": 78.33, + "step": 3318, + "token_acc": 0.8908450704225352, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.6431235769994671, + "grad_norm": 0.11259230971336365, + "learning_rate": 9.29307044333893e-05, + "loss": 0.3851853311061859, + "memory(GiB)": 78.33, + "step": 3319, + "token_acc": 0.8852057033875216, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.6433173472847938, + "grad_norm": 0.1026308611035347, + "learning_rate": 9.284181364721967e-05, + "loss": 0.33121898770332336, + "memory(GiB)": 78.33, + "step": 3320, + "token_acc": 0.9000698616738857, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.6435111175701206, + "grad_norm": 0.10448215901851654, + "learning_rate": 9.275294633745302e-05, + "loss": 0.35694530606269836, + "memory(GiB)": 78.33, + "step": 3321, + "token_acc": 0.8939416251877595, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.6437048878554473, + "grad_norm": 0.09308988600969315, + "learning_rate": 9.266410254058966e-05, + "loss": 0.3261651396751404, + "memory(GiB)": 78.33, + "step": 3322, + "token_acc": 0.9015043921939929, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.6438986581407741, + "grad_norm": 0.10853360593318939, + "learning_rate": 9.257528229312e-05, + "loss": 0.36176615953445435, + "memory(GiB)": 78.33, + "step": 3323, + "token_acc": 0.8933630754077465, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.6440924284261008, + "grad_norm": 0.10240910202264786, + "learning_rate": 9.248648563152499e-05, + "loss": 0.3467825651168823, + "memory(GiB)": 78.33, + "step": 3324, + "token_acc": 0.8980282133305365, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.6442861987114276, + "grad_norm": 0.10371696203947067, + "learning_rate": 9.239771259227577e-05, + "loss": 0.35027819871902466, + "memory(GiB)": 78.33, + "step": 3325, + "token_acc": 0.8956741618858588, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.6444799689967543, + "grad_norm": 0.11719199270009995, + "learning_rate": 9.230896321183379e-05, + "loss": 0.3709411919116974, + "memory(GiB)": 78.33, + "step": 3326, + "token_acc": 0.8905724445013935, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.644673739282081, + "grad_norm": 0.09655088186264038, + "learning_rate": 9.222023752665094e-05, + "loss": 0.35260531306266785, + "memory(GiB)": 78.33, + "step": 3327, + "token_acc": 0.8940183178216111, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.6448675095674078, + "grad_norm": 0.10662583261728287, + "learning_rate": 9.213153557316904e-05, + "loss": 0.3466799259185791, + "memory(GiB)": 78.33, + "step": 3328, + "token_acc": 0.8973979206598129, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.6450612798527345, + "grad_norm": 0.1059870645403862, + "learning_rate": 9.204285738782053e-05, + "loss": 0.35201188921928406, + "memory(GiB)": 78.33, + "step": 3329, + "token_acc": 0.8970674486803519, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.6452550501380613, + "grad_norm": 0.09518367052078247, + "learning_rate": 9.195420300702782e-05, + "loss": 0.3428179919719696, + "memory(GiB)": 78.33, + "step": 3330, + "token_acc": 0.894964203373377, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.645448820423388, + "grad_norm": 0.10190161317586899, + "learning_rate": 9.18655724672037e-05, + "loss": 0.34315353631973267, + "memory(GiB)": 78.33, + "step": 3331, + "token_acc": 0.8971641880907941, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.6456425907087148, + "grad_norm": 0.09689725935459137, + "learning_rate": 9.177696580475109e-05, + "loss": 0.32769232988357544, + "memory(GiB)": 78.33, + "step": 3332, + "token_acc": 0.9002272727272728, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.6458363609940415, + "grad_norm": 0.09515693038702011, + "learning_rate": 9.168838305606311e-05, + "loss": 0.2983517050743103, + "memory(GiB)": 78.33, + "step": 3333, + "token_acc": 0.9090839107005388, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.6460301312793683, + "grad_norm": 0.11088063567876816, + "learning_rate": 9.159982425752319e-05, + "loss": 0.38608160614967346, + "memory(GiB)": 78.33, + "step": 3334, + "token_acc": 0.8878626217586326, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.646223901564695, + "grad_norm": 0.09727376699447632, + "learning_rate": 9.151128944550465e-05, + "loss": 0.3358725905418396, + "memory(GiB)": 78.33, + "step": 3335, + "token_acc": 0.9006044153050249, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.6464176718500217, + "grad_norm": 0.1008610799908638, + "learning_rate": 9.142277865637124e-05, + "loss": 0.350691556930542, + "memory(GiB)": 78.33, + "step": 3336, + "token_acc": 0.8959597901617732, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.6466114421353485, + "grad_norm": 0.10170338302850723, + "learning_rate": 9.133429192647661e-05, + "loss": 0.3640455901622772, + "memory(GiB)": 78.33, + "step": 3337, + "token_acc": 0.8921849481941582, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.6468052124206752, + "grad_norm": 0.10477067530155182, + "learning_rate": 9.124582929216471e-05, + "loss": 0.3344863951206207, + "memory(GiB)": 78.33, + "step": 3338, + "token_acc": 0.8980883180952892, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.646998982706002, + "grad_norm": 0.10310588032007217, + "learning_rate": 9.11573907897695e-05, + "loss": 0.35701289772987366, + "memory(GiB)": 78.33, + "step": 3339, + "token_acc": 0.895986649522611, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.6471927529913288, + "grad_norm": 0.0928465947508812, + "learning_rate": 9.106897645561506e-05, + "loss": 0.3364104628562927, + "memory(GiB)": 78.33, + "step": 3340, + "token_acc": 0.8999741468459153, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.6473865232766556, + "grad_norm": 0.09853307157754898, + "learning_rate": 9.098058632601557e-05, + "loss": 0.33070456981658936, + "memory(GiB)": 78.33, + "step": 3341, + "token_acc": 0.900615836532763, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.6475802935619823, + "grad_norm": 0.10872960090637207, + "learning_rate": 9.089222043727512e-05, + "loss": 0.38177114725112915, + "memory(GiB)": 78.33, + "step": 3342, + "token_acc": 0.8859876130382949, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.6477740638473091, + "grad_norm": 0.10483535379171371, + "learning_rate": 9.08038788256881e-05, + "loss": 0.3313630223274231, + "memory(GiB)": 78.33, + "step": 3343, + "token_acc": 0.9004806641905178, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.6479678341326358, + "grad_norm": 0.09030815213918686, + "learning_rate": 9.071556152753866e-05, + "loss": 0.3115072548389435, + "memory(GiB)": 78.33, + "step": 3344, + "token_acc": 0.9051830718021874, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.6481616044179626, + "grad_norm": 0.1190243512392044, + "learning_rate": 9.062726857910111e-05, + "loss": 0.3789428472518921, + "memory(GiB)": 78.33, + "step": 3345, + "token_acc": 0.8881057268722466, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.6483553747032893, + "grad_norm": 0.10349465906620026, + "learning_rate": 9.05390000166398e-05, + "loss": 0.33017659187316895, + "memory(GiB)": 78.33, + "step": 3346, + "token_acc": 0.9012410514386799, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.648549144988616, + "grad_norm": 0.10818509757518768, + "learning_rate": 9.045075587640894e-05, + "loss": 0.3425545394420624, + "memory(GiB)": 78.33, + "step": 3347, + "token_acc": 0.8974606721217429, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.6487429152739428, + "grad_norm": 0.09324289113283157, + "learning_rate": 9.036253619465285e-05, + "loss": 0.3058326244354248, + "memory(GiB)": 78.33, + "step": 3348, + "token_acc": 0.9082648317882186, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.6489366855592695, + "grad_norm": 0.10658305138349533, + "learning_rate": 9.027434100760559e-05, + "loss": 0.3650735318660736, + "memory(GiB)": 78.33, + "step": 3349, + "token_acc": 0.8911853628023353, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.6491304558445963, + "grad_norm": 0.10458887368440628, + "learning_rate": 9.018617035149141e-05, + "loss": 0.34233736991882324, + "memory(GiB)": 78.33, + "step": 3350, + "token_acc": 0.89612028460196, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.649324226129923, + "grad_norm": 0.10068278759717941, + "learning_rate": 9.00980242625243e-05, + "loss": 0.3308500647544861, + "memory(GiB)": 78.33, + "step": 3351, + "token_acc": 0.8995980595980596, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.6495179964152498, + "grad_norm": 0.10845746099948883, + "learning_rate": 9.000990277690828e-05, + "loss": 0.349687397480011, + "memory(GiB)": 78.33, + "step": 3352, + "token_acc": 0.8954418825203011, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.6497117667005765, + "grad_norm": 0.10593696683645248, + "learning_rate": 8.992180593083718e-05, + "loss": 0.35012173652648926, + "memory(GiB)": 78.33, + "step": 3353, + "token_acc": 0.8978386677689855, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.6499055369859033, + "grad_norm": 0.10230911523103714, + "learning_rate": 8.983373376049473e-05, + "loss": 0.32801395654678345, + "memory(GiB)": 78.33, + "step": 3354, + "token_acc": 0.9028890794570777, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.65009930727123, + "grad_norm": 0.09752920269966125, + "learning_rate": 8.97456863020546e-05, + "loss": 0.33961349725723267, + "memory(GiB)": 78.33, + "step": 3355, + "token_acc": 0.8986005964670796, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.6502930775565567, + "grad_norm": 0.09952106326818466, + "learning_rate": 8.965766359168017e-05, + "loss": 0.3362662196159363, + "memory(GiB)": 78.33, + "step": 3356, + "token_acc": 0.8996824741203501, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.6504868478418835, + "grad_norm": 0.09671690315008163, + "learning_rate": 8.956966566552476e-05, + "loss": 0.3496435880661011, + "memory(GiB)": 78.33, + "step": 3357, + "token_acc": 0.8963326523493946, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.6506806181272102, + "grad_norm": 0.10482124984264374, + "learning_rate": 8.948169255973147e-05, + "loss": 0.3635352551937103, + "memory(GiB)": 78.33, + "step": 3358, + "token_acc": 0.8922592685440517, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.650874388412537, + "grad_norm": 0.104715995490551, + "learning_rate": 8.939374431043325e-05, + "loss": 0.33896604180336, + "memory(GiB)": 78.33, + "step": 3359, + "token_acc": 0.8968829586978186, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.6510681586978637, + "grad_norm": 0.10908566415309906, + "learning_rate": 8.930582095375283e-05, + "loss": 0.354704350233078, + "memory(GiB)": 78.33, + "step": 3360, + "token_acc": 0.8975314829897876, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.6512619289831905, + "grad_norm": 0.10956370085477829, + "learning_rate": 8.921792252580263e-05, + "loss": 0.3756663203239441, + "memory(GiB)": 78.33, + "step": 3361, + "token_acc": 0.8876317598533455, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.6514556992685172, + "grad_norm": 0.08828612416982651, + "learning_rate": 8.913004906268495e-05, + "loss": 0.3033355474472046, + "memory(GiB)": 78.33, + "step": 3362, + "token_acc": 0.9071128935601346, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.651649469553844, + "grad_norm": 0.10207577049732208, + "learning_rate": 8.904220060049172e-05, + "loss": 0.3187861144542694, + "memory(GiB)": 78.33, + "step": 3363, + "token_acc": 0.9046379128103328, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.6518432398391707, + "grad_norm": 0.10097214579582214, + "learning_rate": 8.895437717530473e-05, + "loss": 0.31431207060813904, + "memory(GiB)": 78.33, + "step": 3364, + "token_acc": 0.9061064038885495, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.6520370101244974, + "grad_norm": 0.10009994357824326, + "learning_rate": 8.886657882319537e-05, + "loss": 0.32470712065696716, + "memory(GiB)": 78.33, + "step": 3365, + "token_acc": 0.9029475799698543, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.6522307804098242, + "grad_norm": 0.1002596840262413, + "learning_rate": 8.877880558022478e-05, + "loss": 0.340162992477417, + "memory(GiB)": 78.33, + "step": 3366, + "token_acc": 0.9000396877893901, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.6524245506951509, + "grad_norm": 0.09453920274972916, + "learning_rate": 8.869105748244392e-05, + "loss": 0.3194851279258728, + "memory(GiB)": 78.33, + "step": 3367, + "token_acc": 0.9037842692144585, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.6526183209804777, + "grad_norm": 0.09737507253885269, + "learning_rate": 8.86033345658931e-05, + "loss": 0.33026716113090515, + "memory(GiB)": 78.33, + "step": 3368, + "token_acc": 0.901304817031296, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.6528120912658044, + "grad_norm": 0.27890968322753906, + "learning_rate": 8.851563686660263e-05, + "loss": 0.3396485149860382, + "memory(GiB)": 78.33, + "step": 3369, + "token_acc": 0.8994968003062954, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.6530058615511312, + "grad_norm": 0.09766080230474472, + "learning_rate": 8.842796442059217e-05, + "loss": 0.33352982997894287, + "memory(GiB)": 78.33, + "step": 3370, + "token_acc": 0.9007515752714021, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.6531996318364579, + "grad_norm": 0.0987740233540535, + "learning_rate": 8.834031726387126e-05, + "loss": 0.33398640155792236, + "memory(GiB)": 78.33, + "step": 3371, + "token_acc": 0.9004823690252018, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.6533934021217846, + "grad_norm": 0.09064597636461258, + "learning_rate": 8.825269543243891e-05, + "loss": 0.3236205577850342, + "memory(GiB)": 78.33, + "step": 3372, + "token_acc": 0.9059911385089578, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.6535871724071114, + "grad_norm": 0.0983690693974495, + "learning_rate": 8.816509896228374e-05, + "loss": 0.32531166076660156, + "memory(GiB)": 78.33, + "step": 3373, + "token_acc": 0.9019883621521745, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.6537809426924381, + "grad_norm": 0.10600029677152634, + "learning_rate": 8.807752788938406e-05, + "loss": 0.33703696727752686, + "memory(GiB)": 78.33, + "step": 3374, + "token_acc": 0.898842851810377, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.6539747129777649, + "grad_norm": 0.10279665142297745, + "learning_rate": 8.798998224970756e-05, + "loss": 0.36671680212020874, + "memory(GiB)": 78.33, + "step": 3375, + "token_acc": 0.8890161118835365, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.6541684832630916, + "grad_norm": 0.10394495725631714, + "learning_rate": 8.790246207921164e-05, + "loss": 0.344325453042984, + "memory(GiB)": 78.33, + "step": 3376, + "token_acc": 0.8953350437253441, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.6543622535484184, + "grad_norm": 0.10396216064691544, + "learning_rate": 8.781496741384314e-05, + "loss": 0.3480278551578522, + "memory(GiB)": 78.33, + "step": 3377, + "token_acc": 0.8966755763448045, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.6545560238337451, + "grad_norm": 0.10462364554405212, + "learning_rate": 8.772749828953848e-05, + "loss": 0.3206149935722351, + "memory(GiB)": 78.33, + "step": 3378, + "token_acc": 0.9034575662325999, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.6547497941190719, + "grad_norm": 0.10066306591033936, + "learning_rate": 8.764005474222365e-05, + "loss": 0.32818686962127686, + "memory(GiB)": 78.33, + "step": 3379, + "token_acc": 0.9018111463427743, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.6549435644043986, + "grad_norm": 0.10638129711151123, + "learning_rate": 8.7552636807814e-05, + "loss": 0.3548268675804138, + "memory(GiB)": 78.33, + "step": 3380, + "token_acc": 0.8971172104263767, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.6551373346897253, + "grad_norm": 0.08810947835445404, + "learning_rate": 8.746524452221442e-05, + "loss": 0.31981220841407776, + "memory(GiB)": 78.33, + "step": 3381, + "token_acc": 0.9033225494131816, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.6553311049750521, + "grad_norm": 0.0968799740076065, + "learning_rate": 8.737787792131926e-05, + "loss": 0.29892611503601074, + "memory(GiB)": 78.33, + "step": 3382, + "token_acc": 0.9107401514971624, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.6555248752603788, + "grad_norm": 0.10424499958753586, + "learning_rate": 8.729053704101246e-05, + "loss": 0.3598312735557556, + "memory(GiB)": 78.33, + "step": 3383, + "token_acc": 0.8916409545107142, + "train_speed(iter/s)": 0.032483 + }, + { + "epoch": 0.6557186455457056, + "grad_norm": 0.1125241369009018, + "learning_rate": 8.720322191716708e-05, + "loss": 0.3586179316043854, + "memory(GiB)": 78.33, + "step": 3384, + "token_acc": 0.8950719017611892, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.6559124158310323, + "grad_norm": 0.1094302088022232, + "learning_rate": 8.71159325856459e-05, + "loss": 0.35498055815696716, + "memory(GiB)": 78.33, + "step": 3385, + "token_acc": 0.8951116185440015, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.6561061861163591, + "grad_norm": 0.09570290893316269, + "learning_rate": 8.702866908230096e-05, + "loss": 0.30832698941230774, + "memory(GiB)": 78.33, + "step": 3386, + "token_acc": 0.9092654575646714, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.6562999564016858, + "grad_norm": 0.09967100620269775, + "learning_rate": 8.694143144297376e-05, + "loss": 0.32282644510269165, + "memory(GiB)": 78.33, + "step": 3387, + "token_acc": 0.9025936599423631, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.6564937266870126, + "grad_norm": 0.09364413470029831, + "learning_rate": 8.685421970349511e-05, + "loss": 0.3126744031906128, + "memory(GiB)": 78.33, + "step": 3388, + "token_acc": 0.9047869806163645, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.6566874969723393, + "grad_norm": 0.11045239120721817, + "learning_rate": 8.676703389968515e-05, + "loss": 0.3734998106956482, + "memory(GiB)": 78.33, + "step": 3389, + "token_acc": 0.8892863324294698, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.656881267257666, + "grad_norm": 0.0982174277305603, + "learning_rate": 8.667987406735363e-05, + "loss": 0.3432157337665558, + "memory(GiB)": 78.33, + "step": 3390, + "token_acc": 0.8978511530398323, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.6570750375429928, + "grad_norm": 0.09781992435455322, + "learning_rate": 8.659274024229918e-05, + "loss": 0.34358012676239014, + "memory(GiB)": 78.33, + "step": 3391, + "token_acc": 0.8971234051399319, + "train_speed(iter/s)": 0.032489 + }, + { + "epoch": 0.6572688078283195, + "grad_norm": 0.09683690965175629, + "learning_rate": 8.65056324603102e-05, + "loss": 0.3415444493293762, + "memory(GiB)": 78.33, + "step": 3392, + "token_acc": 0.8975164282953227, + "train_speed(iter/s)": 0.03249 + }, + { + "epoch": 0.6574625781136463, + "grad_norm": 0.09927819669246674, + "learning_rate": 8.641855075716413e-05, + "loss": 0.33681708574295044, + "memory(GiB)": 78.33, + "step": 3393, + "token_acc": 0.9005583061202992, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.657656348398973, + "grad_norm": 0.10389687120914459, + "learning_rate": 8.633149516862775e-05, + "loss": 0.3464776277542114, + "memory(GiB)": 78.33, + "step": 3394, + "token_acc": 0.8982288677996028, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.6578501186842998, + "grad_norm": 0.09941697865724564, + "learning_rate": 8.624446573045717e-05, + "loss": 0.33485954999923706, + "memory(GiB)": 78.33, + "step": 3395, + "token_acc": 0.8993164654649689, + "train_speed(iter/s)": 0.032492 + }, + { + "epoch": 0.6580438889696265, + "grad_norm": 0.10718297958374023, + "learning_rate": 8.61574624783976e-05, + "loss": 0.3694123923778534, + "memory(GiB)": 78.33, + "step": 3396, + "token_acc": 0.8908513223063265, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.6582376592549533, + "grad_norm": 0.09040777385234833, + "learning_rate": 8.607048544818386e-05, + "loss": 0.29911673069000244, + "memory(GiB)": 78.33, + "step": 3397, + "token_acc": 0.9073398896617894, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.65843142954028, + "grad_norm": 0.09987498074769974, + "learning_rate": 8.598353467553945e-05, + "loss": 0.3411368727684021, + "memory(GiB)": 78.33, + "step": 3398, + "token_acc": 0.898384837029792, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.6586251998256067, + "grad_norm": 0.10136095434427261, + "learning_rate": 8.58966101961776e-05, + "loss": 0.3573547601699829, + "memory(GiB)": 78.33, + "step": 3399, + "token_acc": 0.8953447859995176, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.6588189701109335, + "grad_norm": 0.10064728558063507, + "learning_rate": 8.580971204580049e-05, + "loss": 0.3405356705188751, + "memory(GiB)": 78.33, + "step": 3400, + "token_acc": 0.8976995468804462, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.6590127403962602, + "grad_norm": 0.11121074110269547, + "learning_rate": 8.572284026009947e-05, + "loss": 0.3688167333602905, + "memory(GiB)": 78.33, + "step": 3401, + "token_acc": 0.8922276364073873, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.659206510681587, + "grad_norm": 0.09966862946748734, + "learning_rate": 8.563599487475517e-05, + "loss": 0.32531312108039856, + "memory(GiB)": 78.33, + "step": 3402, + "token_acc": 0.9004441763544637, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.6594002809669137, + "grad_norm": 0.09778161346912384, + "learning_rate": 8.554917592543724e-05, + "loss": 0.3067454695701599, + "memory(GiB)": 78.33, + "step": 3403, + "token_acc": 0.9069717187542206, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.6595940512522405, + "grad_norm": 0.10241376608610153, + "learning_rate": 8.546238344780468e-05, + "loss": 0.3466450572013855, + "memory(GiB)": 78.33, + "step": 3404, + "token_acc": 0.8972174435433221, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.6597878215375672, + "grad_norm": 0.10399215668439865, + "learning_rate": 8.537561747750542e-05, + "loss": 0.35718533396720886, + "memory(GiB)": 78.33, + "step": 3405, + "token_acc": 0.894998862084661, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.659981591822894, + "grad_norm": 0.12148671597242355, + "learning_rate": 8.528887805017661e-05, + "loss": 0.38953131437301636, + "memory(GiB)": 78.33, + "step": 3406, + "token_acc": 0.8871596564074931, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.6601753621082207, + "grad_norm": 0.10579821467399597, + "learning_rate": 8.520216520144442e-05, + "loss": 0.33564966917037964, + "memory(GiB)": 78.33, + "step": 3407, + "token_acc": 0.8999078135039249, + "train_speed(iter/s)": 0.032498 + }, + { + "epoch": 0.6603691323935474, + "grad_norm": 0.11077598482370377, + "learning_rate": 8.51154789669241e-05, + "loss": 0.3519205152988434, + "memory(GiB)": 78.33, + "step": 3408, + "token_acc": 0.89746127867707, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.6605629026788742, + "grad_norm": 0.09937475621700287, + "learning_rate": 8.502881938222021e-05, + "loss": 0.36071473360061646, + "memory(GiB)": 78.33, + "step": 3409, + "token_acc": 0.8904598045082784, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.6607566729642009, + "grad_norm": 0.09916140139102936, + "learning_rate": 8.494218648292594e-05, + "loss": 0.3261912763118744, + "memory(GiB)": 78.33, + "step": 3410, + "token_acc": 0.9021432978352424, + "train_speed(iter/s)": 0.0325 + }, + { + "epoch": 0.6609504432495277, + "grad_norm": 0.10734451562166214, + "learning_rate": 8.485558030462389e-05, + "loss": 0.32132577896118164, + "memory(GiB)": 78.33, + "step": 3411, + "token_acc": 0.9042697182232066, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.6611442135348544, + "grad_norm": 0.09797913581132889, + "learning_rate": 8.476900088288554e-05, + "loss": 0.33835476636886597, + "memory(GiB)": 78.33, + "step": 3412, + "token_acc": 0.9016252123424863, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.6613379838201812, + "grad_norm": 0.10399965941905975, + "learning_rate": 8.468244825327132e-05, + "loss": 0.3561461269855499, + "memory(GiB)": 78.33, + "step": 3413, + "token_acc": 0.8911394426552394, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.6615317541055079, + "grad_norm": 0.14848816394805908, + "learning_rate": 8.459592245133076e-05, + "loss": 0.3276277780532837, + "memory(GiB)": 78.33, + "step": 3414, + "token_acc": 0.9025031223980017, + "train_speed(iter/s)": 0.032503 + }, + { + "epoch": 0.6617255243908347, + "grad_norm": 0.10193125158548355, + "learning_rate": 8.450942351260228e-05, + "loss": 0.3469090163707733, + "memory(GiB)": 78.33, + "step": 3415, + "token_acc": 0.89754166435967, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.6619192946761614, + "grad_norm": 0.0984477773308754, + "learning_rate": 8.442295147261347e-05, + "loss": 0.33425408601760864, + "memory(GiB)": 78.33, + "step": 3416, + "token_acc": 0.8991217336922533, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.6621130649614881, + "grad_norm": 0.09679713100194931, + "learning_rate": 8.43365063668805e-05, + "loss": 0.3265281617641449, + "memory(GiB)": 78.33, + "step": 3417, + "token_acc": 0.9010822632198733, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.6623068352468149, + "grad_norm": 0.10674044489860535, + "learning_rate": 8.425008823090885e-05, + "loss": 0.32397177815437317, + "memory(GiB)": 78.33, + "step": 3418, + "token_acc": 0.9022934871751365, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.6625006055321416, + "grad_norm": 0.09785594791173935, + "learning_rate": 8.416369710019276e-05, + "loss": 0.36127400398254395, + "memory(GiB)": 78.33, + "step": 3419, + "token_acc": 0.8940439348587084, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.6626943758174684, + "grad_norm": 0.0925799012184143, + "learning_rate": 8.407733301021534e-05, + "loss": 0.3258381485939026, + "memory(GiB)": 78.33, + "step": 3420, + "token_acc": 0.9027204212265125, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.6628881461027951, + "grad_norm": 0.09700040519237518, + "learning_rate": 8.399099599644869e-05, + "loss": 0.3296799957752228, + "memory(GiB)": 78.33, + "step": 3421, + "token_acc": 0.9014329474066767, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.6630819163881219, + "grad_norm": 0.09748223423957825, + "learning_rate": 8.390468609435364e-05, + "loss": 0.3318982422351837, + "memory(GiB)": 78.33, + "step": 3422, + "token_acc": 0.8991830021301265, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.6632756866734486, + "grad_norm": 0.1090392917394638, + "learning_rate": 8.381840333938017e-05, + "loss": 0.36316922307014465, + "memory(GiB)": 78.33, + "step": 3423, + "token_acc": 0.8909560870306198, + "train_speed(iter/s)": 0.03251 + }, + { + "epoch": 0.6634694569587753, + "grad_norm": 0.10358504951000214, + "learning_rate": 8.37321477669667e-05, + "loss": 0.3546435236930847, + "memory(GiB)": 78.33, + "step": 3424, + "token_acc": 0.8954622502570373, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.6636632272441021, + "grad_norm": 0.1012730523943901, + "learning_rate": 8.364591941254091e-05, + "loss": 0.3311161994934082, + "memory(GiB)": 78.33, + "step": 3425, + "token_acc": 0.9006039512744396, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.6638569975294288, + "grad_norm": 0.10826534032821655, + "learning_rate": 8.355971831151901e-05, + "loss": 0.36814382672309875, + "memory(GiB)": 78.33, + "step": 3426, + "token_acc": 0.8914054979142154, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.6640507678147556, + "grad_norm": 0.10928687453269958, + "learning_rate": 8.347354449930611e-05, + "loss": 0.35287556052207947, + "memory(GiB)": 78.33, + "step": 3427, + "token_acc": 0.8944273647136077, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.6642445381000823, + "grad_norm": 0.10338309407234192, + "learning_rate": 8.338739801129611e-05, + "loss": 0.3381505608558655, + "memory(GiB)": 78.33, + "step": 3428, + "token_acc": 0.8976415225684292, + "train_speed(iter/s)": 0.032514 + }, + { + "epoch": 0.6644383083854091, + "grad_norm": 0.09947198629379272, + "learning_rate": 8.330127888287165e-05, + "loss": 0.3421975374221802, + "memory(GiB)": 78.33, + "step": 3429, + "token_acc": 0.8979885810067572, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.6646320786707358, + "grad_norm": 0.11111465096473694, + "learning_rate": 8.321518714940434e-05, + "loss": 0.35146766901016235, + "memory(GiB)": 78.33, + "step": 3430, + "token_acc": 0.8957091043216328, + "train_speed(iter/s)": 0.032516 + }, + { + "epoch": 0.6648258489560626, + "grad_norm": 0.09252513200044632, + "learning_rate": 8.312912284625412e-05, + "loss": 0.3317872881889343, + "memory(GiB)": 78.33, + "step": 3431, + "token_acc": 0.8990752267472879, + "train_speed(iter/s)": 0.032516 + }, + { + "epoch": 0.6650196192413893, + "grad_norm": 0.09670988470315933, + "learning_rate": 8.30430860087701e-05, + "loss": 0.3309570848941803, + "memory(GiB)": 78.33, + "step": 3432, + "token_acc": 0.9016609407692506, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.665213389526716, + "grad_norm": 0.10486641526222229, + "learning_rate": 8.295707667228987e-05, + "loss": 0.33131322264671326, + "memory(GiB)": 78.33, + "step": 3433, + "token_acc": 0.9014130083054687, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.6654071598120428, + "grad_norm": 0.09246989339590073, + "learning_rate": 8.287109487213974e-05, + "loss": 0.3162573575973511, + "memory(GiB)": 78.33, + "step": 3434, + "token_acc": 0.9054944021074419, + "train_speed(iter/s)": 0.032519 + }, + { + "epoch": 0.6656009300973695, + "grad_norm": 0.08992563188076019, + "learning_rate": 8.278514064363477e-05, + "loss": 0.31209853291511536, + "memory(GiB)": 78.33, + "step": 3435, + "token_acc": 0.9090142626291013, + "train_speed(iter/s)": 0.032519 + }, + { + "epoch": 0.6657947003826963, + "grad_norm": 0.102995365858078, + "learning_rate": 8.269921402207863e-05, + "loss": 0.3250240385532379, + "memory(GiB)": 78.33, + "step": 3436, + "token_acc": 0.9044458079991075, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.665988470668023, + "grad_norm": 0.09776205569505692, + "learning_rate": 8.261331504276378e-05, + "loss": 0.31458067893981934, + "memory(GiB)": 78.33, + "step": 3437, + "token_acc": 0.9068092088396656, + "train_speed(iter/s)": 0.032521 + }, + { + "epoch": 0.6661822409533498, + "grad_norm": 0.12002206593751907, + "learning_rate": 8.25274437409712e-05, + "loss": 0.4206904470920563, + "memory(GiB)": 78.33, + "step": 3438, + "token_acc": 0.8826992485651737, + "train_speed(iter/s)": 0.032522 + }, + { + "epoch": 0.6663760112386765, + "grad_norm": 0.09399055689573288, + "learning_rate": 8.244160015197054e-05, + "loss": 0.3172173798084259, + "memory(GiB)": 78.33, + "step": 3439, + "token_acc": 0.9048228875854827, + "train_speed(iter/s)": 0.032522 + }, + { + "epoch": 0.6665697815240033, + "grad_norm": 0.10963206738233566, + "learning_rate": 8.235578431102004e-05, + "loss": 0.357374906539917, + "memory(GiB)": 78.33, + "step": 3440, + "token_acc": 0.8945577178923919, + "train_speed(iter/s)": 0.032523 + }, + { + "epoch": 0.66676355180933, + "grad_norm": 0.09495268017053604, + "learning_rate": 8.226999625336662e-05, + "loss": 0.3170054852962494, + "memory(GiB)": 78.33, + "step": 3441, + "token_acc": 0.9031417624521073, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.6669573220946567, + "grad_norm": 0.10406588762998581, + "learning_rate": 8.21842360142457e-05, + "loss": 0.3348080813884735, + "memory(GiB)": 78.33, + "step": 3442, + "token_acc": 0.8991230567735322, + "train_speed(iter/s)": 0.032524 + }, + { + "epoch": 0.6671510923799835, + "grad_norm": 0.0943104475736618, + "learning_rate": 8.209850362888126e-05, + "loss": 0.3240373432636261, + "memory(GiB)": 78.33, + "step": 3443, + "token_acc": 0.9041046505639388, + "train_speed(iter/s)": 0.032525 + }, + { + "epoch": 0.6673448626653102, + "grad_norm": 0.10466547310352325, + "learning_rate": 8.201279913248606e-05, + "loss": 0.35648593306541443, + "memory(GiB)": 78.33, + "step": 3444, + "token_acc": 0.8950629795859273, + "train_speed(iter/s)": 0.032526 + }, + { + "epoch": 0.667538632950637, + "grad_norm": 0.1078016608953476, + "learning_rate": 8.192712256026111e-05, + "loss": 0.3920597434043884, + "memory(GiB)": 78.33, + "step": 3445, + "token_acc": 0.8864499320972257, + "train_speed(iter/s)": 0.032527 + }, + { + "epoch": 0.6677324032359637, + "grad_norm": 0.0912395566701889, + "learning_rate": 8.18414739473961e-05, + "loss": 0.3125567138195038, + "memory(GiB)": 78.33, + "step": 3446, + "token_acc": 0.9068691030238295, + "train_speed(iter/s)": 0.032528 + }, + { + "epoch": 0.6679261735212905, + "grad_norm": 0.10371241718530655, + "learning_rate": 8.175585332906928e-05, + "loss": 0.35416972637176514, + "memory(GiB)": 78.33, + "step": 3447, + "token_acc": 0.8940065267570507, + "train_speed(iter/s)": 0.032528 + }, + { + "epoch": 0.6681199438066172, + "grad_norm": 0.1040647029876709, + "learning_rate": 8.167026074044719e-05, + "loss": 0.34062814712524414, + "memory(GiB)": 78.33, + "step": 3448, + "token_acc": 0.8997950416935733, + "train_speed(iter/s)": 0.032529 + }, + { + "epoch": 0.668313714091944, + "grad_norm": 0.1095561683177948, + "learning_rate": 8.158469621668522e-05, + "loss": 0.35409224033355713, + "memory(GiB)": 78.33, + "step": 3449, + "token_acc": 0.8959530893440019, + "train_speed(iter/s)": 0.03253 + }, + { + "epoch": 0.6685074843772707, + "grad_norm": 0.09262026101350784, + "learning_rate": 8.149915979292683e-05, + "loss": 0.3020830750465393, + "memory(GiB)": 78.33, + "step": 3450, + "token_acc": 0.9076835711816701, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.6687012546625974, + "grad_norm": 0.109112448990345, + "learning_rate": 8.141365150430421e-05, + "loss": 0.3594379723072052, + "memory(GiB)": 78.33, + "step": 3451, + "token_acc": 0.893763065114973, + "train_speed(iter/s)": 0.032531 + }, + { + "epoch": 0.6688950249479242, + "grad_norm": 0.10545790940523148, + "learning_rate": 8.132817138593792e-05, + "loss": 0.339529812335968, + "memory(GiB)": 78.33, + "step": 3452, + "token_acc": 0.8981471052038748, + "train_speed(iter/s)": 0.032532 + }, + { + "epoch": 0.6690887952332509, + "grad_norm": 0.10486025363206863, + "learning_rate": 8.124271947293695e-05, + "loss": 0.3370751142501831, + "memory(GiB)": 78.33, + "step": 3453, + "token_acc": 0.9002499368137269, + "train_speed(iter/s)": 0.032533 + }, + { + "epoch": 0.6692825655185777, + "grad_norm": 0.10516992211341858, + "learning_rate": 8.115729580039863e-05, + "loss": 0.31734785437583923, + "memory(GiB)": 78.33, + "step": 3454, + "token_acc": 0.9048377947651333, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.6694763358039044, + "grad_norm": 0.09757381677627563, + "learning_rate": 8.107190040340878e-05, + "loss": 0.3384143114089966, + "memory(GiB)": 78.33, + "step": 3455, + "token_acc": 0.8986423042761071, + "train_speed(iter/s)": 0.032534 + }, + { + "epoch": 0.6696701060892312, + "grad_norm": 0.10361029207706451, + "learning_rate": 8.09865333170417e-05, + "loss": 0.36140722036361694, + "memory(GiB)": 78.33, + "step": 3456, + "token_acc": 0.8917902428540726, + "train_speed(iter/s)": 0.032535 + }, + { + "epoch": 0.6698638763745579, + "grad_norm": 0.10952255129814148, + "learning_rate": 8.090119457635973e-05, + "loss": 0.3548586666584015, + "memory(GiB)": 78.33, + "step": 3457, + "token_acc": 0.8944674205396651, + "train_speed(iter/s)": 0.032536 + }, + { + "epoch": 0.6700576466598847, + "grad_norm": 0.09695498645305634, + "learning_rate": 8.081588421641399e-05, + "loss": 0.34991052746772766, + "memory(GiB)": 78.33, + "step": 3458, + "token_acc": 0.8968775374621687, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.6702514169452114, + "grad_norm": 0.096768319606781, + "learning_rate": 8.073060227224364e-05, + "loss": 0.3430905044078827, + "memory(GiB)": 78.33, + "step": 3459, + "token_acc": 0.8984231274638633, + "train_speed(iter/s)": 0.032537 + }, + { + "epoch": 0.6704451872305381, + "grad_norm": 0.10539654642343521, + "learning_rate": 8.064534877887625e-05, + "loss": 0.3595712184906006, + "memory(GiB)": 78.33, + "step": 3460, + "token_acc": 0.8929441752331867, + "train_speed(iter/s)": 0.032538 + }, + { + "epoch": 0.670638957515865, + "grad_norm": 0.109794020652771, + "learning_rate": 8.056012377132778e-05, + "loss": 0.38728106021881104, + "memory(GiB)": 78.33, + "step": 3461, + "token_acc": 0.8862599615278923, + "train_speed(iter/s)": 0.032539 + }, + { + "epoch": 0.6708327278011917, + "grad_norm": 0.10039215534925461, + "learning_rate": 8.047492728460232e-05, + "loss": 0.3494102358818054, + "memory(GiB)": 78.33, + "step": 3462, + "token_acc": 0.8965799655454828, + "train_speed(iter/s)": 0.03254 + }, + { + "epoch": 0.6710264980865185, + "grad_norm": 0.11146795004606247, + "learning_rate": 8.038975935369256e-05, + "loss": 0.34334322810173035, + "memory(GiB)": 78.33, + "step": 3463, + "token_acc": 0.8990417830644919, + "train_speed(iter/s)": 0.032541 + }, + { + "epoch": 0.6712202683718452, + "grad_norm": 0.10396383702754974, + "learning_rate": 8.030462001357903e-05, + "loss": 0.3301684558391571, + "memory(GiB)": 78.33, + "step": 3464, + "token_acc": 0.9044740024183797, + "train_speed(iter/s)": 0.032541 + }, + { + "epoch": 0.671414038657172, + "grad_norm": 0.10532711446285248, + "learning_rate": 8.02195092992309e-05, + "loss": 0.3416173756122589, + "memory(GiB)": 78.33, + "step": 3465, + "token_acc": 0.8965047796620201, + "train_speed(iter/s)": 0.032542 + }, + { + "epoch": 0.6716078089424987, + "grad_norm": 0.10811363905668259, + "learning_rate": 8.013442724560537e-05, + "loss": 0.3821423649787903, + "memory(GiB)": 78.33, + "step": 3466, + "token_acc": 0.8860308779126075, + "train_speed(iter/s)": 0.032543 + }, + { + "epoch": 0.6718015792278255, + "grad_norm": 0.10519671440124512, + "learning_rate": 8.004937388764793e-05, + "loss": 0.36607247591018677, + "memory(GiB)": 78.33, + "step": 3467, + "token_acc": 0.892551655187736, + "train_speed(iter/s)": 0.032543 + }, + { + "epoch": 0.6719953495131522, + "grad_norm": 0.10614033788442612, + "learning_rate": 7.996434926029227e-05, + "loss": 0.36225974559783936, + "memory(GiB)": 78.33, + "step": 3468, + "token_acc": 0.8928445644517492, + "train_speed(iter/s)": 0.032544 + }, + { + "epoch": 0.672189119798479, + "grad_norm": 0.11390534043312073, + "learning_rate": 7.987935339846025e-05, + "loss": 0.3544899821281433, + "memory(GiB)": 78.33, + "step": 3469, + "token_acc": 0.895393061480967, + "train_speed(iter/s)": 0.032545 + }, + { + "epoch": 0.6723828900838057, + "grad_norm": 0.09529854357242584, + "learning_rate": 7.979438633706206e-05, + "loss": 0.3296326994895935, + "memory(GiB)": 78.33, + "step": 3470, + "token_acc": 0.8999336332128899, + "train_speed(iter/s)": 0.032546 + }, + { + "epoch": 0.6725766603691324, + "grad_norm": 0.09383497387170792, + "learning_rate": 7.97094481109959e-05, + "loss": 0.31189560890197754, + "memory(GiB)": 78.33, + "step": 3471, + "token_acc": 0.9070854413313131, + "train_speed(iter/s)": 0.032547 + }, + { + "epoch": 0.6727704306544592, + "grad_norm": 0.10056065768003464, + "learning_rate": 7.962453875514821e-05, + "loss": 0.3454468250274658, + "memory(GiB)": 78.33, + "step": 3472, + "token_acc": 0.8972385445115727, + "train_speed(iter/s)": 0.032547 + }, + { + "epoch": 0.6729642009397859, + "grad_norm": 0.09275511652231216, + "learning_rate": 7.953965830439349e-05, + "loss": 0.3189311623573303, + "memory(GiB)": 78.33, + "step": 3473, + "token_acc": 0.9043669819878191, + "train_speed(iter/s)": 0.032548 + }, + { + "epoch": 0.6731579712251127, + "grad_norm": 0.09983082860708237, + "learning_rate": 7.945480679359443e-05, + "loss": 0.33363524079322815, + "memory(GiB)": 78.33, + "step": 3474, + "token_acc": 0.901058171118153, + "train_speed(iter/s)": 0.032549 + }, + { + "epoch": 0.6733517415104394, + "grad_norm": 0.3244771361351013, + "learning_rate": 7.936998425760186e-05, + "loss": 0.3636043071746826, + "memory(GiB)": 78.33, + "step": 3475, + "token_acc": 0.8894176781990268, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.6735455117957662, + "grad_norm": 0.10669998079538345, + "learning_rate": 7.928519073125461e-05, + "loss": 0.35683298110961914, + "memory(GiB)": 78.33, + "step": 3476, + "token_acc": 0.8926932475117707, + "train_speed(iter/s)": 0.03255 + }, + { + "epoch": 0.6737392820810929, + "grad_norm": 0.09464793652296066, + "learning_rate": 7.920042624937976e-05, + "loss": 0.3355555236339569, + "memory(GiB)": 78.33, + "step": 3477, + "token_acc": 0.9, + "train_speed(iter/s)": 0.032551 + }, + { + "epoch": 0.6739330523664196, + "grad_norm": 0.10269086062908173, + "learning_rate": 7.911569084679229e-05, + "loss": 0.357306569814682, + "memory(GiB)": 78.33, + "step": 3478, + "token_acc": 0.8936895083236547, + "train_speed(iter/s)": 0.032552 + }, + { + "epoch": 0.6741268226517464, + "grad_norm": 0.09656988829374313, + "learning_rate": 7.903098455829535e-05, + "loss": 0.32650524377822876, + "memory(GiB)": 78.33, + "step": 3479, + "token_acc": 0.9009720369210662, + "train_speed(iter/s)": 0.032552 + }, + { + "epoch": 0.6743205929370731, + "grad_norm": 0.10245799273252487, + "learning_rate": 7.894630741868004e-05, + "loss": 0.3412097096443176, + "memory(GiB)": 78.33, + "step": 3480, + "token_acc": 0.8984286408553378, + "train_speed(iter/s)": 0.032553 + }, + { + "epoch": 0.6745143632223999, + "grad_norm": 0.0951901376247406, + "learning_rate": 7.88616594627255e-05, + "loss": 0.31834906339645386, + "memory(GiB)": 78.33, + "step": 3481, + "token_acc": 0.9045248313917842, + "train_speed(iter/s)": 0.032554 + }, + { + "epoch": 0.6747081335077266, + "grad_norm": 0.09806448966264725, + "learning_rate": 7.877704072519911e-05, + "loss": 0.3509221076965332, + "memory(GiB)": 78.33, + "step": 3482, + "token_acc": 0.8939818589482267, + "train_speed(iter/s)": 0.032555 + }, + { + "epoch": 0.6749019037930534, + "grad_norm": 0.10363534837961197, + "learning_rate": 7.869245124085581e-05, + "loss": 0.3548724353313446, + "memory(GiB)": 78.33, + "step": 3483, + "token_acc": 0.8933054967997143, + "train_speed(iter/s)": 0.032555 + }, + { + "epoch": 0.6750956740783801, + "grad_norm": 0.10435101389884949, + "learning_rate": 7.860789104443896e-05, + "loss": 0.36100128293037415, + "memory(GiB)": 78.33, + "step": 3484, + "token_acc": 0.8941173294457493, + "train_speed(iter/s)": 0.032556 + }, + { + "epoch": 0.6752894443637069, + "grad_norm": 0.09948544204235077, + "learning_rate": 7.852336017067964e-05, + "loss": 0.32601821422576904, + "memory(GiB)": 78.33, + "step": 3485, + "token_acc": 0.901149788196911, + "train_speed(iter/s)": 0.032557 + }, + { + "epoch": 0.6754832146490336, + "grad_norm": 0.10575611889362335, + "learning_rate": 7.843885865429693e-05, + "loss": 0.36941444873809814, + "memory(GiB)": 78.33, + "step": 3486, + "token_acc": 0.8913294131211806, + "train_speed(iter/s)": 0.032557 + }, + { + "epoch": 0.6756769849343603, + "grad_norm": 0.10058741271495819, + "learning_rate": 7.835438652999791e-05, + "loss": 0.3273415267467499, + "memory(GiB)": 78.33, + "step": 3487, + "token_acc": 0.9005714607437435, + "train_speed(iter/s)": 0.032558 + }, + { + "epoch": 0.6758707552196871, + "grad_norm": 0.09746947884559631, + "learning_rate": 7.826994383247747e-05, + "loss": 0.31902948021888733, + "memory(GiB)": 78.33, + "step": 3488, + "token_acc": 0.9043132861435046, + "train_speed(iter/s)": 0.032559 + }, + { + "epoch": 0.6760645255050138, + "grad_norm": 0.0960841104388237, + "learning_rate": 7.818553059641867e-05, + "loss": 0.32989510893821716, + "memory(GiB)": 78.33, + "step": 3489, + "token_acc": 0.9024306199803854, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.6762582957903406, + "grad_norm": 0.10085577517747879, + "learning_rate": 7.810114685649207e-05, + "loss": 0.3315393626689911, + "memory(GiB)": 78.33, + "step": 3490, + "token_acc": 0.8987733391228832, + "train_speed(iter/s)": 0.03256 + }, + { + "epoch": 0.6764520660756673, + "grad_norm": 0.1091570183634758, + "learning_rate": 7.801679264735652e-05, + "loss": 0.3644520044326782, + "memory(GiB)": 78.33, + "step": 3491, + "token_acc": 0.8908445452104651, + "train_speed(iter/s)": 0.032561 + }, + { + "epoch": 0.6766458363609941, + "grad_norm": 0.11124227195978165, + "learning_rate": 7.793246800365848e-05, + "loss": 0.36782822012901306, + "memory(GiB)": 78.33, + "step": 3492, + "token_acc": 0.8893680254071851, + "train_speed(iter/s)": 0.032562 + }, + { + "epoch": 0.6768396066463208, + "grad_norm": 0.11075468361377716, + "learning_rate": 7.784817296003237e-05, + "loss": 0.3444291353225708, + "memory(GiB)": 78.33, + "step": 3493, + "token_acc": 0.8969756577329727, + "train_speed(iter/s)": 0.032563 + }, + { + "epoch": 0.6770333769316476, + "grad_norm": 0.10527750104665756, + "learning_rate": 7.776390755110041e-05, + "loss": 0.36560899019241333, + "memory(GiB)": 78.33, + "step": 3494, + "token_acc": 0.8897428155792825, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.6772271472169743, + "grad_norm": 0.10618400573730469, + "learning_rate": 7.767967181147265e-05, + "loss": 0.35203975439071655, + "memory(GiB)": 78.33, + "step": 3495, + "token_acc": 0.8978139451084018, + "train_speed(iter/s)": 0.032564 + }, + { + "epoch": 0.677420917502301, + "grad_norm": 0.09277181327342987, + "learning_rate": 7.759546577574708e-05, + "loss": 0.3013547956943512, + "memory(GiB)": 78.33, + "step": 3496, + "token_acc": 0.9089381973172872, + "train_speed(iter/s)": 0.032565 + }, + { + "epoch": 0.6776146877876278, + "grad_norm": 0.09669741243124008, + "learning_rate": 7.751128947850921e-05, + "loss": 0.3188782334327698, + "memory(GiB)": 78.33, + "step": 3497, + "token_acc": 0.9045035268583831, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.6778084580729545, + "grad_norm": 0.09220936894416809, + "learning_rate": 7.742714295433265e-05, + "loss": 0.3000409007072449, + "memory(GiB)": 78.33, + "step": 3498, + "token_acc": 0.9094853531300161, + "train_speed(iter/s)": 0.032566 + }, + { + "epoch": 0.6780022283582813, + "grad_norm": 0.1013851910829544, + "learning_rate": 7.734302623777857e-05, + "loss": 0.35849493741989136, + "memory(GiB)": 78.33, + "step": 3499, + "token_acc": 0.8939226809538817, + "train_speed(iter/s)": 0.032567 + }, + { + "epoch": 0.678195998643608, + "grad_norm": 0.10033900290727615, + "learning_rate": 7.7258939363396e-05, + "loss": 0.3273484408855438, + "memory(GiB)": 78.33, + "step": 3500, + "token_acc": 0.902882797731569, + "train_speed(iter/s)": 0.032568 + }, + { + "epoch": 0.678195998643608, + "eval_loss": 0.3950374126434326, + "eval_runtime": 1346.3478, + "eval_samples_per_second": 5.013, + "eval_steps_per_second": 5.013, + "eval_token_acc": 0.8989412672013904, + "step": 3500 + }, + { + "epoch": 0.6783897689289348, + "grad_norm": 0.09484906494617462, + "learning_rate": 7.717488236572166e-05, + "loss": 0.3013468384742737, + "memory(GiB)": 78.33, + "step": 3501, + "token_acc": 0.9093957031354772, + "train_speed(iter/s)": 0.032166 + }, + { + "epoch": 0.6785835392142615, + "grad_norm": 0.1061227023601532, + "learning_rate": 7.709085527927994e-05, + "loss": 0.3452511131763458, + "memory(GiB)": 78.33, + "step": 3502, + "token_acc": 0.8964009033475315, + "train_speed(iter/s)": 0.032166 + }, + { + "epoch": 0.6787773094995883, + "grad_norm": 0.10096945613622665, + "learning_rate": 7.700685813858317e-05, + "loss": 0.330334335565567, + "memory(GiB)": 78.33, + "step": 3503, + "token_acc": 0.9003261023677868, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.678971079784915, + "grad_norm": 0.09609825909137726, + "learning_rate": 7.692289097813119e-05, + "loss": 0.32408520579338074, + "memory(GiB)": 78.33, + "step": 3504, + "token_acc": 0.9045769211614124, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.6791648500702417, + "grad_norm": 0.08847975730895996, + "learning_rate": 7.683895383241152e-05, + "loss": 0.3026714622974396, + "memory(GiB)": 78.33, + "step": 3505, + "token_acc": 0.9074179743223966, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.6793586203555685, + "grad_norm": 0.10310973227024078, + "learning_rate": 7.675504673589942e-05, + "loss": 0.3564712107181549, + "memory(GiB)": 78.33, + "step": 3506, + "token_acc": 0.8931326434619002, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.6795523906408952, + "grad_norm": 0.09145442396402359, + "learning_rate": 7.66711697230578e-05, + "loss": 0.29558998346328735, + "memory(GiB)": 78.33, + "step": 3507, + "token_acc": 0.9112166695232732, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.679746160926222, + "grad_norm": 0.11162301152944565, + "learning_rate": 7.65873228283372e-05, + "loss": 0.37096238136291504, + "memory(GiB)": 78.33, + "step": 3508, + "token_acc": 0.8897501419647927, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.6799399312115487, + "grad_norm": 0.09270161390304565, + "learning_rate": 7.650350608617573e-05, + "loss": 0.30103829503059387, + "memory(GiB)": 78.33, + "step": 3509, + "token_acc": 0.9094411328318193, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.6801337014968755, + "grad_norm": 0.11339203268289566, + "learning_rate": 7.641971953099932e-05, + "loss": 0.36674901843070984, + "memory(GiB)": 78.33, + "step": 3510, + "token_acc": 0.8918313357088867, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.6803274717822022, + "grad_norm": 0.11586851626634598, + "learning_rate": 7.633596319722123e-05, + "loss": 0.3334874212741852, + "memory(GiB)": 78.33, + "step": 3511, + "token_acc": 0.8987653673752917, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.680521242067529, + "grad_norm": 0.09603973478078842, + "learning_rate": 7.625223711924251e-05, + "loss": 0.3322643041610718, + "memory(GiB)": 78.33, + "step": 3512, + "token_acc": 0.9002838519764508, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.6807150123528557, + "grad_norm": 0.09271606057882309, + "learning_rate": 7.616854133145168e-05, + "loss": 0.3315150737762451, + "memory(GiB)": 78.33, + "step": 3513, + "token_acc": 0.9017941454202077, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.6809087826381824, + "grad_norm": 0.10619889944791794, + "learning_rate": 7.608487586822484e-05, + "loss": 0.3216935396194458, + "memory(GiB)": 78.33, + "step": 3514, + "token_acc": 0.9023508686672336, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.6811025529235092, + "grad_norm": 0.10402272641658783, + "learning_rate": 7.600124076392569e-05, + "loss": 0.3527657687664032, + "memory(GiB)": 78.33, + "step": 3515, + "token_acc": 0.8967574223559994, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.6812963232088359, + "grad_norm": 0.09466518461704254, + "learning_rate": 7.591763605290532e-05, + "loss": 0.33050382137298584, + "memory(GiB)": 78.33, + "step": 3516, + "token_acc": 0.9007306626354246, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.6814900934941627, + "grad_norm": 0.10368410497903824, + "learning_rate": 7.583406176950252e-05, + "loss": 0.35686179995536804, + "memory(GiB)": 78.33, + "step": 3517, + "token_acc": 0.8919963619827194, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.6816838637794894, + "grad_norm": 0.1045493334531784, + "learning_rate": 7.57505179480435e-05, + "loss": 0.34543779492378235, + "memory(GiB)": 78.33, + "step": 3518, + "token_acc": 0.89681269102023, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.6818776340648162, + "grad_norm": 0.10913045704364777, + "learning_rate": 7.56670046228419e-05, + "loss": 0.3246757388114929, + "memory(GiB)": 78.33, + "step": 3519, + "token_acc": 0.901501614470673, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.6820714043501429, + "grad_norm": 0.10240820050239563, + "learning_rate": 7.55835218281989e-05, + "loss": 0.35042083263397217, + "memory(GiB)": 78.33, + "step": 3520, + "token_acc": 0.8953667078178932, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.6822651746354697, + "grad_norm": 0.08281584084033966, + "learning_rate": 7.55000695984031e-05, + "loss": 0.28523337841033936, + "memory(GiB)": 78.33, + "step": 3521, + "token_acc": 0.9129096006250287, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.6824589449207964, + "grad_norm": 0.10515667498111725, + "learning_rate": 7.54166479677307e-05, + "loss": 0.3321034014225006, + "memory(GiB)": 78.33, + "step": 3522, + "token_acc": 0.8979174190441797, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.6826527152061231, + "grad_norm": 0.0955941304564476, + "learning_rate": 7.5333256970445e-05, + "loss": 0.3286648988723755, + "memory(GiB)": 78.33, + "step": 3523, + "token_acc": 0.9015049247537623, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.6828464854914499, + "grad_norm": 0.09993550926446915, + "learning_rate": 7.52498966407971e-05, + "loss": 0.3626023232936859, + "memory(GiB)": 78.33, + "step": 3524, + "token_acc": 0.8902110817941953, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.6830402557767766, + "grad_norm": 0.0978488177061081, + "learning_rate": 7.516656701302527e-05, + "loss": 0.3279617726802826, + "memory(GiB)": 78.33, + "step": 3525, + "token_acc": 0.9018437274393292, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.6832340260621034, + "grad_norm": 0.11416416615247726, + "learning_rate": 7.508326812135521e-05, + "loss": 0.36074215173721313, + "memory(GiB)": 78.33, + "step": 3526, + "token_acc": 0.891769873711183, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.6834277963474301, + "grad_norm": 0.10453812032938004, + "learning_rate": 7.500000000000002e-05, + "loss": 0.3612178862094879, + "memory(GiB)": 78.33, + "step": 3527, + "token_acc": 0.8947847093932073, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.6836215666327569, + "grad_norm": 0.09948738664388657, + "learning_rate": 7.49167626831601e-05, + "loss": 0.34740930795669556, + "memory(GiB)": 78.33, + "step": 3528, + "token_acc": 0.8971372214521804, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.6838153369180836, + "grad_norm": 0.09419828653335571, + "learning_rate": 7.483355620502344e-05, + "loss": 0.3287207782268524, + "memory(GiB)": 78.33, + "step": 3529, + "token_acc": 0.9001759014951627, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.6840091072034103, + "grad_norm": 0.0952015146613121, + "learning_rate": 7.475038059976492e-05, + "loss": 0.3348086476325989, + "memory(GiB)": 78.33, + "step": 3530, + "token_acc": 0.8985031033223805, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.6842028774887371, + "grad_norm": 0.09707789123058319, + "learning_rate": 7.466723590154719e-05, + "loss": 0.32328489422798157, + "memory(GiB)": 78.33, + "step": 3531, + "token_acc": 0.9013299458170223, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.6843966477740638, + "grad_norm": 0.09463740885257721, + "learning_rate": 7.458412214451992e-05, + "loss": 0.3230316638946533, + "memory(GiB)": 78.33, + "step": 3532, + "token_acc": 0.9013585495772858, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.6845904180593906, + "grad_norm": 0.09360900521278381, + "learning_rate": 7.450103936282022e-05, + "loss": 0.3111358880996704, + "memory(GiB)": 78.33, + "step": 3533, + "token_acc": 0.9069836922693888, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.6847841883447173, + "grad_norm": 0.10580860823392868, + "learning_rate": 7.441798759057238e-05, + "loss": 0.3405246138572693, + "memory(GiB)": 78.33, + "step": 3534, + "token_acc": 0.897554945054945, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.6849779586300441, + "grad_norm": 0.10613741725683212, + "learning_rate": 7.433496686188794e-05, + "loss": 0.3461315929889679, + "memory(GiB)": 78.33, + "step": 3535, + "token_acc": 0.8966271542167963, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.6851717289153708, + "grad_norm": 0.1079440712928772, + "learning_rate": 7.425197721086587e-05, + "loss": 0.3699726462364197, + "memory(GiB)": 78.33, + "step": 3536, + "token_acc": 0.8911746065279391, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.6853654992006976, + "grad_norm": 0.09260550886392593, + "learning_rate": 7.416901867159219e-05, + "loss": 0.29420557618141174, + "memory(GiB)": 78.33, + "step": 3537, + "token_acc": 0.9136413222721161, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.6855592694860243, + "grad_norm": 0.10380339622497559, + "learning_rate": 7.408609127814019e-05, + "loss": 0.346148818731308, + "memory(GiB)": 78.33, + "step": 3538, + "token_acc": 0.8991937710530675, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.685753039771351, + "grad_norm": 0.09581930190324783, + "learning_rate": 7.400319506457039e-05, + "loss": 0.3182103633880615, + "memory(GiB)": 78.33, + "step": 3539, + "token_acc": 0.9065670467951239, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.6859468100566778, + "grad_norm": 0.11510851234197617, + "learning_rate": 7.39203300649305e-05, + "loss": 0.3609999716281891, + "memory(GiB)": 78.33, + "step": 3540, + "token_acc": 0.892073988583549, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.6861405803420045, + "grad_norm": 0.11120989918708801, + "learning_rate": 7.383749631325538e-05, + "loss": 0.3553025722503662, + "memory(GiB)": 78.33, + "step": 3541, + "token_acc": 0.8961922030825022, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.6863343506273313, + "grad_norm": 0.1011735275387764, + "learning_rate": 7.375469384356705e-05, + "loss": 0.33759036660194397, + "memory(GiB)": 78.33, + "step": 3542, + "token_acc": 0.8999340949033392, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.686528120912658, + "grad_norm": 0.11077761650085449, + "learning_rate": 7.367192268987479e-05, + "loss": 0.38028085231781006, + "memory(GiB)": 78.33, + "step": 3543, + "token_acc": 0.8891689052590767, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.6867218911979848, + "grad_norm": 0.10262443870306015, + "learning_rate": 7.35891828861749e-05, + "loss": 0.35626545548439026, + "memory(GiB)": 78.33, + "step": 3544, + "token_acc": 0.8935800392486684, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.6869156614833115, + "grad_norm": 0.09497423470020294, + "learning_rate": 7.350647446645084e-05, + "loss": 0.3267231583595276, + "memory(GiB)": 78.33, + "step": 3545, + "token_acc": 0.9022584504935687, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.6871094317686383, + "grad_norm": 0.09908232092857361, + "learning_rate": 7.342379746467317e-05, + "loss": 0.3486153185367584, + "memory(GiB)": 78.33, + "step": 3546, + "token_acc": 0.8962997272399773, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.687303202053965, + "grad_norm": 0.10710590332746506, + "learning_rate": 7.334115191479958e-05, + "loss": 0.35125356912612915, + "memory(GiB)": 78.33, + "step": 3547, + "token_acc": 0.8947753459732976, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.6874969723392917, + "grad_norm": 0.09003735333681107, + "learning_rate": 7.325853785077478e-05, + "loss": 0.3048159182071686, + "memory(GiB)": 78.33, + "step": 3548, + "token_acc": 0.907744732028764, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.6876907426246185, + "grad_norm": 0.11373089253902435, + "learning_rate": 7.317595530653055e-05, + "loss": 0.3735848367214203, + "memory(GiB)": 78.33, + "step": 3549, + "token_acc": 0.8891771564011403, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.6878845129099452, + "grad_norm": 0.10183597356081009, + "learning_rate": 7.30934043159859e-05, + "loss": 0.3513960838317871, + "memory(GiB)": 78.33, + "step": 3550, + "token_acc": 0.8948632459312839, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.688078283195272, + "grad_norm": 0.10609740763902664, + "learning_rate": 7.301088491304664e-05, + "loss": 0.32522451877593994, + "memory(GiB)": 78.33, + "step": 3551, + "token_acc": 0.9038668263767314, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.6882720534805987, + "grad_norm": 0.10257891565561295, + "learning_rate": 7.292839713160572e-05, + "loss": 0.34625405073165894, + "memory(GiB)": 78.33, + "step": 3552, + "token_acc": 0.8976674404121611, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.6884658237659255, + "grad_norm": 0.09951046854257584, + "learning_rate": 7.28459410055431e-05, + "loss": 0.32491254806518555, + "memory(GiB)": 78.33, + "step": 3553, + "token_acc": 0.9030543047794716, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.6886595940512522, + "grad_norm": 0.10017047822475433, + "learning_rate": 7.276351656872567e-05, + "loss": 0.3391422927379608, + "memory(GiB)": 78.33, + "step": 3554, + "token_acc": 0.8989633042619714, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.688853364336579, + "grad_norm": 0.09386296570301056, + "learning_rate": 7.268112385500751e-05, + "loss": 0.3156377077102661, + "memory(GiB)": 78.33, + "step": 3555, + "token_acc": 0.9045845939027228, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.6890471346219057, + "grad_norm": 0.0977504625916481, + "learning_rate": 7.259876289822932e-05, + "loss": 0.2982153594493866, + "memory(GiB)": 78.33, + "step": 3556, + "token_acc": 0.9096926576077448, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.6892409049072324, + "grad_norm": 0.10796993225812912, + "learning_rate": 7.25164337322191e-05, + "loss": 0.33437347412109375, + "memory(GiB)": 78.33, + "step": 3557, + "token_acc": 0.9011005762019451, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.6894346751925592, + "grad_norm": 0.09524484723806381, + "learning_rate": 7.243413639079164e-05, + "loss": 0.3119055926799774, + "memory(GiB)": 78.33, + "step": 3558, + "token_acc": 0.9061108966936833, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.6896284454778859, + "grad_norm": 0.09212891757488251, + "learning_rate": 7.235187090774861e-05, + "loss": 0.31174278259277344, + "memory(GiB)": 78.33, + "step": 3559, + "token_acc": 0.9064973268274461, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.6898222157632127, + "grad_norm": 0.09447701275348663, + "learning_rate": 7.22696373168787e-05, + "loss": 0.3395736515522003, + "memory(GiB)": 78.33, + "step": 3560, + "token_acc": 0.899402390438247, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.6900159860485394, + "grad_norm": 0.09689807891845703, + "learning_rate": 7.218743565195736e-05, + "loss": 0.33678123354911804, + "memory(GiB)": 78.33, + "step": 3561, + "token_acc": 0.8996187046549806, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.6902097563338662, + "grad_norm": 0.10127212852239609, + "learning_rate": 7.210526594674724e-05, + "loss": 0.35242077708244324, + "memory(GiB)": 78.33, + "step": 3562, + "token_acc": 0.8943310420532657, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.6904035266191929, + "grad_norm": 0.0959462970495224, + "learning_rate": 7.202312823499738e-05, + "loss": 0.30481332540512085, + "memory(GiB)": 78.33, + "step": 3563, + "token_acc": 0.9080480651942592, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.6905972969045197, + "grad_norm": 0.09734071046113968, + "learning_rate": 7.194102255044415e-05, + "loss": 0.31086277961730957, + "memory(GiB)": 78.33, + "step": 3564, + "token_acc": 0.9077373688064437, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.6907910671898464, + "grad_norm": 0.10370344668626785, + "learning_rate": 7.185894892681048e-05, + "loss": 0.3449724316596985, + "memory(GiB)": 78.33, + "step": 3565, + "token_acc": 0.8976901609168102, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.6909848374751731, + "grad_norm": 0.10260160267353058, + "learning_rate": 7.17769073978062e-05, + "loss": 0.3641132712364197, + "memory(GiB)": 78.33, + "step": 3566, + "token_acc": 0.8912745666262711, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.6911786077604999, + "grad_norm": 0.10054480284452438, + "learning_rate": 7.169489799712799e-05, + "loss": 0.3622581660747528, + "memory(GiB)": 78.33, + "step": 3567, + "token_acc": 0.8936806148590948, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.6913723780458266, + "grad_norm": 0.0974610224366188, + "learning_rate": 7.161292075845926e-05, + "loss": 0.3504323363304138, + "memory(GiB)": 78.33, + "step": 3568, + "token_acc": 0.8961522654885441, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.6915661483311534, + "grad_norm": 0.10981731861829758, + "learning_rate": 7.153097571547038e-05, + "loss": 0.3665009140968323, + "memory(GiB)": 78.33, + "step": 3569, + "token_acc": 0.8888823146559375, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.6917599186164801, + "grad_norm": 0.10058823972940445, + "learning_rate": 7.144906290181832e-05, + "loss": 0.3594769239425659, + "memory(GiB)": 78.33, + "step": 3570, + "token_acc": 0.8944776275804255, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.6919536889018069, + "grad_norm": 0.11253203451633453, + "learning_rate": 7.136718235114686e-05, + "loss": 0.36248651146888733, + "memory(GiB)": 78.33, + "step": 3571, + "token_acc": 0.8918202731428905, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.6921474591871336, + "grad_norm": 0.11419709771871567, + "learning_rate": 7.128533409708656e-05, + "loss": 0.3548899292945862, + "memory(GiB)": 78.33, + "step": 3572, + "token_acc": 0.8954283004658169, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.6923412294724604, + "grad_norm": 0.10111390799283981, + "learning_rate": 7.120351817325469e-05, + "loss": 0.33687660098075867, + "memory(GiB)": 78.33, + "step": 3573, + "token_acc": 0.8996950958880688, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.6925349997577871, + "grad_norm": 0.10509663820266724, + "learning_rate": 7.112173461325525e-05, + "loss": 0.36578071117401123, + "memory(GiB)": 78.33, + "step": 3574, + "token_acc": 0.889935521532624, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.6927287700431138, + "grad_norm": 0.11243990808725357, + "learning_rate": 7.10399834506789e-05, + "loss": 0.3173210620880127, + "memory(GiB)": 78.33, + "step": 3575, + "token_acc": 0.9059147392523605, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.6929225403284406, + "grad_norm": 0.10139571875333786, + "learning_rate": 7.095826471910313e-05, + "loss": 0.3282420039176941, + "memory(GiB)": 78.33, + "step": 3576, + "token_acc": 0.902722545120832, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.6931163106137673, + "grad_norm": 0.0943324863910675, + "learning_rate": 7.087657845209196e-05, + "loss": 0.32712027430534363, + "memory(GiB)": 78.33, + "step": 3577, + "token_acc": 0.9013381123058543, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.6933100808990941, + "grad_norm": 0.13730654120445251, + "learning_rate": 7.079492468319618e-05, + "loss": 0.3298265337944031, + "memory(GiB)": 78.33, + "step": 3578, + "token_acc": 0.901703923714241, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.6935038511844208, + "grad_norm": 0.10465666651725769, + "learning_rate": 7.071330344595314e-05, + "loss": 0.34704989194869995, + "memory(GiB)": 78.33, + "step": 3579, + "token_acc": 0.8968518114360541, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.6936976214697476, + "grad_norm": 0.10772595554590225, + "learning_rate": 7.063171477388688e-05, + "loss": 0.35015714168548584, + "memory(GiB)": 78.33, + "step": 3580, + "token_acc": 0.894221193443329, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.6938913917550743, + "grad_norm": 0.10779722779989243, + "learning_rate": 7.055015870050809e-05, + "loss": 0.3421753942966461, + "memory(GiB)": 78.33, + "step": 3581, + "token_acc": 0.8975127263354598, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.694085162040401, + "grad_norm": 0.09361686557531357, + "learning_rate": 7.046863525931395e-05, + "loss": 0.3193609416484833, + "memory(GiB)": 78.33, + "step": 3582, + "token_acc": 0.9052085463095482, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.6942789323257279, + "grad_norm": 0.10432402044534683, + "learning_rate": 7.038714448378846e-05, + "loss": 0.3353957235813141, + "memory(GiB)": 78.33, + "step": 3583, + "token_acc": 0.8987451070688465, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.6944727026110546, + "grad_norm": 0.09659415483474731, + "learning_rate": 7.030568640740201e-05, + "loss": 0.3202442526817322, + "memory(GiB)": 78.33, + "step": 3584, + "token_acc": 0.9025401069518717, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.6946664728963814, + "grad_norm": 0.09830014407634735, + "learning_rate": 7.022426106361163e-05, + "loss": 0.31335949897766113, + "memory(GiB)": 78.33, + "step": 3585, + "token_acc": 0.906624622120296, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.6948602431817081, + "grad_norm": 0.09905708581209183, + "learning_rate": 7.014286848586088e-05, + "loss": 0.3123447895050049, + "memory(GiB)": 78.33, + "step": 3586, + "token_acc": 0.9041099546303709, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.6950540134670349, + "grad_norm": 0.09144581854343414, + "learning_rate": 7.006150870757989e-05, + "loss": 0.3135398030281067, + "memory(GiB)": 78.33, + "step": 3587, + "token_acc": 0.9057776084938225, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.6952477837523616, + "grad_norm": 0.10966409742832184, + "learning_rate": 6.99801817621853e-05, + "loss": 0.3710702657699585, + "memory(GiB)": 78.33, + "step": 3588, + "token_acc": 0.8925160936022211, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.6954415540376884, + "grad_norm": 0.09838887304067612, + "learning_rate": 6.989888768308024e-05, + "loss": 0.3230520486831665, + "memory(GiB)": 78.33, + "step": 3589, + "token_acc": 0.901048865761444, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.6956353243230151, + "grad_norm": 0.0968567356467247, + "learning_rate": 6.981762650365443e-05, + "loss": 0.34151309728622437, + "memory(GiB)": 78.33, + "step": 3590, + "token_acc": 0.8988749172733289, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.6958290946083419, + "grad_norm": 0.09684547781944275, + "learning_rate": 6.973639825728401e-05, + "loss": 0.3080379366874695, + "memory(GiB)": 78.33, + "step": 3591, + "token_acc": 0.9065901444686325, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.6960228648936686, + "grad_norm": 0.09826657176017761, + "learning_rate": 6.965520297733161e-05, + "loss": 0.3160586953163147, + "memory(GiB)": 78.33, + "step": 3592, + "token_acc": 0.9051656151419558, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.6962166351789953, + "grad_norm": 0.10795781761407852, + "learning_rate": 6.957404069714629e-05, + "loss": 0.3557586073875427, + "memory(GiB)": 78.33, + "step": 3593, + "token_acc": 0.8951461742901105, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.6964104054643221, + "grad_norm": 0.10063203424215317, + "learning_rate": 6.949291145006353e-05, + "loss": 0.31674492359161377, + "memory(GiB)": 78.33, + "step": 3594, + "token_acc": 0.9046947410840218, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.6966041757496488, + "grad_norm": 0.10855990648269653, + "learning_rate": 6.941181526940546e-05, + "loss": 0.3493427634239197, + "memory(GiB)": 78.33, + "step": 3595, + "token_acc": 0.895584936843348, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.6967979460349756, + "grad_norm": 0.09571418911218643, + "learning_rate": 6.933075218848022e-05, + "loss": 0.3080763518810272, + "memory(GiB)": 78.33, + "step": 3596, + "token_acc": 0.9050366723085603, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.6969917163203023, + "grad_norm": 0.0985865592956543, + "learning_rate": 6.924972224058278e-05, + "loss": 0.3179894983768463, + "memory(GiB)": 78.33, + "step": 3597, + "token_acc": 0.9029965847747053, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.6971854866056291, + "grad_norm": 0.10527476668357849, + "learning_rate": 6.916872545899427e-05, + "loss": 0.34273964166641235, + "memory(GiB)": 78.33, + "step": 3598, + "token_acc": 0.8985260287978188, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.6973792568909558, + "grad_norm": 0.10250242054462433, + "learning_rate": 6.908776187698222e-05, + "loss": 0.34843844175338745, + "memory(GiB)": 78.33, + "step": 3599, + "token_acc": 0.8955212090892335, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.6975730271762826, + "grad_norm": 0.10623464733362198, + "learning_rate": 6.900683152780059e-05, + "loss": 0.34149301052093506, + "memory(GiB)": 78.33, + "step": 3600, + "token_acc": 0.8972149929701756, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.6977667974616093, + "grad_norm": 0.10202284902334213, + "learning_rate": 6.892593444468954e-05, + "loss": 0.34131869673728943, + "memory(GiB)": 78.33, + "step": 3601, + "token_acc": 0.8982538616521155, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.697960567746936, + "grad_norm": 0.24693740904331207, + "learning_rate": 6.884507066087584e-05, + "loss": 0.341753214597702, + "memory(GiB)": 78.33, + "step": 3602, + "token_acc": 0.8990424814565071, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.6981543380322628, + "grad_norm": 0.11156189441680908, + "learning_rate": 6.87642402095723e-05, + "loss": 0.3889864385128021, + "memory(GiB)": 78.33, + "step": 3603, + "token_acc": 0.885625468967735, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.6983481083175895, + "grad_norm": 0.10035014897584915, + "learning_rate": 6.868344312397823e-05, + "loss": 0.3296211063861847, + "memory(GiB)": 78.33, + "step": 3604, + "token_acc": 0.9014160142449337, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.6985418786029163, + "grad_norm": 0.09430637955665588, + "learning_rate": 6.860267943727912e-05, + "loss": 0.31993281841278076, + "memory(GiB)": 78.33, + "step": 3605, + "token_acc": 0.9017905151091515, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.698735648888243, + "grad_norm": 0.09148237109184265, + "learning_rate": 6.852194918264679e-05, + "loss": 0.3246019780635834, + "memory(GiB)": 78.33, + "step": 3606, + "token_acc": 0.9024793779101373, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.6989294191735698, + "grad_norm": 0.10248222947120667, + "learning_rate": 6.844125239323933e-05, + "loss": 0.33933860063552856, + "memory(GiB)": 78.33, + "step": 3607, + "token_acc": 0.8989493117584775, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.6991231894588965, + "grad_norm": 0.09796436131000519, + "learning_rate": 6.836058910220102e-05, + "loss": 0.3447071611881256, + "memory(GiB)": 78.33, + "step": 3608, + "token_acc": 0.8983826213034023, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.6993169597442233, + "grad_norm": 0.10065195709466934, + "learning_rate": 6.827995934266259e-05, + "loss": 0.35696181654930115, + "memory(GiB)": 78.33, + "step": 3609, + "token_acc": 0.8937146709536897, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.69951073002955, + "grad_norm": 0.08694633841514587, + "learning_rate": 6.819936314774074e-05, + "loss": 0.3089888095855713, + "memory(GiB)": 78.33, + "step": 3610, + "token_acc": 0.9059485912481589, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.6997045003148767, + "grad_norm": 0.11966582387685776, + "learning_rate": 6.81188005505385e-05, + "loss": 0.3933137357234955, + "memory(GiB)": 78.33, + "step": 3611, + "token_acc": 0.8836271567891973, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.6998982706002035, + "grad_norm": 0.11115437746047974, + "learning_rate": 6.803827158414512e-05, + "loss": 0.36456286907196045, + "memory(GiB)": 78.33, + "step": 3612, + "token_acc": 0.8943506748392162, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.7000920408855302, + "grad_norm": 0.09643102437257767, + "learning_rate": 6.795777628163599e-05, + "loss": 0.3220784366130829, + "memory(GiB)": 78.33, + "step": 3613, + "token_acc": 0.9028170546008266, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.700285811170857, + "grad_norm": 0.09812808781862259, + "learning_rate": 6.78773146760727e-05, + "loss": 0.3644194006919861, + "memory(GiB)": 78.33, + "step": 3614, + "token_acc": 0.8943334712050802, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.7004795814561837, + "grad_norm": 0.08493607491254807, + "learning_rate": 6.779688680050296e-05, + "loss": 0.2795659899711609, + "memory(GiB)": 78.33, + "step": 3615, + "token_acc": 0.9142221903464104, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.7006733517415105, + "grad_norm": 0.10760082304477692, + "learning_rate": 6.771649268796073e-05, + "loss": 0.3571077883243561, + "memory(GiB)": 78.33, + "step": 3616, + "token_acc": 0.8955498133792708, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.7008671220268372, + "grad_norm": 0.10084082186222076, + "learning_rate": 6.7636132371466e-05, + "loss": 0.3274276554584503, + "memory(GiB)": 78.33, + "step": 3617, + "token_acc": 0.9002256477176956, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.701060892312164, + "grad_norm": 0.10685470700263977, + "learning_rate": 6.755580588402492e-05, + "loss": 0.34894925355911255, + "memory(GiB)": 78.33, + "step": 3618, + "token_acc": 0.8940826727066817, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.7012546625974907, + "grad_norm": 0.11531514674425125, + "learning_rate": 6.74755132586297e-05, + "loss": 0.40001946687698364, + "memory(GiB)": 78.33, + "step": 3619, + "token_acc": 0.885813818463325, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.7014484328828174, + "grad_norm": 0.1036171242594719, + "learning_rate": 6.739525452825871e-05, + "loss": 0.35469454526901245, + "memory(GiB)": 78.33, + "step": 3620, + "token_acc": 0.8936814976351638, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.7016422031681442, + "grad_norm": 0.10101023316383362, + "learning_rate": 6.731502972587637e-05, + "loss": 0.31730735301971436, + "memory(GiB)": 78.33, + "step": 3621, + "token_acc": 0.9038396016249508, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.7018359734534709, + "grad_norm": 0.10922136157751083, + "learning_rate": 6.72348388844331e-05, + "loss": 0.3624739944934845, + "memory(GiB)": 78.33, + "step": 3622, + "token_acc": 0.8922535011220635, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.7020297437387977, + "grad_norm": 0.09917715191841125, + "learning_rate": 6.715468203686553e-05, + "loss": 0.3212898373603821, + "memory(GiB)": 78.33, + "step": 3623, + "token_acc": 0.9037098911227606, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.7022235140241244, + "grad_norm": 0.1009821966290474, + "learning_rate": 6.70745592160962e-05, + "loss": 0.3310352563858032, + "memory(GiB)": 78.33, + "step": 3624, + "token_acc": 0.9000106797671811, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.7024172843094512, + "grad_norm": 0.1044231653213501, + "learning_rate": 6.699447045503368e-05, + "loss": 0.3453122079372406, + "memory(GiB)": 78.33, + "step": 3625, + "token_acc": 0.8964209722297998, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.7026110545947779, + "grad_norm": 0.10527423024177551, + "learning_rate": 6.69144157865726e-05, + "loss": 0.34135711193084717, + "memory(GiB)": 78.33, + "step": 3626, + "token_acc": 0.9015418633837328, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.7028048248801047, + "grad_norm": 0.10108046233654022, + "learning_rate": 6.683439524359351e-05, + "loss": 0.3644823431968689, + "memory(GiB)": 78.33, + "step": 3627, + "token_acc": 0.8922698922698923, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.7029985951654314, + "grad_norm": 0.10416973382234573, + "learning_rate": 6.675440885896313e-05, + "loss": 0.3607743978500366, + "memory(GiB)": 78.33, + "step": 3628, + "token_acc": 0.8915753781950965, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.7031923654507581, + "grad_norm": 0.09729216992855072, + "learning_rate": 6.66744566655338e-05, + "loss": 0.3052827715873718, + "memory(GiB)": 78.33, + "step": 3629, + "token_acc": 0.907312711319753, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.7033861357360849, + "grad_norm": 0.11203377693891525, + "learning_rate": 6.659453869614426e-05, + "loss": 0.35420122742652893, + "memory(GiB)": 78.33, + "step": 3630, + "token_acc": 0.8935774567300815, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.7035799060214116, + "grad_norm": 0.09870520234107971, + "learning_rate": 6.651465498361885e-05, + "loss": 0.34778672456741333, + "memory(GiB)": 78.33, + "step": 3631, + "token_acc": 0.8956560099647083, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.7037736763067384, + "grad_norm": 0.10209493339061737, + "learning_rate": 6.643480556076796e-05, + "loss": 0.3381112515926361, + "memory(GiB)": 78.33, + "step": 3632, + "token_acc": 0.9000748406410488, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.7039674465920651, + "grad_norm": 0.10792220383882523, + "learning_rate": 6.635499046038794e-05, + "loss": 0.3817068338394165, + "memory(GiB)": 78.33, + "step": 3633, + "token_acc": 0.8868650435828074, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.7041612168773919, + "grad_norm": 0.1084747463464737, + "learning_rate": 6.627520971526088e-05, + "loss": 0.36603009700775146, + "memory(GiB)": 78.33, + "step": 3634, + "token_acc": 0.8899790989317232, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.7043549871627186, + "grad_norm": 0.10230504721403122, + "learning_rate": 6.619546335815503e-05, + "loss": 0.3367150127887726, + "memory(GiB)": 78.33, + "step": 3635, + "token_acc": 0.898678290999525, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.7045487574480453, + "grad_norm": 0.0944862887263298, + "learning_rate": 6.61157514218243e-05, + "loss": 0.33380186557769775, + "memory(GiB)": 78.33, + "step": 3636, + "token_acc": 0.9002516989680343, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.7047425277333721, + "grad_norm": 0.1018281877040863, + "learning_rate": 6.603607393900852e-05, + "loss": 0.3347662687301636, + "memory(GiB)": 78.33, + "step": 3637, + "token_acc": 0.8994067307187483, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.7049362980186988, + "grad_norm": 0.10647895187139511, + "learning_rate": 6.595643094243335e-05, + "loss": 0.3350790739059448, + "memory(GiB)": 78.33, + "step": 3638, + "token_acc": 0.9001211906239839, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.7051300683040256, + "grad_norm": 0.10115791112184525, + "learning_rate": 6.587682246481036e-05, + "loss": 0.3523610234260559, + "memory(GiB)": 78.33, + "step": 3639, + "token_acc": 0.8971207273951843, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.7053238385893523, + "grad_norm": 0.10161978751420975, + "learning_rate": 6.579724853883684e-05, + "loss": 0.3423236012458801, + "memory(GiB)": 78.33, + "step": 3640, + "token_acc": 0.90029210201101, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.7055176088746791, + "grad_norm": 0.09845045208930969, + "learning_rate": 6.571770919719592e-05, + "loss": 0.32784488797187805, + "memory(GiB)": 78.33, + "step": 3641, + "token_acc": 0.9032830751813925, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.7057113791600058, + "grad_norm": 0.09614937752485275, + "learning_rate": 6.563820447255663e-05, + "loss": 0.33316075801849365, + "memory(GiB)": 78.33, + "step": 3642, + "token_acc": 0.8991082090500266, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.7059051494453326, + "grad_norm": 0.10282139480113983, + "learning_rate": 6.555873439757366e-05, + "loss": 0.32991480827331543, + "memory(GiB)": 78.33, + "step": 3643, + "token_acc": 0.9022564374834086, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.7060989197306593, + "grad_norm": 0.11456986516714096, + "learning_rate": 6.547929900488749e-05, + "loss": 0.36666494607925415, + "memory(GiB)": 78.33, + "step": 3644, + "token_acc": 0.889168765743073, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.706292690015986, + "grad_norm": 0.10417237877845764, + "learning_rate": 6.539989832712439e-05, + "loss": 0.36223533749580383, + "memory(GiB)": 78.33, + "step": 3645, + "token_acc": 0.8901788429112419, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.7064864603013128, + "grad_norm": 0.09668152034282684, + "learning_rate": 6.532053239689631e-05, + "loss": 0.32515013217926025, + "memory(GiB)": 78.33, + "step": 3646, + "token_acc": 0.901520462181827, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.7066802305866395, + "grad_norm": 0.10676920413970947, + "learning_rate": 6.524120124680104e-05, + "loss": 0.35744109749794006, + "memory(GiB)": 78.33, + "step": 3647, + "token_acc": 0.8961123110151188, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.7068740008719663, + "grad_norm": 0.10558585822582245, + "learning_rate": 6.51619049094219e-05, + "loss": 0.3541412353515625, + "memory(GiB)": 78.33, + "step": 3648, + "token_acc": 0.8950543561476038, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.707067771157293, + "grad_norm": 0.10470271855592728, + "learning_rate": 6.508264341732815e-05, + "loss": 0.36191800236701965, + "memory(GiB)": 78.33, + "step": 3649, + "token_acc": 0.8917385076885658, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.7072615414426198, + "grad_norm": 0.10651751607656479, + "learning_rate": 6.500341680307457e-05, + "loss": 0.34071311354637146, + "memory(GiB)": 78.33, + "step": 3650, + "token_acc": 0.8990374215877136, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.7074553117279465, + "grad_norm": 0.10043511539697647, + "learning_rate": 6.492422509920167e-05, + "loss": 0.32280540466308594, + "memory(GiB)": 78.33, + "step": 3651, + "token_acc": 0.9044051318031472, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.7076490820132733, + "grad_norm": 0.11648175865411758, + "learning_rate": 6.484506833823559e-05, + "loss": 0.36728546023368835, + "memory(GiB)": 78.33, + "step": 3652, + "token_acc": 0.890401807018102, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.7078428522986, + "grad_norm": 0.10505972057580948, + "learning_rate": 6.476594655268814e-05, + "loss": 0.3796302080154419, + "memory(GiB)": 78.33, + "step": 3653, + "token_acc": 0.8889887273321353, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.7080366225839267, + "grad_norm": 0.1008792519569397, + "learning_rate": 6.468685977505676e-05, + "loss": 0.31763315200805664, + "memory(GiB)": 78.33, + "step": 3654, + "token_acc": 0.905323467447716, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.7082303928692535, + "grad_norm": 0.09355062991380692, + "learning_rate": 6.460780803782448e-05, + "loss": 0.31916338205337524, + "memory(GiB)": 78.33, + "step": 3655, + "token_acc": 0.9061786058790678, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.7084241631545802, + "grad_norm": 0.10963544994592667, + "learning_rate": 6.452879137346007e-05, + "loss": 0.3544541895389557, + "memory(GiB)": 78.33, + "step": 3656, + "token_acc": 0.8935794884272636, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.708617933439907, + "grad_norm": 0.09566653519868851, + "learning_rate": 6.444980981441775e-05, + "loss": 0.3308408558368683, + "memory(GiB)": 78.33, + "step": 3657, + "token_acc": 0.9017979944484733, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.7088117037252337, + "grad_norm": 0.10379898548126221, + "learning_rate": 6.437086339313735e-05, + "loss": 0.32990795373916626, + "memory(GiB)": 78.33, + "step": 3658, + "token_acc": 0.8986880466472303, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.7090054740105605, + "grad_norm": 0.11237087845802307, + "learning_rate": 6.429195214204428e-05, + "loss": 0.3397267162799835, + "memory(GiB)": 78.33, + "step": 3659, + "token_acc": 0.8990950108233576, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.7091992442958872, + "grad_norm": 0.10298223793506622, + "learning_rate": 6.421307609354957e-05, + "loss": 0.32215797901153564, + "memory(GiB)": 78.33, + "step": 3660, + "token_acc": 0.9034365070024715, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.709393014581214, + "grad_norm": 0.11267991364002228, + "learning_rate": 6.413423528004968e-05, + "loss": 0.3442709445953369, + "memory(GiB)": 78.33, + "step": 3661, + "token_acc": 0.8958262848706671, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.7095867848665407, + "grad_norm": 0.09616924822330475, + "learning_rate": 6.40554297339266e-05, + "loss": 0.3290918469429016, + "memory(GiB)": 78.33, + "step": 3662, + "token_acc": 0.9001304801670146, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.7097805551518674, + "grad_norm": 0.09400229156017303, + "learning_rate": 6.3976659487548e-05, + "loss": 0.3193153440952301, + "memory(GiB)": 78.33, + "step": 3663, + "token_acc": 0.9033483612785573, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.7099743254371942, + "grad_norm": 0.0955566018819809, + "learning_rate": 6.38979245732669e-05, + "loss": 0.3172321915626526, + "memory(GiB)": 78.33, + "step": 3664, + "token_acc": 0.905316643945966, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.7101680957225209, + "grad_norm": 0.1006002202630043, + "learning_rate": 6.381922502342182e-05, + "loss": 0.31889939308166504, + "memory(GiB)": 78.33, + "step": 3665, + "token_acc": 0.9029685900544152, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.7103618660078477, + "grad_norm": 0.10536913573741913, + "learning_rate": 6.37405608703368e-05, + "loss": 0.3331918716430664, + "memory(GiB)": 78.33, + "step": 3666, + "token_acc": 0.9007023825919295, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.7105556362931744, + "grad_norm": 0.09231302887201309, + "learning_rate": 6.366193214632123e-05, + "loss": 0.32499587535858154, + "memory(GiB)": 78.33, + "step": 3667, + "token_acc": 0.903846596533229, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.7107494065785012, + "grad_norm": 0.1009543314576149, + "learning_rate": 6.35833388836702e-05, + "loss": 0.3330029845237732, + "memory(GiB)": 78.33, + "step": 3668, + "token_acc": 0.9015039132436442, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.7109431768638279, + "grad_norm": 0.10333767533302307, + "learning_rate": 6.350478111466399e-05, + "loss": 0.3277633786201477, + "memory(GiB)": 78.33, + "step": 3669, + "token_acc": 0.9013294198895028, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.7111369471491547, + "grad_norm": 0.12599217891693115, + "learning_rate": 6.342625887156839e-05, + "loss": 0.38569512963294983, + "memory(GiB)": 78.33, + "step": 3670, + "token_acc": 0.8859112900889416, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.7113307174344814, + "grad_norm": 0.10265874862670898, + "learning_rate": 6.334777218663461e-05, + "loss": 0.36148056387901306, + "memory(GiB)": 78.33, + "step": 3671, + "token_acc": 0.8917856569870046, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.7115244877198081, + "grad_norm": 0.1083201915025711, + "learning_rate": 6.326932109209922e-05, + "loss": 0.34301403164863586, + "memory(GiB)": 78.33, + "step": 3672, + "token_acc": 0.8956853082741233, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.7117182580051349, + "grad_norm": 0.11269722878932953, + "learning_rate": 6.319090562018419e-05, + "loss": 0.34771838784217834, + "memory(GiB)": 78.33, + "step": 3673, + "token_acc": 0.8975308282074455, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.7119120282904616, + "grad_norm": 0.1110716462135315, + "learning_rate": 6.311252580309682e-05, + "loss": 0.361664742231369, + "memory(GiB)": 78.33, + "step": 3674, + "token_acc": 0.8944661718966526, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.7121057985757884, + "grad_norm": 0.11600884050130844, + "learning_rate": 6.303418167302994e-05, + "loss": 0.38586270809173584, + "memory(GiB)": 78.33, + "step": 3675, + "token_acc": 0.8842967627548681, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.7122995688611151, + "grad_norm": 0.1069970428943634, + "learning_rate": 6.295587326216149e-05, + "loss": 0.34455546736717224, + "memory(GiB)": 78.33, + "step": 3676, + "token_acc": 0.898117085260072, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.7124933391464419, + "grad_norm": 0.10031075775623322, + "learning_rate": 6.287760060265485e-05, + "loss": 0.33782869577407837, + "memory(GiB)": 78.33, + "step": 3677, + "token_acc": 0.8992902208201893, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.7126871094317686, + "grad_norm": 0.10248647630214691, + "learning_rate": 6.279936372665874e-05, + "loss": 0.33945292234420776, + "memory(GiB)": 78.33, + "step": 3678, + "token_acc": 0.8977860913237329, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.7128808797170954, + "grad_norm": 0.10595156252384186, + "learning_rate": 6.27211626663071e-05, + "loss": 0.36181285977363586, + "memory(GiB)": 78.33, + "step": 3679, + "token_acc": 0.8937350415317471, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.7130746500024221, + "grad_norm": 0.09816795587539673, + "learning_rate": 6.264299745371922e-05, + "loss": 0.33072012662887573, + "memory(GiB)": 78.33, + "step": 3680, + "token_acc": 0.9027671022290545, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.7132684202877488, + "grad_norm": 0.09826884418725967, + "learning_rate": 6.256486812099961e-05, + "loss": 0.32538434863090515, + "memory(GiB)": 78.33, + "step": 3681, + "token_acc": 0.901966256501332, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.7134621905730756, + "grad_norm": 0.09893721342086792, + "learning_rate": 6.248677470023819e-05, + "loss": 0.31855225563049316, + "memory(GiB)": 78.33, + "step": 3682, + "token_acc": 0.9033751256414326, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.7136559608584023, + "grad_norm": 0.10306079685688019, + "learning_rate": 6.240871722350998e-05, + "loss": 0.3382839262485504, + "memory(GiB)": 78.33, + "step": 3683, + "token_acc": 0.8988074461896451, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.7138497311437291, + "grad_norm": 0.10051840543746948, + "learning_rate": 6.233069572287527e-05, + "loss": 0.31791266798973083, + "memory(GiB)": 78.33, + "step": 3684, + "token_acc": 0.90387971539804, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.7140435014290558, + "grad_norm": 0.11294718831777573, + "learning_rate": 6.22527102303796e-05, + "loss": 0.3825894892215729, + "memory(GiB)": 78.33, + "step": 3685, + "token_acc": 0.8891299885640559, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.7142372717143826, + "grad_norm": 0.0918063074350357, + "learning_rate": 6.217476077805369e-05, + "loss": 0.31590160727500916, + "memory(GiB)": 78.33, + "step": 3686, + "token_acc": 0.9046886822386566, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.7144310419997093, + "grad_norm": 0.09278496354818344, + "learning_rate": 6.209684739791347e-05, + "loss": 0.3343659043312073, + "memory(GiB)": 78.33, + "step": 3687, + "token_acc": 0.8985444692228377, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.714624812285036, + "grad_norm": 0.11030339449644089, + "learning_rate": 6.201897012196005e-05, + "loss": 0.33175480365753174, + "memory(GiB)": 78.33, + "step": 3688, + "token_acc": 0.9004147113390811, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.7148185825703628, + "grad_norm": 0.09746501594781876, + "learning_rate": 6.194112898217978e-05, + "loss": 0.3286242187023163, + "memory(GiB)": 78.33, + "step": 3689, + "token_acc": 0.9018423513356415, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.7150123528556895, + "grad_norm": 0.10385642945766449, + "learning_rate": 6.186332401054406e-05, + "loss": 0.3613872528076172, + "memory(GiB)": 78.33, + "step": 3690, + "token_acc": 0.891545143039286, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.7152061231410163, + "grad_norm": 0.10428830236196518, + "learning_rate": 6.17855552390095e-05, + "loss": 0.3543146252632141, + "memory(GiB)": 78.33, + "step": 3691, + "token_acc": 0.8936885245901639, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.715399893426343, + "grad_norm": 0.09830185770988464, + "learning_rate": 6.170782269951783e-05, + "loss": 0.2993859648704529, + "memory(GiB)": 78.33, + "step": 3692, + "token_acc": 0.9107328114684523, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.7155936637116698, + "grad_norm": 0.10561109334230423, + "learning_rate": 6.163012642399587e-05, + "loss": 0.3516858220100403, + "memory(GiB)": 78.33, + "step": 3693, + "token_acc": 0.8974372357638708, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.7157874339969965, + "grad_norm": 0.0973869115114212, + "learning_rate": 6.155246644435558e-05, + "loss": 0.3680126965045929, + "memory(GiB)": 78.33, + "step": 3694, + "token_acc": 0.8894600330110823, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.7159812042823233, + "grad_norm": 0.09466208517551422, + "learning_rate": 6.147484279249396e-05, + "loss": 0.32024601101875305, + "memory(GiB)": 78.33, + "step": 3695, + "token_acc": 0.9047858675212543, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.71617497456765, + "grad_norm": 0.10687924921512604, + "learning_rate": 6.13972555002932e-05, + "loss": 0.33103132247924805, + "memory(GiB)": 78.33, + "step": 3696, + "token_acc": 0.9001981178801387, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.7163687448529767, + "grad_norm": 0.09712281078100204, + "learning_rate": 6.131970459962046e-05, + "loss": 0.3220095634460449, + "memory(GiB)": 78.33, + "step": 3697, + "token_acc": 0.9038578331704998, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.7165625151383035, + "grad_norm": 0.11033158749341965, + "learning_rate": 6.124219012232798e-05, + "loss": 0.36744678020477295, + "memory(GiB)": 78.33, + "step": 3698, + "token_acc": 0.8917987391739822, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.7167562854236302, + "grad_norm": 0.09989314526319504, + "learning_rate": 6.1164712100253e-05, + "loss": 0.33150678873062134, + "memory(GiB)": 78.33, + "step": 3699, + "token_acc": 0.8993894415495552, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.716950055708957, + "grad_norm": 0.09984250366687775, + "learning_rate": 6.108727056521783e-05, + "loss": 0.30265843868255615, + "memory(GiB)": 78.33, + "step": 3700, + "token_acc": 0.9069431182649025, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.7171438259942837, + "grad_norm": 0.09416959434747696, + "learning_rate": 6.100986554902988e-05, + "loss": 0.3297763168811798, + "memory(GiB)": 78.33, + "step": 3701, + "token_acc": 0.9030869331246025, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.7173375962796105, + "grad_norm": 0.10124395787715912, + "learning_rate": 6.0932497083481404e-05, + "loss": 0.3468609154224396, + "memory(GiB)": 78.33, + "step": 3702, + "token_acc": 0.8960674907226953, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.7175313665649372, + "grad_norm": 0.10942904651165009, + "learning_rate": 6.08551652003497e-05, + "loss": 0.3687775135040283, + "memory(GiB)": 78.33, + "step": 3703, + "token_acc": 0.890260192821466, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.7177251368502641, + "grad_norm": 0.0992734357714653, + "learning_rate": 6.077786993139706e-05, + "loss": 0.34562569856643677, + "memory(GiB)": 78.33, + "step": 3704, + "token_acc": 0.8980179107439656, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.7179189071355908, + "grad_norm": 0.09762708097696304, + "learning_rate": 6.070061130837074e-05, + "loss": 0.3264696002006531, + "memory(GiB)": 78.33, + "step": 3705, + "token_acc": 0.9032848106089475, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.7181126774209176, + "grad_norm": 0.1083015501499176, + "learning_rate": 6.0623389363002925e-05, + "loss": 0.3502708375453949, + "memory(GiB)": 78.33, + "step": 3706, + "token_acc": 0.8949533057084544, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.7183064477062443, + "grad_norm": 0.1071147471666336, + "learning_rate": 6.054620412701069e-05, + "loss": 0.33522728085517883, + "memory(GiB)": 78.33, + "step": 3707, + "token_acc": 0.89790950744559, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.718500217991571, + "grad_norm": 0.08930321782827377, + "learning_rate": 6.0469055632096186e-05, + "loss": 0.2909185290336609, + "memory(GiB)": 78.33, + "step": 3708, + "token_acc": 0.9108066749253079, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.7186939882768978, + "grad_norm": 0.09592770040035248, + "learning_rate": 6.039194390994632e-05, + "loss": 0.3235243260860443, + "memory(GiB)": 78.33, + "step": 3709, + "token_acc": 0.9030587138380254, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.7188877585622245, + "grad_norm": 0.11995202302932739, + "learning_rate": 6.031486899223295e-05, + "loss": 0.39347177743911743, + "memory(GiB)": 78.33, + "step": 3710, + "token_acc": 0.8830674846625767, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.7190815288475513, + "grad_norm": 0.11326012015342712, + "learning_rate": 6.0237830910612816e-05, + "loss": 0.36819055676460266, + "memory(GiB)": 78.33, + "step": 3711, + "token_acc": 0.8888051139157516, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.719275299132878, + "grad_norm": 0.099938303232193, + "learning_rate": 6.0160829696727535e-05, + "loss": 0.3334549367427826, + "memory(GiB)": 78.33, + "step": 3712, + "token_acc": 0.9019944353433705, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.7194690694182048, + "grad_norm": 0.10359780490398407, + "learning_rate": 6.008386538220357e-05, + "loss": 0.3458460569381714, + "memory(GiB)": 78.33, + "step": 3713, + "token_acc": 0.8952391251453135, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.7196628397035315, + "grad_norm": 0.09753096848726273, + "learning_rate": 6.0006937998652174e-05, + "loss": 0.3544035255908966, + "memory(GiB)": 78.33, + "step": 3714, + "token_acc": 0.8957159256940096, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.7198566099888583, + "grad_norm": 0.10259803384542465, + "learning_rate": 5.993004757766961e-05, + "loss": 0.3598324954509735, + "memory(GiB)": 78.33, + "step": 3715, + "token_acc": 0.8931450536543698, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.720050380274185, + "grad_norm": 0.10409785062074661, + "learning_rate": 5.9853194150836776e-05, + "loss": 0.34401389956474304, + "memory(GiB)": 78.33, + "step": 3716, + "token_acc": 0.8969972797656414, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.7202441505595117, + "grad_norm": 0.10885158181190491, + "learning_rate": 5.977637774971945e-05, + "loss": 0.34587976336479187, + "memory(GiB)": 78.33, + "step": 3717, + "token_acc": 0.8954475087437647, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.7204379208448385, + "grad_norm": 0.0933261513710022, + "learning_rate": 5.9699598405868184e-05, + "loss": 0.30791398882865906, + "memory(GiB)": 78.33, + "step": 3718, + "token_acc": 0.9086631971823582, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.7206316911301652, + "grad_norm": 0.10228514671325684, + "learning_rate": 5.962285615081831e-05, + "loss": 0.32020220160484314, + "memory(GiB)": 78.33, + "step": 3719, + "token_acc": 0.9048829981248876, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.720825461415492, + "grad_norm": 0.10021097958087921, + "learning_rate": 5.9546151016089935e-05, + "loss": 0.3199518024921417, + "memory(GiB)": 78.33, + "step": 3720, + "token_acc": 0.9031611499255526, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.7210192317008187, + "grad_norm": 0.0932881161570549, + "learning_rate": 5.946948303318788e-05, + "loss": 0.30388596653938293, + "memory(GiB)": 78.33, + "step": 3721, + "token_acc": 0.9094827586206896, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.7212130019861455, + "grad_norm": 0.09003807604312897, + "learning_rate": 5.939285223360185e-05, + "loss": 0.3002552092075348, + "memory(GiB)": 78.33, + "step": 3722, + "token_acc": 0.9080838912957857, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.7214067722714722, + "grad_norm": 0.08356068283319473, + "learning_rate": 5.931625864880612e-05, + "loss": 0.2760453224182129, + "memory(GiB)": 78.33, + "step": 3723, + "token_acc": 0.9180010863661053, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.721600542556799, + "grad_norm": 0.09654924273490906, + "learning_rate": 5.9239702310259726e-05, + "loss": 0.3264361023902893, + "memory(GiB)": 78.33, + "step": 3724, + "token_acc": 0.9010648901012432, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.7217943128421257, + "grad_norm": 0.1332063376903534, + "learning_rate": 5.916318324940643e-05, + "loss": 0.3769557476043701, + "memory(GiB)": 78.33, + "step": 3725, + "token_acc": 0.8894034827713968, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.7219880831274524, + "grad_norm": 0.10913225263357162, + "learning_rate": 5.9086701497674636e-05, + "loss": 0.3281722962856293, + "memory(GiB)": 78.33, + "step": 3726, + "token_acc": 0.9037930929929353, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.7221818534127792, + "grad_norm": 0.0964721292257309, + "learning_rate": 5.9010257086477465e-05, + "loss": 0.31369680166244507, + "memory(GiB)": 78.33, + "step": 3727, + "token_acc": 0.9058114664301737, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.7223756236981059, + "grad_norm": 0.09561733901500702, + "learning_rate": 5.893385004721265e-05, + "loss": 0.31413954496383667, + "memory(GiB)": 78.33, + "step": 3728, + "token_acc": 0.9042112486339492, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.7225693939834327, + "grad_norm": 0.09478065371513367, + "learning_rate": 5.885748041126273e-05, + "loss": 0.32629698514938354, + "memory(GiB)": 78.33, + "step": 3729, + "token_acc": 0.9058655593044381, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.7227631642687594, + "grad_norm": 0.09354487806558609, + "learning_rate": 5.8781148209994684e-05, + "loss": 0.31036120653152466, + "memory(GiB)": 78.33, + "step": 3730, + "token_acc": 0.9080553710987384, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.7229569345540862, + "grad_norm": 0.10508932918310165, + "learning_rate": 5.870485347476023e-05, + "loss": 0.3312036693096161, + "memory(GiB)": 78.33, + "step": 3731, + "token_acc": 0.9002311682453766, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.7231507048394129, + "grad_norm": 0.09594292938709259, + "learning_rate": 5.862859623689564e-05, + "loss": 0.33407771587371826, + "memory(GiB)": 78.33, + "step": 3732, + "token_acc": 0.8988324898531843, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.7233444751247396, + "grad_norm": 0.09291541576385498, + "learning_rate": 5.855237652772182e-05, + "loss": 0.30694496631622314, + "memory(GiB)": 78.33, + "step": 3733, + "token_acc": 0.9065133287402618, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.7235382454100664, + "grad_norm": 0.10045888274908066, + "learning_rate": 5.847619437854425e-05, + "loss": 0.3402080535888672, + "memory(GiB)": 78.33, + "step": 3734, + "token_acc": 0.8967975233486823, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.7237320156953931, + "grad_norm": 0.10660912096500397, + "learning_rate": 5.8400049820652944e-05, + "loss": 0.36076852679252625, + "memory(GiB)": 78.33, + "step": 3735, + "token_acc": 0.8916170432607337, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.7239257859807199, + "grad_norm": 0.10676801949739456, + "learning_rate": 5.8323942885322605e-05, + "loss": 0.3120606243610382, + "memory(GiB)": 78.33, + "step": 3736, + "token_acc": 0.907629155894226, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.7241195562660466, + "grad_norm": 0.10707957297563553, + "learning_rate": 5.8247873603812364e-05, + "loss": 0.35654643177986145, + "memory(GiB)": 78.33, + "step": 3737, + "token_acc": 0.8950555588218002, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.7243133265513734, + "grad_norm": 0.11162468791007996, + "learning_rate": 5.8171842007365906e-05, + "loss": 0.3537403345108032, + "memory(GiB)": 78.33, + "step": 3738, + "token_acc": 0.895508172545164, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.7245070968367001, + "grad_norm": 0.10323680192232132, + "learning_rate": 5.809584812721145e-05, + "loss": 0.34856048226356506, + "memory(GiB)": 78.33, + "step": 3739, + "token_acc": 0.8983748949285514, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.7247008671220269, + "grad_norm": 0.1021660715341568, + "learning_rate": 5.801989199456167e-05, + "loss": 0.3216470181941986, + "memory(GiB)": 78.33, + "step": 3740, + "token_acc": 0.9034610943704413, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.7248946374073536, + "grad_norm": 0.09654397517442703, + "learning_rate": 5.794397364061391e-05, + "loss": 0.33012211322784424, + "memory(GiB)": 78.33, + "step": 3741, + "token_acc": 0.9008311162385515, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.7250884076926803, + "grad_norm": 0.11007391661405563, + "learning_rate": 5.786809309654982e-05, + "loss": 0.3086721897125244, + "memory(GiB)": 78.33, + "step": 3742, + "token_acc": 0.9068431837791199, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.7252821779780071, + "grad_norm": 0.10440944880247116, + "learning_rate": 5.7792250393535575e-05, + "loss": 0.3401702344417572, + "memory(GiB)": 78.33, + "step": 3743, + "token_acc": 0.8993664717348928, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.7254759482633338, + "grad_norm": 0.11093514412641525, + "learning_rate": 5.771644556272181e-05, + "loss": 0.3583501875400543, + "memory(GiB)": 78.33, + "step": 3744, + "token_acc": 0.8940592570311325, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.7256697185486606, + "grad_norm": 0.10289919376373291, + "learning_rate": 5.7640678635243606e-05, + "loss": 0.3268412947654724, + "memory(GiB)": 78.33, + "step": 3745, + "token_acc": 0.9035391502660698, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.7258634888339873, + "grad_norm": 0.095411017537117, + "learning_rate": 5.756494964222047e-05, + "loss": 0.32108423113822937, + "memory(GiB)": 78.33, + "step": 3746, + "token_acc": 0.9025717749632419, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.7260572591193141, + "grad_norm": 0.10212237387895584, + "learning_rate": 5.748925861475631e-05, + "loss": 0.35688862204551697, + "memory(GiB)": 78.33, + "step": 3747, + "token_acc": 0.8916946508368809, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.7262510294046408, + "grad_norm": 0.11504284292459488, + "learning_rate": 5.741360558393953e-05, + "loss": 0.3791845738887787, + "memory(GiB)": 78.33, + "step": 3748, + "token_acc": 0.8880055524007824, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.7264447996899676, + "grad_norm": 0.09927377104759216, + "learning_rate": 5.733799058084284e-05, + "loss": 0.33466947078704834, + "memory(GiB)": 78.33, + "step": 3749, + "token_acc": 0.899494293530826, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.7266385699752943, + "grad_norm": 0.09528510272502899, + "learning_rate": 5.7262413636523343e-05, + "loss": 0.3107728362083435, + "memory(GiB)": 78.33, + "step": 3750, + "token_acc": 0.9060210094798873, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.726832340260621, + "grad_norm": 0.09576904028654099, + "learning_rate": 5.718687478202252e-05, + "loss": 0.3216266930103302, + "memory(GiB)": 78.33, + "step": 3751, + "token_acc": 0.9044459023011707, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.7270261105459478, + "grad_norm": 0.10115578025579453, + "learning_rate": 5.7111374048366204e-05, + "loss": 0.3389231264591217, + "memory(GiB)": 78.33, + "step": 3752, + "token_acc": 0.8973095364944059, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.7272198808312745, + "grad_norm": 0.10029512643814087, + "learning_rate": 5.703591146656458e-05, + "loss": 0.31098711490631104, + "memory(GiB)": 78.33, + "step": 3753, + "token_acc": 0.906193576626361, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.7274136511166013, + "grad_norm": 0.10533425211906433, + "learning_rate": 5.696048706761211e-05, + "loss": 0.33651649951934814, + "memory(GiB)": 78.33, + "step": 3754, + "token_acc": 0.8975294455616202, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.727607421401928, + "grad_norm": 0.09996731579303741, + "learning_rate": 5.688510088248772e-05, + "loss": 0.31849730014801025, + "memory(GiB)": 78.33, + "step": 3755, + "token_acc": 0.9020276110740658, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.7278011916872548, + "grad_norm": 0.10525427013635635, + "learning_rate": 5.6809752942154505e-05, + "loss": 0.3365975618362427, + "memory(GiB)": 78.33, + "step": 3756, + "token_acc": 0.8993686078063035, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.7279949619725815, + "grad_norm": 0.11113554239273071, + "learning_rate": 5.673444327755986e-05, + "loss": 0.36573588848114014, + "memory(GiB)": 78.33, + "step": 3757, + "token_acc": 0.8919042258663178, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.7281887322579083, + "grad_norm": 0.11570943146944046, + "learning_rate": 5.6659171919635504e-05, + "loss": 0.3126344382762909, + "memory(GiB)": 78.33, + "step": 3758, + "token_acc": 0.9062464954581138, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.728382502543235, + "grad_norm": 0.11019326746463776, + "learning_rate": 5.6583938899297404e-05, + "loss": 0.3683561086654663, + "memory(GiB)": 78.33, + "step": 3759, + "token_acc": 0.8913200775664959, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.7285762728285617, + "grad_norm": 0.11105161160230637, + "learning_rate": 5.650874424744579e-05, + "loss": 0.34568020701408386, + "memory(GiB)": 78.33, + "step": 3760, + "token_acc": 0.898310245090577, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.7287700431138885, + "grad_norm": 0.10734663158655167, + "learning_rate": 5.643358799496508e-05, + "loss": 0.3574303090572357, + "memory(GiB)": 78.33, + "step": 3761, + "token_acc": 0.8934692112234416, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.7289638133992152, + "grad_norm": 0.1054970771074295, + "learning_rate": 5.635847017272404e-05, + "loss": 0.33780547976493835, + "memory(GiB)": 78.33, + "step": 3762, + "token_acc": 0.898160262743318, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.729157583684542, + "grad_norm": 0.10268279910087585, + "learning_rate": 5.628339081157556e-05, + "loss": 0.3292725682258606, + "memory(GiB)": 78.33, + "step": 3763, + "token_acc": 0.9022685641820487, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.7293513539698687, + "grad_norm": 0.10835665464401245, + "learning_rate": 5.620834994235673e-05, + "loss": 0.3525397479534149, + "memory(GiB)": 78.33, + "step": 3764, + "token_acc": 0.8940980485483103, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.7295451242551955, + "grad_norm": 0.09963172674179077, + "learning_rate": 5.613334759588885e-05, + "loss": 0.3277450501918793, + "memory(GiB)": 78.33, + "step": 3765, + "token_acc": 0.9008741844812914, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.7297388945405222, + "grad_norm": 0.10072033107280731, + "learning_rate": 5.605838380297742e-05, + "loss": 0.3499065041542053, + "memory(GiB)": 78.33, + "step": 3766, + "token_acc": 0.8946452543131308, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.729932664825849, + "grad_norm": 0.09813085943460464, + "learning_rate": 5.5983458594412075e-05, + "loss": 0.3212866187095642, + "memory(GiB)": 78.33, + "step": 3767, + "token_acc": 0.9034384826047267, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.7301264351111757, + "grad_norm": 0.1046958863735199, + "learning_rate": 5.5908572000966545e-05, + "loss": 0.3336329162120819, + "memory(GiB)": 78.33, + "step": 3768, + "token_acc": 0.9022138594066974, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.7303202053965024, + "grad_norm": 0.10364958643913269, + "learning_rate": 5.583372405339888e-05, + "loss": 0.3599414527416229, + "memory(GiB)": 78.33, + "step": 3769, + "token_acc": 0.8923859525040124, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.7305139756818292, + "grad_norm": 0.11301209777593613, + "learning_rate": 5.5758914782451094e-05, + "loss": 0.3293749690055847, + "memory(GiB)": 78.33, + "step": 3770, + "token_acc": 0.9023539911864785, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.7307077459671559, + "grad_norm": 0.10696696490049362, + "learning_rate": 5.5684144218849364e-05, + "loss": 0.3482256531715393, + "memory(GiB)": 78.33, + "step": 3771, + "token_acc": 0.8958611481975968, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.7309015162524827, + "grad_norm": 0.1035601943731308, + "learning_rate": 5.5609412393303983e-05, + "loss": 0.35395896434783936, + "memory(GiB)": 78.33, + "step": 3772, + "token_acc": 0.8959904183792906, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.7310952865378094, + "grad_norm": 0.10243549197912216, + "learning_rate": 5.553471933650922e-05, + "loss": 0.37264150381088257, + "memory(GiB)": 78.33, + "step": 3773, + "token_acc": 0.8881558441558441, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.7312890568231362, + "grad_norm": 0.10226655006408691, + "learning_rate": 5.546006507914369e-05, + "loss": 0.3363930284976959, + "memory(GiB)": 78.33, + "step": 3774, + "token_acc": 0.8988527079466847, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.7314828271084629, + "grad_norm": 0.11039669811725616, + "learning_rate": 5.5385449651869815e-05, + "loss": 0.34301939606666565, + "memory(GiB)": 78.33, + "step": 3775, + "token_acc": 0.8971273781025576, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.7316765973937897, + "grad_norm": 0.10423394292593002, + "learning_rate": 5.531087308533414e-05, + "loss": 0.3568193316459656, + "memory(GiB)": 78.33, + "step": 3776, + "token_acc": 0.8942443919716647, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.7318703676791164, + "grad_norm": 0.11283637583255768, + "learning_rate": 5.52363354101673e-05, + "loss": 0.36575639247894287, + "memory(GiB)": 78.33, + "step": 3777, + "token_acc": 0.8924577682607662, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.7320641379644431, + "grad_norm": 0.09643404930830002, + "learning_rate": 5.51618366569839e-05, + "loss": 0.30832773447036743, + "memory(GiB)": 78.33, + "step": 3778, + "token_acc": 0.9075776332077857, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.7322579082497699, + "grad_norm": 0.10145960003137589, + "learning_rate": 5.508737685638259e-05, + "loss": 0.32317882776260376, + "memory(GiB)": 78.33, + "step": 3779, + "token_acc": 0.9013355214758644, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.7324516785350966, + "grad_norm": 0.10353747755289078, + "learning_rate": 5.501295603894594e-05, + "loss": 0.34720805287361145, + "memory(GiB)": 78.33, + "step": 3780, + "token_acc": 0.8948657321398412, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.7326454488204234, + "grad_norm": 0.09833744168281555, + "learning_rate": 5.49385742352407e-05, + "loss": 0.33571141958236694, + "memory(GiB)": 78.33, + "step": 3781, + "token_acc": 0.9020605793832017, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.7328392191057501, + "grad_norm": 0.09903203696012497, + "learning_rate": 5.486423147581744e-05, + "loss": 0.3094678521156311, + "memory(GiB)": 78.33, + "step": 3782, + "token_acc": 0.9053752361325778, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.7330329893910769, + "grad_norm": 0.1019536554813385, + "learning_rate": 5.4789927791210694e-05, + "loss": 0.34034597873687744, + "memory(GiB)": 78.33, + "step": 3783, + "token_acc": 0.8967783128042779, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.7332267596764036, + "grad_norm": 0.10213874280452728, + "learning_rate": 5.4715663211939e-05, + "loss": 0.34146547317504883, + "memory(GiB)": 78.33, + "step": 3784, + "token_acc": 0.8989790738338316, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.7334205299617303, + "grad_norm": 0.09893448650836945, + "learning_rate": 5.4641437768504824e-05, + "loss": 0.3442443907260895, + "memory(GiB)": 78.33, + "step": 3785, + "token_acc": 0.8979535398230089, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.7336143002470571, + "grad_norm": 0.10614677518606186, + "learning_rate": 5.456725149139454e-05, + "loss": 0.34340110421180725, + "memory(GiB)": 78.33, + "step": 3786, + "token_acc": 0.895698897732694, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.7338080705323838, + "grad_norm": 0.0985707938671112, + "learning_rate": 5.449310441107838e-05, + "loss": 0.33455923199653625, + "memory(GiB)": 78.33, + "step": 3787, + "token_acc": 0.8980871664348876, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.7340018408177106, + "grad_norm": 0.1156158521771431, + "learning_rate": 5.4418996558010667e-05, + "loss": 0.3170754015445709, + "memory(GiB)": 78.33, + "step": 3788, + "token_acc": 0.9032800672834315, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.7341956111030373, + "grad_norm": 0.11739075183868408, + "learning_rate": 5.434492796262942e-05, + "loss": 0.38899776339530945, + "memory(GiB)": 78.33, + "step": 3789, + "token_acc": 0.8846523748952937, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.7343893813883641, + "grad_norm": 0.1062496230006218, + "learning_rate": 5.4270898655356625e-05, + "loss": 0.335146427154541, + "memory(GiB)": 78.33, + "step": 3790, + "token_acc": 0.8996969696969697, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.7345831516736908, + "grad_norm": 0.11981339007616043, + "learning_rate": 5.4196908666598075e-05, + "loss": 0.41245871782302856, + "memory(GiB)": 78.33, + "step": 3791, + "token_acc": 0.8796483136350534, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.7347769219590176, + "grad_norm": 0.10465206950902939, + "learning_rate": 5.412295802674348e-05, + "loss": 0.3469778895378113, + "memory(GiB)": 78.33, + "step": 3792, + "token_acc": 0.8962869653889989, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.7349706922443443, + "grad_norm": 0.0999392494559288, + "learning_rate": 5.4049046766166335e-05, + "loss": 0.3266690969467163, + "memory(GiB)": 78.33, + "step": 3793, + "token_acc": 0.9025028714907711, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.735164462529671, + "grad_norm": 0.10005468130111694, + "learning_rate": 5.397517491522393e-05, + "loss": 0.34978044033050537, + "memory(GiB)": 78.33, + "step": 3794, + "token_acc": 0.8954549081141459, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.7353582328149978, + "grad_norm": 0.11191490292549133, + "learning_rate": 5.390134250425753e-05, + "loss": 0.3621137738227844, + "memory(GiB)": 78.33, + "step": 3795, + "token_acc": 0.8916416576208416, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.7355520031003245, + "grad_norm": 0.0992647185921669, + "learning_rate": 5.382754956359204e-05, + "loss": 0.33923065662384033, + "memory(GiB)": 78.33, + "step": 3796, + "token_acc": 0.8980301274623407, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.7357457733856513, + "grad_norm": 0.10284343361854553, + "learning_rate": 5.37537961235362e-05, + "loss": 0.3275540769100189, + "memory(GiB)": 78.33, + "step": 3797, + "token_acc": 0.9016500897717233, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.735939543670978, + "grad_norm": 0.09948836266994476, + "learning_rate": 5.368008221438251e-05, + "loss": 0.3380359709262848, + "memory(GiB)": 78.33, + "step": 3798, + "token_acc": 0.8977125965043193, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.7361333139563048, + "grad_norm": 0.09476862102746964, + "learning_rate": 5.360640786640729e-05, + "loss": 0.3325924873352051, + "memory(GiB)": 78.33, + "step": 3799, + "token_acc": 0.9025462002294754, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.7363270842416315, + "grad_norm": 0.10117123275995255, + "learning_rate": 5.3532773109870544e-05, + "loss": 0.3492782711982727, + "memory(GiB)": 78.33, + "step": 3800, + "token_acc": 0.8976592977893368, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.7365208545269583, + "grad_norm": 0.11379625648260117, + "learning_rate": 5.3459177975016e-05, + "loss": 0.34902098774909973, + "memory(GiB)": 78.33, + "step": 3801, + "token_acc": 0.8949467686309791, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.736714624812285, + "grad_norm": 0.10552668571472168, + "learning_rate": 5.338562249207128e-05, + "loss": 0.35885700583457947, + "memory(GiB)": 78.33, + "step": 3802, + "token_acc": 0.8908405111749141, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.7369083950976117, + "grad_norm": 0.10673682391643524, + "learning_rate": 5.331210669124752e-05, + "loss": 0.34370338916778564, + "memory(GiB)": 78.33, + "step": 3803, + "token_acc": 0.897491594013152, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.7371021653829385, + "grad_norm": 0.09526897221803665, + "learning_rate": 5.323863060273966e-05, + "loss": 0.303422749042511, + "memory(GiB)": 78.33, + "step": 3804, + "token_acc": 0.909460807476243, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.7372959356682652, + "grad_norm": 0.08512071520090103, + "learning_rate": 5.3165194256726275e-05, + "loss": 0.2895103394985199, + "memory(GiB)": 78.33, + "step": 3805, + "token_acc": 0.9112764240057593, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.737489705953592, + "grad_norm": 0.103308767080307, + "learning_rate": 5.309179768336967e-05, + "loss": 0.34746530652046204, + "memory(GiB)": 78.33, + "step": 3806, + "token_acc": 0.8955654981027354, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.7376834762389187, + "grad_norm": 0.10745726525783539, + "learning_rate": 5.301844091281573e-05, + "loss": 0.33433717489242554, + "memory(GiB)": 78.33, + "step": 3807, + "token_acc": 0.9007903055848261, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.7378772465242455, + "grad_norm": 0.09775236248970032, + "learning_rate": 5.294512397519414e-05, + "loss": 0.3289939761161804, + "memory(GiB)": 78.33, + "step": 3808, + "token_acc": 0.9027567020738493, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.7380710168095722, + "grad_norm": 0.10481875389814377, + "learning_rate": 5.287184690061811e-05, + "loss": 0.32689011096954346, + "memory(GiB)": 78.33, + "step": 3809, + "token_acc": 0.9029928528886242, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.738264787094899, + "grad_norm": 0.1059269830584526, + "learning_rate": 5.279860971918449e-05, + "loss": 0.37165161967277527, + "memory(GiB)": 78.33, + "step": 3810, + "token_acc": 0.8881279218652295, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.7384585573802257, + "grad_norm": 0.09881359338760376, + "learning_rate": 5.272541246097376e-05, + "loss": 0.309497207403183, + "memory(GiB)": 78.33, + "step": 3811, + "token_acc": 0.9069961956668375, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.7386523276655524, + "grad_norm": 0.10563083738088608, + "learning_rate": 5.265225515605001e-05, + "loss": 0.3451386094093323, + "memory(GiB)": 78.33, + "step": 3812, + "token_acc": 0.8983088749126485, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.7388460979508792, + "grad_norm": 0.10384704172611237, + "learning_rate": 5.257913783446086e-05, + "loss": 0.35985156893730164, + "memory(GiB)": 78.33, + "step": 3813, + "token_acc": 0.8926537745954783, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.7390398682362059, + "grad_norm": 0.10183234512805939, + "learning_rate": 5.250606052623762e-05, + "loss": 0.315267413854599, + "memory(GiB)": 78.33, + "step": 3814, + "token_acc": 0.9050984876045943, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.7392336385215327, + "grad_norm": 0.10309257358312607, + "learning_rate": 5.2433023261395113e-05, + "loss": 0.34021520614624023, + "memory(GiB)": 78.33, + "step": 3815, + "token_acc": 0.8987062341073574, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.7394274088068594, + "grad_norm": 0.11030527949333191, + "learning_rate": 5.236002606993167e-05, + "loss": 0.35504722595214844, + "memory(GiB)": 78.33, + "step": 3816, + "token_acc": 0.8941146380971327, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.7396211790921862, + "grad_norm": 0.09362789988517761, + "learning_rate": 5.228706898182921e-05, + "loss": 0.3009899854660034, + "memory(GiB)": 78.33, + "step": 3817, + "token_acc": 0.909036220077704, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.7398149493775129, + "grad_norm": 0.1043652817606926, + "learning_rate": 5.221415202705316e-05, + "loss": 0.3409806489944458, + "memory(GiB)": 78.33, + "step": 3818, + "token_acc": 0.8996104618809126, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.7400087196628397, + "grad_norm": 0.10487966984510422, + "learning_rate": 5.214127523555249e-05, + "loss": 0.3638575077056885, + "memory(GiB)": 78.33, + "step": 3819, + "token_acc": 0.8908792978947954, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.7402024899481664, + "grad_norm": 0.10344404727220535, + "learning_rate": 5.206843863725959e-05, + "loss": 0.3182569742202759, + "memory(GiB)": 78.33, + "step": 3820, + "token_acc": 0.9035023713973003, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.7403962602334931, + "grad_norm": 0.09493482857942581, + "learning_rate": 5.199564226209051e-05, + "loss": 0.31004172563552856, + "memory(GiB)": 78.33, + "step": 3821, + "token_acc": 0.905801490532747, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.7405900305188199, + "grad_norm": 0.10709976404905319, + "learning_rate": 5.192288613994464e-05, + "loss": 0.35697808861732483, + "memory(GiB)": 78.33, + "step": 3822, + "token_acc": 0.8928056450375824, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.7407838008041466, + "grad_norm": 0.09773747622966766, + "learning_rate": 5.185017030070487e-05, + "loss": 0.34137359261512756, + "memory(GiB)": 78.33, + "step": 3823, + "token_acc": 0.8971325975603316, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.7409775710894734, + "grad_norm": 0.10251982510089874, + "learning_rate": 5.1777494774237534e-05, + "loss": 0.3589448034763336, + "memory(GiB)": 78.33, + "step": 3824, + "token_acc": 0.8923671206762486, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.7411713413748001, + "grad_norm": 0.09967760741710663, + "learning_rate": 5.170485959039244e-05, + "loss": 0.33076420426368713, + "memory(GiB)": 78.33, + "step": 3825, + "token_acc": 0.9000113526707157, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.741365111660127, + "grad_norm": 0.09976795315742493, + "learning_rate": 5.163226477900281e-05, + "loss": 0.33364495635032654, + "memory(GiB)": 78.33, + "step": 3826, + "token_acc": 0.9020033344801122, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.7415588819454537, + "grad_norm": 0.09653392434120178, + "learning_rate": 5.155971036988522e-05, + "loss": 0.32705751061439514, + "memory(GiB)": 78.33, + "step": 3827, + "token_acc": 0.8996669334641502, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.7417526522307805, + "grad_norm": 0.10461395233869553, + "learning_rate": 5.148719639283984e-05, + "loss": 0.35183149576187134, + "memory(GiB)": 78.33, + "step": 3828, + "token_acc": 0.8943066752210661, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.7419464225161072, + "grad_norm": 0.09616634994745255, + "learning_rate": 5.1414722877650025e-05, + "loss": 0.34106606245040894, + "memory(GiB)": 78.33, + "step": 3829, + "token_acc": 0.8989504013171434, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.742140192801434, + "grad_norm": 0.0909215584397316, + "learning_rate": 5.134228985408262e-05, + "loss": 0.3328753709793091, + "memory(GiB)": 78.33, + "step": 3830, + "token_acc": 0.8994169096209913, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.7423339630867607, + "grad_norm": 0.107168048620224, + "learning_rate": 5.126989735188782e-05, + "loss": 0.3591649532318115, + "memory(GiB)": 78.33, + "step": 3831, + "token_acc": 0.8922912205567451, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.7425277333720874, + "grad_norm": 0.10682724416255951, + "learning_rate": 5.119754540079914e-05, + "loss": 0.35992226004600525, + "memory(GiB)": 78.33, + "step": 3832, + "token_acc": 0.893421090615167, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.7427215036574142, + "grad_norm": 0.10819026827812195, + "learning_rate": 5.1125234030533494e-05, + "loss": 0.3347112536430359, + "memory(GiB)": 78.33, + "step": 3833, + "token_acc": 0.8996055354371444, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.7429152739427409, + "grad_norm": 0.09705478698015213, + "learning_rate": 5.1052963270791045e-05, + "loss": 0.33945244550704956, + "memory(GiB)": 78.33, + "step": 3834, + "token_acc": 0.89829610196889, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.7431090442280677, + "grad_norm": 0.09978172183036804, + "learning_rate": 5.098073315125545e-05, + "loss": 0.314132958650589, + "memory(GiB)": 78.33, + "step": 3835, + "token_acc": 0.9039825686962838, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.7433028145133944, + "grad_norm": 0.1035098284482956, + "learning_rate": 5.09085437015935e-05, + "loss": 0.3336668908596039, + "memory(GiB)": 78.33, + "step": 3836, + "token_acc": 0.9000965741587883, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.7434965847987212, + "grad_norm": 0.10066191852092743, + "learning_rate": 5.083639495145534e-05, + "loss": 0.3203166127204895, + "memory(GiB)": 78.33, + "step": 3837, + "token_acc": 0.9036123032904149, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.7436903550840479, + "grad_norm": 0.11868193745613098, + "learning_rate": 5.076428693047439e-05, + "loss": 0.3131275177001953, + "memory(GiB)": 78.33, + "step": 3838, + "token_acc": 0.9023334587881069, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.7438841253693746, + "grad_norm": 0.108455590903759, + "learning_rate": 5.069221966826738e-05, + "loss": 0.3502568006515503, + "memory(GiB)": 78.33, + "step": 3839, + "token_acc": 0.8950985915492958, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.7440778956547014, + "grad_norm": 0.10078644752502441, + "learning_rate": 5.06201931944342e-05, + "loss": 0.33233729004859924, + "memory(GiB)": 78.33, + "step": 3840, + "token_acc": 0.8990275860042892, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.7442716659400281, + "grad_norm": 0.10141783952713013, + "learning_rate": 5.054820753855817e-05, + "loss": 0.3332747220993042, + "memory(GiB)": 78.33, + "step": 3841, + "token_acc": 0.902451309603761, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.7444654362253549, + "grad_norm": 0.0925682932138443, + "learning_rate": 5.047626273020568e-05, + "loss": 0.30119389295578003, + "memory(GiB)": 78.33, + "step": 3842, + "token_acc": 0.9076820241130888, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.7446592065106816, + "grad_norm": 0.09351833164691925, + "learning_rate": 5.040435879892639e-05, + "loss": 0.3276900053024292, + "memory(GiB)": 78.33, + "step": 3843, + "token_acc": 0.9007225727661307, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.7448529767960084, + "grad_norm": 0.1048940047621727, + "learning_rate": 5.0332495774253165e-05, + "loss": 0.33195894956588745, + "memory(GiB)": 78.33, + "step": 3844, + "token_acc": 0.8993898230210663, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.7450467470813351, + "grad_norm": 0.09521888941526413, + "learning_rate": 5.026067368570211e-05, + "loss": 0.28278443217277527, + "memory(GiB)": 78.33, + "step": 3845, + "token_acc": 0.9150288711129947, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.7452405173666619, + "grad_norm": 0.12543639540672302, + "learning_rate": 5.018889256277241e-05, + "loss": 0.32016804814338684, + "memory(GiB)": 78.33, + "step": 3846, + "token_acc": 0.9048030793261185, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.7454342876519886, + "grad_norm": 0.09576548635959625, + "learning_rate": 5.011715243494663e-05, + "loss": 0.3216573894023895, + "memory(GiB)": 78.33, + "step": 3847, + "token_acc": 0.9055984653440355, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.7456280579373153, + "grad_norm": 0.09915050864219666, + "learning_rate": 5.004545333169028e-05, + "loss": 0.3430394232273102, + "memory(GiB)": 78.33, + "step": 3848, + "token_acc": 0.8962812609632637, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.7458218282226421, + "grad_norm": 0.09718713909387589, + "learning_rate": 4.997379528245215e-05, + "loss": 0.3420059084892273, + "memory(GiB)": 78.33, + "step": 3849, + "token_acc": 0.8961742910262117, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.7460155985079688, + "grad_norm": 0.10262469947338104, + "learning_rate": 4.990217831666409e-05, + "loss": 0.3337242007255554, + "memory(GiB)": 78.33, + "step": 3850, + "token_acc": 0.9016568394653292, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.7462093687932956, + "grad_norm": 0.09261801093816757, + "learning_rate": 4.983060246374115e-05, + "loss": 0.2943536341190338, + "memory(GiB)": 78.33, + "step": 3851, + "token_acc": 0.9127831715210356, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.7464031390786223, + "grad_norm": 0.11604174971580505, + "learning_rate": 4.9759067753081414e-05, + "loss": 0.3443485200405121, + "memory(GiB)": 78.33, + "step": 3852, + "token_acc": 0.8987715146996856, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.7465969093639491, + "grad_norm": 0.10266567766666412, + "learning_rate": 4.968757421406608e-05, + "loss": 0.3430730998516083, + "memory(GiB)": 78.33, + "step": 3853, + "token_acc": 0.897044232074782, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.7467906796492758, + "grad_norm": 0.10636512190103531, + "learning_rate": 4.961612187605958e-05, + "loss": 0.3752942383289337, + "memory(GiB)": 78.33, + "step": 3854, + "token_acc": 0.886795650331874, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.7469844499346026, + "grad_norm": 0.11186108738183975, + "learning_rate": 4.954471076840922e-05, + "loss": 0.37107086181640625, + "memory(GiB)": 78.33, + "step": 3855, + "token_acc": 0.8894728014759322, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.7471782202199293, + "grad_norm": 0.10150814801454544, + "learning_rate": 4.947334092044552e-05, + "loss": 0.3377690315246582, + "memory(GiB)": 78.33, + "step": 3856, + "token_acc": 0.8990276738967838, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.747371990505256, + "grad_norm": 0.10091419517993927, + "learning_rate": 4.9402012361481934e-05, + "loss": 0.30891650915145874, + "memory(GiB)": 78.33, + "step": 3857, + "token_acc": 0.9063477562771641, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.7475657607905828, + "grad_norm": 0.09730595350265503, + "learning_rate": 4.9330725120815054e-05, + "loss": 0.3267110586166382, + "memory(GiB)": 78.33, + "step": 3858, + "token_acc": 0.900251940973829, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.7477595310759095, + "grad_norm": 0.10822945088148117, + "learning_rate": 4.925947922772445e-05, + "loss": 0.4060097932815552, + "memory(GiB)": 78.33, + "step": 3859, + "token_acc": 0.8794587622081238, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.7479533013612363, + "grad_norm": 0.10585605353116989, + "learning_rate": 4.918827471147268e-05, + "loss": 0.338094025850296, + "memory(GiB)": 78.33, + "step": 3860, + "token_acc": 0.8971483116573201, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.748147071646563, + "grad_norm": 0.12144403904676437, + "learning_rate": 4.911711160130546e-05, + "loss": 0.36428892612457275, + "memory(GiB)": 78.33, + "step": 3861, + "token_acc": 0.8935286935286936, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.7483408419318898, + "grad_norm": 0.09023165702819824, + "learning_rate": 4.904598992645132e-05, + "loss": 0.3032684028148651, + "memory(GiB)": 78.33, + "step": 3862, + "token_acc": 0.9088954518606025, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.7485346122172165, + "grad_norm": 0.10547629743814468, + "learning_rate": 4.897490971612187e-05, + "loss": 0.33661413192749023, + "memory(GiB)": 78.33, + "step": 3863, + "token_acc": 0.8987709009233841, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.7487283825025433, + "grad_norm": 0.10550445318222046, + "learning_rate": 4.890387099951164e-05, + "loss": 0.34583577513694763, + "memory(GiB)": 78.33, + "step": 3864, + "token_acc": 0.8972740894421393, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.74892215278787, + "grad_norm": 0.09834988415241241, + "learning_rate": 4.883287380579816e-05, + "loss": 0.3329737186431885, + "memory(GiB)": 78.33, + "step": 3865, + "token_acc": 0.900711867118138, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.7491159230731967, + "grad_norm": 0.0946660041809082, + "learning_rate": 4.876191816414186e-05, + "loss": 0.310972660779953, + "memory(GiB)": 78.33, + "step": 3866, + "token_acc": 0.9068650482902636, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.7493096933585235, + "grad_norm": 0.09578848630189896, + "learning_rate": 4.869100410368609e-05, + "loss": 0.31944969296455383, + "memory(GiB)": 78.33, + "step": 3867, + "token_acc": 0.904564120614807, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.7495034636438502, + "grad_norm": 0.09447766840457916, + "learning_rate": 4.862013165355728e-05, + "loss": 0.30866914987564087, + "memory(GiB)": 78.33, + "step": 3868, + "token_acc": 0.9051225521981585, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.749697233929177, + "grad_norm": 0.09313666820526123, + "learning_rate": 4.8549300842864576e-05, + "loss": 0.32266178727149963, + "memory(GiB)": 78.33, + "step": 3869, + "token_acc": 0.90316239526255, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.7498910042145037, + "grad_norm": 0.10308795422315598, + "learning_rate": 4.84785117007001e-05, + "loss": 0.3472236096858978, + "memory(GiB)": 78.33, + "step": 3870, + "token_acc": 0.894967925873129, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.7500847744998305, + "grad_norm": 0.10628514736890793, + "learning_rate": 4.840776425613886e-05, + "loss": 0.3184339702129364, + "memory(GiB)": 78.33, + "step": 3871, + "token_acc": 0.9032356532356532, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.7502785447851572, + "grad_norm": 0.09150487929582596, + "learning_rate": 4.833705853823872e-05, + "loss": 0.3118763864040375, + "memory(GiB)": 78.33, + "step": 3872, + "token_acc": 0.9086181652950976, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.750472315070484, + "grad_norm": 0.10045349597930908, + "learning_rate": 4.826639457604039e-05, + "loss": 0.317714124917984, + "memory(GiB)": 78.33, + "step": 3873, + "token_acc": 0.9055200119029906, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.7506660853558107, + "grad_norm": 0.09495769441127777, + "learning_rate": 4.819577239856754e-05, + "loss": 0.3022938072681427, + "memory(GiB)": 78.33, + "step": 3874, + "token_acc": 0.9077078844206755, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.7508598556411374, + "grad_norm": 0.09479624778032303, + "learning_rate": 4.812519203482655e-05, + "loss": 0.3185397982597351, + "memory(GiB)": 78.33, + "step": 3875, + "token_acc": 0.9046973580441641, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.7510536259264642, + "grad_norm": 0.10061098635196686, + "learning_rate": 4.805465351380666e-05, + "loss": 0.3666335940361023, + "memory(GiB)": 78.33, + "step": 3876, + "token_acc": 0.8898552194528259, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.7512473962117909, + "grad_norm": 0.1007319763302803, + "learning_rate": 4.798415686447997e-05, + "loss": 0.3402915298938751, + "memory(GiB)": 78.33, + "step": 3877, + "token_acc": 0.8979099678456591, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.7514411664971177, + "grad_norm": 0.09538646787405014, + "learning_rate": 4.791370211580132e-05, + "loss": 0.3278900384902954, + "memory(GiB)": 78.33, + "step": 3878, + "token_acc": 0.9009969149180286, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.7516349367824444, + "grad_norm": 0.10622713714838028, + "learning_rate": 4.7843289296708384e-05, + "loss": 0.33736902475357056, + "memory(GiB)": 78.33, + "step": 3879, + "token_acc": 0.8988551338682051, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.7518287070677712, + "grad_norm": 0.10398625582456589, + "learning_rate": 4.777291843612153e-05, + "loss": 0.31338000297546387, + "memory(GiB)": 78.33, + "step": 3880, + "token_acc": 0.9074961626413558, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.7520224773530979, + "grad_norm": 0.1060367301106453, + "learning_rate": 4.770258956294408e-05, + "loss": 0.3464204668998718, + "memory(GiB)": 78.33, + "step": 3881, + "token_acc": 0.8942851343906753, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.7522162476384247, + "grad_norm": 0.11519785970449448, + "learning_rate": 4.7632302706061925e-05, + "loss": 0.3710397779941559, + "memory(GiB)": 78.33, + "step": 3882, + "token_acc": 0.8889779944825816, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.7524100179237514, + "grad_norm": 0.0994989275932312, + "learning_rate": 4.756205789434379e-05, + "loss": 0.3080540597438812, + "memory(GiB)": 78.33, + "step": 3883, + "token_acc": 0.9073027181909703, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.7526037882090781, + "grad_norm": 0.11023864895105362, + "learning_rate": 4.749185515664109e-05, + "loss": 0.3553957939147949, + "memory(GiB)": 78.33, + "step": 3884, + "token_acc": 0.8958528037383178, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.7527975584944049, + "grad_norm": 0.10181353241205215, + "learning_rate": 4.742169452178796e-05, + "loss": 0.3252992630004883, + "memory(GiB)": 78.33, + "step": 3885, + "token_acc": 0.9014353801817778, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.7529913287797316, + "grad_norm": 0.0931096225976944, + "learning_rate": 4.735157601860123e-05, + "loss": 0.3285648226737976, + "memory(GiB)": 78.33, + "step": 3886, + "token_acc": 0.9007917114651549, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.7531850990650584, + "grad_norm": 0.09871333092451096, + "learning_rate": 4.7281499675880564e-05, + "loss": 0.31424829363822937, + "memory(GiB)": 78.33, + "step": 3887, + "token_acc": 0.9050426360669545, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.7533788693503851, + "grad_norm": 0.0981900617480278, + "learning_rate": 4.7211465522408124e-05, + "loss": 0.2891225814819336, + "memory(GiB)": 78.33, + "step": 3888, + "token_acc": 0.9127433168959252, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.7535726396357119, + "grad_norm": 0.10310321301221848, + "learning_rate": 4.714147358694883e-05, + "loss": 0.341362327337265, + "memory(GiB)": 78.33, + "step": 3889, + "token_acc": 0.899036050593414, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.7537664099210386, + "grad_norm": 0.08967840671539307, + "learning_rate": 4.7071523898250246e-05, + "loss": 0.3140985369682312, + "memory(GiB)": 78.33, + "step": 3890, + "token_acc": 0.902944590528984, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.7539601802063653, + "grad_norm": 0.10281458497047424, + "learning_rate": 4.700161648504261e-05, + "loss": 0.34009885787963867, + "memory(GiB)": 78.33, + "step": 3891, + "token_acc": 0.8981371841155235, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.7541539504916921, + "grad_norm": 0.0905076265335083, + "learning_rate": 4.6931751376038735e-05, + "loss": 0.2962040305137634, + "memory(GiB)": 78.33, + "step": 3892, + "token_acc": 0.9114481536497017, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.7543477207770188, + "grad_norm": 0.11229556798934937, + "learning_rate": 4.6861928599934086e-05, + "loss": 0.38348227739334106, + "memory(GiB)": 78.33, + "step": 3893, + "token_acc": 0.8880430168388284, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.7545414910623456, + "grad_norm": 0.10800474882125854, + "learning_rate": 4.679214818540683e-05, + "loss": 0.3647887110710144, + "memory(GiB)": 78.33, + "step": 3894, + "token_acc": 0.8906179648806843, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.7547352613476723, + "grad_norm": 0.09147538244724274, + "learning_rate": 4.672241016111761e-05, + "loss": 0.32499605417251587, + "memory(GiB)": 78.33, + "step": 3895, + "token_acc": 0.9020622260544657, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.7549290316329991, + "grad_norm": 0.09846985340118408, + "learning_rate": 4.6652714555709734e-05, + "loss": 0.32619956135749817, + "memory(GiB)": 78.33, + "step": 3896, + "token_acc": 0.9018954111099443, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.7551228019183258, + "grad_norm": 0.09857229888439178, + "learning_rate": 4.658306139780902e-05, + "loss": 0.31706997752189636, + "memory(GiB)": 78.33, + "step": 3897, + "token_acc": 0.9044509455647268, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.7553165722036526, + "grad_norm": 0.09921880811452866, + "learning_rate": 4.6513450716023924e-05, + "loss": 0.32637161016464233, + "memory(GiB)": 78.33, + "step": 3898, + "token_acc": 0.9035741835947544, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.7555103424889793, + "grad_norm": 0.10863330215215683, + "learning_rate": 4.64438825389454e-05, + "loss": 0.33484184741973877, + "memory(GiB)": 78.33, + "step": 3899, + "token_acc": 0.8991859737006888, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.755704112774306, + "grad_norm": 0.10528206825256348, + "learning_rate": 4.637435689514693e-05, + "loss": 0.3199610710144043, + "memory(GiB)": 78.33, + "step": 3900, + "token_acc": 0.9030449124587668, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.7558978830596328, + "grad_norm": 0.09049303084611893, + "learning_rate": 4.630487381318466e-05, + "loss": 0.3117978572845459, + "memory(GiB)": 78.33, + "step": 3901, + "token_acc": 0.906941374967325, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.7560916533449595, + "grad_norm": 0.09448360651731491, + "learning_rate": 4.6235433321597124e-05, + "loss": 0.2956331968307495, + "memory(GiB)": 78.33, + "step": 3902, + "token_acc": 0.9114258416231102, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.7562854236302863, + "grad_norm": 0.09764683246612549, + "learning_rate": 4.616603544890537e-05, + "loss": 0.32111823558807373, + "memory(GiB)": 78.33, + "step": 3903, + "token_acc": 0.9027792510873024, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.756479193915613, + "grad_norm": 0.10489743947982788, + "learning_rate": 4.609668022361299e-05, + "loss": 0.32993102073669434, + "memory(GiB)": 78.33, + "step": 3904, + "token_acc": 0.9007670182166826, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.7566729642009398, + "grad_norm": 0.09750455617904663, + "learning_rate": 4.6027367674206034e-05, + "loss": 0.32114407420158386, + "memory(GiB)": 78.33, + "step": 3905, + "token_acc": 0.9016605685336335, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.7568667344862665, + "grad_norm": 0.10215258598327637, + "learning_rate": 4.595809782915298e-05, + "loss": 0.3243396580219269, + "memory(GiB)": 78.33, + "step": 3906, + "token_acc": 0.9038705677423164, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.7570605047715933, + "grad_norm": 0.10810741782188416, + "learning_rate": 4.58888707169049e-05, + "loss": 0.33883118629455566, + "memory(GiB)": 78.33, + "step": 3907, + "token_acc": 0.8982128982128982, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.75725427505692, + "grad_norm": 0.09572840481996536, + "learning_rate": 4.581968636589521e-05, + "loss": 0.3282477557659149, + "memory(GiB)": 78.33, + "step": 3908, + "token_acc": 0.9006182113741041, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.7574480453422467, + "grad_norm": 0.11090946942567825, + "learning_rate": 4.575054480453975e-05, + "loss": 0.3538365960121155, + "memory(GiB)": 78.33, + "step": 3909, + "token_acc": 0.8962534285800404, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.7576418156275735, + "grad_norm": 0.10753259807825089, + "learning_rate": 4.568144606123683e-05, + "loss": 0.3474193513393402, + "memory(GiB)": 78.33, + "step": 3910, + "token_acc": 0.8973380854262961, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.7578355859129002, + "grad_norm": 0.10958699882030487, + "learning_rate": 4.561239016436716e-05, + "loss": 0.3798098564147949, + "memory(GiB)": 78.33, + "step": 3911, + "token_acc": 0.8861535899641975, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.758029356198227, + "grad_norm": 0.10337840020656586, + "learning_rate": 4.5543377142293856e-05, + "loss": 0.3272544741630554, + "memory(GiB)": 78.33, + "step": 3912, + "token_acc": 0.9024021501763817, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.7582231264835537, + "grad_norm": 0.10478585213422775, + "learning_rate": 4.5474407023362374e-05, + "loss": 0.334266722202301, + "memory(GiB)": 78.33, + "step": 3913, + "token_acc": 0.8995371061669404, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.7584168967688805, + "grad_norm": 0.10417360812425613, + "learning_rate": 4.5405479835900685e-05, + "loss": 0.29992321133613586, + "memory(GiB)": 78.33, + "step": 3914, + "token_acc": 0.9098251531019792, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.7586106670542072, + "grad_norm": 0.09397785365581512, + "learning_rate": 4.533659560821898e-05, + "loss": 0.31968510150909424, + "memory(GiB)": 78.33, + "step": 3915, + "token_acc": 0.9051098506427563, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.758804437339534, + "grad_norm": 0.09900712221860886, + "learning_rate": 4.526775436860988e-05, + "loss": 0.316898375749588, + "memory(GiB)": 78.33, + "step": 3916, + "token_acc": 0.9043134151167392, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.7589982076248607, + "grad_norm": 0.08824368566274643, + "learning_rate": 4.519895614534833e-05, + "loss": 0.3089524209499359, + "memory(GiB)": 78.33, + "step": 3917, + "token_acc": 0.9057074210072986, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.7591919779101874, + "grad_norm": 0.09777352958917618, + "learning_rate": 4.513020096669161e-05, + "loss": 0.31515491008758545, + "memory(GiB)": 78.33, + "step": 3918, + "token_acc": 0.9042631075068986, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.7593857481955142, + "grad_norm": 0.10553678125143051, + "learning_rate": 4.506148886087925e-05, + "loss": 0.3489447236061096, + "memory(GiB)": 78.33, + "step": 3919, + "token_acc": 0.8933819207711297, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.7595795184808409, + "grad_norm": 0.10306905955076218, + "learning_rate": 4.4992819856133285e-05, + "loss": 0.35067906975746155, + "memory(GiB)": 78.33, + "step": 3920, + "token_acc": 0.8936492337113867, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.7597732887661677, + "grad_norm": 0.12322760373353958, + "learning_rate": 4.492419398065784e-05, + "loss": 0.42081135511398315, + "memory(GiB)": 78.33, + "step": 3921, + "token_acc": 0.8778385546162771, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.7599670590514944, + "grad_norm": 0.0956936627626419, + "learning_rate": 4.485561126263944e-05, + "loss": 0.30873122811317444, + "memory(GiB)": 78.33, + "step": 3922, + "token_acc": 0.9064496298078208, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.7601608293368212, + "grad_norm": 0.10779135674238205, + "learning_rate": 4.4787071730246834e-05, + "loss": 0.33779773116111755, + "memory(GiB)": 78.33, + "step": 3923, + "token_acc": 0.8981919669942968, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.7603545996221479, + "grad_norm": 0.094157375395298, + "learning_rate": 4.471857541163103e-05, + "loss": 0.3073067367076874, + "memory(GiB)": 78.33, + "step": 3924, + "token_acc": 0.9095217049695541, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.7605483699074747, + "grad_norm": 0.09603159129619598, + "learning_rate": 4.465012233492535e-05, + "loss": 0.31469446420669556, + "memory(GiB)": 78.33, + "step": 3925, + "token_acc": 0.9049290982825726, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.7607421401928014, + "grad_norm": 0.09002597630023956, + "learning_rate": 4.4581712528245226e-05, + "loss": 0.30716589093208313, + "memory(GiB)": 78.33, + "step": 3926, + "token_acc": 0.9074883267996704, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.7609359104781281, + "grad_norm": 0.09838228672742844, + "learning_rate": 4.4513346019688514e-05, + "loss": 0.3335438072681427, + "memory(GiB)": 78.33, + "step": 3927, + "token_acc": 0.8979319720777014, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.7611296807634549, + "grad_norm": 0.10846085846424103, + "learning_rate": 4.444502283733512e-05, + "loss": 0.3647676110267639, + "memory(GiB)": 78.33, + "step": 3928, + "token_acc": 0.8943367089359427, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.7613234510487816, + "grad_norm": 0.09758076071739197, + "learning_rate": 4.437674300924724e-05, + "loss": 0.32494428753852844, + "memory(GiB)": 78.33, + "step": 3929, + "token_acc": 0.9025488420235477, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.7615172213341084, + "grad_norm": 0.09471366554498672, + "learning_rate": 4.430850656346919e-05, + "loss": 0.31988587975502014, + "memory(GiB)": 78.33, + "step": 3930, + "token_acc": 0.9036783713916415, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.7617109916194351, + "grad_norm": 0.11201035976409912, + "learning_rate": 4.4240313528027545e-05, + "loss": 0.35305842757225037, + "memory(GiB)": 78.33, + "step": 3931, + "token_acc": 0.8956776695356096, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.7619047619047619, + "grad_norm": 0.09610579162836075, + "learning_rate": 4.417216393093102e-05, + "loss": 0.3121870160102844, + "memory(GiB)": 78.33, + "step": 3932, + "token_acc": 0.9052815243581577, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.7620985321900886, + "grad_norm": 0.09443981200456619, + "learning_rate": 4.410405780017041e-05, + "loss": 0.2963784635066986, + "memory(GiB)": 78.33, + "step": 3933, + "token_acc": 0.9106874456134799, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.7622923024754154, + "grad_norm": 0.0956021174788475, + "learning_rate": 4.403599516371884e-05, + "loss": 0.28646087646484375, + "memory(GiB)": 78.33, + "step": 3934, + "token_acc": 0.9147495102155052, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.7624860727607421, + "grad_norm": 0.08942447602748871, + "learning_rate": 4.396797604953143e-05, + "loss": 0.31425604224205017, + "memory(GiB)": 78.33, + "step": 3935, + "token_acc": 0.9043874975732867, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.7626798430460688, + "grad_norm": 0.10449796169996262, + "learning_rate": 4.3900000485545445e-05, + "loss": 0.3526667654514313, + "memory(GiB)": 78.33, + "step": 3936, + "token_acc": 0.8961538461538462, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.7628736133313956, + "grad_norm": 0.10736709088087082, + "learning_rate": 4.3832068499680276e-05, + "loss": 0.3539313077926636, + "memory(GiB)": 78.33, + "step": 3937, + "token_acc": 0.8955620144206584, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.7630673836167223, + "grad_norm": 0.10252580791711807, + "learning_rate": 4.376418011983741e-05, + "loss": 0.3328987956047058, + "memory(GiB)": 78.33, + "step": 3938, + "token_acc": 0.9002593667342903, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.7632611539020491, + "grad_norm": 0.09203853458166122, + "learning_rate": 4.369633537390041e-05, + "loss": 0.30216261744499207, + "memory(GiB)": 78.33, + "step": 3939, + "token_acc": 0.9080275516593613, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.7634549241873758, + "grad_norm": 0.10732089728116989, + "learning_rate": 4.3628534289734996e-05, + "loss": 0.3707207441329956, + "memory(GiB)": 78.33, + "step": 3940, + "token_acc": 0.8905606159184558, + "train_speed(iter/s)": 0.03248 + }, + { + "epoch": 0.7636486944727026, + "grad_norm": 0.10623017698526382, + "learning_rate": 4.3560776895188856e-05, + "loss": 0.3556334972381592, + "memory(GiB)": 78.33, + "step": 3941, + "token_acc": 0.8951635685631957, + "train_speed(iter/s)": 0.032481 + }, + { + "epoch": 0.7638424647580293, + "grad_norm": 0.10812726616859436, + "learning_rate": 4.3493063218091784e-05, + "loss": 0.3475589156150818, + "memory(GiB)": 78.33, + "step": 3942, + "token_acc": 0.8958073637165181, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.764036235043356, + "grad_norm": 0.10460913926362991, + "learning_rate": 4.342539328625559e-05, + "loss": 0.36832839250564575, + "memory(GiB)": 78.33, + "step": 3943, + "token_acc": 0.8916668762733607, + "train_speed(iter/s)": 0.032482 + }, + { + "epoch": 0.7642300053286828, + "grad_norm": 0.09330563992261887, + "learning_rate": 4.335776712747416e-05, + "loss": 0.2897856533527374, + "memory(GiB)": 78.33, + "step": 3944, + "token_acc": 0.9121778725053284, + "train_speed(iter/s)": 0.032483 + }, + { + "epoch": 0.7644237756140095, + "grad_norm": 0.1091540977358818, + "learning_rate": 4.329018476952336e-05, + "loss": 0.35247859358787537, + "memory(GiB)": 78.33, + "step": 3945, + "token_acc": 0.8945392125308443, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.7646175458993363, + "grad_norm": 0.1135362833738327, + "learning_rate": 4.3222646240161014e-05, + "loss": 0.37347984313964844, + "memory(GiB)": 78.33, + "step": 3946, + "token_acc": 0.8913823511524661, + "train_speed(iter/s)": 0.032484 + }, + { + "epoch": 0.7648113161846631, + "grad_norm": 0.09469881653785706, + "learning_rate": 4.315515156712714e-05, + "loss": 0.2968969941139221, + "memory(GiB)": 78.33, + "step": 3947, + "token_acc": 0.9084439266538344, + "train_speed(iter/s)": 0.032485 + }, + { + "epoch": 0.7650050864699899, + "grad_norm": 0.09792491793632507, + "learning_rate": 4.308770077814354e-05, + "loss": 0.33856552839279175, + "memory(GiB)": 78.33, + "step": 3948, + "token_acc": 0.8967915156396832, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.7651988567553166, + "grad_norm": 0.10359305143356323, + "learning_rate": 4.3020293900914075e-05, + "loss": 0.31331631541252136, + "memory(GiB)": 78.33, + "step": 3949, + "token_acc": 0.9054741250373916, + "train_speed(iter/s)": 0.032486 + }, + { + "epoch": 0.7653926270406434, + "grad_norm": 0.10234098881483078, + "learning_rate": 4.295293096312457e-05, + "loss": 0.3446482717990875, + "memory(GiB)": 78.33, + "step": 3950, + "token_acc": 0.8954361313351653, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.7655863973259701, + "grad_norm": 0.10561185330152512, + "learning_rate": 4.288561199244277e-05, + "loss": 0.33969467878341675, + "memory(GiB)": 78.33, + "step": 3951, + "token_acc": 0.8973197015750207, + "train_speed(iter/s)": 0.032487 + }, + { + "epoch": 0.7657801676112969, + "grad_norm": 0.10193517059087753, + "learning_rate": 4.281833701651841e-05, + "loss": 0.34807661175727844, + "memory(GiB)": 78.33, + "step": 3952, + "token_acc": 0.8959937746531784, + "train_speed(iter/s)": 0.032488 + }, + { + "epoch": 0.7659739378966236, + "grad_norm": 0.10420801490545273, + "learning_rate": 4.275110606298307e-05, + "loss": 0.34150460362434387, + "memory(GiB)": 78.33, + "step": 3953, + "token_acc": 0.8981601881311385, + "train_speed(iter/s)": 0.032489 + }, + { + "epoch": 0.7661677081819503, + "grad_norm": 0.1057552695274353, + "learning_rate": 4.26839191594504e-05, + "loss": 0.34214961528778076, + "memory(GiB)": 78.33, + "step": 3954, + "token_acc": 0.8992475734944477, + "train_speed(iter/s)": 0.032489 + }, + { + "epoch": 0.7663614784672771, + "grad_norm": 0.10361424833536148, + "learning_rate": 4.2616776333515844e-05, + "loss": 0.3399190306663513, + "memory(GiB)": 78.33, + "step": 3955, + "token_acc": 0.895477586158072, + "train_speed(iter/s)": 0.03249 + }, + { + "epoch": 0.7665552487526038, + "grad_norm": 0.10096311569213867, + "learning_rate": 4.254967761275672e-05, + "loss": 0.3292155861854553, + "memory(GiB)": 78.33, + "step": 3956, + "token_acc": 0.8995320229977269, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.7667490190379306, + "grad_norm": 0.10260067880153656, + "learning_rate": 4.2482623024732334e-05, + "loss": 0.3111547827720642, + "memory(GiB)": 78.33, + "step": 3957, + "token_acc": 0.9036062791684345, + "train_speed(iter/s)": 0.032491 + }, + { + "epoch": 0.7669427893232573, + "grad_norm": 0.10281860083341599, + "learning_rate": 4.241561259698376e-05, + "loss": 0.3256504237651825, + "memory(GiB)": 78.33, + "step": 3958, + "token_acc": 0.900353960698157, + "train_speed(iter/s)": 0.032492 + }, + { + "epoch": 0.7671365596085841, + "grad_norm": 0.1012672632932663, + "learning_rate": 4.2348646357033944e-05, + "loss": 0.3229145407676697, + "memory(GiB)": 78.33, + "step": 3959, + "token_acc": 0.902934926196083, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.7673303298939108, + "grad_norm": 0.11049380153417587, + "learning_rate": 4.228172433238783e-05, + "loss": 0.3576306700706482, + "memory(GiB)": 78.33, + "step": 3960, + "token_acc": 0.892226424530366, + "train_speed(iter/s)": 0.032493 + }, + { + "epoch": 0.7675241001792376, + "grad_norm": 0.10522151738405228, + "learning_rate": 4.2214846550532026e-05, + "loss": 0.34801146388053894, + "memory(GiB)": 78.33, + "step": 3961, + "token_acc": 0.8971957437375305, + "train_speed(iter/s)": 0.032494 + }, + { + "epoch": 0.7677178704645643, + "grad_norm": 0.10627347230911255, + "learning_rate": 4.2148013038935054e-05, + "loss": 0.34363991022109985, + "memory(GiB)": 78.33, + "step": 3962, + "token_acc": 0.8970234186911797, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.767911640749891, + "grad_norm": 0.09957989305257797, + "learning_rate": 4.2081223825047214e-05, + "loss": 0.3352537453174591, + "memory(GiB)": 78.33, + "step": 3963, + "token_acc": 0.8995469112609128, + "train_speed(iter/s)": 0.032495 + }, + { + "epoch": 0.7681054110352178, + "grad_norm": 0.10130537301301956, + "learning_rate": 4.201447893630065e-05, + "loss": 0.331424355506897, + "memory(GiB)": 78.33, + "step": 3964, + "token_acc": 0.9031016042780748, + "train_speed(iter/s)": 0.032496 + }, + { + "epoch": 0.7682991813205445, + "grad_norm": 0.08747418969869614, + "learning_rate": 4.194777840010926e-05, + "loss": 0.29280680418014526, + "memory(GiB)": 78.33, + "step": 3965, + "token_acc": 0.9117611623816294, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.7684929516058713, + "grad_norm": 0.09481216967105865, + "learning_rate": 4.1881122243868715e-05, + "loss": 0.34195125102996826, + "memory(GiB)": 78.33, + "step": 3966, + "token_acc": 0.8987415491352898, + "train_speed(iter/s)": 0.032497 + }, + { + "epoch": 0.768686721891198, + "grad_norm": 0.09969401359558105, + "learning_rate": 4.181451049495657e-05, + "loss": 0.3363822400569916, + "memory(GiB)": 78.33, + "step": 3967, + "token_acc": 0.9003247963367232, + "train_speed(iter/s)": 0.032498 + }, + { + "epoch": 0.7688804921765248, + "grad_norm": 0.10244782269001007, + "learning_rate": 4.174794318073202e-05, + "loss": 0.3243018090724945, + "memory(GiB)": 78.33, + "step": 3968, + "token_acc": 0.900588806951352, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.7690742624618515, + "grad_norm": 0.09450022131204605, + "learning_rate": 4.168142032853605e-05, + "loss": 0.3295527398586273, + "memory(GiB)": 78.33, + "step": 3969, + "token_acc": 0.9003620957213587, + "train_speed(iter/s)": 0.032499 + }, + { + "epoch": 0.7692680327471783, + "grad_norm": 0.09936364740133286, + "learning_rate": 4.16149419656914e-05, + "loss": 0.31701233983039856, + "memory(GiB)": 78.33, + "step": 3970, + "token_acc": 0.9036584700943242, + "train_speed(iter/s)": 0.0325 + }, + { + "epoch": 0.769461803032505, + "grad_norm": 0.11445298790931702, + "learning_rate": 4.15485081195025e-05, + "loss": 0.3477141261100769, + "memory(GiB)": 78.33, + "step": 3971, + "token_acc": 0.8983733184957288, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.7696555733178317, + "grad_norm": 0.09958308935165405, + "learning_rate": 4.148211881725547e-05, + "loss": 0.31291618943214417, + "memory(GiB)": 78.33, + "step": 3972, + "token_acc": 0.9050311502015601, + "train_speed(iter/s)": 0.032501 + }, + { + "epoch": 0.7698493436031585, + "grad_norm": 0.10363567620515823, + "learning_rate": 4.141577408621827e-05, + "loss": 0.33989396691322327, + "memory(GiB)": 78.33, + "step": 3973, + "token_acc": 0.8975318087789396, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.7700431138884852, + "grad_norm": 0.10342077165842056, + "learning_rate": 4.134947395364043e-05, + "loss": 0.33245235681533813, + "memory(GiB)": 78.33, + "step": 3974, + "token_acc": 0.8993934211581113, + "train_speed(iter/s)": 0.032502 + }, + { + "epoch": 0.770236884173812, + "grad_norm": 0.10198888927698135, + "learning_rate": 4.128321844675318e-05, + "loss": 0.32281598448753357, + "memory(GiB)": 78.33, + "step": 3975, + "token_acc": 0.9067534827377347, + "train_speed(iter/s)": 0.032503 + }, + { + "epoch": 0.7704306544591387, + "grad_norm": 0.09553690254688263, + "learning_rate": 4.121700759276946e-05, + "loss": 0.31830698251724243, + "memory(GiB)": 78.33, + "step": 3976, + "token_acc": 0.905037092785722, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.7706244247444655, + "grad_norm": 0.09600663185119629, + "learning_rate": 4.1150841418883845e-05, + "loss": 0.3234773874282837, + "memory(GiB)": 78.33, + "step": 3977, + "token_acc": 0.904327039295211, + "train_speed(iter/s)": 0.032504 + }, + { + "epoch": 0.7708181950297922, + "grad_norm": 0.10290994495153427, + "learning_rate": 4.1084719952272524e-05, + "loss": 0.32649847865104675, + "memory(GiB)": 78.33, + "step": 3978, + "token_acc": 0.9009803039841713, + "train_speed(iter/s)": 0.032505 + }, + { + "epoch": 0.771011965315119, + "grad_norm": 0.09713493287563324, + "learning_rate": 4.101864322009335e-05, + "loss": 0.326259970664978, + "memory(GiB)": 78.33, + "step": 3979, + "token_acc": 0.9027269351159213, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.7712057356004457, + "grad_norm": 0.1079777181148529, + "learning_rate": 4.0952611249485906e-05, + "loss": 0.33296194672584534, + "memory(GiB)": 78.33, + "step": 3980, + "token_acc": 0.9001692971437933, + "train_speed(iter/s)": 0.032506 + }, + { + "epoch": 0.7713995058857724, + "grad_norm": 0.10135926306247711, + "learning_rate": 4.0886624067571215e-05, + "loss": 0.3391050100326538, + "memory(GiB)": 78.33, + "step": 3981, + "token_acc": 0.9001635300313202, + "train_speed(iter/s)": 0.032507 + }, + { + "epoch": 0.7715932761710992, + "grad_norm": 0.09656066447496414, + "learning_rate": 4.0820681701452034e-05, + "loss": 0.3262328803539276, + "memory(GiB)": 78.33, + "step": 3982, + "token_acc": 0.9005832056541294, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.7717870464564259, + "grad_norm": 0.0958666205406189, + "learning_rate": 4.0754784178212616e-05, + "loss": 0.30787983536720276, + "memory(GiB)": 78.33, + "step": 3983, + "token_acc": 0.9089610766847405, + "train_speed(iter/s)": 0.032508 + }, + { + "epoch": 0.7719808167417527, + "grad_norm": 0.11135300248861313, + "learning_rate": 4.068893152491888e-05, + "loss": 0.3504694402217865, + "memory(GiB)": 78.33, + "step": 3984, + "token_acc": 0.8946419448868588, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.7721745870270794, + "grad_norm": 0.09396642446517944, + "learning_rate": 4.062312376861828e-05, + "loss": 0.2973524332046509, + "memory(GiB)": 78.33, + "step": 3985, + "token_acc": 0.9072842658845541, + "train_speed(iter/s)": 0.032509 + }, + { + "epoch": 0.7723683573124062, + "grad_norm": 0.10528262704610825, + "learning_rate": 4.0557360936339754e-05, + "loss": 0.3348216414451599, + "memory(GiB)": 78.33, + "step": 3986, + "token_acc": 0.8983733985893191, + "train_speed(iter/s)": 0.03251 + }, + { + "epoch": 0.7725621275977329, + "grad_norm": 0.10733997821807861, + "learning_rate": 4.049164305509398e-05, + "loss": 0.36273688077926636, + "memory(GiB)": 78.33, + "step": 3987, + "token_acc": 0.8929391901242769, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.7727558978830597, + "grad_norm": 0.09470584243535995, + "learning_rate": 4.042597015187301e-05, + "loss": 0.3208099603652954, + "memory(GiB)": 78.33, + "step": 3988, + "token_acc": 0.9038011841385144, + "train_speed(iter/s)": 0.032511 + }, + { + "epoch": 0.7729496681683864, + "grad_norm": 0.10173063725233078, + "learning_rate": 4.036034225365047e-05, + "loss": 0.33486083149909973, + "memory(GiB)": 78.33, + "step": 3989, + "token_acc": 0.9000950410912953, + "train_speed(iter/s)": 0.032512 + }, + { + "epoch": 0.7731434384537131, + "grad_norm": 0.09967856109142303, + "learning_rate": 4.029475938738149e-05, + "loss": 0.31119605898857117, + "memory(GiB)": 78.33, + "step": 3990, + "token_acc": 0.9062775816416593, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.7733372087390399, + "grad_norm": 0.10067766904830933, + "learning_rate": 4.0229221580002736e-05, + "loss": 0.31823796033859253, + "memory(GiB)": 78.33, + "step": 3991, + "token_acc": 0.9016983016983017, + "train_speed(iter/s)": 0.032513 + }, + { + "epoch": 0.7735309790243666, + "grad_norm": 0.10268165916204453, + "learning_rate": 4.016372885843228e-05, + "loss": 0.3410794138908386, + "memory(GiB)": 78.33, + "step": 3992, + "token_acc": 0.8990299823633157, + "train_speed(iter/s)": 0.032514 + }, + { + "epoch": 0.7737247493096934, + "grad_norm": 0.1047457829117775, + "learning_rate": 4.0098281249569845e-05, + "loss": 0.3615070581436157, + "memory(GiB)": 78.33, + "step": 3993, + "token_acc": 0.8931236597569693, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.7739185195950201, + "grad_norm": 0.10737626999616623, + "learning_rate": 4.0032878780296476e-05, + "loss": 0.3728790879249573, + "memory(GiB)": 78.33, + "step": 3994, + "token_acc": 0.8893093379417198, + "train_speed(iter/s)": 0.032515 + }, + { + "epoch": 0.7741122898803469, + "grad_norm": 0.10214757919311523, + "learning_rate": 3.9967521477474726e-05, + "loss": 0.34999844431877136, + "memory(GiB)": 78.33, + "step": 3995, + "token_acc": 0.8950407514850117, + "train_speed(iter/s)": 0.032516 + }, + { + "epoch": 0.7743060601656736, + "grad_norm": 0.09131192415952682, + "learning_rate": 3.990220936794859e-05, + "loss": 0.302643358707428, + "memory(GiB)": 78.33, + "step": 3996, + "token_acc": 0.9078601502709995, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.7744998304510003, + "grad_norm": 0.10554607212543488, + "learning_rate": 3.98369424785435e-05, + "loss": 0.33304092288017273, + "memory(GiB)": 78.33, + "step": 3997, + "token_acc": 0.9006529325678262, + "train_speed(iter/s)": 0.032517 + }, + { + "epoch": 0.7746936007363271, + "grad_norm": 0.0956023558974266, + "learning_rate": 3.977172083606634e-05, + "loss": 0.32441043853759766, + "memory(GiB)": 78.33, + "step": 3998, + "token_acc": 0.9003093260588937, + "train_speed(iter/s)": 0.032518 + }, + { + "epoch": 0.7748873710216538, + "grad_norm": 0.09569355845451355, + "learning_rate": 3.9706544467305316e-05, + "loss": 0.2999122738838196, + "memory(GiB)": 78.33, + "step": 3999, + "token_acc": 0.9101693571824883, + "train_speed(iter/s)": 0.032519 + }, + { + "epoch": 0.7750811413069806, + "grad_norm": 0.1037716194987297, + "learning_rate": 3.964141339903026e-05, + "loss": 0.33561694622039795, + "memory(GiB)": 78.33, + "step": 4000, + "token_acc": 0.9002222414639074, + "train_speed(iter/s)": 0.03252 + }, + { + "epoch": 0.7750811413069806, + "eval_loss": 0.38567423820495605, + "eval_runtime": 1344.6983, + "eval_samples_per_second": 5.019, + "eval_steps_per_second": 5.019, + "eval_token_acc": 0.9009321879927275, + "step": 4000 + }, + { + "epoch": 0.7752749115923073, + "grad_norm": 0.09706872701644897, + "learning_rate": 3.9576327657992144e-05, + "loss": 0.31833964586257935, + "memory(GiB)": 78.33, + "step": 4001, + "token_acc": 0.9042262932234534, + "train_speed(iter/s)": 0.032165 + }, + { + "epoch": 0.7754686818776341, + "grad_norm": 0.10196474194526672, + "learning_rate": 3.951128727092346e-05, + "loss": 0.3550868630409241, + "memory(GiB)": 78.33, + "step": 4002, + "token_acc": 0.8949007501286321, + "train_speed(iter/s)": 0.032166 + }, + { + "epoch": 0.7756624521629608, + "grad_norm": 0.09914345294237137, + "learning_rate": 3.9446292264538046e-05, + "loss": 0.33100420236587524, + "memory(GiB)": 78.33, + "step": 4003, + "token_acc": 0.900188852592895, + "train_speed(iter/s)": 0.032167 + }, + { + "epoch": 0.7758562224482876, + "grad_norm": 0.11188572645187378, + "learning_rate": 3.93813426655311e-05, + "loss": 0.35677969455718994, + "memory(GiB)": 78.33, + "step": 4004, + "token_acc": 0.8938544116760148, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.7760499927336143, + "grad_norm": 0.10255058854818344, + "learning_rate": 3.9316438500579103e-05, + "loss": 0.32014256715774536, + "memory(GiB)": 78.33, + "step": 4005, + "token_acc": 0.9037872184890489, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.776243763018941, + "grad_norm": 0.10134012997150421, + "learning_rate": 3.925157979634005e-05, + "loss": 0.32840850949287415, + "memory(GiB)": 78.33, + "step": 4006, + "token_acc": 0.9019783197831979, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.7764375333042678, + "grad_norm": 0.10212652385234833, + "learning_rate": 3.918676657945308e-05, + "loss": 0.3236843943595886, + "memory(GiB)": 78.33, + "step": 4007, + "token_acc": 0.9032230290158711, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.7766313035895945, + "grad_norm": 0.09699150919914246, + "learning_rate": 3.9121998876538775e-05, + "loss": 0.3039090037345886, + "memory(GiB)": 78.33, + "step": 4008, + "token_acc": 0.9065003465003465, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.7768250738749213, + "grad_norm": 0.10348644107580185, + "learning_rate": 3.905727671419891e-05, + "loss": 0.36719679832458496, + "memory(GiB)": 78.33, + "step": 4009, + "token_acc": 0.8896081591671731, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.777018844160248, + "grad_norm": 0.09221908450126648, + "learning_rate": 3.899260011901666e-05, + "loss": 0.2919962406158447, + "memory(GiB)": 78.33, + "step": 4010, + "token_acc": 0.9112894709050586, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.7772126144455748, + "grad_norm": 0.10690614581108093, + "learning_rate": 3.892796911755642e-05, + "loss": 0.3488962650299072, + "memory(GiB)": 78.33, + "step": 4011, + "token_acc": 0.8949435444280806, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.7774063847309015, + "grad_norm": 0.11350135505199432, + "learning_rate": 3.886338373636385e-05, + "loss": 0.36426064372062683, + "memory(GiB)": 78.33, + "step": 4012, + "token_acc": 0.892725139337049, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.7776001550162283, + "grad_norm": 0.10248113423585892, + "learning_rate": 3.8798844001965976e-05, + "loss": 0.3667606711387634, + "memory(GiB)": 78.33, + "step": 4013, + "token_acc": 0.8912181013079423, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.777793925301555, + "grad_norm": 0.09691166877746582, + "learning_rate": 3.873434994087095e-05, + "loss": 0.3389664888381958, + "memory(GiB)": 78.33, + "step": 4014, + "token_acc": 0.8992298909214586, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.7779876955868817, + "grad_norm": 0.09862842410802841, + "learning_rate": 3.866990157956823e-05, + "loss": 0.35453882813453674, + "memory(GiB)": 78.33, + "step": 4015, + "token_acc": 0.8935185185185185, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.7781814658722085, + "grad_norm": 0.11863162368535995, + "learning_rate": 3.86054989445285e-05, + "loss": 0.3620249032974243, + "memory(GiB)": 78.33, + "step": 4016, + "token_acc": 0.8918080939947781, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.7783752361575352, + "grad_norm": 0.10086744278669357, + "learning_rate": 3.854114206220364e-05, + "loss": 0.3244988024234772, + "memory(GiB)": 78.33, + "step": 4017, + "token_acc": 0.9012764208850194, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.778569006442862, + "grad_norm": 0.12051405757665634, + "learning_rate": 3.8476830959026735e-05, + "loss": 0.38493812084198, + "memory(GiB)": 78.33, + "step": 4018, + "token_acc": 0.8884372177055104, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.7787627767281887, + "grad_norm": 0.10893973708152771, + "learning_rate": 3.8412565661412056e-05, + "loss": 0.35063496232032776, + "memory(GiB)": 78.33, + "step": 4019, + "token_acc": 0.8946015424164524, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.7789565470135155, + "grad_norm": 0.09621429443359375, + "learning_rate": 3.834834619575519e-05, + "loss": 0.3024739623069763, + "memory(GiB)": 78.33, + "step": 4020, + "token_acc": 0.9086043745798645, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.7791503172988422, + "grad_norm": 0.09711124747991562, + "learning_rate": 3.8284172588432716e-05, + "loss": 0.32116764783859253, + "memory(GiB)": 78.33, + "step": 4021, + "token_acc": 0.9038179359553471, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.779344087584169, + "grad_norm": 0.10393381118774414, + "learning_rate": 3.822004486580251e-05, + "loss": 0.32372620701789856, + "memory(GiB)": 78.33, + "step": 4022, + "token_acc": 0.9042860117188661, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.7795378578694957, + "grad_norm": 0.1017252653837204, + "learning_rate": 3.815596305420349e-05, + "loss": 0.353562593460083, + "memory(GiB)": 78.33, + "step": 4023, + "token_acc": 0.8964561961709364, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.7797316281548224, + "grad_norm": 0.09764081239700317, + "learning_rate": 3.809192717995584e-05, + "loss": 0.3372447192668915, + "memory(GiB)": 78.33, + "step": 4024, + "token_acc": 0.9003608608738545, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.7799253984401492, + "grad_norm": 0.1030493900179863, + "learning_rate": 3.8027937269360757e-05, + "loss": 0.32105734944343567, + "memory(GiB)": 78.33, + "step": 4025, + "token_acc": 0.9018271999087826, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.7801191687254759, + "grad_norm": 0.10457610338926315, + "learning_rate": 3.796399334870061e-05, + "loss": 0.3225201964378357, + "memory(GiB)": 78.33, + "step": 4026, + "token_acc": 0.9034423897581793, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.7803129390108027, + "grad_norm": 0.10154607892036438, + "learning_rate": 3.7900095444238965e-05, + "loss": 0.3304111063480377, + "memory(GiB)": 78.33, + "step": 4027, + "token_acc": 0.9023520164459762, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.7805067092961294, + "grad_norm": 0.09185813367366791, + "learning_rate": 3.783624358222036e-05, + "loss": 0.29117369651794434, + "memory(GiB)": 78.33, + "step": 4028, + "token_acc": 0.9135486512770641, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.7807004795814562, + "grad_norm": 0.10176742821931839, + "learning_rate": 3.777243778887047e-05, + "loss": 0.33322426676750183, + "memory(GiB)": 78.33, + "step": 4029, + "token_acc": 0.8997748686733928, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.7808942498667829, + "grad_norm": 0.10486084967851639, + "learning_rate": 3.770867809039604e-05, + "loss": 0.3594956398010254, + "memory(GiB)": 78.33, + "step": 4030, + "token_acc": 0.8941469133331427, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.7810880201521097, + "grad_norm": 0.09931764006614685, + "learning_rate": 3.764496451298492e-05, + "loss": 0.30685773491859436, + "memory(GiB)": 78.33, + "step": 4031, + "token_acc": 0.9082772375966093, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.7812817904374364, + "grad_norm": 0.1024843156337738, + "learning_rate": 3.758129708280593e-05, + "loss": 0.3311740458011627, + "memory(GiB)": 78.33, + "step": 4032, + "token_acc": 0.8998150183641189, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.7814755607227631, + "grad_norm": 0.10810218751430511, + "learning_rate": 3.751767582600908e-05, + "loss": 0.3451961576938629, + "memory(GiB)": 78.33, + "step": 4033, + "token_acc": 0.8986039894484128, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.7816693310080899, + "grad_norm": 0.11089378595352173, + "learning_rate": 3.745410076872528e-05, + "loss": 0.3630661964416504, + "memory(GiB)": 78.33, + "step": 4034, + "token_acc": 0.8932108218478816, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.7818631012934166, + "grad_norm": 0.09974395483732224, + "learning_rate": 3.739057193706651e-05, + "loss": 0.3139222264289856, + "memory(GiB)": 78.33, + "step": 4035, + "token_acc": 0.9046637100646875, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.7820568715787434, + "grad_norm": 0.10395882278680801, + "learning_rate": 3.7327089357125794e-05, + "loss": 0.3373940885066986, + "memory(GiB)": 78.33, + "step": 4036, + "token_acc": 0.9000392222782541, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.7822506418640701, + "grad_norm": 0.09552538394927979, + "learning_rate": 3.7263653054977106e-05, + "loss": 0.3279712200164795, + "memory(GiB)": 78.33, + "step": 4037, + "token_acc": 0.9012946233238103, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.7824444121493969, + "grad_norm": 0.09712161868810654, + "learning_rate": 3.7200263056675424e-05, + "loss": 0.3257930278778076, + "memory(GiB)": 78.33, + "step": 4038, + "token_acc": 0.9013559672911707, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.7826381824347236, + "grad_norm": 0.09889403730630875, + "learning_rate": 3.713691938825677e-05, + "loss": 0.30284392833709717, + "memory(GiB)": 78.33, + "step": 4039, + "token_acc": 0.9077128339058109, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.7828319527200504, + "grad_norm": 0.11424127966165543, + "learning_rate": 3.7073622075738085e-05, + "loss": 0.340787798166275, + "memory(GiB)": 78.33, + "step": 4040, + "token_acc": 0.8969184444871549, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.7830257230053771, + "grad_norm": 0.1108805313706398, + "learning_rate": 3.701037114511727e-05, + "loss": 0.3701401948928833, + "memory(GiB)": 78.33, + "step": 4041, + "token_acc": 0.8902578623930594, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.7832194932907038, + "grad_norm": 0.10004852712154388, + "learning_rate": 3.694716662237317e-05, + "loss": 0.3417167067527771, + "memory(GiB)": 78.33, + "step": 4042, + "token_acc": 0.8959614939126406, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.7834132635760306, + "grad_norm": 0.09257663786411285, + "learning_rate": 3.6884008533465575e-05, + "loss": 0.2933918833732605, + "memory(GiB)": 78.33, + "step": 4043, + "token_acc": 0.9114590016681637, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.7836070338613573, + "grad_norm": 0.10794571042060852, + "learning_rate": 3.682089690433522e-05, + "loss": 0.34912464022636414, + "memory(GiB)": 78.33, + "step": 4044, + "token_acc": 0.8977625199506128, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.7838008041466841, + "grad_norm": 0.09921461343765259, + "learning_rate": 3.675783176090373e-05, + "loss": 0.3479025363922119, + "memory(GiB)": 78.33, + "step": 4045, + "token_acc": 0.8996169970498421, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.7839945744320108, + "grad_norm": 0.091604083776474, + "learning_rate": 3.669481312907369e-05, + "loss": 0.3121592104434967, + "memory(GiB)": 78.33, + "step": 4046, + "token_acc": 0.9041123022455497, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.7841883447173376, + "grad_norm": 0.09730125963687897, + "learning_rate": 3.663184103472852e-05, + "loss": 0.35410815477371216, + "memory(GiB)": 78.33, + "step": 4047, + "token_acc": 0.8955283408408409, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.7843821150026643, + "grad_norm": 0.09932620078325272, + "learning_rate": 3.6568915503732577e-05, + "loss": 0.3147258162498474, + "memory(GiB)": 78.33, + "step": 4048, + "token_acc": 0.9058458354888774, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.784575885287991, + "grad_norm": 0.09612176567316055, + "learning_rate": 3.650603656193105e-05, + "loss": 0.3254881203174591, + "memory(GiB)": 78.33, + "step": 4049, + "token_acc": 0.902174750301692, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.7847696555733178, + "grad_norm": 0.08921913802623749, + "learning_rate": 3.6443204235149995e-05, + "loss": 0.30229154229164124, + "memory(GiB)": 78.33, + "step": 4050, + "token_acc": 0.9090634861704102, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.7849634258586445, + "grad_norm": 0.09413807839155197, + "learning_rate": 3.638041854919634e-05, + "loss": 0.32846495509147644, + "memory(GiB)": 78.33, + "step": 4051, + "token_acc": 0.901771947119638, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.7851571961439713, + "grad_norm": 0.09915482252836227, + "learning_rate": 3.6317679529857844e-05, + "loss": 0.32399147748947144, + "memory(GiB)": 78.33, + "step": 4052, + "token_acc": 0.903708087800108, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.785350966429298, + "grad_norm": 0.10495869070291519, + "learning_rate": 3.625498720290315e-05, + "loss": 0.3514251112937927, + "memory(GiB)": 78.33, + "step": 4053, + "token_acc": 0.8951106706309878, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.7855447367146248, + "grad_norm": 0.10493289679288864, + "learning_rate": 3.619234159408168e-05, + "loss": 0.34921297430992126, + "memory(GiB)": 78.33, + "step": 4054, + "token_acc": 0.8932814420319541, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.7857385069999515, + "grad_norm": 0.09735289216041565, + "learning_rate": 3.6129742729123625e-05, + "loss": 0.30153387784957886, + "memory(GiB)": 78.33, + "step": 4055, + "token_acc": 0.9098055563826937, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.7859322772852783, + "grad_norm": 0.09159845113754272, + "learning_rate": 3.606719063374006e-05, + "loss": 0.30557680130004883, + "memory(GiB)": 78.33, + "step": 4056, + "token_acc": 0.907328966162636, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.786126047570605, + "grad_norm": 0.0924990177154541, + "learning_rate": 3.600468533362279e-05, + "loss": 0.2820108234882355, + "memory(GiB)": 78.33, + "step": 4057, + "token_acc": 0.9154507656948067, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.7863198178559317, + "grad_norm": 0.1056913435459137, + "learning_rate": 3.594222685444441e-05, + "loss": 0.33108171820640564, + "memory(GiB)": 78.33, + "step": 4058, + "token_acc": 0.8996641845524894, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.7865135881412585, + "grad_norm": 0.11276807636022568, + "learning_rate": 3.587981522185829e-05, + "loss": 0.31702518463134766, + "memory(GiB)": 78.33, + "step": 4059, + "token_acc": 0.9057384014048604, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.7867073584265852, + "grad_norm": 0.09739526361227036, + "learning_rate": 3.5817450461498634e-05, + "loss": 0.3203843832015991, + "memory(GiB)": 78.33, + "step": 4060, + "token_acc": 0.9029685310230603, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.786901128711912, + "grad_norm": 0.09344157576560974, + "learning_rate": 3.575513259898027e-05, + "loss": 0.31456148624420166, + "memory(GiB)": 78.33, + "step": 4061, + "token_acc": 0.9028077753779697, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.7870948989972387, + "grad_norm": 0.09507586807012558, + "learning_rate": 3.569286165989881e-05, + "loss": 0.326376736164093, + "memory(GiB)": 78.33, + "step": 4062, + "token_acc": 0.900634411067798, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.7872886692825655, + "grad_norm": 0.10197113454341888, + "learning_rate": 3.5630637669830645e-05, + "loss": 0.3019747734069824, + "memory(GiB)": 78.33, + "step": 4063, + "token_acc": 0.9101308469795503, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.7874824395678922, + "grad_norm": 0.10532142966985703, + "learning_rate": 3.556846065433279e-05, + "loss": 0.33741429448127747, + "memory(GiB)": 78.33, + "step": 4064, + "token_acc": 0.900261802212651, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.787676209853219, + "grad_norm": 0.10591472685337067, + "learning_rate": 3.550633063894301e-05, + "loss": 0.3443160951137543, + "memory(GiB)": 78.33, + "step": 4065, + "token_acc": 0.8973382917044889, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.7878699801385457, + "grad_norm": 0.10201407968997955, + "learning_rate": 3.544424764917983e-05, + "loss": 0.33403322100639343, + "memory(GiB)": 78.33, + "step": 4066, + "token_acc": 0.8988485568234625, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.7880637504238724, + "grad_norm": 0.09689588844776154, + "learning_rate": 3.538221171054239e-05, + "loss": 0.3143858015537262, + "memory(GiB)": 78.33, + "step": 4067, + "token_acc": 0.9059065757302792, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.7882575207091993, + "grad_norm": 0.09144670516252518, + "learning_rate": 3.532022284851048e-05, + "loss": 0.3066175878047943, + "memory(GiB)": 78.33, + "step": 4068, + "token_acc": 0.9066947038975011, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.788451290994526, + "grad_norm": 0.09848953783512115, + "learning_rate": 3.525828108854464e-05, + "loss": 0.3331592381000519, + "memory(GiB)": 78.33, + "step": 4069, + "token_acc": 0.9009377322888673, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.7886450612798528, + "grad_norm": 0.11075198650360107, + "learning_rate": 3.519638645608596e-05, + "loss": 0.37963664531707764, + "memory(GiB)": 78.33, + "step": 4070, + "token_acc": 0.8859161767085604, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.7888388315651795, + "grad_norm": 0.09600254893302917, + "learning_rate": 3.513453897655622e-05, + "loss": 0.30960047245025635, + "memory(GiB)": 78.33, + "step": 4071, + "token_acc": 0.9076073849781613, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.7890326018505063, + "grad_norm": 0.10882295668125153, + "learning_rate": 3.507273867535793e-05, + "loss": 0.35110461711883545, + "memory(GiB)": 78.33, + "step": 4072, + "token_acc": 0.8942715048811936, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.789226372135833, + "grad_norm": 0.10820236057043076, + "learning_rate": 3.5010985577874066e-05, + "loss": 0.34106817841529846, + "memory(GiB)": 78.33, + "step": 4073, + "token_acc": 0.8956751190629425, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.7894201424211598, + "grad_norm": 0.11212671548128128, + "learning_rate": 3.494927970946831e-05, + "loss": 0.3203110694885254, + "memory(GiB)": 78.33, + "step": 4074, + "token_acc": 0.9028558826772521, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.7896139127064865, + "grad_norm": 0.09849805384874344, + "learning_rate": 3.4887621095484905e-05, + "loss": 0.3194783627986908, + "memory(GiB)": 78.33, + "step": 4075, + "token_acc": 0.9021505376344086, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.7898076829918133, + "grad_norm": 0.10881268978118896, + "learning_rate": 3.482600976124871e-05, + "loss": 0.33694878220558167, + "memory(GiB)": 78.33, + "step": 4076, + "token_acc": 0.8999273783587509, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.79000145327714, + "grad_norm": 0.08902069926261902, + "learning_rate": 3.476444573206515e-05, + "loss": 0.28375521302223206, + "memory(GiB)": 78.33, + "step": 4077, + "token_acc": 0.9137976536071591, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.7901952235624667, + "grad_norm": 0.11011825501918793, + "learning_rate": 3.4702929033220174e-05, + "loss": 0.36214667558670044, + "memory(GiB)": 78.33, + "step": 4078, + "token_acc": 0.8909107047754306, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.7903889938477935, + "grad_norm": 0.09892592579126358, + "learning_rate": 3.464145968998045e-05, + "loss": 0.3139350414276123, + "memory(GiB)": 78.33, + "step": 4079, + "token_acc": 0.904188324670132, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.7905827641331202, + "grad_norm": 0.09487756341695786, + "learning_rate": 3.4580037727593033e-05, + "loss": 0.32229354977607727, + "memory(GiB)": 78.33, + "step": 4080, + "token_acc": 0.9033888688296281, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.790776534418447, + "grad_norm": 0.09870006889104843, + "learning_rate": 3.4518663171285563e-05, + "loss": 0.3076024651527405, + "memory(GiB)": 78.33, + "step": 4081, + "token_acc": 0.9079937641113859, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.7909703047037737, + "grad_norm": 0.08845412731170654, + "learning_rate": 3.445733604626626e-05, + "loss": 0.3001633286476135, + "memory(GiB)": 78.33, + "step": 4082, + "token_acc": 0.9098186843296562, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.7911640749891005, + "grad_norm": 0.10530709475278854, + "learning_rate": 3.4396056377723766e-05, + "loss": 0.32672393321990967, + "memory(GiB)": 78.33, + "step": 4083, + "token_acc": 0.9015249886758266, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.7913578452744272, + "grad_norm": 0.09617147594690323, + "learning_rate": 3.433482419082734e-05, + "loss": 0.31130674481391907, + "memory(GiB)": 78.33, + "step": 4084, + "token_acc": 0.9056026236676687, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.791551615559754, + "grad_norm": 0.09556283056735992, + "learning_rate": 3.4273639510726617e-05, + "loss": 0.3225637972354889, + "memory(GiB)": 78.33, + "step": 4085, + "token_acc": 0.9032013734599071, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.7917453858450807, + "grad_norm": 0.10910790413618088, + "learning_rate": 3.4212502362551864e-05, + "loss": 0.3229255974292755, + "memory(GiB)": 78.33, + "step": 4086, + "token_acc": 0.9025904897090135, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.7919391561304074, + "grad_norm": 0.10885477811098099, + "learning_rate": 3.415141277141372e-05, + "loss": 0.36971014738082886, + "memory(GiB)": 78.33, + "step": 4087, + "token_acc": 0.8883316991175234, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.7921329264157342, + "grad_norm": 0.10303046554327011, + "learning_rate": 3.409037076240334e-05, + "loss": 0.3518933653831482, + "memory(GiB)": 78.33, + "step": 4088, + "token_acc": 0.8961959684672769, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.7923266967010609, + "grad_norm": 0.09691599756479263, + "learning_rate": 3.4029376360592284e-05, + "loss": 0.3284243941307068, + "memory(GiB)": 78.33, + "step": 4089, + "token_acc": 0.9007073741985853, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.7925204669863877, + "grad_norm": 0.10007129609584808, + "learning_rate": 3.396842959103262e-05, + "loss": 0.30757999420166016, + "memory(GiB)": 78.33, + "step": 4090, + "token_acc": 0.9069416756121249, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.7927142372717144, + "grad_norm": 0.12611159682273865, + "learning_rate": 3.3907530478756793e-05, + "loss": 0.3721444308757782, + "memory(GiB)": 78.33, + "step": 4091, + "token_acc": 0.8881480744352672, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.7929080075570412, + "grad_norm": 0.10851044952869415, + "learning_rate": 3.38466790487777e-05, + "loss": 0.3504268229007721, + "memory(GiB)": 78.33, + "step": 4092, + "token_acc": 0.8952597994530538, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.7931017778423679, + "grad_norm": 0.09891311824321747, + "learning_rate": 3.378587532608872e-05, + "loss": 0.3422066867351532, + "memory(GiB)": 78.33, + "step": 4093, + "token_acc": 0.8977621295777654, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.7932955481276946, + "grad_norm": 0.10003877431154251, + "learning_rate": 3.372511933566355e-05, + "loss": 0.3378649652004242, + "memory(GiB)": 78.33, + "step": 4094, + "token_acc": 0.8983621269912497, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.7934893184130214, + "grad_norm": 0.10703529417514801, + "learning_rate": 3.366441110245627e-05, + "loss": 0.34462597966194153, + "memory(GiB)": 78.33, + "step": 4095, + "token_acc": 0.896659132256464, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.7936830886983481, + "grad_norm": 0.10170795023441315, + "learning_rate": 3.360375065140142e-05, + "loss": 0.33412742614746094, + "memory(GiB)": 78.33, + "step": 4096, + "token_acc": 0.9000777389083237, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.7938768589836749, + "grad_norm": 0.09521227329969406, + "learning_rate": 3.354313800741387e-05, + "loss": 0.32723385095596313, + "memory(GiB)": 78.33, + "step": 4097, + "token_acc": 0.9021288088982738, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.7940706292690016, + "grad_norm": 0.10300496965646744, + "learning_rate": 3.3482573195388854e-05, + "loss": 0.35852038860321045, + "memory(GiB)": 78.33, + "step": 4098, + "token_acc": 0.8925953869368398, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.7942643995543284, + "grad_norm": 0.09719771891832352, + "learning_rate": 3.342205624020194e-05, + "loss": 0.3195003569126129, + "memory(GiB)": 78.33, + "step": 4099, + "token_acc": 0.9037184087959075, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.7944581698396551, + "grad_norm": 0.09721704572439194, + "learning_rate": 3.336158716670913e-05, + "loss": 0.3213385343551636, + "memory(GiB)": 78.33, + "step": 4100, + "token_acc": 0.9026950799122532, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.7946519401249819, + "grad_norm": 0.1064034178853035, + "learning_rate": 3.330116599974666e-05, + "loss": 0.34772008657455444, + "memory(GiB)": 78.33, + "step": 4101, + "token_acc": 0.8958247453035746, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.7948457104103086, + "grad_norm": 0.09646876156330109, + "learning_rate": 3.324079276413114e-05, + "loss": 0.3242574632167816, + "memory(GiB)": 78.33, + "step": 4102, + "token_acc": 0.9017802644964394, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.7950394806956353, + "grad_norm": 0.09698193520307541, + "learning_rate": 3.318046748465949e-05, + "loss": 0.3176778554916382, + "memory(GiB)": 78.33, + "step": 4103, + "token_acc": 0.9066922523386903, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.7952332509809621, + "grad_norm": 0.09748407453298569, + "learning_rate": 3.312019018610884e-05, + "loss": 0.31994813680648804, + "memory(GiB)": 78.33, + "step": 4104, + "token_acc": 0.9049721189591078, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.7954270212662888, + "grad_norm": 0.09335286915302277, + "learning_rate": 3.305996089323681e-05, + "loss": 0.3147365152835846, + "memory(GiB)": 78.33, + "step": 4105, + "token_acc": 0.9052823315118397, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.7956207915516156, + "grad_norm": 0.09451750665903091, + "learning_rate": 3.299977963078115e-05, + "loss": 0.31313663721084595, + "memory(GiB)": 78.33, + "step": 4106, + "token_acc": 0.9066659892287369, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.7958145618369423, + "grad_norm": 0.10167912393808365, + "learning_rate": 3.29396464234599e-05, + "loss": 0.3258751928806305, + "memory(GiB)": 78.33, + "step": 4107, + "token_acc": 0.9005933489719884, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.7960083321222691, + "grad_norm": 0.11654622852802277, + "learning_rate": 3.287956129597142e-05, + "loss": 0.39499345421791077, + "memory(GiB)": 78.33, + "step": 4108, + "token_acc": 0.883764646907701, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.7962021024075958, + "grad_norm": 0.09777145832777023, + "learning_rate": 3.281952427299424e-05, + "loss": 0.3374737501144409, + "memory(GiB)": 78.33, + "step": 4109, + "token_acc": 0.8980137309365337, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.7963958726929226, + "grad_norm": 0.09370385110378265, + "learning_rate": 3.2759535379187214e-05, + "loss": 0.3268592059612274, + "memory(GiB)": 78.33, + "step": 4110, + "token_acc": 0.9006880405179416, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.7965896429782493, + "grad_norm": 0.09882562607526779, + "learning_rate": 3.269959463918934e-05, + "loss": 0.32429930567741394, + "memory(GiB)": 78.33, + "step": 4111, + "token_acc": 0.9020517759682456, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.796783413263576, + "grad_norm": 0.09826599806547165, + "learning_rate": 3.263970207761997e-05, + "loss": 0.32509344816207886, + "memory(GiB)": 78.33, + "step": 4112, + "token_acc": 0.9012852928782132, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.7969771835489028, + "grad_norm": 0.0987883061170578, + "learning_rate": 3.257985771907856e-05, + "loss": 0.3253635764122009, + "memory(GiB)": 78.33, + "step": 4113, + "token_acc": 0.9022095821483264, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.7971709538342295, + "grad_norm": 0.10667404532432556, + "learning_rate": 3.252006158814478e-05, + "loss": 0.333732932806015, + "memory(GiB)": 78.33, + "step": 4114, + "token_acc": 0.8992663220134353, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.7973647241195563, + "grad_norm": 0.10525521636009216, + "learning_rate": 3.246031370937851e-05, + "loss": 0.3293308913707733, + "memory(GiB)": 78.33, + "step": 4115, + "token_acc": 0.9034758242413402, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.797558494404883, + "grad_norm": 0.09787974506616592, + "learning_rate": 3.240061410731981e-05, + "loss": 0.31691908836364746, + "memory(GiB)": 78.33, + "step": 4116, + "token_acc": 0.9044979630296816, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.7977522646902098, + "grad_norm": 0.10533692687749863, + "learning_rate": 3.234096280648892e-05, + "loss": 0.35985928773880005, + "memory(GiB)": 78.33, + "step": 4117, + "token_acc": 0.8940023752969121, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.7979460349755365, + "grad_norm": 0.09776397794485092, + "learning_rate": 3.228135983138618e-05, + "loss": 0.320921391248703, + "memory(GiB)": 78.33, + "step": 4118, + "token_acc": 0.9044611133043822, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.7981398052608633, + "grad_norm": 0.10431236773729324, + "learning_rate": 3.222180520649224e-05, + "loss": 0.3434464633464813, + "memory(GiB)": 78.33, + "step": 4119, + "token_acc": 0.8964882943143813, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.79833357554619, + "grad_norm": 0.09289419651031494, + "learning_rate": 3.216229895626769e-05, + "loss": 0.2984315752983093, + "memory(GiB)": 78.33, + "step": 4120, + "token_acc": 0.9101831539207184, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.7985273458315167, + "grad_norm": 0.09537661075592041, + "learning_rate": 3.2102841105153414e-05, + "loss": 0.3033888041973114, + "memory(GiB)": 78.33, + "step": 4121, + "token_acc": 0.9094319549819424, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.7987211161168435, + "grad_norm": 0.09890120476484299, + "learning_rate": 3.2043431677570295e-05, + "loss": 0.33709731698036194, + "memory(GiB)": 78.33, + "step": 4122, + "token_acc": 0.899075500770416, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.7989148864021702, + "grad_norm": 0.10694926232099533, + "learning_rate": 3.19840706979194e-05, + "loss": 0.3586742877960205, + "memory(GiB)": 78.33, + "step": 4123, + "token_acc": 0.8929906273359783, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.799108656687497, + "grad_norm": 0.09874139726161957, + "learning_rate": 3.1924758190581886e-05, + "loss": 0.3332917392253876, + "memory(GiB)": 78.33, + "step": 4124, + "token_acc": 0.9008399249775748, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.7993024269728237, + "grad_norm": 0.10014494508504868, + "learning_rate": 3.186549417991895e-05, + "loss": 0.3542085289955139, + "memory(GiB)": 78.33, + "step": 4125, + "token_acc": 0.892789227527368, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.7994961972581505, + "grad_norm": 0.10449165850877762, + "learning_rate": 3.1806278690272005e-05, + "loss": 0.3253108263015747, + "memory(GiB)": 78.33, + "step": 4126, + "token_acc": 0.9014500959391023, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.7996899675434772, + "grad_norm": 0.1035015657544136, + "learning_rate": 3.174711174596238e-05, + "loss": 0.3380778133869171, + "memory(GiB)": 78.33, + "step": 4127, + "token_acc": 0.9001489459211732, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.799883737828804, + "grad_norm": 0.10077136009931564, + "learning_rate": 3.1687993371291525e-05, + "loss": 0.32532066106796265, + "memory(GiB)": 78.33, + "step": 4128, + "token_acc": 0.9027537506026488, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.8000775081141307, + "grad_norm": 0.10059913992881775, + "learning_rate": 3.162892359054098e-05, + "loss": 0.3570432662963867, + "memory(GiB)": 78.33, + "step": 4129, + "token_acc": 0.8933640100302458, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.8002712783994574, + "grad_norm": 0.1088043823838234, + "learning_rate": 3.156990242797226e-05, + "loss": 0.3387261927127838, + "memory(GiB)": 78.33, + "step": 4130, + "token_acc": 0.8997582550409318, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.8004650486847842, + "grad_norm": 0.10076975077390671, + "learning_rate": 3.151092990782695e-05, + "loss": 0.32494884729385376, + "memory(GiB)": 78.33, + "step": 4131, + "token_acc": 0.9004781745476916, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.8006588189701109, + "grad_norm": 0.1097509041428566, + "learning_rate": 3.145200605432662e-05, + "loss": 0.3796813189983368, + "memory(GiB)": 78.33, + "step": 4132, + "token_acc": 0.8863678979936246, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.8008525892554377, + "grad_norm": 0.10167836397886276, + "learning_rate": 3.1393130891672944e-05, + "loss": 0.3483812212944031, + "memory(GiB)": 78.33, + "step": 4133, + "token_acc": 0.896118628960376, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.8010463595407644, + "grad_norm": 0.10103829205036163, + "learning_rate": 3.1334304444047495e-05, + "loss": 0.3420543372631073, + "memory(GiB)": 78.33, + "step": 4134, + "token_acc": 0.8974770039421813, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.8012401298260912, + "grad_norm": 0.10454893857240677, + "learning_rate": 3.1275526735611896e-05, + "loss": 0.31737667322158813, + "memory(GiB)": 78.33, + "step": 4135, + "token_acc": 0.9039817974971559, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.8014339001114179, + "grad_norm": 0.14402469992637634, + "learning_rate": 3.12167977905077e-05, + "loss": 0.36191561818122864, + "memory(GiB)": 78.33, + "step": 4136, + "token_acc": 0.8928331193053263, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.8016276703967447, + "grad_norm": 0.09334685653448105, + "learning_rate": 3.1158117632856454e-05, + "loss": 0.3147181570529938, + "memory(GiB)": 78.33, + "step": 4137, + "token_acc": 0.905297142173861, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.8018214406820714, + "grad_norm": 0.09789223968982697, + "learning_rate": 3.109948628675974e-05, + "loss": 0.3045772612094879, + "memory(GiB)": 78.33, + "step": 4138, + "token_acc": 0.9074807619528195, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.8020152109673981, + "grad_norm": 0.11379068344831467, + "learning_rate": 3.104090377629899e-05, + "loss": 0.3640574514865875, + "memory(GiB)": 78.33, + "step": 4139, + "token_acc": 0.8914368285480933, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.8022089812527249, + "grad_norm": 0.11807750910520554, + "learning_rate": 3.098237012553562e-05, + "loss": 0.35527053475379944, + "memory(GiB)": 78.33, + "step": 4140, + "token_acc": 0.8920514040932889, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.8024027515380516, + "grad_norm": 0.09791693091392517, + "learning_rate": 3.0923885358510946e-05, + "loss": 0.3340778946876526, + "memory(GiB)": 78.33, + "step": 4141, + "token_acc": 0.9000325538494927, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.8025965218233784, + "grad_norm": 0.10278962552547455, + "learning_rate": 3.086544949924627e-05, + "loss": 0.34071099758148193, + "memory(GiB)": 78.33, + "step": 4142, + "token_acc": 0.9005618602581219, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.8027902921087051, + "grad_norm": 0.100987508893013, + "learning_rate": 3.0807062571742755e-05, + "loss": 0.32279205322265625, + "memory(GiB)": 78.33, + "step": 4143, + "token_acc": 0.9030330758672659, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.8029840623940319, + "grad_norm": 0.10697422176599503, + "learning_rate": 3.074872459998143e-05, + "loss": 0.3390357792377472, + "memory(GiB)": 78.33, + "step": 4144, + "token_acc": 0.9002963590177815, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.8031778326793586, + "grad_norm": 0.09046490490436554, + "learning_rate": 3.069043560792336e-05, + "loss": 0.3146419823169708, + "memory(GiB)": 78.33, + "step": 4145, + "token_acc": 0.9062027231467473, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.8033716029646853, + "grad_norm": 0.09514185786247253, + "learning_rate": 3.063219561950936e-05, + "loss": 0.3226296007633209, + "memory(GiB)": 78.33, + "step": 4146, + "token_acc": 0.9025114098311594, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.8035653732500121, + "grad_norm": 0.10075315088033676, + "learning_rate": 3.057400465866016e-05, + "loss": 0.31182047724723816, + "memory(GiB)": 78.33, + "step": 4147, + "token_acc": 0.9075695604585517, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.8037591435353388, + "grad_norm": 0.10024786740541458, + "learning_rate": 3.0515862749276353e-05, + "loss": 0.3048425018787384, + "memory(GiB)": 78.33, + "step": 4148, + "token_acc": 0.9080320590439946, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.8039529138206656, + "grad_norm": 0.09824330359697342, + "learning_rate": 3.0457769915238368e-05, + "loss": 0.3147960603237152, + "memory(GiB)": 78.33, + "step": 4149, + "token_acc": 0.9054583112323508, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.8041466841059923, + "grad_norm": 0.10100057721138, + "learning_rate": 3.03997261804065e-05, + "loss": 0.35047298669815063, + "memory(GiB)": 78.33, + "step": 4150, + "token_acc": 0.8985602958658038, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.8043404543913191, + "grad_norm": 0.10795464366674423, + "learning_rate": 3.034173156862084e-05, + "loss": 0.3506011664867401, + "memory(GiB)": 78.33, + "step": 4151, + "token_acc": 0.8946405657028967, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.8045342246766458, + "grad_norm": 0.09722840040922165, + "learning_rate": 3.028378610370141e-05, + "loss": 0.30743473768234253, + "memory(GiB)": 78.33, + "step": 4152, + "token_acc": 0.9074273940345369, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.8047279949619726, + "grad_norm": 0.10855165868997574, + "learning_rate": 3.022588980944792e-05, + "loss": 0.37705856561660767, + "memory(GiB)": 78.33, + "step": 4153, + "token_acc": 0.889402452187872, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.8049217652472993, + "grad_norm": 0.09210513532161713, + "learning_rate": 3.0168042709639932e-05, + "loss": 0.30485376715660095, + "memory(GiB)": 78.33, + "step": 4154, + "token_acc": 0.9075583735909822, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.805115535532626, + "grad_norm": 0.10625799745321274, + "learning_rate": 3.011024482803684e-05, + "loss": 0.3092432916164398, + "memory(GiB)": 78.33, + "step": 4155, + "token_acc": 0.9070884146341464, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.8053093058179528, + "grad_norm": 0.10007067769765854, + "learning_rate": 3.0052496188377735e-05, + "loss": 0.3402232229709625, + "memory(GiB)": 78.33, + "step": 4156, + "token_acc": 0.8997760214195608, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.8055030761032795, + "grad_norm": 0.10713425278663635, + "learning_rate": 2.999479681438156e-05, + "loss": 0.3407253921031952, + "memory(GiB)": 78.33, + "step": 4157, + "token_acc": 0.8987352889513438, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.8056968463886063, + "grad_norm": 0.09479415416717529, + "learning_rate": 2.993714672974698e-05, + "loss": 0.3190673887729645, + "memory(GiB)": 78.33, + "step": 4158, + "token_acc": 0.9022564050323435, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.805890616673933, + "grad_norm": 0.10293829441070557, + "learning_rate": 2.987954595815247e-05, + "loss": 0.3430511951446533, + "memory(GiB)": 78.33, + "step": 4159, + "token_acc": 0.8964779192630723, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.8060843869592598, + "grad_norm": 0.09652558714151382, + "learning_rate": 2.98219945232562e-05, + "loss": 0.3083397150039673, + "memory(GiB)": 78.33, + "step": 4160, + "token_acc": 0.9067033176387339, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.8062781572445865, + "grad_norm": 0.11536381393671036, + "learning_rate": 2.9764492448696098e-05, + "loss": 0.3753798305988312, + "memory(GiB)": 78.33, + "step": 4161, + "token_acc": 0.8890807651434643, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.8064719275299133, + "grad_norm": 0.09727875888347626, + "learning_rate": 2.970703975808979e-05, + "loss": 0.32429182529449463, + "memory(GiB)": 78.33, + "step": 4162, + "token_acc": 0.9036051879533964, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.80666569781524, + "grad_norm": 0.10008195042610168, + "learning_rate": 2.964963647503465e-05, + "loss": 0.326107919216156, + "memory(GiB)": 78.33, + "step": 4163, + "token_acc": 0.9037652447142781, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.8068594681005667, + "grad_norm": 0.0974980965256691, + "learning_rate": 2.9592282623107765e-05, + "loss": 0.31634485721588135, + "memory(GiB)": 78.33, + "step": 4164, + "token_acc": 0.9050801412659604, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.8070532383858935, + "grad_norm": 0.10140351206064224, + "learning_rate": 2.953497822586583e-05, + "loss": 0.32958346605300903, + "memory(GiB)": 78.33, + "step": 4165, + "token_acc": 0.900670556920557, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.8072470086712202, + "grad_norm": 0.10700841248035431, + "learning_rate": 2.9477723306845414e-05, + "loss": 0.34634923934936523, + "memory(GiB)": 78.33, + "step": 4166, + "token_acc": 0.8981357595218982, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.807440778956547, + "grad_norm": 0.09883598238229752, + "learning_rate": 2.9420517889562574e-05, + "loss": 0.3215301036834717, + "memory(GiB)": 78.33, + "step": 4167, + "token_acc": 0.9049081311541975, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.8076345492418737, + "grad_norm": 0.10146182030439377, + "learning_rate": 2.9363361997513145e-05, + "loss": 0.3307911157608032, + "memory(GiB)": 78.33, + "step": 4168, + "token_acc": 0.9003170028818444, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.8078283195272005, + "grad_norm": 0.10559407621622086, + "learning_rate": 2.9306255654172572e-05, + "loss": 0.33857250213623047, + "memory(GiB)": 78.33, + "step": 4169, + "token_acc": 0.8989095106466997, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.8080220898125272, + "grad_norm": 0.10372511297464371, + "learning_rate": 2.9249198882995973e-05, + "loss": 0.31755688786506653, + "memory(GiB)": 78.33, + "step": 4170, + "token_acc": 0.9028887891034276, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.808215860097854, + "grad_norm": 0.10989030450582504, + "learning_rate": 2.91921917074181e-05, + "loss": 0.3677654266357422, + "memory(GiB)": 78.33, + "step": 4171, + "token_acc": 0.8904222209172588, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.8084096303831807, + "grad_norm": 0.10606292635202408, + "learning_rate": 2.9135234150853276e-05, + "loss": 0.35702183842658997, + "memory(GiB)": 78.33, + "step": 4172, + "token_acc": 0.8924521259797307, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.8086034006685074, + "grad_norm": 0.09554623067378998, + "learning_rate": 2.907832623669559e-05, + "loss": 0.32034456729888916, + "memory(GiB)": 78.33, + "step": 4173, + "token_acc": 0.9017811962351988, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.8087971709538342, + "grad_norm": 0.09658126533031464, + "learning_rate": 2.90214679883186e-05, + "loss": 0.3179253339767456, + "memory(GiB)": 78.33, + "step": 4174, + "token_acc": 0.9031473726867486, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.8089909412391609, + "grad_norm": 0.09535779058933258, + "learning_rate": 2.8964659429075543e-05, + "loss": 0.3256949186325073, + "memory(GiB)": 78.33, + "step": 4175, + "token_acc": 0.9016154247003648, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.8091847115244877, + "grad_norm": 0.09355632960796356, + "learning_rate": 2.890790058229919e-05, + "loss": 0.3242107629776001, + "memory(GiB)": 78.33, + "step": 4176, + "token_acc": 0.9035344140313579, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.8093784818098144, + "grad_norm": 0.10124616324901581, + "learning_rate": 2.8851191471301903e-05, + "loss": 0.31790977716445923, + "memory(GiB)": 78.33, + "step": 4177, + "token_acc": 0.9053812224322622, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.8095722520951412, + "grad_norm": 0.10209498554468155, + "learning_rate": 2.8794532119375712e-05, + "loss": 0.3283519148826599, + "memory(GiB)": 78.33, + "step": 4178, + "token_acc": 0.9008973858759266, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.8097660223804679, + "grad_norm": 0.11376667767763138, + "learning_rate": 2.8737922549792103e-05, + "loss": 0.3559627830982208, + "memory(GiB)": 78.33, + "step": 4179, + "token_acc": 0.8943670846197467, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.8099597926657947, + "grad_norm": 0.0886598601937294, + "learning_rate": 2.868136278580214e-05, + "loss": 0.3123997449874878, + "memory(GiB)": 78.33, + "step": 4180, + "token_acc": 0.9062552047395845, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.8101535629511214, + "grad_norm": 0.09667520970106125, + "learning_rate": 2.8624852850636432e-05, + "loss": 0.3109186589717865, + "memory(GiB)": 78.33, + "step": 4181, + "token_acc": 0.9074359598582596, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.8103473332364481, + "grad_norm": 0.09185932576656342, + "learning_rate": 2.856839276750514e-05, + "loss": 0.3074433505535126, + "memory(GiB)": 78.33, + "step": 4182, + "token_acc": 0.9070412315028378, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.8105411035217749, + "grad_norm": 0.09254490584135056, + "learning_rate": 2.851198255959793e-05, + "loss": 0.3106043040752411, + "memory(GiB)": 78.33, + "step": 4183, + "token_acc": 0.9067268041237113, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.8107348738071016, + "grad_norm": 0.10701245069503784, + "learning_rate": 2.8455622250083953e-05, + "loss": 0.32825660705566406, + "memory(GiB)": 78.33, + "step": 4184, + "token_acc": 0.9028698224852071, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.8109286440924284, + "grad_norm": 0.09500328451395035, + "learning_rate": 2.8399311862111978e-05, + "loss": 0.28734296560287476, + "memory(GiB)": 78.33, + "step": 4185, + "token_acc": 0.9113909591935806, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.8111224143777551, + "grad_norm": 0.11596144735813141, + "learning_rate": 2.834305141881017e-05, + "loss": 0.3888537287712097, + "memory(GiB)": 78.33, + "step": 4186, + "token_acc": 0.8838215903227655, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.8113161846630819, + "grad_norm": 0.10639077425003052, + "learning_rate": 2.8286840943286178e-05, + "loss": 0.34377238154411316, + "memory(GiB)": 78.33, + "step": 4187, + "token_acc": 0.9001137980085349, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.8115099549484086, + "grad_norm": 0.10659915953874588, + "learning_rate": 2.823068045862718e-05, + "loss": 0.36640581488609314, + "memory(GiB)": 78.33, + "step": 4188, + "token_acc": 0.8934043246767358, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.8117037252337354, + "grad_norm": 0.10051631182432175, + "learning_rate": 2.817456998789978e-05, + "loss": 0.3765765428543091, + "memory(GiB)": 78.33, + "step": 4189, + "token_acc": 0.8884732824427481, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.8118974955190622, + "grad_norm": 0.10304338485002518, + "learning_rate": 2.8118509554150076e-05, + "loss": 0.34053856134414673, + "memory(GiB)": 78.33, + "step": 4190, + "token_acc": 0.8995534483709136, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.812091265804389, + "grad_norm": 0.09566718339920044, + "learning_rate": 2.8062499180403532e-05, + "loss": 0.3288200795650482, + "memory(GiB)": 78.33, + "step": 4191, + "token_acc": 0.900259556989139, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.8122850360897157, + "grad_norm": 0.10790709406137466, + "learning_rate": 2.800653888966519e-05, + "loss": 0.32764533162117004, + "memory(GiB)": 78.33, + "step": 4192, + "token_acc": 0.9018782286721013, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.8124788063750424, + "grad_norm": 0.09332863241434097, + "learning_rate": 2.7950628704919426e-05, + "loss": 0.3051561415195465, + "memory(GiB)": 78.33, + "step": 4193, + "token_acc": 0.9071980963712076, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.8126725766603692, + "grad_norm": 0.10486903041601181, + "learning_rate": 2.7894768649130044e-05, + "loss": 0.358365923166275, + "memory(GiB)": 78.33, + "step": 4194, + "token_acc": 0.8943505007578234, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.8128663469456959, + "grad_norm": 0.08717334270477295, + "learning_rate": 2.783895874524028e-05, + "loss": 0.30397284030914307, + "memory(GiB)": 78.33, + "step": 4195, + "token_acc": 0.9074197783971805, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.8130601172310227, + "grad_norm": 0.09161286801099777, + "learning_rate": 2.7783199016172765e-05, + "loss": 0.3110318183898926, + "memory(GiB)": 78.33, + "step": 4196, + "token_acc": 0.907268415519644, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.8132538875163494, + "grad_norm": 0.10017764568328857, + "learning_rate": 2.772748948482949e-05, + "loss": 0.33178234100341797, + "memory(GiB)": 78.33, + "step": 4197, + "token_acc": 0.8992754418706773, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.8134476578016762, + "grad_norm": 0.09984217584133148, + "learning_rate": 2.7671830174091824e-05, + "loss": 0.33245033025741577, + "memory(GiB)": 78.33, + "step": 4198, + "token_acc": 0.9006675354378872, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.8136414280870029, + "grad_norm": 0.09028760343790054, + "learning_rate": 2.7616221106820645e-05, + "loss": 0.29679808020591736, + "memory(GiB)": 78.33, + "step": 4199, + "token_acc": 0.908898198152728, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.8138351983723296, + "grad_norm": 0.10453902184963226, + "learning_rate": 2.7560662305856036e-05, + "loss": 0.3445345163345337, + "memory(GiB)": 78.33, + "step": 4200, + "token_acc": 0.8972627996028932, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.8140289686576564, + "grad_norm": 0.09469582140445709, + "learning_rate": 2.7505153794017487e-05, + "loss": 0.3133549690246582, + "memory(GiB)": 78.33, + "step": 4201, + "token_acc": 0.905493996966347, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.8142227389429831, + "grad_norm": 0.10252521187067032, + "learning_rate": 2.744969559410385e-05, + "loss": 0.32331135869026184, + "memory(GiB)": 78.33, + "step": 4202, + "token_acc": 0.9024316857682103, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.8144165092283099, + "grad_norm": 0.09786242991685867, + "learning_rate": 2.7394287728893265e-05, + "loss": 0.3233289122581482, + "memory(GiB)": 78.33, + "step": 4203, + "token_acc": 0.9021999564365062, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.8146102795136366, + "grad_norm": 0.09682147949934006, + "learning_rate": 2.733893022114327e-05, + "loss": 0.3290054202079773, + "memory(GiB)": 78.33, + "step": 4204, + "token_acc": 0.9020085944667086, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.8148040497989634, + "grad_norm": 0.09070180356502533, + "learning_rate": 2.728362309359062e-05, + "loss": 0.3250106871128082, + "memory(GiB)": 78.33, + "step": 4205, + "token_acc": 0.9020958593997225, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.8149978200842901, + "grad_norm": 0.10017143189907074, + "learning_rate": 2.7228366368951525e-05, + "loss": 0.36836662888526917, + "memory(GiB)": 78.33, + "step": 4206, + "token_acc": 0.8892802334649929, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.8151915903696169, + "grad_norm": 0.10824721306562424, + "learning_rate": 2.7173160069921357e-05, + "loss": 0.31380200386047363, + "memory(GiB)": 78.33, + "step": 4207, + "token_acc": 0.9043143002803976, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.8153853606549436, + "grad_norm": 0.09697490185499191, + "learning_rate": 2.7118004219174838e-05, + "loss": 0.3323417901992798, + "memory(GiB)": 78.33, + "step": 4208, + "token_acc": 0.8995884773662551, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.8155791309402703, + "grad_norm": 0.0982755497097969, + "learning_rate": 2.706289883936595e-05, + "loss": 0.3338177800178528, + "memory(GiB)": 78.33, + "step": 4209, + "token_acc": 0.8988449691991787, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.8157729012255971, + "grad_norm": 0.09161195158958435, + "learning_rate": 2.7007843953127917e-05, + "loss": 0.3077443540096283, + "memory(GiB)": 78.33, + "step": 4210, + "token_acc": 0.9083794426997657, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.8159666715109238, + "grad_norm": 0.10864771902561188, + "learning_rate": 2.6952839583073355e-05, + "loss": 0.36505550146102905, + "memory(GiB)": 78.33, + "step": 4211, + "token_acc": 0.8941932795618918, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.8161604417962506, + "grad_norm": 0.10426463186740875, + "learning_rate": 2.6897885751793956e-05, + "loss": 0.3455352485179901, + "memory(GiB)": 78.33, + "step": 4212, + "token_acc": 0.8963470566017374, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.8163542120815773, + "grad_norm": 0.1076427772641182, + "learning_rate": 2.6842982481860768e-05, + "loss": 0.3745838403701782, + "memory(GiB)": 78.33, + "step": 4213, + "token_acc": 0.8894979479395064, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.8165479823669041, + "grad_norm": 0.09672832489013672, + "learning_rate": 2.6788129795824054e-05, + "loss": 0.34184157848358154, + "memory(GiB)": 78.33, + "step": 4214, + "token_acc": 0.8963274582336017, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.8167417526522308, + "grad_norm": 0.10097566246986389, + "learning_rate": 2.6733327716213236e-05, + "loss": 0.30746835470199585, + "memory(GiB)": 78.33, + "step": 4215, + "token_acc": 0.9063103281853282, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.8169355229375576, + "grad_norm": 0.10790715366601944, + "learning_rate": 2.667857626553705e-05, + "loss": 0.34957319498062134, + "memory(GiB)": 78.33, + "step": 4216, + "token_acc": 0.8963655436258176, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.8171292932228843, + "grad_norm": 0.10515126585960388, + "learning_rate": 2.662387546628332e-05, + "loss": 0.3403055965900421, + "memory(GiB)": 78.33, + "step": 4217, + "token_acc": 0.8979972597148937, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.817323063508211, + "grad_norm": 0.10571117699146271, + "learning_rate": 2.6569225340919202e-05, + "loss": 0.3421645164489746, + "memory(GiB)": 78.33, + "step": 4218, + "token_acc": 0.8976865691740056, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.8175168337935378, + "grad_norm": 0.10033620893955231, + "learning_rate": 2.651462591189097e-05, + "loss": 0.32853877544403076, + "memory(GiB)": 78.33, + "step": 4219, + "token_acc": 0.9037694765715578, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.8177106040788645, + "grad_norm": 0.10828068852424622, + "learning_rate": 2.6460077201624058e-05, + "loss": 0.3596659302711487, + "memory(GiB)": 78.33, + "step": 4220, + "token_acc": 0.8916654015485046, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.8179043743641913, + "grad_norm": 0.09712978452444077, + "learning_rate": 2.6405579232523066e-05, + "loss": 0.31051602959632874, + "memory(GiB)": 78.33, + "step": 4221, + "token_acc": 0.9060798808091227, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.818098144649518, + "grad_norm": 0.10436290502548218, + "learning_rate": 2.6351132026971823e-05, + "loss": 0.3625693619251251, + "memory(GiB)": 78.33, + "step": 4222, + "token_acc": 0.8920728157908663, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.8182919149348448, + "grad_norm": 0.10246887058019638, + "learning_rate": 2.6296735607333202e-05, + "loss": 0.34993377327919006, + "memory(GiB)": 78.33, + "step": 4223, + "token_acc": 0.8966694249069094, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.8184856852201715, + "grad_norm": 0.09719887375831604, + "learning_rate": 2.6242389995949286e-05, + "loss": 0.31395548582077026, + "memory(GiB)": 78.33, + "step": 4224, + "token_acc": 0.9058810845776508, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.8186794555054983, + "grad_norm": 0.09809587150812149, + "learning_rate": 2.618809521514132e-05, + "loss": 0.3162482976913452, + "memory(GiB)": 78.33, + "step": 4225, + "token_acc": 0.9043280182232346, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.818873225790825, + "grad_norm": 0.09690183401107788, + "learning_rate": 2.613385128720961e-05, + "loss": 0.308881551027298, + "memory(GiB)": 78.33, + "step": 4226, + "token_acc": 0.9085266774992675, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.8190669960761517, + "grad_norm": 0.10261222720146179, + "learning_rate": 2.6079658234433575e-05, + "loss": 0.3542589843273163, + "memory(GiB)": 78.33, + "step": 4227, + "token_acc": 0.8961422008770098, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.8192607663614785, + "grad_norm": 0.09491068869829178, + "learning_rate": 2.602551607907179e-05, + "loss": 0.31204402446746826, + "memory(GiB)": 78.33, + "step": 4228, + "token_acc": 0.9075091359188966, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.8194545366468052, + "grad_norm": 0.10251081734895706, + "learning_rate": 2.5971424843361865e-05, + "loss": 0.3522550165653229, + "memory(GiB)": 78.33, + "step": 4229, + "token_acc": 0.8963797611666712, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.819648306932132, + "grad_norm": 0.11336258798837662, + "learning_rate": 2.591738454952055e-05, + "loss": 0.33237776160240173, + "memory(GiB)": 78.33, + "step": 4230, + "token_acc": 0.8990595976358925, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.8198420772174587, + "grad_norm": 0.0893642008304596, + "learning_rate": 2.5863395219743565e-05, + "loss": 0.3061623275279999, + "memory(GiB)": 78.33, + "step": 4231, + "token_acc": 0.9074907292954264, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.8200358475027855, + "grad_norm": 0.10706465691328049, + "learning_rate": 2.5809456876205897e-05, + "loss": 0.35384026169776917, + "memory(GiB)": 78.33, + "step": 4232, + "token_acc": 0.8951451380857006, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.8202296177881122, + "grad_norm": 0.11483049392700195, + "learning_rate": 2.575556954106142e-05, + "loss": 0.345024049282074, + "memory(GiB)": 78.33, + "step": 4233, + "token_acc": 0.8970334598137288, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.820423388073439, + "grad_norm": 0.08666758239269257, + "learning_rate": 2.57017332364431e-05, + "loss": 0.2846605181694031, + "memory(GiB)": 78.33, + "step": 4234, + "token_acc": 0.9113689315771956, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.8206171583587657, + "grad_norm": 0.10516150295734406, + "learning_rate": 2.564794798446298e-05, + "loss": 0.353068470954895, + "memory(GiB)": 78.33, + "step": 4235, + "token_acc": 0.8956983240223464, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.8208109286440924, + "grad_norm": 0.10344908386468887, + "learning_rate": 2.559421380721207e-05, + "loss": 0.3344540596008301, + "memory(GiB)": 78.33, + "step": 4236, + "token_acc": 0.8966224100413132, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.8210046989294192, + "grad_norm": 0.09849216789007187, + "learning_rate": 2.554053072676049e-05, + "loss": 0.3291811943054199, + "memory(GiB)": 78.33, + "step": 4237, + "token_acc": 0.9034900284900285, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.8211984692147459, + "grad_norm": 0.10524845868349075, + "learning_rate": 2.5486898765157227e-05, + "loss": 0.36191216111183167, + "memory(GiB)": 78.33, + "step": 4238, + "token_acc": 0.8912652571926766, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.8213922395000727, + "grad_norm": 0.10183076560497284, + "learning_rate": 2.5433317944430497e-05, + "loss": 0.31616219878196716, + "memory(GiB)": 78.33, + "step": 4239, + "token_acc": 0.9054447035789353, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.8215860097853994, + "grad_norm": 0.10111679881811142, + "learning_rate": 2.5379788286587317e-05, + "loss": 0.3214479386806488, + "memory(GiB)": 78.33, + "step": 4240, + "token_acc": 0.9007918449076181, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.8217797800707262, + "grad_norm": 0.10107513517141342, + "learning_rate": 2.532630981361376e-05, + "loss": 0.3232714831829071, + "memory(GiB)": 78.33, + "step": 4241, + "token_acc": 0.9016899178246744, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.8219735503560529, + "grad_norm": 0.10409308224916458, + "learning_rate": 2.5272882547474877e-05, + "loss": 0.3329227566719055, + "memory(GiB)": 78.33, + "step": 4242, + "token_acc": 0.8991776806795668, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.8221673206413797, + "grad_norm": 0.09440822154283524, + "learning_rate": 2.5219506510114647e-05, + "loss": 0.33597031235694885, + "memory(GiB)": 78.33, + "step": 4243, + "token_acc": 0.9017857142857143, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.8223610909267064, + "grad_norm": 0.09774498641490936, + "learning_rate": 2.5166181723456147e-05, + "loss": 0.3483470678329468, + "memory(GiB)": 78.33, + "step": 4244, + "token_acc": 0.8975424647632815, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.8225548612120331, + "grad_norm": 0.09778609871864319, + "learning_rate": 2.5112908209401144e-05, + "loss": 0.29115307331085205, + "memory(GiB)": 78.33, + "step": 4245, + "token_acc": 0.9137274419007227, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.8227486314973599, + "grad_norm": 0.09965585917234421, + "learning_rate": 2.5059685989830636e-05, + "loss": 0.32971906661987305, + "memory(GiB)": 78.33, + "step": 4246, + "token_acc": 0.9021879021879022, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.8229424017826866, + "grad_norm": 0.10306161642074585, + "learning_rate": 2.5006515086604368e-05, + "loss": 0.3302563726902008, + "memory(GiB)": 78.33, + "step": 4247, + "token_acc": 0.9004143366267783, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.8231361720680134, + "grad_norm": 0.08937028795480728, + "learning_rate": 2.4953395521561053e-05, + "loss": 0.30359315872192383, + "memory(GiB)": 78.33, + "step": 4248, + "token_acc": 0.9071417392665241, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.8233299423533401, + "grad_norm": 0.10544510930776596, + "learning_rate": 2.4900327316518326e-05, + "loss": 0.3514764606952667, + "memory(GiB)": 78.33, + "step": 4249, + "token_acc": 0.8938008836033232, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.8235237126386669, + "grad_norm": 0.10431473702192307, + "learning_rate": 2.48473104932727e-05, + "loss": 0.3315693438053131, + "memory(GiB)": 78.33, + "step": 4250, + "token_acc": 0.8996884388561255, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.8237174829239936, + "grad_norm": 0.10015621036291122, + "learning_rate": 2.479434507359967e-05, + "loss": 0.321992427110672, + "memory(GiB)": 78.33, + "step": 4251, + "token_acc": 0.9025872260867175, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.8239112532093203, + "grad_norm": 0.10041700303554535, + "learning_rate": 2.474143107925352e-05, + "loss": 0.3204496204853058, + "memory(GiB)": 78.33, + "step": 4252, + "token_acc": 0.9054709141274239, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.8241050234946471, + "grad_norm": 0.09637311100959778, + "learning_rate": 2.4688568531967467e-05, + "loss": 0.31289243698120117, + "memory(GiB)": 78.33, + "step": 4253, + "token_acc": 0.9049771363624349, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.8242987937799738, + "grad_norm": 0.10459338128566742, + "learning_rate": 2.463575745345356e-05, + "loss": 0.34315067529678345, + "memory(GiB)": 78.33, + "step": 4254, + "token_acc": 0.8960445130013222, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.8244925640653006, + "grad_norm": 0.09525644779205322, + "learning_rate": 2.4582997865402727e-05, + "loss": 0.32378891110420227, + "memory(GiB)": 78.33, + "step": 4255, + "token_acc": 0.9033018867924528, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.8246863343506273, + "grad_norm": 0.0998750701546669, + "learning_rate": 2.453028978948477e-05, + "loss": 0.31236085295677185, + "memory(GiB)": 78.33, + "step": 4256, + "token_acc": 0.9072579542034853, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.8248801046359541, + "grad_norm": 0.09134574234485626, + "learning_rate": 2.4477633247348238e-05, + "loss": 0.3001454472541809, + "memory(GiB)": 78.33, + "step": 4257, + "token_acc": 0.9097239492663517, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.8250738749212808, + "grad_norm": 0.09457145631313324, + "learning_rate": 2.4425028260620715e-05, + "loss": 0.32455000281333923, + "memory(GiB)": 78.33, + "step": 4258, + "token_acc": 0.902986820556812, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.8252676452066076, + "grad_norm": 0.1064552590250969, + "learning_rate": 2.4372474850908404e-05, + "loss": 0.3257259130477905, + "memory(GiB)": 78.33, + "step": 4259, + "token_acc": 0.9031152183633925, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.8254614154919343, + "grad_norm": 0.10509302467107773, + "learning_rate": 2.4319973039796397e-05, + "loss": 0.3298068344593048, + "memory(GiB)": 78.33, + "step": 4260, + "token_acc": 0.8993223921422355, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.825655185777261, + "grad_norm": 0.09444321691989899, + "learning_rate": 2.4267522848848635e-05, + "loss": 0.29824700951576233, + "memory(GiB)": 78.33, + "step": 4261, + "token_acc": 0.908746618575293, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.8258489560625878, + "grad_norm": 0.09932407736778259, + "learning_rate": 2.4215124299607802e-05, + "loss": 0.33362001180648804, + "memory(GiB)": 78.33, + "step": 4262, + "token_acc": 0.8990427838595377, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.8260427263479145, + "grad_norm": 0.09288123995065689, + "learning_rate": 2.416277741359538e-05, + "loss": 0.31670305132865906, + "memory(GiB)": 78.33, + "step": 4263, + "token_acc": 0.9042440573573285, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.8262364966332413, + "grad_norm": 0.11048420518636703, + "learning_rate": 2.411048221231162e-05, + "loss": 0.3662915825843811, + "memory(GiB)": 78.33, + "step": 4264, + "token_acc": 0.8945862079354081, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.826430266918568, + "grad_norm": 0.10002768039703369, + "learning_rate": 2.4058238717235628e-05, + "loss": 0.32539236545562744, + "memory(GiB)": 78.33, + "step": 4265, + "token_acc": 0.9030578297219892, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.8266240372038948, + "grad_norm": 0.10625031590461731, + "learning_rate": 2.4006046949825186e-05, + "loss": 0.31954246759414673, + "memory(GiB)": 78.33, + "step": 4266, + "token_acc": 0.9040422214225741, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.8268178074892215, + "grad_norm": 0.09548264741897583, + "learning_rate": 2.3953906931516848e-05, + "loss": 0.31130972504615784, + "memory(GiB)": 78.33, + "step": 4267, + "token_acc": 0.9068991470145509, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.8270115777745483, + "grad_norm": 0.1018620952963829, + "learning_rate": 2.390181868372593e-05, + "loss": 0.37592288851737976, + "memory(GiB)": 78.33, + "step": 4268, + "token_acc": 0.8873175527489979, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.827205348059875, + "grad_norm": 0.10313120484352112, + "learning_rate": 2.384978222784646e-05, + "loss": 0.3224887251853943, + "memory(GiB)": 78.33, + "step": 4269, + "token_acc": 0.9022740524781341, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.8273991183452017, + "grad_norm": 0.1032845601439476, + "learning_rate": 2.379779758525123e-05, + "loss": 0.33453071117401123, + "memory(GiB)": 78.33, + "step": 4270, + "token_acc": 0.8983879998821207, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.8275928886305285, + "grad_norm": 0.09773199260234833, + "learning_rate": 2.3745864777291674e-05, + "loss": 0.3257528841495514, + "memory(GiB)": 78.33, + "step": 4271, + "token_acc": 0.9016068290233492, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.8277866589158552, + "grad_norm": 0.09065309911966324, + "learning_rate": 2.369398382529807e-05, + "loss": 0.29769429564476013, + "memory(GiB)": 78.33, + "step": 4272, + "token_acc": 0.9092235329627487, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.827980429201182, + "grad_norm": 0.08960139006376266, + "learning_rate": 2.3642154750579272e-05, + "loss": 0.31141579151153564, + "memory(GiB)": 78.33, + "step": 4273, + "token_acc": 0.9071020707684546, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.8281741994865087, + "grad_norm": 0.10664994269609451, + "learning_rate": 2.3590377574422892e-05, + "loss": 0.37568601965904236, + "memory(GiB)": 78.33, + "step": 4274, + "token_acc": 0.8949000498366465, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.8283679697718355, + "grad_norm": 0.11605322360992432, + "learning_rate": 2.3538652318095198e-05, + "loss": 0.3583415746688843, + "memory(GiB)": 78.33, + "step": 4275, + "token_acc": 0.8928342520189719, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.8285617400571622, + "grad_norm": 0.10450875759124756, + "learning_rate": 2.348697900284111e-05, + "loss": 0.34997832775115967, + "memory(GiB)": 78.33, + "step": 4276, + "token_acc": 0.8950859618248274, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.828755510342489, + "grad_norm": 0.09127886593341827, + "learning_rate": 2.3435357649884357e-05, + "loss": 0.31003209948539734, + "memory(GiB)": 78.33, + "step": 4277, + "token_acc": 0.9049302739590545, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.8289492806278157, + "grad_norm": 0.10241317003965378, + "learning_rate": 2.3383788280427074e-05, + "loss": 0.347523957490921, + "memory(GiB)": 78.33, + "step": 4278, + "token_acc": 0.8969976905311778, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.8291430509131424, + "grad_norm": 0.09595993906259537, + "learning_rate": 2.3332270915650285e-05, + "loss": 0.3249264061450958, + "memory(GiB)": 78.33, + "step": 4279, + "token_acc": 0.9030240097254584, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.8293368211984692, + "grad_norm": 0.0956515520811081, + "learning_rate": 2.328080557671352e-05, + "loss": 0.3111531734466553, + "memory(GiB)": 78.33, + "step": 4280, + "token_acc": 0.9046177726038028, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.8295305914837959, + "grad_norm": 0.08913633227348328, + "learning_rate": 2.3229392284754994e-05, + "loss": 0.30270490050315857, + "memory(GiB)": 78.33, + "step": 4281, + "token_acc": 0.9090954225002482, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.8297243617691227, + "grad_norm": 0.10194297879934311, + "learning_rate": 2.3178031060891507e-05, + "loss": 0.35240015387535095, + "memory(GiB)": 78.33, + "step": 4282, + "token_acc": 0.8955933833943508, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.8299181320544494, + "grad_norm": 0.10122035443782806, + "learning_rate": 2.312672192621846e-05, + "loss": 0.33214977383613586, + "memory(GiB)": 78.33, + "step": 4283, + "token_acc": 0.9003510196977169, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.8301119023397762, + "grad_norm": 0.09273627400398254, + "learning_rate": 2.307546490180997e-05, + "loss": 0.3047739863395691, + "memory(GiB)": 78.33, + "step": 4284, + "token_acc": 0.909302266165913, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.8303056726251029, + "grad_norm": 0.1077071949839592, + "learning_rate": 2.3024260008718642e-05, + "loss": 0.34293609857559204, + "memory(GiB)": 78.33, + "step": 4285, + "token_acc": 0.896573135034446, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.8304994429104297, + "grad_norm": 0.0920027419924736, + "learning_rate": 2.2973107267975703e-05, + "loss": 0.3134732246398926, + "memory(GiB)": 78.33, + "step": 4286, + "token_acc": 0.9056259577262666, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.8306932131957564, + "grad_norm": 0.09819093346595764, + "learning_rate": 2.292200670059095e-05, + "loss": 0.32229092717170715, + "memory(GiB)": 78.33, + "step": 4287, + "token_acc": 0.9032614234754544, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.8308869834810831, + "grad_norm": 0.10240425914525986, + "learning_rate": 2.2870958327552774e-05, + "loss": 0.35109835863113403, + "memory(GiB)": 78.33, + "step": 4288, + "token_acc": 0.8959884028881269, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.8310807537664099, + "grad_norm": 0.10077088326215744, + "learning_rate": 2.2819962169828088e-05, + "loss": 0.32499995827674866, + "memory(GiB)": 78.33, + "step": 4289, + "token_acc": 0.9027956087719758, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.8312745240517366, + "grad_norm": 0.09702709317207336, + "learning_rate": 2.276901824836237e-05, + "loss": 0.3244347870349884, + "memory(GiB)": 78.33, + "step": 4290, + "token_acc": 0.9018396633577902, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.8314682943370634, + "grad_norm": 0.10198424011468887, + "learning_rate": 2.2718126584079734e-05, + "loss": 0.3321649730205536, + "memory(GiB)": 78.33, + "step": 4291, + "token_acc": 0.9009141765588757, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.8316620646223901, + "grad_norm": 0.1072782501578331, + "learning_rate": 2.266728719788269e-05, + "loss": 0.3425625264644623, + "memory(GiB)": 78.33, + "step": 4292, + "token_acc": 0.8971817900278725, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.8318558349077169, + "grad_norm": 0.10138484835624695, + "learning_rate": 2.2616500110652352e-05, + "loss": 0.32518693804740906, + "memory(GiB)": 78.33, + "step": 4293, + "token_acc": 0.9017971942055487, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.8320496051930436, + "grad_norm": 0.09211868792772293, + "learning_rate": 2.2565765343248353e-05, + "loss": 0.3022071123123169, + "memory(GiB)": 78.33, + "step": 4294, + "token_acc": 0.9080617941061924, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.8322433754783704, + "grad_norm": 0.09903164952993393, + "learning_rate": 2.2515082916508824e-05, + "loss": 0.3520633280277252, + "memory(GiB)": 78.33, + "step": 4295, + "token_acc": 0.8949784239892258, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.8324371457636971, + "grad_norm": 0.10028848052024841, + "learning_rate": 2.24644528512504e-05, + "loss": 0.34844347834587097, + "memory(GiB)": 78.33, + "step": 4296, + "token_acc": 0.8961258624855694, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.8326309160490238, + "grad_norm": 0.0993753969669342, + "learning_rate": 2.2413875168268154e-05, + "loss": 0.35692083835601807, + "memory(GiB)": 78.33, + "step": 4297, + "token_acc": 0.8944187141847869, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.8328246863343506, + "grad_norm": 0.10167410969734192, + "learning_rate": 2.2363349888335775e-05, + "loss": 0.349956214427948, + "memory(GiB)": 78.33, + "step": 4298, + "token_acc": 0.895724891202321, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.8330184566196773, + "grad_norm": 0.0991780236363411, + "learning_rate": 2.2312877032205346e-05, + "loss": 0.3107830584049225, + "memory(GiB)": 78.33, + "step": 4299, + "token_acc": 0.9071795583946675, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.8332122269050041, + "grad_norm": 0.09722703695297241, + "learning_rate": 2.22624566206074e-05, + "loss": 0.3044697344303131, + "memory(GiB)": 78.33, + "step": 4300, + "token_acc": 0.9069954220544535, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.8334059971903308, + "grad_norm": 0.09022455662488937, + "learning_rate": 2.2212088674250956e-05, + "loss": 0.2870331108570099, + "memory(GiB)": 78.33, + "step": 4301, + "token_acc": 0.91363100759646, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.8335997674756576, + "grad_norm": 0.1041305810213089, + "learning_rate": 2.216177321382348e-05, + "loss": 0.3358173966407776, + "memory(GiB)": 78.33, + "step": 4302, + "token_acc": 0.8962132245849112, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.8337935377609843, + "grad_norm": 0.10265124589204788, + "learning_rate": 2.2111510259990913e-05, + "loss": 0.3599543869495392, + "memory(GiB)": 78.33, + "step": 4303, + "token_acc": 0.8918628427620197, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.833987308046311, + "grad_norm": 0.0928591713309288, + "learning_rate": 2.2061299833397532e-05, + "loss": 0.29637521505355835, + "memory(GiB)": 78.33, + "step": 4304, + "token_acc": 0.910638866613205, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.8341810783316378, + "grad_norm": 0.0997898206114769, + "learning_rate": 2.2011141954666185e-05, + "loss": 0.33638525009155273, + "memory(GiB)": 78.33, + "step": 4305, + "token_acc": 0.8974762101779065, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.8343748486169645, + "grad_norm": 0.10566650331020355, + "learning_rate": 2.1961036644398035e-05, + "loss": 0.3401532769203186, + "memory(GiB)": 78.33, + "step": 4306, + "token_acc": 0.8974769961412882, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.8345686189022913, + "grad_norm": 0.11155321449041367, + "learning_rate": 2.1910983923172686e-05, + "loss": 0.29791954159736633, + "memory(GiB)": 78.33, + "step": 4307, + "token_acc": 0.910283068563032, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.834762389187618, + "grad_norm": 0.10973121225833893, + "learning_rate": 2.1860983811548118e-05, + "loss": 0.3326932191848755, + "memory(GiB)": 78.33, + "step": 4308, + "token_acc": 0.8998096353785328, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.8349561594729448, + "grad_norm": 0.08969270437955856, + "learning_rate": 2.1811036330060676e-05, + "loss": 0.30071404576301575, + "memory(GiB)": 78.33, + "step": 4309, + "token_acc": 0.9097683498797705, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.8351499297582715, + "grad_norm": 0.09654086083173752, + "learning_rate": 2.1761141499225278e-05, + "loss": 0.3438301384449005, + "memory(GiB)": 78.33, + "step": 4310, + "token_acc": 0.8972290781405053, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.8353437000435984, + "grad_norm": 0.09115693718194962, + "learning_rate": 2.171129933953489e-05, + "loss": 0.2993430197238922, + "memory(GiB)": 78.33, + "step": 4311, + "token_acc": 0.9102314363897779, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.8355374703289251, + "grad_norm": 0.1047009527683258, + "learning_rate": 2.1661509871461168e-05, + "loss": 0.33541637659072876, + "memory(GiB)": 78.33, + "step": 4312, + "token_acc": 0.8988593808067237, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.8357312406142519, + "grad_norm": 0.09559044986963272, + "learning_rate": 2.1611773115453913e-05, + "loss": 0.3060503900051117, + "memory(GiB)": 78.33, + "step": 4313, + "token_acc": 0.9065379777703622, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.8359250108995786, + "grad_norm": 0.10747367143630981, + "learning_rate": 2.1562089091941376e-05, + "loss": 0.3518577218055725, + "memory(GiB)": 78.33, + "step": 4314, + "token_acc": 0.8972739541160594, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.8361187811849053, + "grad_norm": 0.10569079220294952, + "learning_rate": 2.1512457821330102e-05, + "loss": 0.33122116327285767, + "memory(GiB)": 78.33, + "step": 4315, + "token_acc": 0.9022820362785254, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.8363125514702321, + "grad_norm": 0.10672451555728912, + "learning_rate": 2.1462879324004973e-05, + "loss": 0.356486439704895, + "memory(GiB)": 78.33, + "step": 4316, + "token_acc": 0.892833182626958, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.8365063217555588, + "grad_norm": 0.10485262423753738, + "learning_rate": 2.1413353620329294e-05, + "loss": 0.33778226375579834, + "memory(GiB)": 78.33, + "step": 4317, + "token_acc": 0.8977509922093194, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.8367000920408856, + "grad_norm": 0.1112034022808075, + "learning_rate": 2.136388073064446e-05, + "loss": 0.31891724467277527, + "memory(GiB)": 78.33, + "step": 4318, + "token_acc": 0.9021532012195121, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.8368938623262123, + "grad_norm": 0.09961036592721939, + "learning_rate": 2.131446067527044e-05, + "loss": 0.3315172493457794, + "memory(GiB)": 78.33, + "step": 4319, + "token_acc": 0.9015923147782603, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.8370876326115391, + "grad_norm": 0.10558105260133743, + "learning_rate": 2.126509347450534e-05, + "loss": 0.36373376846313477, + "memory(GiB)": 78.33, + "step": 4320, + "token_acc": 0.8906225980015373, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.8372814028968658, + "grad_norm": 0.09915536642074585, + "learning_rate": 2.1215779148625578e-05, + "loss": 0.3262394964694977, + "memory(GiB)": 78.33, + "step": 4321, + "token_acc": 0.8996407847471677, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.8374751731821926, + "grad_norm": 0.11209924519062042, + "learning_rate": 2.11665177178859e-05, + "loss": 0.3615911900997162, + "memory(GiB)": 78.33, + "step": 4322, + "token_acc": 0.8943596998400787, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.8376689434675193, + "grad_norm": 0.09516075998544693, + "learning_rate": 2.111730920251924e-05, + "loss": 0.3138526678085327, + "memory(GiB)": 78.33, + "step": 4323, + "token_acc": 0.9044929966462814, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.837862713752846, + "grad_norm": 0.10355053097009659, + "learning_rate": 2.1068153622736943e-05, + "loss": 0.3259844183921814, + "memory(GiB)": 78.33, + "step": 4324, + "token_acc": 0.902735473289597, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.8380564840381728, + "grad_norm": 0.11187773942947388, + "learning_rate": 2.101905099872848e-05, + "loss": 0.3713458776473999, + "memory(GiB)": 78.33, + "step": 4325, + "token_acc": 0.888154201235107, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.8382502543234995, + "grad_norm": 0.10848337411880493, + "learning_rate": 2.0970001350661635e-05, + "loss": 0.3444962799549103, + "memory(GiB)": 78.33, + "step": 4326, + "token_acc": 0.8968367889420521, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.8384440246088263, + "grad_norm": 0.08781024068593979, + "learning_rate": 2.0921004698682407e-05, + "loss": 0.328144371509552, + "memory(GiB)": 78.33, + "step": 4327, + "token_acc": 0.9009247631008106, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.838637794894153, + "grad_norm": 0.09519962966442108, + "learning_rate": 2.087206106291502e-05, + "loss": 0.31252622604370117, + "memory(GiB)": 78.33, + "step": 4328, + "token_acc": 0.9068911656474201, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.8388315651794798, + "grad_norm": 0.09435832500457764, + "learning_rate": 2.082317046346197e-05, + "loss": 0.32768622040748596, + "memory(GiB)": 78.33, + "step": 4329, + "token_acc": 0.900981393912031, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.8390253354648065, + "grad_norm": 0.10018561035394669, + "learning_rate": 2.077433292040388e-05, + "loss": 0.3233901560306549, + "memory(GiB)": 78.33, + "step": 4330, + "token_acc": 0.9020335985853227, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.8392191057501333, + "grad_norm": 0.09643685072660446, + "learning_rate": 2.072554845379974e-05, + "loss": 0.316950261592865, + "memory(GiB)": 78.33, + "step": 4331, + "token_acc": 0.9074275988617929, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.83941287603546, + "grad_norm": 0.0906091183423996, + "learning_rate": 2.067681708368657e-05, + "loss": 0.31494781374931335, + "memory(GiB)": 78.33, + "step": 4332, + "token_acc": 0.9049370490875654, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.8396066463207867, + "grad_norm": 0.10366849601268768, + "learning_rate": 2.0628138830079695e-05, + "loss": 0.3571404218673706, + "memory(GiB)": 78.33, + "step": 4333, + "token_acc": 0.8948438109026116, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.8398004166061135, + "grad_norm": 0.10383996367454529, + "learning_rate": 2.0579513712972535e-05, + "loss": 0.3651527166366577, + "memory(GiB)": 78.33, + "step": 4334, + "token_acc": 0.8928912539227726, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.8399941868914402, + "grad_norm": 0.1002940759062767, + "learning_rate": 2.0530941752336767e-05, + "loss": 0.33056554198265076, + "memory(GiB)": 78.33, + "step": 4335, + "token_acc": 0.9013376077824453, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.840187957176767, + "grad_norm": 0.09768969565629959, + "learning_rate": 2.0482422968122198e-05, + "loss": 0.327396422624588, + "memory(GiB)": 78.33, + "step": 4336, + "token_acc": 0.9023116889636305, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.8403817274620937, + "grad_norm": 0.0857066735625267, + "learning_rate": 2.043395738025674e-05, + "loss": 0.30105772614479065, + "memory(GiB)": 78.33, + "step": 4337, + "token_acc": 0.9095752105336621, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.8405754977474205, + "grad_norm": 0.09931961447000504, + "learning_rate": 2.0385545008646597e-05, + "loss": 0.3311472535133362, + "memory(GiB)": 78.33, + "step": 4338, + "token_acc": 0.9012172284644194, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.8407692680327472, + "grad_norm": 0.10935309529304504, + "learning_rate": 2.0337185873176004e-05, + "loss": 0.34328311681747437, + "memory(GiB)": 78.33, + "step": 4339, + "token_acc": 0.8972643180476284, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.840963038318074, + "grad_norm": 0.08845790475606918, + "learning_rate": 2.0288879993707335e-05, + "loss": 0.2846664786338806, + "memory(GiB)": 78.33, + "step": 4340, + "token_acc": 0.9113034491667743, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.8411568086034007, + "grad_norm": 0.10597343742847443, + "learning_rate": 2.0240627390081137e-05, + "loss": 0.32449671626091003, + "memory(GiB)": 78.33, + "step": 4341, + "token_acc": 0.9027229011608319, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.8413505788887274, + "grad_norm": 0.1017051637172699, + "learning_rate": 2.0192428082115992e-05, + "loss": 0.32706570625305176, + "memory(GiB)": 78.33, + "step": 4342, + "token_acc": 0.9033635878270155, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.8415443491740542, + "grad_norm": 0.10161115974187851, + "learning_rate": 2.0144282089608778e-05, + "loss": 0.34076470136642456, + "memory(GiB)": 78.33, + "step": 4343, + "token_acc": 0.8972409669428547, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.8417381194593809, + "grad_norm": 0.10345038771629333, + "learning_rate": 2.009618943233419e-05, + "loss": 0.3462626338005066, + "memory(GiB)": 78.33, + "step": 4344, + "token_acc": 0.8954670063412068, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.8419318897447077, + "grad_norm": 0.10364454239606857, + "learning_rate": 2.0048150130045303e-05, + "loss": 0.35689064860343933, + "memory(GiB)": 78.33, + "step": 4345, + "token_acc": 0.893178860336336, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.8421256600300344, + "grad_norm": 0.08873011916875839, + "learning_rate": 2.000016420247308e-05, + "loss": 0.28760528564453125, + "memory(GiB)": 78.33, + "step": 4346, + "token_acc": 0.9117839149094981, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.8423194303153612, + "grad_norm": 0.10340781509876251, + "learning_rate": 1.9952231669326668e-05, + "loss": 0.3450697958469391, + "memory(GiB)": 78.33, + "step": 4347, + "token_acc": 0.896965636602183, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.8425132006006879, + "grad_norm": 0.08939331769943237, + "learning_rate": 1.9904352550293224e-05, + "loss": 0.28838780522346497, + "memory(GiB)": 78.33, + "step": 4348, + "token_acc": 0.9093310888218653, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.8427069708860147, + "grad_norm": 0.09946542233228683, + "learning_rate": 1.9856526865037947e-05, + "loss": 0.322831392288208, + "memory(GiB)": 78.33, + "step": 4349, + "token_acc": 0.90141040206157, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.8429007411713414, + "grad_norm": 0.10646877437829971, + "learning_rate": 1.980875463320426e-05, + "loss": 0.3633573651313782, + "memory(GiB)": 78.33, + "step": 4350, + "token_acc": 0.8931559868080351, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.8430945114566681, + "grad_norm": 0.09976527839899063, + "learning_rate": 1.9761035874413333e-05, + "loss": 0.3256649374961853, + "memory(GiB)": 78.33, + "step": 4351, + "token_acc": 0.9018026445528784, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.8432882817419949, + "grad_norm": 0.10646556317806244, + "learning_rate": 1.9713370608264674e-05, + "loss": 0.32194364070892334, + "memory(GiB)": 78.33, + "step": 4352, + "token_acc": 0.9037880343886623, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.8434820520273216, + "grad_norm": 0.10743288695812225, + "learning_rate": 1.966575885433565e-05, + "loss": 0.34353500604629517, + "memory(GiB)": 78.33, + "step": 4353, + "token_acc": 0.8954018292107992, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.8436758223126484, + "grad_norm": 0.1006690040230751, + "learning_rate": 1.9618200632181673e-05, + "loss": 0.3364703059196472, + "memory(GiB)": 78.33, + "step": 4354, + "token_acc": 0.8980853059647961, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.8438695925979751, + "grad_norm": 0.09362746775150299, + "learning_rate": 1.9570695961336203e-05, + "loss": 0.28917554020881653, + "memory(GiB)": 78.33, + "step": 4355, + "token_acc": 0.9127130038690557, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.8440633628833019, + "grad_norm": 0.11022236943244934, + "learning_rate": 1.9523244861310626e-05, + "loss": 0.31461644172668457, + "memory(GiB)": 78.33, + "step": 4356, + "token_acc": 0.905908453993461, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.8442571331686286, + "grad_norm": 0.09002205729484558, + "learning_rate": 1.9475847351594458e-05, + "loss": 0.28434687852859497, + "memory(GiB)": 78.33, + "step": 4357, + "token_acc": 0.9123356771629471, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.8444509034539553, + "grad_norm": 0.09616533666849136, + "learning_rate": 1.9428503451655125e-05, + "loss": 0.33912795782089233, + "memory(GiB)": 78.33, + "step": 4358, + "token_acc": 0.8976422492479367, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.8446446737392821, + "grad_norm": 0.09272696822881699, + "learning_rate": 1.9381213180938003e-05, + "loss": 0.30497756600379944, + "memory(GiB)": 78.33, + "step": 4359, + "token_acc": 0.9095019342359768, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.8448384440246088, + "grad_norm": 0.09096968919038773, + "learning_rate": 1.9333976558866476e-05, + "loss": 0.295366108417511, + "memory(GiB)": 78.33, + "step": 4360, + "token_acc": 0.9114807813484562, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.8450322143099356, + "grad_norm": 0.1177552342414856, + "learning_rate": 1.928679360484194e-05, + "loss": 0.3394724428653717, + "memory(GiB)": 78.33, + "step": 4361, + "token_acc": 0.8994290351563632, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.8452259845952623, + "grad_norm": 0.10825799405574799, + "learning_rate": 1.9239664338243637e-05, + "loss": 0.32095205783843994, + "memory(GiB)": 78.33, + "step": 4362, + "token_acc": 0.90408615136876, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.8454197548805891, + "grad_norm": 0.09334609657526016, + "learning_rate": 1.9192588778428842e-05, + "loss": 0.30651578307151794, + "memory(GiB)": 78.33, + "step": 4363, + "token_acc": 0.907121790842721, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.8456135251659158, + "grad_norm": 0.10030210018157959, + "learning_rate": 1.91455669447328e-05, + "loss": 0.3436662554740906, + "memory(GiB)": 78.33, + "step": 4364, + "token_acc": 0.9008844323772145, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.8458072954512426, + "grad_norm": 0.09695718437433243, + "learning_rate": 1.909859885646861e-05, + "loss": 0.33061671257019043, + "memory(GiB)": 78.33, + "step": 4365, + "token_acc": 0.9015153412648717, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.8460010657365693, + "grad_norm": 0.1165538877248764, + "learning_rate": 1.9051684532927332e-05, + "loss": 0.3574296534061432, + "memory(GiB)": 78.33, + "step": 4366, + "token_acc": 0.8923136197264496, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.846194836021896, + "grad_norm": 0.09451232105493546, + "learning_rate": 1.9004823993377927e-05, + "loss": 0.3224382996559143, + "memory(GiB)": 78.33, + "step": 4367, + "token_acc": 0.9037022153011504, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.8463886063072228, + "grad_norm": 0.08888131380081177, + "learning_rate": 1.895801725706727e-05, + "loss": 0.2944706082344055, + "memory(GiB)": 78.33, + "step": 4368, + "token_acc": 0.9106517341753416, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.8465823765925495, + "grad_norm": 0.10048877447843552, + "learning_rate": 1.8911264343220184e-05, + "loss": 0.3280665874481201, + "memory(GiB)": 78.33, + "step": 4369, + "token_acc": 0.9011113631208891, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.8467761468778763, + "grad_norm": 0.10502710193395615, + "learning_rate": 1.8864565271039274e-05, + "loss": 0.3245999217033386, + "memory(GiB)": 78.33, + "step": 4370, + "token_acc": 0.9021815622800844, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.846969917163203, + "grad_norm": 0.1014811173081398, + "learning_rate": 1.8817920059705194e-05, + "loss": 0.3340781629085541, + "memory(GiB)": 78.33, + "step": 4371, + "token_acc": 0.8999215070643642, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.8471636874485298, + "grad_norm": 0.09874725341796875, + "learning_rate": 1.8771328728376338e-05, + "loss": 0.3380013704299927, + "memory(GiB)": 78.33, + "step": 4372, + "token_acc": 0.8971284634760706, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.8473574577338565, + "grad_norm": 0.14172475039958954, + "learning_rate": 1.8724791296189034e-05, + "loss": 0.32453781366348267, + "memory(GiB)": 78.33, + "step": 4373, + "token_acc": 0.9030038451530833, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.8475512280191833, + "grad_norm": 0.11031965911388397, + "learning_rate": 1.867830778225744e-05, + "loss": 0.3707287907600403, + "memory(GiB)": 78.33, + "step": 4374, + "token_acc": 0.892270504861836, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.84774499830451, + "grad_norm": 0.10298004001379013, + "learning_rate": 1.8631878205673552e-05, + "loss": 0.34424299001693726, + "memory(GiB)": 78.33, + "step": 4375, + "token_acc": 0.8972931413876296, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.8479387685898367, + "grad_norm": 0.11782942712306976, + "learning_rate": 1.858550258550736e-05, + "loss": 0.32887881994247437, + "memory(GiB)": 78.33, + "step": 4376, + "token_acc": 0.9019160805238904, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.8481325388751635, + "grad_norm": 0.09481247514486313, + "learning_rate": 1.8539180940806436e-05, + "loss": 0.32029709219932556, + "memory(GiB)": 78.33, + "step": 4377, + "token_acc": 0.9027707297269867, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.8483263091604902, + "grad_norm": 0.11217660456895828, + "learning_rate": 1.8492913290596407e-05, + "loss": 0.3478485345840454, + "memory(GiB)": 78.33, + "step": 4378, + "token_acc": 0.8978001381079215, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.848520079445817, + "grad_norm": 0.10071595013141632, + "learning_rate": 1.8446699653880638e-05, + "loss": 0.32309237122535706, + "memory(GiB)": 78.33, + "step": 4379, + "token_acc": 0.9025203747208446, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.8487138497311437, + "grad_norm": 0.10318909585475922, + "learning_rate": 1.8400540049640278e-05, + "loss": 0.3701336681842804, + "memory(GiB)": 78.33, + "step": 4380, + "token_acc": 0.8895177546906498, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.8489076200164705, + "grad_norm": 0.1191054955124855, + "learning_rate": 1.8354434496834346e-05, + "loss": 0.37648195028305054, + "memory(GiB)": 78.33, + "step": 4381, + "token_acc": 0.8879827766179541, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.8491013903017972, + "grad_norm": 0.09198344498872757, + "learning_rate": 1.830838301439958e-05, + "loss": 0.32460615038871765, + "memory(GiB)": 78.33, + "step": 4382, + "token_acc": 0.9025374855824683, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.849295160587124, + "grad_norm": 0.09382152557373047, + "learning_rate": 1.826238562125068e-05, + "loss": 0.31965401768684387, + "memory(GiB)": 78.33, + "step": 4383, + "token_acc": 0.9034426770175306, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.8494889308724507, + "grad_norm": 0.10211214423179626, + "learning_rate": 1.821644233627985e-05, + "loss": 0.35185420513153076, + "memory(GiB)": 78.33, + "step": 4384, + "token_acc": 0.8940360240160107, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.8496827011577774, + "grad_norm": 0.09837459772825241, + "learning_rate": 1.8170553178357366e-05, + "loss": 0.3056308627128601, + "memory(GiB)": 78.33, + "step": 4385, + "token_acc": 0.9083345412378605, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.8498764714431042, + "grad_norm": 0.09455987811088562, + "learning_rate": 1.8124718166331066e-05, + "loss": 0.2900204658508301, + "memory(GiB)": 78.33, + "step": 4386, + "token_acc": 0.9121223003129328, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.8500702417284309, + "grad_norm": 0.09906947612762451, + "learning_rate": 1.8078937319026654e-05, + "loss": 0.33766764402389526, + "memory(GiB)": 78.33, + "step": 4387, + "token_acc": 0.8973069206205668, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.8502640120137577, + "grad_norm": 0.09509405493736267, + "learning_rate": 1.8033210655247527e-05, + "loss": 0.3071034550666809, + "memory(GiB)": 78.33, + "step": 4388, + "token_acc": 0.9057726952298896, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.8504577822990844, + "grad_norm": 0.10163518041372299, + "learning_rate": 1.7987538193774857e-05, + "loss": 0.34781065583229065, + "memory(GiB)": 78.33, + "step": 4389, + "token_acc": 0.8967510195262917, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.8506515525844112, + "grad_norm": 0.09907959401607513, + "learning_rate": 1.794191995336761e-05, + "loss": 0.3162297308444977, + "memory(GiB)": 78.33, + "step": 4390, + "token_acc": 0.9030727923627685, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.8508453228697379, + "grad_norm": 0.10068470239639282, + "learning_rate": 1.7896355952762314e-05, + "loss": 0.32280969619750977, + "memory(GiB)": 78.33, + "step": 4391, + "token_acc": 0.9020573146084884, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.8510390931550647, + "grad_norm": 0.1039271429181099, + "learning_rate": 1.785084621067343e-05, + "loss": 0.3339514434337616, + "memory(GiB)": 78.33, + "step": 4392, + "token_acc": 0.9003369695585127, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.8512328634403914, + "grad_norm": 0.1004716232419014, + "learning_rate": 1.780539074579299e-05, + "loss": 0.3353673815727234, + "memory(GiB)": 78.33, + "step": 4393, + "token_acc": 0.8975802074107934, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.8514266337257181, + "grad_norm": 0.10031532496213913, + "learning_rate": 1.7759989576790778e-05, + "loss": 0.32534271478652954, + "memory(GiB)": 78.33, + "step": 4394, + "token_acc": 0.9057053186123409, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.8516204040110449, + "grad_norm": 0.10088721662759781, + "learning_rate": 1.7714642722314278e-05, + "loss": 0.3248327970504761, + "memory(GiB)": 78.33, + "step": 4395, + "token_acc": 0.9025890722491108, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.8518141742963716, + "grad_norm": 0.10467974096536636, + "learning_rate": 1.766935020098862e-05, + "loss": 0.33937883377075195, + "memory(GiB)": 78.33, + "step": 4396, + "token_acc": 0.8982584532423584, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.8520079445816984, + "grad_norm": 0.0988173633813858, + "learning_rate": 1.7624112031416725e-05, + "loss": 0.33925455808639526, + "memory(GiB)": 78.33, + "step": 4397, + "token_acc": 0.8979691440036875, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.8522017148670251, + "grad_norm": 0.10114793479442596, + "learning_rate": 1.7578928232179102e-05, + "loss": 0.29743602871894836, + "memory(GiB)": 78.33, + "step": 4398, + "token_acc": 0.9106254871918932, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.8523954851523519, + "grad_norm": 0.1041836142539978, + "learning_rate": 1.753379882183395e-05, + "loss": 0.3369101881980896, + "memory(GiB)": 78.33, + "step": 4399, + "token_acc": 0.8994292404507537, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.8525892554376786, + "grad_norm": 0.10185372084379196, + "learning_rate": 1.748872381891713e-05, + "loss": 0.3253825902938843, + "memory(GiB)": 78.33, + "step": 4400, + "token_acc": 0.9039803494668677, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.8527830257230054, + "grad_norm": 0.09283772110939026, + "learning_rate": 1.7443703241942143e-05, + "loss": 0.2999337911605835, + "memory(GiB)": 78.33, + "step": 4401, + "token_acc": 0.9094778884305121, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.8529767960083321, + "grad_norm": 0.10008923709392548, + "learning_rate": 1.739873710940015e-05, + "loss": 0.32751649618148804, + "memory(GiB)": 78.33, + "step": 4402, + "token_acc": 0.9021663945992309, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.8531705662936588, + "grad_norm": 0.11449175328016281, + "learning_rate": 1.7353825439759948e-05, + "loss": 0.3371378481388092, + "memory(GiB)": 78.33, + "step": 4403, + "token_acc": 0.8989161168708766, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.8533643365789856, + "grad_norm": 0.10284436494112015, + "learning_rate": 1.7308968251467997e-05, + "loss": 0.34078723192214966, + "memory(GiB)": 78.33, + "step": 4404, + "token_acc": 0.8978489916236173, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.8535581068643123, + "grad_norm": 0.09477044641971588, + "learning_rate": 1.726416556294834e-05, + "loss": 0.31810081005096436, + "memory(GiB)": 78.33, + "step": 4405, + "token_acc": 0.9051635577183023, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.8537518771496391, + "grad_norm": 0.09910459071397781, + "learning_rate": 1.721941739260264e-05, + "loss": 0.340251088142395, + "memory(GiB)": 78.33, + "step": 4406, + "token_acc": 0.8982416457979913, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.8539456474349658, + "grad_norm": 0.09987306594848633, + "learning_rate": 1.7174723758810166e-05, + "loss": 0.3514068126678467, + "memory(GiB)": 78.33, + "step": 4407, + "token_acc": 0.8941308390766874, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.8541394177202926, + "grad_norm": 0.10393253713846207, + "learning_rate": 1.7130084679927763e-05, + "loss": 0.34633970260620117, + "memory(GiB)": 78.33, + "step": 4408, + "token_acc": 0.896145340536909, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.8543331880056193, + "grad_norm": 0.10434923321008682, + "learning_rate": 1.708550017429e-05, + "loss": 0.34398695826530457, + "memory(GiB)": 78.33, + "step": 4409, + "token_acc": 0.8962004803657725, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.854526958290946, + "grad_norm": 0.09756119549274445, + "learning_rate": 1.70409702602088e-05, + "loss": 0.31827184557914734, + "memory(GiB)": 78.33, + "step": 4410, + "token_acc": 0.9032197224841708, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.8547207285762728, + "grad_norm": 0.09933782368898392, + "learning_rate": 1.699649495597389e-05, + "loss": 0.30512192845344543, + "memory(GiB)": 78.33, + "step": 4411, + "token_acc": 0.9053738951249932, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.8549144988615995, + "grad_norm": 0.0985947772860527, + "learning_rate": 1.695207427985246e-05, + "loss": 0.31603682041168213, + "memory(GiB)": 78.33, + "step": 4412, + "token_acc": 0.9057937693987814, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.8551082691469263, + "grad_norm": 0.09728779643774033, + "learning_rate": 1.690770825008924e-05, + "loss": 0.3370038568973541, + "memory(GiB)": 78.33, + "step": 4413, + "token_acc": 0.898977370525996, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.855302039432253, + "grad_norm": 0.09609750658273697, + "learning_rate": 1.6863396884906583e-05, + "loss": 0.30580419301986694, + "memory(GiB)": 78.33, + "step": 4414, + "token_acc": 0.9062045771938325, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.8554958097175798, + "grad_norm": 0.08634244650602341, + "learning_rate": 1.681914020250431e-05, + "loss": 0.3033207356929779, + "memory(GiB)": 78.33, + "step": 4415, + "token_acc": 0.9060820975714593, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.8556895800029065, + "grad_norm": 0.10316123068332672, + "learning_rate": 1.677493822105992e-05, + "loss": 0.3632691502571106, + "memory(GiB)": 78.33, + "step": 4416, + "token_acc": 0.8925554382259767, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.8558833502882333, + "grad_norm": 0.08851286768913269, + "learning_rate": 1.6730790958728253e-05, + "loss": 0.2938583493232727, + "memory(GiB)": 78.33, + "step": 4417, + "token_acc": 0.9093023255813953, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.85607712057356, + "grad_norm": 0.10702818632125854, + "learning_rate": 1.6686698433641836e-05, + "loss": 0.33791351318359375, + "memory(GiB)": 78.33, + "step": 4418, + "token_acc": 0.8968233799237612, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.8562708908588867, + "grad_norm": 0.10163046419620514, + "learning_rate": 1.6642660663910658e-05, + "loss": 0.34928232431411743, + "memory(GiB)": 78.33, + "step": 4419, + "token_acc": 0.8963705698818635, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.8564646611442135, + "grad_norm": 0.09223022311925888, + "learning_rate": 1.6598677667622175e-05, + "loss": 0.29846811294555664, + "memory(GiB)": 78.33, + "step": 4420, + "token_acc": 0.9110857397454819, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.8566584314295402, + "grad_norm": 0.0997534915804863, + "learning_rate": 1.655474946284142e-05, + "loss": 0.3577570915222168, + "memory(GiB)": 78.33, + "step": 4421, + "token_acc": 0.8929023263019287, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.856852201714867, + "grad_norm": 0.10170114785432816, + "learning_rate": 1.6510876067610833e-05, + "loss": 0.3271295130252838, + "memory(GiB)": 78.33, + "step": 4422, + "token_acc": 0.9011473179963333, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.8570459720001937, + "grad_norm": 0.10372772812843323, + "learning_rate": 1.6467057499950497e-05, + "loss": 0.3079608380794525, + "memory(GiB)": 78.33, + "step": 4423, + "token_acc": 0.9084609878310665, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.8572397422855205, + "grad_norm": 0.09529042989015579, + "learning_rate": 1.6423293777857765e-05, + "loss": 0.31907567381858826, + "memory(GiB)": 78.33, + "step": 4424, + "token_acc": 0.9041177847722602, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.8574335125708472, + "grad_norm": 0.10136647522449493, + "learning_rate": 1.6379584919307644e-05, + "loss": 0.3494371175765991, + "memory(GiB)": 78.33, + "step": 4425, + "token_acc": 0.8960502692998205, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.857627282856174, + "grad_norm": 0.091824971139431, + "learning_rate": 1.6335930942252535e-05, + "loss": 0.30003249645233154, + "memory(GiB)": 78.33, + "step": 4426, + "token_acc": 0.9094136715634568, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.8578210531415007, + "grad_norm": 0.10738231241703033, + "learning_rate": 1.6292331864622265e-05, + "loss": 0.338476300239563, + "memory(GiB)": 78.33, + "step": 4427, + "token_acc": 0.8967210902712185, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.8580148234268274, + "grad_norm": 0.09496748447418213, + "learning_rate": 1.6248787704324163e-05, + "loss": 0.3244837522506714, + "memory(GiB)": 78.33, + "step": 4428, + "token_acc": 0.9002851103573711, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.8582085937121542, + "grad_norm": 0.10821764171123505, + "learning_rate": 1.620529847924295e-05, + "loss": 0.3445894420146942, + "memory(GiB)": 78.33, + "step": 4429, + "token_acc": 0.8967734961898071, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.8584023639974809, + "grad_norm": 0.11172088235616684, + "learning_rate": 1.616186420724089e-05, + "loss": 0.3518655598163605, + "memory(GiB)": 78.33, + "step": 4430, + "token_acc": 0.8950802436686354, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.8585961342828077, + "grad_norm": 0.09562534093856812, + "learning_rate": 1.611848490615757e-05, + "loss": 0.31965699791908264, + "memory(GiB)": 78.33, + "step": 4431, + "token_acc": 0.9038684271484827, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.8587899045681344, + "grad_norm": 0.10791201889514923, + "learning_rate": 1.6075160593810044e-05, + "loss": 0.3649354577064514, + "memory(GiB)": 78.33, + "step": 4432, + "token_acc": 0.8911983213293257, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.8589836748534613, + "grad_norm": 0.10779910534620285, + "learning_rate": 1.6031891287992747e-05, + "loss": 0.3215060234069824, + "memory(GiB)": 78.33, + "step": 4433, + "token_acc": 0.9031464957595514, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.859177445138788, + "grad_norm": 0.09658445417881012, + "learning_rate": 1.5988677006477568e-05, + "loss": 0.33394524455070496, + "memory(GiB)": 78.33, + "step": 4434, + "token_acc": 0.8990924985430022, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.8593712154241148, + "grad_norm": 0.10684984922409058, + "learning_rate": 1.594551776701377e-05, + "loss": 0.33571135997772217, + "memory(GiB)": 78.33, + "step": 4435, + "token_acc": 0.9019298688193743, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.8595649857094415, + "grad_norm": 0.09724808484315872, + "learning_rate": 1.5902413587327978e-05, + "loss": 0.32527902722358704, + "memory(GiB)": 78.33, + "step": 4436, + "token_acc": 0.9014134468456669, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.8597587559947683, + "grad_norm": 0.1140824407339096, + "learning_rate": 1.5859364485124294e-05, + "loss": 0.3552241027355194, + "memory(GiB)": 78.33, + "step": 4437, + "token_acc": 0.8934269717466584, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.859952526280095, + "grad_norm": 0.11029309034347534, + "learning_rate": 1.5816370478084106e-05, + "loss": 0.3606140613555908, + "memory(GiB)": 78.33, + "step": 4438, + "token_acc": 0.8945596964690044, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.8601462965654217, + "grad_norm": 0.10048742592334747, + "learning_rate": 1.5773431583866226e-05, + "loss": 0.3234768509864807, + "memory(GiB)": 78.33, + "step": 4439, + "token_acc": 0.9028618152085037, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.8603400668507485, + "grad_norm": 0.10467264801263809, + "learning_rate": 1.573054782010681e-05, + "loss": 0.33086928725242615, + "memory(GiB)": 78.33, + "step": 4440, + "token_acc": 0.9003659258462036, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.8605338371360752, + "grad_norm": 0.11301259696483612, + "learning_rate": 1.568771920441932e-05, + "loss": 0.36950597167015076, + "memory(GiB)": 78.33, + "step": 4441, + "token_acc": 0.8924123686288804, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.860727607421402, + "grad_norm": 0.1058172732591629, + "learning_rate": 1.5644945754394732e-05, + "loss": 0.36208900809288025, + "memory(GiB)": 78.33, + "step": 4442, + "token_acc": 0.8900826684333324, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.8609213777067287, + "grad_norm": 0.10789795219898224, + "learning_rate": 1.5602227487601114e-05, + "loss": 0.35568827390670776, + "memory(GiB)": 78.33, + "step": 4443, + "token_acc": 0.8932357022646078, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.8611151479920555, + "grad_norm": 0.10026465356349945, + "learning_rate": 1.5559564421584114e-05, + "loss": 0.32759472727775574, + "memory(GiB)": 78.33, + "step": 4444, + "token_acc": 0.8990959539379413, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.8613089182773822, + "grad_norm": 0.09213128685951233, + "learning_rate": 1.5516956573866564e-05, + "loss": 0.3294154107570648, + "memory(GiB)": 78.33, + "step": 4445, + "token_acc": 0.9012261580381471, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.861502688562709, + "grad_norm": 0.09709835052490234, + "learning_rate": 1.5474403961948627e-05, + "loss": 0.31836092472076416, + "memory(GiB)": 78.33, + "step": 4446, + "token_acc": 0.9042838018741634, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.8616964588480357, + "grad_norm": 0.10115919262170792, + "learning_rate": 1.5431906603307846e-05, + "loss": 0.3285466432571411, + "memory(GiB)": 78.33, + "step": 4447, + "token_acc": 0.8986334681362831, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.8618902291333624, + "grad_norm": 0.10198438912630081, + "learning_rate": 1.5389464515398976e-05, + "loss": 0.326136976480484, + "memory(GiB)": 78.33, + "step": 4448, + "token_acc": 0.9012800087897599, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.8620839994186892, + "grad_norm": 0.10833071917295456, + "learning_rate": 1.5347077715654198e-05, + "loss": 0.3257608115673065, + "memory(GiB)": 78.33, + "step": 4449, + "token_acc": 0.9010049449672994, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.8622777697040159, + "grad_norm": 0.09669304639101028, + "learning_rate": 1.5304746221482827e-05, + "loss": 0.30072930455207825, + "memory(GiB)": 78.33, + "step": 4450, + "token_acc": 0.9068726155150487, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.8624715399893427, + "grad_norm": 0.11190321296453476, + "learning_rate": 1.52624700502716e-05, + "loss": 0.3294280469417572, + "memory(GiB)": 78.33, + "step": 4451, + "token_acc": 0.9005425479444106, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.8626653102746694, + "grad_norm": 0.09287480264902115, + "learning_rate": 1.5220249219384484e-05, + "loss": 0.3197273015975952, + "memory(GiB)": 78.33, + "step": 4452, + "token_acc": 0.9034961140224298, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.8628590805599962, + "grad_norm": 0.09530606120824814, + "learning_rate": 1.5178083746162666e-05, + "loss": 0.3117508590221405, + "memory(GiB)": 78.33, + "step": 4453, + "token_acc": 0.9065753927662811, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.8630528508453229, + "grad_norm": 0.10420526564121246, + "learning_rate": 1.5135973647924665e-05, + "loss": 0.30883246660232544, + "memory(GiB)": 78.33, + "step": 4454, + "token_acc": 0.9062847265094456, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.8632466211306496, + "grad_norm": 0.08543951064348221, + "learning_rate": 1.5093918941966193e-05, + "loss": 0.27868953347206116, + "memory(GiB)": 78.33, + "step": 4455, + "token_acc": 0.9159319449077825, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.8634403914159764, + "grad_norm": 0.09152313321828842, + "learning_rate": 1.5051919645560334e-05, + "loss": 0.31067970395088196, + "memory(GiB)": 78.33, + "step": 4456, + "token_acc": 0.9047191907111177, + "train_speed(iter/s)": 0.032453 + }, + { + "epoch": 0.8636341617013031, + "grad_norm": 0.11151348054409027, + "learning_rate": 1.5009975775957207e-05, + "loss": 0.3601279556751251, + "memory(GiB)": 78.33, + "step": 4457, + "token_acc": 0.8918564063150479, + "train_speed(iter/s)": 0.032454 + }, + { + "epoch": 0.8638279319866299, + "grad_norm": 0.10080854594707489, + "learning_rate": 1.4968087350384395e-05, + "loss": 0.34047406911849976, + "memory(GiB)": 78.33, + "step": 4458, + "token_acc": 0.8975244985557326, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.8640217022719566, + "grad_norm": 0.0961776152253151, + "learning_rate": 1.4926254386046554e-05, + "loss": 0.3052217662334442, + "memory(GiB)": 78.33, + "step": 4459, + "token_acc": 0.908130910695084, + "train_speed(iter/s)": 0.032455 + }, + { + "epoch": 0.8642154725572834, + "grad_norm": 0.10806053876876831, + "learning_rate": 1.4884476900125591e-05, + "loss": 0.33401399850845337, + "memory(GiB)": 78.33, + "step": 4460, + "token_acc": 0.9004724605246799, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.8644092428426101, + "grad_norm": 0.09109724313020706, + "learning_rate": 1.484275490978068e-05, + "loss": 0.32588818669319153, + "memory(GiB)": 78.33, + "step": 4461, + "token_acc": 0.90133139220889, + "train_speed(iter/s)": 0.032456 + }, + { + "epoch": 0.8646030131279369, + "grad_norm": 0.08890355378389359, + "learning_rate": 1.4801088432148112e-05, + "loss": 0.2869184613227844, + "memory(GiB)": 78.33, + "step": 4462, + "token_acc": 0.911701588985605, + "train_speed(iter/s)": 0.032457 + }, + { + "epoch": 0.8647967834132636, + "grad_norm": 0.09064006060361862, + "learning_rate": 1.4759477484341513e-05, + "loss": 0.29877418279647827, + "memory(GiB)": 78.33, + "step": 4463, + "token_acc": 0.9077449822904369, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.8649905536985903, + "grad_norm": 0.10824266821146011, + "learning_rate": 1.47179220834515e-05, + "loss": 0.3239610493183136, + "memory(GiB)": 78.33, + "step": 4464, + "token_acc": 0.9016067776803973, + "train_speed(iter/s)": 0.032458 + }, + { + "epoch": 0.8651843239839171, + "grad_norm": 0.1040419191122055, + "learning_rate": 1.46764222465461e-05, + "loss": 0.35535627603530884, + "memory(GiB)": 78.33, + "step": 4465, + "token_acc": 0.8935964513193813, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.8653780942692438, + "grad_norm": 0.13735035061836243, + "learning_rate": 1.463497799067036e-05, + "loss": 0.3214113116264343, + "memory(GiB)": 78.33, + "step": 4466, + "token_acc": 0.9032424242424243, + "train_speed(iter/s)": 0.032459 + }, + { + "epoch": 0.8655718645545706, + "grad_norm": 0.11260360479354858, + "learning_rate": 1.4593589332846567e-05, + "loss": 0.3748854696750641, + "memory(GiB)": 78.33, + "step": 4467, + "token_acc": 0.8932340525328331, + "train_speed(iter/s)": 0.03246 + }, + { + "epoch": 0.8657656348398973, + "grad_norm": 0.09724284708499908, + "learning_rate": 1.4552256290074138e-05, + "loss": 0.32235243916511536, + "memory(GiB)": 78.33, + "step": 4468, + "token_acc": 0.9010934523343511, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.8659594051252241, + "grad_norm": 0.1042628064751625, + "learning_rate": 1.451097887932966e-05, + "loss": 0.3547298014163971, + "memory(GiB)": 78.33, + "step": 4469, + "token_acc": 0.8934734091775081, + "train_speed(iter/s)": 0.032461 + }, + { + "epoch": 0.8661531754105508, + "grad_norm": 0.11357161402702332, + "learning_rate": 1.4469757117566888e-05, + "loss": 0.3611631393432617, + "memory(GiB)": 78.33, + "step": 4470, + "token_acc": 0.8912460223373058, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.8663469456958776, + "grad_norm": 0.11139743030071259, + "learning_rate": 1.4428591021716729e-05, + "loss": 0.35104861855506897, + "memory(GiB)": 78.33, + "step": 4471, + "token_acc": 0.8951944796856144, + "train_speed(iter/s)": 0.032462 + }, + { + "epoch": 0.8665407159812043, + "grad_norm": 0.11307670921087265, + "learning_rate": 1.4387480608687174e-05, + "loss": 0.36476895213127136, + "memory(GiB)": 78.33, + "step": 4472, + "token_acc": 0.8911314038074601, + "train_speed(iter/s)": 0.032463 + }, + { + "epoch": 0.866734486266531, + "grad_norm": 0.10000620037317276, + "learning_rate": 1.4346425895363384e-05, + "loss": 0.33012938499450684, + "memory(GiB)": 78.33, + "step": 4473, + "token_acc": 0.9017330185642236, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.8669282565518578, + "grad_norm": 0.09574344009160995, + "learning_rate": 1.4305426898607602e-05, + "loss": 0.31818804144859314, + "memory(GiB)": 78.33, + "step": 4474, + "token_acc": 0.9036768787502313, + "train_speed(iter/s)": 0.032464 + }, + { + "epoch": 0.8671220268371845, + "grad_norm": 0.09933432936668396, + "learning_rate": 1.426448363525931e-05, + "loss": 0.3508920669555664, + "memory(GiB)": 78.33, + "step": 4475, + "token_acc": 0.8937370802838147, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.8673157971225113, + "grad_norm": 0.10301670432090759, + "learning_rate": 1.4223596122134873e-05, + "loss": 0.3236181437969208, + "memory(GiB)": 78.33, + "step": 4476, + "token_acc": 0.9032188377619357, + "train_speed(iter/s)": 0.032465 + }, + { + "epoch": 0.867509567407838, + "grad_norm": 0.11474580317735672, + "learning_rate": 1.4182764376028006e-05, + "loss": 0.3661247491836548, + "memory(GiB)": 78.33, + "step": 4477, + "token_acc": 0.8923494649417026, + "train_speed(iter/s)": 0.032466 + }, + { + "epoch": 0.8677033376931648, + "grad_norm": 0.10032869875431061, + "learning_rate": 1.414198841370936e-05, + "loss": 0.3041604459285736, + "memory(GiB)": 78.33, + "step": 4478, + "token_acc": 0.9077770323157777, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.8678971079784915, + "grad_norm": 0.10304979979991913, + "learning_rate": 1.4101268251926707e-05, + "loss": 0.3436087667942047, + "memory(GiB)": 78.33, + "step": 4479, + "token_acc": 0.8969530197494768, + "train_speed(iter/s)": 0.032467 + }, + { + "epoch": 0.8680908782638183, + "grad_norm": 0.10889364033937454, + "learning_rate": 1.4060603907404933e-05, + "loss": 0.35096222162246704, + "memory(GiB)": 78.33, + "step": 4480, + "token_acc": 0.8916971772764852, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.868284648549145, + "grad_norm": 0.09136100858449936, + "learning_rate": 1.401999539684593e-05, + "loss": 0.3105822205543518, + "memory(GiB)": 78.33, + "step": 4481, + "token_acc": 0.9048854192898514, + "train_speed(iter/s)": 0.032468 + }, + { + "epoch": 0.8684784188344717, + "grad_norm": 0.09702181816101074, + "learning_rate": 1.3979442736928803e-05, + "loss": 0.31895220279693604, + "memory(GiB)": 78.33, + "step": 4482, + "token_acc": 0.9040166789823709, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.8686721891197985, + "grad_norm": 0.09936001151800156, + "learning_rate": 1.3938945944309499e-05, + "loss": 0.3287636339664459, + "memory(GiB)": 78.33, + "step": 4483, + "token_acc": 0.9009811937857727, + "train_speed(iter/s)": 0.032469 + }, + { + "epoch": 0.8688659594051252, + "grad_norm": 0.09482026845216751, + "learning_rate": 1.3898505035621226e-05, + "loss": 0.30222681164741516, + "memory(GiB)": 78.33, + "step": 4484, + "token_acc": 0.9081456725285801, + "train_speed(iter/s)": 0.03247 + }, + { + "epoch": 0.869059729690452, + "grad_norm": 0.09608285129070282, + "learning_rate": 1.3858120027474134e-05, + "loss": 0.2800081968307495, + "memory(GiB)": 78.33, + "step": 4485, + "token_acc": 0.9137849484969519, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.8692534999757787, + "grad_norm": 0.09473740309476852, + "learning_rate": 1.3817790936455402e-05, + "loss": 0.3255942165851593, + "memory(GiB)": 78.33, + "step": 4486, + "token_acc": 0.9004664035672185, + "train_speed(iter/s)": 0.032471 + }, + { + "epoch": 0.8694472702611055, + "grad_norm": 0.09641645103693008, + "learning_rate": 1.3777517779129316e-05, + "loss": 0.31738942861557007, + "memory(GiB)": 78.33, + "step": 4487, + "token_acc": 0.9032964135021097, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.8696410405464322, + "grad_norm": 0.12395960092544556, + "learning_rate": 1.3737300572037075e-05, + "loss": 0.30116546154022217, + "memory(GiB)": 78.33, + "step": 4488, + "token_acc": 0.9070750061682704, + "train_speed(iter/s)": 0.032472 + }, + { + "epoch": 0.869834810831759, + "grad_norm": 0.09995721280574799, + "learning_rate": 1.3697139331697065e-05, + "loss": 0.34251344203948975, + "memory(GiB)": 78.33, + "step": 4489, + "token_acc": 0.896267974399468, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.8700285811170857, + "grad_norm": 0.10301598161458969, + "learning_rate": 1.3657034074604478e-05, + "loss": 0.3315487205982208, + "memory(GiB)": 78.33, + "step": 4490, + "token_acc": 0.9004393623612911, + "train_speed(iter/s)": 0.032473 + }, + { + "epoch": 0.8702223514024124, + "grad_norm": 0.099857397377491, + "learning_rate": 1.3616984817231685e-05, + "loss": 0.3251858353614807, + "memory(GiB)": 78.33, + "step": 4491, + "token_acc": 0.9022668466644167, + "train_speed(iter/s)": 0.032474 + }, + { + "epoch": 0.8704161216877392, + "grad_norm": 0.09973510354757309, + "learning_rate": 1.3576991576028013e-05, + "loss": 0.3382074534893036, + "memory(GiB)": 78.33, + "step": 4492, + "token_acc": 0.9003876364378252, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.8706098919730659, + "grad_norm": 0.08957747370004654, + "learning_rate": 1.3537054367419703e-05, + "loss": 0.29314741492271423, + "memory(GiB)": 78.33, + "step": 4493, + "token_acc": 0.9126814476038974, + "train_speed(iter/s)": 0.032475 + }, + { + "epoch": 0.8708036622583927, + "grad_norm": 0.10090366750955582, + "learning_rate": 1.3497173207810068e-05, + "loss": 0.32285887002944946, + "memory(GiB)": 78.33, + "step": 4494, + "token_acc": 0.9043573045641298, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.8709974325437194, + "grad_norm": 0.09529086202383041, + "learning_rate": 1.3457348113579358e-05, + "loss": 0.30941274762153625, + "memory(GiB)": 78.33, + "step": 4495, + "token_acc": 0.9068646306126112, + "train_speed(iter/s)": 0.032476 + }, + { + "epoch": 0.8711912028290462, + "grad_norm": 0.09300097078084946, + "learning_rate": 1.3417579101084869e-05, + "loss": 0.3176164925098419, + "memory(GiB)": 78.33, + "step": 4496, + "token_acc": 0.9025844421699079, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.8713849731143729, + "grad_norm": 0.10441552102565765, + "learning_rate": 1.3377866186660701e-05, + "loss": 0.33653199672698975, + "memory(GiB)": 78.33, + "step": 4497, + "token_acc": 0.8984428473648186, + "train_speed(iter/s)": 0.032477 + }, + { + "epoch": 0.8715787433996997, + "grad_norm": 0.10322162508964539, + "learning_rate": 1.3338209386618092e-05, + "loss": 0.32316023111343384, + "memory(GiB)": 78.33, + "step": 4498, + "token_acc": 0.9009378129837021, + "train_speed(iter/s)": 0.032478 + }, + { + "epoch": 0.8717725136850264, + "grad_norm": 0.09797031432390213, + "learning_rate": 1.329860871724513e-05, + "loss": 0.3057693541049957, + "memory(GiB)": 78.33, + "step": 4499, + "token_acc": 0.9071016563475685, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.8719662839703531, + "grad_norm": 0.10819875448942184, + "learning_rate": 1.3259064194806885e-05, + "loss": 0.3662481904029846, + "memory(GiB)": 78.33, + "step": 4500, + "token_acc": 0.8935516888433982, + "train_speed(iter/s)": 0.032479 + }, + { + "epoch": 0.8719662839703531, + "eval_loss": 0.3799753785133362, + "eval_runtime": 1344.7448, + "eval_samples_per_second": 5.019, + "eval_steps_per_second": 5.019, + "eval_token_acc": 0.9022423587138677, + "step": 4500 + }, + { + "epoch": 0.8721600542556799, + "grad_norm": 0.09284574538469315, + "learning_rate": 1.3219575835545332e-05, + "loss": 0.30319368839263916, + "memory(GiB)": 78.33, + "step": 4501, + "token_acc": 0.9082558951495945, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.8723538245410066, + "grad_norm": 0.098308265209198, + "learning_rate": 1.3180143655679397e-05, + "loss": 0.3219000995159149, + "memory(GiB)": 78.33, + "step": 4502, + "token_acc": 0.9041865907009868, + "train_speed(iter/s)": 0.032168 + }, + { + "epoch": 0.8725475948263334, + "grad_norm": 0.10174136608839035, + "learning_rate": 1.3140767671404995e-05, + "loss": 0.33587872982025146, + "memory(GiB)": 78.33, + "step": 4503, + "token_acc": 0.8989667675363268, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.8727413651116601, + "grad_norm": 0.09019061923027039, + "learning_rate": 1.3101447898894852e-05, + "loss": 0.2792420983314514, + "memory(GiB)": 78.33, + "step": 4504, + "token_acc": 0.9144691759092409, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.8729351353969869, + "grad_norm": 0.1039118766784668, + "learning_rate": 1.306218435429865e-05, + "loss": 0.35971078276634216, + "memory(GiB)": 78.33, + "step": 4505, + "token_acc": 0.8914640875954808, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.8731289056823136, + "grad_norm": 0.10616055876016617, + "learning_rate": 1.3022977053743005e-05, + "loss": 0.3402102291584015, + "memory(GiB)": 78.33, + "step": 4506, + "token_acc": 0.8962266862085811, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.8733226759676403, + "grad_norm": 0.09928639978170395, + "learning_rate": 1.298382601333139e-05, + "loss": 0.3082139194011688, + "memory(GiB)": 78.33, + "step": 4507, + "token_acc": 0.9083105179101072, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.8735164462529671, + "grad_norm": 0.09501931816339493, + "learning_rate": 1.294473124914422e-05, + "loss": 0.3152191638946533, + "memory(GiB)": 78.33, + "step": 4508, + "token_acc": 0.9062588283637671, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.8737102165382938, + "grad_norm": 0.09296244382858276, + "learning_rate": 1.2905692777238719e-05, + "loss": 0.27433520555496216, + "memory(GiB)": 78.33, + "step": 4509, + "token_acc": 0.9149881928045562, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.8739039868236206, + "grad_norm": 0.09587821364402771, + "learning_rate": 1.2866710613649062e-05, + "loss": 0.2978772521018982, + "memory(GiB)": 78.33, + "step": 4510, + "token_acc": 0.9090977199510526, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.8740977571089473, + "grad_norm": 0.10060916841030121, + "learning_rate": 1.282778477438629e-05, + "loss": 0.321035236120224, + "memory(GiB)": 78.33, + "step": 4511, + "token_acc": 0.9027070865260921, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.8742915273942741, + "grad_norm": 0.097939632833004, + "learning_rate": 1.2788915275438267e-05, + "loss": 0.2997584044933319, + "memory(GiB)": 78.33, + "step": 4512, + "token_acc": 0.9089030803906837, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.8744852976796008, + "grad_norm": 0.08998732268810272, + "learning_rate": 1.2750102132769735e-05, + "loss": 0.2946266531944275, + "memory(GiB)": 78.33, + "step": 4513, + "token_acc": 0.9110179997024842, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.8746790679649276, + "grad_norm": 0.10302318632602692, + "learning_rate": 1.2711345362322295e-05, + "loss": 0.3208523988723755, + "memory(GiB)": 78.33, + "step": 4514, + "token_acc": 0.9040156111839498, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.8748728382502543, + "grad_norm": 0.10049934685230255, + "learning_rate": 1.2672644980014445e-05, + "loss": 0.35380083322525024, + "memory(GiB)": 78.33, + "step": 4515, + "token_acc": 0.8943812514845214, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.875066608535581, + "grad_norm": 0.09158279001712799, + "learning_rate": 1.2634001001741373e-05, + "loss": 0.3017105460166931, + "memory(GiB)": 78.33, + "step": 4516, + "token_acc": 0.908971506687753, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.8752603788209078, + "grad_norm": 0.10271737724542618, + "learning_rate": 1.2595413443375297e-05, + "loss": 0.3498397767543793, + "memory(GiB)": 78.33, + "step": 4517, + "token_acc": 0.896111417539989, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.8754541491062345, + "grad_norm": 0.09800441563129425, + "learning_rate": 1.2556882320765122e-05, + "loss": 0.3219001591205597, + "memory(GiB)": 78.33, + "step": 4518, + "token_acc": 0.9034795042897998, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.8756479193915613, + "grad_norm": 0.10516981035470963, + "learning_rate": 1.2518407649736607e-05, + "loss": 0.3301500082015991, + "memory(GiB)": 78.33, + "step": 4519, + "token_acc": 0.8994242640075973, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.875841689676888, + "grad_norm": 0.10065259039402008, + "learning_rate": 1.2479989446092359e-05, + "loss": 0.3331465423107147, + "memory(GiB)": 78.33, + "step": 4520, + "token_acc": 0.8990757119011435, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.8760354599622148, + "grad_norm": 0.11081155389547348, + "learning_rate": 1.2441627725611708e-05, + "loss": 0.34419184923171997, + "memory(GiB)": 78.33, + "step": 4521, + "token_acc": 0.8977344241661422, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.8762292302475415, + "grad_norm": 0.0998803898692131, + "learning_rate": 1.240332250405095e-05, + "loss": 0.31978899240493774, + "memory(GiB)": 78.33, + "step": 4522, + "token_acc": 0.9027906976744186, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.8764230005328683, + "grad_norm": 0.10122760385274887, + "learning_rate": 1.236507379714295e-05, + "loss": 0.31909558176994324, + "memory(GiB)": 78.33, + "step": 4523, + "token_acc": 0.9035904628330996, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.876616770818195, + "grad_norm": 0.10056735575199127, + "learning_rate": 1.2326881620597556e-05, + "loss": 0.3240450620651245, + "memory(GiB)": 78.33, + "step": 4524, + "token_acc": 0.9028395198933096, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.8768105411035217, + "grad_norm": 0.08745722472667694, + "learning_rate": 1.2288745990101323e-05, + "loss": 0.3004869520664215, + "memory(GiB)": 78.33, + "step": 4525, + "token_acc": 0.907592263761578, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.8770043113888485, + "grad_norm": 0.10847844928503036, + "learning_rate": 1.2250666921317537e-05, + "loss": 0.34397241473197937, + "memory(GiB)": 78.33, + "step": 4526, + "token_acc": 0.8972039724980901, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.8771980816741752, + "grad_norm": 0.09692453593015671, + "learning_rate": 1.221264442988632e-05, + "loss": 0.33984488248825073, + "memory(GiB)": 78.33, + "step": 4527, + "token_acc": 0.9006118417883123, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.877391851959502, + "grad_norm": 0.09908100217580795, + "learning_rate": 1.2174678531424497e-05, + "loss": 0.31903359293937683, + "memory(GiB)": 78.33, + "step": 4528, + "token_acc": 0.9019696566409369, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.8775856222448287, + "grad_norm": 0.10452962666749954, + "learning_rate": 1.2136769241525762e-05, + "loss": 0.33924567699432373, + "memory(GiB)": 78.33, + "step": 4529, + "token_acc": 0.897684942468475, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.8777793925301555, + "grad_norm": 0.09859445691108704, + "learning_rate": 1.2098916575760376e-05, + "loss": 0.3426741659641266, + "memory(GiB)": 78.33, + "step": 4530, + "token_acc": 0.896549017555794, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.8779731628154822, + "grad_norm": 0.1066657230257988, + "learning_rate": 1.2061120549675518e-05, + "loss": 0.3669888377189636, + "memory(GiB)": 78.33, + "step": 4531, + "token_acc": 0.8908226988867771, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.878166933100809, + "grad_norm": 0.1010642871260643, + "learning_rate": 1.2023381178795022e-05, + "loss": 0.31678637862205505, + "memory(GiB)": 78.33, + "step": 4532, + "token_acc": 0.9021789270887037, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.8783607033861357, + "grad_norm": 0.10661202669143677, + "learning_rate": 1.1985698478619454e-05, + "loss": 0.3384348452091217, + "memory(GiB)": 78.33, + "step": 4533, + "token_acc": 0.899502755811167, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.8785544736714624, + "grad_norm": 0.09901930391788483, + "learning_rate": 1.1948072464626101e-05, + "loss": 0.3115423619747162, + "memory(GiB)": 78.33, + "step": 4534, + "token_acc": 0.9044517138387143, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.8787482439567892, + "grad_norm": 0.09338109940290451, + "learning_rate": 1.1910503152268952e-05, + "loss": 0.2875381410121918, + "memory(GiB)": 78.33, + "step": 4535, + "token_acc": 0.9132529457108248, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.8789420142421159, + "grad_norm": 0.10732939094305038, + "learning_rate": 1.187299055697883e-05, + "loss": 0.3420470058917999, + "memory(GiB)": 78.33, + "step": 4536, + "token_acc": 0.8977093546629971, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.8791357845274427, + "grad_norm": 0.10017619282007217, + "learning_rate": 1.1835534694163057e-05, + "loss": 0.3340654969215393, + "memory(GiB)": 78.33, + "step": 4537, + "token_acc": 0.8995184135977337, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.8793295548127694, + "grad_norm": 0.09547953307628632, + "learning_rate": 1.1798135579205831e-05, + "loss": 0.3285997807979584, + "memory(GiB)": 78.33, + "step": 4538, + "token_acc": 0.9024012202922796, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.8795233250980962, + "grad_norm": 0.09887305647134781, + "learning_rate": 1.1760793227467947e-05, + "loss": 0.34439530968666077, + "memory(GiB)": 78.33, + "step": 4539, + "token_acc": 0.8960512218764348, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.8797170953834229, + "grad_norm": 0.10936824232339859, + "learning_rate": 1.1723507654286885e-05, + "loss": 0.3436868488788605, + "memory(GiB)": 78.33, + "step": 4540, + "token_acc": 0.8984078161751402, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.8799108656687497, + "grad_norm": 0.09683515876531601, + "learning_rate": 1.1686278874976912e-05, + "loss": 0.32740306854248047, + "memory(GiB)": 78.33, + "step": 4541, + "token_acc": 0.9006859993763642, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.8801046359540764, + "grad_norm": 0.10903044044971466, + "learning_rate": 1.1649106904828798e-05, + "loss": 0.3218260705471039, + "memory(GiB)": 78.33, + "step": 4542, + "token_acc": 0.9046106833637916, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.8802984062394031, + "grad_norm": 0.09545977413654327, + "learning_rate": 1.1611991759110128e-05, + "loss": 0.31498417258262634, + "memory(GiB)": 78.33, + "step": 4543, + "token_acc": 0.904716222873292, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.8804921765247299, + "grad_norm": 0.09254451096057892, + "learning_rate": 1.1574933453065078e-05, + "loss": 0.2897379696369171, + "memory(GiB)": 78.33, + "step": 4544, + "token_acc": 0.9108022299442514, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.8806859468100566, + "grad_norm": 0.10882263630628586, + "learning_rate": 1.1537932001914485e-05, + "loss": 0.3277333974838257, + "memory(GiB)": 78.33, + "step": 4545, + "token_acc": 0.9019514213860503, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.8808797170953834, + "grad_norm": 0.0869324579834938, + "learning_rate": 1.1500987420855845e-05, + "loss": 0.2924439609050751, + "memory(GiB)": 78.33, + "step": 4546, + "token_acc": 0.9105908898603864, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.8810734873807101, + "grad_norm": 0.09957166761159897, + "learning_rate": 1.1464099725063237e-05, + "loss": 0.3209323287010193, + "memory(GiB)": 78.33, + "step": 4547, + "token_acc": 0.9023074369189907, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.8812672576660369, + "grad_norm": 0.11669695377349854, + "learning_rate": 1.1427268929687555e-05, + "loss": 0.3712838888168335, + "memory(GiB)": 78.33, + "step": 4548, + "token_acc": 0.8882153971045866, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.8814610279513636, + "grad_norm": 0.0909273773431778, + "learning_rate": 1.1390495049856036e-05, + "loss": 0.31092870235443115, + "memory(GiB)": 78.33, + "step": 4549, + "token_acc": 0.9068354688855991, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.8816547982366904, + "grad_norm": 0.08977729827165604, + "learning_rate": 1.1353778100672828e-05, + "loss": 0.29384180903434753, + "memory(GiB)": 78.33, + "step": 4550, + "token_acc": 0.9103132495094121, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.8818485685220171, + "grad_norm": 0.09765194356441498, + "learning_rate": 1.131711809721852e-05, + "loss": 0.31177225708961487, + "memory(GiB)": 78.33, + "step": 4551, + "token_acc": 0.9054578904333606, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.8820423388073438, + "grad_norm": 0.11460500210523605, + "learning_rate": 1.1280515054550366e-05, + "loss": 0.3225688934326172, + "memory(GiB)": 78.33, + "step": 4552, + "token_acc": 0.9011132164850781, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.8822361090926706, + "grad_norm": 0.09452386945486069, + "learning_rate": 1.1243968987702206e-05, + "loss": 0.29615986347198486, + "memory(GiB)": 78.33, + "step": 4553, + "token_acc": 0.9087365694797036, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.8824298793779974, + "grad_norm": 0.09222650527954102, + "learning_rate": 1.1207479911684487e-05, + "loss": 0.2946526110172272, + "memory(GiB)": 78.33, + "step": 4554, + "token_acc": 0.9109519196451333, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.8826236496633242, + "grad_norm": 0.10203875601291656, + "learning_rate": 1.11710478414843e-05, + "loss": 0.34442439675331116, + "memory(GiB)": 78.33, + "step": 4555, + "token_acc": 0.8982755761896866, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.8828174199486509, + "grad_norm": 0.10598360747098923, + "learning_rate": 1.1134672792065209e-05, + "loss": 0.329555869102478, + "memory(GiB)": 78.33, + "step": 4556, + "token_acc": 0.9016222021252543, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.8830111902339777, + "grad_norm": 0.10227588564157486, + "learning_rate": 1.109835477836748e-05, + "loss": 0.33669596910476685, + "memory(GiB)": 78.33, + "step": 4557, + "token_acc": 0.8991185669604776, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.8832049605193044, + "grad_norm": 0.09594012796878815, + "learning_rate": 1.1062093815307865e-05, + "loss": 0.3027840554714203, + "memory(GiB)": 78.33, + "step": 4558, + "token_acc": 0.9077541203159593, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.8833987308046312, + "grad_norm": 0.11179719865322113, + "learning_rate": 1.1025889917779735e-05, + "loss": 0.3367825746536255, + "memory(GiB)": 78.33, + "step": 4559, + "token_acc": 0.9006670902160102, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.8835925010899579, + "grad_norm": 0.09386321157217026, + "learning_rate": 1.0989743100653008e-05, + "loss": 0.3008558750152588, + "memory(GiB)": 78.33, + "step": 4560, + "token_acc": 0.908220500378284, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.8837862713752846, + "grad_norm": 0.08674740046262741, + "learning_rate": 1.0953653378774097e-05, + "loss": 0.27751424908638, + "memory(GiB)": 78.33, + "step": 4561, + "token_acc": 0.9156325156325157, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.8839800416606114, + "grad_norm": 0.10045383870601654, + "learning_rate": 1.0917620766966123e-05, + "loss": 0.30626630783081055, + "memory(GiB)": 78.33, + "step": 4562, + "token_acc": 0.9081538638719336, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.8841738119459381, + "grad_norm": 0.10451143980026245, + "learning_rate": 1.0881645280028534e-05, + "loss": 0.3387284576892853, + "memory(GiB)": 78.33, + "step": 4563, + "token_acc": 0.8987345454545455, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.8843675822312649, + "grad_norm": 0.09835859388113022, + "learning_rate": 1.0845726932737509e-05, + "loss": 0.33612358570098877, + "memory(GiB)": 78.33, + "step": 4564, + "token_acc": 0.8987608426270136, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.8845613525165916, + "grad_norm": 0.10052934288978577, + "learning_rate": 1.0809865739845646e-05, + "loss": 0.31725549697875977, + "memory(GiB)": 78.33, + "step": 4565, + "token_acc": 0.9065557583555249, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.8847551228019184, + "grad_norm": 0.10602091997861862, + "learning_rate": 1.0774061716082117e-05, + "loss": 0.34389835596084595, + "memory(GiB)": 78.33, + "step": 4566, + "token_acc": 0.8980959097320169, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.8849488930872451, + "grad_norm": 0.09629754722118378, + "learning_rate": 1.0738314876152587e-05, + "loss": 0.29018452763557434, + "memory(GiB)": 78.33, + "step": 4567, + "token_acc": 0.9128355904015205, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.8851426633725719, + "grad_norm": 0.11927231401205063, + "learning_rate": 1.0702625234739215e-05, + "loss": 0.3924431800842285, + "memory(GiB)": 78.33, + "step": 4568, + "token_acc": 0.8830037082818294, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.8853364336578986, + "grad_norm": 0.10137398540973663, + "learning_rate": 1.0666992806500774e-05, + "loss": 0.3232523202896118, + "memory(GiB)": 78.33, + "step": 4569, + "token_acc": 0.9024434010228191, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.8855302039432253, + "grad_norm": 0.09592381864786148, + "learning_rate": 1.0631417606072356e-05, + "loss": 0.31686073541641235, + "memory(GiB)": 78.33, + "step": 4570, + "token_acc": 0.9042706275456697, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.8857239742285521, + "grad_norm": 0.10683749616146088, + "learning_rate": 1.0595899648065742e-05, + "loss": 0.33387985825538635, + "memory(GiB)": 78.33, + "step": 4571, + "token_acc": 0.8986985376504726, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.8859177445138788, + "grad_norm": 0.10224435478448868, + "learning_rate": 1.0560438947069077e-05, + "loss": 0.32763242721557617, + "memory(GiB)": 78.33, + "step": 4572, + "token_acc": 0.9004544285346824, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.8861115147992056, + "grad_norm": 0.1058693677186966, + "learning_rate": 1.0525035517647012e-05, + "loss": 0.33532196283340454, + "memory(GiB)": 78.33, + "step": 4573, + "token_acc": 0.899757553151809, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.8863052850845323, + "grad_norm": 0.10156890749931335, + "learning_rate": 1.0489689374340699e-05, + "loss": 0.32516196370124817, + "memory(GiB)": 78.33, + "step": 4574, + "token_acc": 0.9034920634920635, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.8864990553698591, + "grad_norm": 0.09546992182731628, + "learning_rate": 1.0454400531667723e-05, + "loss": 0.32520344853401184, + "memory(GiB)": 78.33, + "step": 4575, + "token_acc": 0.9030310559006212, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.8866928256551858, + "grad_norm": 0.0944749042391777, + "learning_rate": 1.0419169004122208e-05, + "loss": 0.308064341545105, + "memory(GiB)": 78.33, + "step": 4576, + "token_acc": 0.9058090431558348, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.8868865959405126, + "grad_norm": 0.10865960270166397, + "learning_rate": 1.0383994806174678e-05, + "loss": 0.34614354372024536, + "memory(GiB)": 78.33, + "step": 4577, + "token_acc": 0.8971537001897533, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.8870803662258393, + "grad_norm": 0.11621136218309402, + "learning_rate": 1.0348877952272094e-05, + "loss": 0.3003866672515869, + "memory(GiB)": 78.33, + "step": 4578, + "token_acc": 0.908298606977021, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.887274136511166, + "grad_norm": 0.09108857810497284, + "learning_rate": 1.0313818456837918e-05, + "loss": 0.29279422760009766, + "memory(GiB)": 78.33, + "step": 4579, + "token_acc": 0.9107714628777266, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.8874679067964928, + "grad_norm": 0.1540272831916809, + "learning_rate": 1.0278816334271984e-05, + "loss": 0.3414173722267151, + "memory(GiB)": 78.33, + "step": 4580, + "token_acc": 0.8987891249714416, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.8876616770818195, + "grad_norm": 0.10262391716241837, + "learning_rate": 1.024387159895067e-05, + "loss": 0.35209545493125916, + "memory(GiB)": 78.33, + "step": 4581, + "token_acc": 0.894268269722155, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.8878554473671463, + "grad_norm": 0.10410673916339874, + "learning_rate": 1.0208984265226649e-05, + "loss": 0.3540557324886322, + "memory(GiB)": 78.33, + "step": 4582, + "token_acc": 0.896634891466636, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.888049217652473, + "grad_norm": 0.09851502627134323, + "learning_rate": 1.0174154347429141e-05, + "loss": 0.3153938055038452, + "memory(GiB)": 78.33, + "step": 4583, + "token_acc": 0.904395755821965, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.8882429879377998, + "grad_norm": 0.09433472901582718, + "learning_rate": 1.013938185986372e-05, + "loss": 0.30990996956825256, + "memory(GiB)": 78.33, + "step": 4584, + "token_acc": 0.9047421073145381, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.8884367582231265, + "grad_norm": 0.10146637260913849, + "learning_rate": 1.0104666816812362e-05, + "loss": 0.33719319105148315, + "memory(GiB)": 78.33, + "step": 4585, + "token_acc": 0.89920724801812, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.8886305285084533, + "grad_norm": 0.11201319843530655, + "learning_rate": 1.0070009232533476e-05, + "loss": 0.3517708480358124, + "memory(GiB)": 78.33, + "step": 4586, + "token_acc": 0.8955111278762731, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.88882429879378, + "grad_norm": 0.09309983253479004, + "learning_rate": 1.0035409121261828e-05, + "loss": 0.2867148518562317, + "memory(GiB)": 78.33, + "step": 4587, + "token_acc": 0.912418334951329, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.8890180690791067, + "grad_norm": 0.09345608949661255, + "learning_rate": 1.0000866497208714e-05, + "loss": 0.310814768075943, + "memory(GiB)": 78.33, + "step": 4588, + "token_acc": 0.9047558526727841, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.8892118393644335, + "grad_norm": 0.1012081727385521, + "learning_rate": 9.96638137456159e-06, + "loss": 0.3262358605861664, + "memory(GiB)": 78.33, + "step": 4589, + "token_acc": 0.9022921972678861, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.8894056096497602, + "grad_norm": 0.10403067618608475, + "learning_rate": 9.931953767484518e-06, + "loss": 0.3487986922264099, + "memory(GiB)": 78.33, + "step": 4590, + "token_acc": 0.8952626667398583, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.889599379935087, + "grad_norm": 0.11049600690603256, + "learning_rate": 9.89758369011781e-06, + "loss": 0.3473038971424103, + "memory(GiB)": 78.33, + "step": 4591, + "token_acc": 0.8961088918479065, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.8897931502204137, + "grad_norm": 0.08729095757007599, + "learning_rate": 9.863271156578174e-06, + "loss": 0.28431570529937744, + "memory(GiB)": 78.33, + "step": 4592, + "token_acc": 0.9126174143741486, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.8899869205057405, + "grad_norm": 0.09780146926641464, + "learning_rate": 9.829016180958681e-06, + "loss": 0.3170766234397888, + "memory(GiB)": 78.33, + "step": 4593, + "token_acc": 0.9040569020021075, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.8901806907910672, + "grad_norm": 0.11150769889354706, + "learning_rate": 9.794818777328767e-06, + "loss": 0.29845190048217773, + "memory(GiB)": 78.33, + "step": 4594, + "token_acc": 0.909273330479452, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.890374461076394, + "grad_norm": 0.10244564712047577, + "learning_rate": 9.760678959734292e-06, + "loss": 0.34681934118270874, + "memory(GiB)": 78.33, + "step": 4595, + "token_acc": 0.8985476177715848, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.8905682313617207, + "grad_norm": 0.10472705215215683, + "learning_rate": 9.726596742197307e-06, + "loss": 0.3317742645740509, + "memory(GiB)": 78.33, + "step": 4596, + "token_acc": 0.8990034710558727, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.8907620016470474, + "grad_norm": 0.0967646911740303, + "learning_rate": 9.692572138716347e-06, + "loss": 0.34411588311195374, + "memory(GiB)": 78.33, + "step": 4597, + "token_acc": 0.8969721095713183, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.8909557719323742, + "grad_norm": 0.10965237766504288, + "learning_rate": 9.658605163266203e-06, + "loss": 0.3249712884426117, + "memory(GiB)": 78.33, + "step": 4598, + "token_acc": 0.9018522218099831, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.8911495422177009, + "grad_norm": 0.10542988777160645, + "learning_rate": 9.624695829798045e-06, + "loss": 0.33859795331954956, + "memory(GiB)": 78.33, + "step": 4599, + "token_acc": 0.898973819608289, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.8913433125030277, + "grad_norm": 0.09840527176856995, + "learning_rate": 9.590844152239353e-06, + "loss": 0.3379114270210266, + "memory(GiB)": 78.33, + "step": 4600, + "token_acc": 0.8994015604878418, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.8915370827883544, + "grad_norm": 0.0945395678281784, + "learning_rate": 9.557050144493884e-06, + "loss": 0.30370837450027466, + "memory(GiB)": 78.33, + "step": 4601, + "token_acc": 0.9077565971057858, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.8917308530736812, + "grad_norm": 0.10549890249967575, + "learning_rate": 9.523313820441803e-06, + "loss": 0.3490893840789795, + "memory(GiB)": 78.33, + "step": 4602, + "token_acc": 0.897390101976148, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.8919246233590079, + "grad_norm": 0.10837141424417496, + "learning_rate": 9.489635193939444e-06, + "loss": 0.36473971605300903, + "memory(GiB)": 78.33, + "step": 4603, + "token_acc": 0.8911379629370041, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.8921183936443347, + "grad_norm": 0.09465762972831726, + "learning_rate": 9.456014278819606e-06, + "loss": 0.3148060142993927, + "memory(GiB)": 78.33, + "step": 4604, + "token_acc": 0.9028953344343518, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.8923121639296614, + "grad_norm": 0.09781907498836517, + "learning_rate": 9.422451088891264e-06, + "loss": 0.3031347990036011, + "memory(GiB)": 78.33, + "step": 4605, + "token_acc": 0.908634026587614, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.8925059342149881, + "grad_norm": 0.09446154534816742, + "learning_rate": 9.38894563793972e-06, + "loss": 0.3180665373802185, + "memory(GiB)": 78.33, + "step": 4606, + "token_acc": 0.905185446482564, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.8926997045003149, + "grad_norm": 0.09713321179151535, + "learning_rate": 9.355497939726569e-06, + "loss": 0.31975382566452026, + "memory(GiB)": 78.33, + "step": 4607, + "token_acc": 0.904540804281105, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.8928934747856416, + "grad_norm": 0.0889185443520546, + "learning_rate": 9.322108007989654e-06, + "loss": 0.28848981857299805, + "memory(GiB)": 78.33, + "step": 4608, + "token_acc": 0.9133419560242761, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.8930872450709684, + "grad_norm": 0.10388769209384918, + "learning_rate": 9.288775856443187e-06, + "loss": 0.3318116068840027, + "memory(GiB)": 78.33, + "step": 4609, + "token_acc": 0.9007829299222584, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.8932810153562951, + "grad_norm": 0.10328590124845505, + "learning_rate": 9.255501498777485e-06, + "loss": 0.35095512866973877, + "memory(GiB)": 78.33, + "step": 4610, + "token_acc": 0.8963482109922538, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.8934747856416219, + "grad_norm": 0.09728133678436279, + "learning_rate": 9.222284948659297e-06, + "loss": 0.32896798849105835, + "memory(GiB)": 78.33, + "step": 4611, + "token_acc": 0.9007275166400083, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.8936685559269486, + "grad_norm": 0.09340497106313705, + "learning_rate": 9.189126219731513e-06, + "loss": 0.3199424147605896, + "memory(GiB)": 78.33, + "step": 4612, + "token_acc": 0.9017920064431693, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.8938623262122753, + "grad_norm": 0.12456195801496506, + "learning_rate": 9.156025325613319e-06, + "loss": 0.38881370425224304, + "memory(GiB)": 78.33, + "step": 4613, + "token_acc": 0.8865035516969219, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.8940560964976021, + "grad_norm": 0.10831289738416672, + "learning_rate": 9.122982279900192e-06, + "loss": 0.31916946172714233, + "memory(GiB)": 78.33, + "step": 4614, + "token_acc": 0.9046099809746817, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.8942498667829288, + "grad_norm": 0.09680543094873428, + "learning_rate": 9.089997096163692e-06, + "loss": 0.31975850462913513, + "memory(GiB)": 78.33, + "step": 4615, + "token_acc": 0.9020113186492016, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.8944436370682556, + "grad_norm": 0.09476674348115921, + "learning_rate": 9.057069787951832e-06, + "loss": 0.3149741291999817, + "memory(GiB)": 78.33, + "step": 4616, + "token_acc": 0.9046471784987686, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.8946374073535823, + "grad_norm": 0.09538931399583817, + "learning_rate": 9.024200368788676e-06, + "loss": 0.298635333776474, + "memory(GiB)": 78.33, + "step": 4617, + "token_acc": 0.9104565695584527, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.8948311776389091, + "grad_norm": 0.09865786135196686, + "learning_rate": 8.991388852174592e-06, + "loss": 0.3136424124240875, + "memory(GiB)": 78.33, + "step": 4618, + "token_acc": 0.9061650538218985, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.8950249479242358, + "grad_norm": 0.09958308935165405, + "learning_rate": 8.958635251586166e-06, + "loss": 0.32274237275123596, + "memory(GiB)": 78.33, + "step": 4619, + "token_acc": 0.9019686621132985, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.8952187182095626, + "grad_norm": 0.1041649580001831, + "learning_rate": 8.925939580476138e-06, + "loss": 0.33783018589019775, + "memory(GiB)": 78.33, + "step": 4620, + "token_acc": 0.8964275424949583, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.8954124884948893, + "grad_norm": 0.09779240190982819, + "learning_rate": 8.893301852273582e-06, + "loss": 0.3440534174442291, + "memory(GiB)": 78.33, + "step": 4621, + "token_acc": 0.8966321509124462, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.895606258780216, + "grad_norm": 0.11845773458480835, + "learning_rate": 8.860722080383593e-06, + "loss": 0.35379868745803833, + "memory(GiB)": 78.33, + "step": 4622, + "token_acc": 0.8932370241687239, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.8958000290655428, + "grad_norm": 0.0978410616517067, + "learning_rate": 8.82820027818763e-06, + "loss": 0.3412013053894043, + "memory(GiB)": 78.33, + "step": 4623, + "token_acc": 0.9002301375594958, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.8959937993508695, + "grad_norm": 0.09728839993476868, + "learning_rate": 8.795736459043246e-06, + "loss": 0.3061705529689789, + "memory(GiB)": 78.33, + "step": 4624, + "token_acc": 0.9070262390670554, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.8961875696361963, + "grad_norm": 0.09861348569393158, + "learning_rate": 8.763330636284204e-06, + "loss": 0.3142586350440979, + "memory(GiB)": 78.33, + "step": 4625, + "token_acc": 0.9079194559839123, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.896381339921523, + "grad_norm": 0.09855727106332779, + "learning_rate": 8.730982823220445e-06, + "loss": 0.32500770688056946, + "memory(GiB)": 78.33, + "step": 4626, + "token_acc": 0.9023219335979276, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.8965751102068498, + "grad_norm": 0.10426346212625504, + "learning_rate": 8.698693033138054e-06, + "loss": 0.32699429988861084, + "memory(GiB)": 78.33, + "step": 4627, + "token_acc": 0.9018576936558009, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.8967688804921765, + "grad_norm": 0.09685856848955154, + "learning_rate": 8.666461279299408e-06, + "loss": 0.3376917243003845, + "memory(GiB)": 78.33, + "step": 4628, + "token_acc": 0.8998025786638719, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.8969626507775033, + "grad_norm": 0.10653463006019592, + "learning_rate": 8.634287574942833e-06, + "loss": 0.3381001055240631, + "memory(GiB)": 78.33, + "step": 4629, + "token_acc": 0.9000566335807326, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.89715642106283, + "grad_norm": 0.1089102178812027, + "learning_rate": 8.602171933283025e-06, + "loss": 0.3288399279117584, + "memory(GiB)": 78.33, + "step": 4630, + "token_acc": 0.9028040685495066, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.8973501913481567, + "grad_norm": 0.10240910202264786, + "learning_rate": 8.570114367510717e-06, + "loss": 0.33557647466659546, + "memory(GiB)": 78.33, + "step": 4631, + "token_acc": 0.897394733136178, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.8975439616334835, + "grad_norm": 0.10204704850912094, + "learning_rate": 8.53811489079279e-06, + "loss": 0.3257525861263275, + "memory(GiB)": 78.33, + "step": 4632, + "token_acc": 0.9036119817088836, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.8977377319188102, + "grad_norm": 0.10609758645296097, + "learning_rate": 8.506173516272319e-06, + "loss": 0.362753689289093, + "memory(GiB)": 78.33, + "step": 4633, + "token_acc": 0.8909109816971714, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.897931502204137, + "grad_norm": 0.09235696494579315, + "learning_rate": 8.474290257068456e-06, + "loss": 0.3118291199207306, + "memory(GiB)": 78.33, + "step": 4634, + "token_acc": 0.9042984381050729, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.8981252724894637, + "grad_norm": 0.10148902982473373, + "learning_rate": 8.44246512627656e-06, + "loss": 0.3247118592262268, + "memory(GiB)": 78.33, + "step": 4635, + "token_acc": 0.9009333957432749, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.8983190427747905, + "grad_norm": 0.10464513301849365, + "learning_rate": 8.41069813696799e-06, + "loss": 0.3417477309703827, + "memory(GiB)": 78.33, + "step": 4636, + "token_acc": 0.898022001552421, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.8985128130601172, + "grad_norm": 0.0930318832397461, + "learning_rate": 8.37898930219038e-06, + "loss": 0.3098987638950348, + "memory(GiB)": 78.33, + "step": 4637, + "token_acc": 0.9070895522388059, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.898706583345444, + "grad_norm": 0.09498867392539978, + "learning_rate": 8.34733863496736e-06, + "loss": 0.3298591375350952, + "memory(GiB)": 78.33, + "step": 4638, + "token_acc": 0.9012791647079199, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.8989003536307707, + "grad_norm": 0.09717161953449249, + "learning_rate": 8.315746148298713e-06, + "loss": 0.3042880892753601, + "memory(GiB)": 78.33, + "step": 4639, + "token_acc": 0.9077266387726639, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.8990941239160974, + "grad_norm": 0.09742318093776703, + "learning_rate": 8.284211855160328e-06, + "loss": 0.32740819454193115, + "memory(GiB)": 78.33, + "step": 4640, + "token_acc": 0.9031217784356744, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.8992878942014242, + "grad_norm": 0.10585065186023712, + "learning_rate": 8.252735768504176e-06, + "loss": 0.32093313336372375, + "memory(GiB)": 78.33, + "step": 4641, + "token_acc": 0.9033283470456245, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.8994816644867509, + "grad_norm": 0.10464109480381012, + "learning_rate": 8.221317901258367e-06, + "loss": 0.3451445698738098, + "memory(GiB)": 78.33, + "step": 4642, + "token_acc": 0.8969477704080168, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.8996754347720777, + "grad_norm": 0.09843490272760391, + "learning_rate": 8.189958266326996e-06, + "loss": 0.3220871090888977, + "memory(GiB)": 78.33, + "step": 4643, + "token_acc": 0.9026759937819232, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.8998692050574044, + "grad_norm": 0.10574258118867874, + "learning_rate": 8.158656876590375e-06, + "loss": 0.36457881331443787, + "memory(GiB)": 78.33, + "step": 4644, + "token_acc": 0.8903435022820082, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.9000629753427312, + "grad_norm": 0.0989030972123146, + "learning_rate": 8.127413744904804e-06, + "loss": 0.3353177309036255, + "memory(GiB)": 78.33, + "step": 4645, + "token_acc": 0.8995719135954338, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.9002567456280579, + "grad_norm": 0.10432767868041992, + "learning_rate": 8.096228884102652e-06, + "loss": 0.32852283120155334, + "memory(GiB)": 78.33, + "step": 4646, + "token_acc": 0.9020051081818983, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.9004505159133847, + "grad_norm": 0.09884372353553772, + "learning_rate": 8.065102306992439e-06, + "loss": 0.34128913283348083, + "memory(GiB)": 78.33, + "step": 4647, + "token_acc": 0.8975473369505038, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.9006442861987114, + "grad_norm": 0.09996260702610016, + "learning_rate": 8.034034026358587e-06, + "loss": 0.32066965103149414, + "memory(GiB)": 78.33, + "step": 4648, + "token_acc": 0.9021558379855089, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.9008380564840381, + "grad_norm": 0.10087880492210388, + "learning_rate": 8.003024054961776e-06, + "loss": 0.34059831500053406, + "memory(GiB)": 78.33, + "step": 4649, + "token_acc": 0.89795299887043, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.9010318267693649, + "grad_norm": 0.10195588320493698, + "learning_rate": 7.972072405538582e-06, + "loss": 0.3243820369243622, + "memory(GiB)": 78.33, + "step": 4650, + "token_acc": 0.9031757307864341, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.9012255970546916, + "grad_norm": 0.11210127919912338, + "learning_rate": 7.941179090801687e-06, + "loss": 0.33561965823173523, + "memory(GiB)": 78.33, + "step": 4651, + "token_acc": 0.8998974967630556, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.9014193673400184, + "grad_norm": 0.09735889732837677, + "learning_rate": 7.91034412343982e-06, + "loss": 0.3177420496940613, + "memory(GiB)": 78.33, + "step": 4652, + "token_acc": 0.9025480310669028, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 0.9016131376253451, + "grad_norm": 0.1058276817202568, + "learning_rate": 7.879567516117691e-06, + "loss": 0.34389030933380127, + "memory(GiB)": 78.33, + "step": 4653, + "token_acc": 0.9005196304849884, + "train_speed(iter/s)": 0.032259 + }, + { + "epoch": 0.9018069079106719, + "grad_norm": 0.10890252888202667, + "learning_rate": 7.848849281476149e-06, + "loss": 0.3455137014389038, + "memory(GiB)": 78.33, + "step": 4654, + "token_acc": 0.89581208468552, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.9020006781959986, + "grad_norm": 0.09760237485170364, + "learning_rate": 7.818189432131921e-06, + "loss": 0.3121528923511505, + "memory(GiB)": 78.33, + "step": 4655, + "token_acc": 0.9046669988374024, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 0.9021944484813254, + "grad_norm": 0.0973569005727768, + "learning_rate": 7.787587980677868e-06, + "loss": 0.3131091594696045, + "memory(GiB)": 78.33, + "step": 4656, + "token_acc": 0.9063661202185792, + "train_speed(iter/s)": 0.032261 + }, + { + "epoch": 0.9023882187666521, + "grad_norm": 0.09090343117713928, + "learning_rate": 7.75704493968285e-06, + "loss": 0.29967737197875977, + "memory(GiB)": 78.33, + "step": 4657, + "token_acc": 0.9070998676422846, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.9025819890519788, + "grad_norm": 0.10325030982494354, + "learning_rate": 7.726560321691682e-06, + "loss": 0.3381047248840332, + "memory(GiB)": 78.33, + "step": 4658, + "token_acc": 0.8999558238845531, + "train_speed(iter/s)": 0.032262 + }, + { + "epoch": 0.9027757593373056, + "grad_norm": 0.10501549392938614, + "learning_rate": 7.696134139225219e-06, + "loss": 0.33340954780578613, + "memory(GiB)": 78.33, + "step": 4659, + "token_acc": 0.8982804760006833, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.9029695296226323, + "grad_norm": 0.098429374396801, + "learning_rate": 7.66576640478031e-06, + "loss": 0.30632802844047546, + "memory(GiB)": 78.33, + "step": 4660, + "token_acc": 0.9067490984028851, + "train_speed(iter/s)": 0.032263 + }, + { + "epoch": 0.9031632999079591, + "grad_norm": 0.11389485746622086, + "learning_rate": 7.635457130829832e-06, + "loss": 0.34137189388275146, + "memory(GiB)": 78.33, + "step": 4661, + "token_acc": 0.8986180210060807, + "train_speed(iter/s)": 0.032264 + }, + { + "epoch": 0.9033570701932858, + "grad_norm": 0.09606049209833145, + "learning_rate": 7.6052063298225715e-06, + "loss": 0.29577910900115967, + "memory(GiB)": 78.33, + "step": 4662, + "token_acc": 0.9094975613979395, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.9035508404786126, + "grad_norm": 0.10640320926904678, + "learning_rate": 7.575014014183378e-06, + "loss": 0.30323442816734314, + "memory(GiB)": 78.33, + "step": 4663, + "token_acc": 0.9066974069962713, + "train_speed(iter/s)": 0.032265 + }, + { + "epoch": 0.9037446107639393, + "grad_norm": 0.10014378279447556, + "learning_rate": 7.5448801963130305e-06, + "loss": 0.337171733379364, + "memory(GiB)": 78.33, + "step": 4664, + "token_acc": 0.8995919717688575, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.903938381049266, + "grad_norm": 0.10196644067764282, + "learning_rate": 7.5148048885883105e-06, + "loss": 0.31075039505958557, + "memory(GiB)": 78.33, + "step": 4665, + "token_acc": 0.9083940206975852, + "train_speed(iter/s)": 0.032266 + }, + { + "epoch": 0.9041321513345928, + "grad_norm": 0.09376231580972672, + "learning_rate": 7.484788103361955e-06, + "loss": 0.3261268436908722, + "memory(GiB)": 78.33, + "step": 4666, + "token_acc": 0.9021167228378149, + "train_speed(iter/s)": 0.032267 + }, + { + "epoch": 0.9043259216199195, + "grad_norm": 0.09824056923389435, + "learning_rate": 7.454829852962635e-06, + "loss": 0.32310348749160767, + "memory(GiB)": 78.33, + "step": 4667, + "token_acc": 0.9040854978354979, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.9045196919052463, + "grad_norm": 0.09825246036052704, + "learning_rate": 7.424930149695074e-06, + "loss": 0.32509177923202515, + "memory(GiB)": 78.33, + "step": 4668, + "token_acc": 0.9037440744840898, + "train_speed(iter/s)": 0.032268 + }, + { + "epoch": 0.904713462190573, + "grad_norm": 0.11145921051502228, + "learning_rate": 7.395089005839783e-06, + "loss": 0.32568415999412537, + "memory(GiB)": 78.33, + "step": 4669, + "token_acc": 0.9009174311926605, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.9049072324758998, + "grad_norm": 0.10170602053403854, + "learning_rate": 7.365306433653423e-06, + "loss": 0.3407144844532013, + "memory(GiB)": 78.33, + "step": 4670, + "token_acc": 0.8984231756508985, + "train_speed(iter/s)": 0.032269 + }, + { + "epoch": 0.9051010027612265, + "grad_norm": 0.1278442144393921, + "learning_rate": 7.335582445368443e-06, + "loss": 0.3306085467338562, + "memory(GiB)": 78.33, + "step": 4671, + "token_acc": 0.8980235373613379, + "train_speed(iter/s)": 0.03227 + }, + { + "epoch": 0.9052947730465533, + "grad_norm": 0.0913781076669693, + "learning_rate": 7.305917053193294e-06, + "loss": 0.28779953718185425, + "memory(GiB)": 78.33, + "step": 4672, + "token_acc": 0.9126665944305992, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.90548854333188, + "grad_norm": 0.08933483064174652, + "learning_rate": 7.276310269312347e-06, + "loss": 0.3098442852497101, + "memory(GiB)": 78.33, + "step": 4673, + "token_acc": 0.9050055824339412, + "train_speed(iter/s)": 0.032271 + }, + { + "epoch": 0.9056823136172067, + "grad_norm": 0.09485527127981186, + "learning_rate": 7.24676210588589e-06, + "loss": 0.3187693655490875, + "memory(GiB)": 78.33, + "step": 4674, + "token_acc": 0.9054063940302876, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.9058760839025335, + "grad_norm": 0.10101988166570663, + "learning_rate": 7.217272575050198e-06, + "loss": 0.3332398235797882, + "memory(GiB)": 78.33, + "step": 4675, + "token_acc": 0.9006496386548044, + "train_speed(iter/s)": 0.032272 + }, + { + "epoch": 0.9060698541878603, + "grad_norm": 0.1058175191283226, + "learning_rate": 7.187841688917351e-06, + "loss": 0.30771443247795105, + "memory(GiB)": 78.33, + "step": 4676, + "token_acc": 0.9090541632983024, + "train_speed(iter/s)": 0.032273 + }, + { + "epoch": 0.9062636244731871, + "grad_norm": 0.10288235545158386, + "learning_rate": 7.158469459575444e-06, + "loss": 0.31234169006347656, + "memory(GiB)": 78.33, + "step": 4677, + "token_acc": 0.9043364814657251, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.9064573947585138, + "grad_norm": 0.11373648047447205, + "learning_rate": 7.129155899088429e-06, + "loss": 0.36738601326942444, + "memory(GiB)": 78.33, + "step": 4678, + "token_acc": 0.8934572225850501, + "train_speed(iter/s)": 0.032274 + }, + { + "epoch": 0.9066511650438406, + "grad_norm": 0.09895353019237518, + "learning_rate": 7.099901019496157e-06, + "loss": 0.32815736532211304, + "memory(GiB)": 78.33, + "step": 4679, + "token_acc": 0.9006158244900203, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.9068449353291673, + "grad_norm": 0.10247547179460526, + "learning_rate": 7.070704832814467e-06, + "loss": 0.32758232951164246, + "memory(GiB)": 78.33, + "step": 4680, + "token_acc": 0.9017008208823009, + "train_speed(iter/s)": 0.032275 + }, + { + "epoch": 0.9070387056144941, + "grad_norm": 0.09913128614425659, + "learning_rate": 7.041567351034899e-06, + "loss": 0.31721755862236023, + "memory(GiB)": 78.33, + "step": 4681, + "token_acc": 0.9058820262163964, + "train_speed(iter/s)": 0.032276 + }, + { + "epoch": 0.9072324758998208, + "grad_norm": 0.1092270091176033, + "learning_rate": 7.0124885861251145e-06, + "loss": 0.31192123889923096, + "memory(GiB)": 78.33, + "step": 4682, + "token_acc": 0.9066539550794942, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.9074262461851476, + "grad_norm": 0.09576813131570816, + "learning_rate": 6.983468550028442e-06, + "loss": 0.3214895725250244, + "memory(GiB)": 78.33, + "step": 4683, + "token_acc": 0.9031646569427624, + "train_speed(iter/s)": 0.032277 + }, + { + "epoch": 0.9076200164704743, + "grad_norm": 0.09397298097610474, + "learning_rate": 6.954507254664266e-06, + "loss": 0.3123416006565094, + "memory(GiB)": 78.33, + "step": 4684, + "token_acc": 0.9053452877968654, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.907813786755801, + "grad_norm": 0.11117564886808395, + "learning_rate": 6.925604711927751e-06, + "loss": 0.36098116636276245, + "memory(GiB)": 78.33, + "step": 4685, + "token_acc": 0.8910591965540965, + "train_speed(iter/s)": 0.032278 + }, + { + "epoch": 0.9080075570411278, + "grad_norm": 0.0936315655708313, + "learning_rate": 6.896760933689904e-06, + "loss": 0.3093935251235962, + "memory(GiB)": 78.33, + "step": 4686, + "token_acc": 0.9063284652831155, + "train_speed(iter/s)": 0.032279 + }, + { + "epoch": 0.9082013273264545, + "grad_norm": 0.09810250252485275, + "learning_rate": 6.867975931797715e-06, + "loss": 0.2804984450340271, + "memory(GiB)": 78.33, + "step": 4687, + "token_acc": 0.9146265467818037, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.9083950976117813, + "grad_norm": 0.09873296320438385, + "learning_rate": 6.839249718073875e-06, + "loss": 0.3098883032798767, + "memory(GiB)": 78.33, + "step": 4688, + "token_acc": 0.9055900621118013, + "train_speed(iter/s)": 0.03228 + }, + { + "epoch": 0.908588867897108, + "grad_norm": 0.10540574043989182, + "learning_rate": 6.810582304317081e-06, + "loss": 0.31421953439712524, + "memory(GiB)": 78.33, + "step": 4689, + "token_acc": 0.9060614765871876, + "train_speed(iter/s)": 0.032281 + }, + { + "epoch": 0.9087826381824348, + "grad_norm": 0.12720443308353424, + "learning_rate": 6.781973702301796e-06, + "loss": 0.3550341725349426, + "memory(GiB)": 78.33, + "step": 4690, + "token_acc": 0.8944157949020186, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.9089764084677615, + "grad_norm": 0.1082942858338356, + "learning_rate": 6.7534239237783065e-06, + "loss": 0.3174242675304413, + "memory(GiB)": 78.33, + "step": 4691, + "token_acc": 0.9045280815373182, + "train_speed(iter/s)": 0.032282 + }, + { + "epoch": 0.9091701787530883, + "grad_norm": 0.0936630368232727, + "learning_rate": 6.724932980472813e-06, + "loss": 0.3189206123352051, + "memory(GiB)": 78.33, + "step": 4692, + "token_acc": 0.9040681037771843, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.909363949038415, + "grad_norm": 0.1038210541009903, + "learning_rate": 6.696500884087258e-06, + "loss": 0.3283487558364868, + "memory(GiB)": 78.33, + "step": 4693, + "token_acc": 0.901529364943999, + "train_speed(iter/s)": 0.032283 + }, + { + "epoch": 0.9095577193237417, + "grad_norm": 0.09301093220710754, + "learning_rate": 6.668127646299548e-06, + "loss": 0.3160119652748108, + "memory(GiB)": 78.33, + "step": 4694, + "token_acc": 0.906036029207408, + "train_speed(iter/s)": 0.032284 + }, + { + "epoch": 0.9097514896090685, + "grad_norm": 0.10908481478691101, + "learning_rate": 6.639813278763262e-06, + "loss": 0.35018086433410645, + "memory(GiB)": 78.33, + "step": 4695, + "token_acc": 0.8955263546434372, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.9099452598943952, + "grad_norm": 0.09090767800807953, + "learning_rate": 6.611557793107914e-06, + "loss": 0.30631187558174133, + "memory(GiB)": 78.33, + "step": 4696, + "token_acc": 0.9081879446779534, + "train_speed(iter/s)": 0.032285 + }, + { + "epoch": 0.910139030179722, + "grad_norm": 0.10230287909507751, + "learning_rate": 6.583361200938769e-06, + "loss": 0.3390125036239624, + "memory(GiB)": 78.33, + "step": 4697, + "token_acc": 0.8979241998125084, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.9103328004650487, + "grad_norm": 0.10016355663537979, + "learning_rate": 6.5552235138369494e-06, + "loss": 0.32282742857933044, + "memory(GiB)": 78.33, + "step": 4698, + "token_acc": 0.904220375941925, + "train_speed(iter/s)": 0.032286 + }, + { + "epoch": 0.9105265707503755, + "grad_norm": 0.10659368336200714, + "learning_rate": 6.527144743359342e-06, + "loss": 0.35568249225616455, + "memory(GiB)": 78.33, + "step": 4699, + "token_acc": 0.8939569935121965, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.9107203410357022, + "grad_norm": 0.10241632908582687, + "learning_rate": 6.499124901038621e-06, + "loss": 0.33742234110832214, + "memory(GiB)": 78.33, + "step": 4700, + "token_acc": 0.8987963512711178, + "train_speed(iter/s)": 0.032287 + }, + { + "epoch": 0.910914111321029, + "grad_norm": 0.09770441055297852, + "learning_rate": 6.471163998383366e-06, + "loss": 0.3216272294521332, + "memory(GiB)": 78.33, + "step": 4701, + "token_acc": 0.903465820499494, + "train_speed(iter/s)": 0.032288 + }, + { + "epoch": 0.9111078816063557, + "grad_norm": 0.09685520827770233, + "learning_rate": 6.44326204687779e-06, + "loss": 0.2898436188697815, + "memory(GiB)": 78.33, + "step": 4702, + "token_acc": 0.9125583603703411, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.9113016518916824, + "grad_norm": 0.10244952887296677, + "learning_rate": 6.415419057982024e-06, + "loss": 0.3338771164417267, + "memory(GiB)": 78.33, + "step": 4703, + "token_acc": 0.8992232679442114, + "train_speed(iter/s)": 0.032289 + }, + { + "epoch": 0.9114954221770092, + "grad_norm": 0.09772542864084244, + "learning_rate": 6.387635043131923e-06, + "loss": 0.31243184208869934, + "memory(GiB)": 78.33, + "step": 4704, + "token_acc": 0.9057063523545078, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.9116891924623359, + "grad_norm": 0.10280278325080872, + "learning_rate": 6.359910013739122e-06, + "loss": 0.3368726372718811, + "memory(GiB)": 78.33, + "step": 4705, + "token_acc": 0.8969188578803032, + "train_speed(iter/s)": 0.03229 + }, + { + "epoch": 0.9118829627476627, + "grad_norm": 0.09057314693927765, + "learning_rate": 6.332243981191032e-06, + "loss": 0.30282315611839294, + "memory(GiB)": 78.33, + "step": 4706, + "token_acc": 0.9087763447625039, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.9120767330329894, + "grad_norm": 0.10076764971017838, + "learning_rate": 6.304636956850828e-06, + "loss": 0.32998421788215637, + "memory(GiB)": 78.33, + "step": 4707, + "token_acc": 0.900506177543134, + "train_speed(iter/s)": 0.032291 + }, + { + "epoch": 0.9122705033183162, + "grad_norm": 0.10298432409763336, + "learning_rate": 6.277088952057508e-06, + "loss": 0.3607329726219177, + "memory(GiB)": 78.33, + "step": 4708, + "token_acc": 0.8920216927933519, + "train_speed(iter/s)": 0.032292 + }, + { + "epoch": 0.9124642736036429, + "grad_norm": 0.09257783740758896, + "learning_rate": 6.249599978125685e-06, + "loss": 0.31179288029670715, + "memory(GiB)": 78.33, + "step": 4709, + "token_acc": 0.9052415912583234, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.9126580438889697, + "grad_norm": 0.10241623967885971, + "learning_rate": 6.222170046345914e-06, + "loss": 0.3377891480922699, + "memory(GiB)": 78.33, + "step": 4710, + "token_acc": 0.8971870153484144, + "train_speed(iter/s)": 0.032293 + }, + { + "epoch": 0.9128518141742964, + "grad_norm": 0.09825358539819717, + "learning_rate": 6.194799167984365e-06, + "loss": 0.2925589084625244, + "memory(GiB)": 78.33, + "step": 4711, + "token_acc": 0.9106943825234062, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.9130455844596231, + "grad_norm": 0.09656988829374313, + "learning_rate": 6.167487354282963e-06, + "loss": 0.32839635014533997, + "memory(GiB)": 78.33, + "step": 4712, + "token_acc": 0.9005813656994237, + "train_speed(iter/s)": 0.032294 + }, + { + "epoch": 0.9132393547449499, + "grad_norm": 0.10096679627895355, + "learning_rate": 6.140234616459483e-06, + "loss": 0.33763110637664795, + "memory(GiB)": 78.33, + "step": 4713, + "token_acc": 0.895675275944236, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.9134331250302766, + "grad_norm": 0.10386110842227936, + "learning_rate": 6.113040965707256e-06, + "loss": 0.30816999077796936, + "memory(GiB)": 78.33, + "step": 4714, + "token_acc": 0.904618748291883, + "train_speed(iter/s)": 0.032295 + }, + { + "epoch": 0.9136268953156034, + "grad_norm": 0.09449145197868347, + "learning_rate": 6.085906413195546e-06, + "loss": 0.30260464549064636, + "memory(GiB)": 78.33, + "step": 4715, + "token_acc": 0.9089724446857608, + "train_speed(iter/s)": 0.032296 + }, + { + "epoch": 0.9138206656009301, + "grad_norm": 0.1062229797244072, + "learning_rate": 6.058830970069156e-06, + "loss": 0.3432777523994446, + "memory(GiB)": 78.33, + "step": 4716, + "token_acc": 0.8961035016855469, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.9140144358862569, + "grad_norm": 0.10210466384887695, + "learning_rate": 6.03181464744874e-06, + "loss": 0.31807243824005127, + "memory(GiB)": 78.33, + "step": 4717, + "token_acc": 0.9045282715354299, + "train_speed(iter/s)": 0.032297 + }, + { + "epoch": 0.9142082061715836, + "grad_norm": 0.10354772955179214, + "learning_rate": 6.00485745643064e-06, + "loss": 0.3176378011703491, + "memory(GiB)": 78.33, + "step": 4718, + "token_acc": 0.9056022642137684, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.9144019764569103, + "grad_norm": 0.10732652246952057, + "learning_rate": 5.977959408086863e-06, + "loss": 0.3399035930633545, + "memory(GiB)": 78.33, + "step": 4719, + "token_acc": 0.8993071737036692, + "train_speed(iter/s)": 0.032298 + }, + { + "epoch": 0.9145957467422371, + "grad_norm": 0.08825431764125824, + "learning_rate": 5.951120513465207e-06, + "loss": 0.2766910195350647, + "memory(GiB)": 78.33, + "step": 4720, + "token_acc": 0.9151678797839868, + "train_speed(iter/s)": 0.032299 + }, + { + "epoch": 0.9147895170275638, + "grad_norm": 0.08927903324365616, + "learning_rate": 5.924340783589071e-06, + "loss": 0.2686108350753784, + "memory(GiB)": 78.33, + "step": 4721, + "token_acc": 0.917844274653106, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.9149832873128906, + "grad_norm": 0.09068787842988968, + "learning_rate": 5.897620229457639e-06, + "loss": 0.2903996407985687, + "memory(GiB)": 78.33, + "step": 4722, + "token_acc": 0.9110736173485845, + "train_speed(iter/s)": 0.0323 + }, + { + "epoch": 0.9151770575982173, + "grad_norm": 0.10634169727563858, + "learning_rate": 5.870958862045782e-06, + "loss": 0.3310278356075287, + "memory(GiB)": 78.33, + "step": 4723, + "token_acc": 0.9006753122572163, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.9153708278835441, + "grad_norm": 0.09822243452072144, + "learning_rate": 5.844356692304009e-06, + "loss": 0.316002756357193, + "memory(GiB)": 78.33, + "step": 4724, + "token_acc": 0.9054254372961755, + "train_speed(iter/s)": 0.032301 + }, + { + "epoch": 0.9155645981688708, + "grad_norm": 0.10107910633087158, + "learning_rate": 5.817813731158544e-06, + "loss": 0.3441876769065857, + "memory(GiB)": 78.33, + "step": 4725, + "token_acc": 0.8951170707621944, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.9157583684541976, + "grad_norm": 0.10910195857286453, + "learning_rate": 5.791329989511301e-06, + "loss": 0.34992867708206177, + "memory(GiB)": 78.33, + "step": 4726, + "token_acc": 0.8951081299872788, + "train_speed(iter/s)": 0.032302 + }, + { + "epoch": 0.9159521387395243, + "grad_norm": 0.10190389305353165, + "learning_rate": 5.764905478239895e-06, + "loss": 0.3404694199562073, + "memory(GiB)": 78.33, + "step": 4727, + "token_acc": 0.8978785857238158, + "train_speed(iter/s)": 0.032303 + }, + { + "epoch": 0.916145909024851, + "grad_norm": 0.09621675312519073, + "learning_rate": 5.7385402081975284e-06, + "loss": 0.3171209990978241, + "memory(GiB)": 78.33, + "step": 4728, + "token_acc": 0.9050346044239381, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.9163396793101778, + "grad_norm": 0.09636913985013962, + "learning_rate": 5.712234190213172e-06, + "loss": 0.317484587430954, + "memory(GiB)": 78.33, + "step": 4729, + "token_acc": 0.9054396568531877, + "train_speed(iter/s)": 0.032304 + }, + { + "epoch": 0.9165334495955045, + "grad_norm": 0.10614298284053802, + "learning_rate": 5.685987435091399e-06, + "loss": 0.3546099066734314, + "memory(GiB)": 78.33, + "step": 4730, + "token_acc": 0.8933270676691729, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.9167272198808313, + "grad_norm": 0.1044028177857399, + "learning_rate": 5.659799953612438e-06, + "loss": 0.349134236574173, + "memory(GiB)": 78.33, + "step": 4731, + "token_acc": 0.8962899543378996, + "train_speed(iter/s)": 0.032305 + }, + { + "epoch": 0.916920990166158, + "grad_norm": 0.09492190182209015, + "learning_rate": 5.633671756532232e-06, + "loss": 0.2976999878883362, + "memory(GiB)": 78.33, + "step": 4732, + "token_acc": 0.9110198151256294, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.9171147604514848, + "grad_norm": 0.10704229772090912, + "learning_rate": 5.607602854582266e-06, + "loss": 0.38176852464675903, + "memory(GiB)": 78.33, + "step": 4733, + "token_acc": 0.8874598960061953, + "train_speed(iter/s)": 0.032306 + }, + { + "epoch": 0.9173085307368115, + "grad_norm": 0.09060212224721909, + "learning_rate": 5.581593258469841e-06, + "loss": 0.30857348442077637, + "memory(GiB)": 78.33, + "step": 4734, + "token_acc": 0.9065131425913003, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.9175023010221383, + "grad_norm": 0.10411375015974045, + "learning_rate": 5.555642978877678e-06, + "loss": 0.3482334315776825, + "memory(GiB)": 78.33, + "step": 4735, + "token_acc": 0.8967308452374231, + "train_speed(iter/s)": 0.032307 + }, + { + "epoch": 0.917696071307465, + "grad_norm": 0.11206993460655212, + "learning_rate": 5.529752026464351e-06, + "loss": 0.35386189818382263, + "memory(GiB)": 78.33, + "step": 4736, + "token_acc": 0.8936384358071106, + "train_speed(iter/s)": 0.032308 + }, + { + "epoch": 0.9178898415927917, + "grad_norm": 0.0973845049738884, + "learning_rate": 5.5039204118639215e-06, + "loss": 0.34648823738098145, + "memory(GiB)": 78.33, + "step": 4737, + "token_acc": 0.8961357595583725, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.9180836118781185, + "grad_norm": 0.09547542035579681, + "learning_rate": 5.478148145686151e-06, + "loss": 0.32375043630599976, + "memory(GiB)": 78.33, + "step": 4738, + "token_acc": 0.9037339556592765, + "train_speed(iter/s)": 0.032309 + }, + { + "epoch": 0.9182773821634452, + "grad_norm": 0.10374420881271362, + "learning_rate": 5.452435238516373e-06, + "loss": 0.34971579909324646, + "memory(GiB)": 78.33, + "step": 4739, + "token_acc": 0.8960960267766973, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.918471152448772, + "grad_norm": 0.098647341132164, + "learning_rate": 5.426781700915573e-06, + "loss": 0.33870449662208557, + "memory(GiB)": 78.33, + "step": 4740, + "token_acc": 0.8994294051744353, + "train_speed(iter/s)": 0.03231 + }, + { + "epoch": 0.9186649227340987, + "grad_norm": 0.1018560454249382, + "learning_rate": 5.401187543420405e-06, + "loss": 0.3089270293712616, + "memory(GiB)": 78.33, + "step": 4741, + "token_acc": 0.9068556551923633, + "train_speed(iter/s)": 0.032311 + }, + { + "epoch": 0.9188586930194255, + "grad_norm": 0.10648074001073837, + "learning_rate": 5.375652776542994e-06, + "loss": 0.33178946375846863, + "memory(GiB)": 78.33, + "step": 4742, + "token_acc": 0.8994858134060215, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.9190524633047522, + "grad_norm": 0.09511461853981018, + "learning_rate": 5.350177410771217e-06, + "loss": 0.31413573026657104, + "memory(GiB)": 78.33, + "step": 4743, + "token_acc": 0.9058918735479156, + "train_speed(iter/s)": 0.032312 + }, + { + "epoch": 0.919246233590079, + "grad_norm": 0.10511302202939987, + "learning_rate": 5.324761456568455e-06, + "loss": 0.3475594222545624, + "memory(GiB)": 78.33, + "step": 4744, + "token_acc": 0.8972395423451783, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.9194400038754057, + "grad_norm": 0.0897306576371193, + "learning_rate": 5.29940492437374e-06, + "loss": 0.30537813901901245, + "memory(GiB)": 78.33, + "step": 4745, + "token_acc": 0.9067023393016396, + "train_speed(iter/s)": 0.032313 + }, + { + "epoch": 0.9196337741607324, + "grad_norm": 0.10397239774465561, + "learning_rate": 5.274107824601692e-06, + "loss": 0.3287217617034912, + "memory(GiB)": 78.33, + "step": 4746, + "token_acc": 0.9034060827107477, + "train_speed(iter/s)": 0.032314 + }, + { + "epoch": 0.9198275444460592, + "grad_norm": 0.10913705080747604, + "learning_rate": 5.248870167642466e-06, + "loss": 0.37385323643684387, + "memory(GiB)": 78.33, + "step": 4747, + "token_acc": 0.8894808050059192, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.9200213147313859, + "grad_norm": 0.09678813070058823, + "learning_rate": 5.22369196386192e-06, + "loss": 0.29931795597076416, + "memory(GiB)": 78.33, + "step": 4748, + "token_acc": 0.9094277587871513, + "train_speed(iter/s)": 0.032315 + }, + { + "epoch": 0.9202150850167127, + "grad_norm": 0.1072593703866005, + "learning_rate": 5.198573223601332e-06, + "loss": 0.33650124073028564, + "memory(GiB)": 78.33, + "step": 4749, + "token_acc": 0.9004353969032035, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.9204088553020394, + "grad_norm": 0.10376523435115814, + "learning_rate": 5.173513957177716e-06, + "loss": 0.35175377130508423, + "memory(GiB)": 78.33, + "step": 4750, + "token_acc": 0.8930210695128021, + "train_speed(iter/s)": 0.032316 + }, + { + "epoch": 0.9206026255873662, + "grad_norm": 0.09530597925186157, + "learning_rate": 5.148514174883539e-06, + "loss": 0.3145235478878021, + "memory(GiB)": 78.33, + "step": 4751, + "token_acc": 0.9033812729498164, + "train_speed(iter/s)": 0.032317 + }, + { + "epoch": 0.9207963958726929, + "grad_norm": 0.09804235398769379, + "learning_rate": 5.123573886986887e-06, + "loss": 0.32991063594818115, + "memory(GiB)": 78.33, + "step": 4752, + "token_acc": 0.90162206627079, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.9209901661580197, + "grad_norm": 0.09455542266368866, + "learning_rate": 5.098693103731466e-06, + "loss": 0.3050658106803894, + "memory(GiB)": 78.33, + "step": 4753, + "token_acc": 0.9097025408482637, + "train_speed(iter/s)": 0.032318 + }, + { + "epoch": 0.9211839364433464, + "grad_norm": 0.09994572401046753, + "learning_rate": 5.073871835336402e-06, + "loss": 0.3111793100833893, + "memory(GiB)": 78.33, + "step": 4754, + "token_acc": 0.9068566536879021, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.9213777067286731, + "grad_norm": 0.0913093090057373, + "learning_rate": 5.049110091996505e-06, + "loss": 0.2801298499107361, + "memory(GiB)": 78.33, + "step": 4755, + "token_acc": 0.9132923800810352, + "train_speed(iter/s)": 0.032319 + }, + { + "epoch": 0.9215714770139999, + "grad_norm": 0.1138007864356041, + "learning_rate": 5.024407883882059e-06, + "loss": 0.32883748412132263, + "memory(GiB)": 78.33, + "step": 4756, + "token_acc": 0.8995948792740237, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.9217652472993266, + "grad_norm": 0.10155371576547623, + "learning_rate": 4.999765221138946e-06, + "loss": 0.338005006313324, + "memory(GiB)": 78.33, + "step": 4757, + "token_acc": 0.8987489089322083, + "train_speed(iter/s)": 0.03232 + }, + { + "epoch": 0.9219590175846534, + "grad_norm": 0.09977617859840393, + "learning_rate": 4.975182113888571e-06, + "loss": 0.3259299397468567, + "memory(GiB)": 78.33, + "step": 4758, + "token_acc": 0.9017150933652252, + "train_speed(iter/s)": 0.032321 + }, + { + "epoch": 0.9221527878699801, + "grad_norm": 0.10653609782457352, + "learning_rate": 4.950658572227856e-06, + "loss": 0.3374466300010681, + "memory(GiB)": 78.33, + "step": 4759, + "token_acc": 0.8985345429169574, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.9223465581553069, + "grad_norm": 0.11073584109544754, + "learning_rate": 4.926194606229311e-06, + "loss": 0.3533399701118469, + "memory(GiB)": 78.33, + "step": 4760, + "token_acc": 0.8946103673189152, + "train_speed(iter/s)": 0.032322 + }, + { + "epoch": 0.9225403284406336, + "grad_norm": 0.10663000494241714, + "learning_rate": 4.901790225940916e-06, + "loss": 0.3574818968772888, + "memory(GiB)": 78.33, + "step": 4761, + "token_acc": 0.8953725220185329, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.9227340987259603, + "grad_norm": 0.08861353248357773, + "learning_rate": 4.877445441386218e-06, + "loss": 0.2784247100353241, + "memory(GiB)": 78.33, + "step": 4762, + "token_acc": 0.9163010066405425, + "train_speed(iter/s)": 0.032323 + }, + { + "epoch": 0.9229278690112871, + "grad_norm": 0.10696634650230408, + "learning_rate": 4.853160262564271e-06, + "loss": 0.34982746839523315, + "memory(GiB)": 78.33, + "step": 4763, + "token_acc": 0.8978552430535907, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.9231216392966138, + "grad_norm": 0.09375255554914474, + "learning_rate": 4.8289346994496434e-06, + "loss": 0.3086707890033722, + "memory(GiB)": 78.33, + "step": 4764, + "token_acc": 0.9071370640713706, + "train_speed(iter/s)": 0.032324 + }, + { + "epoch": 0.9233154095819406, + "grad_norm": 0.10051163285970688, + "learning_rate": 4.804768761992445e-06, + "loss": 0.33212947845458984, + "memory(GiB)": 78.33, + "step": 4765, + "token_acc": 0.9005619215513442, + "train_speed(iter/s)": 0.032325 + }, + { + "epoch": 0.9235091798672673, + "grad_norm": 0.0955612063407898, + "learning_rate": 4.780662460118234e-06, + "loss": 0.31439507007598877, + "memory(GiB)": 78.33, + "step": 4766, + "token_acc": 0.9068176100628931, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.9237029501525941, + "grad_norm": 0.09498281031847, + "learning_rate": 4.756615803728192e-06, + "loss": 0.31122851371765137, + "memory(GiB)": 78.33, + "step": 4767, + "token_acc": 0.9053871107818846, + "train_speed(iter/s)": 0.032326 + }, + { + "epoch": 0.9238967204379208, + "grad_norm": 0.10263197124004364, + "learning_rate": 4.732628802698835e-06, + "loss": 0.31166380643844604, + "memory(GiB)": 78.33, + "step": 4768, + "token_acc": 0.9051054675202015, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.9240904907232476, + "grad_norm": 0.08937201648950577, + "learning_rate": 4.708701466882348e-06, + "loss": 0.2815870940685272, + "memory(GiB)": 78.33, + "step": 4769, + "token_acc": 0.9132003898738179, + "train_speed(iter/s)": 0.032327 + }, + { + "epoch": 0.9242842610085743, + "grad_norm": 0.09476013481616974, + "learning_rate": 4.684833806106286e-06, + "loss": 0.32919424772262573, + "memory(GiB)": 78.33, + "step": 4770, + "token_acc": 0.9013624509493285, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.924478031293901, + "grad_norm": 0.09523847699165344, + "learning_rate": 4.661025830173742e-06, + "loss": 0.3461284935474396, + "memory(GiB)": 78.33, + "step": 4771, + "token_acc": 0.8952430099455808, + "train_speed(iter/s)": 0.032328 + }, + { + "epoch": 0.9246718015792278, + "grad_norm": 0.09518402069807053, + "learning_rate": 4.63727754886331e-06, + "loss": 0.30630695819854736, + "memory(GiB)": 78.33, + "step": 4772, + "token_acc": 0.9087172109035788, + "train_speed(iter/s)": 0.032329 + }, + { + "epoch": 0.9248655718645545, + "grad_norm": 0.09036926180124283, + "learning_rate": 4.613588971929022e-06, + "loss": 0.31284967064857483, + "memory(GiB)": 78.33, + "step": 4773, + "token_acc": 0.9051664182811724, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.9250593421498813, + "grad_norm": 0.09823027998209, + "learning_rate": 4.589960109100444e-06, + "loss": 0.30616480112075806, + "memory(GiB)": 78.33, + "step": 4774, + "token_acc": 0.904891304347826, + "train_speed(iter/s)": 0.03233 + }, + { + "epoch": 0.925253112435208, + "grad_norm": 0.10480865091085434, + "learning_rate": 4.566390970082562e-06, + "loss": 0.32189154624938965, + "memory(GiB)": 78.33, + "step": 4775, + "token_acc": 0.9040938430251407, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.9254468827205348, + "grad_norm": 0.09449231624603271, + "learning_rate": 4.54288156455585e-06, + "loss": 0.3326607942581177, + "memory(GiB)": 78.33, + "step": 4776, + "token_acc": 0.9016650796510282, + "train_speed(iter/s)": 0.032331 + }, + { + "epoch": 0.9256406530058615, + "grad_norm": 0.09553948044776917, + "learning_rate": 4.519431902176285e-06, + "loss": 0.318975567817688, + "memory(GiB)": 78.33, + "step": 4777, + "token_acc": 0.9035005686844433, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.9258344232911883, + "grad_norm": 0.09830733388662338, + "learning_rate": 4.496041992575227e-06, + "loss": 0.3239940404891968, + "memory(GiB)": 78.33, + "step": 4778, + "token_acc": 0.900301950436604, + "train_speed(iter/s)": 0.032332 + }, + { + "epoch": 0.926028193576515, + "grad_norm": 0.09941928833723068, + "learning_rate": 4.472711845359594e-06, + "loss": 0.3222883641719818, + "memory(GiB)": 78.33, + "step": 4779, + "token_acc": 0.9041993341209322, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.9262219638618417, + "grad_norm": 0.10502801835536957, + "learning_rate": 4.449441470111653e-06, + "loss": 0.35170549154281616, + "memory(GiB)": 78.33, + "step": 4780, + "token_acc": 0.895190294701809, + "train_speed(iter/s)": 0.032333 + }, + { + "epoch": 0.9264157341471685, + "grad_norm": 0.08964619785547256, + "learning_rate": 4.426230876389208e-06, + "loss": 0.2930634617805481, + "memory(GiB)": 78.33, + "step": 4781, + "token_acc": 0.9114340692728378, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.9266095044324952, + "grad_norm": 0.09876509755849838, + "learning_rate": 4.403080073725451e-06, + "loss": 0.31545448303222656, + "memory(GiB)": 78.33, + "step": 4782, + "token_acc": 0.9063012667079041, + "train_speed(iter/s)": 0.032334 + }, + { + "epoch": 0.926803274717822, + "grad_norm": 0.10196632146835327, + "learning_rate": 4.379989071629059e-06, + "loss": 0.3357621133327484, + "memory(GiB)": 78.33, + "step": 4783, + "token_acc": 0.9003710320186891, + "train_speed(iter/s)": 0.032335 + }, + { + "epoch": 0.9269970450031487, + "grad_norm": 0.10657868534326553, + "learning_rate": 4.356957879584111e-06, + "loss": 0.3701134920120239, + "memory(GiB)": 78.33, + "step": 4784, + "token_acc": 0.8883001847625258, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.9271908152884755, + "grad_norm": 0.09407640993595123, + "learning_rate": 4.333986507050125e-06, + "loss": 0.3258771002292633, + "memory(GiB)": 78.33, + "step": 4785, + "token_acc": 0.9031313034290654, + "train_speed(iter/s)": 0.032336 + }, + { + "epoch": 0.9273845855738022, + "grad_norm": 0.09928041696548462, + "learning_rate": 4.311074963462119e-06, + "loss": 0.34973931312561035, + "memory(GiB)": 78.33, + "step": 4786, + "token_acc": 0.8954454830273685, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.927578355859129, + "grad_norm": 0.10799898207187653, + "learning_rate": 4.2882232582304e-06, + "loss": 0.34097224473953247, + "memory(GiB)": 78.33, + "step": 4787, + "token_acc": 0.9003566184182924, + "train_speed(iter/s)": 0.032337 + }, + { + "epoch": 0.9277721261444557, + "grad_norm": 0.09870768338441849, + "learning_rate": 4.265431400740843e-06, + "loss": 0.3039855659008026, + "memory(GiB)": 78.33, + "step": 4788, + "token_acc": 0.907001012860256, + "train_speed(iter/s)": 0.032338 + }, + { + "epoch": 0.9279658964297824, + "grad_norm": 0.0973040908575058, + "learning_rate": 4.242699400354627e-06, + "loss": 0.34012576937675476, + "memory(GiB)": 78.33, + "step": 4789, + "token_acc": 0.8992445949466007, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.9281596667151092, + "grad_norm": 0.09325951337814331, + "learning_rate": 4.220027266408432e-06, + "loss": 0.30864644050598145, + "memory(GiB)": 78.33, + "step": 4790, + "token_acc": 0.9057424223046425, + "train_speed(iter/s)": 0.032339 + }, + { + "epoch": 0.9283534370004359, + "grad_norm": 0.2708342671394348, + "learning_rate": 4.197415008214294e-06, + "loss": 0.3377147316932678, + "memory(GiB)": 78.33, + "step": 4791, + "token_acc": 0.897419232539876, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.9285472072857627, + "grad_norm": 0.10447484999895096, + "learning_rate": 4.174862635059667e-06, + "loss": 0.36220014095306396, + "memory(GiB)": 78.33, + "step": 4792, + "token_acc": 0.8931025706286587, + "train_speed(iter/s)": 0.03234 + }, + { + "epoch": 0.9287409775710894, + "grad_norm": 0.10496804118156433, + "learning_rate": 4.152370156207457e-06, + "loss": 0.3289092481136322, + "memory(GiB)": 78.33, + "step": 4793, + "token_acc": 0.9012323899291816, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.9289347478564162, + "grad_norm": 0.09275829046964645, + "learning_rate": 4.129937580895876e-06, + "loss": 0.30164456367492676, + "memory(GiB)": 78.33, + "step": 4794, + "token_acc": 0.9096051284813877, + "train_speed(iter/s)": 0.032341 + }, + { + "epoch": 0.9291285181417429, + "grad_norm": 0.09018470346927643, + "learning_rate": 4.107564918338635e-06, + "loss": 0.2863234579563141, + "memory(GiB)": 78.33, + "step": 4795, + "token_acc": 0.9150141643059491, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.9293222884270697, + "grad_norm": 0.09511598944664001, + "learning_rate": 4.085252177724751e-06, + "loss": 0.29719942808151245, + "memory(GiB)": 78.33, + "step": 4796, + "token_acc": 0.9094084830157199, + "train_speed(iter/s)": 0.032342 + }, + { + "epoch": 0.9295160587123965, + "grad_norm": 0.11475857347249985, + "learning_rate": 4.062999368218678e-06, + "loss": 0.38321453332901, + "memory(GiB)": 78.33, + "step": 4797, + "token_acc": 0.8886180329842301, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.9297098289977233, + "grad_norm": 0.10678976774215698, + "learning_rate": 4.040806498960236e-06, + "loss": 0.3360079824924469, + "memory(GiB)": 78.33, + "step": 4798, + "token_acc": 0.8982757019471875, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.92990359928305, + "grad_norm": 0.09599542617797852, + "learning_rate": 4.0186735790646355e-06, + "loss": 0.30816134810447693, + "memory(GiB)": 78.33, + "step": 4799, + "token_acc": 0.9078661001140015, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.9300973695683767, + "grad_norm": 0.10428732633590698, + "learning_rate": 3.996600617622503e-06, + "loss": 0.3343293070793152, + "memory(GiB)": 78.33, + "step": 4800, + "token_acc": 0.8990470605483067, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.9302911398537035, + "grad_norm": 0.10038130730390549, + "learning_rate": 3.974587623699721e-06, + "loss": 0.2978077828884125, + "memory(GiB)": 78.33, + "step": 4801, + "token_acc": 0.9088284412239237, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.9304849101390302, + "grad_norm": 0.10180116444826126, + "learning_rate": 3.9526346063376735e-06, + "loss": 0.31616196036338806, + "memory(GiB)": 78.33, + "step": 4802, + "token_acc": 0.9038762241790799, + "train_speed(iter/s)": 0.032343 + }, + { + "epoch": 0.930678680424357, + "grad_norm": 0.10563566535711288, + "learning_rate": 3.930741574553048e-06, + "loss": 0.35403674840927124, + "memory(GiB)": 78.33, + "step": 4803, + "token_acc": 0.8924158321943599, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.9308724507096837, + "grad_norm": 0.10199618339538574, + "learning_rate": 3.908908537337868e-06, + "loss": 0.35334300994873047, + "memory(GiB)": 78.33, + "step": 4804, + "token_acc": 0.8947171598813293, + "train_speed(iter/s)": 0.032344 + }, + { + "epoch": 0.9310662209950105, + "grad_norm": 0.08841365575790405, + "learning_rate": 3.887135503659594e-06, + "loss": 0.27694904804229736, + "memory(GiB)": 78.33, + "step": 4805, + "token_acc": 0.9153520015775981, + "train_speed(iter/s)": 0.032345 + }, + { + "epoch": 0.9312599912803372, + "grad_norm": 0.10381640493869781, + "learning_rate": 3.8654224824609396e-06, + "loss": 0.2975887358188629, + "memory(GiB)": 78.33, + "step": 4806, + "token_acc": 0.9085034107542324, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.931453761565664, + "grad_norm": 0.10239759087562561, + "learning_rate": 3.8437694826601025e-06, + "loss": 0.34857773780822754, + "memory(GiB)": 78.33, + "step": 4807, + "token_acc": 0.8943446895898057, + "train_speed(iter/s)": 0.032346 + }, + { + "epoch": 0.9316475318509907, + "grad_norm": 0.12824027240276337, + "learning_rate": 3.8221765131504714e-06, + "loss": 0.32302480936050415, + "memory(GiB)": 78.33, + "step": 4808, + "token_acc": 0.904497486048953, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.9318413021363174, + "grad_norm": 0.09815438836812973, + "learning_rate": 3.8006435828009162e-06, + "loss": 0.30667853355407715, + "memory(GiB)": 78.33, + "step": 4809, + "token_acc": 0.907749177788046, + "train_speed(iter/s)": 0.032347 + }, + { + "epoch": 0.9320350724216442, + "grad_norm": 0.09541095048189163, + "learning_rate": 3.7791707004555802e-06, + "loss": 0.3058740794658661, + "memory(GiB)": 78.33, + "step": 4810, + "token_acc": 0.9084823790877341, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.9322288427069709, + "grad_norm": 0.10464680939912796, + "learning_rate": 3.7577578749339255e-06, + "loss": 0.3602639138698578, + "memory(GiB)": 78.33, + "step": 4811, + "token_acc": 0.8935190262090141, + "train_speed(iter/s)": 0.032348 + }, + { + "epoch": 0.9324226129922977, + "grad_norm": 0.10377727448940277, + "learning_rate": 3.7364051150307993e-06, + "loss": 0.33127960562705994, + "memory(GiB)": 78.33, + "step": 4812, + "token_acc": 0.9014416334773097, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.9326163832776244, + "grad_norm": 0.09491308033466339, + "learning_rate": 3.715112429516337e-06, + "loss": 0.32317915558815, + "memory(GiB)": 78.33, + "step": 4813, + "token_acc": 0.9034436015745158, + "train_speed(iter/s)": 0.032349 + }, + { + "epoch": 0.9328101535629512, + "grad_norm": 0.10311052948236465, + "learning_rate": 3.6938798271360594e-06, + "loss": 0.3292618691921234, + "memory(GiB)": 78.33, + "step": 4814, + "token_acc": 0.8985615698012804, + "train_speed(iter/s)": 0.03235 + }, + { + "epoch": 0.9330039238482779, + "grad_norm": 0.0882132351398468, + "learning_rate": 3.672707316610707e-06, + "loss": 0.30853813886642456, + "memory(GiB)": 78.33, + "step": 4815, + "token_acc": 0.9053889463403424, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.9331976941336046, + "grad_norm": 0.09400478005409241, + "learning_rate": 3.6515949066364236e-06, + "loss": 0.3131456971168518, + "memory(GiB)": 78.33, + "step": 4816, + "token_acc": 0.9059575833888848, + "train_speed(iter/s)": 0.032351 + }, + { + "epoch": 0.9333914644189314, + "grad_norm": 0.09559616446495056, + "learning_rate": 3.6305426058846565e-06, + "loss": 0.3201538324356079, + "memory(GiB)": 78.33, + "step": 4817, + "token_acc": 0.9050924765675906, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.9335852347042581, + "grad_norm": 0.1044517233967781, + "learning_rate": 3.6095504230021387e-06, + "loss": 0.34499791264533997, + "memory(GiB)": 78.33, + "step": 4818, + "token_acc": 0.8969500515691764, + "train_speed(iter/s)": 0.032352 + }, + { + "epoch": 0.9337790049895849, + "grad_norm": 0.0850808322429657, + "learning_rate": 3.5886183666109405e-06, + "loss": 0.27842819690704346, + "memory(GiB)": 78.33, + "step": 4819, + "token_acc": 0.9156751866093644, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.9339727752749116, + "grad_norm": 0.09250815957784653, + "learning_rate": 3.567746445308367e-06, + "loss": 0.30809280276298523, + "memory(GiB)": 78.33, + "step": 4820, + "token_acc": 0.9050716781110709, + "train_speed(iter/s)": 0.032353 + }, + { + "epoch": 0.9341665455602384, + "grad_norm": 0.09440483152866364, + "learning_rate": 3.5469346676671616e-06, + "loss": 0.2813994288444519, + "memory(GiB)": 78.33, + "step": 4821, + "token_acc": 0.9127701474449607, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.9343603158455651, + "grad_norm": 0.09050828963518143, + "learning_rate": 3.526183042235203e-06, + "loss": 0.2954739034175873, + "memory(GiB)": 78.33, + "step": 4822, + "token_acc": 0.9097762259958737, + "train_speed(iter/s)": 0.032354 + }, + { + "epoch": 0.9345540861308919, + "grad_norm": 0.10041210055351257, + "learning_rate": 3.5054915775357907e-06, + "loss": 0.3556936979293823, + "memory(GiB)": 78.33, + "step": 4823, + "token_acc": 0.8935934907970614, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.9347478564162186, + "grad_norm": 0.10176945477724075, + "learning_rate": 3.4848602820674255e-06, + "loss": 0.3064349889755249, + "memory(GiB)": 78.33, + "step": 4824, + "token_acc": 0.9054652880354506, + "train_speed(iter/s)": 0.032355 + }, + { + "epoch": 0.9349416267015453, + "grad_norm": 0.10641314834356308, + "learning_rate": 3.464289164303963e-06, + "loss": 0.32750222086906433, + "memory(GiB)": 78.33, + "step": 4825, + "token_acc": 0.9015335861015222, + "train_speed(iter/s)": 0.032356 + }, + { + "epoch": 0.9351353969868721, + "grad_norm": 0.09517652541399002, + "learning_rate": 3.4437782326945274e-06, + "loss": 0.31871315836906433, + "memory(GiB)": 78.33, + "step": 4826, + "token_acc": 0.9053494708163051, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.9353291672721988, + "grad_norm": 0.09240631759166718, + "learning_rate": 3.4233274956634803e-06, + "loss": 0.2959226667881012, + "memory(GiB)": 78.33, + "step": 4827, + "token_acc": 0.9104759299781182, + "train_speed(iter/s)": 0.032357 + }, + { + "epoch": 0.9355229375575256, + "grad_norm": 0.10840350389480591, + "learning_rate": 3.402936961610503e-06, + "loss": 0.33262190222740173, + "memory(GiB)": 78.33, + "step": 4828, + "token_acc": 0.9010374853331686, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.9357167078428523, + "grad_norm": 0.09562506526708603, + "learning_rate": 3.3826066389105123e-06, + "loss": 0.3282950818538666, + "memory(GiB)": 78.33, + "step": 4829, + "token_acc": 0.9013872354899055, + "train_speed(iter/s)": 0.032358 + }, + { + "epoch": 0.9359104781281791, + "grad_norm": 0.1005750298500061, + "learning_rate": 3.3623365359137453e-06, + "loss": 0.3179299831390381, + "memory(GiB)": 78.33, + "step": 4830, + "token_acc": 0.9042720884875364, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.9361042484135058, + "grad_norm": 0.09987672418355942, + "learning_rate": 3.3421266609456766e-06, + "loss": 0.27341562509536743, + "memory(GiB)": 78.33, + "step": 4831, + "token_acc": 0.9170077307867213, + "train_speed(iter/s)": 0.032359 + }, + { + "epoch": 0.9362980186988326, + "grad_norm": 0.10075198113918304, + "learning_rate": 3.321977022307032e-06, + "loss": 0.3398180305957794, + "memory(GiB)": 78.33, + "step": 4832, + "token_acc": 0.8986838767860665, + "train_speed(iter/s)": 0.03236 + }, + { + "epoch": 0.9364917889841593, + "grad_norm": 0.10808294266462326, + "learning_rate": 3.301887628273825e-06, + "loss": 0.35527899861335754, + "memory(GiB)": 78.33, + "step": 4833, + "token_acc": 0.8941093763730631, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.936685559269486, + "grad_norm": 0.09991144388914108, + "learning_rate": 3.2818584870972887e-06, + "loss": 0.319998562335968, + "memory(GiB)": 78.33, + "step": 4834, + "token_acc": 0.902095910695979, + "train_speed(iter/s)": 0.032361 + }, + { + "epoch": 0.9368793295548128, + "grad_norm": 0.10339634865522385, + "learning_rate": 3.2618896070039422e-06, + "loss": 0.3507855236530304, + "memory(GiB)": 78.33, + "step": 4835, + "token_acc": 0.8963165437078712, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.9370730998401395, + "grad_norm": 0.10384315997362137, + "learning_rate": 3.241980996195559e-06, + "loss": 0.34698525071144104, + "memory(GiB)": 78.33, + "step": 4836, + "token_acc": 0.8970609075192019, + "train_speed(iter/s)": 0.032362 + }, + { + "epoch": 0.9372668701254663, + "grad_norm": 0.0977269858121872, + "learning_rate": 3.2221326628490973e-06, + "loss": 0.33199065923690796, + "memory(GiB)": 78.33, + "step": 4837, + "token_acc": 0.900308274874384, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.937460640410793, + "grad_norm": 0.11115771532058716, + "learning_rate": 3.2023446151168363e-06, + "loss": 0.35356926918029785, + "memory(GiB)": 78.33, + "step": 4838, + "token_acc": 0.8959869212498843, + "train_speed(iter/s)": 0.032363 + }, + { + "epoch": 0.9376544106961198, + "grad_norm": 0.09477823227643967, + "learning_rate": 3.1826168611262417e-06, + "loss": 0.30472081899642944, + "memory(GiB)": 78.33, + "step": 4839, + "token_acc": 0.9091226468297661, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.9378481809814465, + "grad_norm": 0.09087901562452316, + "learning_rate": 3.162949408980048e-06, + "loss": 0.295918732881546, + "memory(GiB)": 78.33, + "step": 4840, + "token_acc": 0.9117901101274023, + "train_speed(iter/s)": 0.032364 + }, + { + "epoch": 0.9380419512667733, + "grad_norm": 0.10526914894580841, + "learning_rate": 3.143342266756177e-06, + "loss": 0.3462876081466675, + "memory(GiB)": 78.33, + "step": 4841, + "token_acc": 0.8977767847167915, + "train_speed(iter/s)": 0.032365 + }, + { + "epoch": 0.9382357215521, + "grad_norm": 0.08861321955919266, + "learning_rate": 3.1237954425078537e-06, + "loss": 0.30201205611228943, + "memory(GiB)": 78.33, + "step": 4842, + "token_acc": 0.9064061563062162, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.9384294918374267, + "grad_norm": 0.09867441654205322, + "learning_rate": 3.1043089442634394e-06, + "loss": 0.3132460415363312, + "memory(GiB)": 78.33, + "step": 4843, + "token_acc": 0.9052049214169717, + "train_speed(iter/s)": 0.032366 + }, + { + "epoch": 0.9386232621227535, + "grad_norm": 0.09859292209148407, + "learning_rate": 3.0848827800265817e-06, + "loss": 0.3280273675918579, + "memory(GiB)": 78.33, + "step": 4844, + "token_acc": 0.900389837343729, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.9388170324080802, + "grad_norm": 0.10763488709926605, + "learning_rate": 3.0655169577761483e-06, + "loss": 0.3330923914909363, + "memory(GiB)": 78.33, + "step": 4845, + "token_acc": 0.8993531614620408, + "train_speed(iter/s)": 0.032367 + }, + { + "epoch": 0.939010802693407, + "grad_norm": 0.10462969541549683, + "learning_rate": 3.04621148546616e-06, + "loss": 0.34643369913101196, + "memory(GiB)": 78.33, + "step": 4846, + "token_acc": 0.8975769701011448, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.9392045729787337, + "grad_norm": 0.10669691115617752, + "learning_rate": 3.0269663710259405e-06, + "loss": 0.3267556130886078, + "memory(GiB)": 78.33, + "step": 4847, + "token_acc": 0.9032104437471986, + "train_speed(iter/s)": 0.032368 + }, + { + "epoch": 0.9393983432640605, + "grad_norm": 0.09605014324188232, + "learning_rate": 3.007781622359934e-06, + "loss": 0.31911730766296387, + "memory(GiB)": 78.33, + "step": 4848, + "token_acc": 0.9046072137571867, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.9395921135493872, + "grad_norm": 0.09630642086267471, + "learning_rate": 2.988657247347853e-06, + "loss": 0.31961339712142944, + "memory(GiB)": 78.33, + "step": 4849, + "token_acc": 0.9028399069553437, + "train_speed(iter/s)": 0.032369 + }, + { + "epoch": 0.939785883834714, + "grad_norm": 0.09434531629085541, + "learning_rate": 2.969593253844582e-06, + "loss": 0.29251518845558167, + "memory(GiB)": 78.33, + "step": 4850, + "token_acc": 0.9108287448821121, + "train_speed(iter/s)": 0.03237 + }, + { + "epoch": 0.9399796541200407, + "grad_norm": 0.09503661096096039, + "learning_rate": 2.950589649680224e-06, + "loss": 0.327890545129776, + "memory(GiB)": 78.33, + "step": 4851, + "token_acc": 0.9029296643340055, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.9401734244053674, + "grad_norm": 0.09823209792375565, + "learning_rate": 2.931646442660085e-06, + "loss": 0.3063351511955261, + "memory(GiB)": 78.33, + "step": 4852, + "token_acc": 0.9096577564030869, + "train_speed(iter/s)": 0.032371 + }, + { + "epoch": 0.9403671946906942, + "grad_norm": 0.10142441838979721, + "learning_rate": 2.912763640564608e-06, + "loss": 0.3455069363117218, + "memory(GiB)": 78.33, + "step": 4853, + "token_acc": 0.8964419326593966, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.9405609649760209, + "grad_norm": 0.09713559597730637, + "learning_rate": 2.8939412511495066e-06, + "loss": 0.3110349178314209, + "memory(GiB)": 78.33, + "step": 4854, + "token_acc": 0.9043400246719748, + "train_speed(iter/s)": 0.032372 + }, + { + "epoch": 0.9407547352613477, + "grad_norm": 0.09745633602142334, + "learning_rate": 2.875179282145612e-06, + "loss": 0.33706194162368774, + "memory(GiB)": 78.33, + "step": 4855, + "token_acc": 0.9004770564824333, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.9409485055466744, + "grad_norm": 0.10170414298772812, + "learning_rate": 2.8564777412589944e-06, + "loss": 0.33610811829566956, + "memory(GiB)": 78.33, + "step": 4856, + "token_acc": 0.9004509018036072, + "train_speed(iter/s)": 0.032373 + }, + { + "epoch": 0.9411422758320012, + "grad_norm": 0.09948156028985977, + "learning_rate": 2.8378366361708593e-06, + "loss": 0.3124832808971405, + "memory(GiB)": 78.33, + "step": 4857, + "token_acc": 0.9069445621169759, + "train_speed(iter/s)": 0.032374 + }, + { + "epoch": 0.9413360461173279, + "grad_norm": 0.10482876747846603, + "learning_rate": 2.8192559745376152e-06, + "loss": 0.35402458906173706, + "memory(GiB)": 78.33, + "step": 4858, + "token_acc": 0.8951886335504009, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.9415298164026547, + "grad_norm": 0.12244974076747894, + "learning_rate": 2.8007357639908743e-06, + "loss": 0.34525981545448303, + "memory(GiB)": 78.33, + "step": 4859, + "token_acc": 0.8989004930936265, + "train_speed(iter/s)": 0.032375 + }, + { + "epoch": 0.9417235866879814, + "grad_norm": 0.10524637997150421, + "learning_rate": 2.7822760121373187e-06, + "loss": 0.3427123427391052, + "memory(GiB)": 78.33, + "step": 4860, + "token_acc": 0.8977062523122457, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.9419173569733081, + "grad_norm": 0.10324079543352127, + "learning_rate": 2.7638767265589168e-06, + "loss": 0.3470621109008789, + "memory(GiB)": 78.33, + "step": 4861, + "token_acc": 0.8988676079966448, + "train_speed(iter/s)": 0.032376 + }, + { + "epoch": 0.9421111272586349, + "grad_norm": 0.10326438397169113, + "learning_rate": 2.7455379148127064e-06, + "loss": 0.3370799422264099, + "memory(GiB)": 78.33, + "step": 4862, + "token_acc": 0.8991185653891851, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.9423048975439616, + "grad_norm": 0.10224196314811707, + "learning_rate": 2.7272595844309797e-06, + "loss": 0.3339768052101135, + "memory(GiB)": 78.33, + "step": 4863, + "token_acc": 0.9005996573386637, + "train_speed(iter/s)": 0.032377 + }, + { + "epoch": 0.9424986678292884, + "grad_norm": 0.09687656909227371, + "learning_rate": 2.7090417429211143e-06, + "loss": 0.316772997379303, + "memory(GiB)": 78.33, + "step": 4864, + "token_acc": 0.9048740035311112, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.9426924381146151, + "grad_norm": 0.10042627900838852, + "learning_rate": 2.6908843977656415e-06, + "loss": 0.3032311499118805, + "memory(GiB)": 78.33, + "step": 4865, + "token_acc": 0.9067732763020391, + "train_speed(iter/s)": 0.032378 + }, + { + "epoch": 0.9428862083999419, + "grad_norm": 0.10065774619579315, + "learning_rate": 2.6727875564223287e-06, + "loss": 0.32765907049179077, + "memory(GiB)": 78.33, + "step": 4866, + "token_acc": 0.9011883622455948, + "train_speed(iter/s)": 0.032379 + }, + { + "epoch": 0.9430799786852686, + "grad_norm": 0.10036752372980118, + "learning_rate": 2.654751226323981e-06, + "loss": 0.3179859519004822, + "memory(GiB)": 78.33, + "step": 4867, + "token_acc": 0.9036099036099036, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.9432737489705953, + "grad_norm": 0.09917581081390381, + "learning_rate": 2.6367754148786225e-06, + "loss": 0.3333044648170471, + "memory(GiB)": 78.33, + "step": 4868, + "token_acc": 0.8983144059174526, + "train_speed(iter/s)": 0.03238 + }, + { + "epoch": 0.9434675192559221, + "grad_norm": 0.09930333495140076, + "learning_rate": 2.6188601294694135e-06, + "loss": 0.3436489999294281, + "memory(GiB)": 78.33, + "step": 4869, + "token_acc": 0.8961160505381376, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.9436612895412488, + "grad_norm": 0.10882118344306946, + "learning_rate": 2.601005377454635e-06, + "loss": 0.3218170702457428, + "memory(GiB)": 78.33, + "step": 4870, + "token_acc": 0.9025652352254441, + "train_speed(iter/s)": 0.032381 + }, + { + "epoch": 0.9438550598265756, + "grad_norm": 0.10568146407604218, + "learning_rate": 2.5832111661677203e-06, + "loss": 0.3271356225013733, + "memory(GiB)": 78.33, + "step": 4871, + "token_acc": 0.9018029976218129, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.9440488301119023, + "grad_norm": 0.11422929167747498, + "learning_rate": 2.5654775029171903e-06, + "loss": 0.3454614281654358, + "memory(GiB)": 78.33, + "step": 4872, + "token_acc": 0.8961740297778863, + "train_speed(iter/s)": 0.032382 + }, + { + "epoch": 0.9442426003972291, + "grad_norm": 0.10132316499948502, + "learning_rate": 2.547804394986819e-06, + "loss": 0.3422181010246277, + "memory(GiB)": 78.33, + "step": 4873, + "token_acc": 0.8978968880151073, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.9444363706825558, + "grad_norm": 0.09934362024068832, + "learning_rate": 2.5301918496353322e-06, + "loss": 0.31417742371559143, + "memory(GiB)": 78.33, + "step": 4874, + "token_acc": 0.9052588444201611, + "train_speed(iter/s)": 0.032383 + }, + { + "epoch": 0.9446301409678826, + "grad_norm": 0.10042758285999298, + "learning_rate": 2.5126398740967446e-06, + "loss": 0.3282407522201538, + "memory(GiB)": 78.33, + "step": 4875, + "token_acc": 0.901354957914186, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.9448239112532093, + "grad_norm": 0.09526004642248154, + "learning_rate": 2.4951484755800886e-06, + "loss": 0.29993996024131775, + "memory(GiB)": 78.33, + "step": 4876, + "token_acc": 0.9110852110852111, + "train_speed(iter/s)": 0.032384 + }, + { + "epoch": 0.945017681538536, + "grad_norm": 0.21196365356445312, + "learning_rate": 2.4777176612695513e-06, + "loss": 0.3271010220050812, + "memory(GiB)": 78.33, + "step": 4877, + "token_acc": 0.9029842588543944, + "train_speed(iter/s)": 0.032385 + }, + { + "epoch": 0.9452114518238628, + "grad_norm": 0.10081423819065094, + "learning_rate": 2.4603474383244724e-06, + "loss": 0.32736343145370483, + "memory(GiB)": 78.33, + "step": 4878, + "token_acc": 0.904177119187912, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.9454052221091895, + "grad_norm": 0.10198832303285599, + "learning_rate": 2.4430378138792282e-06, + "loss": 0.3308171331882477, + "memory(GiB)": 78.33, + "step": 4879, + "token_acc": 0.899286116294914, + "train_speed(iter/s)": 0.032386 + }, + { + "epoch": 0.9455989923945163, + "grad_norm": 0.10578960925340652, + "learning_rate": 2.425788795043382e-06, + "loss": 0.3238624930381775, + "memory(GiB)": 78.33, + "step": 4880, + "token_acc": 0.904650030083374, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.945792762679843, + "grad_norm": 0.09681179374456406, + "learning_rate": 2.4086003889015326e-06, + "loss": 0.2874050438404083, + "memory(GiB)": 78.33, + "step": 4881, + "token_acc": 0.9121931027345872, + "train_speed(iter/s)": 0.032387 + }, + { + "epoch": 0.9459865329651698, + "grad_norm": 0.10076259076595306, + "learning_rate": 2.3914726025134335e-06, + "loss": 0.3133833706378937, + "memory(GiB)": 78.33, + "step": 4882, + "token_acc": 0.9046173554639635, + "train_speed(iter/s)": 0.032388 + }, + { + "epoch": 0.9461803032504965, + "grad_norm": 0.09847967326641083, + "learning_rate": 2.3744054429139402e-06, + "loss": 0.31684496998786926, + "memory(GiB)": 78.33, + "step": 4883, + "token_acc": 0.903607284929505, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.9463740735358233, + "grad_norm": 0.09512414038181305, + "learning_rate": 2.3573989171129792e-06, + "loss": 0.3404044806957245, + "memory(GiB)": 78.33, + "step": 4884, + "token_acc": 0.8973556187081547, + "train_speed(iter/s)": 0.032389 + }, + { + "epoch": 0.94656784382115, + "grad_norm": 0.111565001308918, + "learning_rate": 2.340453032095613e-06, + "loss": 0.36841219663619995, + "memory(GiB)": 78.33, + "step": 4885, + "token_acc": 0.8914011037920598, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.9467616141064767, + "grad_norm": 0.09721893817186356, + "learning_rate": 2.3235677948219234e-06, + "loss": 0.3191075921058655, + "memory(GiB)": 78.33, + "step": 4886, + "token_acc": 0.9030001088968747, + "train_speed(iter/s)": 0.03239 + }, + { + "epoch": 0.9469553843918035, + "grad_norm": 0.09971962869167328, + "learning_rate": 2.3067432122271966e-06, + "loss": 0.33436092734336853, + "memory(GiB)": 78.33, + "step": 4887, + "token_acc": 0.8994865769240593, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.9471491546771302, + "grad_norm": 0.103705994784832, + "learning_rate": 2.289979291221672e-06, + "loss": 0.3510754108428955, + "memory(GiB)": 78.33, + "step": 4888, + "token_acc": 0.8964129530759531, + "train_speed(iter/s)": 0.032391 + }, + { + "epoch": 0.947342924962457, + "grad_norm": 0.10477303713560104, + "learning_rate": 2.273276038690791e-06, + "loss": 0.3268052637577057, + "memory(GiB)": 78.33, + "step": 4889, + "token_acc": 0.9006867252517022, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.9475366952477837, + "grad_norm": 0.10907679796218872, + "learning_rate": 2.256633461495e-06, + "loss": 0.35595622658729553, + "memory(GiB)": 78.33, + "step": 4890, + "token_acc": 0.8929480987090542, + "train_speed(iter/s)": 0.032392 + }, + { + "epoch": 0.9477304655331105, + "grad_norm": 0.09888530522584915, + "learning_rate": 2.240051566469864e-06, + "loss": 0.3075358271598816, + "memory(GiB)": 78.33, + "step": 4891, + "token_acc": 0.9061078252957234, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.9479242358184372, + "grad_norm": 0.09880520403385162, + "learning_rate": 2.2235303604260347e-06, + "loss": 0.34165963530540466, + "memory(GiB)": 78.33, + "step": 4892, + "token_acc": 0.8993901612684646, + "train_speed(iter/s)": 0.032393 + }, + { + "epoch": 0.948118006103764, + "grad_norm": 0.09279303252696991, + "learning_rate": 2.207069850149168e-06, + "loss": 0.31139740347862244, + "memory(GiB)": 78.33, + "step": 4893, + "token_acc": 0.906871677108554, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.9483117763890907, + "grad_norm": 0.10540986061096191, + "learning_rate": 2.190670042400089e-06, + "loss": 0.3575151264667511, + "memory(GiB)": 78.33, + "step": 4894, + "token_acc": 0.8956390443200994, + "train_speed(iter/s)": 0.032394 + }, + { + "epoch": 0.9485055466744174, + "grad_norm": 0.10701259970664978, + "learning_rate": 2.174330943914593e-06, + "loss": 0.3582616150379181, + "memory(GiB)": 78.33, + "step": 4895, + "token_acc": 0.8956076759061834, + "train_speed(iter/s)": 0.032395 + }, + { + "epoch": 0.9486993169597442, + "grad_norm": 0.09107998758554459, + "learning_rate": 2.1580525614036115e-06, + "loss": 0.325961172580719, + "memory(GiB)": 78.33, + "step": 4896, + "token_acc": 0.9035096153846154, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.9488930872450709, + "grad_norm": 0.09532765299081802, + "learning_rate": 2.141834901553113e-06, + "loss": 0.304240345954895, + "memory(GiB)": 78.33, + "step": 4897, + "token_acc": 0.9070364131139044, + "train_speed(iter/s)": 0.032396 + }, + { + "epoch": 0.9490868575303977, + "grad_norm": 0.0961633175611496, + "learning_rate": 2.12567797102412e-06, + "loss": 0.317226380109787, + "memory(GiB)": 78.33, + "step": 4898, + "token_acc": 0.9035757011830386, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.9492806278157244, + "grad_norm": 0.092696912586689, + "learning_rate": 2.10958177645274e-06, + "loss": 0.31177228689193726, + "memory(GiB)": 78.33, + "step": 4899, + "token_acc": 0.9065967318942909, + "train_speed(iter/s)": 0.032397 + }, + { + "epoch": 0.9494743981010512, + "grad_norm": 0.09380333125591278, + "learning_rate": 2.0935463244500683e-06, + "loss": 0.3175070583820343, + "memory(GiB)": 78.33, + "step": 4900, + "token_acc": 0.9043746832999542, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.9496681683863779, + "grad_norm": 0.09854871779680252, + "learning_rate": 2.077571621602353e-06, + "loss": 0.3263704180717468, + "memory(GiB)": 78.33, + "step": 4901, + "token_acc": 0.9024825089818742, + "train_speed(iter/s)": 0.032398 + }, + { + "epoch": 0.9498619386717047, + "grad_norm": 0.11277089267969131, + "learning_rate": 2.0616576744707624e-06, + "loss": 0.35801947116851807, + "memory(GiB)": 78.33, + "step": 4902, + "token_acc": 0.895363334124911, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.9500557089570314, + "grad_norm": 0.09970259666442871, + "learning_rate": 2.0458044895916513e-06, + "loss": 0.31966596841812134, + "memory(GiB)": 78.33, + "step": 4903, + "token_acc": 0.9028877820418606, + "train_speed(iter/s)": 0.032399 + }, + { + "epoch": 0.9502494792423581, + "grad_norm": 0.10125202685594559, + "learning_rate": 2.0300120734763113e-06, + "loss": 0.3093150556087494, + "memory(GiB)": 78.33, + "step": 4904, + "token_acc": 0.905799933852491, + "train_speed(iter/s)": 0.0324 + }, + { + "epoch": 0.9504432495276849, + "grad_norm": 0.09156003594398499, + "learning_rate": 2.014280432611104e-06, + "loss": 0.29941630363464355, + "memory(GiB)": 78.33, + "step": 4905, + "token_acc": 0.9120291488636662, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.9506370198130116, + "grad_norm": 0.10564015060663223, + "learning_rate": 1.998609573457477e-06, + "loss": 0.3411896824836731, + "memory(GiB)": 78.33, + "step": 4906, + "token_acc": 0.8986840243384746, + "train_speed(iter/s)": 0.032401 + }, + { + "epoch": 0.9508307900983384, + "grad_norm": 0.10158051550388336, + "learning_rate": 1.982999502451832e-06, + "loss": 0.33457452058792114, + "memory(GiB)": 78.33, + "step": 4907, + "token_acc": 0.8990842883153047, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.9510245603836651, + "grad_norm": 0.09677699208259583, + "learning_rate": 1.9674502260056733e-06, + "loss": 0.3045703172683716, + "memory(GiB)": 78.33, + "step": 4908, + "token_acc": 0.9080135296660817, + "train_speed(iter/s)": 0.032402 + }, + { + "epoch": 0.9512183306689919, + "grad_norm": 0.09688906371593475, + "learning_rate": 1.9519617505055098e-06, + "loss": 0.3192395865917206, + "memory(GiB)": 78.33, + "step": 4909, + "token_acc": 0.9055065341226405, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.9514121009543186, + "grad_norm": 0.1135900616645813, + "learning_rate": 1.936534082312835e-06, + "loss": 0.35068824887275696, + "memory(GiB)": 78.33, + "step": 4910, + "token_acc": 0.898571652801758, + "train_speed(iter/s)": 0.032403 + }, + { + "epoch": 0.9516058712396454, + "grad_norm": 0.09443049877882004, + "learning_rate": 1.9211672277642475e-06, + "loss": 0.30319830775260925, + "memory(GiB)": 78.33, + "step": 4911, + "token_acc": 0.906641655112416, + "train_speed(iter/s)": 0.032404 + }, + { + "epoch": 0.9517996415249721, + "grad_norm": 0.10316184908151627, + "learning_rate": 1.9058611931712986e-06, + "loss": 0.31675225496292114, + "memory(GiB)": 78.33, + "step": 4912, + "token_acc": 0.9031579571039222, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.9519934118102988, + "grad_norm": 0.10155156254768372, + "learning_rate": 1.8906159848206092e-06, + "loss": 0.30963414907455444, + "memory(GiB)": 78.33, + "step": 4913, + "token_acc": 0.905279359704479, + "train_speed(iter/s)": 0.032405 + }, + { + "epoch": 0.9521871820956256, + "grad_norm": 0.11472687125205994, + "learning_rate": 1.8754316089737876e-06, + "loss": 0.3777286112308502, + "memory(GiB)": 78.33, + "step": 4914, + "token_acc": 0.8904687163389038, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 0.09569194912910461, + "learning_rate": 1.8603080718674612e-06, + "loss": 0.3033876419067383, + "memory(GiB)": 78.33, + "step": 4915, + "token_acc": 0.9085435990308607, + "train_speed(iter/s)": 0.032406 + }, + { + "epoch": 0.9525747226662791, + "grad_norm": 0.09913970530033112, + "learning_rate": 1.8452453797132948e-06, + "loss": 0.3156486749649048, + "memory(GiB)": 78.33, + "step": 4916, + "token_acc": 0.9022938238862399, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.9527684929516058, + "grad_norm": 0.09308433532714844, + "learning_rate": 1.8302435386978897e-06, + "loss": 0.30741560459136963, + "memory(GiB)": 78.33, + "step": 4917, + "token_acc": 0.907486671172186, + "train_speed(iter/s)": 0.032407 + }, + { + "epoch": 0.9529622632369327, + "grad_norm": 0.09139852225780487, + "learning_rate": 1.8153025549829836e-06, + "loss": 0.32832008600234985, + "memory(GiB)": 78.33, + "step": 4918, + "token_acc": 0.9023957409050577, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.9531560335222594, + "grad_norm": 0.11089830100536346, + "learning_rate": 1.800422434705151e-06, + "loss": 0.3461150527000427, + "memory(GiB)": 78.33, + "step": 4919, + "token_acc": 0.8961630109366407, + "train_speed(iter/s)": 0.032408 + }, + { + "epoch": 0.9533498038075862, + "grad_norm": 0.10732929408550262, + "learning_rate": 1.7856031839761363e-06, + "loss": 0.3544648289680481, + "memory(GiB)": 78.33, + "step": 4920, + "token_acc": 0.8927505075839006, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.9535435740929129, + "grad_norm": 0.10486234724521637, + "learning_rate": 1.7708448088825545e-06, + "loss": 0.341609388589859, + "memory(GiB)": 78.33, + "step": 4921, + "token_acc": 0.8979067310163635, + "train_speed(iter/s)": 0.032409 + }, + { + "epoch": 0.9537373443782396, + "grad_norm": 0.10882783681154251, + "learning_rate": 1.7561473154860728e-06, + "loss": 0.34817302227020264, + "memory(GiB)": 78.33, + "step": 4922, + "token_acc": 0.8951537080828513, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.9539311146635664, + "grad_norm": 0.09959416091442108, + "learning_rate": 1.7415107098233628e-06, + "loss": 0.3298611044883728, + "memory(GiB)": 78.33, + "step": 4923, + "token_acc": 0.9004360465116279, + "train_speed(iter/s)": 0.03241 + }, + { + "epoch": 0.9541248849488931, + "grad_norm": 0.08811165392398834, + "learning_rate": 1.7269349979060654e-06, + "loss": 0.28269749879837036, + "memory(GiB)": 78.33, + "step": 4924, + "token_acc": 0.9127474034881442, + "train_speed(iter/s)": 0.032411 + }, + { + "epoch": 0.9543186552342199, + "grad_norm": 0.10352582484483719, + "learning_rate": 1.7124201857208252e-06, + "loss": 0.3340131640434265, + "memory(GiB)": 78.33, + "step": 4925, + "token_acc": 0.8997261368174816, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.9545124255195466, + "grad_norm": 0.10261107236146927, + "learning_rate": 1.6979662792292404e-06, + "loss": 0.32005396485328674, + "memory(GiB)": 78.33, + "step": 4926, + "token_acc": 0.9025667147748818, + "train_speed(iter/s)": 0.032412 + }, + { + "epoch": 0.9547061958048734, + "grad_norm": 0.09662662446498871, + "learning_rate": 1.6835732843679451e-06, + "loss": 0.32958412170410156, + "memory(GiB)": 78.33, + "step": 4927, + "token_acc": 0.9002728335715213, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.9548999660902001, + "grad_norm": 0.09711389243602753, + "learning_rate": 1.6692412070485106e-06, + "loss": 0.3142922818660736, + "memory(GiB)": 78.33, + "step": 4928, + "token_acc": 0.904241246186658, + "train_speed(iter/s)": 0.032413 + }, + { + "epoch": 0.9550937363755269, + "grad_norm": 0.10424994677305222, + "learning_rate": 1.6549700531575284e-06, + "loss": 0.335157573223114, + "memory(GiB)": 78.33, + "step": 4929, + "token_acc": 0.899737302977233, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.9552875066608536, + "grad_norm": 0.09868663549423218, + "learning_rate": 1.6407598285565093e-06, + "loss": 0.32267120480537415, + "memory(GiB)": 78.33, + "step": 4930, + "token_acc": 0.9046480108427011, + "train_speed(iter/s)": 0.032414 + }, + { + "epoch": 0.9554812769461803, + "grad_norm": 0.09154357016086578, + "learning_rate": 1.6266105390820017e-06, + "loss": 0.31675025820732117, + "memory(GiB)": 78.33, + "step": 4931, + "token_acc": 0.903805316214699, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.9556750472315071, + "grad_norm": 0.09340998530387878, + "learning_rate": 1.6125221905455231e-06, + "loss": 0.30397865176200867, + "memory(GiB)": 78.33, + "step": 4932, + "token_acc": 0.9084064579411999, + "train_speed(iter/s)": 0.032415 + }, + { + "epoch": 0.9558688175168338, + "grad_norm": 0.10670307278633118, + "learning_rate": 1.598494788733462e-06, + "loss": 0.32941100001335144, + "memory(GiB)": 78.33, + "step": 4933, + "token_acc": 0.9001266174608566, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.9560625878021606, + "grad_norm": 0.09027762711048126, + "learning_rate": 1.584528339407326e-06, + "loss": 0.2942469120025635, + "memory(GiB)": 78.33, + "step": 4934, + "token_acc": 0.9111908000494621, + "train_speed(iter/s)": 0.032416 + }, + { + "epoch": 0.9562563580874873, + "grad_norm": 0.09194032102823257, + "learning_rate": 1.57062284830346e-06, + "loss": 0.303377628326416, + "memory(GiB)": 78.33, + "step": 4935, + "token_acc": 0.9077935247705968, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.9564501283728141, + "grad_norm": 0.13369497656822205, + "learning_rate": 1.5567783211332619e-06, + "loss": 0.3248521685600281, + "memory(GiB)": 78.33, + "step": 4936, + "token_acc": 0.9012696041822256, + "train_speed(iter/s)": 0.032417 + }, + { + "epoch": 0.9566438986581408, + "grad_norm": 0.0936022698879242, + "learning_rate": 1.5429947635830164e-06, + "loss": 0.3220188021659851, + "memory(GiB)": 78.33, + "step": 4937, + "token_acc": 0.9041003304295584, + "train_speed(iter/s)": 0.032418 + }, + { + "epoch": 0.9568376689434676, + "grad_norm": 0.09517528116703033, + "learning_rate": 1.529272181314012e-06, + "loss": 0.3031097650527954, + "memory(GiB)": 78.33, + "step": 4938, + "token_acc": 0.9060737583491436, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.9570314392287943, + "grad_norm": 0.09666939824819565, + "learning_rate": 1.5156105799625063e-06, + "loss": 0.3083738684654236, + "memory(GiB)": 78.33, + "step": 4939, + "token_acc": 0.9067667594099428, + "train_speed(iter/s)": 0.032419 + }, + { + "epoch": 0.957225209514121, + "grad_norm": 0.09546789526939392, + "learning_rate": 1.5020099651396444e-06, + "loss": 0.3306988775730133, + "memory(GiB)": 78.33, + "step": 4940, + "token_acc": 0.9035588697408784, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.9574189797994478, + "grad_norm": 0.10004635155200958, + "learning_rate": 1.4884703424315915e-06, + "loss": 0.3262169063091278, + "memory(GiB)": 78.33, + "step": 4941, + "token_acc": 0.9031863057731064, + "train_speed(iter/s)": 0.03242 + }, + { + "epoch": 0.9576127500847745, + "grad_norm": 0.09645616263151169, + "learning_rate": 1.474991717399432e-06, + "loss": 0.2961626946926117, + "memory(GiB)": 78.33, + "step": 4942, + "token_acc": 0.9100462809072081, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.9578065203701013, + "grad_norm": 0.09985015541315079, + "learning_rate": 1.4615740955792044e-06, + "loss": 0.3318878412246704, + "memory(GiB)": 78.33, + "step": 4943, + "token_acc": 0.9009440305532396, + "train_speed(iter/s)": 0.032421 + }, + { + "epoch": 0.958000290655428, + "grad_norm": 0.10011833906173706, + "learning_rate": 1.4482174824818671e-06, + "loss": 0.337127149105072, + "memory(GiB)": 78.33, + "step": 4944, + "token_acc": 0.8997083603370059, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.9581940609407548, + "grad_norm": 0.10610119998455048, + "learning_rate": 1.4349218835933486e-06, + "loss": 0.35346800088882446, + "memory(GiB)": 78.33, + "step": 4945, + "token_acc": 0.8942115189322501, + "train_speed(iter/s)": 0.032422 + }, + { + "epoch": 0.9583878312260815, + "grad_norm": 0.09986083954572678, + "learning_rate": 1.4216873043745137e-06, + "loss": 0.3105732202529907, + "memory(GiB)": 78.33, + "step": 4946, + "token_acc": 0.9050974597942831, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.9585816015114083, + "grad_norm": 0.10908082872629166, + "learning_rate": 1.4085137502611477e-06, + "loss": 0.3552319407463074, + "memory(GiB)": 78.33, + "step": 4947, + "token_acc": 0.8946395037842144, + "train_speed(iter/s)": 0.032423 + }, + { + "epoch": 0.958775371796735, + "grad_norm": 0.09129244834184647, + "learning_rate": 1.3954012266640059e-06, + "loss": 0.27050110697746277, + "memory(GiB)": 78.33, + "step": 4948, + "token_acc": 0.9176398959428793, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.9589691420820617, + "grad_norm": 0.10130368173122406, + "learning_rate": 1.3823497389687466e-06, + "loss": 0.3376561403274536, + "memory(GiB)": 78.33, + "step": 4949, + "token_acc": 0.8982817684028808, + "train_speed(iter/s)": 0.032424 + }, + { + "epoch": 0.9591629123673885, + "grad_norm": 0.10016176849603653, + "learning_rate": 1.369359292535932e-06, + "loss": 0.3218695819377899, + "memory(GiB)": 78.33, + "step": 4950, + "token_acc": 0.9014069086330542, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.9593566826527152, + "grad_norm": 0.09046775102615356, + "learning_rate": 1.356429892701144e-06, + "loss": 0.30442506074905396, + "memory(GiB)": 78.33, + "step": 4951, + "token_acc": 0.9069761971963496, + "train_speed(iter/s)": 0.032425 + }, + { + "epoch": 0.959550452938042, + "grad_norm": 0.09234574437141418, + "learning_rate": 1.343561544774785e-06, + "loss": 0.30501365661621094, + "memory(GiB)": 78.33, + "step": 4952, + "token_acc": 0.9067395915863726, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.9597442232233687, + "grad_norm": 0.09642866253852844, + "learning_rate": 1.3307542540422766e-06, + "loss": 0.29419055581092834, + "memory(GiB)": 78.33, + "step": 4953, + "token_acc": 0.9106110240706452, + "train_speed(iter/s)": 0.032426 + }, + { + "epoch": 0.9599379935086955, + "grad_norm": 0.10034112632274628, + "learning_rate": 1.3180080257638782e-06, + "loss": 0.339307963848114, + "memory(GiB)": 78.33, + "step": 4954, + "token_acc": 0.8978796816739941, + "train_speed(iter/s)": 0.032427 + }, + { + "epoch": 0.9601317637940222, + "grad_norm": 0.10334701836109161, + "learning_rate": 1.3053228651748349e-06, + "loss": 0.3403671681880951, + "memory(GiB)": 78.33, + "step": 4955, + "token_acc": 0.8992078746578135, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.960325534079349, + "grad_norm": 0.10150493681430817, + "learning_rate": 1.2926987774852627e-06, + "loss": 0.3322233259677887, + "memory(GiB)": 78.33, + "step": 4956, + "token_acc": 0.9011913901447297, + "train_speed(iter/s)": 0.032428 + }, + { + "epoch": 0.9605193043646757, + "grad_norm": 0.09975534677505493, + "learning_rate": 1.2801357678802138e-06, + "loss": 0.3389909267425537, + "memory(GiB)": 78.33, + "step": 4957, + "token_acc": 0.8986236035148656, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.9607130746500024, + "grad_norm": 0.10334469377994537, + "learning_rate": 1.2676338415196774e-06, + "loss": 0.34487882256507874, + "memory(GiB)": 78.33, + "step": 4958, + "token_acc": 0.897090561398716, + "train_speed(iter/s)": 0.032429 + }, + { + "epoch": 0.9609068449353292, + "grad_norm": 0.10029633343219757, + "learning_rate": 1.2551930035385126e-06, + "loss": 0.32758665084838867, + "memory(GiB)": 78.33, + "step": 4959, + "token_acc": 0.9024535719127016, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.9611006152206559, + "grad_norm": 0.1052929162979126, + "learning_rate": 1.2428132590465156e-06, + "loss": 0.3499680757522583, + "memory(GiB)": 78.33, + "step": 4960, + "token_acc": 0.8937179730499146, + "train_speed(iter/s)": 0.03243 + }, + { + "epoch": 0.9612943855059827, + "grad_norm": 0.10789167135953903, + "learning_rate": 1.2304946131283521e-06, + "loss": 0.347482293844223, + "memory(GiB)": 78.33, + "step": 4961, + "token_acc": 0.8981575675370183, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.9614881557913094, + "grad_norm": 0.09253862500190735, + "learning_rate": 1.2182370708436584e-06, + "loss": 0.2795042097568512, + "memory(GiB)": 78.33, + "step": 4962, + "token_acc": 0.9132865314612585, + "train_speed(iter/s)": 0.032431 + }, + { + "epoch": 0.9616819260766362, + "grad_norm": 0.11232099682092667, + "learning_rate": 1.2060406372269238e-06, + "loss": 0.3673211932182312, + "memory(GiB)": 78.33, + "step": 4963, + "token_acc": 0.8896090319882919, + "train_speed(iter/s)": 0.032432 + }, + { + "epoch": 0.9618756963619629, + "grad_norm": 0.09660880267620087, + "learning_rate": 1.1939053172875245e-06, + "loss": 0.31402865052223206, + "memory(GiB)": 78.33, + "step": 4964, + "token_acc": 0.9040212859337283, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.9620694666472897, + "grad_norm": 0.10611861944198608, + "learning_rate": 1.1818311160098237e-06, + "loss": 0.3377360999584198, + "memory(GiB)": 78.33, + "step": 4965, + "token_acc": 0.9000355008579374, + "train_speed(iter/s)": 0.032433 + }, + { + "epoch": 0.9622632369326164, + "grad_norm": 0.1003570705652237, + "learning_rate": 1.1698180383529542e-06, + "loss": 0.32752174139022827, + "memory(GiB)": 78.33, + "step": 4966, + "token_acc": 0.9013712208822072, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.9624570072179431, + "grad_norm": 0.08952160179615021, + "learning_rate": 1.1578660892510528e-06, + "loss": 0.2788778245449066, + "memory(GiB)": 78.33, + "step": 4967, + "token_acc": 0.9154586305821423, + "train_speed(iter/s)": 0.032434 + }, + { + "epoch": 0.9626507775032699, + "grad_norm": 0.10649926215410233, + "learning_rate": 1.1459752736130756e-06, + "loss": 0.3794569969177246, + "memory(GiB)": 78.33, + "step": 4968, + "token_acc": 0.888167308750688, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.9628445477885966, + "grad_norm": 0.08885262906551361, + "learning_rate": 1.1341455963229329e-06, + "loss": 0.29271164536476135, + "memory(GiB)": 78.33, + "step": 4969, + "token_acc": 0.9096743030637593, + "train_speed(iter/s)": 0.032435 + }, + { + "epoch": 0.9630383180739234, + "grad_norm": 0.10036831349134445, + "learning_rate": 1.1223770622393714e-06, + "loss": 0.3443622887134552, + "memory(GiB)": 78.33, + "step": 4970, + "token_acc": 0.8970222654561176, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.9632320883592501, + "grad_norm": 0.09779267013072968, + "learning_rate": 1.110669676196041e-06, + "loss": 0.3445608615875244, + "memory(GiB)": 78.33, + "step": 4971, + "token_acc": 0.8951095773995004, + "train_speed(iter/s)": 0.032436 + }, + { + "epoch": 0.9634258586445769, + "grad_norm": 0.10155529528856277, + "learning_rate": 1.0990234430014954e-06, + "loss": 0.3322911262512207, + "memory(GiB)": 78.33, + "step": 4972, + "token_acc": 0.9017332921313593, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.9636196289299036, + "grad_norm": 0.09281053394079208, + "learning_rate": 1.087438367439125e-06, + "loss": 0.29993924498558044, + "memory(GiB)": 78.33, + "step": 4973, + "token_acc": 0.9089661368003393, + "train_speed(iter/s)": 0.032437 + }, + { + "epoch": 0.9638133992152303, + "grad_norm": 0.09526029974222183, + "learning_rate": 1.0759144542672737e-06, + "loss": 0.29557523131370544, + "memory(GiB)": 78.33, + "step": 4974, + "token_acc": 0.9109068897204491, + "train_speed(iter/s)": 0.032438 + }, + { + "epoch": 0.9640071695005571, + "grad_norm": 0.10219217091798782, + "learning_rate": 1.0644517082190883e-06, + "loss": 0.3419698476791382, + "memory(GiB)": 78.33, + "step": 4975, + "token_acc": 0.8975590462199119, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.9642009397858838, + "grad_norm": 0.09392113983631134, + "learning_rate": 1.0530501340026532e-06, + "loss": 0.3047352135181427, + "memory(GiB)": 78.33, + "step": 4976, + "token_acc": 0.9068194872330426, + "train_speed(iter/s)": 0.032439 + }, + { + "epoch": 0.9643947100712106, + "grad_norm": 0.10133232921361923, + "learning_rate": 1.0417097363008886e-06, + "loss": 0.32953399419784546, + "memory(GiB)": 78.33, + "step": 4977, + "token_acc": 0.9026956897794436, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.9645884803565373, + "grad_norm": 0.09701909869909286, + "learning_rate": 1.030430519771569e-06, + "loss": 0.3382679224014282, + "memory(GiB)": 78.33, + "step": 4978, + "token_acc": 0.8983569375214284, + "train_speed(iter/s)": 0.03244 + }, + { + "epoch": 0.9647822506418641, + "grad_norm": 0.09171733260154724, + "learning_rate": 1.0192124890474385e-06, + "loss": 0.3097023665904999, + "memory(GiB)": 78.33, + "step": 4979, + "token_acc": 0.9057717083225972, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.9649760209271908, + "grad_norm": 0.0920836329460144, + "learning_rate": 1.0080556487359947e-06, + "loss": 0.30298176407814026, + "memory(GiB)": 78.33, + "step": 4980, + "token_acc": 0.9079930043486482, + "train_speed(iter/s)": 0.032441 + }, + { + "epoch": 0.9651697912125176, + "grad_norm": 0.09740208089351654, + "learning_rate": 9.969600034196557e-07, + "loss": 0.3513997197151184, + "memory(GiB)": 78.33, + "step": 4981, + "token_acc": 0.8963819470346885, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.9653635614978443, + "grad_norm": 0.09901650995016098, + "learning_rate": 9.859255576557257e-07, + "loss": 0.32587093114852905, + "memory(GiB)": 78.33, + "step": 4982, + "token_acc": 0.9019118199881854, + "train_speed(iter/s)": 0.032442 + }, + { + "epoch": 0.965557331783171, + "grad_norm": 0.10267394781112671, + "learning_rate": 9.749523159763295e-07, + "loss": 0.32968056201934814, + "memory(GiB)": 78.33, + "step": 4983, + "token_acc": 0.9018928833455613, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.9657511020684978, + "grad_norm": 0.0950947031378746, + "learning_rate": 9.64040282888462e-07, + "loss": 0.32611650228500366, + "memory(GiB)": 78.33, + "step": 4984, + "token_acc": 0.9009006650259203, + "train_speed(iter/s)": 0.032443 + }, + { + "epoch": 0.9659448723538245, + "grad_norm": 0.10337654501199722, + "learning_rate": 9.531894628740044e-07, + "loss": 0.3508460521697998, + "memory(GiB)": 78.33, + "step": 4985, + "token_acc": 0.8948303758520995, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.9661386426391513, + "grad_norm": 0.08894824236631393, + "learning_rate": 9.423998603896921e-07, + "loss": 0.29168346524238586, + "memory(GiB)": 78.33, + "step": 4986, + "token_acc": 0.9120740535223346, + "train_speed(iter/s)": 0.032444 + }, + { + "epoch": 0.966332412924478, + "grad_norm": 0.0989081859588623, + "learning_rate": 9.316714798670799e-07, + "loss": 0.32772505283355713, + "memory(GiB)": 78.33, + "step": 4987, + "token_acc": 0.899692881430295, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.9665261832098048, + "grad_norm": 0.08811601996421814, + "learning_rate": 9.210043257126098e-07, + "loss": 0.289480984210968, + "memory(GiB)": 78.33, + "step": 4988, + "token_acc": 0.9128572101125182, + "train_speed(iter/s)": 0.032445 + }, + { + "epoch": 0.9667199534951315, + "grad_norm": 0.10313121974468231, + "learning_rate": 9.103984023075772e-07, + "loss": 0.35241398215293884, + "memory(GiB)": 78.33, + "step": 4989, + "token_acc": 0.8971772553485724, + "train_speed(iter/s)": 0.032446 + }, + { + "epoch": 0.9669137237804583, + "grad_norm": 0.09983363002538681, + "learning_rate": 8.998537140081141e-07, + "loss": 0.31030166149139404, + "memory(GiB)": 78.33, + "step": 4990, + "token_acc": 0.9058147247402166, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.967107494065785, + "grad_norm": 0.10593412816524506, + "learning_rate": 8.893702651452062e-07, + "loss": 0.3430192768573761, + "memory(GiB)": 78.33, + "step": 4991, + "token_acc": 0.8970623145400594, + "train_speed(iter/s)": 0.032447 + }, + { + "epoch": 0.9673012643511117, + "grad_norm": 0.09164869040250778, + "learning_rate": 8.789480600246757e-07, + "loss": 0.3059519827365875, + "memory(GiB)": 78.33, + "step": 4992, + "token_acc": 0.9062849909936225, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.9674950346364385, + "grad_norm": 0.11753620952367783, + "learning_rate": 8.685871029272318e-07, + "loss": 0.35421380400657654, + "memory(GiB)": 78.33, + "step": 4993, + "token_acc": 0.8949581180397317, + "train_speed(iter/s)": 0.032448 + }, + { + "epoch": 0.9676888049217652, + "grad_norm": 0.09262137115001678, + "learning_rate": 8.582873981083705e-07, + "loss": 0.29231324791908264, + "memory(GiB)": 78.33, + "step": 4994, + "token_acc": 0.9102221280876863, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.967882575207092, + "grad_norm": 0.09335288405418396, + "learning_rate": 8.480489497984744e-07, + "loss": 0.3362676203250885, + "memory(GiB)": 78.33, + "step": 4995, + "token_acc": 0.8979033950843279, + "train_speed(iter/s)": 0.032449 + }, + { + "epoch": 0.9680763454924187, + "grad_norm": 0.12949591875076294, + "learning_rate": 8.378717622027465e-07, + "loss": 0.32554298639297485, + "memory(GiB)": 78.33, + "step": 4996, + "token_acc": 0.9030780971762911, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.9682701157777455, + "grad_norm": 0.10350757837295532, + "learning_rate": 8.277558395012096e-07, + "loss": 0.335059255361557, + "memory(GiB)": 78.33, + "step": 4997, + "token_acc": 0.8990599887317261, + "train_speed(iter/s)": 0.03245 + }, + { + "epoch": 0.9684638860630722, + "grad_norm": 0.11247258633375168, + "learning_rate": 8.177011858487903e-07, + "loss": 0.3429482579231262, + "memory(GiB)": 78.33, + "step": 4998, + "token_acc": 0.895786360575093, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.968657656348399, + "grad_norm": 0.10709885507822037, + "learning_rate": 8.077078053751518e-07, + "loss": 0.3340757191181183, + "memory(GiB)": 78.33, + "step": 4999, + "token_acc": 0.8992043255199533, + "train_speed(iter/s)": 0.032451 + }, + { + "epoch": 0.9688514266337257, + "grad_norm": 0.09967630356550217, + "learning_rate": 7.97775702184894e-07, + "loss": 0.30826178193092346, + "memory(GiB)": 78.33, + "step": 5000, + "token_acc": 0.9074506820281506, + "train_speed(iter/s)": 0.032452 + }, + { + "epoch": 0.9688514266337257, + "eval_loss": 0.3782345950603485, + "eval_runtime": 1344.6451, + "eval_samples_per_second": 5.019, + "eval_steps_per_second": 5.019, + "eval_token_acc": 0.9026852655344548, + "step": 5000 + }, + { + "epoch": 0.9690451969190524, + "grad_norm": 0.10758214443922043, + "learning_rate": 7.87904880357354e-07, + "loss": 0.3177351951599121, + "memory(GiB)": 78.33, + "step": 5001, + "token_acc": 0.9042991375981465, + "train_speed(iter/s)": 0.032169 + }, + { + "epoch": 0.9692389672043792, + "grad_norm": 0.0970822274684906, + "learning_rate": 7.780953439467719e-07, + "loss": 0.3285868167877197, + "memory(GiB)": 78.33, + "step": 5002, + "token_acc": 0.9016248076571994, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.9694327374897059, + "grad_norm": 0.10737350583076477, + "learning_rate": 7.683470969821748e-07, + "loss": 0.3421315550804138, + "memory(GiB)": 78.33, + "step": 5003, + "token_acc": 0.894730186830209, + "train_speed(iter/s)": 0.03217 + }, + { + "epoch": 0.9696265077750327, + "grad_norm": 0.09953954815864563, + "learning_rate": 7.586601434674266e-07, + "loss": 0.31448549032211304, + "memory(GiB)": 78.33, + "step": 5004, + "token_acc": 0.9061734010562758, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.9698202780603594, + "grad_norm": 0.10338281095027924, + "learning_rate": 7.490344873812615e-07, + "loss": 0.33236753940582275, + "memory(GiB)": 78.33, + "step": 5005, + "token_acc": 0.8993050377307842, + "train_speed(iter/s)": 0.032171 + }, + { + "epoch": 0.9700140483456862, + "grad_norm": 0.09365373104810715, + "learning_rate": 7.394701326771335e-07, + "loss": 0.3056509792804718, + "memory(GiB)": 78.33, + "step": 5006, + "token_acc": 0.906607994493338, + "train_speed(iter/s)": 0.032172 + }, + { + "epoch": 0.9702078186310129, + "grad_norm": 0.10147716104984283, + "learning_rate": 7.29967083283417e-07, + "loss": 0.31711748242378235, + "memory(GiB)": 78.33, + "step": 5007, + "token_acc": 0.902543880455408, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.9704015889163397, + "grad_norm": 0.09770286083221436, + "learning_rate": 7.205253431032564e-07, + "loss": 0.324236124753952, + "memory(GiB)": 78.33, + "step": 5008, + "token_acc": 0.9028327266972622, + "train_speed(iter/s)": 0.032173 + }, + { + "epoch": 0.9705953592016664, + "grad_norm": 0.10440776497125626, + "learning_rate": 7.111449160146332e-07, + "loss": 0.3207136392593384, + "memory(GiB)": 78.33, + "step": 5009, + "token_acc": 0.9042985518859825, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.9707891294869931, + "grad_norm": 0.09528158605098724, + "learning_rate": 7.018258058703319e-07, + "loss": 0.2945985496044159, + "memory(GiB)": 78.33, + "step": 5010, + "token_acc": 0.9116842726151536, + "train_speed(iter/s)": 0.032174 + }, + { + "epoch": 0.9709828997723199, + "grad_norm": 0.09875106066465378, + "learning_rate": 6.925680164979741e-07, + "loss": 0.33711299300193787, + "memory(GiB)": 78.33, + "step": 5011, + "token_acc": 0.8979212309573547, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.9711766700576466, + "grad_norm": 0.09752146899700165, + "learning_rate": 6.833715516999849e-07, + "loss": 0.33156195282936096, + "memory(GiB)": 78.33, + "step": 5012, + "token_acc": 0.9005984838409363, + "train_speed(iter/s)": 0.032175 + }, + { + "epoch": 0.9713704403429734, + "grad_norm": 0.11073119193315506, + "learning_rate": 6.742364152535929e-07, + "loss": 0.3404943645000458, + "memory(GiB)": 78.33, + "step": 5013, + "token_acc": 0.8999028182701652, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.9715642106283001, + "grad_norm": 0.08712846785783768, + "learning_rate": 6.651626109108465e-07, + "loss": 0.30490002036094666, + "memory(GiB)": 78.33, + "step": 5014, + "token_acc": 0.907512204600102, + "train_speed(iter/s)": 0.032176 + }, + { + "epoch": 0.9717579809136269, + "grad_norm": 0.0969480574131012, + "learning_rate": 6.561501423985816e-07, + "loss": 0.32621052861213684, + "memory(GiB)": 78.33, + "step": 5015, + "token_acc": 0.9022739990842956, + "train_speed(iter/s)": 0.032177 + }, + { + "epoch": 0.9719517511989536, + "grad_norm": 0.11033257842063904, + "learning_rate": 6.471990134185035e-07, + "loss": 0.34958821535110474, + "memory(GiB)": 78.33, + "step": 5016, + "token_acc": 0.8971919453168299, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.9721455214842804, + "grad_norm": 0.11200974881649017, + "learning_rate": 6.383092276470381e-07, + "loss": 0.37351301312446594, + "memory(GiB)": 78.33, + "step": 5017, + "token_acc": 0.890080579498554, + "train_speed(iter/s)": 0.032178 + }, + { + "epoch": 0.9723392917696071, + "grad_norm": 0.10688678920269012, + "learning_rate": 6.294807887354647e-07, + "loss": 0.32566171884536743, + "memory(GiB)": 78.33, + "step": 5018, + "token_acc": 0.9027785449925426, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.9725330620549338, + "grad_norm": 0.11659563332796097, + "learning_rate": 6.207137003098994e-07, + "loss": 0.3676939904689789, + "memory(GiB)": 78.33, + "step": 5019, + "token_acc": 0.8909811380567443, + "train_speed(iter/s)": 0.032179 + }, + { + "epoch": 0.9727268323402606, + "grad_norm": 0.10240488499403, + "learning_rate": 6.120079659711786e-07, + "loss": 0.3170427680015564, + "memory(GiB)": 78.33, + "step": 5020, + "token_acc": 0.9030933713471133, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.9729206026255873, + "grad_norm": 0.1029011458158493, + "learning_rate": 6.033635892950084e-07, + "loss": 0.3273800313472748, + "memory(GiB)": 78.33, + "step": 5021, + "token_acc": 0.9038344491783323, + "train_speed(iter/s)": 0.03218 + }, + { + "epoch": 0.9731143729109141, + "grad_norm": 0.09669654816389084, + "learning_rate": 5.94780573831849e-07, + "loss": 0.3153877854347229, + "memory(GiB)": 78.33, + "step": 5022, + "token_acc": 0.9058483637541851, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.9733081431962408, + "grad_norm": 0.09650097042322159, + "learning_rate": 5.862589231069803e-07, + "loss": 0.3036450445652008, + "memory(GiB)": 78.33, + "step": 5023, + "token_acc": 0.9092272045795284, + "train_speed(iter/s)": 0.032181 + }, + { + "epoch": 0.9735019134815676, + "grad_norm": 0.11860737949609756, + "learning_rate": 5.777986406204694e-07, + "loss": 0.3921282887458801, + "memory(GiB)": 78.33, + "step": 5024, + "token_acc": 0.8855097849722156, + "train_speed(iter/s)": 0.032182 + }, + { + "epoch": 0.9736956837668943, + "grad_norm": 0.10267064720392227, + "learning_rate": 5.693997298472031e-07, + "loss": 0.31443023681640625, + "memory(GiB)": 78.33, + "step": 5025, + "token_acc": 0.9051502501866032, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.973889454052221, + "grad_norm": 0.09616965055465698, + "learning_rate": 5.610621942368054e-07, + "loss": 0.3315185606479645, + "memory(GiB)": 78.33, + "step": 5026, + "token_acc": 0.902337848564771, + "train_speed(iter/s)": 0.032183 + }, + { + "epoch": 0.9740832243375478, + "grad_norm": 0.09976300597190857, + "learning_rate": 5.527860372137538e-07, + "loss": 0.32565930485725403, + "memory(GiB)": 78.33, + "step": 5027, + "token_acc": 0.9027216527952472, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.9742769946228745, + "grad_norm": 0.09962465614080429, + "learning_rate": 5.445712621772791e-07, + "loss": 0.327309787273407, + "memory(GiB)": 78.33, + "step": 5028, + "token_acc": 0.9021066306645948, + "train_speed(iter/s)": 0.032184 + }, + { + "epoch": 0.9744707649082013, + "grad_norm": 0.10362134128808975, + "learning_rate": 5.364178725014157e-07, + "loss": 0.3133904039859772, + "memory(GiB)": 78.33, + "step": 5029, + "token_acc": 0.9041495198902606, + "train_speed(iter/s)": 0.032185 + }, + { + "epoch": 0.974664535193528, + "grad_norm": 0.10740216076374054, + "learning_rate": 5.283258715349514e-07, + "loss": 0.33964803814888, + "memory(GiB)": 78.33, + "step": 5030, + "token_acc": 0.8985633557311141, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.9748583054788548, + "grad_norm": 0.0993296429514885, + "learning_rate": 5.202952626015445e-07, + "loss": 0.3082018196582794, + "memory(GiB)": 78.33, + "step": 5031, + "token_acc": 0.9063476667744369, + "train_speed(iter/s)": 0.032186 + }, + { + "epoch": 0.9750520757641815, + "grad_norm": 0.09285027533769608, + "learning_rate": 5.123260489995229e-07, + "loss": 0.2862505316734314, + "memory(GiB)": 78.33, + "step": 5032, + "token_acc": 0.9113854235062376, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.9752458460495083, + "grad_norm": 0.11154574900865555, + "learning_rate": 5.044182340021019e-07, + "loss": 0.36379557847976685, + "memory(GiB)": 78.33, + "step": 5033, + "token_acc": 0.8903923823000501, + "train_speed(iter/s)": 0.032187 + }, + { + "epoch": 0.975439616334835, + "grad_norm": 0.09221196174621582, + "learning_rate": 4.965718208572001e-07, + "loss": 0.3152211308479309, + "memory(GiB)": 78.33, + "step": 5034, + "token_acc": 0.9047773077880069, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.9756333866201617, + "grad_norm": 0.09758051484823227, + "learning_rate": 4.887868127875561e-07, + "loss": 0.3170757293701172, + "memory(GiB)": 78.33, + "step": 5035, + "token_acc": 0.9045536265328575, + "train_speed(iter/s)": 0.032188 + }, + { + "epoch": 0.9758271569054885, + "grad_norm": 0.09600334614515305, + "learning_rate": 4.810632129907122e-07, + "loss": 0.31009405851364136, + "memory(GiB)": 78.33, + "step": 5036, + "token_acc": 0.9058032803330491, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.9760209271908152, + "grad_norm": 0.10319238901138306, + "learning_rate": 4.7340102463891415e-07, + "loss": 0.33604225516319275, + "memory(GiB)": 78.33, + "step": 5037, + "token_acc": 0.8996856559863233, + "train_speed(iter/s)": 0.032189 + }, + { + "epoch": 0.976214697476142, + "grad_norm": 0.10404365509748459, + "learning_rate": 4.6580025087926134e-07, + "loss": 0.3151894509792328, + "memory(GiB)": 78.33, + "step": 5038, + "token_acc": 0.9064724919093851, + "train_speed(iter/s)": 0.03219 + }, + { + "epoch": 0.9764084677614687, + "grad_norm": 0.10096472501754761, + "learning_rate": 4.5826089483358973e-07, + "loss": 0.3288641571998596, + "memory(GiB)": 78.33, + "step": 5039, + "token_acc": 0.9003461989642643, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.9766022380467956, + "grad_norm": 0.08929524570703506, + "learning_rate": 4.5078295959850576e-07, + "loss": 0.2969154715538025, + "memory(GiB)": 78.33, + "step": 5040, + "token_acc": 0.9102983397827986, + "train_speed(iter/s)": 0.032191 + }, + { + "epoch": 0.9767960083321223, + "grad_norm": 0.09447763115167618, + "learning_rate": 4.4336644824540245e-07, + "loss": 0.3048000633716583, + "memory(GiB)": 78.33, + "step": 5041, + "token_acc": 0.9062621145943711, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.9769897786174491, + "grad_norm": 0.10533607751131058, + "learning_rate": 4.360113638204432e-07, + "loss": 0.3521908223628998, + "memory(GiB)": 78.33, + "step": 5042, + "token_acc": 0.8957643566617194, + "train_speed(iter/s)": 0.032192 + }, + { + "epoch": 0.9771835489027758, + "grad_norm": 0.09984603524208069, + "learning_rate": 4.287177093445615e-07, + "loss": 0.3105589747428894, + "memory(GiB)": 78.33, + "step": 5043, + "token_acc": 0.9076200993926008, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.9773773191881026, + "grad_norm": 0.09986462444067001, + "learning_rate": 4.2148548781344437e-07, + "loss": 0.2763623893260956, + "memory(GiB)": 78.33, + "step": 5044, + "token_acc": 0.9144782780290841, + "train_speed(iter/s)": 0.032193 + }, + { + "epoch": 0.9775710894734293, + "grad_norm": 0.09815599024295807, + "learning_rate": 4.143147021975823e-07, + "loss": 0.33960387110710144, + "memory(GiB)": 78.33, + "step": 5045, + "token_acc": 0.9006058664958497, + "train_speed(iter/s)": 0.032194 + }, + { + "epoch": 0.977764859758756, + "grad_norm": 0.10120173543691635, + "learning_rate": 4.0720535544216945e-07, + "loss": 0.3160873055458069, + "memory(GiB)": 78.33, + "step": 5046, + "token_acc": 0.9033082947099249, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.9779586300440828, + "grad_norm": 0.09940861910581589, + "learning_rate": 4.0015745046725336e-07, + "loss": 0.31816571950912476, + "memory(GiB)": 78.33, + "step": 5047, + "token_acc": 0.9058804471083752, + "train_speed(iter/s)": 0.032195 + }, + { + "epoch": 0.9781524003294095, + "grad_norm": 0.11098282784223557, + "learning_rate": 3.931709901675684e-07, + "loss": 0.353601336479187, + "memory(GiB)": 78.33, + "step": 5048, + "token_acc": 0.8934923500340327, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.9783461706147363, + "grad_norm": 0.09338078647851944, + "learning_rate": 3.862459774126525e-07, + "loss": 0.3041617274284363, + "memory(GiB)": 78.33, + "step": 5049, + "token_acc": 0.9068331108843003, + "train_speed(iter/s)": 0.032196 + }, + { + "epoch": 0.978539940900063, + "grad_norm": 0.10786343365907669, + "learning_rate": 3.793824150467806e-07, + "loss": 0.3523224890232086, + "memory(GiB)": 78.33, + "step": 5050, + "token_acc": 0.8945492180312787, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.9787337111853898, + "grad_norm": 0.09506326168775558, + "learning_rate": 3.7258030588901424e-07, + "loss": 0.3265533447265625, + "memory(GiB)": 78.33, + "step": 5051, + "token_acc": 0.9006531536959823, + "train_speed(iter/s)": 0.032197 + }, + { + "epoch": 0.9789274814707165, + "grad_norm": 0.0921454057097435, + "learning_rate": 3.6583965273316864e-07, + "loss": 0.29769742488861084, + "memory(GiB)": 78.33, + "step": 5052, + "token_acc": 0.9098529003608105, + "train_speed(iter/s)": 0.032198 + }, + { + "epoch": 0.9791212517560433, + "grad_norm": 0.10511661320924759, + "learning_rate": 3.591604583478125e-07, + "loss": 0.33176693320274353, + "memory(GiB)": 78.33, + "step": 5053, + "token_acc": 0.9007189710979348, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.97931502204137, + "grad_norm": 0.09943993389606476, + "learning_rate": 3.5254272547623474e-07, + "loss": 0.32589173316955566, + "memory(GiB)": 78.33, + "step": 5054, + "token_acc": 0.9014144342263095, + "train_speed(iter/s)": 0.032199 + }, + { + "epoch": 0.9795087923266967, + "grad_norm": 0.10941484570503235, + "learning_rate": 3.4598645683656113e-07, + "loss": 0.3675105571746826, + "memory(GiB)": 78.33, + "step": 5055, + "token_acc": 0.8914224336351082, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.9797025626120235, + "grad_norm": 0.10779840499162674, + "learning_rate": 3.3949165512160423e-07, + "loss": 0.35344696044921875, + "memory(GiB)": 78.33, + "step": 5056, + "token_acc": 0.8957955624622291, + "train_speed(iter/s)": 0.0322 + }, + { + "epoch": 0.9798963328973502, + "grad_norm": 0.09050939232110977, + "learning_rate": 3.330583229989636e-07, + "loss": 0.30384790897369385, + "memory(GiB)": 78.33, + "step": 5057, + "token_acc": 0.9091451737259681, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.980090103182677, + "grad_norm": 0.10062376409769058, + "learning_rate": 3.2668646311097556e-07, + "loss": 0.30701377987861633, + "memory(GiB)": 78.33, + "step": 5058, + "token_acc": 0.9093784940958303, + "train_speed(iter/s)": 0.032201 + }, + { + "epoch": 0.9802838734680037, + "grad_norm": 0.09237212687730789, + "learning_rate": 3.2037607807473e-07, + "loss": 0.30352169275283813, + "memory(GiB)": 78.33, + "step": 5059, + "token_acc": 0.9058777531604327, + "train_speed(iter/s)": 0.032202 + }, + { + "epoch": 0.9804776437533305, + "grad_norm": 0.0979812890291214, + "learning_rate": 3.1412717048207025e-07, + "loss": 0.31397053599357605, + "memory(GiB)": 78.33, + "step": 5060, + "token_acc": 0.9068339778781405, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.9806714140386572, + "grad_norm": 0.09479079395532608, + "learning_rate": 3.0793974289961e-07, + "loss": 0.31937700510025024, + "memory(GiB)": 78.33, + "step": 5061, + "token_acc": 0.9035309120858683, + "train_speed(iter/s)": 0.032203 + }, + { + "epoch": 0.980865184323984, + "grad_norm": 0.12520615756511688, + "learning_rate": 3.01813797868683e-07, + "loss": 0.35134872794151306, + "memory(GiB)": 78.33, + "step": 5062, + "token_acc": 0.8932484641205903, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.9810589546093107, + "grad_norm": 0.10819295048713684, + "learning_rate": 2.957493379053599e-07, + "loss": 0.3610383868217468, + "memory(GiB)": 78.33, + "step": 5063, + "token_acc": 0.8935895511184507, + "train_speed(iter/s)": 0.032204 + }, + { + "epoch": 0.9812527248946374, + "grad_norm": 0.09377988427877426, + "learning_rate": 2.8974636550049833e-07, + "loss": 0.3174287676811218, + "memory(GiB)": 78.33, + "step": 5064, + "token_acc": 0.9046624721817732, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.9814464951799642, + "grad_norm": 0.09167510271072388, + "learning_rate": 2.83804883119676e-07, + "loss": 0.297576367855072, + "memory(GiB)": 78.33, + "step": 5065, + "token_acc": 0.9104416645391882, + "train_speed(iter/s)": 0.032205 + }, + { + "epoch": 0.9816402654652909, + "grad_norm": 0.09928658604621887, + "learning_rate": 2.7792489320322407e-07, + "loss": 0.3319474458694458, + "memory(GiB)": 78.33, + "step": 5066, + "token_acc": 0.9005888179616993, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.9818340357506177, + "grad_norm": 0.09505660086870193, + "learning_rate": 2.721063981661942e-07, + "loss": 0.32442140579223633, + "memory(GiB)": 78.33, + "step": 5067, + "token_acc": 0.9013683579704355, + "train_speed(iter/s)": 0.032206 + }, + { + "epoch": 0.9820278060359444, + "grad_norm": 0.09601055830717087, + "learning_rate": 2.663494003984079e-07, + "loss": 0.3260546922683716, + "memory(GiB)": 78.33, + "step": 5068, + "token_acc": 0.9013065431263338, + "train_speed(iter/s)": 0.032207 + }, + { + "epoch": 0.9822215763212712, + "grad_norm": 0.10125764459371567, + "learning_rate": 2.6065390226444047e-07, + "loss": 0.3628811836242676, + "memory(GiB)": 78.33, + "step": 5069, + "token_acc": 0.8924914675767918, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.9824153466065979, + "grad_norm": 0.10014388710260391, + "learning_rate": 2.5501990610355406e-07, + "loss": 0.32086047530174255, + "memory(GiB)": 78.33, + "step": 5070, + "token_acc": 0.9021128125605737, + "train_speed(iter/s)": 0.032208 + }, + { + "epoch": 0.9826091168919246, + "grad_norm": 0.09349211305379868, + "learning_rate": 2.4944741422979754e-07, + "loss": 0.31767165660858154, + "memory(GiB)": 78.33, + "step": 5071, + "token_acc": 0.9040567600306544, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.9828028871772514, + "grad_norm": 0.09731042385101318, + "learning_rate": 2.4393642893194007e-07, + "loss": 0.3093053102493286, + "memory(GiB)": 78.33, + "step": 5072, + "token_acc": 0.9075150674702651, + "train_speed(iter/s)": 0.032209 + }, + { + "epoch": 0.9829966574625781, + "grad_norm": 0.10741175711154938, + "learning_rate": 2.3848695247350446e-07, + "loss": 0.31964316964149475, + "memory(GiB)": 78.33, + "step": 5073, + "token_acc": 0.9033793824646376, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.9831904277479049, + "grad_norm": 0.09962072223424911, + "learning_rate": 2.330989870927169e-07, + "loss": 0.33166712522506714, + "memory(GiB)": 78.33, + "step": 5074, + "token_acc": 0.8991797207209679, + "train_speed(iter/s)": 0.03221 + }, + { + "epoch": 0.9833841980332316, + "grad_norm": 0.11339244991540909, + "learning_rate": 2.2777253500257386e-07, + "loss": 0.3660873770713806, + "memory(GiB)": 78.33, + "step": 5075, + "token_acc": 0.8902705205370726, + "train_speed(iter/s)": 0.032211 + }, + { + "epoch": 0.9835779683185584, + "grad_norm": 0.10337600857019424, + "learning_rate": 2.2250759839077536e-07, + "loss": 0.3016257882118225, + "memory(GiB)": 78.33, + "step": 5076, + "token_acc": 0.9081414405155412, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.9837717386038851, + "grad_norm": 0.09644091874361038, + "learning_rate": 2.173041794197916e-07, + "loss": 0.3238067924976349, + "memory(GiB)": 78.33, + "step": 5077, + "token_acc": 0.9044397813242034, + "train_speed(iter/s)": 0.032212 + }, + { + "epoch": 0.9839655088892119, + "grad_norm": 0.10839894413948059, + "learning_rate": 2.1216228022679638e-07, + "loss": 0.3413659334182739, + "memory(GiB)": 78.33, + "step": 5078, + "token_acc": 0.8964040304440267, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.9841592791745386, + "grad_norm": 0.08885491639375687, + "learning_rate": 2.070819029237003e-07, + "loss": 0.29514098167419434, + "memory(GiB)": 78.33, + "step": 5079, + "token_acc": 0.9091107924858441, + "train_speed(iter/s)": 0.032213 + }, + { + "epoch": 0.9843530494598653, + "grad_norm": 0.09718841314315796, + "learning_rate": 2.0206304959716756e-07, + "loss": 0.3231916129589081, + "memory(GiB)": 78.33, + "step": 5080, + "token_acc": 0.9021267809209168, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.9845468197451921, + "grad_norm": 0.11555752903223038, + "learning_rate": 1.971057223085659e-07, + "loss": 0.3575592637062073, + "memory(GiB)": 78.33, + "step": 5081, + "token_acc": 0.893569844789357, + "train_speed(iter/s)": 0.032214 + }, + { + "epoch": 0.9847405900305188, + "grad_norm": 0.11206049472093582, + "learning_rate": 1.9220992309399997e-07, + "loss": 0.37476587295532227, + "memory(GiB)": 78.33, + "step": 5082, + "token_acc": 0.8887654848355404, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.9849343603158456, + "grad_norm": 0.08786389976739883, + "learning_rate": 1.873756539643112e-07, + "loss": 0.2945099174976349, + "memory(GiB)": 78.33, + "step": 5083, + "token_acc": 0.9125009193655153, + "train_speed(iter/s)": 0.032215 + }, + { + "epoch": 0.9851281306011723, + "grad_norm": 0.09714756906032562, + "learning_rate": 1.8260291690506135e-07, + "loss": 0.32386794686317444, + "memory(GiB)": 78.33, + "step": 5084, + "token_acc": 0.9020605635215081, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.9853219008864991, + "grad_norm": 0.09232458472251892, + "learning_rate": 1.7789171387654898e-07, + "loss": 0.3251858949661255, + "memory(GiB)": 78.33, + "step": 5085, + "token_acc": 0.901725535610886, + "train_speed(iter/s)": 0.032216 + }, + { + "epoch": 0.9855156711718258, + "grad_norm": 0.11348365992307663, + "learning_rate": 1.7324204681377628e-07, + "loss": 0.35106420516967773, + "memory(GiB)": 78.33, + "step": 5086, + "token_acc": 0.895291405992756, + "train_speed(iter/s)": 0.032217 + }, + { + "epoch": 0.9857094414571526, + "grad_norm": 0.09008847177028656, + "learning_rate": 1.6865391762649893e-07, + "loss": 0.2719075083732605, + "memory(GiB)": 78.33, + "step": 5087, + "token_acc": 0.9164248403946604, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.9859032117424793, + "grad_norm": 0.1007675901055336, + "learning_rate": 1.6412732819919284e-07, + "loss": 0.29824161529541016, + "memory(GiB)": 78.33, + "step": 5088, + "token_acc": 0.9085992132867133, + "train_speed(iter/s)": 0.032218 + }, + { + "epoch": 0.986096982027806, + "grad_norm": 0.12334459275007248, + "learning_rate": 1.596622803910208e-07, + "loss": 0.3122141361236572, + "memory(GiB)": 78.33, + "step": 5089, + "token_acc": 0.90404706917409, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.9862907523131328, + "grad_norm": 0.1014440655708313, + "learning_rate": 1.552587760359325e-07, + "loss": 0.31267303228378296, + "memory(GiB)": 78.33, + "step": 5090, + "token_acc": 0.9055354659248956, + "train_speed(iter/s)": 0.032219 + }, + { + "epoch": 0.9864845225984595, + "grad_norm": 0.11133712530136108, + "learning_rate": 1.5091681694253122e-07, + "loss": 0.35740742087364197, + "memory(GiB)": 78.33, + "step": 5091, + "token_acc": 0.8951927600808125, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.9866782928837863, + "grad_norm": 0.10128425806760788, + "learning_rate": 1.4663640489420702e-07, + "loss": 0.3182716965675354, + "memory(GiB)": 78.33, + "step": 5092, + "token_acc": 0.9042219609160648, + "train_speed(iter/s)": 0.03222 + }, + { + "epoch": 0.986872063169113, + "grad_norm": 0.09385867416858673, + "learning_rate": 1.4241754164903696e-07, + "loss": 0.2903617322444916, + "memory(GiB)": 78.33, + "step": 5093, + "token_acc": 0.9133545725178879, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.9870658334544398, + "grad_norm": 0.09520326554775238, + "learning_rate": 1.3826022893980159e-07, + "loss": 0.32819217443466187, + "memory(GiB)": 78.33, + "step": 5094, + "token_acc": 0.901990578939371, + "train_speed(iter/s)": 0.032221 + }, + { + "epoch": 0.9872596037397665, + "grad_norm": 0.09930814802646637, + "learning_rate": 1.3416446847401842e-07, + "loss": 0.30430757999420166, + "memory(GiB)": 78.33, + "step": 5095, + "token_acc": 0.9086877119749543, + "train_speed(iter/s)": 0.032222 + }, + { + "epoch": 0.9874533740250933, + "grad_norm": 0.10373541712760925, + "learning_rate": 1.3013026193395836e-07, + "loss": 0.3304864168167114, + "memory(GiB)": 78.33, + "step": 5096, + "token_acc": 0.9003137958085845, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.98764714431042, + "grad_norm": 0.09495776146650314, + "learning_rate": 1.2615761097654608e-07, + "loss": 0.31185808777809143, + "memory(GiB)": 78.33, + "step": 5097, + "token_acc": 0.9045740484060134, + "train_speed(iter/s)": 0.032223 + }, + { + "epoch": 0.9878409145957467, + "grad_norm": 0.10554935038089752, + "learning_rate": 1.2224651723347634e-07, + "loss": 0.3295019567012787, + "memory(GiB)": 78.33, + "step": 5098, + "token_acc": 0.9011663040850858, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.9880346848810735, + "grad_norm": 0.09783808141946793, + "learning_rate": 1.1839698231113082e-07, + "loss": 0.32202041149139404, + "memory(GiB)": 78.33, + "step": 5099, + "token_acc": 0.9030985169491526, + "train_speed(iter/s)": 0.032224 + }, + { + "epoch": 0.9882284551664002, + "grad_norm": 0.1201200857758522, + "learning_rate": 1.1460900779061144e-07, + "loss": 0.31718727946281433, + "memory(GiB)": 78.33, + "step": 5100, + "token_acc": 0.9047082558230932, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.988422225451727, + "grad_norm": 0.10204530507326126, + "learning_rate": 1.1088259522777365e-07, + "loss": 0.33351755142211914, + "memory(GiB)": 78.33, + "step": 5101, + "token_acc": 0.90131747431921, + "train_speed(iter/s)": 0.032225 + }, + { + "epoch": 0.9886159957370537, + "grad_norm": 0.1120908334851265, + "learning_rate": 1.0721774615310985e-07, + "loss": 0.35009273886680603, + "memory(GiB)": 78.33, + "step": 5102, + "token_acc": 0.895470053070508, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.9888097660223805, + "grad_norm": 0.10349807143211365, + "learning_rate": 1.0361446207189928e-07, + "loss": 0.3232646584510803, + "memory(GiB)": 78.33, + "step": 5103, + "token_acc": 0.9045182551383227, + "train_speed(iter/s)": 0.032226 + }, + { + "epoch": 0.9890035363077072, + "grad_norm": 0.10206515341997147, + "learning_rate": 1.0007274446409141e-07, + "loss": 0.32843074202537537, + "memory(GiB)": 78.33, + "step": 5104, + "token_acc": 0.9033802574615097, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.989197306593034, + "grad_norm": 0.10101883858442307, + "learning_rate": 9.65925947843893e-08, + "loss": 0.3663754165172577, + "memory(GiB)": 78.33, + "step": 5105, + "token_acc": 0.8890428585568643, + "train_speed(iter/s)": 0.032227 + }, + { + "epoch": 0.9893910768783607, + "grad_norm": 0.10111556947231293, + "learning_rate": 9.317401446216621e-08, + "loss": 0.3458866775035858, + "memory(GiB)": 78.33, + "step": 5106, + "token_acc": 0.898968688533305, + "train_speed(iter/s)": 0.032228 + }, + { + "epoch": 0.9895848471636874, + "grad_norm": 0.10001836717128754, + "learning_rate": 8.981700490151567e-08, + "loss": 0.34546831250190735, + "memory(GiB)": 78.33, + "step": 5107, + "token_acc": 0.8980206216602694, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.9897786174490142, + "grad_norm": 0.10675647109746933, + "learning_rate": 8.652156748126804e-08, + "loss": 0.34267866611480713, + "memory(GiB)": 78.33, + "step": 5108, + "token_acc": 0.8978237122930847, + "train_speed(iter/s)": 0.032229 + }, + { + "epoch": 0.9899723877343409, + "grad_norm": 0.10720111429691315, + "learning_rate": 8.328770355495729e-08, + "loss": 0.3194417953491211, + "memory(GiB)": 78.33, + "step": 5109, + "token_acc": 0.9027690371302706, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.9901661580196677, + "grad_norm": 0.08818720281124115, + "learning_rate": 8.011541445078762e-08, + "loss": 0.2857103645801544, + "memory(GiB)": 78.33, + "step": 5110, + "token_acc": 0.91203895313451, + "train_speed(iter/s)": 0.03223 + }, + { + "epoch": 0.9903599283049944, + "grad_norm": 0.09907843917608261, + "learning_rate": 7.700470147173343e-08, + "loss": 0.3331596553325653, + "memory(GiB)": 78.33, + "step": 5111, + "token_acc": 0.8999431495167709, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.9905536985903212, + "grad_norm": 0.13084760308265686, + "learning_rate": 7.395556589542274e-08, + "loss": 0.3520573377609253, + "memory(GiB)": 78.33, + "step": 5112, + "token_acc": 0.8953529427741111, + "train_speed(iter/s)": 0.032231 + }, + { + "epoch": 0.9907474688756479, + "grad_norm": 0.09977789223194122, + "learning_rate": 7.09680089742537e-08, + "loss": 0.34126073122024536, + "memory(GiB)": 78.33, + "step": 5113, + "token_acc": 0.8968176914778857, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.9909412391609747, + "grad_norm": 0.0944044291973114, + "learning_rate": 6.804203193524483e-08, + "loss": 0.32394513487815857, + "memory(GiB)": 78.33, + "step": 5114, + "token_acc": 0.9009689518649718, + "train_speed(iter/s)": 0.032232 + }, + { + "epoch": 0.9911350094463014, + "grad_norm": 0.09428620338439941, + "learning_rate": 6.517763598021808e-08, + "loss": 0.2987945079803467, + "memory(GiB)": 78.33, + "step": 5115, + "token_acc": 0.9094635777663906, + "train_speed(iter/s)": 0.032233 + }, + { + "epoch": 0.9913287797316281, + "grad_norm": 0.10901875793933868, + "learning_rate": 6.237482228563239e-08, + "loss": 0.3245546817779541, + "memory(GiB)": 78.33, + "step": 5116, + "token_acc": 0.9043344214726151, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.9915225500169549, + "grad_norm": 0.10262294858694077, + "learning_rate": 5.963359200270024e-08, + "loss": 0.3375054895877838, + "memory(GiB)": 78.33, + "step": 5117, + "token_acc": 0.8971592035573682, + "train_speed(iter/s)": 0.032234 + }, + { + "epoch": 0.9917163203022816, + "grad_norm": 0.0967864915728569, + "learning_rate": 5.6953946257287665e-08, + "loss": 0.3116954267024994, + "memory(GiB)": 78.33, + "step": 5118, + "token_acc": 0.9069773955911599, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.9919100905876084, + "grad_norm": 0.09405119717121124, + "learning_rate": 5.433588615003093e-08, + "loss": 0.30092549324035645, + "memory(GiB)": 78.33, + "step": 5119, + "token_acc": 0.9092479884464617, + "train_speed(iter/s)": 0.032235 + }, + { + "epoch": 0.9921038608729351, + "grad_norm": 0.0960017740726471, + "learning_rate": 5.177941275620323e-08, + "loss": 0.29913923144340515, + "memory(GiB)": 78.33, + "step": 5120, + "token_acc": 0.9077315436241611, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.9922976311582619, + "grad_norm": 0.10961098968982697, + "learning_rate": 4.928452712584796e-08, + "loss": 0.3457680344581604, + "memory(GiB)": 78.33, + "step": 5121, + "token_acc": 0.8968839910971175, + "train_speed(iter/s)": 0.032236 + }, + { + "epoch": 0.9924914014435886, + "grad_norm": 0.10494101792573929, + "learning_rate": 4.6851230283678766e-08, + "loss": 0.32835787534713745, + "memory(GiB)": 78.33, + "step": 5122, + "token_acc": 0.9018704634282524, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.9926851717289153, + "grad_norm": 0.09790334105491638, + "learning_rate": 4.44795232290962e-08, + "loss": 0.30867090821266174, + "memory(GiB)": 78.33, + "step": 5123, + "token_acc": 0.9083395542284313, + "train_speed(iter/s)": 0.032237 + }, + { + "epoch": 0.9928789420142421, + "grad_norm": 0.09199753403663635, + "learning_rate": 4.216940693622106e-08, + "loss": 0.3020906448364258, + "memory(GiB)": 78.33, + "step": 5124, + "token_acc": 0.9092012383900929, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.9930727122995688, + "grad_norm": 0.10397046059370041, + "learning_rate": 3.9920882353911e-08, + "loss": 0.3302524983882904, + "memory(GiB)": 78.33, + "step": 5125, + "token_acc": 0.9000474552141416, + "train_speed(iter/s)": 0.032238 + }, + { + "epoch": 0.9932664825848956, + "grad_norm": 0.1026199460029602, + "learning_rate": 3.773395040567728e-08, + "loss": 0.3645437955856323, + "memory(GiB)": 78.33, + "step": 5126, + "token_acc": 0.8931143232588699, + "train_speed(iter/s)": 0.032239 + }, + { + "epoch": 0.9934602528702223, + "grad_norm": 0.1036728173494339, + "learning_rate": 3.56086119897514e-08, + "loss": 0.34227946400642395, + "memory(GiB)": 78.33, + "step": 5127, + "token_acc": 0.8958297382801266, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.9936540231555491, + "grad_norm": 0.10324281454086304, + "learning_rate": 3.354486797906841e-08, + "loss": 0.3703176975250244, + "memory(GiB)": 78.33, + "step": 5128, + "token_acc": 0.8903992961943412, + "train_speed(iter/s)": 0.03224 + }, + { + "epoch": 0.9938477934408758, + "grad_norm": 0.10356750339269638, + "learning_rate": 3.154271922125029e-08, + "loss": 0.3422203063964844, + "memory(GiB)": 78.33, + "step": 5129, + "token_acc": 0.9005716619028175, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.9940415637262026, + "grad_norm": 0.10634325444698334, + "learning_rate": 2.960216653865588e-08, + "loss": 0.33742478489875793, + "memory(GiB)": 78.33, + "step": 5130, + "token_acc": 0.8997738043946575, + "train_speed(iter/s)": 0.032241 + }, + { + "epoch": 0.9942353340115293, + "grad_norm": 0.09013240784406662, + "learning_rate": 2.7723210728314292e-08, + "loss": 0.29299721121788025, + "memory(GiB)": 78.33, + "step": 5131, + "token_acc": 0.910865125192264, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.994429104296856, + "grad_norm": 0.09943090379238129, + "learning_rate": 2.5905852561958208e-08, + "loss": 0.3339230716228485, + "memory(GiB)": 78.33, + "step": 5132, + "token_acc": 0.8997598211081357, + "train_speed(iter/s)": 0.032242 + }, + { + "epoch": 0.9946228745821828, + "grad_norm": 0.10124190896749496, + "learning_rate": 2.415009278604052e-08, + "loss": 0.3518436551094055, + "memory(GiB)": 78.33, + "step": 5133, + "token_acc": 0.8950833333333333, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.9948166448675095, + "grad_norm": 0.09643140435218811, + "learning_rate": 2.245593212166774e-08, + "loss": 0.3150833249092102, + "memory(GiB)": 78.33, + "step": 5134, + "token_acc": 0.9049085264157237, + "train_speed(iter/s)": 0.032243 + }, + { + "epoch": 0.9950104151528363, + "grad_norm": 0.10642070323228836, + "learning_rate": 2.0823371264699907e-08, + "loss": 0.33650028705596924, + "memory(GiB)": 78.33, + "step": 5135, + "token_acc": 0.9001945581787031, + "train_speed(iter/s)": 0.032244 + }, + { + "epoch": 0.995204185438163, + "grad_norm": 0.10311167687177658, + "learning_rate": 1.9252410885683965e-08, + "loss": 0.3397340178489685, + "memory(GiB)": 78.33, + "step": 5136, + "token_acc": 0.8996154508408426, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.9953979557234898, + "grad_norm": 0.10818962752819061, + "learning_rate": 1.7743051629837135e-08, + "loss": 0.3583690822124481, + "memory(GiB)": 78.33, + "step": 5137, + "token_acc": 0.8913712208308734, + "train_speed(iter/s)": 0.032245 + }, + { + "epoch": 0.9955917260088165, + "grad_norm": 0.11495489627122879, + "learning_rate": 1.6295294117080192e-08, + "loss": 0.3481709063053131, + "memory(GiB)": 78.33, + "step": 5138, + "token_acc": 0.8962670979044539, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.9957854962941433, + "grad_norm": 0.09410503506660461, + "learning_rate": 1.490913894208745e-08, + "loss": 0.30858170986175537, + "memory(GiB)": 78.33, + "step": 5139, + "token_acc": 0.9072154599071401, + "train_speed(iter/s)": 0.032246 + }, + { + "epoch": 0.99597926657947, + "grad_norm": 0.10258731245994568, + "learning_rate": 1.3584586674153519e-08, + "loss": 0.3513551652431488, + "memory(GiB)": 78.33, + "step": 5140, + "token_acc": 0.8945655624933856, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.9961730368647967, + "grad_norm": 0.10465652495622635, + "learning_rate": 1.2321637857326538e-08, + "loss": 0.32509326934814453, + "memory(GiB)": 78.33, + "step": 5141, + "token_acc": 0.9022335312411648, + "train_speed(iter/s)": 0.032247 + }, + { + "epoch": 0.9963668071501235, + "grad_norm": 0.17421969771385193, + "learning_rate": 1.112029301032491e-08, + "loss": 0.3402435779571533, + "memory(GiB)": 78.33, + "step": 5142, + "token_acc": 0.8987543069175722, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.9965605774354502, + "grad_norm": 0.09737160056829453, + "learning_rate": 9.980552626587257e-09, + "loss": 0.31006062030792236, + "memory(GiB)": 78.33, + "step": 5143, + "token_acc": 0.9062544199598314, + "train_speed(iter/s)": 0.032248 + }, + { + "epoch": 0.996754347720777, + "grad_norm": 0.0954391285777092, + "learning_rate": 8.902417174205812e-09, + "loss": 0.3203745186328888, + "memory(GiB)": 78.33, + "step": 5144, + "token_acc": 0.903609002530297, + "train_speed(iter/s)": 0.032249 + }, + { + "epoch": 0.9969481180061037, + "grad_norm": 0.10394702851772308, + "learning_rate": 7.885887096026333e-09, + "loss": 0.33040565252304077, + "memory(GiB)": 78.33, + "step": 5145, + "token_acc": 0.9019830523281209, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.9971418882914305, + "grad_norm": 0.1102806031703949, + "learning_rate": 6.930962809564844e-09, + "loss": 0.38149887323379517, + "memory(GiB)": 78.33, + "step": 5146, + "token_acc": 0.8889834487615917, + "train_speed(iter/s)": 0.03225 + }, + { + "epoch": 0.9973356585767572, + "grad_norm": 0.09441733360290527, + "learning_rate": 6.0376447070242805e-09, + "loss": 0.2967412769794464, + "memory(GiB)": 78.33, + "step": 5147, + "token_acc": 0.910029761147712, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.997529428862084, + "grad_norm": 0.09879467636346817, + "learning_rate": 5.205933155311149e-09, + "loss": 0.31042206287384033, + "memory(GiB)": 78.33, + "step": 5148, + "token_acc": 0.9073388532511939, + "train_speed(iter/s)": 0.032251 + }, + { + "epoch": 0.9977231991474107, + "grad_norm": 0.09496015310287476, + "learning_rate": 4.435828496035521e-09, + "loss": 0.30281057953834534, + "memory(GiB)": 78.33, + "step": 5149, + "token_acc": 0.9083017847485128, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.9979169694327374, + "grad_norm": 0.09966279566287994, + "learning_rate": 3.727331045511039e-09, + "loss": 0.3183283805847168, + "memory(GiB)": 78.33, + "step": 5150, + "token_acc": 0.9036394691893312, + "train_speed(iter/s)": 0.032252 + }, + { + "epoch": 0.9981107397180642, + "grad_norm": 0.0968567430973053, + "learning_rate": 3.0804410947216084e-09, + "loss": 0.34515106678009033, + "memory(GiB)": 78.33, + "step": 5151, + "token_acc": 0.8957895251601545, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.9983045100033909, + "grad_norm": 0.09464351087808609, + "learning_rate": 2.4951589093713533e-09, + "loss": 0.32018736004829407, + "memory(GiB)": 78.33, + "step": 5152, + "token_acc": 0.90508582795118, + "train_speed(iter/s)": 0.032253 + }, + { + "epoch": 0.9984982802887177, + "grad_norm": 0.0978541150689125, + "learning_rate": 1.9714847298513135e-09, + "loss": 0.3221741318702698, + "memory(GiB)": 78.33, + "step": 5153, + "token_acc": 0.90467557008248, + "train_speed(iter/s)": 0.032254 + }, + { + "epoch": 0.9986920505740444, + "grad_norm": 0.09578622877597809, + "learning_rate": 1.5094187712394456e-09, + "loss": 0.3303835988044739, + "memory(GiB)": 78.33, + "step": 5154, + "token_acc": 0.9011171856429759, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.9988858208593712, + "grad_norm": 0.09848620742559433, + "learning_rate": 1.1089612233339261e-09, + "loss": 0.30692458152770996, + "memory(GiB)": 78.33, + "step": 5155, + "token_acc": 0.9065349757288363, + "train_speed(iter/s)": 0.032255 + }, + { + "epoch": 0.9990795911446979, + "grad_norm": 0.09920880943536758, + "learning_rate": 7.701122505865409e-10, + "loss": 0.3253172039985657, + "memory(GiB)": 78.33, + "step": 5156, + "token_acc": 0.9011366073343341, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.9992733614300247, + "grad_norm": 0.09585346281528473, + "learning_rate": 4.928719922026037e-10, + "loss": 0.30521735548973083, + "memory(GiB)": 78.33, + "step": 5157, + "token_acc": 0.906159781992823, + "train_speed(iter/s)": 0.032256 + }, + { + "epoch": 0.9994671317153514, + "grad_norm": 0.10266774892807007, + "learning_rate": 2.772405620410367e-10, + "loss": 0.341879278421402, + "memory(GiB)": 78.33, + "step": 5158, + "token_acc": 0.8973592287271203, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.9996609020006781, + "grad_norm": 0.10256272554397583, + "learning_rate": 1.2321804866433082e-10, + "loss": 0.3247736990451813, + "memory(GiB)": 78.33, + "step": 5159, + "token_acc": 0.9010874626783905, + "train_speed(iter/s)": 0.032257 + }, + { + "epoch": 0.9998546722860049, + "grad_norm": 0.09452533721923828, + "learning_rate": 3.0804515321891657e-11, + "loss": 0.3123696446418762, + "memory(GiB)": 78.33, + "step": 5160, + "token_acc": 0.9040158570691574, + "train_speed(iter/s)": 0.032258 + }, + { + "epoch": 1.0, + "grad_norm": 0.1213284358382225, + "learning_rate": 0.0, + "loss": 0.31489574909210205, + "memory(GiB)": 78.33, + "step": 5161, + "token_acc": 0.9069757440220196, + "train_speed(iter/s)": 0.03226 + }, + { + "epoch": 1.0, + "eval_loss": 0.37820467352867126, + "eval_runtime": 1344.7251, + "eval_samples_per_second": 5.019, + "eval_steps_per_second": 5.019, + "eval_token_acc": 0.9026974626241785, + "step": 5161 + } + ], + "logging_steps": 1, + "max_steps": 5161, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 200, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.701470853092039e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}