{ "best_global_step": 1237, "best_metric": 0.36159474, "best_model_checkpoint": "/workspace/checkpoint/gui_exp/sft_amex/v0-20260413_084132/checkpoint-1237", "epoch": 1.0, "eval_steps": 500, "global_step": 1237, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008088978766430738, "grad_norm": 64.78370666503906, "learning_rate": 1.6129032258064518e-07, "loss": 1.7486257553100586, "memory(GiB)": 60.95, "step": 1, "token_acc": 0.6546184738955824, "train_speed(iter/s)": 0.017239 }, { "epoch": 0.0016177957532861476, "grad_norm": 68.59134674072266, "learning_rate": 3.2258064516129035e-07, "loss": 1.777339220046997, "memory(GiB)": 71.88, "step": 2, "token_acc": 0.6341463414634146, "train_speed(iter/s)": 0.019622 }, { "epoch": 0.0024266936299292214, "grad_norm": 64.73936462402344, "learning_rate": 4.838709677419355e-07, "loss": 1.8061851263046265, "memory(GiB)": 71.9, "step": 3, "token_acc": 0.6195652173913043, "train_speed(iter/s)": 0.020613 }, { "epoch": 0.0032355915065722953, "grad_norm": 65.572998046875, "learning_rate": 6.451612903225807e-07, "loss": 1.807295560836792, "memory(GiB)": 72.45, "step": 4, "token_acc": 0.5654205607476636, "train_speed(iter/s)": 0.021122 }, { "epoch": 0.004044489383215369, "grad_norm": 65.35359191894531, "learning_rate": 8.064516129032258e-07, "loss": 1.5166772603988647, "memory(GiB)": 72.45, "step": 5, "token_acc": 0.6327433628318584, "train_speed(iter/s)": 0.021426 }, { "epoch": 0.004853387259858443, "grad_norm": 57.624046325683594, "learning_rate": 9.67741935483871e-07, "loss": 1.5725659132003784, "memory(GiB)": 72.45, "step": 6, "token_acc": 0.6591928251121076, "train_speed(iter/s)": 0.021622 }, { "epoch": 0.005662285136501517, "grad_norm": 41.641319274902344, "learning_rate": 1.1290322580645162e-06, "loss": 1.6281558275222778, "memory(GiB)": 72.45, "step": 7, "token_acc": 0.6067415730337079, "train_speed(iter/s)": 0.02179 }, { "epoch": 0.006471183013144591, "grad_norm": 40.06605529785156, "learning_rate": 1.2903225806451614e-06, "loss": 1.6883149147033691, "memory(GiB)": 72.45, "step": 8, "token_acc": 0.6423841059602649, "train_speed(iter/s)": 0.021906 }, { "epoch": 0.007280080889787664, "grad_norm": 23.832304000854492, "learning_rate": 1.4516129032258066e-06, "loss": 1.4600856304168701, "memory(GiB)": 73.8, "step": 9, "token_acc": 0.6683417085427136, "train_speed(iter/s)": 0.022001 }, { "epoch": 0.008088978766430738, "grad_norm": 19.52027702331543, "learning_rate": 1.6129032258064516e-06, "loss": 1.178048014640808, "memory(GiB)": 73.8, "step": 10, "token_acc": 0.6995515695067265, "train_speed(iter/s)": 0.022088 }, { "epoch": 0.008897876643073812, "grad_norm": 22.565189361572266, "learning_rate": 1.774193548387097e-06, "loss": 1.225492000579834, "memory(GiB)": 73.8, "step": 11, "token_acc": 0.680327868852459, "train_speed(iter/s)": 0.022154 }, { "epoch": 0.009706774519716885, "grad_norm": 15.462038040161133, "learning_rate": 1.935483870967742e-06, "loss": 1.0573687553405762, "memory(GiB)": 73.8, "step": 12, "token_acc": 0.7576923076923077, "train_speed(iter/s)": 0.02221 }, { "epoch": 0.010515672396359959, "grad_norm": 14.245152473449707, "learning_rate": 2.096774193548387e-06, "loss": 1.0721827745437622, "memory(GiB)": 73.8, "step": 13, "token_acc": 0.7405857740585774, "train_speed(iter/s)": 0.022257 }, { "epoch": 0.011324570273003034, "grad_norm": 8.204596519470215, "learning_rate": 2.2580645161290324e-06, "loss": 0.8753397464752197, "memory(GiB)": 73.8, "step": 14, "token_acc": 0.7516778523489933, "train_speed(iter/s)": 0.022297 }, { "epoch": 0.012133468149646108, "grad_norm": 11.066507339477539, "learning_rate": 2.4193548387096776e-06, "loss": 0.9424616098403931, "memory(GiB)": 73.8, "step": 15, "token_acc": 0.7411003236245954, "train_speed(iter/s)": 0.022325 }, { "epoch": 0.012942366026289181, "grad_norm": 8.134406089782715, "learning_rate": 2.580645161290323e-06, "loss": 0.9165105819702148, "memory(GiB)": 73.8, "step": 16, "token_acc": 0.7902439024390244, "train_speed(iter/s)": 0.022352 }, { "epoch": 0.013751263902932255, "grad_norm": 14.990755081176758, "learning_rate": 2.7419354838709676e-06, "loss": 0.8677236437797546, "memory(GiB)": 73.8, "step": 17, "token_acc": 0.7635658914728682, "train_speed(iter/s)": 0.02238 }, { "epoch": 0.014560161779575328, "grad_norm": 5.65842342376709, "learning_rate": 2.903225806451613e-06, "loss": 0.7795729637145996, "memory(GiB)": 73.8, "step": 18, "token_acc": 0.7739938080495357, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.015369059656218402, "grad_norm": 5.559131145477295, "learning_rate": 3.0645161290322584e-06, "loss": 0.8590961694717407, "memory(GiB)": 73.8, "step": 19, "token_acc": 0.75, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.016177957532861477, "grad_norm": 4.871716499328613, "learning_rate": 3.225806451612903e-06, "loss": 0.7650733590126038, "memory(GiB)": 73.8, "step": 20, "token_acc": 0.7865612648221344, "train_speed(iter/s)": 0.022441 }, { "epoch": 0.01698685540950455, "grad_norm": 5.387275218963623, "learning_rate": 3.3870967741935484e-06, "loss": 0.7404652833938599, "memory(GiB)": 73.8, "step": 21, "token_acc": 0.7907801418439716, "train_speed(iter/s)": 0.022456 }, { "epoch": 0.017795753286147624, "grad_norm": 6.131480693817139, "learning_rate": 3.548387096774194e-06, "loss": 0.8067750334739685, "memory(GiB)": 73.8, "step": 22, "token_acc": 0.7986111111111112, "train_speed(iter/s)": 0.022476 }, { "epoch": 0.018604651162790697, "grad_norm": 5.183681488037109, "learning_rate": 3.7096774193548392e-06, "loss": 0.8132314682006836, "memory(GiB)": 73.8, "step": 23, "token_acc": 0.7714285714285715, "train_speed(iter/s)": 0.022492 }, { "epoch": 0.01941354903943377, "grad_norm": 5.063383102416992, "learning_rate": 3.870967741935484e-06, "loss": 0.7204439640045166, "memory(GiB)": 73.8, "step": 24, "token_acc": 0.7905982905982906, "train_speed(iter/s)": 0.022506 }, { "epoch": 0.020222446916076844, "grad_norm": 4.753130912780762, "learning_rate": 4.032258064516129e-06, "loss": 0.7673914432525635, "memory(GiB)": 73.8, "step": 25, "token_acc": 0.7453416149068323, "train_speed(iter/s)": 0.022518 }, { "epoch": 0.021031344792719918, "grad_norm": 4.112824440002441, "learning_rate": 4.193548387096774e-06, "loss": 0.6755634546279907, "memory(GiB)": 73.8, "step": 26, "token_acc": 0.7714285714285715, "train_speed(iter/s)": 0.02253 }, { "epoch": 0.02184024266936299, "grad_norm": 3.704129695892334, "learning_rate": 4.35483870967742e-06, "loss": 0.685713529586792, "memory(GiB)": 73.8, "step": 27, "token_acc": 0.8447488584474886, "train_speed(iter/s)": 0.022541 }, { "epoch": 0.02264914054600607, "grad_norm": 4.385001182556152, "learning_rate": 4.516129032258065e-06, "loss": 0.7436140179634094, "memory(GiB)": 73.8, "step": 28, "token_acc": 0.749003984063745, "train_speed(iter/s)": 0.022549 }, { "epoch": 0.023458038422649142, "grad_norm": 5.385667324066162, "learning_rate": 4.67741935483871e-06, "loss": 0.7293410301208496, "memory(GiB)": 73.8, "step": 29, "token_acc": 0.8248175182481752, "train_speed(iter/s)": 0.022558 }, { "epoch": 0.024266936299292215, "grad_norm": 5.816902160644531, "learning_rate": 4.838709677419355e-06, "loss": 0.6676285266876221, "memory(GiB)": 73.8, "step": 30, "token_acc": 0.7844827586206896, "train_speed(iter/s)": 0.022568 }, { "epoch": 0.02507583417593529, "grad_norm": 3.9358129501342773, "learning_rate": 5e-06, "loss": 0.6832848787307739, "memory(GiB)": 73.8, "step": 31, "token_acc": 0.8340807174887892, "train_speed(iter/s)": 0.022574 }, { "epoch": 0.025884732052578362, "grad_norm": 3.9400582313537598, "learning_rate": 5.161290322580646e-06, "loss": 0.6794041395187378, "memory(GiB)": 73.8, "step": 32, "token_acc": 0.7857142857142857, "train_speed(iter/s)": 0.022581 }, { "epoch": 0.026693629929221436, "grad_norm": 6.0499725341796875, "learning_rate": 5.322580645161291e-06, "loss": 0.6280096769332886, "memory(GiB)": 73.8, "step": 33, "token_acc": 0.8277511961722488, "train_speed(iter/s)": 0.022589 }, { "epoch": 0.02750252780586451, "grad_norm": 4.963372230529785, "learning_rate": 5.483870967741935e-06, "loss": 0.7461614012718201, "memory(GiB)": 73.8, "step": 34, "token_acc": 0.7442922374429224, "train_speed(iter/s)": 0.022594 }, { "epoch": 0.028311425682507583, "grad_norm": 4.874055862426758, "learning_rate": 5.645161290322582e-06, "loss": 0.6325216889381409, "memory(GiB)": 73.8, "step": 35, "token_acc": 0.8239700374531835, "train_speed(iter/s)": 0.022602 }, { "epoch": 0.029120323559150656, "grad_norm": 4.295459747314453, "learning_rate": 5.806451612903226e-06, "loss": 0.6098757982254028, "memory(GiB)": 73.8, "step": 36, "token_acc": 0.85, "train_speed(iter/s)": 0.022606 }, { "epoch": 0.02992922143579373, "grad_norm": 4.486640453338623, "learning_rate": 5.967741935483872e-06, "loss": 0.6720225811004639, "memory(GiB)": 73.8, "step": 37, "token_acc": 0.7675276752767528, "train_speed(iter/s)": 0.022613 }, { "epoch": 0.030738119312436803, "grad_norm": 3.9755430221557617, "learning_rate": 6.129032258064517e-06, "loss": 0.7007983326911926, "memory(GiB)": 73.8, "step": 38, "token_acc": 0.7446808510638298, "train_speed(iter/s)": 0.022618 }, { "epoch": 0.03154701718907988, "grad_norm": 3.85732102394104, "learning_rate": 6.290322580645162e-06, "loss": 0.6228176355361938, "memory(GiB)": 73.8, "step": 39, "token_acc": 0.8116591928251121, "train_speed(iter/s)": 0.022625 }, { "epoch": 0.032355915065722954, "grad_norm": 3.556612491607666, "learning_rate": 6.451612903225806e-06, "loss": 0.6283481121063232, "memory(GiB)": 73.8, "step": 40, "token_acc": 0.8035087719298246, "train_speed(iter/s)": 0.02263 }, { "epoch": 0.03316481294236603, "grad_norm": 5.600265979766846, "learning_rate": 6.612903225806452e-06, "loss": 0.6793509721755981, "memory(GiB)": 73.8, "step": 41, "token_acc": 0.8174904942965779, "train_speed(iter/s)": 0.022635 }, { "epoch": 0.0339737108190091, "grad_norm": 3.7283554077148438, "learning_rate": 6.774193548387097e-06, "loss": 0.6385987997055054, "memory(GiB)": 73.8, "step": 42, "token_acc": 0.8125, "train_speed(iter/s)": 0.022639 }, { "epoch": 0.034782608695652174, "grad_norm": 3.8624303340911865, "learning_rate": 6.935483870967743e-06, "loss": 0.6532889604568481, "memory(GiB)": 73.8, "step": 43, "token_acc": 0.8297872340425532, "train_speed(iter/s)": 0.022644 }, { "epoch": 0.03559150657229525, "grad_norm": 3.6706488132476807, "learning_rate": 7.096774193548388e-06, "loss": 0.579014241695404, "memory(GiB)": 73.8, "step": 44, "token_acc": 0.8345070422535211, "train_speed(iter/s)": 0.022648 }, { "epoch": 0.03640040444893832, "grad_norm": 3.9184775352478027, "learning_rate": 7.258064516129033e-06, "loss": 0.5859895348548889, "memory(GiB)": 73.8, "step": 45, "token_acc": 0.8291925465838509, "train_speed(iter/s)": 0.022651 }, { "epoch": 0.037209302325581395, "grad_norm": 3.94393253326416, "learning_rate": 7.4193548387096784e-06, "loss": 0.5704982280731201, "memory(GiB)": 73.8, "step": 46, "token_acc": 0.8542713567839196, "train_speed(iter/s)": 0.022655 }, { "epoch": 0.03801820020222447, "grad_norm": 4.142230987548828, "learning_rate": 7.580645161290323e-06, "loss": 0.623918354511261, "memory(GiB)": 73.8, "step": 47, "token_acc": 0.7984790874524715, "train_speed(iter/s)": 0.022657 }, { "epoch": 0.03882709807886754, "grad_norm": 4.207951545715332, "learning_rate": 7.741935483870968e-06, "loss": 0.5815058946609497, "memory(GiB)": 73.8, "step": 48, "token_acc": 0.8186528497409327, "train_speed(iter/s)": 0.022662 }, { "epoch": 0.039635995955510615, "grad_norm": 4.375429153442383, "learning_rate": 7.903225806451613e-06, "loss": 0.6511105895042419, "memory(GiB)": 73.8, "step": 49, "token_acc": 0.809375, "train_speed(iter/s)": 0.022666 }, { "epoch": 0.04044489383215369, "grad_norm": 4.1379499435424805, "learning_rate": 8.064516129032258e-06, "loss": 0.6755905747413635, "memory(GiB)": 73.8, "step": 50, "token_acc": 0.8034934497816594, "train_speed(iter/s)": 0.02267 }, { "epoch": 0.04125379170879676, "grad_norm": 4.107391357421875, "learning_rate": 8.225806451612904e-06, "loss": 0.558114230632782, "memory(GiB)": 73.8, "step": 51, "token_acc": 0.8186528497409327, "train_speed(iter/s)": 0.022672 }, { "epoch": 0.042062689585439836, "grad_norm": 3.2282044887542725, "learning_rate": 8.387096774193549e-06, "loss": 0.5646804571151733, "memory(GiB)": 73.8, "step": 52, "token_acc": 0.7943262411347518, "train_speed(iter/s)": 0.022674 }, { "epoch": 0.04287158746208291, "grad_norm": 3.679171085357666, "learning_rate": 8.548387096774194e-06, "loss": 0.5988277196884155, "memory(GiB)": 73.8, "step": 53, "token_acc": 0.8022922636103151, "train_speed(iter/s)": 0.022675 }, { "epoch": 0.04368048533872598, "grad_norm": 4.386334419250488, "learning_rate": 8.70967741935484e-06, "loss": 0.6635404825210571, "memory(GiB)": 73.8, "step": 54, "token_acc": 0.7681159420289855, "train_speed(iter/s)": 0.022674 }, { "epoch": 0.044489383215369056, "grad_norm": 5.1664557456970215, "learning_rate": 8.870967741935484e-06, "loss": 0.5942538976669312, "memory(GiB)": 73.8, "step": 55, "token_acc": 0.86328125, "train_speed(iter/s)": 0.022677 }, { "epoch": 0.04529828109201214, "grad_norm": 5.156553268432617, "learning_rate": 9.03225806451613e-06, "loss": 0.5873563885688782, "memory(GiB)": 74.11, "step": 56, "token_acc": 0.7923875432525952, "train_speed(iter/s)": 0.022677 }, { "epoch": 0.04610717896865521, "grad_norm": 3.327913999557495, "learning_rate": 9.193548387096775e-06, "loss": 0.5179651975631714, "memory(GiB)": 74.11, "step": 57, "token_acc": 0.8286713286713286, "train_speed(iter/s)": 0.022681 }, { "epoch": 0.046916076845298284, "grad_norm": 3.147554397583008, "learning_rate": 9.35483870967742e-06, "loss": 0.6654713153839111, "memory(GiB)": 74.11, "step": 58, "token_acc": 0.8122866894197952, "train_speed(iter/s)": 0.022683 }, { "epoch": 0.04772497472194136, "grad_norm": 3.951767921447754, "learning_rate": 9.516129032258065e-06, "loss": 0.5465582013130188, "memory(GiB)": 74.11, "step": 59, "token_acc": 0.828125, "train_speed(iter/s)": 0.022686 }, { "epoch": 0.04853387259858443, "grad_norm": 3.6060750484466553, "learning_rate": 9.67741935483871e-06, "loss": 0.6206121444702148, "memory(GiB)": 74.11, "step": 60, "token_acc": 0.8258928571428571, "train_speed(iter/s)": 0.022687 }, { "epoch": 0.049342770475227504, "grad_norm": 4.130661487579346, "learning_rate": 9.838709677419356e-06, "loss": 0.6245087385177612, "memory(GiB)": 74.11, "step": 61, "token_acc": 0.8050847457627118, "train_speed(iter/s)": 0.02269 }, { "epoch": 0.05015166835187058, "grad_norm": 4.408290386199951, "learning_rate": 1e-05, "loss": 0.6183744668960571, "memory(GiB)": 74.11, "step": 62, "token_acc": 0.8229665071770335, "train_speed(iter/s)": 0.022694 }, { "epoch": 0.05096056622851365, "grad_norm": 3.7502522468566895, "learning_rate": 9.999982128386562e-06, "loss": 0.5600206851959229, "memory(GiB)": 74.11, "step": 63, "token_acc": 0.8364312267657993, "train_speed(iter/s)": 0.022695 }, { "epoch": 0.051769464105156725, "grad_norm": 4.595156669616699, "learning_rate": 9.999928513674004e-06, "loss": 0.5526872873306274, "memory(GiB)": 74.11, "step": 64, "token_acc": 0.8165467625899281, "train_speed(iter/s)": 0.022697 }, { "epoch": 0.0525783619817998, "grad_norm": 4.10991907119751, "learning_rate": 9.999839156245597e-06, "loss": 0.4983682632446289, "memory(GiB)": 74.11, "step": 65, "token_acc": 0.8742857142857143, "train_speed(iter/s)": 0.022698 }, { "epoch": 0.05338725985844287, "grad_norm": 4.291178226470947, "learning_rate": 9.99971405674013e-06, "loss": 0.6258913278579712, "memory(GiB)": 74.11, "step": 66, "token_acc": 0.8235294117647058, "train_speed(iter/s)": 0.0227 }, { "epoch": 0.054196157735085945, "grad_norm": 4.950540065765381, "learning_rate": 9.999553216051892e-06, "loss": 0.6055471897125244, "memory(GiB)": 74.11, "step": 67, "token_acc": 0.75, "train_speed(iter/s)": 0.0227 }, { "epoch": 0.05500505561172902, "grad_norm": 4.7848076820373535, "learning_rate": 9.999356635330675e-06, "loss": 0.5771285891532898, "memory(GiB)": 74.11, "step": 68, "token_acc": 0.8007518796992481, "train_speed(iter/s)": 0.022702 }, { "epoch": 0.05581395348837209, "grad_norm": 4.7233567237854, "learning_rate": 9.999124315981766e-06, "loss": 0.5602097511291504, "memory(GiB)": 74.11, "step": 69, "token_acc": 0.85, "train_speed(iter/s)": 0.022704 }, { "epoch": 0.056622851365015166, "grad_norm": 3.280118227005005, "learning_rate": 9.998856259665936e-06, "loss": 0.5948894023895264, "memory(GiB)": 74.11, "step": 70, "token_acc": 0.8597285067873304, "train_speed(iter/s)": 0.022706 }, { "epoch": 0.05743174924165824, "grad_norm": 3.6923129558563232, "learning_rate": 9.99855246829942e-06, "loss": 0.615454912185669, "memory(GiB)": 74.11, "step": 71, "token_acc": 0.7639484978540773, "train_speed(iter/s)": 0.022708 }, { "epoch": 0.05824064711830131, "grad_norm": 3.9682765007019043, "learning_rate": 9.99821294405392e-06, "loss": 0.6003280878067017, "memory(GiB)": 74.11, "step": 72, "token_acc": 0.8415492957746479, "train_speed(iter/s)": 0.02271 }, { "epoch": 0.059049544994944386, "grad_norm": 3.5200328826904297, "learning_rate": 9.99783768935657e-06, "loss": 0.5450583100318909, "memory(GiB)": 74.11, "step": 73, "token_acc": 0.8100358422939068, "train_speed(iter/s)": 0.022712 }, { "epoch": 0.05985844287158746, "grad_norm": 4.187544345855713, "learning_rate": 9.997426706889935e-06, "loss": 0.5230978727340698, "memory(GiB)": 74.11, "step": 74, "token_acc": 0.8472222222222222, "train_speed(iter/s)": 0.022714 }, { "epoch": 0.06066734074823053, "grad_norm": 3.5596694946289062, "learning_rate": 9.996979999591982e-06, "loss": 0.5269993543624878, "memory(GiB)": 74.11, "step": 75, "token_acc": 0.8168316831683168, "train_speed(iter/s)": 0.022715 }, { "epoch": 0.06147623862487361, "grad_norm": 3.213773012161255, "learning_rate": 9.996497570656063e-06, "loss": 0.5459144711494446, "memory(GiB)": 74.11, "step": 76, "token_acc": 0.7665198237885462, "train_speed(iter/s)": 0.022716 }, { "epoch": 0.06228513650151668, "grad_norm": 3.1109633445739746, "learning_rate": 9.995979423530893e-06, "loss": 0.5678860545158386, "memory(GiB)": 74.11, "step": 77, "token_acc": 0.8123076923076923, "train_speed(iter/s)": 0.022717 }, { "epoch": 0.06309403437815976, "grad_norm": 3.668972969055176, "learning_rate": 9.99542556192052e-06, "loss": 0.5075556039810181, "memory(GiB)": 74.11, "step": 78, "token_acc": 0.84, "train_speed(iter/s)": 0.022718 }, { "epoch": 0.06390293225480283, "grad_norm": 4.338983535766602, "learning_rate": 9.994835989784305e-06, "loss": 0.5242471098899841, "memory(GiB)": 74.11, "step": 79, "token_acc": 0.865, "train_speed(iter/s)": 0.022721 }, { "epoch": 0.06471183013144591, "grad_norm": 4.064675807952881, "learning_rate": 9.99421071133689e-06, "loss": 0.6131962537765503, "memory(GiB)": 74.11, "step": 80, "token_acc": 0.7767857142857143, "train_speed(iter/s)": 0.022722 }, { "epoch": 0.06552072800808897, "grad_norm": 3.6171154975891113, "learning_rate": 9.993549731048171e-06, "loss": 0.5887628197669983, "memory(GiB)": 74.11, "step": 81, "token_acc": 0.7992125984251969, "train_speed(iter/s)": 0.022724 }, { "epoch": 0.06632962588473205, "grad_norm": 3.9707374572753906, "learning_rate": 9.992853053643257e-06, "loss": 0.5989000201225281, "memory(GiB)": 74.11, "step": 82, "token_acc": 0.8346456692913385, "train_speed(iter/s)": 0.022726 }, { "epoch": 0.06713852376137512, "grad_norm": 4.361082077026367, "learning_rate": 9.992120684102453e-06, "loss": 0.6060096025466919, "memory(GiB)": 74.11, "step": 83, "token_acc": 0.8148148148148148, "train_speed(iter/s)": 0.022728 }, { "epoch": 0.0679474216380182, "grad_norm": 3.9677209854125977, "learning_rate": 9.991352627661205e-06, "loss": 0.5200193524360657, "memory(GiB)": 74.11, "step": 84, "token_acc": 0.8506787330316742, "train_speed(iter/s)": 0.022729 }, { "epoch": 0.06875631951466127, "grad_norm": 3.435011863708496, "learning_rate": 9.990548889810078e-06, "loss": 0.6048153638839722, "memory(GiB)": 74.11, "step": 85, "token_acc": 0.8391608391608392, "train_speed(iter/s)": 0.022731 }, { "epoch": 0.06956521739130435, "grad_norm": 3.5457801818847656, "learning_rate": 9.989709476294708e-06, "loss": 0.5572282671928406, "memory(GiB)": 74.11, "step": 86, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 0.022732 }, { "epoch": 0.07037411526794742, "grad_norm": 3.885216474533081, "learning_rate": 9.988834393115768e-06, "loss": 0.5753508806228638, "memory(GiB)": 74.11, "step": 87, "token_acc": 0.8823529411764706, "train_speed(iter/s)": 0.022732 }, { "epoch": 0.0711830131445905, "grad_norm": 3.5327308177948, "learning_rate": 9.987923646528911e-06, "loss": 0.5835089683532715, "memory(GiB)": 74.11, "step": 88, "token_acc": 0.8226221079691517, "train_speed(iter/s)": 0.022731 }, { "epoch": 0.07199191102123358, "grad_norm": 3.0550527572631836, "learning_rate": 9.986977243044747e-06, "loss": 0.5215576887130737, "memory(GiB)": 74.11, "step": 89, "token_acc": 0.8870292887029289, "train_speed(iter/s)": 0.022732 }, { "epoch": 0.07280080889787664, "grad_norm": 3.0193593502044678, "learning_rate": 9.985995189428775e-06, "loss": 0.4884870648384094, "memory(GiB)": 74.11, "step": 90, "token_acc": 0.8713235294117647, "train_speed(iter/s)": 0.022732 }, { "epoch": 0.07360970677451972, "grad_norm": 3.2098543643951416, "learning_rate": 9.984977492701351e-06, "loss": 0.5010548233985901, "memory(GiB)": 74.11, "step": 91, "token_acc": 0.8104575163398693, "train_speed(iter/s)": 0.022734 }, { "epoch": 0.07441860465116279, "grad_norm": 3.6859188079833984, "learning_rate": 9.983924160137627e-06, "loss": 0.5493002533912659, "memory(GiB)": 74.11, "step": 92, "token_acc": 0.7937743190661478, "train_speed(iter/s)": 0.022734 }, { "epoch": 0.07522750252780587, "grad_norm": 3.2814273834228516, "learning_rate": 9.982835199267502e-06, "loss": 0.6033581495285034, "memory(GiB)": 74.11, "step": 93, "token_acc": 0.8416666666666667, "train_speed(iter/s)": 0.022734 }, { "epoch": 0.07603640040444894, "grad_norm": 3.5553441047668457, "learning_rate": 9.981710617875575e-06, "loss": 0.6103281378746033, "memory(GiB)": 74.11, "step": 94, "token_acc": 0.7589285714285714, "train_speed(iter/s)": 0.022736 }, { "epoch": 0.07684529828109202, "grad_norm": 3.5121068954467773, "learning_rate": 9.980550424001077e-06, "loss": 0.5484324097633362, "memory(GiB)": 74.11, "step": 95, "token_acc": 0.8211678832116789, "train_speed(iter/s)": 0.022736 }, { "epoch": 0.07765419615773508, "grad_norm": 2.6635591983795166, "learning_rate": 9.979354625937821e-06, "loss": 0.509511411190033, "memory(GiB)": 74.11, "step": 96, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022736 }, { "epoch": 0.07846309403437816, "grad_norm": 3.5615248680114746, "learning_rate": 9.978123232234147e-06, "loss": 0.5271934270858765, "memory(GiB)": 74.11, "step": 97, "token_acc": 0.815625, "train_speed(iter/s)": 0.022737 }, { "epoch": 0.07927199191102123, "grad_norm": 4.439089775085449, "learning_rate": 9.976856251692851e-06, "loss": 0.5473837852478027, "memory(GiB)": 74.11, "step": 98, "token_acc": 0.843558282208589, "train_speed(iter/s)": 0.022738 }, { "epoch": 0.08008088978766431, "grad_norm": 3.3765029907226562, "learning_rate": 9.975553693371124e-06, "loss": 0.572515070438385, "memory(GiB)": 74.11, "step": 99, "token_acc": 0.8262411347517731, "train_speed(iter/s)": 0.022739 }, { "epoch": 0.08088978766430738, "grad_norm": 3.8845911026000977, "learning_rate": 9.974215566580499e-06, "loss": 0.5989265441894531, "memory(GiB)": 74.11, "step": 100, "token_acc": 0.8562091503267973, "train_speed(iter/s)": 0.022741 }, { "epoch": 0.08169868554095046, "grad_norm": 3.336557626724243, "learning_rate": 9.972841880886766e-06, "loss": 0.5662233829498291, "memory(GiB)": 74.11, "step": 101, "token_acc": 0.8298755186721992, "train_speed(iter/s)": 0.022741 }, { "epoch": 0.08250758341759352, "grad_norm": 2.8836798667907715, "learning_rate": 9.971432646109919e-06, "loss": 0.44332605600357056, "memory(GiB)": 74.11, "step": 102, "token_acc": 0.8586572438162544, "train_speed(iter/s)": 0.022742 }, { "epoch": 0.0833164812942366, "grad_norm": 4.133236885070801, "learning_rate": 9.969987872324076e-06, "loss": 0.5478776693344116, "memory(GiB)": 74.11, "step": 103, "token_acc": 0.8424908424908425, "train_speed(iter/s)": 0.022743 }, { "epoch": 0.08412537917087967, "grad_norm": 4.5403828620910645, "learning_rate": 9.968507569857413e-06, "loss": 0.5256601572036743, "memory(GiB)": 74.11, "step": 104, "token_acc": 0.7886178861788617, "train_speed(iter/s)": 0.022744 }, { "epoch": 0.08493427704752275, "grad_norm": 3.083695888519287, "learning_rate": 9.966991749292088e-06, "loss": 0.560812771320343, "memory(GiB)": 74.11, "step": 105, "token_acc": 0.8056537102473498, "train_speed(iter/s)": 0.022745 }, { "epoch": 0.08574317492416582, "grad_norm": 2.619795083999634, "learning_rate": 9.965440421464163e-06, "loss": 0.5007873773574829, "memory(GiB)": 74.11, "step": 106, "token_acc": 0.8132295719844358, "train_speed(iter/s)": 0.022745 }, { "epoch": 0.0865520728008089, "grad_norm": 3.6254372596740723, "learning_rate": 9.963853597463533e-06, "loss": 0.49696582555770874, "memory(GiB)": 74.11, "step": 107, "token_acc": 0.846441947565543, "train_speed(iter/s)": 0.022747 }, { "epoch": 0.08736097067745197, "grad_norm": 3.388469934463501, "learning_rate": 9.962231288633838e-06, "loss": 0.4739895462989807, "memory(GiB)": 74.11, "step": 108, "token_acc": 0.84, "train_speed(iter/s)": 0.022748 }, { "epoch": 0.08816986855409505, "grad_norm": 2.8459818363189697, "learning_rate": 9.960573506572391e-06, "loss": 0.46099379658699036, "memory(GiB)": 74.11, "step": 109, "token_acc": 0.821875, "train_speed(iter/s)": 0.022748 }, { "epoch": 0.08897876643073811, "grad_norm": 3.143099546432495, "learning_rate": 9.958880263130084e-06, "loss": 0.48788702487945557, "memory(GiB)": 74.11, "step": 110, "token_acc": 0.8125, "train_speed(iter/s)": 0.022748 }, { "epoch": 0.0897876643073812, "grad_norm": 3.5926871299743652, "learning_rate": 9.957151570411317e-06, "loss": 0.5500156879425049, "memory(GiB)": 74.11, "step": 111, "token_acc": 0.8222222222222222, "train_speed(iter/s)": 0.022748 }, { "epoch": 0.09059656218402427, "grad_norm": 5.149491310119629, "learning_rate": 9.955387440773902e-06, "loss": 0.5181611776351929, "memory(GiB)": 74.11, "step": 112, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022749 }, { "epoch": 0.09140546006066734, "grad_norm": 4.696843147277832, "learning_rate": 9.953587886828973e-06, "loss": 0.5575085282325745, "memory(GiB)": 74.11, "step": 113, "token_acc": 0.7924528301886793, "train_speed(iter/s)": 0.02275 }, { "epoch": 0.09221435793731042, "grad_norm": 4.4397053718566895, "learning_rate": 9.951752921440904e-06, "loss": 0.5986132621765137, "memory(GiB)": 74.11, "step": 114, "token_acc": 0.8097560975609757, "train_speed(iter/s)": 0.022749 }, { "epoch": 0.09302325581395349, "grad_norm": 3.5311803817749023, "learning_rate": 9.949882557727215e-06, "loss": 0.47439733147621155, "memory(GiB)": 74.11, "step": 115, "token_acc": 0.8576779026217228, "train_speed(iter/s)": 0.022749 }, { "epoch": 0.09383215369059657, "grad_norm": 4.034605503082275, "learning_rate": 9.947976809058468e-06, "loss": 0.52587890625, "memory(GiB)": 74.11, "step": 116, "token_acc": 0.8388888888888889, "train_speed(iter/s)": 0.02275 }, { "epoch": 0.09464105156723963, "grad_norm": 2.4622230529785156, "learning_rate": 9.946035689058189e-06, "loss": 0.5111696720123291, "memory(GiB)": 74.11, "step": 117, "token_acc": 0.8653846153846154, "train_speed(iter/s)": 0.02275 }, { "epoch": 0.09544994944388271, "grad_norm": 4.2029523849487305, "learning_rate": 9.944059211602752e-06, "loss": 0.644461452960968, "memory(GiB)": 74.11, "step": 118, "token_acc": 0.8391304347826087, "train_speed(iter/s)": 0.02275 }, { "epoch": 0.09625884732052578, "grad_norm": 3.6433732509613037, "learning_rate": 9.942047390821296e-06, "loss": 0.529866099357605, "memory(GiB)": 74.11, "step": 119, "token_acc": 0.8287671232876712, "train_speed(iter/s)": 0.02275 }, { "epoch": 0.09706774519716886, "grad_norm": 2.930225133895874, "learning_rate": 9.940000241095616e-06, "loss": 0.53721022605896, "memory(GiB)": 74.11, "step": 120, "token_acc": 0.8637873754152824, "train_speed(iter/s)": 0.022751 }, { "epoch": 0.09787664307381193, "grad_norm": 3.059379816055298, "learning_rate": 9.937917777060057e-06, "loss": 0.5285677909851074, "memory(GiB)": 74.11, "step": 121, "token_acc": 0.7914691943127962, "train_speed(iter/s)": 0.022752 }, { "epoch": 0.09868554095045501, "grad_norm": 3.1179027557373047, "learning_rate": 9.935800013601415e-06, "loss": 0.543626606464386, "memory(GiB)": 74.11, "step": 122, "token_acc": 0.8638132295719845, "train_speed(iter/s)": 0.022751 }, { "epoch": 0.09949443882709808, "grad_norm": 2.9850940704345703, "learning_rate": 9.933646965858832e-06, "loss": 0.5759721994400024, "memory(GiB)": 74.11, "step": 123, "token_acc": 0.8392857142857143, "train_speed(iter/s)": 0.022752 }, { "epoch": 0.10030333670374116, "grad_norm": 3.2056992053985596, "learning_rate": 9.931458649223683e-06, "loss": 0.5128383636474609, "memory(GiB)": 74.11, "step": 124, "token_acc": 0.8404255319148937, "train_speed(iter/s)": 0.022753 }, { "epoch": 0.10111223458038422, "grad_norm": 3.4550704956054688, "learning_rate": 9.929235079339466e-06, "loss": 0.4931023418903351, "memory(GiB)": 74.11, "step": 125, "token_acc": 0.7634069400630915, "train_speed(iter/s)": 0.022754 }, { "epoch": 0.1019211324570273, "grad_norm": 4.975637912750244, "learning_rate": 9.926976272101693e-06, "loss": 0.5036507844924927, "memory(GiB)": 74.11, "step": 126, "token_acc": 0.8422818791946308, "train_speed(iter/s)": 0.022754 }, { "epoch": 0.10273003033367037, "grad_norm": 3.2330217361450195, "learning_rate": 9.92468224365778e-06, "loss": 0.4464947581291199, "memory(GiB)": 74.11, "step": 127, "token_acc": 0.8804347826086957, "train_speed(iter/s)": 0.022754 }, { "epoch": 0.10353892821031345, "grad_norm": 2.581622362136841, "learning_rate": 9.922353010406918e-06, "loss": 0.5149933695793152, "memory(GiB)": 74.11, "step": 128, "token_acc": 0.8318181818181818, "train_speed(iter/s)": 0.022755 }, { "epoch": 0.10434782608695652, "grad_norm": 2.6486399173736572, "learning_rate": 9.919988588999971e-06, "loss": 0.5142784118652344, "memory(GiB)": 74.11, "step": 129, "token_acc": 0.8621908127208481, "train_speed(iter/s)": 0.022756 }, { "epoch": 0.1051567239635996, "grad_norm": 3.3094420433044434, "learning_rate": 9.917588996339352e-06, "loss": 0.5297855734825134, "memory(GiB)": 74.11, "step": 130, "token_acc": 0.8177339901477833, "train_speed(iter/s)": 0.022757 }, { "epoch": 0.10596562184024266, "grad_norm": 2.769592046737671, "learning_rate": 9.915154249578894e-06, "loss": 0.5081691145896912, "memory(GiB)": 74.11, "step": 131, "token_acc": 0.8755364806866953, "train_speed(iter/s)": 0.022758 }, { "epoch": 0.10677451971688574, "grad_norm": 2.8748629093170166, "learning_rate": 9.91268436612374e-06, "loss": 0.5512316823005676, "memory(GiB)": 74.11, "step": 132, "token_acc": 0.8618181818181818, "train_speed(iter/s)": 0.022757 }, { "epoch": 0.10758341759352881, "grad_norm": 3.3325603008270264, "learning_rate": 9.91017936363021e-06, "loss": 0.48270368576049805, "memory(GiB)": 74.11, "step": 133, "token_acc": 0.8526315789473684, "train_speed(iter/s)": 0.022757 }, { "epoch": 0.10839231547017189, "grad_norm": 4.002824783325195, "learning_rate": 9.907639260005682e-06, "loss": 0.48671406507492065, "memory(GiB)": 74.11, "step": 134, "token_acc": 0.8547717842323651, "train_speed(iter/s)": 0.022758 }, { "epoch": 0.10920121334681497, "grad_norm": 3.655064344406128, "learning_rate": 9.90506407340845e-06, "loss": 0.5502010583877563, "memory(GiB)": 74.11, "step": 135, "token_acc": 0.7976190476190477, "train_speed(iter/s)": 0.022758 }, { "epoch": 0.11001011122345804, "grad_norm": 3.198472023010254, "learning_rate": 9.902453822247615e-06, "loss": 0.47892680764198303, "memory(GiB)": 74.11, "step": 136, "token_acc": 0.8318965517241379, "train_speed(iter/s)": 0.022759 }, { "epoch": 0.11081900910010112, "grad_norm": 2.7282052040100098, "learning_rate": 9.899808525182935e-06, "loss": 0.49719753861427307, "memory(GiB)": 74.11, "step": 137, "token_acc": 0.8417508417508418, "train_speed(iter/s)": 0.022759 }, { "epoch": 0.11162790697674418, "grad_norm": 3.089430093765259, "learning_rate": 9.897128201124699e-06, "loss": 0.532843291759491, "memory(GiB)": 74.11, "step": 138, "token_acc": 0.8152173913043478, "train_speed(iter/s)": 0.022759 }, { "epoch": 0.11243680485338726, "grad_norm": 6.901391983032227, "learning_rate": 9.894412869233597e-06, "loss": 0.5238447189331055, "memory(GiB)": 74.11, "step": 139, "token_acc": 0.8558558558558559, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11324570273003033, "grad_norm": 3.125302791595459, "learning_rate": 9.89166254892057e-06, "loss": 0.5573660135269165, "memory(GiB)": 74.11, "step": 140, "token_acc": 0.8068181818181818, "train_speed(iter/s)": 0.022761 }, { "epoch": 0.11405460060667341, "grad_norm": 3.38075852394104, "learning_rate": 9.888877259846686e-06, "loss": 0.524215817451477, "memory(GiB)": 74.33, "step": 141, "token_acc": 0.8505338078291815, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11486349848331648, "grad_norm": 3.413461446762085, "learning_rate": 9.886057021922984e-06, "loss": 0.49190688133239746, "memory(GiB)": 74.33, "step": 142, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11567239635995956, "grad_norm": 4.181169509887695, "learning_rate": 9.88320185531035e-06, "loss": 0.542352557182312, "memory(GiB)": 74.33, "step": 143, "token_acc": 0.8503649635036497, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11648129423660263, "grad_norm": 2.688110828399658, "learning_rate": 9.880311780419353e-06, "loss": 0.5551398992538452, "memory(GiB)": 74.33, "step": 144, "token_acc": 0.8007246376811594, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.1172901921132457, "grad_norm": 3.9851884841918945, "learning_rate": 9.877386817910118e-06, "loss": 0.49384480714797974, "memory(GiB)": 74.33, "step": 145, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11809908998988877, "grad_norm": 2.6871986389160156, "learning_rate": 9.874426988692163e-06, "loss": 0.5515081286430359, "memory(GiB)": 74.33, "step": 146, "token_acc": 0.8006644518272426, "train_speed(iter/s)": 0.02276 }, { "epoch": 0.11890798786653185, "grad_norm": 2.288706064224243, "learning_rate": 9.871432313924255e-06, "loss": 0.4420849084854126, "memory(GiB)": 74.33, "step": 147, "token_acc": 0.8091872791519434, "train_speed(iter/s)": 0.022761 }, { "epoch": 0.11971688574317492, "grad_norm": 2.6680195331573486, "learning_rate": 9.868402815014266e-06, "loss": 0.4678765833377838, "memory(GiB)": 74.33, "step": 148, "token_acc": 0.8604651162790697, "train_speed(iter/s)": 0.022761 }, { "epoch": 0.120525783619818, "grad_norm": 2.3895063400268555, "learning_rate": 9.865338513619005e-06, "loss": 0.4832306504249573, "memory(GiB)": 74.33, "step": 149, "token_acc": 0.8480565371024735, "train_speed(iter/s)": 0.022761 }, { "epoch": 0.12133468149646107, "grad_norm": 2.4143781661987305, "learning_rate": 9.86223943164408e-06, "loss": 0.49357208609580994, "memory(GiB)": 74.33, "step": 150, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022762 }, { "epoch": 0.12214357937310415, "grad_norm": 3.0790457725524902, "learning_rate": 9.859105591243728e-06, "loss": 0.4809868633747101, "memory(GiB)": 74.33, "step": 151, "token_acc": 0.8617021276595744, "train_speed(iter/s)": 0.022762 }, { "epoch": 0.12295247724974721, "grad_norm": 3.636885643005371, "learning_rate": 9.85593701482066e-06, "loss": 0.5743482112884521, "memory(GiB)": 74.33, "step": 152, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022763 }, { "epoch": 0.1237613751263903, "grad_norm": 2.7628660202026367, "learning_rate": 9.85273372502591e-06, "loss": 0.46740931272506714, "memory(GiB)": 74.33, "step": 153, "token_acc": 0.8658008658008658, "train_speed(iter/s)": 0.022763 }, { "epoch": 0.12457027300303336, "grad_norm": 3.155374765396118, "learning_rate": 9.849495744758654e-06, "loss": 0.5438951253890991, "memory(GiB)": 74.33, "step": 154, "token_acc": 0.8550185873605948, "train_speed(iter/s)": 0.022763 }, { "epoch": 0.12537917087967643, "grad_norm": 2.9564826488494873, "learning_rate": 9.846223097166072e-06, "loss": 0.537287175655365, "memory(GiB)": 74.33, "step": 155, "token_acc": 0.8456140350877193, "train_speed(iter/s)": 0.022764 }, { "epoch": 0.12618806875631952, "grad_norm": 2.997941017150879, "learning_rate": 9.842915805643156e-06, "loss": 0.4728841781616211, "memory(GiB)": 74.33, "step": 156, "token_acc": 0.9, "train_speed(iter/s)": 0.022764 }, { "epoch": 0.1269969666329626, "grad_norm": 4.7811431884765625, "learning_rate": 9.839573893832564e-06, "loss": 0.48365718126296997, "memory(GiB)": 74.33, "step": 157, "token_acc": 0.8501742160278746, "train_speed(iter/s)": 0.022764 }, { "epoch": 0.12780586450960565, "grad_norm": 2.611847400665283, "learning_rate": 9.836197385624434e-06, "loss": 0.4837043285369873, "memory(GiB)": 74.33, "step": 158, "token_acc": 0.8952879581151832, "train_speed(iter/s)": 0.022765 }, { "epoch": 0.12861476238624875, "grad_norm": 3.331645965576172, "learning_rate": 9.83278630515623e-06, "loss": 0.5694408416748047, "memory(GiB)": 74.33, "step": 159, "token_acc": 0.8177966101694916, "train_speed(iter/s)": 0.022765 }, { "epoch": 0.12942366026289182, "grad_norm": 3.4143426418304443, "learning_rate": 9.829340676812553e-06, "loss": 0.5614443421363831, "memory(GiB)": 74.33, "step": 160, "token_acc": 0.8487972508591065, "train_speed(iter/s)": 0.022765 }, { "epoch": 0.13023255813953488, "grad_norm": 2.541956901550293, "learning_rate": 9.825860525224982e-06, "loss": 0.48274075984954834, "memory(GiB)": 74.33, "step": 161, "token_acc": 0.8207885304659498, "train_speed(iter/s)": 0.022765 }, { "epoch": 0.13104145601617795, "grad_norm": 2.933729410171509, "learning_rate": 9.822345875271884e-06, "loss": 0.47431913018226624, "memory(GiB)": 74.33, "step": 162, "token_acc": 0.8713450292397661, "train_speed(iter/s)": 0.022766 }, { "epoch": 0.13185035389282104, "grad_norm": 2.8055856227874756, "learning_rate": 9.818796752078246e-06, "loss": 0.5554227232933044, "memory(GiB)": 74.33, "step": 163, "token_acc": 0.8627450980392157, "train_speed(iter/s)": 0.022766 }, { "epoch": 0.1326592517694641, "grad_norm": 2.662719488143921, "learning_rate": 9.815213181015489e-06, "loss": 0.4458203911781311, "memory(GiB)": 74.33, "step": 164, "token_acc": 0.8825622775800712, "train_speed(iter/s)": 0.022766 }, { "epoch": 0.13346814964610718, "grad_norm": 5.495974540710449, "learning_rate": 9.811595187701296e-06, "loss": 0.4638062119483948, "memory(GiB)": 74.33, "step": 165, "token_acc": 0.8227848101265823, "train_speed(iter/s)": 0.022766 }, { "epoch": 0.13427704752275024, "grad_norm": 84.01348114013672, "learning_rate": 9.807942797999412e-06, "loss": 0.6657401323318481, "memory(GiB)": 74.33, "step": 166, "token_acc": 0.8483606557377049, "train_speed(iter/s)": 0.022767 }, { "epoch": 0.13508594539939334, "grad_norm": 138.69554138183594, "learning_rate": 9.804256038019482e-06, "loss": 0.6723936796188354, "memory(GiB)": 74.33, "step": 167, "token_acc": 0.8143712574850299, "train_speed(iter/s)": 0.022767 }, { "epoch": 0.1358948432760364, "grad_norm": 11.966114044189453, "learning_rate": 9.800534934116843e-06, "loss": 0.5228875875473022, "memory(GiB)": 74.33, "step": 168, "token_acc": 0.8411552346570397, "train_speed(iter/s)": 0.022768 }, { "epoch": 0.13670374115267947, "grad_norm": 3.311744451522827, "learning_rate": 9.796779512892346e-06, "loss": 0.5082340240478516, "memory(GiB)": 74.33, "step": 169, "token_acc": 0.8514056224899599, "train_speed(iter/s)": 0.022768 }, { "epoch": 0.13751263902932254, "grad_norm": 2.891026735305786, "learning_rate": 9.792989801192167e-06, "loss": 0.4903358519077301, "memory(GiB)": 74.33, "step": 170, "token_acc": 0.8439490445859873, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.13832153690596563, "grad_norm": 2.643505096435547, "learning_rate": 9.789165826107612e-06, "loss": 0.514635443687439, "memory(GiB)": 74.33, "step": 171, "token_acc": 0.8709677419354839, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.1391304347826087, "grad_norm": 2.8423476219177246, "learning_rate": 9.785307614974922e-06, "loss": 0.5150923728942871, "memory(GiB)": 74.33, "step": 172, "token_acc": 0.796875, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.13993933265925176, "grad_norm": 2.4324862957000732, "learning_rate": 9.781415195375078e-06, "loss": 0.4808637797832489, "memory(GiB)": 74.33, "step": 173, "token_acc": 0.8296529968454258, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.14074823053589483, "grad_norm": 2.2403547763824463, "learning_rate": 9.77748859513361e-06, "loss": 0.4378691017627716, "memory(GiB)": 74.33, "step": 174, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.14155712841253792, "grad_norm": 2.552274703979492, "learning_rate": 9.77352784232039e-06, "loss": 0.4910467565059662, "memory(GiB)": 74.33, "step": 175, "token_acc": 0.8369565217391305, "train_speed(iter/s)": 0.022769 }, { "epoch": 0.142366026289181, "grad_norm": 2.844341278076172, "learning_rate": 9.769532965249435e-06, "loss": 0.5578226447105408, "memory(GiB)": 74.33, "step": 176, "token_acc": 0.8274509803921568, "train_speed(iter/s)": 0.02277 }, { "epoch": 0.14317492416582406, "grad_norm": 2.700742483139038, "learning_rate": 9.765503992478704e-06, "loss": 0.4441274404525757, "memory(GiB)": 74.33, "step": 177, "token_acc": 0.8543689320388349, "train_speed(iter/s)": 0.02277 }, { "epoch": 0.14398382204246715, "grad_norm": 2.824364185333252, "learning_rate": 9.761440952809897e-06, "loss": 0.5075165033340454, "memory(GiB)": 74.33, "step": 178, "token_acc": 0.8222222222222222, "train_speed(iter/s)": 0.022771 }, { "epoch": 0.14479271991911022, "grad_norm": 3.220512628555298, "learning_rate": 9.757343875288242e-06, "loss": 0.47000789642333984, "memory(GiB)": 74.33, "step": 179, "token_acc": 0.828125, "train_speed(iter/s)": 0.022772 }, { "epoch": 0.14560161779575329, "grad_norm": 2.345557689666748, "learning_rate": 9.75321278920229e-06, "loss": 0.5143015384674072, "memory(GiB)": 74.33, "step": 180, "token_acc": 0.8530465949820788, "train_speed(iter/s)": 0.022771 }, { "epoch": 0.14641051567239635, "grad_norm": 3.0752451419830322, "learning_rate": 9.749047724083717e-06, "loss": 0.5505862236022949, "memory(GiB)": 74.33, "step": 181, "token_acc": 0.8475609756097561, "train_speed(iter/s)": 0.022772 }, { "epoch": 0.14721941354903945, "grad_norm": 2.662064552307129, "learning_rate": 9.74484870970709e-06, "loss": 0.5013206601142883, "memory(GiB)": 74.33, "step": 182, "token_acc": 0.873015873015873, "train_speed(iter/s)": 0.022772 }, { "epoch": 0.1480283114256825, "grad_norm": 3.027050256729126, "learning_rate": 9.74061577608968e-06, "loss": 0.554660439491272, "memory(GiB)": 74.33, "step": 183, "token_acc": 0.8388278388278388, "train_speed(iter/s)": 0.022772 }, { "epoch": 0.14883720930232558, "grad_norm": 3.55436635017395, "learning_rate": 9.736348953491224e-06, "loss": 0.5106396675109863, "memory(GiB)": 74.33, "step": 184, "token_acc": 0.797979797979798, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.14964610717896865, "grad_norm": 3.821077585220337, "learning_rate": 9.732048272413725e-06, "loss": 0.5329099297523499, "memory(GiB)": 74.33, "step": 185, "token_acc": 0.8278388278388278, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.15045500505561174, "grad_norm": 2.861586332321167, "learning_rate": 9.727713763601226e-06, "loss": 0.48308447003364563, "memory(GiB)": 74.33, "step": 186, "token_acc": 0.8556701030927835, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.1512639029322548, "grad_norm": 3.025512456893921, "learning_rate": 9.723345458039595e-06, "loss": 0.4873977601528168, "memory(GiB)": 74.33, "step": 187, "token_acc": 0.8426573426573427, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.15207280080889787, "grad_norm": 2.5745112895965576, "learning_rate": 9.718943386956298e-06, "loss": 0.538512110710144, "memory(GiB)": 74.33, "step": 188, "token_acc": 0.8155339805825242, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.15288169868554094, "grad_norm": 2.985320806503296, "learning_rate": 9.714507581820181e-06, "loss": 0.5343044400215149, "memory(GiB)": 74.33, "step": 189, "token_acc": 0.7977099236641222, "train_speed(iter/s)": 0.022773 }, { "epoch": 0.15369059656218403, "grad_norm": 3.339107036590576, "learning_rate": 9.71003807434124e-06, "loss": 0.5087035298347473, "memory(GiB)": 74.33, "step": 190, "token_acc": 0.8478260869565217, "train_speed(iter/s)": 0.022774 }, { "epoch": 0.1544994944388271, "grad_norm": 2.712999105453491, "learning_rate": 9.705534896470401e-06, "loss": 0.4998268783092499, "memory(GiB)": 74.33, "step": 191, "token_acc": 0.8514056224899599, "train_speed(iter/s)": 0.022774 }, { "epoch": 0.15530839231547017, "grad_norm": 3.6283011436462402, "learning_rate": 9.700998080399287e-06, "loss": 0.4922446608543396, "memory(GiB)": 74.33, "step": 192, "token_acc": 0.810126582278481, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.15611729019211323, "grad_norm": 2.546504020690918, "learning_rate": 9.696427658559983e-06, "loss": 0.5213550925254822, "memory(GiB)": 74.33, "step": 193, "token_acc": 0.8381294964028777, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.15692618806875633, "grad_norm": 3.0982861518859863, "learning_rate": 9.691823663624817e-06, "loss": 0.5097714066505432, "memory(GiB)": 74.33, "step": 194, "token_acc": 0.8066914498141264, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1577350859453994, "grad_norm": 2.8496217727661133, "learning_rate": 9.687186128506116e-06, "loss": 0.5594595074653625, "memory(GiB)": 74.33, "step": 195, "token_acc": 0.8622222222222222, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.15854398382204246, "grad_norm": 2.693981647491455, "learning_rate": 9.682515086355973e-06, "loss": 0.5774262547492981, "memory(GiB)": 74.33, "step": 196, "token_acc": 0.7975708502024291, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.15935288169868553, "grad_norm": 3.6492180824279785, "learning_rate": 9.677810570566011e-06, "loss": 0.5103310346603394, "memory(GiB)": 74.33, "step": 197, "token_acc": 0.8129032258064516, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16016177957532862, "grad_norm": 2.6552608013153076, "learning_rate": 9.673072614767147e-06, "loss": 0.4744953513145447, "memory(GiB)": 74.33, "step": 198, "token_acc": 0.8699186991869918, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1609706774519717, "grad_norm": 2.7724416255950928, "learning_rate": 9.668301252829344e-06, "loss": 0.4586220979690552, "memory(GiB)": 74.33, "step": 199, "token_acc": 0.8583690987124464, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16177957532861476, "grad_norm": 3.1484899520874023, "learning_rate": 9.663496518861381e-06, "loss": 0.6070712208747864, "memory(GiB)": 74.33, "step": 200, "token_acc": 0.8131313131313131, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.16258847320525785, "grad_norm": 4.5751142501831055, "learning_rate": 9.658658447210595e-06, "loss": 0.5579652786254883, "memory(GiB)": 74.33, "step": 201, "token_acc": 0.8524590163934426, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.16339737108190092, "grad_norm": 2.3848133087158203, "learning_rate": 9.653787072462644e-06, "loss": 0.47080251574516296, "memory(GiB)": 74.33, "step": 202, "token_acc": 0.9058823529411765, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16420626895854398, "grad_norm": 2.686843156814575, "learning_rate": 9.648882429441258e-06, "loss": 0.46535661816596985, "memory(GiB)": 74.33, "step": 203, "token_acc": 0.8138528138528138, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16501516683518705, "grad_norm": 3.4251608848571777, "learning_rate": 9.643944553207992e-06, "loss": 0.42402440309524536, "memory(GiB)": 74.33, "step": 204, "token_acc": 0.870722433460076, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16582406471183014, "grad_norm": 3.019339084625244, "learning_rate": 9.63897347906197e-06, "loss": 0.5313763618469238, "memory(GiB)": 74.33, "step": 205, "token_acc": 0.8062283737024222, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1666329625884732, "grad_norm": 2.4439475536346436, "learning_rate": 9.633969242539643e-06, "loss": 0.47857385873794556, "memory(GiB)": 74.33, "step": 206, "token_acc": 0.8204334365325078, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16744186046511628, "grad_norm": 2.991232395172119, "learning_rate": 9.628931879414519e-06, "loss": 0.5055133104324341, "memory(GiB)": 74.33, "step": 207, "token_acc": 0.8614864864864865, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.16825075834175934, "grad_norm": 2.8914828300476074, "learning_rate": 9.623861425696919e-06, "loss": 0.48094457387924194, "memory(GiB)": 74.33, "step": 208, "token_acc": 0.8517110266159695, "train_speed(iter/s)": 0.022774 }, { "epoch": 0.16905965621840244, "grad_norm": 3.07913875579834, "learning_rate": 9.618757917633724e-06, "loss": 0.4644262492656708, "memory(GiB)": 74.33, "step": 209, "token_acc": 0.8459016393442623, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1698685540950455, "grad_norm": 3.3538849353790283, "learning_rate": 9.6136213917081e-06, "loss": 0.49402916431427, "memory(GiB)": 74.33, "step": 210, "token_acc": 0.8023255813953488, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17067745197168857, "grad_norm": 2.8253116607666016, "learning_rate": 9.608451884639249e-06, "loss": 0.5242215394973755, "memory(GiB)": 74.33, "step": 211, "token_acc": 0.8426966292134831, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17148634984833164, "grad_norm": 3.1118881702423096, "learning_rate": 9.603249433382145e-06, "loss": 0.4387696385383606, "memory(GiB)": 74.33, "step": 212, "token_acc": 0.8384279475982532, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.17229524772497473, "grad_norm": 3.0564656257629395, "learning_rate": 9.598014075127267e-06, "loss": 0.4570474922657013, "memory(GiB)": 74.33, "step": 213, "token_acc": 0.8423423423423423, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1731041456016178, "grad_norm": 2.173403024673462, "learning_rate": 9.592745847300334e-06, "loss": 0.4705919027328491, "memory(GiB)": 74.33, "step": 214, "token_acc": 0.8900343642611683, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.17391304347826086, "grad_norm": 2.676457405090332, "learning_rate": 9.587444787562038e-06, "loss": 0.4593808650970459, "memory(GiB)": 74.33, "step": 215, "token_acc": 0.8425655976676385, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.17472194135490393, "grad_norm": 2.6276440620422363, "learning_rate": 9.582110933807778e-06, "loss": 0.5120923519134521, "memory(GiB)": 74.33, "step": 216, "token_acc": 0.8402555910543131, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17553083923154703, "grad_norm": 2.9223127365112305, "learning_rate": 9.57674432416738e-06, "loss": 0.5409821271896362, "memory(GiB)": 74.33, "step": 217, "token_acc": 0.8786885245901639, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1763397371081901, "grad_norm": 2.7943737506866455, "learning_rate": 9.571344997004833e-06, "loss": 0.5195801854133606, "memory(GiB)": 74.33, "step": 218, "token_acc": 0.8904761904761904, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17714863498483316, "grad_norm": 3.1022114753723145, "learning_rate": 9.565912990918014e-06, "loss": 0.5200923085212708, "memory(GiB)": 74.33, "step": 219, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17795753286147623, "grad_norm": 2.570176124572754, "learning_rate": 9.560448344738409e-06, "loss": 0.5091375112533569, "memory(GiB)": 74.33, "step": 220, "token_acc": 0.823045267489712, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.17876643073811932, "grad_norm": 3.0033743381500244, "learning_rate": 9.554951097530833e-06, "loss": 0.4781090021133423, "memory(GiB)": 74.33, "step": 221, "token_acc": 0.8544061302681992, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1795753286147624, "grad_norm": 2.6318182945251465, "learning_rate": 9.549421288593157e-06, "loss": 0.4314906597137451, "memory(GiB)": 74.33, "step": 222, "token_acc": 0.8851063829787233, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.18038422649140545, "grad_norm": 2.8283627033233643, "learning_rate": 9.543858957456027e-06, "loss": 0.5246187448501587, "memory(GiB)": 74.33, "step": 223, "token_acc": 0.8140495867768595, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.18119312436804855, "grad_norm": 2.760436773300171, "learning_rate": 9.538264143882573e-06, "loss": 0.583112359046936, "memory(GiB)": 74.33, "step": 224, "token_acc": 0.8316831683168316, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.1820020222446916, "grad_norm": 2.844444513320923, "learning_rate": 9.532636887868132e-06, "loss": 0.5270188450813293, "memory(GiB)": 74.33, "step": 225, "token_acc": 0.8197424892703863, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.18281092012133468, "grad_norm": 3.431413173675537, "learning_rate": 9.526977229639967e-06, "loss": 0.6098812818527222, "memory(GiB)": 74.33, "step": 226, "token_acc": 0.8528138528138528, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.18361981799797775, "grad_norm": 3.651771068572998, "learning_rate": 9.521285209656964e-06, "loss": 0.5220578908920288, "memory(GiB)": 74.33, "step": 227, "token_acc": 0.8111888111888111, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.18442871587462084, "grad_norm": 2.586838960647583, "learning_rate": 9.515560868609353e-06, "loss": 0.5361062288284302, "memory(GiB)": 74.33, "step": 228, "token_acc": 0.8318181818181818, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.1852376137512639, "grad_norm": 3.409284830093384, "learning_rate": 9.509804247418421e-06, "loss": 0.5047948360443115, "memory(GiB)": 74.33, "step": 229, "token_acc": 0.83, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.18604651162790697, "grad_norm": 2.8747854232788086, "learning_rate": 9.504015387236215e-06, "loss": 0.4199560880661011, "memory(GiB)": 74.33, "step": 230, "token_acc": 0.8304347826086956, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.18685540950455004, "grad_norm": 3.537949800491333, "learning_rate": 9.498194329445235e-06, "loss": 0.48431631922721863, "memory(GiB)": 74.33, "step": 231, "token_acc": 0.8588957055214724, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.18766430738119314, "grad_norm": 2.270864486694336, "learning_rate": 9.492341115658167e-06, "loss": 0.43944597244262695, "memory(GiB)": 74.33, "step": 232, "token_acc": 0.8387096774193549, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1884732052578362, "grad_norm": 2.3423984050750732, "learning_rate": 9.486455787717556e-06, "loss": 0.4949726462364197, "memory(GiB)": 74.33, "step": 233, "token_acc": 0.8244274809160306, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.18928210313447927, "grad_norm": 2.186225175857544, "learning_rate": 9.480538387695526e-06, "loss": 0.5247252583503723, "memory(GiB)": 74.33, "step": 234, "token_acc": 0.8256578947368421, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.19009100101112233, "grad_norm": 6.916714191436768, "learning_rate": 9.474588957893471e-06, "loss": 0.5562118291854858, "memory(GiB)": 74.33, "step": 235, "token_acc": 0.815668202764977, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.19089989888776543, "grad_norm": 2.669564962387085, "learning_rate": 9.468607540841755e-06, "loss": 0.4648740589618683, "memory(GiB)": 74.33, "step": 236, "token_acc": 0.8404255319148937, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.1917087967644085, "grad_norm": 2.7446367740631104, "learning_rate": 9.462594179299408e-06, "loss": 0.47179466485977173, "memory(GiB)": 74.33, "step": 237, "token_acc": 0.8296943231441049, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.19251769464105156, "grad_norm": 2.733185052871704, "learning_rate": 9.456548916253816e-06, "loss": 0.43457281589508057, "memory(GiB)": 74.33, "step": 238, "token_acc": 0.8382838283828383, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19332659251769463, "grad_norm": 2.792586326599121, "learning_rate": 9.450471794920425e-06, "loss": 0.5208027362823486, "memory(GiB)": 74.33, "step": 239, "token_acc": 0.8494623655913979, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19413549039433772, "grad_norm": 3.106788158416748, "learning_rate": 9.444362858742417e-06, "loss": 0.5069155693054199, "memory(GiB)": 74.33, "step": 240, "token_acc": 0.821917808219178, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.1949443882709808, "grad_norm": 2.545304298400879, "learning_rate": 9.438222151390413e-06, "loss": 0.48083266615867615, "memory(GiB)": 74.33, "step": 241, "token_acc": 0.875, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19575328614762386, "grad_norm": 2.3545124530792236, "learning_rate": 9.432049716762151e-06, "loss": 0.45232367515563965, "memory(GiB)": 74.33, "step": 242, "token_acc": 0.8584070796460177, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19656218402426692, "grad_norm": 2.424670934677124, "learning_rate": 9.425845598982178e-06, "loss": 0.46154850721359253, "memory(GiB)": 74.33, "step": 243, "token_acc": 0.8481848184818482, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19737108190091002, "grad_norm": 3.0621895790100098, "learning_rate": 9.419609842401529e-06, "loss": 0.5216662883758545, "memory(GiB)": 74.33, "step": 244, "token_acc": 0.8381742738589212, "train_speed(iter/s)": 0.022775 }, { "epoch": 0.19817997977755308, "grad_norm": 3.4800291061401367, "learning_rate": 9.41334249159742e-06, "loss": 0.578390896320343, "memory(GiB)": 74.33, "step": 245, "token_acc": 0.8411214953271028, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19898887765419615, "grad_norm": 2.887791633605957, "learning_rate": 9.407043591372917e-06, "loss": 0.45752766728401184, "memory(GiB)": 74.33, "step": 246, "token_acc": 0.8452830188679246, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.19979777553083924, "grad_norm": 2.991569995880127, "learning_rate": 9.400713186756625e-06, "loss": 0.47424283623695374, "memory(GiB)": 74.33, "step": 247, "token_acc": 0.8492063492063492, "train_speed(iter/s)": 0.022776 }, { "epoch": 0.2006066734074823, "grad_norm": 2.222763776779175, "learning_rate": 9.394351323002362e-06, "loss": 0.4558030366897583, "memory(GiB)": 74.33, "step": 248, "token_acc": 0.8471615720524017, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20141557128412538, "grad_norm": 2.18121075630188, "learning_rate": 9.387958045588837e-06, "loss": 0.47976818680763245, "memory(GiB)": 74.33, "step": 249, "token_acc": 0.8878048780487805, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20222446916076844, "grad_norm": 2.4463536739349365, "learning_rate": 9.381533400219319e-06, "loss": 0.42482298612594604, "memory(GiB)": 74.33, "step": 250, "token_acc": 0.8661971830985915, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20303336703741154, "grad_norm": 2.2221012115478516, "learning_rate": 9.375077432821322e-06, "loss": 0.4842270016670227, "memory(GiB)": 74.33, "step": 251, "token_acc": 0.8290909090909091, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.2038422649140546, "grad_norm": 2.4321460723876953, "learning_rate": 9.368590189546268e-06, "loss": 0.49549242854118347, "memory(GiB)": 74.33, "step": 252, "token_acc": 0.8470948012232415, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20465116279069767, "grad_norm": 2.9055986404418945, "learning_rate": 9.362071716769158e-06, "loss": 0.604824423789978, "memory(GiB)": 74.33, "step": 253, "token_acc": 0.8354430379746836, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20546006066734074, "grad_norm": 2.3008358478546143, "learning_rate": 9.355522061088242e-06, "loss": 0.43147778511047363, "memory(GiB)": 74.33, "step": 254, "token_acc": 0.8907563025210085, "train_speed(iter/s)": 0.022777 }, { "epoch": 0.20626895854398383, "grad_norm": 2.770148515701294, "learning_rate": 9.348941269324686e-06, "loss": 0.4882833957672119, "memory(GiB)": 74.33, "step": 255, "token_acc": 0.8423423423423423, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.2070778564206269, "grad_norm": 3.3866539001464844, "learning_rate": 9.342329388522239e-06, "loss": 0.5174039006233215, "memory(GiB)": 74.33, "step": 256, "token_acc": 0.825503355704698, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.20788675429726997, "grad_norm": 3.170250654220581, "learning_rate": 9.335686465946888e-06, "loss": 0.5126312971115112, "memory(GiB)": 74.33, "step": 257, "token_acc": 0.8125, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.20869565217391303, "grad_norm": 2.1758675575256348, "learning_rate": 9.32901254908653e-06, "loss": 0.3875027298927307, "memory(GiB)": 74.33, "step": 258, "token_acc": 0.7954545454545454, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.20950455005055613, "grad_norm": 2.4927093982696533, "learning_rate": 9.322307685650638e-06, "loss": 0.4708499312400818, "memory(GiB)": 74.33, "step": 259, "token_acc": 0.8743718592964824, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.2103134479271992, "grad_norm": 3.2660865783691406, "learning_rate": 9.315571923569892e-06, "loss": 0.48012182116508484, "memory(GiB)": 74.33, "step": 260, "token_acc": 0.8479087452471483, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.21112234580384226, "grad_norm": 2.607844829559326, "learning_rate": 9.308805310995877e-06, "loss": 0.4679752588272095, "memory(GiB)": 74.33, "step": 261, "token_acc": 0.813953488372093, "train_speed(iter/s)": 0.022778 }, { "epoch": 0.21193124368048533, "grad_norm": 2.9813013076782227, "learning_rate": 9.302007896300697e-06, "loss": 0.47132837772369385, "memory(GiB)": 74.33, "step": 262, "token_acc": 0.8687258687258688, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.21274014155712842, "grad_norm": 2.997264862060547, "learning_rate": 9.295179728076666e-06, "loss": 0.47330912947654724, "memory(GiB)": 74.33, "step": 263, "token_acc": 0.8465608465608465, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.2135490394337715, "grad_norm": 2.7569003105163574, "learning_rate": 9.288320855135936e-06, "loss": 0.5202451348304749, "memory(GiB)": 74.33, "step": 264, "token_acc": 0.8395061728395061, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.21435793731041455, "grad_norm": 3.455897569656372, "learning_rate": 9.281431326510153e-06, "loss": 0.5138571262359619, "memory(GiB)": 74.33, "step": 265, "token_acc": 0.8263888888888888, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.21516683518705762, "grad_norm": 2.402111291885376, "learning_rate": 9.27451119145012e-06, "loss": 0.4587266147136688, "memory(GiB)": 74.33, "step": 266, "token_acc": 0.8116591928251121, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.21597573306370071, "grad_norm": 2.7626912593841553, "learning_rate": 9.267560499425425e-06, "loss": 0.5164949893951416, "memory(GiB)": 74.33, "step": 267, "token_acc": 0.845771144278607, "train_speed(iter/s)": 0.022779 }, { "epoch": 0.21678463094034378, "grad_norm": 2.1381757259368896, "learning_rate": 9.2605793001241e-06, "loss": 0.47523602843284607, "memory(GiB)": 74.33, "step": 268, "token_acc": 0.8202247191011236, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.21759352881698685, "grad_norm": 3.386496067047119, "learning_rate": 9.253567643452263e-06, "loss": 0.5109878778457642, "memory(GiB)": 74.33, "step": 269, "token_acc": 0.8279569892473119, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.21840242669362994, "grad_norm": 3.036259889602661, "learning_rate": 9.246525579533765e-06, "loss": 0.47165533900260925, "memory(GiB)": 74.33, "step": 270, "token_acc": 0.8557046979865772, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.219211324570273, "grad_norm": 2.2953364849090576, "learning_rate": 9.239453158709822e-06, "loss": 0.452242374420166, "memory(GiB)": 74.33, "step": 271, "token_acc": 0.9050445103857567, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.22002022244691607, "grad_norm": 3.2290663719177246, "learning_rate": 9.232350431538656e-06, "loss": 0.5369592905044556, "memory(GiB)": 74.33, "step": 272, "token_acc": 0.8627450980392157, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.22082912032355914, "grad_norm": 2.628915786743164, "learning_rate": 9.225217448795155e-06, "loss": 0.46493035554885864, "memory(GiB)": 74.33, "step": 273, "token_acc": 0.8185483870967742, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.22163801820020224, "grad_norm": 2.308983325958252, "learning_rate": 9.218054261470477e-06, "loss": 0.462538480758667, "memory(GiB)": 74.33, "step": 274, "token_acc": 0.8456375838926175, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.2224469160768453, "grad_norm": 3.000230550765991, "learning_rate": 9.210860920771706e-06, "loss": 0.43489784002304077, "memory(GiB)": 74.33, "step": 275, "token_acc": 0.842741935483871, "train_speed(iter/s)": 0.02278 }, { "epoch": 0.22325581395348837, "grad_norm": 2.6025278568267822, "learning_rate": 9.203637478121492e-06, "loss": 0.46363720297813416, "memory(GiB)": 74.33, "step": 276, "token_acc": 0.8724489795918368, "train_speed(iter/s)": 0.022781 }, { "epoch": 0.22406471183013144, "grad_norm": 3.2257838249206543, "learning_rate": 9.196383985157657e-06, "loss": 0.46590667963027954, "memory(GiB)": 74.33, "step": 277, "token_acc": 0.8736842105263158, "train_speed(iter/s)": 0.022781 }, { "epoch": 0.22487360970677453, "grad_norm": 2.476445436477661, "learning_rate": 9.189100493732852e-06, "loss": 0.4720000624656677, "memory(GiB)": 74.33, "step": 278, "token_acc": 0.8990825688073395, "train_speed(iter/s)": 0.022781 }, { "epoch": 0.2256825075834176, "grad_norm": 1.9399663209915161, "learning_rate": 9.181787055914175e-06, "loss": 0.43296879529953003, "memory(GiB)": 74.33, "step": 279, "token_acc": 0.8297872340425532, "train_speed(iter/s)": 0.022782 }, { "epoch": 0.22649140546006066, "grad_norm": 2.530008554458618, "learning_rate": 9.1744437239828e-06, "loss": 0.43587636947631836, "memory(GiB)": 74.33, "step": 280, "token_acc": 0.8951612903225806, "train_speed(iter/s)": 0.022782 }, { "epoch": 0.22730030333670373, "grad_norm": 2.7868869304656982, "learning_rate": 9.167070550433604e-06, "loss": 0.3868146538734436, "memory(GiB)": 74.33, "step": 281, "token_acc": 0.8425925925925926, "train_speed(iter/s)": 0.022783 }, { "epoch": 0.22810920121334682, "grad_norm": 2.6715898513793945, "learning_rate": 9.159667587974786e-06, "loss": 0.40206730365753174, "memory(GiB)": 74.33, "step": 282, "token_acc": 0.8894736842105263, "train_speed(iter/s)": 0.022783 }, { "epoch": 0.2289180990899899, "grad_norm": 2.36309552192688, "learning_rate": 9.1522348895275e-06, "loss": 0.5806437730789185, "memory(GiB)": 74.33, "step": 283, "token_acc": 0.7923728813559322, "train_speed(iter/s)": 0.022783 }, { "epoch": 0.22972699696663296, "grad_norm": 2.1452529430389404, "learning_rate": 9.144772508225477e-06, "loss": 0.4016059339046478, "memory(GiB)": 74.33, "step": 284, "token_acc": 0.872093023255814, "train_speed(iter/s)": 0.022783 }, { "epoch": 0.23053589484327602, "grad_norm": 2.564225196838379, "learning_rate": 9.137280497414628e-06, "loss": 0.3909257650375366, "memory(GiB)": 74.33, "step": 285, "token_acc": 0.8805460750853242, "train_speed(iter/s)": 0.022783 }, { "epoch": 0.23134479271991912, "grad_norm": 2.211818218231201, "learning_rate": 9.129758910652684e-06, "loss": 0.4310418963432312, "memory(GiB)": 74.33, "step": 286, "token_acc": 0.8644859813084113, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.23215369059656218, "grad_norm": 3.1847712993621826, "learning_rate": 9.122207801708802e-06, "loss": 0.43590471148490906, "memory(GiB)": 74.33, "step": 287, "token_acc": 0.864, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.23296258847320525, "grad_norm": 2.477933406829834, "learning_rate": 9.114627224563182e-06, "loss": 0.4442121386528015, "memory(GiB)": 74.33, "step": 288, "token_acc": 0.884, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.23377148634984835, "grad_norm": 3.274622678756714, "learning_rate": 9.10701723340668e-06, "loss": 0.47166556119918823, "memory(GiB)": 74.33, "step": 289, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.2345803842264914, "grad_norm": 3.145052671432495, "learning_rate": 9.099377882640425e-06, "loss": 0.46739423274993896, "memory(GiB)": 74.33, "step": 290, "token_acc": 0.8502202643171806, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.23538928210313448, "grad_norm": 2.3364012241363525, "learning_rate": 9.09170922687543e-06, "loss": 0.4193730354309082, "memory(GiB)": 74.33, "step": 291, "token_acc": 0.8828451882845189, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.23619817997977754, "grad_norm": 2.827242612838745, "learning_rate": 9.08401132093219e-06, "loss": 0.5026365518569946, "memory(GiB)": 74.33, "step": 292, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.23700707785642064, "grad_norm": 3.1282265186309814, "learning_rate": 9.076284219840306e-06, "loss": 0.46792399883270264, "memory(GiB)": 74.33, "step": 293, "token_acc": 0.8814814814814815, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.2378159757330637, "grad_norm": 2.6595497131347656, "learning_rate": 9.068527978838086e-06, "loss": 0.48813870549201965, "memory(GiB)": 74.33, "step": 294, "token_acc": 0.8664122137404581, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.23862487360970677, "grad_norm": 2.2860071659088135, "learning_rate": 9.060742653372143e-06, "loss": 0.4249404966831207, "memory(GiB)": 74.33, "step": 295, "token_acc": 0.815068493150685, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.23943377148634984, "grad_norm": 2.8490703105926514, "learning_rate": 9.052928299097013e-06, "loss": 0.5840834975242615, "memory(GiB)": 74.33, "step": 296, "token_acc": 0.8630705394190872, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24024266936299293, "grad_norm": 2.5748631954193115, "learning_rate": 9.045084971874738e-06, "loss": 0.4933628439903259, "memory(GiB)": 74.33, "step": 297, "token_acc": 0.8488372093023255, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.241051567239636, "grad_norm": 2.2127761840820312, "learning_rate": 9.037212727774486e-06, "loss": 0.47793740034103394, "memory(GiB)": 74.33, "step": 298, "token_acc": 0.8963730569948186, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.24186046511627907, "grad_norm": 2.8014166355133057, "learning_rate": 9.029311623072137e-06, "loss": 0.4578291177749634, "memory(GiB)": 74.33, "step": 299, "token_acc": 0.8131868131868132, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.24266936299292213, "grad_norm": 2.5986998081207275, "learning_rate": 9.021381714249888e-06, "loss": 0.5257298350334167, "memory(GiB)": 74.33, "step": 300, "token_acc": 0.8229166666666666, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.24347826086956523, "grad_norm": 2.7166779041290283, "learning_rate": 9.013423057995845e-06, "loss": 0.5010583400726318, "memory(GiB)": 74.33, "step": 301, "token_acc": 0.8590308370044053, "train_speed(iter/s)": 0.022784 }, { "epoch": 0.2442871587462083, "grad_norm": 2.9347927570343018, "learning_rate": 9.005435711203619e-06, "loss": 0.4537706971168518, "memory(GiB)": 74.33, "step": 302, "token_acc": 0.8659003831417624, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24509605662285136, "grad_norm": 2.4154651165008545, "learning_rate": 8.997419730971917e-06, "loss": 0.39763540029525757, "memory(GiB)": 74.33, "step": 303, "token_acc": 0.8690476190476191, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24590495449949443, "grad_norm": 2.5024564266204834, "learning_rate": 8.989375174604142e-06, "loss": 0.5160707235336304, "memory(GiB)": 74.33, "step": 304, "token_acc": 0.8614718614718615, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24671385237613752, "grad_norm": 2.6469497680664062, "learning_rate": 8.981302099607973e-06, "loss": 0.4616546332836151, "memory(GiB)": 74.33, "step": 305, "token_acc": 0.8442028985507246, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.2475227502527806, "grad_norm": 2.6130266189575195, "learning_rate": 8.973200563694964e-06, "loss": 0.42548537254333496, "memory(GiB)": 74.33, "step": 306, "token_acc": 0.852589641434263, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24833164812942365, "grad_norm": 2.578451156616211, "learning_rate": 8.965070624780117e-06, "loss": 0.48335641622543335, "memory(GiB)": 74.33, "step": 307, "token_acc": 0.846441947565543, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24914054600606672, "grad_norm": 2.4299726486206055, "learning_rate": 8.956912340981485e-06, "loss": 0.4736361801624298, "memory(GiB)": 74.33, "step": 308, "token_acc": 0.8448979591836735, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.24994944388270982, "grad_norm": 2.816293239593506, "learning_rate": 8.948725770619745e-06, "loss": 0.503253698348999, "memory(GiB)": 74.33, "step": 309, "token_acc": 0.8466453674121406, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.25075834175935285, "grad_norm": 2.6718838214874268, "learning_rate": 8.940510972217785e-06, "loss": 0.43048620223999023, "memory(GiB)": 74.33, "step": 310, "token_acc": 0.8262295081967214, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.251567239635996, "grad_norm": 2.4307098388671875, "learning_rate": 8.932268004500288e-06, "loss": 0.51353919506073, "memory(GiB)": 74.33, "step": 311, "token_acc": 0.8412017167381974, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.25237613751263904, "grad_norm": 2.6662516593933105, "learning_rate": 8.923996926393306e-06, "loss": 0.4586646556854248, "memory(GiB)": 74.33, "step": 312, "token_acc": 0.8692579505300353, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2531850353892821, "grad_norm": 3.027970790863037, "learning_rate": 8.915697797023841e-06, "loss": 0.5299907326698303, "memory(GiB)": 74.33, "step": 313, "token_acc": 0.8582089552238806, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2539939332659252, "grad_norm": 2.6045422554016113, "learning_rate": 8.907370675719428e-06, "loss": 0.5199022889137268, "memory(GiB)": 74.33, "step": 314, "token_acc": 0.8116883116883117, "train_speed(iter/s)": 0.022785 }, { "epoch": 0.25480283114256824, "grad_norm": 2.7272956371307373, "learning_rate": 8.899015622007703e-06, "loss": 0.45891785621643066, "memory(GiB)": 74.33, "step": 315, "token_acc": 0.8243243243243243, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2556117290192113, "grad_norm": 2.200077533721924, "learning_rate": 8.890632695615984e-06, "loss": 0.39891767501831055, "memory(GiB)": 74.33, "step": 316, "token_acc": 0.8440677966101695, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2564206268958544, "grad_norm": 2.301032543182373, "learning_rate": 8.882221956470838e-06, "loss": 0.4599316716194153, "memory(GiB)": 74.33, "step": 317, "token_acc": 0.8325358851674641, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2572295247724975, "grad_norm": 2.614656448364258, "learning_rate": 8.873783464697653e-06, "loss": 0.459076464176178, "memory(GiB)": 74.33, "step": 318, "token_acc": 0.8393939393939394, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.25803842264914056, "grad_norm": 2.1406943798065186, "learning_rate": 8.865317280620221e-06, "loss": 0.39890217781066895, "memory(GiB)": 74.33, "step": 319, "token_acc": 0.8304347826086956, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.25884732052578363, "grad_norm": 2.5298852920532227, "learning_rate": 8.856823464760284e-06, "loss": 0.4256265163421631, "memory(GiB)": 74.33, "step": 320, "token_acc": 0.8717948717948718, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2596562184024267, "grad_norm": 2.3466522693634033, "learning_rate": 8.84830207783712e-06, "loss": 0.395018070936203, "memory(GiB)": 74.33, "step": 321, "token_acc": 0.8884462151394422, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.26046511627906976, "grad_norm": 2.6752617359161377, "learning_rate": 8.839753180767108e-06, "loss": 0.4618658423423767, "memory(GiB)": 74.33, "step": 322, "token_acc": 0.8387096774193549, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.26127401415571283, "grad_norm": 2.248332977294922, "learning_rate": 8.831176834663275e-06, "loss": 0.4209662675857544, "memory(GiB)": 74.33, "step": 323, "token_acc": 0.8830645161290323, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2620829120323559, "grad_norm": 2.6968088150024414, "learning_rate": 8.82257310083488e-06, "loss": 0.4762377440929413, "memory(GiB)": 74.33, "step": 324, "token_acc": 0.8810572687224669, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.26289180990899896, "grad_norm": 3.221013307571411, "learning_rate": 8.813942040786964e-06, "loss": 0.5154784917831421, "memory(GiB)": 74.33, "step": 325, "token_acc": 0.8494208494208494, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2637007077856421, "grad_norm": 1.9791827201843262, "learning_rate": 8.805283716219917e-06, "loss": 0.47922518849372864, "memory(GiB)": 74.33, "step": 326, "token_acc": 0.8412698412698413, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.26450960566228515, "grad_norm": 1.939926266670227, "learning_rate": 8.79659818902903e-06, "loss": 0.4087769389152527, "memory(GiB)": 74.33, "step": 327, "token_acc": 0.8360655737704918, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2653185035389282, "grad_norm": 2.3445236682891846, "learning_rate": 8.787885521304056e-06, "loss": 0.49197518825531006, "memory(GiB)": 74.33, "step": 328, "token_acc": 0.8293413173652695, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2661274014155713, "grad_norm": 2.549042224884033, "learning_rate": 8.779145775328766e-06, "loss": 0.4610610604286194, "memory(GiB)": 74.33, "step": 329, "token_acc": 0.8407960199004975, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.26693629929221435, "grad_norm": 7.023351192474365, "learning_rate": 8.770379013580507e-06, "loss": 0.5349440574645996, "memory(GiB)": 74.33, "step": 330, "token_acc": 0.8619246861924686, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2677451971688574, "grad_norm": 3.5521559715270996, "learning_rate": 8.761585298729748e-06, "loss": 0.46497541666030884, "memory(GiB)": 74.33, "step": 331, "token_acc": 0.8870292887029289, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2685540950455005, "grad_norm": 2.684696674346924, "learning_rate": 8.75276469363964e-06, "loss": 0.4779859781265259, "memory(GiB)": 74.33, "step": 332, "token_acc": 0.837696335078534, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2693629929221436, "grad_norm": 2.123192310333252, "learning_rate": 8.743917261365557e-06, "loss": 0.43780291080474854, "memory(GiB)": 74.33, "step": 333, "token_acc": 0.8692307692307693, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2701718907987867, "grad_norm": 2.416212558746338, "learning_rate": 8.73504306515466e-06, "loss": 0.43149372935295105, "memory(GiB)": 74.33, "step": 334, "token_acc": 0.85, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.27098078867542974, "grad_norm": 2.407726764678955, "learning_rate": 8.726142168445427e-06, "loss": 0.46393710374832153, "memory(GiB)": 74.33, "step": 335, "token_acc": 0.8478260869565217, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2717896865520728, "grad_norm": 2.2603883743286133, "learning_rate": 8.717214634867213e-06, "loss": 0.4834635555744171, "memory(GiB)": 74.33, "step": 336, "token_acc": 0.8544303797468354, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.2725985844287159, "grad_norm": 2.377035140991211, "learning_rate": 8.708260528239788e-06, "loss": 0.4176112711429596, "memory(GiB)": 74.33, "step": 337, "token_acc": 0.8802083333333334, "train_speed(iter/s)": 0.022786 }, { "epoch": 0.27340748230535894, "grad_norm": 2.855900526046753, "learning_rate": 8.699279912572888e-06, "loss": 0.4877198338508606, "memory(GiB)": 74.33, "step": 338, "token_acc": 0.8592964824120602, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.274216380182002, "grad_norm": 3.3495020866394043, "learning_rate": 8.690272852065748e-06, "loss": 0.44448497891426086, "memory(GiB)": 74.33, "step": 339, "token_acc": 0.8760683760683761, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2750252780586451, "grad_norm": 2.204909563064575, "learning_rate": 8.68123941110665e-06, "loss": 0.47281521558761597, "memory(GiB)": 74.33, "step": 340, "token_acc": 0.8225108225108225, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2758341759352882, "grad_norm": 2.295105218887329, "learning_rate": 8.67217965427246e-06, "loss": 0.42724931240081787, "memory(GiB)": 74.33, "step": 341, "token_acc": 0.875, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.27664307381193126, "grad_norm": 3.001664876937866, "learning_rate": 8.663093646328166e-06, "loss": 0.5214186310768127, "memory(GiB)": 74.33, "step": 342, "token_acc": 0.8205128205128205, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.27745197168857433, "grad_norm": 2.665395736694336, "learning_rate": 8.653981452226418e-06, "loss": 0.43387383222579956, "memory(GiB)": 74.33, "step": 343, "token_acc": 0.908256880733945, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2782608695652174, "grad_norm": 2.3217623233795166, "learning_rate": 8.644843137107058e-06, "loss": 0.5246144533157349, "memory(GiB)": 74.33, "step": 344, "token_acc": 0.825925925925926, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.27906976744186046, "grad_norm": 2.4558563232421875, "learning_rate": 8.635678766296663e-06, "loss": 0.48798543214797974, "memory(GiB)": 74.33, "step": 345, "token_acc": 0.848780487804878, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.27987866531850353, "grad_norm": 2.1867096424102783, "learning_rate": 8.626488405308067e-06, "loss": 0.5087660551071167, "memory(GiB)": 74.33, "step": 346, "token_acc": 0.8311688311688312, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2806875631951466, "grad_norm": 2.2217187881469727, "learning_rate": 8.617272119839903e-06, "loss": 0.43445640802383423, "memory(GiB)": 74.33, "step": 347, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.28149646107178966, "grad_norm": 2.6297953128814697, "learning_rate": 8.608029975776128e-06, "loss": 0.4504978656768799, "memory(GiB)": 74.33, "step": 348, "token_acc": 0.8523809523809524, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.2823053589484328, "grad_norm": 3.717496156692505, "learning_rate": 8.598762039185553e-06, "loss": 0.45087772607803345, "memory(GiB)": 74.33, "step": 349, "token_acc": 0.8565400843881856, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.28311425682507585, "grad_norm": 2.353040933609009, "learning_rate": 8.589468376321369e-06, "loss": 0.4105454683303833, "memory(GiB)": 74.33, "step": 350, "token_acc": 0.8566775244299675, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.2839231547017189, "grad_norm": 2.3427672386169434, "learning_rate": 8.580149053620674e-06, "loss": 0.5255011320114136, "memory(GiB)": 74.33, "step": 351, "token_acc": 0.8346456692913385, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.284732052578362, "grad_norm": 2.3275554180145264, "learning_rate": 8.570804137704005e-06, "loss": 0.443267822265625, "memory(GiB)": 74.33, "step": 352, "token_acc": 0.8314176245210728, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.28554095045500505, "grad_norm": 2.162351608276367, "learning_rate": 8.561433695374848e-06, "loss": 0.4688035249710083, "memory(GiB)": 74.33, "step": 353, "token_acc": 0.8375451263537906, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2863498483316481, "grad_norm": 2.127072811126709, "learning_rate": 8.552037793619177e-06, "loss": 0.488004207611084, "memory(GiB)": 74.33, "step": 354, "token_acc": 0.8119266055045872, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2871587462082912, "grad_norm": 2.731759786605835, "learning_rate": 8.542616499604958e-06, "loss": 0.4488160312175751, "memory(GiB)": 74.33, "step": 355, "token_acc": 0.8196078431372549, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.2879676440849343, "grad_norm": 2.025136709213257, "learning_rate": 8.533169880681682e-06, "loss": 0.3923991024494171, "memory(GiB)": 74.33, "step": 356, "token_acc": 0.8362989323843416, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.28877654196157737, "grad_norm": 2.501194477081299, "learning_rate": 8.523698004379878e-06, "loss": 0.46766936779022217, "memory(GiB)": 74.33, "step": 357, "token_acc": 0.8260869565217391, "train_speed(iter/s)": 0.022787 }, { "epoch": 0.28958543983822044, "grad_norm": 2.192864179611206, "learning_rate": 8.514200938410628e-06, "loss": 0.48559021949768066, "memory(GiB)": 74.33, "step": 358, "token_acc": 0.86328125, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.2903943377148635, "grad_norm": 2.9228947162628174, "learning_rate": 8.504678750665094e-06, "loss": 0.5047175288200378, "memory(GiB)": 74.33, "step": 359, "token_acc": 0.8647540983606558, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.29120323559150657, "grad_norm": 2.388331174850464, "learning_rate": 8.495131509214015e-06, "loss": 0.4464142620563507, "memory(GiB)": 74.33, "step": 360, "token_acc": 0.8411552346570397, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.29201213346814964, "grad_norm": 3.4440038204193115, "learning_rate": 8.485559282307237e-06, "loss": 0.44610536098480225, "memory(GiB)": 74.33, "step": 361, "token_acc": 0.892018779342723, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.2928210313447927, "grad_norm": 2.4162344932556152, "learning_rate": 8.475962138373212e-06, "loss": 0.43880611658096313, "memory(GiB)": 74.33, "step": 362, "token_acc": 0.8632478632478633, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.29362992922143577, "grad_norm": 2.4398529529571533, "learning_rate": 8.466340146018522e-06, "loss": 0.4168269634246826, "memory(GiB)": 74.33, "step": 363, "token_acc": 0.8543307086614174, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.2944388270980789, "grad_norm": 2.5178182125091553, "learning_rate": 8.456693374027378e-06, "loss": 0.4725669026374817, "memory(GiB)": 74.33, "step": 364, "token_acc": 0.8543689320388349, "train_speed(iter/s)": 0.022788 }, { "epoch": 0.29524772497472196, "grad_norm": 2.5267229080200195, "learning_rate": 8.44702189136113e-06, "loss": 0.5213101506233215, "memory(GiB)": 74.33, "step": 365, "token_acc": 0.839344262295082, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.296056622851365, "grad_norm": 2.3971071243286133, "learning_rate": 8.43732576715778e-06, "loss": 0.4878075122833252, "memory(GiB)": 74.33, "step": 366, "token_acc": 0.8620689655172413, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.2968655207280081, "grad_norm": 3.86580753326416, "learning_rate": 8.427605070731482e-06, "loss": 0.38472825288772583, "memory(GiB)": 74.33, "step": 367, "token_acc": 0.8538461538461538, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.29767441860465116, "grad_norm": 2.5940558910369873, "learning_rate": 8.417859871572045e-06, "loss": 0.5018994808197021, "memory(GiB)": 74.33, "step": 368, "token_acc": 0.8375796178343949, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.2984833164812942, "grad_norm": 2.456550359725952, "learning_rate": 8.408090239344442e-06, "loss": 0.4518444240093231, "memory(GiB)": 74.33, "step": 369, "token_acc": 0.8458149779735683, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.2992922143579373, "grad_norm": 3.4539546966552734, "learning_rate": 8.39829624388831e-06, "loss": 0.4444255828857422, "memory(GiB)": 74.33, "step": 370, "token_acc": 0.8786407766990292, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.30010111223458036, "grad_norm": 2.5049355030059814, "learning_rate": 8.38847795521745e-06, "loss": 0.4359434247016907, "memory(GiB)": 74.33, "step": 371, "token_acc": 0.8415841584158416, "train_speed(iter/s)": 0.022789 }, { "epoch": 0.3009100101112235, "grad_norm": 2.7211098670959473, "learning_rate": 8.378635443519327e-06, "loss": 0.4071110785007477, "memory(GiB)": 74.33, "step": 372, "token_acc": 0.8516949152542372, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.30171890798786655, "grad_norm": 2.0721325874328613, "learning_rate": 8.368768779154564e-06, "loss": 0.449047714471817, "memory(GiB)": 74.33, "step": 373, "token_acc": 0.8604651162790697, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.3025278058645096, "grad_norm": 2.6694495677948, "learning_rate": 8.358878032656446e-06, "loss": 0.436679869890213, "memory(GiB)": 74.33, "step": 374, "token_acc": 0.8672566371681416, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.3033367037411527, "grad_norm": 2.6044750213623047, "learning_rate": 8.348963274730413e-06, "loss": 0.4522557556629181, "memory(GiB)": 74.33, "step": 375, "token_acc": 0.8481675392670157, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.30414560161779575, "grad_norm": 2.2683019638061523, "learning_rate": 8.339024576253555e-06, "loss": 0.3990349769592285, "memory(GiB)": 74.33, "step": 376, "token_acc": 0.8393574297188755, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.3049544994944388, "grad_norm": 2.6098105907440186, "learning_rate": 8.3290620082741e-06, "loss": 0.47003981471061707, "memory(GiB)": 74.33, "step": 377, "token_acc": 0.8828125, "train_speed(iter/s)": 0.02279 }, { "epoch": 0.3057633973710819, "grad_norm": 2.756648540496826, "learning_rate": 8.319075642010914e-06, "loss": 0.46801501512527466, "memory(GiB)": 74.33, "step": 378, "token_acc": 0.8024691358024691, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.306572295247725, "grad_norm": 2.435135841369629, "learning_rate": 8.30906554885299e-06, "loss": 0.45518428087234497, "memory(GiB)": 74.33, "step": 379, "token_acc": 0.86328125, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.30738119312436807, "grad_norm": 2.305549144744873, "learning_rate": 8.299031800358933e-06, "loss": 0.40630266070365906, "memory(GiB)": 74.33, "step": 380, "token_acc": 0.8652173913043478, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.30819009100101113, "grad_norm": 2.8813188076019287, "learning_rate": 8.288974468256453e-06, "loss": 0.5275822877883911, "memory(GiB)": 74.33, "step": 381, "token_acc": 0.8652849740932642, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.3089989888776542, "grad_norm": 2.2883760929107666, "learning_rate": 8.278893624441849e-06, "loss": 0.4657808542251587, "memory(GiB)": 74.33, "step": 382, "token_acc": 0.8081632653061225, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.30980788675429727, "grad_norm": 2.4337222576141357, "learning_rate": 8.268789340979499e-06, "loss": 0.4899158179759979, "memory(GiB)": 74.33, "step": 383, "token_acc": 0.8776371308016878, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.31061678463094033, "grad_norm": 2.359471082687378, "learning_rate": 8.258661690101347e-06, "loss": 0.4913978576660156, "memory(GiB)": 74.33, "step": 384, "token_acc": 0.8454935622317596, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.3114256825075834, "grad_norm": 2.946106433868408, "learning_rate": 8.24851074420637e-06, "loss": 0.3954363167285919, "memory(GiB)": 74.33, "step": 385, "token_acc": 0.876984126984127, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.31223458038422647, "grad_norm": 2.676274299621582, "learning_rate": 8.238336575860085e-06, "loss": 0.4366721212863922, "memory(GiB)": 74.33, "step": 386, "token_acc": 0.8426229508196721, "train_speed(iter/s)": 0.022791 }, { "epoch": 0.3130434782608696, "grad_norm": 2.2800793647766113, "learning_rate": 8.228139257794012e-06, "loss": 0.4242827892303467, "memory(GiB)": 74.33, "step": 387, "token_acc": 0.8724137931034482, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.31385237613751266, "grad_norm": 2.1262009143829346, "learning_rate": 8.217918862905163e-06, "loss": 0.44696488976478577, "memory(GiB)": 74.33, "step": 388, "token_acc": 0.8759398496240601, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3146612740141557, "grad_norm": 2.389130115509033, "learning_rate": 8.207675464255519e-06, "loss": 0.4506322741508484, "memory(GiB)": 74.33, "step": 389, "token_acc": 0.8823529411764706, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3154701718907988, "grad_norm": 2.2962496280670166, "learning_rate": 8.197409135071497e-06, "loss": 0.416850209236145, "memory(GiB)": 74.33, "step": 390, "token_acc": 0.8865248226950354, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.31627906976744186, "grad_norm": 2.0682525634765625, "learning_rate": 8.18711994874345e-06, "loss": 0.423944354057312, "memory(GiB)": 74.33, "step": 391, "token_acc": 0.8411552346570397, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3170879676440849, "grad_norm": 2.43737530708313, "learning_rate": 8.17680797882512e-06, "loss": 0.44383469223976135, "memory(GiB)": 74.33, "step": 392, "token_acc": 0.8977777777777778, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.317896865520728, "grad_norm": 3.0157485008239746, "learning_rate": 8.166473299033122e-06, "loss": 0.4669773280620575, "memory(GiB)": 74.33, "step": 393, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.31870576339737106, "grad_norm": 2.434302568435669, "learning_rate": 8.15611598324642e-06, "loss": 0.46818387508392334, "memory(GiB)": 74.33, "step": 394, "token_acc": 0.7833333333333333, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3195146612740142, "grad_norm": 2.063925266265869, "learning_rate": 8.145736105505788e-06, "loss": 0.45939022302627563, "memory(GiB)": 74.33, "step": 395, "token_acc": 0.8424908424908425, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.32032355915065724, "grad_norm": 2.5207791328430176, "learning_rate": 8.135333740013294e-06, "loss": 0.5139025449752808, "memory(GiB)": 74.33, "step": 396, "token_acc": 0.8441176470588235, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3211324570273003, "grad_norm": 2.687681198120117, "learning_rate": 8.124908961131759e-06, "loss": 0.4349074959754944, "memory(GiB)": 74.33, "step": 397, "token_acc": 0.852017937219731, "train_speed(iter/s)": 0.022792 }, { "epoch": 0.3219413549039434, "grad_norm": 2.1986069679260254, "learning_rate": 8.114461843384229e-06, "loss": 0.4546552300453186, "memory(GiB)": 74.33, "step": 398, "token_acc": 0.8714859437751004, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.32275025278058644, "grad_norm": 2.6796491146087646, "learning_rate": 8.103992461453447e-06, "loss": 0.5386300086975098, "memory(GiB)": 74.33, "step": 399, "token_acc": 0.8553191489361702, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.3235591506572295, "grad_norm": 2.465752363204956, "learning_rate": 8.093500890181307e-06, "loss": 0.4470570683479309, "memory(GiB)": 74.33, "step": 400, "token_acc": 0.8025889967637541, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.3243680485338726, "grad_norm": 2.695773124694824, "learning_rate": 8.082987204568336e-06, "loss": 0.4630998373031616, "memory(GiB)": 74.33, "step": 401, "token_acc": 0.8252788104089219, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.3251769464105157, "grad_norm": 2.6388256549835205, "learning_rate": 8.072451479773143e-06, "loss": 0.47690147161483765, "memory(GiB)": 74.33, "step": 402, "token_acc": 0.8565400843881856, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.32598584428715877, "grad_norm": 2.6586854457855225, "learning_rate": 8.061893791111887e-06, "loss": 0.5046311020851135, "memory(GiB)": 74.33, "step": 403, "token_acc": 0.825, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.32679474216380183, "grad_norm": 2.575148820877075, "learning_rate": 8.05131421405774e-06, "loss": 0.45166927576065063, "memory(GiB)": 74.33, "step": 404, "token_acc": 0.8725490196078431, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3276036400404449, "grad_norm": 2.7520835399627686, "learning_rate": 8.040712824240348e-06, "loss": 0.47704529762268066, "memory(GiB)": 74.33, "step": 405, "token_acc": 0.8539682539682539, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.32841253791708797, "grad_norm": 2.6821768283843994, "learning_rate": 8.030089697445287e-06, "loss": 0.44387978315353394, "memory(GiB)": 74.33, "step": 406, "token_acc": 0.8506224066390041, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.32922143579373103, "grad_norm": 2.5903446674346924, "learning_rate": 8.019444909613524e-06, "loss": 0.47109007835388184, "memory(GiB)": 74.33, "step": 407, "token_acc": 0.8415300546448088, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3300303336703741, "grad_norm": 1.9421981573104858, "learning_rate": 8.00877853684087e-06, "loss": 0.4276235103607178, "memory(GiB)": 74.33, "step": 408, "token_acc": 0.8691588785046729, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.33083923154701717, "grad_norm": 1.9274567365646362, "learning_rate": 7.998090655377441e-06, "loss": 0.4399895668029785, "memory(GiB)": 74.33, "step": 409, "token_acc": 0.8153846153846154, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3316481294236603, "grad_norm": 2.349695920944214, "learning_rate": 7.987381341627116e-06, "loss": 0.4371504485607147, "memory(GiB)": 74.33, "step": 410, "token_acc": 0.8447488584474886, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.33245702730030335, "grad_norm": 2.508023738861084, "learning_rate": 7.976650672146977e-06, "loss": 0.4392384886741638, "memory(GiB)": 74.33, "step": 411, "token_acc": 0.845360824742268, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3332659251769464, "grad_norm": 2.007159948348999, "learning_rate": 7.965898723646777e-06, "loss": 0.42986416816711426, "memory(GiB)": 74.33, "step": 412, "token_acc": 0.8504273504273504, "train_speed(iter/s)": 0.022793 }, { "epoch": 0.3340748230535895, "grad_norm": 2.3318965435028076, "learning_rate": 7.955125572988381e-06, "loss": 0.45020729303359985, "memory(GiB)": 74.33, "step": 413, "token_acc": 0.8546099290780141, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.33488372093023255, "grad_norm": 2.5200366973876953, "learning_rate": 7.944331297185224e-06, "loss": 0.4530584216117859, "memory(GiB)": 74.33, "step": 414, "token_acc": 0.8896103896103896, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3356926188068756, "grad_norm": 2.353825569152832, "learning_rate": 7.933515973401756e-06, "loss": 0.44627559185028076, "memory(GiB)": 74.33, "step": 415, "token_acc": 0.875, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3365015166835187, "grad_norm": 2.2710440158843994, "learning_rate": 7.92267967895289e-06, "loss": 0.4454203248023987, "memory(GiB)": 74.33, "step": 416, "token_acc": 0.8151260504201681, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.33731041456016175, "grad_norm": 2.4699690341949463, "learning_rate": 7.911822491303453e-06, "loss": 0.4395456910133362, "memory(GiB)": 74.33, "step": 417, "token_acc": 0.8617021276595744, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.3381193124368049, "grad_norm": 2.3089406490325928, "learning_rate": 7.90094448806763e-06, "loss": 0.4436686038970947, "memory(GiB)": 74.33, "step": 418, "token_acc": 0.8864468864468864, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.33892821031344794, "grad_norm": 2.105353593826294, "learning_rate": 7.890045747008406e-06, "loss": 0.48908939957618713, "memory(GiB)": 74.33, "step": 419, "token_acc": 0.8593155893536122, "train_speed(iter/s)": 0.022794 }, { "epoch": 0.339737108190091, "grad_norm": 2.435878276824951, "learning_rate": 7.879126346037018e-06, "loss": 0.4750370979309082, "memory(GiB)": 74.33, "step": 420, "token_acc": 0.8844444444444445, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3405460060667341, "grad_norm": 2.587909698486328, "learning_rate": 7.868186363212392e-06, "loss": 0.4756377339363098, "memory(GiB)": 74.33, "step": 421, "token_acc": 0.8487084870848709, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.34135490394337714, "grad_norm": 2.2281887531280518, "learning_rate": 7.857225876740585e-06, "loss": 0.4277176558971405, "memory(GiB)": 74.62, "step": 422, "token_acc": 0.8493150684931506, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3421638018200202, "grad_norm": 2.5752649307250977, "learning_rate": 7.846244964974226e-06, "loss": 0.48055747151374817, "memory(GiB)": 74.62, "step": 423, "token_acc": 0.8837209302325582, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3429726996966633, "grad_norm": 2.586489200592041, "learning_rate": 7.835243706411961e-06, "loss": 0.4750707745552063, "memory(GiB)": 74.62, "step": 424, "token_acc": 0.8576642335766423, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3437815975733064, "grad_norm": 2.450918674468994, "learning_rate": 7.824222179697884e-06, "loss": 0.5177239179611206, "memory(GiB)": 74.62, "step": 425, "token_acc": 0.852589641434263, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.34459049544994946, "grad_norm": 2.3722708225250244, "learning_rate": 7.813180463620987e-06, "loss": 0.46518608927726746, "memory(GiB)": 74.62, "step": 426, "token_acc": 0.8423645320197044, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.34539939332659253, "grad_norm": 2.5841665267944336, "learning_rate": 7.802118637114575e-06, "loss": 0.4838918149471283, "memory(GiB)": 74.62, "step": 427, "token_acc": 0.8434782608695652, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3462082912032356, "grad_norm": 2.3192875385284424, "learning_rate": 7.791036779255726e-06, "loss": 0.42157137393951416, "memory(GiB)": 74.62, "step": 428, "token_acc": 0.8404669260700389, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.34701718907987866, "grad_norm": 2.49680495262146, "learning_rate": 7.779934969264714e-06, "loss": 0.4023537039756775, "memory(GiB)": 74.62, "step": 429, "token_acc": 0.8734939759036144, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.34782608695652173, "grad_norm": 2.0230259895324707, "learning_rate": 7.768813286504439e-06, "loss": 0.37253955006599426, "memory(GiB)": 74.62, "step": 430, "token_acc": 0.9224806201550387, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3486349848331648, "grad_norm": 2.3140506744384766, "learning_rate": 7.757671810479865e-06, "loss": 0.4874904751777649, "memory(GiB)": 74.62, "step": 431, "token_acc": 0.8592057761732852, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.34944388270980786, "grad_norm": 2.2125346660614014, "learning_rate": 7.74651062083746e-06, "loss": 0.37930744886398315, "memory(GiB)": 74.62, "step": 432, "token_acc": 0.8764940239043825, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.350252780586451, "grad_norm": 2.240590810775757, "learning_rate": 7.735329797364605e-06, "loss": 0.47669389843940735, "memory(GiB)": 74.62, "step": 433, "token_acc": 0.8710801393728222, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.35106167846309405, "grad_norm": 2.510114908218384, "learning_rate": 7.724129419989044e-06, "loss": 0.4742322266101837, "memory(GiB)": 74.62, "step": 434, "token_acc": 0.8536585365853658, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3518705763397371, "grad_norm": 2.476958990097046, "learning_rate": 7.712909568778302e-06, "loss": 0.4492417871952057, "memory(GiB)": 74.62, "step": 435, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022795 }, { "epoch": 0.3526794742163802, "grad_norm": 2.098637104034424, "learning_rate": 7.701670323939117e-06, "loss": 0.4481479525566101, "memory(GiB)": 74.62, "step": 436, "token_acc": 0.8601398601398601, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.35348837209302325, "grad_norm": 2.2469687461853027, "learning_rate": 7.690411765816864e-06, "loss": 0.43956851959228516, "memory(GiB)": 74.62, "step": 437, "token_acc": 0.8629032258064516, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3542972699696663, "grad_norm": 2.8738715648651123, "learning_rate": 7.679133974894984e-06, "loss": 0.4626030921936035, "memory(GiB)": 74.62, "step": 438, "token_acc": 0.8680851063829788, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3551061678463094, "grad_norm": 2.638291358947754, "learning_rate": 7.667837031794404e-06, "loss": 0.45615172386169434, "memory(GiB)": 74.62, "step": 439, "token_acc": 0.8088235294117647, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.35591506572295245, "grad_norm": 2.2586326599121094, "learning_rate": 7.656521017272965e-06, "loss": 0.4124460816383362, "memory(GiB)": 74.62, "step": 440, "token_acc": 0.8611111111111112, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3567239635995956, "grad_norm": 2.374500274658203, "learning_rate": 7.64518601222484e-06, "loss": 0.4275168180465698, "memory(GiB)": 74.62, "step": 441, "token_acc": 0.8487084870848709, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.35753286147623864, "grad_norm": 1.9997868537902832, "learning_rate": 7.633832097679959e-06, "loss": 0.3909873068332672, "memory(GiB)": 74.62, "step": 442, "token_acc": 0.8868613138686131, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3583417593528817, "grad_norm": 4.926924705505371, "learning_rate": 7.622459354803435e-06, "loss": 0.43666255474090576, "memory(GiB)": 74.62, "step": 443, "token_acc": 0.8704453441295547, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.3591506572295248, "grad_norm": 2.317330837249756, "learning_rate": 7.611067864894972e-06, "loss": 0.44106507301330566, "memory(GiB)": 74.62, "step": 444, "token_acc": 0.8059701492537313, "train_speed(iter/s)": 0.022796 }, { "epoch": 0.35995955510616784, "grad_norm": 2.5835938453674316, "learning_rate": 7.599657709388292e-06, "loss": 0.46531200408935547, "memory(GiB)": 74.62, "step": 445, "token_acc": 0.7931034482758621, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3607684529828109, "grad_norm": 2.8004226684570312, "learning_rate": 7.58822896985055e-06, "loss": 0.5187166333198547, "memory(GiB)": 74.62, "step": 446, "token_acc": 0.8099173553719008, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.36157735085945397, "grad_norm": 2.7265071868896484, "learning_rate": 7.5767817279817505e-06, "loss": 0.47425639629364014, "memory(GiB)": 74.62, "step": 447, "token_acc": 0.8085106382978723, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3623862487360971, "grad_norm": 2.1328177452087402, "learning_rate": 7.565316065614168e-06, "loss": 0.4435673952102661, "memory(GiB)": 74.62, "step": 448, "token_acc": 0.8631578947368421, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.36319514661274016, "grad_norm": 2.4672372341156006, "learning_rate": 7.5538320647117565e-06, "loss": 0.41679224371910095, "memory(GiB)": 74.62, "step": 449, "token_acc": 0.8908296943231441, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3640040444893832, "grad_norm": 2.6723108291625977, "learning_rate": 7.542329807369566e-06, "loss": 0.5179734826087952, "memory(GiB)": 74.62, "step": 450, "token_acc": 0.7644444444444445, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3648129423660263, "grad_norm": 3.7509987354278564, "learning_rate": 7.530809375813155e-06, "loss": 0.4264351725578308, "memory(GiB)": 74.62, "step": 451, "token_acc": 0.9, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.36562184024266936, "grad_norm": 1.9851875305175781, "learning_rate": 7.519270852398002e-06, "loss": 0.4789334535598755, "memory(GiB)": 74.62, "step": 452, "token_acc": 0.8250950570342205, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3664307381193124, "grad_norm": 2.21183705329895, "learning_rate": 7.507714319608922e-06, "loss": 0.36344388127326965, "memory(GiB)": 74.62, "step": 453, "token_acc": 0.8487394957983193, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3672396359959555, "grad_norm": 1.613560676574707, "learning_rate": 7.496139860059468e-06, "loss": 0.4224799871444702, "memory(GiB)": 74.62, "step": 454, "token_acc": 0.8813056379821959, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.36804853387259856, "grad_norm": 2.4515528678894043, "learning_rate": 7.484547556491346e-06, "loss": 0.4368416368961334, "memory(GiB)": 74.62, "step": 455, "token_acc": 0.8559322033898306, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3688574317492417, "grad_norm": 2.2103137969970703, "learning_rate": 7.472937491773824e-06, "loss": 0.3967626392841339, "memory(GiB)": 74.62, "step": 456, "token_acc": 0.8217821782178217, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.36966632962588475, "grad_norm": 2.522752046585083, "learning_rate": 7.461309748903138e-06, "loss": 0.45169344544410706, "memory(GiB)": 74.62, "step": 457, "token_acc": 0.8535714285714285, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3704752275025278, "grad_norm": 3.0310842990875244, "learning_rate": 7.449664411001898e-06, "loss": 0.37837380170822144, "memory(GiB)": 74.62, "step": 458, "token_acc": 0.9108527131782945, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3712841253791709, "grad_norm": 2.2086234092712402, "learning_rate": 7.438001561318494e-06, "loss": 0.44610685110092163, "memory(GiB)": 74.62, "step": 459, "token_acc": 0.870722433460076, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37209302325581395, "grad_norm": 2.4862678050994873, "learning_rate": 7.426321283226504e-06, "loss": 0.4015771746635437, "memory(GiB)": 74.62, "step": 460, "token_acc": 0.8907563025210085, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.372901921132457, "grad_norm": 2.0166738033294678, "learning_rate": 7.4146236602240936e-06, "loss": 0.4152040481567383, "memory(GiB)": 74.62, "step": 461, "token_acc": 0.9248826291079812, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3737108190091001, "grad_norm": 2.448951005935669, "learning_rate": 7.402908775933419e-06, "loss": 0.5621334910392761, "memory(GiB)": 74.62, "step": 462, "token_acc": 0.8628318584070797, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37451971688574315, "grad_norm": 2.186652183532715, "learning_rate": 7.391176714100038e-06, "loss": 0.4613068699836731, "memory(GiB)": 74.62, "step": 463, "token_acc": 0.8188679245283019, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37532861476238627, "grad_norm": 2.2740073204040527, "learning_rate": 7.379427558592296e-06, "loss": 0.4919006824493408, "memory(GiB)": 74.62, "step": 464, "token_acc": 0.8471760797342193, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37613751263902934, "grad_norm": 2.158538579940796, "learning_rate": 7.36766139340074e-06, "loss": 0.42273247241973877, "memory(GiB)": 74.62, "step": 465, "token_acc": 0.8622047244094488, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3769464105156724, "grad_norm": 3.0366506576538086, "learning_rate": 7.3558783026375156e-06, "loss": 0.5097289085388184, "memory(GiB)": 74.62, "step": 466, "token_acc": 0.9178082191780822, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37775530839231547, "grad_norm": 2.2849361896514893, "learning_rate": 7.344078370535757e-06, "loss": 0.5165024995803833, "memory(GiB)": 74.62, "step": 467, "token_acc": 0.8006430868167203, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.37856420626895854, "grad_norm": 1.753194808959961, "learning_rate": 7.3322616814489955e-06, "loss": 0.4367058277130127, "memory(GiB)": 74.62, "step": 468, "token_acc": 0.8678571428571429, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3793731041456016, "grad_norm": 1.9058223962783813, "learning_rate": 7.32042831985055e-06, "loss": 0.41317999362945557, "memory(GiB)": 74.62, "step": 469, "token_acc": 0.8257261410788381, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.38018200202224467, "grad_norm": 2.459209680557251, "learning_rate": 7.308578370332926e-06, "loss": 0.3700507581233978, "memory(GiB)": 74.62, "step": 470, "token_acc": 0.8687943262411347, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3809908998988878, "grad_norm": 1.8641716241836548, "learning_rate": 7.296711917607211e-06, "loss": 0.40189939737319946, "memory(GiB)": 74.62, "step": 471, "token_acc": 0.8717948717948718, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.38179979777553086, "grad_norm": 2.2401087284088135, "learning_rate": 7.284829046502467e-06, "loss": 0.4430382251739502, "memory(GiB)": 74.62, "step": 472, "token_acc": 0.8419243986254296, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3826086956521739, "grad_norm": 2.416550636291504, "learning_rate": 7.272929841965126e-06, "loss": 0.4755879342556, "memory(GiB)": 74.62, "step": 473, "token_acc": 0.8486238532110092, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.383417593528817, "grad_norm": 2.260345935821533, "learning_rate": 7.261014389058383e-06, "loss": 0.44997456669807434, "memory(GiB)": 74.62, "step": 474, "token_acc": 0.7671957671957672, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.38422649140546006, "grad_norm": 2.261056661605835, "learning_rate": 7.2490827729615835e-06, "loss": 0.47697365283966064, "memory(GiB)": 74.62, "step": 475, "token_acc": 0.8628318584070797, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3850353892821031, "grad_norm": 2.013577461242676, "learning_rate": 7.237135078969618e-06, "loss": 0.3827347159385681, "memory(GiB)": 74.62, "step": 476, "token_acc": 0.8478964401294499, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3858442871587462, "grad_norm": 2.1973073482513428, "learning_rate": 7.225171392492316e-06, "loss": 0.40540656447410583, "memory(GiB)": 74.62, "step": 477, "token_acc": 0.863013698630137, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.38665318503538926, "grad_norm": 2.2481391429901123, "learning_rate": 7.213191799053832e-06, "loss": 0.4136468172073364, "memory(GiB)": 74.62, "step": 478, "token_acc": 0.8339100346020761, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3874620829120324, "grad_norm": 2.1501901149749756, "learning_rate": 7.201196384292027e-06, "loss": 0.4204309284687042, "memory(GiB)": 74.62, "step": 479, "token_acc": 0.8870967741935484, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.38827098078867545, "grad_norm": 2.1305158138275146, "learning_rate": 7.189185233957868e-06, "loss": 0.4197065830230713, "memory(GiB)": 74.62, "step": 480, "token_acc": 0.8160919540229885, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3890798786653185, "grad_norm": 2.526954174041748, "learning_rate": 7.177158433914811e-06, "loss": 0.4064275622367859, "memory(GiB)": 74.62, "step": 481, "token_acc": 0.8907103825136612, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3898887765419616, "grad_norm": 3.277456283569336, "learning_rate": 7.165116070138183e-06, "loss": 0.46176213026046753, "memory(GiB)": 74.62, "step": 482, "token_acc": 0.834983498349835, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39069767441860465, "grad_norm": 2.337390184402466, "learning_rate": 7.153058228714573e-06, "loss": 0.3911609649658203, "memory(GiB)": 74.62, "step": 483, "token_acc": 0.8909952606635071, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3915065722952477, "grad_norm": 2.273653745651245, "learning_rate": 7.140984995841214e-06, "loss": 0.43842604756355286, "memory(GiB)": 74.62, "step": 484, "token_acc": 0.844559585492228, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3923154701718908, "grad_norm": 2.842496395111084, "learning_rate": 7.128896457825364e-06, "loss": 0.41556501388549805, "memory(GiB)": 74.62, "step": 485, "token_acc": 0.8459016393442623, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39312436804853385, "grad_norm": 2.3521416187286377, "learning_rate": 7.116792701083697e-06, "loss": 0.4312630891799927, "memory(GiB)": 74.62, "step": 486, "token_acc": 0.8566433566433567, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39393326592517697, "grad_norm": 2.2411739826202393, "learning_rate": 7.104673812141676e-06, "loss": 0.4646815359592438, "memory(GiB)": 74.62, "step": 487, "token_acc": 0.8078602620087336, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39474216380182003, "grad_norm": 2.26692533493042, "learning_rate": 7.09253987763294e-06, "loss": 0.41715699434280396, "memory(GiB)": 74.62, "step": 488, "token_acc": 0.8636363636363636, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3955510616784631, "grad_norm": 2.127204179763794, "learning_rate": 7.080390984298686e-06, "loss": 0.39702218770980835, "memory(GiB)": 74.62, "step": 489, "token_acc": 0.8631578947368421, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39635995955510617, "grad_norm": 1.905442476272583, "learning_rate": 7.068227218987043e-06, "loss": 0.3825928568840027, "memory(GiB)": 74.62, "step": 490, "token_acc": 0.8986784140969163, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39716885743174923, "grad_norm": 1.9447747468948364, "learning_rate": 7.056048668652454e-06, "loss": 0.45161956548690796, "memory(GiB)": 74.62, "step": 491, "token_acc": 0.8728813559322034, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3979777553083923, "grad_norm": 2.295433282852173, "learning_rate": 7.04385542035506e-06, "loss": 0.41795414686203003, "memory(GiB)": 74.62, "step": 492, "token_acc": 0.8817733990147784, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.39878665318503537, "grad_norm": 2.265631675720215, "learning_rate": 7.031647561260065e-06, "loss": 0.4432828426361084, "memory(GiB)": 74.62, "step": 493, "token_acc": 0.7985865724381626, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.3995955510616785, "grad_norm": 2.9621176719665527, "learning_rate": 7.019425178637127e-06, "loss": 0.44883739948272705, "memory(GiB)": 74.62, "step": 494, "token_acc": 0.9203539823008849, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.40040444893832156, "grad_norm": 2.9266443252563477, "learning_rate": 7.007188359859727e-06, "loss": 0.48823320865631104, "memory(GiB)": 74.62, "step": 495, "token_acc": 0.8736842105263158, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.4012133468149646, "grad_norm": 3.5501937866210938, "learning_rate": 6.994937192404539e-06, "loss": 0.41887539625167847, "memory(GiB)": 74.62, "step": 496, "token_acc": 0.8600823045267489, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.4020222446916077, "grad_norm": 2.9611189365386963, "learning_rate": 6.982671763850814e-06, "loss": 0.460665225982666, "memory(GiB)": 74.62, "step": 497, "token_acc": 0.8066037735849056, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.40283114256825076, "grad_norm": 2.5562634468078613, "learning_rate": 6.9703921618797556e-06, "loss": 0.42445844411849976, "memory(GiB)": 74.62, "step": 498, "token_acc": 0.8809523809523809, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.4036400404448938, "grad_norm": 2.2612838745117188, "learning_rate": 6.95809847427388e-06, "loss": 0.4139663577079773, "memory(GiB)": 74.62, "step": 499, "token_acc": 0.8612244897959184, "train_speed(iter/s)": 0.022797 }, { "epoch": 0.4044489383215369, "grad_norm": 2.0981252193450928, "learning_rate": 6.945790788916402e-06, "loss": 0.4424452781677246, "memory(GiB)": 74.62, "step": 500, "token_acc": 0.8401486988847584, "train_speed(iter/s)": 0.022798 }, { "epoch": 0.4044489383215369, "eval_loss": 0.42885029315948486, "eval_runtime": 431.8839, "eval_samples_per_second": 3.7, "eval_steps_per_second": 0.116, "eval_token_acc": 0.8577324229008779, "step": 500 }, { "epoch": 0.40525783619817995, "grad_norm": 2.1216652393341064, "learning_rate": 6.9334691937905995e-06, "loss": 0.4369218349456787, "memory(GiB)": 74.62, "step": 501, "token_acc": 0.8514335360556038, "train_speed(iter/s)": 0.022355 }, { "epoch": 0.4060667340748231, "grad_norm": 2.564833641052246, "learning_rate": 6.921133776979186e-06, "loss": 0.4658987820148468, "memory(GiB)": 74.62, "step": 502, "token_acc": 0.8582089552238806, "train_speed(iter/s)": 0.022356 }, { "epoch": 0.40687563195146614, "grad_norm": 1.8351505994796753, "learning_rate": 6.908784626663681e-06, "loss": 0.4119420647621155, "memory(GiB)": 74.62, "step": 503, "token_acc": 0.8387096774193549, "train_speed(iter/s)": 0.022357 }, { "epoch": 0.4076845298281092, "grad_norm": 2.2373807430267334, "learning_rate": 6.896421831123783e-06, "loss": 0.45484626293182373, "memory(GiB)": 74.62, "step": 504, "token_acc": 0.8540772532188842, "train_speed(iter/s)": 0.022358 }, { "epoch": 0.4084934277047523, "grad_norm": 2.1204137802124023, "learning_rate": 6.884045478736732e-06, "loss": 0.3930210471153259, "memory(GiB)": 74.62, "step": 505, "token_acc": 0.9181034482758621, "train_speed(iter/s)": 0.022359 }, { "epoch": 0.40930232558139534, "grad_norm": 2.195955276489258, "learning_rate": 6.871655657976682e-06, "loss": 0.4383777976036072, "memory(GiB)": 74.62, "step": 506, "token_acc": 0.8703703703703703, "train_speed(iter/s)": 0.022359 }, { "epoch": 0.4101112234580384, "grad_norm": 2.449862241744995, "learning_rate": 6.859252457414067e-06, "loss": 0.5421361923217773, "memory(GiB)": 74.62, "step": 507, "token_acc": 0.8745247148288974, "train_speed(iter/s)": 0.02236 }, { "epoch": 0.4109201213346815, "grad_norm": 2.8813657760620117, "learning_rate": 6.8468359657149705e-06, "loss": 0.3448445498943329, "memory(GiB)": 74.62, "step": 508, "token_acc": 0.8831168831168831, "train_speed(iter/s)": 0.022361 }, { "epoch": 0.41172901921132454, "grad_norm": 2.2587554454803467, "learning_rate": 6.834406271640488e-06, "loss": 0.40410223603248596, "memory(GiB)": 74.62, "step": 509, "token_acc": 0.9575289575289575, "train_speed(iter/s)": 0.022362 }, { "epoch": 0.41253791708796766, "grad_norm": 2.2055654525756836, "learning_rate": 6.821963464046096e-06, "loss": 0.4498205780982971, "memory(GiB)": 74.62, "step": 510, "token_acc": 0.8311258278145696, "train_speed(iter/s)": 0.022363 }, { "epoch": 0.41334681496461073, "grad_norm": 2.171542167663574, "learning_rate": 6.809507631881014e-06, "loss": 0.4186447858810425, "memory(GiB)": 74.62, "step": 511, "token_acc": 0.8443708609271523, "train_speed(iter/s)": 0.022364 }, { "epoch": 0.4141557128412538, "grad_norm": 2.509507417678833, "learning_rate": 6.797038864187564e-06, "loss": 0.4081672728061676, "memory(GiB)": 74.62, "step": 512, "token_acc": 0.8518518518518519, "train_speed(iter/s)": 0.022364 }, { "epoch": 0.41496461071789686, "grad_norm": 2.3102705478668213, "learning_rate": 6.78455725010055e-06, "loss": 0.4792659878730774, "memory(GiB)": 74.62, "step": 513, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022365 }, { "epoch": 0.41577350859453993, "grad_norm": 2.7244982719421387, "learning_rate": 6.772062878846604e-06, "loss": 0.41006016731262207, "memory(GiB)": 74.62, "step": 514, "token_acc": 0.8380952380952381, "train_speed(iter/s)": 0.022366 }, { "epoch": 0.416582406471183, "grad_norm": 2.3030154705047607, "learning_rate": 6.75955583974355e-06, "loss": 0.4155740737915039, "memory(GiB)": 74.62, "step": 515, "token_acc": 0.8803418803418803, "train_speed(iter/s)": 0.022367 }, { "epoch": 0.41739130434782606, "grad_norm": 3.1387264728546143, "learning_rate": 6.747036222199783e-06, "loss": 0.4403674602508545, "memory(GiB)": 74.62, "step": 516, "token_acc": 0.8486486486486486, "train_speed(iter/s)": 0.022368 }, { "epoch": 0.4182002022244692, "grad_norm": 2.3326053619384766, "learning_rate": 6.7345041157136035e-06, "loss": 0.5110398530960083, "memory(GiB)": 74.62, "step": 517, "token_acc": 0.8466453674121406, "train_speed(iter/s)": 0.022369 }, { "epoch": 0.41900910010111225, "grad_norm": 1.981326699256897, "learning_rate": 6.7219596098725995e-06, "loss": 0.3945692181587219, "memory(GiB)": 74.62, "step": 518, "token_acc": 0.8426395939086294, "train_speed(iter/s)": 0.02237 }, { "epoch": 0.4198179979777553, "grad_norm": 2.0242714881896973, "learning_rate": 6.709402794352993e-06, "loss": 0.3980899155139923, "memory(GiB)": 74.62, "step": 519, "token_acc": 0.8425925925925926, "train_speed(iter/s)": 0.022371 }, { "epoch": 0.4206268958543984, "grad_norm": 2.2979252338409424, "learning_rate": 6.696833758919006e-06, "loss": 0.4187348484992981, "memory(GiB)": 74.62, "step": 520, "token_acc": 0.9004329004329005, "train_speed(iter/s)": 0.022372 }, { "epoch": 0.42143579373104145, "grad_norm": 2.154912233352661, "learning_rate": 6.684252593422214e-06, "loss": 0.4182782471179962, "memory(GiB)": 74.62, "step": 521, "token_acc": 0.896414342629482, "train_speed(iter/s)": 0.022372 }, { "epoch": 0.4222446916076845, "grad_norm": 2.3540515899658203, "learning_rate": 6.67165938780091e-06, "loss": 0.41942286491394043, "memory(GiB)": 74.62, "step": 522, "token_acc": 0.7923728813559322, "train_speed(iter/s)": 0.022373 }, { "epoch": 0.4230535894843276, "grad_norm": 2.746999502182007, "learning_rate": 6.659054232079454e-06, "loss": 0.48690980672836304, "memory(GiB)": 74.62, "step": 523, "token_acc": 0.8956521739130435, "train_speed(iter/s)": 0.022374 }, { "epoch": 0.42386248736097065, "grad_norm": 2.6656594276428223, "learning_rate": 6.646437216367634e-06, "loss": 0.41001442074775696, "memory(GiB)": 74.62, "step": 524, "token_acc": 0.871244635193133, "train_speed(iter/s)": 0.022375 }, { "epoch": 0.4246713852376138, "grad_norm": 3.287884473800659, "learning_rate": 6.633808430860021e-06, "loss": 0.3976552486419678, "memory(GiB)": 74.62, "step": 525, "token_acc": 0.8932584269662921, "train_speed(iter/s)": 0.022376 }, { "epoch": 0.42548028311425684, "grad_norm": 1.8821219205856323, "learning_rate": 6.6211679658353235e-06, "loss": 0.40812772512435913, "memory(GiB)": 74.62, "step": 526, "token_acc": 0.8380281690140845, "train_speed(iter/s)": 0.022377 }, { "epoch": 0.4262891809908999, "grad_norm": 2.2975385189056396, "learning_rate": 6.608515911655744e-06, "loss": 0.4923143982887268, "memory(GiB)": 74.62, "step": 527, "token_acc": 0.8621908127208481, "train_speed(iter/s)": 0.022378 }, { "epoch": 0.427098078867543, "grad_norm": 2.0141286849975586, "learning_rate": 6.595852358766334e-06, "loss": 0.42522329092025757, "memory(GiB)": 74.62, "step": 528, "token_acc": 0.8579234972677595, "train_speed(iter/s)": 0.022379 }, { "epoch": 0.42790697674418604, "grad_norm": 2.7446937561035156, "learning_rate": 6.583177397694338e-06, "loss": 0.4497550129890442, "memory(GiB)": 74.62, "step": 529, "token_acc": 0.8915094339622641, "train_speed(iter/s)": 0.022379 }, { "epoch": 0.4287158746208291, "grad_norm": 2.207721710205078, "learning_rate": 6.570491119048558e-06, "loss": 0.48890426754951477, "memory(GiB)": 74.62, "step": 530, "token_acc": 0.8237082066869301, "train_speed(iter/s)": 0.02238 }, { "epoch": 0.4295247724974722, "grad_norm": 1.9948323965072632, "learning_rate": 6.557793613518704e-06, "loss": 0.39835628867149353, "memory(GiB)": 74.62, "step": 531, "token_acc": 0.8313953488372093, "train_speed(iter/s)": 0.022381 }, { "epoch": 0.43033367037411524, "grad_norm": 2.0337955951690674, "learning_rate": 6.545084971874738e-06, "loss": 0.4067310094833374, "memory(GiB)": 74.62, "step": 532, "token_acc": 0.8481848184818482, "train_speed(iter/s)": 0.022382 }, { "epoch": 0.43114256825075836, "grad_norm": 1.673884630203247, "learning_rate": 6.5323652849662335e-06, "loss": 0.4390275478363037, "memory(GiB)": 74.62, "step": 533, "token_acc": 0.7947976878612717, "train_speed(iter/s)": 0.022382 }, { "epoch": 0.43195146612740143, "grad_norm": 2.2995364665985107, "learning_rate": 6.519634643721721e-06, "loss": 0.40432244539260864, "memory(GiB)": 74.62, "step": 534, "token_acc": 0.8676470588235294, "train_speed(iter/s)": 0.022383 }, { "epoch": 0.4327603640040445, "grad_norm": 2.3338489532470703, "learning_rate": 6.50689313914804e-06, "loss": 0.4244130849838257, "memory(GiB)": 74.62, "step": 535, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022384 }, { "epoch": 0.43356926188068756, "grad_norm": 3.962207078933716, "learning_rate": 6.494140862329688e-06, "loss": 0.43396979570388794, "memory(GiB)": 74.62, "step": 536, "token_acc": 0.8958333333333334, "train_speed(iter/s)": 0.022385 }, { "epoch": 0.43437815975733063, "grad_norm": 2.2048377990722656, "learning_rate": 6.481377904428171e-06, "loss": 0.4214767813682556, "memory(GiB)": 74.62, "step": 537, "token_acc": 0.92, "train_speed(iter/s)": 0.022385 }, { "epoch": 0.4351870576339737, "grad_norm": 2.1275532245635986, "learning_rate": 6.468604356681347e-06, "loss": 0.47981610894203186, "memory(GiB)": 74.62, "step": 538, "token_acc": 0.8615384615384616, "train_speed(iter/s)": 0.022386 }, { "epoch": 0.43599595551061676, "grad_norm": 2.525294542312622, "learning_rate": 6.4558203104027805e-06, "loss": 0.3834857940673828, "memory(GiB)": 74.62, "step": 539, "token_acc": 0.8962264150943396, "train_speed(iter/s)": 0.022387 }, { "epoch": 0.4368048533872599, "grad_norm": 1.9019864797592163, "learning_rate": 6.443025856981086e-06, "loss": 0.4347085952758789, "memory(GiB)": 74.62, "step": 540, "token_acc": 0.8483606557377049, "train_speed(iter/s)": 0.022388 }, { "epoch": 0.43761375126390295, "grad_norm": 2.1029298305511475, "learning_rate": 6.430221087879272e-06, "loss": 0.3873569071292877, "memory(GiB)": 74.62, "step": 541, "token_acc": 0.8458149779735683, "train_speed(iter/s)": 0.022388 }, { "epoch": 0.438422649140546, "grad_norm": 2.2039341926574707, "learning_rate": 6.41740609463409e-06, "loss": 0.41179242730140686, "memory(GiB)": 74.62, "step": 542, "token_acc": 0.8433179723502304, "train_speed(iter/s)": 0.022389 }, { "epoch": 0.4392315470171891, "grad_norm": 1.985140085220337, "learning_rate": 6.404580968855385e-06, "loss": 0.3754437565803528, "memory(GiB)": 74.62, "step": 543, "token_acc": 0.8695652173913043, "train_speed(iter/s)": 0.02239 }, { "epoch": 0.44004044489383215, "grad_norm": 2.1291117668151855, "learning_rate": 6.3917458022254345e-06, "loss": 0.382461816072464, "memory(GiB)": 74.62, "step": 544, "token_acc": 0.8463768115942029, "train_speed(iter/s)": 0.022391 }, { "epoch": 0.4408493427704752, "grad_norm": 2.164369583129883, "learning_rate": 6.3789006864982885e-06, "loss": 0.41792726516723633, "memory(GiB)": 74.62, "step": 545, "token_acc": 0.8883495145631068, "train_speed(iter/s)": 0.022391 }, { "epoch": 0.4416582406471183, "grad_norm": 2.030388355255127, "learning_rate": 6.366045713499129e-06, "loss": 0.42167988419532776, "memory(GiB)": 74.62, "step": 546, "token_acc": 0.8613445378151261, "train_speed(iter/s)": 0.022392 }, { "epoch": 0.44246713852376135, "grad_norm": 1.9591219425201416, "learning_rate": 6.353180975123595e-06, "loss": 0.3823608458042145, "memory(GiB)": 74.62, "step": 547, "token_acc": 0.8422818791946308, "train_speed(iter/s)": 0.022393 }, { "epoch": 0.44327603640040447, "grad_norm": 2.547567367553711, "learning_rate": 6.340306563337142e-06, "loss": 0.4388830363750458, "memory(GiB)": 74.62, "step": 548, "token_acc": 0.8425925925925926, "train_speed(iter/s)": 0.022393 }, { "epoch": 0.44408493427704754, "grad_norm": 2.0034782886505127, "learning_rate": 6.327422570174373e-06, "loss": 0.3995330035686493, "memory(GiB)": 74.62, "step": 549, "token_acc": 0.8996138996138996, "train_speed(iter/s)": 0.022394 }, { "epoch": 0.4448938321536906, "grad_norm": 2.489525079727173, "learning_rate": 6.314529087738387e-06, "loss": 0.4121745824813843, "memory(GiB)": 74.62, "step": 550, "token_acc": 0.842741935483871, "train_speed(iter/s)": 0.022395 }, { "epoch": 0.44570273003033367, "grad_norm": 2.647597551345825, "learning_rate": 6.301626208200116e-06, "loss": 0.4198951721191406, "memory(GiB)": 74.62, "step": 551, "token_acc": 0.8409090909090909, "train_speed(iter/s)": 0.022395 }, { "epoch": 0.44651162790697674, "grad_norm": 3.1573736667633057, "learning_rate": 6.2887140237976714e-06, "loss": 0.36342883110046387, "memory(GiB)": 74.62, "step": 552, "token_acc": 0.8653846153846154, "train_speed(iter/s)": 0.022396 }, { "epoch": 0.4473205257836198, "grad_norm": 2.4319777488708496, "learning_rate": 6.27579262683568e-06, "loss": 0.4457288086414337, "memory(GiB)": 74.62, "step": 553, "token_acc": 0.842741935483871, "train_speed(iter/s)": 0.022397 }, { "epoch": 0.44812942366026287, "grad_norm": 2.0444133281707764, "learning_rate": 6.2628621096846265e-06, "loss": 0.3989095091819763, "memory(GiB)": 74.62, "step": 554, "token_acc": 0.8648648648648649, "train_speed(iter/s)": 0.022398 }, { "epoch": 0.448938321536906, "grad_norm": 2.0919275283813477, "learning_rate": 6.249922564780193e-06, "loss": 0.4167803227901459, "memory(GiB)": 74.62, "step": 555, "token_acc": 0.8681318681318682, "train_speed(iter/s)": 0.022398 }, { "epoch": 0.44974721941354906, "grad_norm": 2.3367862701416016, "learning_rate": 6.236974084622598e-06, "loss": 0.43416649103164673, "memory(GiB)": 74.62, "step": 556, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022399 }, { "epoch": 0.4505561172901921, "grad_norm": 2.491732597351074, "learning_rate": 6.224016761775933e-06, "loss": 0.451057493686676, "memory(GiB)": 74.62, "step": 557, "token_acc": 0.8170347003154574, "train_speed(iter/s)": 0.0224 }, { "epoch": 0.4513650151668352, "grad_norm": 2.020247459411621, "learning_rate": 6.211050688867504e-06, "loss": 0.4087960422039032, "memory(GiB)": 74.62, "step": 558, "token_acc": 0.8835978835978836, "train_speed(iter/s)": 0.0224 }, { "epoch": 0.45217391304347826, "grad_norm": 2.914745807647705, "learning_rate": 6.198075958587168e-06, "loss": 0.42803430557250977, "memory(GiB)": 74.62, "step": 559, "token_acc": 0.8418079096045198, "train_speed(iter/s)": 0.022401 }, { "epoch": 0.4529828109201213, "grad_norm": 2.470507860183716, "learning_rate": 6.185092663686671e-06, "loss": 0.4218277931213379, "memory(GiB)": 74.62, "step": 560, "token_acc": 0.8411764705882353, "train_speed(iter/s)": 0.022402 }, { "epoch": 0.4537917087967644, "grad_norm": 1.9057127237319946, "learning_rate": 6.172100896978985e-06, "loss": 0.3940941095352173, "memory(GiB)": 74.62, "step": 561, "token_acc": 0.8507936507936508, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.45460060667340746, "grad_norm": 3.1265318393707275, "learning_rate": 6.1591007513376425e-06, "loss": 0.4158666431903839, "memory(GiB)": 74.62, "step": 562, "token_acc": 0.8809523809523809, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.4554095045500506, "grad_norm": 2.3407959938049316, "learning_rate": 6.146092319696073e-06, "loss": 0.4111853241920471, "memory(GiB)": 74.62, "step": 563, "token_acc": 0.8944723618090452, "train_speed(iter/s)": 0.022404 }, { "epoch": 0.45621840242669365, "grad_norm": 2.639300584793091, "learning_rate": 6.133075695046944e-06, "loss": 0.41796183586120605, "memory(GiB)": 74.62, "step": 564, "token_acc": 0.8415094339622642, "train_speed(iter/s)": 0.022405 }, { "epoch": 0.4570273003033367, "grad_norm": 2.0815927982330322, "learning_rate": 6.120050970441485e-06, "loss": 0.4047802686691284, "memory(GiB)": 74.62, "step": 565, "token_acc": 0.8901734104046243, "train_speed(iter/s)": 0.022406 }, { "epoch": 0.4578361981799798, "grad_norm": 2.186722993850708, "learning_rate": 6.107018238988838e-06, "loss": 0.45547983050346375, "memory(GiB)": 74.62, "step": 566, "token_acc": 0.8584905660377359, "train_speed(iter/s)": 0.022406 }, { "epoch": 0.45864509605662285, "grad_norm": 2.1137285232543945, "learning_rate": 6.093977593855376e-06, "loss": 0.4355093836784363, "memory(GiB)": 74.62, "step": 567, "token_acc": 0.8921933085501859, "train_speed(iter/s)": 0.022407 }, { "epoch": 0.4594539939332659, "grad_norm": 2.740379810333252, "learning_rate": 6.080929128264046e-06, "loss": 0.5192371606826782, "memory(GiB)": 74.62, "step": 568, "token_acc": 0.8766519823788547, "train_speed(iter/s)": 0.022408 }, { "epoch": 0.460262891809909, "grad_norm": 2.2080211639404297, "learning_rate": 6.067872935493703e-06, "loss": 0.3434896767139435, "memory(GiB)": 74.62, "step": 569, "token_acc": 0.9264069264069265, "train_speed(iter/s)": 0.022408 }, { "epoch": 0.46107178968655205, "grad_norm": 2.196671724319458, "learning_rate": 6.054809108878438e-06, "loss": 0.4425520896911621, "memory(GiB)": 74.62, "step": 570, "token_acc": 0.8904761904761904, "train_speed(iter/s)": 0.022409 }, { "epoch": 0.46188068756319517, "grad_norm": 2.0799689292907715, "learning_rate": 6.041737741806914e-06, "loss": 0.4603237509727478, "memory(GiB)": 74.62, "step": 571, "token_acc": 0.8606060606060606, "train_speed(iter/s)": 0.02241 }, { "epoch": 0.46268958543983824, "grad_norm": 2.2659521102905273, "learning_rate": 6.028658927721698e-06, "loss": 0.3965636193752289, "memory(GiB)": 74.62, "step": 572, "token_acc": 0.8088235294117647, "train_speed(iter/s)": 0.02241 }, { "epoch": 0.4634984833164813, "grad_norm": 1.9087399244308472, "learning_rate": 6.015572760118597e-06, "loss": 0.3759012222290039, "memory(GiB)": 74.62, "step": 573, "token_acc": 0.8742331288343558, "train_speed(iter/s)": 0.022411 }, { "epoch": 0.46430738119312437, "grad_norm": 1.982033610343933, "learning_rate": 6.002479332545982e-06, "loss": 0.45862114429473877, "memory(GiB)": 74.62, "step": 574, "token_acc": 0.8328173374613003, "train_speed(iter/s)": 0.022411 }, { "epoch": 0.46511627906976744, "grad_norm": 3.0300614833831787, "learning_rate": 5.989378738604121e-06, "loss": 0.47833582758903503, "memory(GiB)": 74.62, "step": 575, "token_acc": 0.8853211009174312, "train_speed(iter/s)": 0.022412 }, { "epoch": 0.4659251769464105, "grad_norm": 2.1511874198913574, "learning_rate": 5.976271071944517e-06, "loss": 0.4461168348789215, "memory(GiB)": 74.62, "step": 576, "token_acc": 0.8412698412698413, "train_speed(iter/s)": 0.022413 }, { "epoch": 0.46673407482305357, "grad_norm": 2.324009418487549, "learning_rate": 5.963156426269228e-06, "loss": 0.3640004098415375, "memory(GiB)": 74.62, "step": 577, "token_acc": 0.8808510638297873, "train_speed(iter/s)": 0.022413 }, { "epoch": 0.4675429726996967, "grad_norm": 2.6052918434143066, "learning_rate": 5.9500348953302055e-06, "loss": 0.3626942038536072, "memory(GiB)": 74.62, "step": 578, "token_acc": 0.8615384615384616, "train_speed(iter/s)": 0.022414 }, { "epoch": 0.46835187057633976, "grad_norm": 3.0375425815582275, "learning_rate": 5.936906572928625e-06, "loss": 0.4241126775741577, "memory(GiB)": 74.62, "step": 579, "token_acc": 0.8881987577639752, "train_speed(iter/s)": 0.022415 }, { "epoch": 0.4691607684529828, "grad_norm": 2.636939764022827, "learning_rate": 5.923771552914202e-06, "loss": 0.4479450583457947, "memory(GiB)": 74.62, "step": 580, "token_acc": 0.8616071428571429, "train_speed(iter/s)": 0.022416 }, { "epoch": 0.4699696663296259, "grad_norm": 1.9995110034942627, "learning_rate": 5.910629929184541e-06, "loss": 0.37398701906204224, "memory(GiB)": 74.62, "step": 581, "token_acc": 0.8115942028985508, "train_speed(iter/s)": 0.022416 }, { "epoch": 0.47077856420626896, "grad_norm": 2.149606227874756, "learning_rate": 5.897481795684447e-06, "loss": 0.4055722951889038, "memory(GiB)": 74.62, "step": 582, "token_acc": 0.8668941979522184, "train_speed(iter/s)": 0.022417 }, { "epoch": 0.471587462082912, "grad_norm": 3.842085599899292, "learning_rate": 5.8843272464052626e-06, "loss": 0.38462674617767334, "memory(GiB)": 74.62, "step": 583, "token_acc": 0.8869565217391304, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.4723963599595551, "grad_norm": 2.599775552749634, "learning_rate": 5.871166375384201e-06, "loss": 0.4538233280181885, "memory(GiB)": 74.62, "step": 584, "token_acc": 0.8263888888888888, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.47320525783619816, "grad_norm": 2.188464403152466, "learning_rate": 5.857999276703657e-06, "loss": 0.39639097452163696, "memory(GiB)": 74.62, "step": 585, "token_acc": 0.8488372093023255, "train_speed(iter/s)": 0.022419 }, { "epoch": 0.4740141557128413, "grad_norm": 2.0777783393859863, "learning_rate": 5.844826044490551e-06, "loss": 0.40574946999549866, "memory(GiB)": 74.62, "step": 586, "token_acc": 0.863013698630137, "train_speed(iter/s)": 0.02242 }, { "epoch": 0.47482305358948435, "grad_norm": 2.120650053024292, "learning_rate": 5.831646772915651e-06, "loss": 0.4573715329170227, "memory(GiB)": 74.62, "step": 587, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022421 }, { "epoch": 0.4756319514661274, "grad_norm": 2.0684597492218018, "learning_rate": 5.8184615561928924e-06, "loss": 0.39603498578071594, "memory(GiB)": 74.62, "step": 588, "token_acc": 0.8802816901408451, "train_speed(iter/s)": 0.022421 }, { "epoch": 0.4764408493427705, "grad_norm": 2.653454303741455, "learning_rate": 5.805270488578715e-06, "loss": 0.4210537075996399, "memory(GiB)": 74.62, "step": 589, "token_acc": 0.8981481481481481, "train_speed(iter/s)": 0.022422 }, { "epoch": 0.47724974721941354, "grad_norm": 2.2436983585357666, "learning_rate": 5.7920736643713835e-06, "loss": 0.3758474290370941, "memory(GiB)": 74.62, "step": 590, "token_acc": 0.8515625, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.4780586450960566, "grad_norm": 3.6357314586639404, "learning_rate": 5.778871177910315e-06, "loss": 0.4624039828777313, "memory(GiB)": 74.62, "step": 591, "token_acc": 0.8033898305084746, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.4788675429726997, "grad_norm": 2.0779330730438232, "learning_rate": 5.765663123575401e-06, "loss": 0.4041805863380432, "memory(GiB)": 74.62, "step": 592, "token_acc": 0.8620689655172413, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.47967644084934274, "grad_norm": 2.654712200164795, "learning_rate": 5.752449595786341e-06, "loss": 0.3960053324699402, "memory(GiB)": 74.62, "step": 593, "token_acc": 0.8228782287822878, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.48048533872598587, "grad_norm": 2.4642553329467773, "learning_rate": 5.7392306890019565e-06, "loss": 0.41592419147491455, "memory(GiB)": 74.62, "step": 594, "token_acc": 0.7847533632286996, "train_speed(iter/s)": 0.022425 }, { "epoch": 0.48129423660262893, "grad_norm": 2.2550253868103027, "learning_rate": 5.726006497719525e-06, "loss": 0.46111100912094116, "memory(GiB)": 74.62, "step": 595, "token_acc": 0.8361204013377926, "train_speed(iter/s)": 0.022426 }, { "epoch": 0.482103134479272, "grad_norm": 2.8922863006591797, "learning_rate": 5.712777116474103e-06, "loss": 0.5086416006088257, "memory(GiB)": 74.62, "step": 596, "token_acc": 0.8284023668639053, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.48291203235591507, "grad_norm": 2.173737049102783, "learning_rate": 5.699542639837844e-06, "loss": 0.45955491065979004, "memory(GiB)": 74.62, "step": 597, "token_acc": 0.8786610878661087, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.48372093023255813, "grad_norm": 1.9948984384536743, "learning_rate": 5.686303162419326e-06, "loss": 0.4127792716026306, "memory(GiB)": 74.62, "step": 598, "token_acc": 0.8712121212121212, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.4845298281092012, "grad_norm": 2.446259021759033, "learning_rate": 5.6730587788628785e-06, "loss": 0.4015938341617584, "memory(GiB)": 74.62, "step": 599, "token_acc": 0.8502202643171806, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.48533872598584427, "grad_norm": 2.781144618988037, "learning_rate": 5.659809583847907e-06, "loss": 0.44586971402168274, "memory(GiB)": 74.62, "step": 600, "token_acc": 0.8482490272373541, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.4861476238624874, "grad_norm": 2.267489433288574, "learning_rate": 5.646555672088203e-06, "loss": 0.36807918548583984, "memory(GiB)": 74.62, "step": 601, "token_acc": 0.8648648648648649, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.48695652173913045, "grad_norm": 2.3026046752929688, "learning_rate": 5.633297138331285e-06, "loss": 0.4327083230018616, "memory(GiB)": 74.62, "step": 602, "token_acc": 0.8597122302158273, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.4877654196157735, "grad_norm": 2.635984420776367, "learning_rate": 5.620034077357708e-06, "loss": 0.44607388973236084, "memory(GiB)": 74.62, "step": 603, "token_acc": 0.8711111111111111, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.4885743174924166, "grad_norm": 2.5992751121520996, "learning_rate": 5.60676658398039e-06, "loss": 0.3917505145072937, "memory(GiB)": 74.62, "step": 604, "token_acc": 0.9137931034482759, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.48938321536905965, "grad_norm": 2.3977952003479004, "learning_rate": 5.593494753043938e-06, "loss": 0.41896378993988037, "memory(GiB)": 74.62, "step": 605, "token_acc": 0.8821548821548821, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.4901921132457027, "grad_norm": 2.1268513202667236, "learning_rate": 5.580218679423965e-06, "loss": 0.436327189207077, "memory(GiB)": 74.62, "step": 606, "token_acc": 0.8737864077669902, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.4910010111223458, "grad_norm": 3.2890071868896484, "learning_rate": 5.566938458026411e-06, "loss": 0.4408925771713257, "memory(GiB)": 74.62, "step": 607, "token_acc": 0.9095744680851063, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.49180990899898885, "grad_norm": 2.2176642417907715, "learning_rate": 5.553654183786872e-06, "loss": 0.46782928705215454, "memory(GiB)": 74.62, "step": 608, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.492618806875632, "grad_norm": 2.8756251335144043, "learning_rate": 5.540365951669913e-06, "loss": 0.4359992742538452, "memory(GiB)": 74.62, "step": 609, "token_acc": 0.8753993610223643, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.49342770475227504, "grad_norm": 2.9646661281585693, "learning_rate": 5.527073856668391e-06, "loss": 0.4747014343738556, "memory(GiB)": 74.62, "step": 610, "token_acc": 0.889795918367347, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.4942366026289181, "grad_norm": 2.289034128189087, "learning_rate": 5.513777993802781e-06, "loss": 0.4281376600265503, "memory(GiB)": 74.62, "step": 611, "token_acc": 0.87890625, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.4950455005055612, "grad_norm": 2.541618585586548, "learning_rate": 5.500478458120493e-06, "loss": 0.45447611808776855, "memory(GiB)": 74.62, "step": 612, "token_acc": 0.8346456692913385, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.49585439838220424, "grad_norm": 3.065063953399658, "learning_rate": 5.487175344695188e-06, "loss": 0.4350849688053131, "memory(GiB)": 74.62, "step": 613, "token_acc": 0.8583333333333333, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.4966632962588473, "grad_norm": 1.9416303634643555, "learning_rate": 5.47386874862611e-06, "loss": 0.4030672311782837, "memory(GiB)": 74.62, "step": 614, "token_acc": 0.8527397260273972, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.4974721941354904, "grad_norm": 2.4637768268585205, "learning_rate": 5.460558765037392e-06, "loss": 0.4326108396053314, "memory(GiB)": 74.62, "step": 615, "token_acc": 0.8831168831168831, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.49828109201213344, "grad_norm": 2.7800002098083496, "learning_rate": 5.447245489077389e-06, "loss": 0.42490726709365845, "memory(GiB)": 74.62, "step": 616, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.49908998988877656, "grad_norm": 4.720980167388916, "learning_rate": 5.433929015917988e-06, "loss": 0.39446377754211426, "memory(GiB)": 74.62, "step": 617, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.49989888776541963, "grad_norm": 2.4783382415771484, "learning_rate": 5.420609440753935e-06, "loss": 0.41358453035354614, "memory(GiB)": 74.62, "step": 618, "token_acc": 0.8716216216216216, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.5007077856420626, "grad_norm": 2.4651012420654297, "learning_rate": 5.407286858802147e-06, "loss": 0.3854910433292389, "memory(GiB)": 74.62, "step": 619, "token_acc": 0.8565217391304348, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.5015166835187057, "grad_norm": 2.053473472595215, "learning_rate": 5.393961365301041e-06, "loss": 0.3815562427043915, "memory(GiB)": 74.62, "step": 620, "token_acc": 0.888135593220339, "train_speed(iter/s)": 0.022425 }, { "epoch": 0.5023255813953489, "grad_norm": 2.1635167598724365, "learning_rate": 5.380633055509843e-06, "loss": 0.45562463998794556, "memory(GiB)": 74.62, "step": 621, "token_acc": 0.8426573426573427, "train_speed(iter/s)": 0.022426 }, { "epoch": 0.503134479271992, "grad_norm": 2.1759238243103027, "learning_rate": 5.367302024707911e-06, "loss": 0.4003329873085022, "memory(GiB)": 74.62, "step": 622, "token_acc": 0.8444444444444444, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.503943377148635, "grad_norm": 2.391221284866333, "learning_rate": 5.35396836819406e-06, "loss": 0.4506310820579529, "memory(GiB)": 74.62, "step": 623, "token_acc": 0.8243727598566308, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.5047522750252781, "grad_norm": 2.422003746032715, "learning_rate": 5.340632181285872e-06, "loss": 0.3775983154773712, "memory(GiB)": 74.62, "step": 624, "token_acc": 0.9178082191780822, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.5055611729019212, "grad_norm": 2.822801113128662, "learning_rate": 5.327293559319014e-06, "loss": 0.46088916063308716, "memory(GiB)": 74.62, "step": 625, "token_acc": 0.8203389830508474, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.5063700707785642, "grad_norm": 2.9713943004608154, "learning_rate": 5.3139525976465675e-06, "loss": 0.4233189821243286, "memory(GiB)": 74.62, "step": 626, "token_acc": 0.8892988929889298, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.5071789686552073, "grad_norm": 2.24816632270813, "learning_rate": 5.300609391638336e-06, "loss": 0.45002853870391846, "memory(GiB)": 74.62, "step": 627, "token_acc": 0.835820895522388, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.5079878665318504, "grad_norm": 3.1802284717559814, "learning_rate": 5.287264036680166e-06, "loss": 0.40955209732055664, "memory(GiB)": 74.62, "step": 628, "token_acc": 0.9, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.5087967644084934, "grad_norm": 2.9746017456054688, "learning_rate": 5.27391662817327e-06, "loss": 0.4412648677825928, "memory(GiB)": 74.62, "step": 629, "token_acc": 0.864951768488746, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.5096056622851365, "grad_norm": 7.995876312255859, "learning_rate": 5.260567261533538e-06, "loss": 0.4368639886379242, "memory(GiB)": 74.62, "step": 630, "token_acc": 0.9067796610169492, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.5104145601617796, "grad_norm": 4.124439239501953, "learning_rate": 5.2472160321908535e-06, "loss": 0.3601537346839905, "memory(GiB)": 74.62, "step": 631, "token_acc": 0.9384615384615385, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.5112234580384226, "grad_norm": 2.16349196434021, "learning_rate": 5.233863035588427e-06, "loss": 0.49298688769340515, "memory(GiB)": 74.62, "step": 632, "token_acc": 0.8697318007662835, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.5120323559150657, "grad_norm": 3.2173032760620117, "learning_rate": 5.22050836718209e-06, "loss": 0.3806041479110718, "memory(GiB)": 74.62, "step": 633, "token_acc": 0.9253112033195021, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.5128412537917088, "grad_norm": 2.4195048809051514, "learning_rate": 5.207152122439635e-06, "loss": 0.41035759449005127, "memory(GiB)": 74.62, "step": 634, "token_acc": 0.86328125, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.5136501516683518, "grad_norm": 2.598662853240967, "learning_rate": 5.1937943968401175e-06, "loss": 0.40409672260284424, "memory(GiB)": 74.62, "step": 635, "token_acc": 0.9050279329608939, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.514459049544995, "grad_norm": 3.158039093017578, "learning_rate": 5.180435285873182e-06, "loss": 0.4163573682308197, "memory(GiB)": 74.62, "step": 636, "token_acc": 0.8577405857740585, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.5152679474216381, "grad_norm": 2.9024956226348877, "learning_rate": 5.1670748850383734e-06, "loss": 0.43788814544677734, "memory(GiB)": 74.62, "step": 637, "token_acc": 0.8318181818181818, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.5160768452982811, "grad_norm": 5.88484001159668, "learning_rate": 5.153713289844462e-06, "loss": 0.43005481362342834, "memory(GiB)": 74.62, "step": 638, "token_acc": 0.8546099290780141, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.5168857431749242, "grad_norm": 2.6073086261749268, "learning_rate": 5.140350595808751e-06, "loss": 0.441942036151886, "memory(GiB)": 74.62, "step": 639, "token_acc": 0.7777777777777778, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.5176946410515673, "grad_norm": 2.607276439666748, "learning_rate": 5.126986898456401e-06, "loss": 0.40762656927108765, "memory(GiB)": 74.62, "step": 640, "token_acc": 0.9018181818181819, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.5185035389282103, "grad_norm": 3.1285383701324463, "learning_rate": 5.113622293319749e-06, "loss": 0.4376784861087799, "memory(GiB)": 74.62, "step": 641, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.5193124368048534, "grad_norm": 2.1132287979125977, "learning_rate": 5.1002568759376134e-06, "loss": 0.3872153162956238, "memory(GiB)": 74.62, "step": 642, "token_acc": 0.8991596638655462, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.5201213346814965, "grad_norm": 2.294435501098633, "learning_rate": 5.086890741854626e-06, "loss": 0.4477715492248535, "memory(GiB)": 74.62, "step": 643, "token_acc": 0.8445945945945946, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.5209302325581395, "grad_norm": 4.424786567687988, "learning_rate": 5.073523986620539e-06, "loss": 0.4204040765762329, "memory(GiB)": 74.62, "step": 644, "token_acc": 0.8901960784313725, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.5217391304347826, "grad_norm": 6.769619941711426, "learning_rate": 5.060156705789545e-06, "loss": 0.433963418006897, "memory(GiB)": 74.62, "step": 645, "token_acc": 0.8599221789883269, "train_speed(iter/s)": 0.02244 }, { "epoch": 0.5225480283114257, "grad_norm": 2.297720193862915, "learning_rate": 5.046788994919595e-06, "loss": 0.38716062903404236, "memory(GiB)": 74.62, "step": 646, "token_acc": 0.9004329004329005, "train_speed(iter/s)": 0.022441 }, { "epoch": 0.5233569261880687, "grad_norm": 3.8223865032196045, "learning_rate": 5.033420949571712e-06, "loss": 0.3824414610862732, "memory(GiB)": 74.62, "step": 647, "token_acc": 0.898989898989899, "train_speed(iter/s)": 0.022441 }, { "epoch": 0.5241658240647118, "grad_norm": 2.3025248050689697, "learning_rate": 5.020052665309312e-06, "loss": 0.40017083287239075, "memory(GiB)": 74.62, "step": 648, "token_acc": 0.8936170212765957, "train_speed(iter/s)": 0.022442 }, { "epoch": 0.5249747219413549, "grad_norm": 1.8813366889953613, "learning_rate": 5.00668423769752e-06, "loss": 0.3807840347290039, "memory(GiB)": 74.62, "step": 649, "token_acc": 0.8823529411764706, "train_speed(iter/s)": 0.022442 }, { "epoch": 0.5257836198179979, "grad_norm": 2.805870532989502, "learning_rate": 4.993315762302483e-06, "loss": 0.4545632302761078, "memory(GiB)": 74.62, "step": 650, "token_acc": 0.8395061728395061, "train_speed(iter/s)": 0.022443 }, { "epoch": 0.5265925176946411, "grad_norm": 2.4668116569519043, "learning_rate": 4.97994733469069e-06, "loss": 0.39456599950790405, "memory(GiB)": 74.62, "step": 651, "token_acc": 0.8664122137404581, "train_speed(iter/s)": 0.022443 }, { "epoch": 0.5274014155712842, "grad_norm": 2.224895715713501, "learning_rate": 4.96657905042829e-06, "loss": 0.3933877944946289, "memory(GiB)": 74.62, "step": 652, "token_acc": 0.8654708520179372, "train_speed(iter/s)": 0.022444 }, { "epoch": 0.5282103134479272, "grad_norm": 2.5314419269561768, "learning_rate": 4.9532110050804074e-06, "loss": 0.36528831720352173, "memory(GiB)": 74.62, "step": 653, "token_acc": 0.9087591240875912, "train_speed(iter/s)": 0.022444 }, { "epoch": 0.5290192113245703, "grad_norm": 2.0852181911468506, "learning_rate": 4.939843294210456e-06, "loss": 0.39938467741012573, "memory(GiB)": 74.62, "step": 654, "token_acc": 0.8872180451127819, "train_speed(iter/s)": 0.022445 }, { "epoch": 0.5298281092012134, "grad_norm": 2.4768409729003906, "learning_rate": 4.926476013379462e-06, "loss": 0.4587656259536743, "memory(GiB)": 74.62, "step": 655, "token_acc": 0.9147540983606557, "train_speed(iter/s)": 0.022446 }, { "epoch": 0.5306370070778564, "grad_norm": 3.768552303314209, "learning_rate": 4.9131092581453745e-06, "loss": 0.4000494182109833, "memory(GiB)": 74.62, "step": 656, "token_acc": 0.8588957055214724, "train_speed(iter/s)": 0.022446 }, { "epoch": 0.5314459049544995, "grad_norm": 2.7904086112976074, "learning_rate": 4.899743124062387e-06, "loss": 0.42587220668792725, "memory(GiB)": 74.62, "step": 657, "token_acc": 0.8741258741258742, "train_speed(iter/s)": 0.022447 }, { "epoch": 0.5322548028311426, "grad_norm": 2.2774369716644287, "learning_rate": 4.886377706680253e-06, "loss": 0.38174745440483093, "memory(GiB)": 74.62, "step": 658, "token_acc": 0.8697318007662835, "train_speed(iter/s)": 0.022447 }, { "epoch": 0.5330637007077856, "grad_norm": 2.049821376800537, "learning_rate": 4.873013101543599e-06, "loss": 0.4340623617172241, "memory(GiB)": 74.62, "step": 659, "token_acc": 0.8543046357615894, "train_speed(iter/s)": 0.022448 }, { "epoch": 0.5338725985844287, "grad_norm": 2.252617120742798, "learning_rate": 4.859649404191251e-06, "loss": 0.35842257738113403, "memory(GiB)": 74.62, "step": 660, "token_acc": 0.8933333333333333, "train_speed(iter/s)": 0.022448 }, { "epoch": 0.5346814964610718, "grad_norm": 2.1607117652893066, "learning_rate": 4.84628671015554e-06, "loss": 0.40685737133026123, "memory(GiB)": 74.62, "step": 661, "token_acc": 0.8737201365187713, "train_speed(iter/s)": 0.022449 }, { "epoch": 0.5354903943377148, "grad_norm": 2.924506425857544, "learning_rate": 4.832925114961629e-06, "loss": 0.44293731451034546, "memory(GiB)": 74.62, "step": 662, "token_acc": 0.8465608465608465, "train_speed(iter/s)": 0.02245 }, { "epoch": 0.5362992922143579, "grad_norm": 3.0079522132873535, "learning_rate": 4.8195647141268196e-06, "loss": 0.4585626423358917, "memory(GiB)": 74.62, "step": 663, "token_acc": 0.8599221789883269, "train_speed(iter/s)": 0.02245 }, { "epoch": 0.537108190091001, "grad_norm": 2.986860990524292, "learning_rate": 4.8062056031598825e-06, "loss": 0.4173978567123413, "memory(GiB)": 74.62, "step": 664, "token_acc": 0.8721804511278195, "train_speed(iter/s)": 0.022451 }, { "epoch": 0.537917087967644, "grad_norm": 2.1893157958984375, "learning_rate": 4.792847877560367e-06, "loss": 0.40209460258483887, "memory(GiB)": 74.62, "step": 665, "token_acc": 0.8129770992366412, "train_speed(iter/s)": 0.022451 }, { "epoch": 0.5387259858442872, "grad_norm": 2.2716012001037598, "learning_rate": 4.779491632817911e-06, "loss": 0.4765605926513672, "memory(GiB)": 74.62, "step": 666, "token_acc": 0.8706293706293706, "train_speed(iter/s)": 0.022452 }, { "epoch": 0.5395348837209303, "grad_norm": 2.23425555229187, "learning_rate": 4.766136964411576e-06, "loss": 0.39718160033226013, "memory(GiB)": 74.62, "step": 667, "token_acc": 0.8536585365853658, "train_speed(iter/s)": 0.022452 }, { "epoch": 0.5403437815975733, "grad_norm": 2.647259473800659, "learning_rate": 4.752783967809147e-06, "loss": 0.4938986301422119, "memory(GiB)": 74.62, "step": 668, "token_acc": 0.8101694915254237, "train_speed(iter/s)": 0.022453 }, { "epoch": 0.5411526794742164, "grad_norm": 2.081202507019043, "learning_rate": 4.739432738466465e-06, "loss": 0.4376961588859558, "memory(GiB)": 74.62, "step": 669, "token_acc": 0.8683274021352313, "train_speed(iter/s)": 0.022453 }, { "epoch": 0.5419615773508595, "grad_norm": 2.3195981979370117, "learning_rate": 4.726083371826731e-06, "loss": 0.3606075644493103, "memory(GiB)": 74.62, "step": 670, "token_acc": 0.8583690987124464, "train_speed(iter/s)": 0.022454 }, { "epoch": 0.5427704752275025, "grad_norm": 2.1184582710266113, "learning_rate": 4.712735963319834e-06, "loss": 0.4429006576538086, "memory(GiB)": 74.62, "step": 671, "token_acc": 0.8438818565400844, "train_speed(iter/s)": 0.022454 }, { "epoch": 0.5435793731041456, "grad_norm": 2.6941933631896973, "learning_rate": 4.699390608361665e-06, "loss": 0.41405189037323, "memory(GiB)": 74.62, "step": 672, "token_acc": 0.8790697674418605, "train_speed(iter/s)": 0.022455 }, { "epoch": 0.5443882709807887, "grad_norm": 2.466550588607788, "learning_rate": 4.686047402353433e-06, "loss": 0.4570333659648895, "memory(GiB)": 74.62, "step": 673, "token_acc": 0.8647686832740213, "train_speed(iter/s)": 0.022455 }, { "epoch": 0.5451971688574317, "grad_norm": 3.1605703830718994, "learning_rate": 4.672706440680989e-06, "loss": 0.3652383089065552, "memory(GiB)": 74.62, "step": 674, "token_acc": 0.8957345971563981, "train_speed(iter/s)": 0.022456 }, { "epoch": 0.5460060667340748, "grad_norm": 2.547511577606201, "learning_rate": 4.65936781871413e-06, "loss": 0.4206015467643738, "memory(GiB)": 74.62, "step": 675, "token_acc": 0.88671875, "train_speed(iter/s)": 0.022456 }, { "epoch": 0.5468149646107179, "grad_norm": 2.2908408641815186, "learning_rate": 4.64603163180594e-06, "loss": 0.42101001739501953, "memory(GiB)": 74.62, "step": 676, "token_acc": 0.9054054054054054, "train_speed(iter/s)": 0.022457 }, { "epoch": 0.547623862487361, "grad_norm": 2.6179423332214355, "learning_rate": 4.6326979752920905e-06, "loss": 0.4017224907875061, "memory(GiB)": 74.62, "step": 677, "token_acc": 0.8642533936651584, "train_speed(iter/s)": 0.022457 }, { "epoch": 0.548432760364004, "grad_norm": 2.2148091793060303, "learning_rate": 4.619366944490158e-06, "loss": 0.3605102300643921, "memory(GiB)": 74.62, "step": 678, "token_acc": 0.8927038626609443, "train_speed(iter/s)": 0.022458 }, { "epoch": 0.5492416582406471, "grad_norm": 2.3841159343719482, "learning_rate": 4.60603863469896e-06, "loss": 0.3840959370136261, "memory(GiB)": 74.62, "step": 679, "token_acc": 0.8226415094339623, "train_speed(iter/s)": 0.022458 }, { "epoch": 0.5500505561172901, "grad_norm": 2.1525049209594727, "learning_rate": 4.5927131411978536e-06, "loss": 0.41845589876174927, "memory(GiB)": 74.62, "step": 680, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022459 }, { "epoch": 0.5508594539939332, "grad_norm": 2.088181495666504, "learning_rate": 4.579390559246066e-06, "loss": 0.3538067936897278, "memory(GiB)": 74.62, "step": 681, "token_acc": 0.8301282051282052, "train_speed(iter/s)": 0.022459 }, { "epoch": 0.5516683518705764, "grad_norm": 4.506858825683594, "learning_rate": 4.566070984082013e-06, "loss": 0.4188098907470703, "memory(GiB)": 74.62, "step": 682, "token_acc": 0.8808777429467085, "train_speed(iter/s)": 0.02246 }, { "epoch": 0.5524772497472195, "grad_norm": 7.24404764175415, "learning_rate": 4.552754510922612e-06, "loss": 0.3949962258338928, "memory(GiB)": 74.62, "step": 683, "token_acc": 0.8771929824561403, "train_speed(iter/s)": 0.02246 }, { "epoch": 0.5532861476238625, "grad_norm": 2.410817861557007, "learning_rate": 4.539441234962609e-06, "loss": 0.36630767583847046, "memory(GiB)": 74.62, "step": 684, "token_acc": 0.8398692810457516, "train_speed(iter/s)": 0.022461 }, { "epoch": 0.5540950455005056, "grad_norm": 3.47383975982666, "learning_rate": 4.526131251373892e-06, "loss": 0.4143676161766052, "memory(GiB)": 74.62, "step": 685, "token_acc": 0.8458149779735683, "train_speed(iter/s)": 0.022461 }, { "epoch": 0.5549039433771487, "grad_norm": 3.989591360092163, "learning_rate": 4.512824655304814e-06, "loss": 0.39957284927368164, "memory(GiB)": 74.62, "step": 686, "token_acc": 0.8847457627118644, "train_speed(iter/s)": 0.022462 }, { "epoch": 0.5557128412537917, "grad_norm": 2.368927001953125, "learning_rate": 4.499521541879508e-06, "loss": 0.3500638008117676, "memory(GiB)": 74.62, "step": 687, "token_acc": 0.8498402555910544, "train_speed(iter/s)": 0.022462 }, { "epoch": 0.5565217391304348, "grad_norm": 2.1441452503204346, "learning_rate": 4.48622200619722e-06, "loss": 0.3939352035522461, "memory(GiB)": 74.62, "step": 688, "token_acc": 0.9003831417624522, "train_speed(iter/s)": 0.022463 }, { "epoch": 0.5573306370070779, "grad_norm": 2.4296200275421143, "learning_rate": 4.472926143331612e-06, "loss": 0.4165255129337311, "memory(GiB)": 74.62, "step": 689, "token_acc": 0.8741935483870967, "train_speed(iter/s)": 0.022463 }, { "epoch": 0.5581395348837209, "grad_norm": 2.0704715251922607, "learning_rate": 4.459634048330089e-06, "loss": 0.3778902292251587, "memory(GiB)": 74.62, "step": 690, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022464 }, { "epoch": 0.558948432760364, "grad_norm": 1.9288545846939087, "learning_rate": 4.44634581621313e-06, "loss": 0.3621513843536377, "memory(GiB)": 74.62, "step": 691, "token_acc": 0.8803827751196173, "train_speed(iter/s)": 0.022464 }, { "epoch": 0.5597573306370071, "grad_norm": 2.8786773681640625, "learning_rate": 4.433061541973591e-06, "loss": 0.46439093351364136, "memory(GiB)": 74.62, "step": 692, "token_acc": 0.8423076923076923, "train_speed(iter/s)": 0.022465 }, { "epoch": 0.5605662285136501, "grad_norm": 7.472469329833984, "learning_rate": 4.419781320576037e-06, "loss": 0.3596475124359131, "memory(GiB)": 74.62, "step": 693, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022465 }, { "epoch": 0.5613751263902932, "grad_norm": 2.2149417400360107, "learning_rate": 4.406505246956064e-06, "loss": 0.39849790930747986, "memory(GiB)": 74.62, "step": 694, "token_acc": 0.861904761904762, "train_speed(iter/s)": 0.022466 }, { "epoch": 0.5621840242669363, "grad_norm": 2.573707342147827, "learning_rate": 4.393233416019611e-06, "loss": 0.33962416648864746, "memory(GiB)": 74.62, "step": 695, "token_acc": 0.875, "train_speed(iter/s)": 0.022466 }, { "epoch": 0.5629929221435793, "grad_norm": 2.2001919746398926, "learning_rate": 4.379965922642294e-06, "loss": 0.43496495485305786, "memory(GiB)": 74.62, "step": 696, "token_acc": 0.8486842105263158, "train_speed(iter/s)": 0.022467 }, { "epoch": 0.5638018200202225, "grad_norm": 1.9872112274169922, "learning_rate": 4.366702861668717e-06, "loss": 0.3653467297554016, "memory(GiB)": 74.62, "step": 697, "token_acc": 0.8991935483870968, "train_speed(iter/s)": 0.022467 }, { "epoch": 0.5646107178968656, "grad_norm": 2.000946521759033, "learning_rate": 4.353444327911797e-06, "loss": 0.4383889138698578, "memory(GiB)": 74.62, "step": 698, "token_acc": 0.8419243986254296, "train_speed(iter/s)": 0.022468 }, { "epoch": 0.5654196157735086, "grad_norm": 2.3316028118133545, "learning_rate": 4.3401904161520944e-06, "loss": 0.4090406000614166, "memory(GiB)": 74.62, "step": 699, "token_acc": 0.8454106280193237, "train_speed(iter/s)": 0.022468 }, { "epoch": 0.5662285136501517, "grad_norm": 2.3193917274475098, "learning_rate": 4.3269412211371215e-06, "loss": 0.40262287855148315, "memory(GiB)": 74.62, "step": 700, "token_acc": 0.8765432098765432, "train_speed(iter/s)": 0.022469 }, { "epoch": 0.5670374115267948, "grad_norm": 2.7743844985961914, "learning_rate": 4.313696837580677e-06, "loss": 0.40288880467414856, "memory(GiB)": 74.62, "step": 701, "token_acc": 0.8993055555555556, "train_speed(iter/s)": 0.022469 }, { "epoch": 0.5678463094034378, "grad_norm": 1.9505183696746826, "learning_rate": 4.300457360162158e-06, "loss": 0.34644150733947754, "memory(GiB)": 74.62, "step": 702, "token_acc": 0.8622222222222222, "train_speed(iter/s)": 0.02247 }, { "epoch": 0.5686552072800809, "grad_norm": 2.183720588684082, "learning_rate": 4.287222883525897e-06, "loss": 0.429502934217453, "memory(GiB)": 74.62, "step": 703, "token_acc": 0.8661417322834646, "train_speed(iter/s)": 0.022471 }, { "epoch": 0.569464105156724, "grad_norm": 2.0480737686157227, "learning_rate": 4.273993502280476e-06, "loss": 0.3910590708255768, "memory(GiB)": 74.62, "step": 704, "token_acc": 0.8404255319148937, "train_speed(iter/s)": 0.022471 }, { "epoch": 0.570273003033367, "grad_norm": 2.1780683994293213, "learning_rate": 4.2607693109980435e-06, "loss": 0.45382118225097656, "memory(GiB)": 74.62, "step": 705, "token_acc": 0.8862745098039215, "train_speed(iter/s)": 0.022472 }, { "epoch": 0.5710819009100101, "grad_norm": 2.0752146244049072, "learning_rate": 4.247550404213661e-06, "loss": 0.39520663022994995, "memory(GiB)": 74.62, "step": 706, "token_acc": 0.8012422360248447, "train_speed(iter/s)": 0.022472 }, { "epoch": 0.5718907987866532, "grad_norm": 2.0002593994140625, "learning_rate": 4.2343368764246005e-06, "loss": 0.4130653738975525, "memory(GiB)": 74.62, "step": 707, "token_acc": 0.862453531598513, "train_speed(iter/s)": 0.022473 }, { "epoch": 0.5726996966632962, "grad_norm": 2.031238317489624, "learning_rate": 4.221128822089687e-06, "loss": 0.36960452795028687, "memory(GiB)": 74.62, "step": 708, "token_acc": 0.8981818181818182, "train_speed(iter/s)": 0.022473 }, { "epoch": 0.5735085945399393, "grad_norm": 2.3516478538513184, "learning_rate": 4.207926335628617e-06, "loss": 0.43690210580825806, "memory(GiB)": 74.62, "step": 709, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022474 }, { "epoch": 0.5743174924165824, "grad_norm": 2.5592732429504395, "learning_rate": 4.194729511421285e-06, "loss": 0.3793370723724365, "memory(GiB)": 74.62, "step": 710, "token_acc": 0.8393574297188755, "train_speed(iter/s)": 0.022474 }, { "epoch": 0.5751263902932254, "grad_norm": 2.097623825073242, "learning_rate": 4.181538443807109e-06, "loss": 0.39188504219055176, "memory(GiB)": 74.62, "step": 711, "token_acc": 0.9033613445378151, "train_speed(iter/s)": 0.022475 }, { "epoch": 0.5759352881698686, "grad_norm": 1.9303717613220215, "learning_rate": 4.1683532270843505e-06, "loss": 0.4174485504627228, "memory(GiB)": 74.62, "step": 712, "token_acc": 0.8346774193548387, "train_speed(iter/s)": 0.022475 }, { "epoch": 0.5767441860465117, "grad_norm": 2.5618019104003906, "learning_rate": 4.15517395550945e-06, "loss": 0.3718748390674591, "memory(GiB)": 74.62, "step": 713, "token_acc": 0.8719723183391004, "train_speed(iter/s)": 0.022475 }, { "epoch": 0.5775530839231547, "grad_norm": 2.322850227355957, "learning_rate": 4.1420007232963435e-06, "loss": 0.3762381970882416, "memory(GiB)": 74.62, "step": 714, "token_acc": 0.8874458874458875, "train_speed(iter/s)": 0.022476 }, { "epoch": 0.5783619817997978, "grad_norm": 2.1827359199523926, "learning_rate": 4.1288336246158e-06, "loss": 0.40151140093803406, "memory(GiB)": 74.62, "step": 715, "token_acc": 0.8538461538461538, "train_speed(iter/s)": 0.022476 }, { "epoch": 0.5791708796764409, "grad_norm": 2.6647045612335205, "learning_rate": 4.115672753594739e-06, "loss": 0.34364283084869385, "memory(GiB)": 74.62, "step": 716, "token_acc": 0.903448275862069, "train_speed(iter/s)": 0.022477 }, { "epoch": 0.5799797775530839, "grad_norm": 2.086578845977783, "learning_rate": 4.102518204315555e-06, "loss": 0.4202456474304199, "memory(GiB)": 74.62, "step": 717, "token_acc": 0.8202764976958525, "train_speed(iter/s)": 0.022477 }, { "epoch": 0.580788675429727, "grad_norm": 1.952487826347351, "learning_rate": 4.089370070815463e-06, "loss": 0.37721166014671326, "memory(GiB)": 74.62, "step": 718, "token_acc": 0.878419452887538, "train_speed(iter/s)": 0.022478 }, { "epoch": 0.5815975733063701, "grad_norm": 1.9967212677001953, "learning_rate": 4.0762284470858e-06, "loss": 0.42397576570510864, "memory(GiB)": 74.62, "step": 719, "token_acc": 0.8559322033898306, "train_speed(iter/s)": 0.022479 }, { "epoch": 0.5824064711830131, "grad_norm": 2.281806707382202, "learning_rate": 4.063093427071376e-06, "loss": 0.3868061900138855, "memory(GiB)": 74.62, "step": 720, "token_acc": 0.9313304721030042, "train_speed(iter/s)": 0.022479 }, { "epoch": 0.5832153690596562, "grad_norm": 2.5271997451782227, "learning_rate": 4.049965104669795e-06, "loss": 0.4714341163635254, "memory(GiB)": 74.62, "step": 721, "token_acc": 0.8309859154929577, "train_speed(iter/s)": 0.022479 }, { "epoch": 0.5840242669362993, "grad_norm": 2.1930084228515625, "learning_rate": 4.036843573730774e-06, "loss": 0.4007885456085205, "memory(GiB)": 74.62, "step": 722, "token_acc": 0.9045643153526971, "train_speed(iter/s)": 0.02248 }, { "epoch": 0.5848331648129423, "grad_norm": 2.2075302600860596, "learning_rate": 4.023728928055486e-06, "loss": 0.4345509707927704, "memory(GiB)": 74.62, "step": 723, "token_acc": 0.8504672897196262, "train_speed(iter/s)": 0.02248 }, { "epoch": 0.5856420626895854, "grad_norm": 2.093959331512451, "learning_rate": 4.0106212613958805e-06, "loss": 0.39234721660614014, "memory(GiB)": 74.62, "step": 724, "token_acc": 0.8838174273858921, "train_speed(iter/s)": 0.022481 }, { "epoch": 0.5864509605662285, "grad_norm": 2.8163022994995117, "learning_rate": 3.99752066745402e-06, "loss": 0.377105712890625, "memory(GiB)": 74.62, "step": 725, "token_acc": 0.8704318936877077, "train_speed(iter/s)": 0.022481 }, { "epoch": 0.5872598584428715, "grad_norm": 3.310258626937866, "learning_rate": 3.984427239881404e-06, "loss": 0.33992162346839905, "memory(GiB)": 74.62, "step": 726, "token_acc": 0.8781725888324873, "train_speed(iter/s)": 0.022482 }, { "epoch": 0.5880687563195146, "grad_norm": 2.1290695667266846, "learning_rate": 3.971341072278302e-06, "loss": 0.3612005114555359, "memory(GiB)": 74.62, "step": 727, "token_acc": 0.8576642335766423, "train_speed(iter/s)": 0.022482 }, { "epoch": 0.5888776541961578, "grad_norm": 2.370741844177246, "learning_rate": 3.958262258193089e-06, "loss": 0.39483344554901123, "memory(GiB)": 74.62, "step": 728, "token_acc": 0.8625954198473282, "train_speed(iter/s)": 0.022482 }, { "epoch": 0.5896865520728009, "grad_norm": 1.9654161930084229, "learning_rate": 3.9451908911215645e-06, "loss": 0.3784998059272766, "memory(GiB)": 74.62, "step": 729, "token_acc": 0.8663101604278075, "train_speed(iter/s)": 0.022483 }, { "epoch": 0.5904954499494439, "grad_norm": 2.5404610633850098, "learning_rate": 3.9321270645062995e-06, "loss": 0.4317411780357361, "memory(GiB)": 74.62, "step": 730, "token_acc": 0.8413793103448276, "train_speed(iter/s)": 0.022484 }, { "epoch": 0.591304347826087, "grad_norm": 1.932789921760559, "learning_rate": 3.919070871735956e-06, "loss": 0.3979855179786682, "memory(GiB)": 74.62, "step": 731, "token_acc": 0.8699551569506726, "train_speed(iter/s)": 0.022484 }, { "epoch": 0.59211324570273, "grad_norm": 2.322033643722534, "learning_rate": 3.906022406144625e-06, "loss": 0.4147607088088989, "memory(GiB)": 74.62, "step": 732, "token_acc": 0.8712871287128713, "train_speed(iter/s)": 0.022484 }, { "epoch": 0.5929221435793731, "grad_norm": 2.0661261081695557, "learning_rate": 3.892981761011164e-06, "loss": 0.3968489170074463, "memory(GiB)": 74.62, "step": 733, "token_acc": 0.8418367346938775, "train_speed(iter/s)": 0.022485 }, { "epoch": 0.5937310414560162, "grad_norm": 1.8793938159942627, "learning_rate": 3.8799490295585155e-06, "loss": 0.34254151582717896, "memory(GiB)": 74.62, "step": 734, "token_acc": 0.9105263157894737, "train_speed(iter/s)": 0.022485 }, { "epoch": 0.5945399393326593, "grad_norm": 3.2460901737213135, "learning_rate": 3.866924304953059e-06, "loss": 0.4647367298603058, "memory(GiB)": 74.62, "step": 735, "token_acc": 0.864, "train_speed(iter/s)": 0.022486 }, { "epoch": 0.5953488372093023, "grad_norm": 2.1490590572357178, "learning_rate": 3.8539076803039285e-06, "loss": 0.4941931962966919, "memory(GiB)": 74.62, "step": 736, "token_acc": 0.843065693430657, "train_speed(iter/s)": 0.022486 }, { "epoch": 0.5961577350859454, "grad_norm": 2.9426324367523193, "learning_rate": 3.840899248662358e-06, "loss": 0.43801772594451904, "memory(GiB)": 74.62, "step": 737, "token_acc": 0.7985611510791367, "train_speed(iter/s)": 0.022487 }, { "epoch": 0.5969666329625885, "grad_norm": 1.8307894468307495, "learning_rate": 3.827899103021017e-06, "loss": 0.36532309651374817, "memory(GiB)": 74.62, "step": 738, "token_acc": 0.8484848484848485, "train_speed(iter/s)": 0.022487 }, { "epoch": 0.5977755308392315, "grad_norm": 1.6826763153076172, "learning_rate": 3.814907336313329e-06, "loss": 0.3788911998271942, "memory(GiB)": 74.62, "step": 739, "token_acc": 0.8656716417910447, "train_speed(iter/s)": 0.022488 }, { "epoch": 0.5985844287158746, "grad_norm": 3.5640852451324463, "learning_rate": 3.8019240414128335e-06, "loss": 0.3946545720100403, "memory(GiB)": 74.62, "step": 740, "token_acc": 0.8245614035087719, "train_speed(iter/s)": 0.022488 }, { "epoch": 0.5993933265925177, "grad_norm": 3.612060785293579, "learning_rate": 3.7889493111324977e-06, "loss": 0.4639260172843933, "memory(GiB)": 74.62, "step": 741, "token_acc": 0.8678571428571429, "train_speed(iter/s)": 0.022489 }, { "epoch": 0.6002022244691607, "grad_norm": 2.10774564743042, "learning_rate": 3.77598323822407e-06, "loss": 0.3779371380805969, "memory(GiB)": 74.62, "step": 742, "token_acc": 0.8962264150943396, "train_speed(iter/s)": 0.022489 }, { "epoch": 0.6010111223458039, "grad_norm": 2.0632522106170654, "learning_rate": 3.763025915377403e-06, "loss": 0.4415694773197174, "memory(GiB)": 74.62, "step": 743, "token_acc": 0.8744939271255061, "train_speed(iter/s)": 0.02249 }, { "epoch": 0.601820020222447, "grad_norm": 2.2084765434265137, "learning_rate": 3.7500774352198066e-06, "loss": 0.4385090470314026, "memory(GiB)": 74.62, "step": 744, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 0.02249 }, { "epoch": 0.60262891809909, "grad_norm": 3.2526354789733887, "learning_rate": 3.7371378903153747e-06, "loss": 0.36739417910575867, "memory(GiB)": 74.62, "step": 745, "token_acc": 0.8622047244094488, "train_speed(iter/s)": 0.022491 }, { "epoch": 0.6034378159757331, "grad_norm": 2.1862826347351074, "learning_rate": 3.7242073731643212e-06, "loss": 0.39445218443870544, "memory(GiB)": 74.62, "step": 746, "token_acc": 0.9465648854961832, "train_speed(iter/s)": 0.022491 }, { "epoch": 0.6042467138523762, "grad_norm": 1.964879035949707, "learning_rate": 3.711285976202331e-06, "loss": 0.4600139558315277, "memory(GiB)": 74.62, "step": 747, "token_acc": 0.8509803921568627, "train_speed(iter/s)": 0.022491 }, { "epoch": 0.6050556117290192, "grad_norm": 2.6029324531555176, "learning_rate": 3.6983737917998858e-06, "loss": 0.38224440813064575, "memory(GiB)": 74.62, "step": 748, "token_acc": 0.8801498127340824, "train_speed(iter/s)": 0.022492 }, { "epoch": 0.6058645096056623, "grad_norm": 2.0742950439453125, "learning_rate": 3.685470912261615e-06, "loss": 0.3933752775192261, "memory(GiB)": 74.62, "step": 749, "token_acc": 0.8681318681318682, "train_speed(iter/s)": 0.022492 }, { "epoch": 0.6066734074823054, "grad_norm": 3.2914257049560547, "learning_rate": 3.672577429825629e-06, "loss": 0.39733976125717163, "memory(GiB)": 74.62, "step": 750, "token_acc": 0.9066147859922179, "train_speed(iter/s)": 0.022493 }, { "epoch": 0.6074823053589484, "grad_norm": 1.9089115858078003, "learning_rate": 3.659693436662859e-06, "loss": 0.40482792258262634, "memory(GiB)": 74.62, "step": 751, "token_acc": 0.8535564853556485, "train_speed(iter/s)": 0.022493 }, { "epoch": 0.6082912032355915, "grad_norm": 3.0140185356140137, "learning_rate": 3.6468190248764063e-06, "loss": 0.5314335823059082, "memory(GiB)": 74.62, "step": 752, "token_acc": 0.8707865168539326, "train_speed(iter/s)": 0.022493 }, { "epoch": 0.6091001011122346, "grad_norm": 2.3016703128814697, "learning_rate": 3.6339542865008724e-06, "loss": 0.3704250454902649, "memory(GiB)": 74.62, "step": 753, "token_acc": 0.8878923766816144, "train_speed(iter/s)": 0.022494 }, { "epoch": 0.6099089989888776, "grad_norm": 1.9638766050338745, "learning_rate": 3.6210993135017115e-06, "loss": 0.4164350628852844, "memory(GiB)": 74.62, "step": 754, "token_acc": 0.8492462311557789, "train_speed(iter/s)": 0.022494 }, { "epoch": 0.6107178968655207, "grad_norm": 2.505688428878784, "learning_rate": 3.608254197774567e-06, "loss": 0.40423935651779175, "memory(GiB)": 74.62, "step": 755, "token_acc": 0.8679245283018868, "train_speed(iter/s)": 0.022495 }, { "epoch": 0.6115267947421638, "grad_norm": 2.152834415435791, "learning_rate": 3.595419031144615e-06, "loss": 0.3799169957637787, "memory(GiB)": 74.62, "step": 756, "token_acc": 0.8670520231213873, "train_speed(iter/s)": 0.022495 }, { "epoch": 0.6123356926188068, "grad_norm": 2.534213066101074, "learning_rate": 3.582593905365912e-06, "loss": 0.4056301414966583, "memory(GiB)": 74.62, "step": 757, "token_acc": 0.855072463768116, "train_speed(iter/s)": 0.022496 }, { "epoch": 0.61314459049545, "grad_norm": 1.9786441326141357, "learning_rate": 3.56977891212073e-06, "loss": 0.4082239270210266, "memory(GiB)": 74.62, "step": 758, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022496 }, { "epoch": 0.6139534883720931, "grad_norm": 1.8767694234848022, "learning_rate": 3.5569741430189163e-06, "loss": 0.39076924324035645, "memory(GiB)": 74.62, "step": 759, "token_acc": 0.8728070175438597, "train_speed(iter/s)": 0.022496 }, { "epoch": 0.6147623862487361, "grad_norm": 2.0986220836639404, "learning_rate": 3.5441796895972203e-06, "loss": 0.4426667094230652, "memory(GiB)": 74.62, "step": 760, "token_acc": 0.8986486486486487, "train_speed(iter/s)": 0.022497 }, { "epoch": 0.6155712841253792, "grad_norm": 2.349647045135498, "learning_rate": 3.5313956433186535e-06, "loss": 0.3979909121990204, "memory(GiB)": 74.62, "step": 761, "token_acc": 0.8770949720670391, "train_speed(iter/s)": 0.022497 }, { "epoch": 0.6163801820020223, "grad_norm": 2.267604351043701, "learning_rate": 3.518622095571831e-06, "loss": 0.3654158413410187, "memory(GiB)": 74.62, "step": 762, "token_acc": 0.8448979591836735, "train_speed(iter/s)": 0.022497 }, { "epoch": 0.6171890798786653, "grad_norm": 2.626412868499756, "learning_rate": 3.505859137670313e-06, "loss": 0.3898380398750305, "memory(GiB)": 74.62, "step": 763, "token_acc": 0.860655737704918, "train_speed(iter/s)": 0.022497 }, { "epoch": 0.6179979777553084, "grad_norm": 2.134931802749634, "learning_rate": 3.4931068608519626e-06, "loss": 0.45385637879371643, "memory(GiB)": 74.62, "step": 764, "token_acc": 0.8448275862068966, "train_speed(iter/s)": 0.022498 }, { "epoch": 0.6188068756319515, "grad_norm": 2.1175262928009033, "learning_rate": 3.4803653562782807e-06, "loss": 0.44239288568496704, "memory(GiB)": 74.62, "step": 765, "token_acc": 0.8226600985221675, "train_speed(iter/s)": 0.022498 }, { "epoch": 0.6196157735085945, "grad_norm": 1.9157018661499023, "learning_rate": 3.4676347150337673e-06, "loss": 0.37729379534721375, "memory(GiB)": 74.62, "step": 766, "token_acc": 0.8744588744588745, "train_speed(iter/s)": 0.022499 }, { "epoch": 0.6204246713852376, "grad_norm": 2.0690548419952393, "learning_rate": 3.4549150281252635e-06, "loss": 0.40089553594589233, "memory(GiB)": 74.62, "step": 767, "token_acc": 0.9056603773584906, "train_speed(iter/s)": 0.022499 }, { "epoch": 0.6212335692618807, "grad_norm": 2.284152030944824, "learning_rate": 3.442206386481297e-06, "loss": 0.37923118472099304, "memory(GiB)": 74.62, "step": 768, "token_acc": 0.8286713286713286, "train_speed(iter/s)": 0.022499 }, { "epoch": 0.6220424671385237, "grad_norm": 3.0014827251434326, "learning_rate": 3.429508880951444e-06, "loss": 0.38093435764312744, "memory(GiB)": 74.62, "step": 769, "token_acc": 0.8698224852071006, "train_speed(iter/s)": 0.0225 }, { "epoch": 0.6228513650151668, "grad_norm": 2.2891621589660645, "learning_rate": 3.4168226023056638e-06, "loss": 0.4511076509952545, "memory(GiB)": 74.62, "step": 770, "token_acc": 0.7976190476190477, "train_speed(iter/s)": 0.0225 }, { "epoch": 0.6236602628918099, "grad_norm": 2.116448163986206, "learning_rate": 3.4041476412336672e-06, "loss": 0.49026528000831604, "memory(GiB)": 74.62, "step": 771, "token_acc": 0.840625, "train_speed(iter/s)": 0.022501 }, { "epoch": 0.6244691607684529, "grad_norm": 1.6491224765777588, "learning_rate": 3.391484088344257e-06, "loss": 0.3303212523460388, "memory(GiB)": 74.62, "step": 772, "token_acc": 0.8959276018099548, "train_speed(iter/s)": 0.022501 }, { "epoch": 0.625278058645096, "grad_norm": 2.458468198776245, "learning_rate": 3.3788320341646764e-06, "loss": 0.37041348218917847, "memory(GiB)": 74.62, "step": 773, "token_acc": 0.8658008658008658, "train_speed(iter/s)": 0.022501 }, { "epoch": 0.6260869565217392, "grad_norm": 1.9400595426559448, "learning_rate": 3.3661915691399814e-06, "loss": 0.40716874599456787, "memory(GiB)": 74.62, "step": 774, "token_acc": 0.8565400843881856, "train_speed(iter/s)": 0.022502 }, { "epoch": 0.6268958543983822, "grad_norm": 2.076422691345215, "learning_rate": 3.3535627836323683e-06, "loss": 0.4028838276863098, "memory(GiB)": 74.62, "step": 775, "token_acc": 0.8844444444444445, "train_speed(iter/s)": 0.022502 }, { "epoch": 0.6277047522750253, "grad_norm": 2.1392087936401367, "learning_rate": 3.340945767920547e-06, "loss": 0.3876573443412781, "memory(GiB)": 74.62, "step": 776, "token_acc": 0.9047619047619048, "train_speed(iter/s)": 0.022502 }, { "epoch": 0.6285136501516684, "grad_norm": 1.980198860168457, "learning_rate": 3.328340612199091e-06, "loss": 0.3929121494293213, "memory(GiB)": 74.62, "step": 777, "token_acc": 0.8625, "train_speed(iter/s)": 0.022503 }, { "epoch": 0.6293225480283114, "grad_norm": 2.5135369300842285, "learning_rate": 3.315747406577787e-06, "loss": 0.4506552815437317, "memory(GiB)": 74.62, "step": 778, "token_acc": 0.8625954198473282, "train_speed(iter/s)": 0.022503 }, { "epoch": 0.6301314459049545, "grad_norm": 2.9397776126861572, "learning_rate": 3.303166241080996e-06, "loss": 0.366382896900177, "memory(GiB)": 74.62, "step": 779, "token_acc": 0.9363636363636364, "train_speed(iter/s)": 0.022504 }, { "epoch": 0.6309403437815976, "grad_norm": 2.5433013439178467, "learning_rate": 3.290597205647009e-06, "loss": 0.39890724420547485, "memory(GiB)": 74.62, "step": 780, "token_acc": 0.8835341365461847, "train_speed(iter/s)": 0.022504 }, { "epoch": 0.6317492416582406, "grad_norm": 1.8281358480453491, "learning_rate": 3.2780403901274026e-06, "loss": 0.3230600953102112, "memory(GiB)": 74.62, "step": 781, "token_acc": 0.8682170542635659, "train_speed(iter/s)": 0.022504 }, { "epoch": 0.6325581395348837, "grad_norm": 2.3992929458618164, "learning_rate": 3.265495884286397e-06, "loss": 0.3860858082771301, "memory(GiB)": 74.62, "step": 782, "token_acc": 0.8675213675213675, "train_speed(iter/s)": 0.022505 }, { "epoch": 0.6333670374115268, "grad_norm": 2.3929519653320312, "learning_rate": 3.2529637778002177e-06, "loss": 0.41789501905441284, "memory(GiB)": 74.62, "step": 783, "token_acc": 0.8291666666666667, "train_speed(iter/s)": 0.022505 }, { "epoch": 0.6341759352881698, "grad_norm": 2.3482816219329834, "learning_rate": 3.2404441602564507e-06, "loss": 0.42455971240997314, "memory(GiB)": 74.62, "step": 784, "token_acc": 0.8830188679245283, "train_speed(iter/s)": 0.022506 }, { "epoch": 0.6349848331648129, "grad_norm": 1.525108814239502, "learning_rate": 3.2279371211533976e-06, "loss": 0.3243609070777893, "memory(GiB)": 74.62, "step": 785, "token_acc": 0.8814229249011858, "train_speed(iter/s)": 0.022506 }, { "epoch": 0.635793731041456, "grad_norm": 2.330397367477417, "learning_rate": 3.2154427498994517e-06, "loss": 0.424887478351593, "memory(GiB)": 74.62, "step": 786, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022506 }, { "epoch": 0.636602628918099, "grad_norm": 1.9202159643173218, "learning_rate": 3.202961135812437e-06, "loss": 0.3225609064102173, "memory(GiB)": 74.62, "step": 787, "token_acc": 0.9056603773584906, "train_speed(iter/s)": 0.022507 }, { "epoch": 0.6374115267947421, "grad_norm": 1.9144957065582275, "learning_rate": 3.1904923681189883e-06, "loss": 0.3496546745300293, "memory(GiB)": 74.62, "step": 788, "token_acc": 0.903010033444816, "train_speed(iter/s)": 0.022507 }, { "epoch": 0.6382204246713853, "grad_norm": 2.0034921169281006, "learning_rate": 3.1780365359539043e-06, "loss": 0.41543805599212646, "memory(GiB)": 74.62, "step": 789, "token_acc": 0.8977272727272727, "train_speed(iter/s)": 0.022508 }, { "epoch": 0.6390293225480284, "grad_norm": 1.9115188121795654, "learning_rate": 3.1655937283595116e-06, "loss": 0.38339167833328247, "memory(GiB)": 74.62, "step": 790, "token_acc": 0.8555133079847909, "train_speed(iter/s)": 0.022508 }, { "epoch": 0.6398382204246714, "grad_norm": 2.29840350151062, "learning_rate": 3.153164034285031e-06, "loss": 0.3975831866264343, "memory(GiB)": 74.62, "step": 791, "token_acc": 0.8322368421052632, "train_speed(iter/s)": 0.022508 }, { "epoch": 0.6406471183013145, "grad_norm": 2.4968957901000977, "learning_rate": 3.1407475425859348e-06, "loss": 0.346437931060791, "memory(GiB)": 74.62, "step": 792, "token_acc": 0.8744588744588745, "train_speed(iter/s)": 0.022509 }, { "epoch": 0.6414560161779576, "grad_norm": 2.1374566555023193, "learning_rate": 3.1283443420233196e-06, "loss": 0.4348532557487488, "memory(GiB)": 74.62, "step": 793, "token_acc": 0.881578947368421, "train_speed(iter/s)": 0.022509 }, { "epoch": 0.6422649140546006, "grad_norm": 2.104574203491211, "learning_rate": 3.1159545212632697e-06, "loss": 0.3768533170223236, "memory(GiB)": 74.62, "step": 794, "token_acc": 0.8533834586466166, "train_speed(iter/s)": 0.02251 }, { "epoch": 0.6430738119312437, "grad_norm": 2.8082082271575928, "learning_rate": 3.1035781688762177e-06, "loss": 0.3694327473640442, "memory(GiB)": 74.62, "step": 795, "token_acc": 0.8781512605042017, "train_speed(iter/s)": 0.02251 }, { "epoch": 0.6438827098078868, "grad_norm": 2.036285161972046, "learning_rate": 3.0912153733363203e-06, "loss": 0.4223785996437073, "memory(GiB)": 74.62, "step": 796, "token_acc": 0.9013452914798207, "train_speed(iter/s)": 0.02251 }, { "epoch": 0.6446916076845298, "grad_norm": 1.9402992725372314, "learning_rate": 3.078866223020815e-06, "loss": 0.39007920026779175, "memory(GiB)": 74.62, "step": 797, "token_acc": 0.8409090909090909, "train_speed(iter/s)": 0.022511 }, { "epoch": 0.6455005055611729, "grad_norm": 2.3638556003570557, "learning_rate": 3.066530806209402e-06, "loss": 0.39857369661331177, "memory(GiB)": 74.62, "step": 798, "token_acc": 0.8956521739130435, "train_speed(iter/s)": 0.022511 }, { "epoch": 0.646309403437816, "grad_norm": 1.8377914428710938, "learning_rate": 3.0542092110835996e-06, "loss": 0.3549560010433197, "memory(GiB)": 74.62, "step": 799, "token_acc": 0.8955823293172691, "train_speed(iter/s)": 0.022512 }, { "epoch": 0.647118301314459, "grad_norm": 2.2061686515808105, "learning_rate": 3.04190152572612e-06, "loss": 0.43962785601615906, "memory(GiB)": 74.62, "step": 800, "token_acc": 0.8395522388059702, "train_speed(iter/s)": 0.022512 }, { "epoch": 0.6479271991911021, "grad_norm": 2.3892087936401367, "learning_rate": 3.0296078381202465e-06, "loss": 0.37227606773376465, "memory(GiB)": 74.62, "step": 801, "token_acc": 0.8662420382165605, "train_speed(iter/s)": 0.022512 }, { "epoch": 0.6487360970677452, "grad_norm": 2.125608444213867, "learning_rate": 3.017328236149187e-06, "loss": 0.43218767642974854, "memory(GiB)": 74.62, "step": 802, "token_acc": 0.8796992481203008, "train_speed(iter/s)": 0.022513 }, { "epoch": 0.6495449949443882, "grad_norm": 2.3993020057678223, "learning_rate": 3.0050628075954643e-06, "loss": 0.3682135343551636, "memory(GiB)": 74.62, "step": 803, "token_acc": 0.9087136929460581, "train_speed(iter/s)": 0.022513 }, { "epoch": 0.6503538928210314, "grad_norm": 2.251502513885498, "learning_rate": 2.9928116401402753e-06, "loss": 0.4699886441230774, "memory(GiB)": 74.62, "step": 804, "token_acc": 0.8686440677966102, "train_speed(iter/s)": 0.022513 }, { "epoch": 0.6511627906976745, "grad_norm": 13.69151496887207, "learning_rate": 2.9805748213628727e-06, "loss": 0.3267248272895813, "memory(GiB)": 74.62, "step": 805, "token_acc": 0.8592057761732852, "train_speed(iter/s)": 0.022514 }, { "epoch": 0.6519716885743175, "grad_norm": 2.1798858642578125, "learning_rate": 2.968352438739936e-06, "loss": 0.4122653901576996, "memory(GiB)": 74.62, "step": 806, "token_acc": 0.8377581120943953, "train_speed(iter/s)": 0.022514 }, { "epoch": 0.6527805864509606, "grad_norm": 1.9182910919189453, "learning_rate": 2.956144579644942e-06, "loss": 0.36671823263168335, "memory(GiB)": 74.62, "step": 807, "token_acc": 0.8716216216216216, "train_speed(iter/s)": 0.022515 }, { "epoch": 0.6535894843276037, "grad_norm": 2.026547908782959, "learning_rate": 2.9439513313475464e-06, "loss": 0.3970714807510376, "memory(GiB)": 74.62, "step": 808, "token_acc": 0.9066666666666666, "train_speed(iter/s)": 0.022515 }, { "epoch": 0.6543983822042467, "grad_norm": 2.1154861450195312, "learning_rate": 2.931772781012958e-06, "loss": 0.3996396064758301, "memory(GiB)": 74.62, "step": 809, "token_acc": 0.8494623655913979, "train_speed(iter/s)": 0.022515 }, { "epoch": 0.6552072800808898, "grad_norm": 2.0756337642669678, "learning_rate": 2.9196090157013146e-06, "loss": 0.44487231969833374, "memory(GiB)": 74.62, "step": 810, "token_acc": 0.8075471698113208, "train_speed(iter/s)": 0.022516 }, { "epoch": 0.6560161779575329, "grad_norm": 2.0214574337005615, "learning_rate": 2.907460122367062e-06, "loss": 0.3471815586090088, "memory(GiB)": 74.62, "step": 811, "token_acc": 0.8540925266903915, "train_speed(iter/s)": 0.022516 }, { "epoch": 0.6568250758341759, "grad_norm": 1.8203327655792236, "learning_rate": 2.8953261878583263e-06, "loss": 0.3285714387893677, "memory(GiB)": 74.62, "step": 812, "token_acc": 0.9137931034482759, "train_speed(iter/s)": 0.022517 }, { "epoch": 0.657633973710819, "grad_norm": 2.6111230850219727, "learning_rate": 2.8832072989163048e-06, "loss": 0.38925743103027344, "memory(GiB)": 74.62, "step": 813, "token_acc": 0.8852459016393442, "train_speed(iter/s)": 0.022517 }, { "epoch": 0.6584428715874621, "grad_norm": 1.8417023420333862, "learning_rate": 2.871103542174637e-06, "loss": 0.3698727488517761, "memory(GiB)": 74.62, "step": 814, "token_acc": 0.8767605633802817, "train_speed(iter/s)": 0.022517 }, { "epoch": 0.6592517694641051, "grad_norm": 2.0547242164611816, "learning_rate": 2.859015004158789e-06, "loss": 0.37436971068382263, "memory(GiB)": 74.62, "step": 815, "token_acc": 0.8426966292134831, "train_speed(iter/s)": 0.022518 }, { "epoch": 0.6600606673407482, "grad_norm": 3.1478235721588135, "learning_rate": 2.8469417712854287e-06, "loss": 0.4491364359855652, "memory(GiB)": 74.62, "step": 816, "token_acc": 0.8157894736842105, "train_speed(iter/s)": 0.022518 }, { "epoch": 0.6608695652173913, "grad_norm": 2.21091890335083, "learning_rate": 2.834883929861818e-06, "loss": 0.3636167049407959, "memory(GiB)": 74.62, "step": 817, "token_acc": 0.9236947791164659, "train_speed(iter/s)": 0.022518 }, { "epoch": 0.6616784630940343, "grad_norm": 2.1053714752197266, "learning_rate": 2.822841566085192e-06, "loss": 0.3697773218154907, "memory(GiB)": 74.62, "step": 818, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.022519 }, { "epoch": 0.6624873609706774, "grad_norm": 1.9461814165115356, "learning_rate": 2.8108147660421325e-06, "loss": 0.42437541484832764, "memory(GiB)": 74.62, "step": 819, "token_acc": 0.8985507246376812, "train_speed(iter/s)": 0.022519 }, { "epoch": 0.6632962588473206, "grad_norm": 1.9878171682357788, "learning_rate": 2.798803615707976e-06, "loss": 0.40904805064201355, "memory(GiB)": 74.62, "step": 820, "token_acc": 0.8475609756097561, "train_speed(iter/s)": 0.022519 }, { "epoch": 0.6641051567239636, "grad_norm": 1.8959929943084717, "learning_rate": 2.78680820094617e-06, "loss": 0.3745640218257904, "memory(GiB)": 74.62, "step": 821, "token_acc": 0.914396887159533, "train_speed(iter/s)": 0.02252 }, { "epoch": 0.6649140546006067, "grad_norm": 2.005540609359741, "learning_rate": 2.7748286075076834e-06, "loss": 0.364071786403656, "memory(GiB)": 74.62, "step": 822, "token_acc": 0.8765432098765432, "train_speed(iter/s)": 0.02252 }, { "epoch": 0.6657229524772498, "grad_norm": 2.166395902633667, "learning_rate": 2.762864921030384e-06, "loss": 0.37051212787628174, "memory(GiB)": 74.62, "step": 823, "token_acc": 0.8909774436090225, "train_speed(iter/s)": 0.02252 }, { "epoch": 0.6665318503538928, "grad_norm": 1.9548283815383911, "learning_rate": 2.750917227038419e-06, "loss": 0.39772191643714905, "memory(GiB)": 74.62, "step": 824, "token_acc": 0.8986928104575164, "train_speed(iter/s)": 0.022521 }, { "epoch": 0.6673407482305359, "grad_norm": 2.373486280441284, "learning_rate": 2.7389856109416178e-06, "loss": 0.39033639430999756, "memory(GiB)": 74.62, "step": 825, "token_acc": 0.8876404494382022, "train_speed(iter/s)": 0.022521 }, { "epoch": 0.668149646107179, "grad_norm": 1.9656351804733276, "learning_rate": 2.7270701580348737e-06, "loss": 0.4327496588230133, "memory(GiB)": 74.62, "step": 826, "token_acc": 0.8840579710144928, "train_speed(iter/s)": 0.022522 }, { "epoch": 0.668958543983822, "grad_norm": 1.7876020669937134, "learning_rate": 2.715170953497532e-06, "loss": 0.4038127064704895, "memory(GiB)": 74.62, "step": 827, "token_acc": 0.8581081081081081, "train_speed(iter/s)": 0.022522 }, { "epoch": 0.6697674418604651, "grad_norm": 2.269183397293091, "learning_rate": 2.703288082392791e-06, "loss": 0.3742678165435791, "memory(GiB)": 74.62, "step": 828, "token_acc": 0.9116279069767442, "train_speed(iter/s)": 0.022522 }, { "epoch": 0.6705763397371082, "grad_norm": 2.3092498779296875, "learning_rate": 2.691421629667076e-06, "loss": 0.3477456867694855, "memory(GiB)": 74.62, "step": 829, "token_acc": 0.8858447488584474, "train_speed(iter/s)": 0.022523 }, { "epoch": 0.6713852376137512, "grad_norm": 2.0374417304992676, "learning_rate": 2.6795716801494538e-06, "loss": 0.3951851725578308, "memory(GiB)": 74.62, "step": 830, "token_acc": 0.8655913978494624, "train_speed(iter/s)": 0.022523 }, { "epoch": 0.6721941354903943, "grad_norm": 2.6279661655426025, "learning_rate": 2.6677383185510053e-06, "loss": 0.37477776408195496, "memory(GiB)": 74.62, "step": 831, "token_acc": 0.8745519713261649, "train_speed(iter/s)": 0.022523 }, { "epoch": 0.6730030333670374, "grad_norm": 2.128077268600464, "learning_rate": 2.6559216294642446e-06, "loss": 0.34244000911712646, "memory(GiB)": 74.62, "step": 832, "token_acc": 0.8764478764478765, "train_speed(iter/s)": 0.022524 }, { "epoch": 0.6738119312436804, "grad_norm": 1.9825257062911987, "learning_rate": 2.6441216973624857e-06, "loss": 0.36798208951950073, "memory(GiB)": 74.62, "step": 833, "token_acc": 0.9363957597173145, "train_speed(iter/s)": 0.022524 }, { "epoch": 0.6746208291203235, "grad_norm": 2.1210215091705322, "learning_rate": 2.6323386065992596e-06, "loss": 0.3946457505226135, "memory(GiB)": 74.62, "step": 834, "token_acc": 0.8380681818181818, "train_speed(iter/s)": 0.022524 }, { "epoch": 0.6754297269969667, "grad_norm": 1.8778958320617676, "learning_rate": 2.6205724414077064e-06, "loss": 0.3758698105812073, "memory(GiB)": 74.62, "step": 835, "token_acc": 0.8895705521472392, "train_speed(iter/s)": 0.022525 }, { "epoch": 0.6762386248736098, "grad_norm": 1.917371153831482, "learning_rate": 2.6088232858999644e-06, "loss": 0.3301732540130615, "memory(GiB)": 74.62, "step": 836, "token_acc": 0.9437229437229437, "train_speed(iter/s)": 0.022525 }, { "epoch": 0.6770475227502528, "grad_norm": 2.223240613937378, "learning_rate": 2.5970912240665815e-06, "loss": 0.4553636908531189, "memory(GiB)": 74.62, "step": 837, "token_acc": 0.8589211618257261, "train_speed(iter/s)": 0.022525 }, { "epoch": 0.6778564206268959, "grad_norm": 3.028218984603882, "learning_rate": 2.585376339775908e-06, "loss": 0.46183380484580994, "memory(GiB)": 74.62, "step": 838, "token_acc": 0.8557213930348259, "train_speed(iter/s)": 0.022526 }, { "epoch": 0.678665318503539, "grad_norm": 1.9921714067459106, "learning_rate": 2.573678716773496e-06, "loss": 0.38901880383491516, "memory(GiB)": 74.62, "step": 839, "token_acc": 0.8819188191881919, "train_speed(iter/s)": 0.022526 }, { "epoch": 0.679474216380182, "grad_norm": 2.3916425704956055, "learning_rate": 2.5619984386815073e-06, "loss": 0.4160255193710327, "memory(GiB)": 74.62, "step": 840, "token_acc": 0.8577981651376146, "train_speed(iter/s)": 0.022526 }, { "epoch": 0.6802831142568251, "grad_norm": 2.2416515350341797, "learning_rate": 2.550335588998103e-06, "loss": 0.46858906745910645, "memory(GiB)": 74.62, "step": 841, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022526 }, { "epoch": 0.6810920121334681, "grad_norm": 1.9555854797363281, "learning_rate": 2.5386902510968627e-06, "loss": 0.4036467969417572, "memory(GiB)": 74.62, "step": 842, "token_acc": 0.8421052631578947, "train_speed(iter/s)": 0.022527 }, { "epoch": 0.6819009100101112, "grad_norm": 4.791243553161621, "learning_rate": 2.527062508226176e-06, "loss": 0.37610989809036255, "memory(GiB)": 74.62, "step": 843, "token_acc": 0.8712871287128713, "train_speed(iter/s)": 0.022527 }, { "epoch": 0.6827098078867543, "grad_norm": 1.9034098386764526, "learning_rate": 2.5154524435086537e-06, "loss": 0.3707886040210724, "memory(GiB)": 74.62, "step": 844, "token_acc": 0.8761904761904762, "train_speed(iter/s)": 0.022527 }, { "epoch": 0.6835187057633973, "grad_norm": 2.0733163356781006, "learning_rate": 2.5038601399405337e-06, "loss": 0.4223529100418091, "memory(GiB)": 74.62, "step": 845, "token_acc": 0.844106463878327, "train_speed(iter/s)": 0.022528 }, { "epoch": 0.6843276036400404, "grad_norm": 1.9344497919082642, "learning_rate": 2.492285680391079e-06, "loss": 0.38297271728515625, "memory(GiB)": 74.62, "step": 846, "token_acc": 0.8446215139442231, "train_speed(iter/s)": 0.022528 }, { "epoch": 0.6851365015166835, "grad_norm": 2.1887664794921875, "learning_rate": 2.4807291476019996e-06, "loss": 0.3631531000137329, "memory(GiB)": 74.62, "step": 847, "token_acc": 0.8963963963963963, "train_speed(iter/s)": 0.022529 }, { "epoch": 0.6859453993933265, "grad_norm": 2.7835731506347656, "learning_rate": 2.4691906241868473e-06, "loss": 0.4326528310775757, "memory(GiB)": 74.62, "step": 848, "token_acc": 0.8395522388059702, "train_speed(iter/s)": 0.022529 }, { "epoch": 0.6867542972699696, "grad_norm": 2.0431745052337646, "learning_rate": 2.4576701926304357e-06, "loss": 0.34864187240600586, "memory(GiB)": 74.62, "step": 849, "token_acc": 0.899581589958159, "train_speed(iter/s)": 0.022529 }, { "epoch": 0.6875631951466128, "grad_norm": 5.60698938369751, "learning_rate": 2.4461679352882443e-06, "loss": 0.4054935574531555, "memory(GiB)": 74.62, "step": 850, "token_acc": 0.8398058252427184, "train_speed(iter/s)": 0.022529 }, { "epoch": 0.6883720930232559, "grad_norm": 1.8018779754638672, "learning_rate": 2.434683934385833e-06, "loss": 0.32462042570114136, "memory(GiB)": 74.62, "step": 851, "token_acc": 0.8726591760299626, "train_speed(iter/s)": 0.02253 }, { "epoch": 0.6891809908998989, "grad_norm": 3.250086545944214, "learning_rate": 2.4232182720182524e-06, "loss": 0.3477787375450134, "memory(GiB)": 74.62, "step": 852, "token_acc": 0.8364312267657993, "train_speed(iter/s)": 0.02253 }, { "epoch": 0.689989888776542, "grad_norm": 2.627101421356201, "learning_rate": 2.4117710301494527e-06, "loss": 0.38884738087654114, "memory(GiB)": 74.62, "step": 853, "token_acc": 0.9395348837209302, "train_speed(iter/s)": 0.022531 }, { "epoch": 0.6907987866531851, "grad_norm": 2.8539373874664307, "learning_rate": 2.40034229061171e-06, "loss": 0.40084555745124817, "memory(GiB)": 74.62, "step": 854, "token_acc": 0.8843283582089553, "train_speed(iter/s)": 0.022531 }, { "epoch": 0.6916076845298281, "grad_norm": 2.5881996154785156, "learning_rate": 2.3889321351050286e-06, "loss": 0.36527204513549805, "memory(GiB)": 74.62, "step": 855, "token_acc": 0.911504424778761, "train_speed(iter/s)": 0.022531 }, { "epoch": 0.6924165824064712, "grad_norm": 2.2433817386627197, "learning_rate": 2.377540645196565e-06, "loss": 0.4530036151409149, "memory(GiB)": 74.62, "step": 856, "token_acc": 0.8681672025723473, "train_speed(iter/s)": 0.022532 }, { "epoch": 0.6932254802831143, "grad_norm": 2.251718759536743, "learning_rate": 2.3661679023200422e-06, "loss": 0.44757646322250366, "memory(GiB)": 74.62, "step": 857, "token_acc": 0.9019607843137255, "train_speed(iter/s)": 0.022532 }, { "epoch": 0.6940343781597573, "grad_norm": 1.987608790397644, "learning_rate": 2.354813987775163e-06, "loss": 0.34107983112335205, "memory(GiB)": 74.62, "step": 858, "token_acc": 0.871875, "train_speed(iter/s)": 0.022532 }, { "epoch": 0.6948432760364004, "grad_norm": 2.4668984413146973, "learning_rate": 2.343478982727039e-06, "loss": 0.4043659269809723, "memory(GiB)": 74.62, "step": 859, "token_acc": 0.8947368421052632, "train_speed(iter/s)": 0.022532 }, { "epoch": 0.6956521739130435, "grad_norm": 1.9259587526321411, "learning_rate": 2.3321629682055984e-06, "loss": 0.378429114818573, "memory(GiB)": 74.62, "step": 860, "token_acc": 0.848297213622291, "train_speed(iter/s)": 0.022533 }, { "epoch": 0.6964610717896865, "grad_norm": 1.9682130813598633, "learning_rate": 2.320866025105016e-06, "loss": 0.34357139468193054, "memory(GiB)": 74.62, "step": 861, "token_acc": 0.8348348348348348, "train_speed(iter/s)": 0.022533 }, { "epoch": 0.6972699696663296, "grad_norm": 2.785592794418335, "learning_rate": 2.309588234183137e-06, "loss": 0.3498800992965698, "memory(GiB)": 74.62, "step": 862, "token_acc": 0.8847736625514403, "train_speed(iter/s)": 0.022534 }, { "epoch": 0.6980788675429727, "grad_norm": 2.4636342525482178, "learning_rate": 2.298329676060884e-06, "loss": 0.39585980772972107, "memory(GiB)": 74.62, "step": 863, "token_acc": 0.865546218487395, "train_speed(iter/s)": 0.022534 }, { "epoch": 0.6988877654196157, "grad_norm": 1.8095598220825195, "learning_rate": 2.287090431221701e-06, "loss": 0.37628334760665894, "memory(GiB)": 74.62, "step": 864, "token_acc": 0.8954703832752613, "train_speed(iter/s)": 0.022534 }, { "epoch": 0.6996966632962589, "grad_norm": 1.9140504598617554, "learning_rate": 2.275870580010958e-06, "loss": 0.3849208354949951, "memory(GiB)": 74.62, "step": 865, "token_acc": 0.910958904109589, "train_speed(iter/s)": 0.022534 }, { "epoch": 0.700505561172902, "grad_norm": 1.7582415342330933, "learning_rate": 2.264670202635396e-06, "loss": 0.3840162754058838, "memory(GiB)": 74.62, "step": 866, "token_acc": 0.8550185873605948, "train_speed(iter/s)": 0.022535 }, { "epoch": 0.701314459049545, "grad_norm": 1.8664969205856323, "learning_rate": 2.2534893791625408e-06, "loss": 0.3248283565044403, "memory(GiB)": 74.62, "step": 867, "token_acc": 0.896, "train_speed(iter/s)": 0.022535 }, { "epoch": 0.7021233569261881, "grad_norm": 1.9030721187591553, "learning_rate": 2.242328189520134e-06, "loss": 0.35055387020111084, "memory(GiB)": 74.62, "step": 868, "token_acc": 0.8939393939393939, "train_speed(iter/s)": 0.022535 }, { "epoch": 0.7029322548028312, "grad_norm": 2.2921035289764404, "learning_rate": 2.2311867134955637e-06, "loss": 0.41889488697052, "memory(GiB)": 74.62, "step": 869, "token_acc": 0.8565573770491803, "train_speed(iter/s)": 0.022535 }, { "epoch": 0.7037411526794742, "grad_norm": 2.5671067237854004, "learning_rate": 2.2200650307352883e-06, "loss": 0.3641519844532013, "memory(GiB)": 74.62, "step": 870, "token_acc": 0.8726415094339622, "train_speed(iter/s)": 0.022536 }, { "epoch": 0.7045500505561173, "grad_norm": 2.0666255950927734, "learning_rate": 2.2089632207442763e-06, "loss": 0.34707674384117126, "memory(GiB)": 74.62, "step": 871, "token_acc": 0.898876404494382, "train_speed(iter/s)": 0.022536 }, { "epoch": 0.7053589484327604, "grad_norm": 2.3214352130889893, "learning_rate": 2.197881362885426e-06, "loss": 0.30853113532066345, "memory(GiB)": 74.62, "step": 872, "token_acc": 0.8477611940298507, "train_speed(iter/s)": 0.022536 }, { "epoch": 0.7061678463094034, "grad_norm": 2.3969626426696777, "learning_rate": 2.1868195363790147e-06, "loss": 0.44838905334472656, "memory(GiB)": 74.62, "step": 873, "token_acc": 0.8190954773869347, "train_speed(iter/s)": 0.022537 }, { "epoch": 0.7069767441860465, "grad_norm": 2.3142099380493164, "learning_rate": 2.1757778203021163e-06, "loss": 0.4084170460700989, "memory(GiB)": 74.62, "step": 874, "token_acc": 0.875, "train_speed(iter/s)": 0.022537 }, { "epoch": 0.7077856420626896, "grad_norm": 2.4327192306518555, "learning_rate": 2.1647562935880405e-06, "loss": 0.4108632802963257, "memory(GiB)": 74.62, "step": 875, "token_acc": 0.8553054662379421, "train_speed(iter/s)": 0.022537 }, { "epoch": 0.7085945399393326, "grad_norm": 1.7217832803726196, "learning_rate": 2.153755035025777e-06, "loss": 0.3645017743110657, "memory(GiB)": 74.62, "step": 876, "token_acc": 0.825925925925926, "train_speed(iter/s)": 0.022538 }, { "epoch": 0.7094034378159757, "grad_norm": 1.7630640268325806, "learning_rate": 2.1427741232594185e-06, "loss": 0.3739239573478699, "memory(GiB)": 74.62, "step": 877, "token_acc": 0.8757961783439491, "train_speed(iter/s)": 0.022538 }, { "epoch": 0.7102123356926188, "grad_norm": 1.9821792840957642, "learning_rate": 2.1318136367876098e-06, "loss": 0.3128720223903656, "memory(GiB)": 74.62, "step": 878, "token_acc": 0.8744769874476988, "train_speed(iter/s)": 0.022538 }, { "epoch": 0.7110212335692618, "grad_norm": 1.9988818168640137, "learning_rate": 2.120873653962983e-06, "loss": 0.39012840390205383, "memory(GiB)": 74.62, "step": 879, "token_acc": 0.865814696485623, "train_speed(iter/s)": 0.022538 }, { "epoch": 0.7118301314459049, "grad_norm": 2.3474910259246826, "learning_rate": 2.109954252991595e-06, "loss": 0.3977096676826477, "memory(GiB)": 74.62, "step": 880, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022539 }, { "epoch": 0.7126390293225481, "grad_norm": 1.7941343784332275, "learning_rate": 2.0990555119323737e-06, "loss": 0.37561237812042236, "memory(GiB)": 74.62, "step": 881, "token_acc": 0.8759124087591241, "train_speed(iter/s)": 0.022539 }, { "epoch": 0.7134479271991911, "grad_norm": 2.288217782974243, "learning_rate": 2.0881775086965494e-06, "loss": 0.3414373993873596, "memory(GiB)": 74.62, "step": 882, "token_acc": 0.8681318681318682, "train_speed(iter/s)": 0.022539 }, { "epoch": 0.7142568250758342, "grad_norm": 1.7807132005691528, "learning_rate": 2.0773203210471115e-06, "loss": 0.3832324147224426, "memory(GiB)": 74.62, "step": 883, "token_acc": 0.8442028985507246, "train_speed(iter/s)": 0.02254 }, { "epoch": 0.7150657229524773, "grad_norm": 1.990700602531433, "learning_rate": 2.0664840265982457e-06, "loss": 0.4304344952106476, "memory(GiB)": 74.62, "step": 884, "token_acc": 0.8095238095238095, "train_speed(iter/s)": 0.02254 }, { "epoch": 0.7158746208291203, "grad_norm": 1.9708170890808105, "learning_rate": 2.0556687028147765e-06, "loss": 0.4029080867767334, "memory(GiB)": 74.62, "step": 885, "token_acc": 0.8530612244897959, "train_speed(iter/s)": 0.02254 }, { "epoch": 0.7166835187057634, "grad_norm": 2.2865779399871826, "learning_rate": 2.0448744270116206e-06, "loss": 0.390356183052063, "memory(GiB)": 74.62, "step": 886, "token_acc": 0.8823529411764706, "train_speed(iter/s)": 0.022541 }, { "epoch": 0.7174924165824065, "grad_norm": 2.5284066200256348, "learning_rate": 2.0341012763532243e-06, "loss": 0.40166282653808594, "memory(GiB)": 74.62, "step": 887, "token_acc": 0.8840579710144928, "train_speed(iter/s)": 0.022541 }, { "epoch": 0.7183013144590495, "grad_norm": 6.747030258178711, "learning_rate": 2.023349327853025e-06, "loss": 0.38176417350769043, "memory(GiB)": 74.62, "step": 888, "token_acc": 0.8652482269503546, "train_speed(iter/s)": 0.022541 }, { "epoch": 0.7191102123356926, "grad_norm": 2.049042224884033, "learning_rate": 2.0126186583728856e-06, "loss": 0.3778286576271057, "memory(GiB)": 74.62, "step": 889, "token_acc": 0.8429319371727748, "train_speed(iter/s)": 0.022542 }, { "epoch": 0.7199191102123357, "grad_norm": 2.2993712425231934, "learning_rate": 2.001909344622559e-06, "loss": 0.4231566786766052, "memory(GiB)": 74.62, "step": 890, "token_acc": 0.8865979381443299, "train_speed(iter/s)": 0.022542 }, { "epoch": 0.7207280080889787, "grad_norm": 2.244127035140991, "learning_rate": 1.9912214631591314e-06, "loss": 0.3927876651287079, "memory(GiB)": 74.62, "step": 891, "token_acc": 0.8867924528301887, "train_speed(iter/s)": 0.022542 }, { "epoch": 0.7215369059656218, "grad_norm": 1.9843049049377441, "learning_rate": 1.9805550903864775e-06, "loss": 0.39008790254592896, "memory(GiB)": 74.62, "step": 892, "token_acc": 0.8244274809160306, "train_speed(iter/s)": 0.022543 }, { "epoch": 0.7223458038422649, "grad_norm": 2.253777027130127, "learning_rate": 1.9699103025547145e-06, "loss": 0.3611776828765869, "memory(GiB)": 74.62, "step": 893, "token_acc": 0.84765625, "train_speed(iter/s)": 0.022543 }, { "epoch": 0.7231547017189079, "grad_norm": 2.2141964435577393, "learning_rate": 1.9592871757596532e-06, "loss": 0.4213542640209198, "memory(GiB)": 74.62, "step": 894, "token_acc": 0.8754716981132076, "train_speed(iter/s)": 0.022543 }, { "epoch": 0.723963599595551, "grad_norm": 1.9213643074035645, "learning_rate": 1.9486857859422607e-06, "loss": 0.4320271611213684, "memory(GiB)": 74.62, "step": 895, "token_acc": 0.8327526132404182, "train_speed(iter/s)": 0.022543 }, { "epoch": 0.7247724974721942, "grad_norm": 2.10569167137146, "learning_rate": 1.9381062088881142e-06, "loss": 0.3284885883331299, "memory(GiB)": 74.62, "step": 896, "token_acc": 0.8831615120274914, "train_speed(iter/s)": 0.022543 }, { "epoch": 0.7255813953488373, "grad_norm": 1.6468027830123901, "learning_rate": 1.9275485202268574e-06, "loss": 0.35665562748908997, "memory(GiB)": 74.62, "step": 897, "token_acc": 0.9037037037037037, "train_speed(iter/s)": 0.022544 }, { "epoch": 0.7263902932254803, "grad_norm": 1.961858868598938, "learning_rate": 1.917012795431665e-06, "loss": 0.3552227020263672, "memory(GiB)": 74.62, "step": 898, "token_acc": 0.8705882352941177, "train_speed(iter/s)": 0.022544 }, { "epoch": 0.7271991911021234, "grad_norm": 2.2594661712646484, "learning_rate": 1.9064991098186935e-06, "loss": 0.42378872632980347, "memory(GiB)": 74.62, "step": 899, "token_acc": 0.8449612403100775, "train_speed(iter/s)": 0.022544 }, { "epoch": 0.7280080889787665, "grad_norm": 2.3480887413024902, "learning_rate": 1.8960075385465547e-06, "loss": 0.38160020112991333, "memory(GiB)": 74.62, "step": 900, "token_acc": 0.8577235772357723, "train_speed(iter/s)": 0.022545 }, { "epoch": 0.7288169868554095, "grad_norm": 2.0713682174682617, "learning_rate": 1.8855381566157727e-06, "loss": 0.3788355588912964, "memory(GiB)": 74.62, "step": 901, "token_acc": 0.9140271493212669, "train_speed(iter/s)": 0.022545 }, { "epoch": 0.7296258847320526, "grad_norm": 1.8822578191757202, "learning_rate": 1.875091038868243e-06, "loss": 0.38564032316207886, "memory(GiB)": 74.62, "step": 902, "token_acc": 0.8618181818181818, "train_speed(iter/s)": 0.022545 }, { "epoch": 0.7304347826086957, "grad_norm": 2.0705273151397705, "learning_rate": 1.8646662599867072e-06, "loss": 0.4137299060821533, "memory(GiB)": 74.62, "step": 903, "token_acc": 0.8893617021276595, "train_speed(iter/s)": 0.022545 }, { "epoch": 0.7312436804853387, "grad_norm": 2.7392282485961914, "learning_rate": 1.8542638944942127e-06, "loss": 0.41165363788604736, "memory(GiB)": 74.62, "step": 904, "token_acc": 0.873015873015873, "train_speed(iter/s)": 0.022546 }, { "epoch": 0.7320525783619818, "grad_norm": 2.251229763031006, "learning_rate": 1.8438840167535826e-06, "loss": 0.39759790897369385, "memory(GiB)": 74.62, "step": 905, "token_acc": 0.8949416342412452, "train_speed(iter/s)": 0.022546 }, { "epoch": 0.7328614762386249, "grad_norm": 2.1164135932922363, "learning_rate": 1.8335267009668794e-06, "loss": 0.36323827505111694, "memory(GiB)": 74.62, "step": 906, "token_acc": 0.9142857142857143, "train_speed(iter/s)": 0.022546 }, { "epoch": 0.7336703741152679, "grad_norm": 2.421180009841919, "learning_rate": 1.8231920211748822e-06, "loss": 0.35361167788505554, "memory(GiB)": 74.62, "step": 907, "token_acc": 0.8603603603603603, "train_speed(iter/s)": 0.022547 }, { "epoch": 0.734479271991911, "grad_norm": 2.0135669708251953, "learning_rate": 1.8128800512565514e-06, "loss": 0.37238454818725586, "memory(GiB)": 74.62, "step": 908, "token_acc": 0.8442906574394463, "train_speed(iter/s)": 0.022547 }, { "epoch": 0.735288169868554, "grad_norm": 3.3785688877105713, "learning_rate": 1.8025908649285033e-06, "loss": 0.41406381130218506, "memory(GiB)": 74.62, "step": 909, "token_acc": 0.8631178707224335, "train_speed(iter/s)": 0.022548 }, { "epoch": 0.7360970677451971, "grad_norm": 2.393422842025757, "learning_rate": 1.7923245357444847e-06, "loss": 0.3641640543937683, "memory(GiB)": 74.62, "step": 910, "token_acc": 0.9076923076923077, "train_speed(iter/s)": 0.022548 }, { "epoch": 0.7369059656218403, "grad_norm": 2.425569534301758, "learning_rate": 1.7820811370948371e-06, "loss": 0.35734257102012634, "memory(GiB)": 74.62, "step": 911, "token_acc": 0.8491379310344828, "train_speed(iter/s)": 0.022548 }, { "epoch": 0.7377148634984834, "grad_norm": 2.3215856552124023, "learning_rate": 1.771860742205988e-06, "loss": 0.4984077513217926, "memory(GiB)": 74.62, "step": 912, "token_acc": 0.8333333333333334, "train_speed(iter/s)": 0.022548 }, { "epoch": 0.7385237613751264, "grad_norm": 2.343384265899658, "learning_rate": 1.7616634241399177e-06, "loss": 0.3875213861465454, "memory(GiB)": 74.62, "step": 913, "token_acc": 0.8376383763837638, "train_speed(iter/s)": 0.022548 }, { "epoch": 0.7393326592517695, "grad_norm": 1.9467604160308838, "learning_rate": 1.7514892557936309e-06, "loss": 0.3730790615081787, "memory(GiB)": 74.62, "step": 914, "token_acc": 0.8776223776223776, "train_speed(iter/s)": 0.022549 }, { "epoch": 0.7401415571284126, "grad_norm": 2.171644687652588, "learning_rate": 1.7413383098986563e-06, "loss": 0.3576545715332031, "memory(GiB)": 74.62, "step": 915, "token_acc": 0.8874172185430463, "train_speed(iter/s)": 0.022549 }, { "epoch": 0.7409504550050556, "grad_norm": 2.274996519088745, "learning_rate": 1.7312106590205014e-06, "loss": 0.3864002227783203, "memory(GiB)": 74.62, "step": 916, "token_acc": 0.9036144578313253, "train_speed(iter/s)": 0.022549 }, { "epoch": 0.7417593528816987, "grad_norm": 1.9122254848480225, "learning_rate": 1.7211063755581524e-06, "loss": 0.36603063344955444, "memory(GiB)": 74.62, "step": 917, "token_acc": 0.8798798798798799, "train_speed(iter/s)": 0.022549 }, { "epoch": 0.7425682507583418, "grad_norm": 2.126805067062378, "learning_rate": 1.7110255317435503e-06, "loss": 0.38110482692718506, "memory(GiB)": 74.62, "step": 918, "token_acc": 0.8325581395348837, "train_speed(iter/s)": 0.02255 }, { "epoch": 0.7433771486349848, "grad_norm": 2.5423946380615234, "learning_rate": 1.7009681996410693e-06, "loss": 0.3204044699668884, "memory(GiB)": 74.62, "step": 919, "token_acc": 0.8575757575757575, "train_speed(iter/s)": 0.02255 }, { "epoch": 0.7441860465116279, "grad_norm": 2.0116817951202393, "learning_rate": 1.6909344511470116e-06, "loss": 0.3353678286075592, "memory(GiB)": 74.62, "step": 920, "token_acc": 0.9169811320754717, "train_speed(iter/s)": 0.02255 }, { "epoch": 0.744994944388271, "grad_norm": 2.245171546936035, "learning_rate": 1.6809243579890865e-06, "loss": 0.401151180267334, "memory(GiB)": 74.62, "step": 921, "token_acc": 0.823321554770318, "train_speed(iter/s)": 0.022551 }, { "epoch": 0.745803842264914, "grad_norm": 2.26719331741333, "learning_rate": 1.6709379917259028e-06, "loss": 0.4471093714237213, "memory(GiB)": 74.62, "step": 922, "token_acc": 0.8719723183391004, "train_speed(iter/s)": 0.022551 }, { "epoch": 0.7466127401415571, "grad_norm": 2.297231435775757, "learning_rate": 1.6609754237464475e-06, "loss": 0.4046187400817871, "memory(GiB)": 74.62, "step": 923, "token_acc": 0.835, "train_speed(iter/s)": 0.022551 }, { "epoch": 0.7474216380182002, "grad_norm": 2.375325918197632, "learning_rate": 1.651036725269588e-06, "loss": 0.3570983409881592, "memory(GiB)": 74.62, "step": 924, "token_acc": 0.883177570093458, "train_speed(iter/s)": 0.022552 }, { "epoch": 0.7482305358948432, "grad_norm": 1.6737430095672607, "learning_rate": 1.6411219673435564e-06, "loss": 0.33470356464385986, "memory(GiB)": 74.62, "step": 925, "token_acc": 0.9057377049180327, "train_speed(iter/s)": 0.022552 }, { "epoch": 0.7490394337714863, "grad_norm": 1.9846247434616089, "learning_rate": 1.6312312208454373e-06, "loss": 0.37246596813201904, "memory(GiB)": 74.62, "step": 926, "token_acc": 0.8854166666666666, "train_speed(iter/s)": 0.022552 }, { "epoch": 0.7498483316481295, "grad_norm": 1.810141921043396, "learning_rate": 1.6213645564806751e-06, "loss": 0.3770504593849182, "memory(GiB)": 74.62, "step": 927, "token_acc": 0.9072164948453608, "train_speed(iter/s)": 0.022553 }, { "epoch": 0.7506572295247725, "grad_norm": 2.10400128364563, "learning_rate": 1.6115220447825503e-06, "loss": 0.376004159450531, "memory(GiB)": 74.62, "step": 928, "token_acc": 0.9196428571428571, "train_speed(iter/s)": 0.022553 }, { "epoch": 0.7514661274014156, "grad_norm": 2.000704050064087, "learning_rate": 1.6017037561116899e-06, "loss": 0.3456549048423767, "memory(GiB)": 74.62, "step": 929, "token_acc": 0.8565573770491803, "train_speed(iter/s)": 0.022553 }, { "epoch": 0.7522750252780587, "grad_norm": 1.5539889335632324, "learning_rate": 1.59190976065556e-06, "loss": 0.33691200613975525, "memory(GiB)": 74.62, "step": 930, "token_acc": 0.8848920863309353, "train_speed(iter/s)": 0.022554 }, { "epoch": 0.7530839231547017, "grad_norm": 1.6070380210876465, "learning_rate": 1.582140128427957e-06, "loss": 0.39200344681739807, "memory(GiB)": 74.62, "step": 931, "token_acc": 0.8892857142857142, "train_speed(iter/s)": 0.022554 }, { "epoch": 0.7538928210313448, "grad_norm": 1.8517992496490479, "learning_rate": 1.5723949292685193e-06, "loss": 0.34315165877342224, "memory(GiB)": 74.62, "step": 932, "token_acc": 0.8774193548387097, "train_speed(iter/s)": 0.022554 }, { "epoch": 0.7547017189079879, "grad_norm": 2.0841267108917236, "learning_rate": 1.5626742328422195e-06, "loss": 0.3751834034919739, "memory(GiB)": 74.62, "step": 933, "token_acc": 0.9234234234234234, "train_speed(iter/s)": 0.022554 }, { "epoch": 0.7555106167846309, "grad_norm": 2.080343008041382, "learning_rate": 1.552978108638869e-06, "loss": 0.37340766191482544, "memory(GiB)": 74.62, "step": 934, "token_acc": 0.9068627450980392, "train_speed(iter/s)": 0.022555 }, { "epoch": 0.756319514661274, "grad_norm": 2.0687668323516846, "learning_rate": 1.543306625972623e-06, "loss": 0.4011552929878235, "memory(GiB)": 74.62, "step": 935, "token_acc": 0.9128440366972477, "train_speed(iter/s)": 0.022555 }, { "epoch": 0.7571284125379171, "grad_norm": 1.9438579082489014, "learning_rate": 1.5336598539814784e-06, "loss": 0.389544278383255, "memory(GiB)": 74.62, "step": 936, "token_acc": 0.8618181818181818, "train_speed(iter/s)": 0.022555 }, { "epoch": 0.7579373104145601, "grad_norm": 2.186204671859741, "learning_rate": 1.5240378616267887e-06, "loss": 0.34044280648231506, "memory(GiB)": 74.62, "step": 937, "token_acc": 0.8879310344827587, "train_speed(iter/s)": 0.022555 }, { "epoch": 0.7587462082912032, "grad_norm": 2.069333076477051, "learning_rate": 1.514440717692765e-06, "loss": 0.41251152753829956, "memory(GiB)": 74.62, "step": 938, "token_acc": 0.8347107438016529, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7595551061678463, "grad_norm": 1.9282809495925903, "learning_rate": 1.5048684907859873e-06, "loss": 0.4127691984176636, "memory(GiB)": 74.62, "step": 939, "token_acc": 0.842443729903537, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7603640040444893, "grad_norm": 2.28041410446167, "learning_rate": 1.495321249334908e-06, "loss": 0.42502281069755554, "memory(GiB)": 74.62, "step": 940, "token_acc": 0.8642857142857143, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7611729019211324, "grad_norm": 1.8921377658843994, "learning_rate": 1.485799061589372e-06, "loss": 0.4206182658672333, "memory(GiB)": 74.62, "step": 941, "token_acc": 0.8811475409836066, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7619817997977756, "grad_norm": 1.8928072452545166, "learning_rate": 1.4763019956201252e-06, "loss": 0.3889954090118408, "memory(GiB)": 74.62, "step": 942, "token_acc": 0.8821752265861027, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7627906976744186, "grad_norm": 3.128412961959839, "learning_rate": 1.4668301193183198e-06, "loss": 0.38851073384284973, "memory(GiB)": 74.62, "step": 943, "token_acc": 0.8724137931034482, "train_speed(iter/s)": 0.022556 }, { "epoch": 0.7635995955510617, "grad_norm": 1.9432473182678223, "learning_rate": 1.4573835003950438e-06, "loss": 0.38765308260917664, "memory(GiB)": 74.62, "step": 944, "token_acc": 0.8867924528301887, "train_speed(iter/s)": 0.022557 }, { "epoch": 0.7644084934277048, "grad_norm": 2.4022583961486816, "learning_rate": 1.4479622063808242e-06, "loss": 0.43059998750686646, "memory(GiB)": 74.62, "step": 945, "token_acc": 0.8793774319066148, "train_speed(iter/s)": 0.022557 }, { "epoch": 0.7652173913043478, "grad_norm": 2.3695461750030518, "learning_rate": 1.4385663046251514e-06, "loss": 0.4037495255470276, "memory(GiB)": 74.62, "step": 946, "token_acc": 0.8772727272727273, "train_speed(iter/s)": 0.022557 }, { "epoch": 0.7660262891809909, "grad_norm": 1.9513347148895264, "learning_rate": 1.4291958622959972e-06, "loss": 0.35621780157089233, "memory(GiB)": 74.62, "step": 947, "token_acc": 0.9019607843137255, "train_speed(iter/s)": 0.022557 }, { "epoch": 0.766835187057634, "grad_norm": 2.0191597938537598, "learning_rate": 1.4198509463793275e-06, "loss": 0.38198453187942505, "memory(GiB)": 74.62, "step": 948, "token_acc": 0.8981132075471698, "train_speed(iter/s)": 0.022558 }, { "epoch": 0.767644084934277, "grad_norm": 1.8823531866073608, "learning_rate": 1.4105316236786332e-06, "loss": 0.39389660954475403, "memory(GiB)": 74.62, "step": 949, "token_acc": 0.8419689119170984, "train_speed(iter/s)": 0.022558 }, { "epoch": 0.7684529828109201, "grad_norm": 2.254852771759033, "learning_rate": 1.4012379608144477e-06, "loss": 0.40055525302886963, "memory(GiB)": 74.62, "step": 950, "token_acc": 0.8776371308016878, "train_speed(iter/s)": 0.022558 }, { "epoch": 0.7692618806875632, "grad_norm": 2.2618825435638428, "learning_rate": 1.3919700242238715e-06, "loss": 0.4659748673439026, "memory(GiB)": 74.62, "step": 951, "token_acc": 0.804635761589404, "train_speed(iter/s)": 0.022559 }, { "epoch": 0.7700707785642062, "grad_norm": 1.884406328201294, "learning_rate": 1.382727880160098e-06, "loss": 0.34575021266937256, "memory(GiB)": 74.62, "step": 952, "token_acc": 0.8618421052631579, "train_speed(iter/s)": 0.022559 }, { "epoch": 0.7708796764408493, "grad_norm": 2.369433641433716, "learning_rate": 1.3735115946919342e-06, "loss": 0.35355186462402344, "memory(GiB)": 74.62, "step": 953, "token_acc": 0.9132231404958677, "train_speed(iter/s)": 0.022559 }, { "epoch": 0.7716885743174924, "grad_norm": 1.9989012479782104, "learning_rate": 1.3643212337033396e-06, "loss": 0.35935360193252563, "memory(GiB)": 74.62, "step": 954, "token_acc": 0.8350877192982457, "train_speed(iter/s)": 0.02256 }, { "epoch": 0.7724974721941354, "grad_norm": 2.4037156105041504, "learning_rate": 1.3551568628929434e-06, "loss": 0.41261640191078186, "memory(GiB)": 74.62, "step": 955, "token_acc": 0.849112426035503, "train_speed(iter/s)": 0.02256 }, { "epoch": 0.7733063700707785, "grad_norm": 3.044893741607666, "learning_rate": 1.346018547773582e-06, "loss": 0.46254298090934753, "memory(GiB)": 74.62, "step": 956, "token_acc": 0.8521400778210116, "train_speed(iter/s)": 0.02256 }, { "epoch": 0.7741152679474217, "grad_norm": 2.8486831188201904, "learning_rate": 1.3369063536718347e-06, "loss": 0.39035511016845703, "memory(GiB)": 74.62, "step": 957, "token_acc": 0.8973509933774835, "train_speed(iter/s)": 0.02256 }, { "epoch": 0.7749241658240648, "grad_norm": 2.8728833198547363, "learning_rate": 1.3278203457275401e-06, "loss": 0.4135955572128296, "memory(GiB)": 74.62, "step": 958, "token_acc": 0.8804347826086957, "train_speed(iter/s)": 0.02256 }, { "epoch": 0.7757330637007078, "grad_norm": 2.2483723163604736, "learning_rate": 1.3187605888933508e-06, "loss": 0.3800261616706848, "memory(GiB)": 74.62, "step": 959, "token_acc": 0.8988095238095238, "train_speed(iter/s)": 0.022561 }, { "epoch": 0.7765419615773509, "grad_norm": 2.3790223598480225, "learning_rate": 1.3097271479342526e-06, "loss": 0.4093528389930725, "memory(GiB)": 74.62, "step": 960, "token_acc": 0.8419243986254296, "train_speed(iter/s)": 0.022561 }, { "epoch": 0.777350859453994, "grad_norm": 2.5826141834259033, "learning_rate": 1.3007200874271126e-06, "loss": 0.30737096071243286, "memory(GiB)": 74.62, "step": 961, "token_acc": 0.8328173374613003, "train_speed(iter/s)": 0.022561 }, { "epoch": 0.778159757330637, "grad_norm": 1.8254023790359497, "learning_rate": 1.2917394717602123e-06, "loss": 0.3649098575115204, "memory(GiB)": 74.62, "step": 962, "token_acc": 0.8901960784313725, "train_speed(iter/s)": 0.022562 }, { "epoch": 0.7789686552072801, "grad_norm": 1.9518779516220093, "learning_rate": 1.2827853651327883e-06, "loss": 0.3445701599121094, "memory(GiB)": 74.62, "step": 963, "token_acc": 0.87890625, "train_speed(iter/s)": 0.022562 }, { "epoch": 0.7797775530839232, "grad_norm": 2.0577752590179443, "learning_rate": 1.2738578315545751e-06, "loss": 0.3813546299934387, "memory(GiB)": 74.62, "step": 964, "token_acc": 0.8389830508474576, "train_speed(iter/s)": 0.022562 }, { "epoch": 0.7805864509605662, "grad_norm": 2.2759878635406494, "learning_rate": 1.2649569348453416e-06, "loss": 0.4146482050418854, "memory(GiB)": 74.62, "step": 965, "token_acc": 0.8581314878892734, "train_speed(iter/s)": 0.022562 }, { "epoch": 0.7813953488372093, "grad_norm": 2.162762403488159, "learning_rate": 1.2560827386344444e-06, "loss": 0.43926411867141724, "memory(GiB)": 74.62, "step": 966, "token_acc": 0.908256880733945, "train_speed(iter/s)": 0.022563 }, { "epoch": 0.7822042467138524, "grad_norm": 2.6653337478637695, "learning_rate": 1.2472353063603626e-06, "loss": 0.3915598690509796, "memory(GiB)": 74.62, "step": 967, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022563 }, { "epoch": 0.7830131445904954, "grad_norm": 1.9902511835098267, "learning_rate": 1.238414701270252e-06, "loss": 0.3581811189651489, "memory(GiB)": 74.62, "step": 968, "token_acc": 0.8584615384615385, "train_speed(iter/s)": 0.022563 }, { "epoch": 0.7838220424671385, "grad_norm": 2.0768163204193115, "learning_rate": 1.229620986419494e-06, "loss": 0.40156054496765137, "memory(GiB)": 74.62, "step": 969, "token_acc": 0.8660436137071651, "train_speed(iter/s)": 0.022563 }, { "epoch": 0.7846309403437816, "grad_norm": 2.0157761573791504, "learning_rate": 1.2208542246712346e-06, "loss": 0.3723048269748688, "memory(GiB)": 74.62, "step": 970, "token_acc": 0.9051724137931034, "train_speed(iter/s)": 0.022564 }, { "epoch": 0.7854398382204246, "grad_norm": 2.2510571479797363, "learning_rate": 1.2121144786959466e-06, "loss": 0.39407879114151, "memory(GiB)": 74.62, "step": 971, "token_acc": 0.8828125, "train_speed(iter/s)": 0.022564 }, { "epoch": 0.7862487360970677, "grad_norm": 1.9419714212417603, "learning_rate": 1.2034018109709716e-06, "loss": 0.3809299170970917, "memory(GiB)": 74.62, "step": 972, "token_acc": 0.8663967611336032, "train_speed(iter/s)": 0.022564 }, { "epoch": 0.7870576339737109, "grad_norm": 2.204801321029663, "learning_rate": 1.1947162837800842e-06, "loss": 0.41355523467063904, "memory(GiB)": 74.62, "step": 973, "token_acc": 0.8683274021352313, "train_speed(iter/s)": 0.022565 }, { "epoch": 0.7878665318503539, "grad_norm": 2.461207866668701, "learning_rate": 1.1860579592130366e-06, "loss": 0.407459557056427, "memory(GiB)": 74.62, "step": 974, "token_acc": 0.8583333333333333, "train_speed(iter/s)": 0.022565 }, { "epoch": 0.788675429726997, "grad_norm": 1.8681888580322266, "learning_rate": 1.177426899165121e-06, "loss": 0.33745524287223816, "memory(GiB)": 74.62, "step": 975, "token_acc": 0.9018691588785047, "train_speed(iter/s)": 0.022565 }, { "epoch": 0.7894843276036401, "grad_norm": 1.9317001104354858, "learning_rate": 1.1688231653367271e-06, "loss": 0.36072519421577454, "memory(GiB)": 74.62, "step": 976, "token_acc": 0.8922413793103449, "train_speed(iter/s)": 0.022565 }, { "epoch": 0.7902932254802831, "grad_norm": 1.4545793533325195, "learning_rate": 1.1602468192328936e-06, "loss": 0.3215617537498474, "memory(GiB)": 74.62, "step": 977, "token_acc": 0.9003436426116839, "train_speed(iter/s)": 0.022566 }, { "epoch": 0.7911021233569262, "grad_norm": 2.098681926727295, "learning_rate": 1.1516979221628804e-06, "loss": 0.36492764949798584, "memory(GiB)": 74.62, "step": 978, "token_acc": 0.8724489795918368, "train_speed(iter/s)": 0.022566 }, { "epoch": 0.7919110212335693, "grad_norm": 1.852514386177063, "learning_rate": 1.1431765352397167e-06, "loss": 0.3920031785964966, "memory(GiB)": 74.62, "step": 979, "token_acc": 0.875, "train_speed(iter/s)": 0.022566 }, { "epoch": 0.7927199191102123, "grad_norm": 2.011186122894287, "learning_rate": 1.13468271937978e-06, "loss": 0.3568735718727112, "memory(GiB)": 74.62, "step": 980, "token_acc": 0.9057971014492754, "train_speed(iter/s)": 0.022566 }, { "epoch": 0.7935288169868554, "grad_norm": 2.8331449031829834, "learning_rate": 1.1262165353023474e-06, "loss": 0.3684077560901642, "memory(GiB)": 74.62, "step": 981, "token_acc": 0.8900343642611683, "train_speed(iter/s)": 0.022567 }, { "epoch": 0.7943377148634985, "grad_norm": 2.3888087272644043, "learning_rate": 1.1177780435291641e-06, "loss": 0.3318890929222107, "memory(GiB)": 74.62, "step": 982, "token_acc": 0.8755760368663594, "train_speed(iter/s)": 0.022567 }, { "epoch": 0.7951466127401415, "grad_norm": 1.8067930936813354, "learning_rate": 1.1093673043840182e-06, "loss": 0.32926511764526367, "memory(GiB)": 74.62, "step": 983, "token_acc": 0.8690095846645367, "train_speed(iter/s)": 0.022567 }, { "epoch": 0.7959555106167846, "grad_norm": 2.238401412963867, "learning_rate": 1.100984377992298e-06, "loss": 0.39484211802482605, "memory(GiB)": 74.62, "step": 984, "token_acc": 0.8741496598639455, "train_speed(iter/s)": 0.022567 }, { "epoch": 0.7967644084934277, "grad_norm": 2.389265298843384, "learning_rate": 1.0926293242805735e-06, "loss": 0.45714324712753296, "memory(GiB)": 74.62, "step": 985, "token_acc": 0.8478260869565217, "train_speed(iter/s)": 0.022568 }, { "epoch": 0.7975733063700707, "grad_norm": 2.173175811767578, "learning_rate": 1.0843022029761596e-06, "loss": 0.37196797132492065, "memory(GiB)": 74.62, "step": 986, "token_acc": 0.8589211618257261, "train_speed(iter/s)": 0.022568 }, { "epoch": 0.7983822042467138, "grad_norm": 2.0259294509887695, "learning_rate": 1.0760030736066952e-06, "loss": 0.44243717193603516, "memory(GiB)": 74.62, "step": 987, "token_acc": 0.8244274809160306, "train_speed(iter/s)": 0.022568 }, { "epoch": 0.799191102123357, "grad_norm": 2.151653528213501, "learning_rate": 1.0677319954997129e-06, "loss": 0.39491477608680725, "memory(GiB)": 74.62, "step": 988, "token_acc": 0.9015544041450777, "train_speed(iter/s)": 0.022568 }, { "epoch": 0.8, "grad_norm": 2.1169228553771973, "learning_rate": 1.0594890277822151e-06, "loss": 0.3383401334285736, "memory(GiB)": 74.62, "step": 989, "token_acc": 0.8840125391849529, "train_speed(iter/s)": 0.022569 }, { "epoch": 0.8008088978766431, "grad_norm": 2.4547696113586426, "learning_rate": 1.0512742293802558e-06, "loss": 0.38963425159454346, "memory(GiB)": 74.62, "step": 990, "token_acc": 0.8666666666666667, "train_speed(iter/s)": 0.022569 }, { "epoch": 0.8016177957532862, "grad_norm": 1.8448153734207153, "learning_rate": 1.0430876590185162e-06, "loss": 0.36352628469467163, "memory(GiB)": 74.62, "step": 991, "token_acc": 0.92, "train_speed(iter/s)": 0.022569 }, { "epoch": 0.8024266936299292, "grad_norm": 1.883742094039917, "learning_rate": 1.0349293752198842e-06, "loss": 0.37957262992858887, "memory(GiB)": 74.62, "step": 992, "token_acc": 0.887240356083086, "train_speed(iter/s)": 0.022569 }, { "epoch": 0.8032355915065723, "grad_norm": 2.0374629497528076, "learning_rate": 1.0267994363050387e-06, "loss": 0.3739085793495178, "memory(GiB)": 74.62, "step": 993, "token_acc": 0.834061135371179, "train_speed(iter/s)": 0.02257 }, { "epoch": 0.8040444893832154, "grad_norm": 2.806663751602173, "learning_rate": 1.0186979003920273e-06, "loss": 0.31939688324928284, "memory(GiB)": 74.62, "step": 994, "token_acc": 0.8784313725490196, "train_speed(iter/s)": 0.02257 }, { "epoch": 0.8048533872598584, "grad_norm": 2.3647608757019043, "learning_rate": 1.0106248253958607e-06, "loss": 0.37592533230781555, "memory(GiB)": 74.62, "step": 995, "token_acc": 0.900355871886121, "train_speed(iter/s)": 0.02257 }, { "epoch": 0.8056622851365015, "grad_norm": 2.112464427947998, "learning_rate": 1.0025802690280851e-06, "loss": 0.3363335132598877, "memory(GiB)": 74.62, "step": 996, "token_acc": 0.9078014184397163, "train_speed(iter/s)": 0.02257 }, { "epoch": 0.8064711830131446, "grad_norm": 2.177457809448242, "learning_rate": 9.945642887963842e-07, "loss": 0.38282421231269836, "memory(GiB)": 74.62, "step": 997, "token_acc": 0.8685446009389671, "train_speed(iter/s)": 0.022571 }, { "epoch": 0.8072800808897876, "grad_norm": 2.463026523590088, "learning_rate": 9.86576942004156e-07, "loss": 0.3649854063987732, "memory(GiB)": 74.62, "step": 998, "token_acc": 0.8651685393258427, "train_speed(iter/s)": 0.022571 }, { "epoch": 0.8080889787664307, "grad_norm": 2.1493732929229736, "learning_rate": 9.78618285750112e-07, "loss": 0.4093163013458252, "memory(GiB)": 74.62, "step": 999, "token_acc": 0.8106060606060606, "train_speed(iter/s)": 0.022571 }, { "epoch": 0.8088978766430738, "grad_norm": 1.8683381080627441, "learning_rate": 9.70688376927864e-07, "loss": 0.3501003682613373, "memory(GiB)": 74.62, "step": 1000, "token_acc": 0.8641975308641975, "train_speed(iter/s)": 0.022571 }, { "epoch": 0.8088978766430738, "eval_loss": 0.36755362153053284, "eval_runtime": 428.8026, "eval_samples_per_second": 3.727, "eval_steps_per_second": 0.117, "eval_token_acc": 0.8743528175883545, "step": 1000 }, { "epoch": 0.8097067745197168, "grad_norm": 6.642233371734619, "learning_rate": 9.627872722255154e-07, "loss": 0.3149925470352173, "memory(GiB)": 74.62, "step": 1001, "token_acc": 0.875, "train_speed(iter/s)": 0.022354 }, { "epoch": 0.8105156723963599, "grad_norm": 2.2048041820526123, "learning_rate": 9.549150281252633e-07, "loss": 0.4250641167163849, "memory(GiB)": 74.62, "step": 1002, "token_acc": 0.8754448398576512, "train_speed(iter/s)": 0.022354 }, { "epoch": 0.8113245702730031, "grad_norm": 1.8660839796066284, "learning_rate": 9.470717009029889e-07, "loss": 0.32009008526802063, "memory(GiB)": 74.62, "step": 1003, "token_acc": 0.89, "train_speed(iter/s)": 0.022354 }, { "epoch": 0.8121334681496462, "grad_norm": 1.9054193496704102, "learning_rate": 9.39257346627857e-07, "loss": 0.3357139825820923, "memory(GiB)": 74.62, "step": 1004, "token_acc": 0.9012875536480687, "train_speed(iter/s)": 0.022355 }, { "epoch": 0.8129423660262892, "grad_norm": 3.9081051349639893, "learning_rate": 9.314720211619166e-07, "loss": 0.38648778200149536, "memory(GiB)": 74.62, "step": 1005, "token_acc": 0.9029850746268657, "train_speed(iter/s)": 0.022355 }, { "epoch": 0.8137512639029323, "grad_norm": 2.1040167808532715, "learning_rate": 9.237157801596958e-07, "loss": 0.3301439881324768, "memory(GiB)": 74.62, "step": 1006, "token_acc": 0.8925925925925926, "train_speed(iter/s)": 0.022356 }, { "epoch": 0.8145601617795754, "grad_norm": 1.6679681539535522, "learning_rate": 9.159886790678124e-07, "loss": 0.37634193897247314, "memory(GiB)": 74.62, "step": 1007, "token_acc": 0.8092485549132948, "train_speed(iter/s)": 0.022356 }, { "epoch": 0.8153690596562184, "grad_norm": 2.3380022048950195, "learning_rate": 9.082907731245733e-07, "loss": 0.4119229316711426, "memory(GiB)": 74.62, "step": 1008, "token_acc": 0.8262548262548263, "train_speed(iter/s)": 0.022357 }, { "epoch": 0.8161779575328615, "grad_norm": 1.9643757343292236, "learning_rate": 9.006221173595741e-07, "loss": 0.355658620595932, "memory(GiB)": 74.62, "step": 1009, "token_acc": 0.926530612244898, "train_speed(iter/s)": 0.022357 }, { "epoch": 0.8169868554095046, "grad_norm": 1.6694306135177612, "learning_rate": 8.929827665933211e-07, "loss": 0.3310469388961792, "memory(GiB)": 74.62, "step": 1010, "token_acc": 0.9036697247706422, "train_speed(iter/s)": 0.022358 }, { "epoch": 0.8177957532861476, "grad_norm": 3.9332058429718018, "learning_rate": 8.853727754368191e-07, "loss": 0.3335992693901062, "memory(GiB)": 74.62, "step": 1011, "token_acc": 0.9045226130653267, "train_speed(iter/s)": 0.022358 }, { "epoch": 0.8186046511627907, "grad_norm": 2.0935213565826416, "learning_rate": 8.777921982911996e-07, "loss": 0.3944769501686096, "memory(GiB)": 74.62, "step": 1012, "token_acc": 0.8847583643122676, "train_speed(iter/s)": 0.022359 }, { "epoch": 0.8194135490394338, "grad_norm": 1.8062115907669067, "learning_rate": 8.702410893473173e-07, "loss": 0.3291887938976288, "memory(GiB)": 74.62, "step": 1013, "token_acc": 0.8565217391304348, "train_speed(iter/s)": 0.022359 }, { "epoch": 0.8202224469160768, "grad_norm": 2.1609699726104736, "learning_rate": 8.627195025853735e-07, "loss": 0.2895755469799042, "memory(GiB)": 74.62, "step": 1014, "token_acc": 0.9145299145299145, "train_speed(iter/s)": 0.02236 }, { "epoch": 0.8210313447927199, "grad_norm": 2.0408060550689697, "learning_rate": 8.552274917745246e-07, "loss": 0.3750014901161194, "memory(GiB)": 74.62, "step": 1015, "token_acc": 0.8930817610062893, "train_speed(iter/s)": 0.02236 }, { "epoch": 0.821840242669363, "grad_norm": 2.2596545219421387, "learning_rate": 8.477651104724994e-07, "loss": 0.3800932466983795, "memory(GiB)": 74.62, "step": 1016, "token_acc": 0.8439306358381503, "train_speed(iter/s)": 0.022361 }, { "epoch": 0.822649140546006, "grad_norm": 1.547613263130188, "learning_rate": 8.40332412025216e-07, "loss": 0.3251078128814697, "memory(GiB)": 74.62, "step": 1017, "token_acc": 0.9111969111969112, "train_speed(iter/s)": 0.022361 }, { "epoch": 0.8234580384226491, "grad_norm": 2.1521153450012207, "learning_rate": 8.329294495663981e-07, "loss": 0.38296395540237427, "memory(GiB)": 74.62, "step": 1018, "token_acc": 0.8782894736842105, "train_speed(iter/s)": 0.022361 }, { "epoch": 0.8242669362992923, "grad_norm": 2.0719387531280518, "learning_rate": 8.255562760172004e-07, "loss": 0.3523367643356323, "memory(GiB)": 74.62, "step": 1019, "token_acc": 0.8269896193771626, "train_speed(iter/s)": 0.022362 }, { "epoch": 0.8250758341759353, "grad_norm": 2.2503058910369873, "learning_rate": 8.18212944085826e-07, "loss": 0.37082982063293457, "memory(GiB)": 74.62, "step": 1020, "token_acc": 0.8984126984126984, "train_speed(iter/s)": 0.022362 }, { "epoch": 0.8258847320525784, "grad_norm": 2.572887659072876, "learning_rate": 8.108995062671482e-07, "loss": 0.44092637300491333, "memory(GiB)": 74.62, "step": 1021, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022363 }, { "epoch": 0.8266936299292215, "grad_norm": 2.121467113494873, "learning_rate": 8.036160148423449e-07, "loss": 0.3986678123474121, "memory(GiB)": 74.62, "step": 1022, "token_acc": 0.9176470588235294, "train_speed(iter/s)": 0.022363 }, { "epoch": 0.8275025278058645, "grad_norm": 1.8472903966903687, "learning_rate": 7.963625218785099e-07, "loss": 0.35733747482299805, "memory(GiB)": 74.62, "step": 1023, "token_acc": 0.9130434782608695, "train_speed(iter/s)": 0.022364 }, { "epoch": 0.8283114256825076, "grad_norm": 1.9388898611068726, "learning_rate": 7.891390792282927e-07, "loss": 0.2967044711112976, "memory(GiB)": 74.62, "step": 1024, "token_acc": 0.9016393442622951, "train_speed(iter/s)": 0.022364 }, { "epoch": 0.8291203235591507, "grad_norm": 1.7470604181289673, "learning_rate": 7.819457385295254e-07, "loss": 0.31166231632232666, "memory(GiB)": 74.62, "step": 1025, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.022364 }, { "epoch": 0.8299292214357937, "grad_norm": 2.2927539348602295, "learning_rate": 7.747825512048462e-07, "loss": 0.3713032007217407, "memory(GiB)": 74.62, "step": 1026, "token_acc": 0.8540540540540541, "train_speed(iter/s)": 0.022365 }, { "epoch": 0.8307381193124368, "grad_norm": 2.0093610286712646, "learning_rate": 7.676495684613433e-07, "loss": 0.3384319543838501, "memory(GiB)": 74.62, "step": 1027, "token_acc": 0.9325842696629213, "train_speed(iter/s)": 0.022365 }, { "epoch": 0.8315470171890799, "grad_norm": 1.9166637659072876, "learning_rate": 7.605468412901801e-07, "loss": 0.4422561824321747, "memory(GiB)": 74.62, "step": 1028, "token_acc": 0.8424657534246576, "train_speed(iter/s)": 0.022366 }, { "epoch": 0.8323559150657229, "grad_norm": 2.4499292373657227, "learning_rate": 7.534744204662348e-07, "loss": 0.42556819319725037, "memory(GiB)": 74.62, "step": 1029, "token_acc": 0.8181818181818182, "train_speed(iter/s)": 0.022366 }, { "epoch": 0.833164812942366, "grad_norm": 2.4436521530151367, "learning_rate": 7.464323565477372e-07, "loss": 0.46478235721588135, "memory(GiB)": 74.62, "step": 1030, "token_acc": 0.8811881188118812, "train_speed(iter/s)": 0.022367 }, { "epoch": 0.8339737108190091, "grad_norm": 1.8678390979766846, "learning_rate": 7.394206998759013e-07, "loss": 0.34241783618927, "memory(GiB)": 74.62, "step": 1031, "token_acc": 0.8908450704225352, "train_speed(iter/s)": 0.022367 }, { "epoch": 0.8347826086956521, "grad_norm": 2.002629041671753, "learning_rate": 7.324395005745772e-07, "loss": 0.3532907962799072, "memory(GiB)": 74.62, "step": 1032, "token_acc": 0.8291814946619217, "train_speed(iter/s)": 0.022368 }, { "epoch": 0.8355915065722952, "grad_norm": 2.4754257202148438, "learning_rate": 7.254888085498812e-07, "loss": 0.39124253392219543, "memory(GiB)": 74.62, "step": 1033, "token_acc": 0.8664122137404581, "train_speed(iter/s)": 0.022368 }, { "epoch": 0.8364004044489384, "grad_norm": 2.009551763534546, "learning_rate": 7.185686734898478e-07, "loss": 0.3519361913204193, "memory(GiB)": 74.62, "step": 1034, "token_acc": 0.8725490196078431, "train_speed(iter/s)": 0.022368 }, { "epoch": 0.8372093023255814, "grad_norm": 2.077303886413574, "learning_rate": 7.116791448640664e-07, "loss": 0.3848615884780884, "memory(GiB)": 74.62, "step": 1035, "token_acc": 0.8981481481481481, "train_speed(iter/s)": 0.022369 }, { "epoch": 0.8380182002022245, "grad_norm": 1.8623238801956177, "learning_rate": 7.048202719233344e-07, "loss": 0.3747529983520508, "memory(GiB)": 74.62, "step": 1036, "token_acc": 0.9141914191419142, "train_speed(iter/s)": 0.022369 }, { "epoch": 0.8388270980788676, "grad_norm": 1.8672590255737305, "learning_rate": 6.979921036993042e-07, "loss": 0.3627777099609375, "memory(GiB)": 74.62, "step": 1037, "token_acc": 0.8225352112676056, "train_speed(iter/s)": 0.02237 }, { "epoch": 0.8396359959555106, "grad_norm": 2.0797042846679688, "learning_rate": 6.911946890041254e-07, "loss": 0.4054332375526428, "memory(GiB)": 74.62, "step": 1038, "token_acc": 0.8860759493670886, "train_speed(iter/s)": 0.02237 }, { "epoch": 0.8404448938321537, "grad_norm": 2.2241296768188477, "learning_rate": 6.844280764301075e-07, "loss": 0.33668115735054016, "memory(GiB)": 74.62, "step": 1039, "token_acc": 0.9083665338645418, "train_speed(iter/s)": 0.022371 }, { "epoch": 0.8412537917087968, "grad_norm": 1.7550405263900757, "learning_rate": 6.776923143493636e-07, "loss": 0.3522379696369171, "memory(GiB)": 74.62, "step": 1040, "token_acc": 0.8508771929824561, "train_speed(iter/s)": 0.022371 }, { "epoch": 0.8420626895854398, "grad_norm": 1.8860352039337158, "learning_rate": 6.709874509134684e-07, "loss": 0.4433209300041199, "memory(GiB)": 74.62, "step": 1041, "token_acc": 0.8392282958199357, "train_speed(iter/s)": 0.022371 }, { "epoch": 0.8428715874620829, "grad_norm": 2.263840913772583, "learning_rate": 6.643135340531137e-07, "loss": 0.3951689302921295, "memory(GiB)": 74.62, "step": 1042, "token_acc": 0.8928571428571429, "train_speed(iter/s)": 0.022372 }, { "epoch": 0.843680485338726, "grad_norm": 2.3143765926361084, "learning_rate": 6.576706114777626e-07, "loss": 0.39435216784477234, "memory(GiB)": 74.62, "step": 1043, "token_acc": 0.8184523809523809, "train_speed(iter/s)": 0.022372 }, { "epoch": 0.844489383215369, "grad_norm": 2.4204423427581787, "learning_rate": 6.510587306753135e-07, "loss": 0.38613927364349365, "memory(GiB)": 74.62, "step": 1044, "token_acc": 0.8405797101449275, "train_speed(iter/s)": 0.022373 }, { "epoch": 0.8452982810920121, "grad_norm": 1.9565153121948242, "learning_rate": 6.444779389117579e-07, "loss": 0.3638315498828888, "memory(GiB)": 74.62, "step": 1045, "token_acc": 0.8671328671328671, "train_speed(iter/s)": 0.022373 }, { "epoch": 0.8461071789686552, "grad_norm": 1.82338547706604, "learning_rate": 6.379282832308414e-07, "loss": 0.3477684557437897, "memory(GiB)": 74.62, "step": 1046, "token_acc": 0.8731343283582089, "train_speed(iter/s)": 0.022374 }, { "epoch": 0.8469160768452982, "grad_norm": 2.053645610809326, "learning_rate": 6.314098104537325e-07, "loss": 0.359966516494751, "memory(GiB)": 74.62, "step": 1047, "token_acc": 0.84, "train_speed(iter/s)": 0.022374 }, { "epoch": 0.8477249747219413, "grad_norm": 2.145159959793091, "learning_rate": 6.249225671786785e-07, "loss": 0.3331785202026367, "memory(GiB)": 74.62, "step": 1048, "token_acc": 0.9153846153846154, "train_speed(iter/s)": 0.022375 }, { "epoch": 0.8485338725985845, "grad_norm": 2.2616126537323, "learning_rate": 6.184665997806832e-07, "loss": 0.3494233191013336, "memory(GiB)": 74.62, "step": 1049, "token_acc": 0.8663594470046083, "train_speed(iter/s)": 0.022375 }, { "epoch": 0.8493427704752275, "grad_norm": 2.032336711883545, "learning_rate": 6.120419544111655e-07, "loss": 0.35964176058769226, "memory(GiB)": 74.62, "step": 1050, "token_acc": 0.893687707641196, "train_speed(iter/s)": 0.022375 }, { "epoch": 0.8501516683518706, "grad_norm": 1.9737732410430908, "learning_rate": 6.056486769976388e-07, "loss": 0.37345680594444275, "memory(GiB)": 74.62, "step": 1051, "token_acc": 0.8767123287671232, "train_speed(iter/s)": 0.022376 }, { "epoch": 0.8509605662285137, "grad_norm": 3.4677176475524902, "learning_rate": 5.992868132433755e-07, "loss": 0.3770935535430908, "memory(GiB)": 74.62, "step": 1052, "token_acc": 0.8894230769230769, "train_speed(iter/s)": 0.022376 }, { "epoch": 0.8517694641051567, "grad_norm": 2.0082759857177734, "learning_rate": 5.929564086270834e-07, "loss": 0.40682828426361084, "memory(GiB)": 74.62, "step": 1053, "token_acc": 0.875, "train_speed(iter/s)": 0.022376 }, { "epoch": 0.8525783619817998, "grad_norm": 1.6112618446350098, "learning_rate": 5.866575084025816e-07, "loss": 0.3442041873931885, "memory(GiB)": 74.62, "step": 1054, "token_acc": 0.86, "train_speed(iter/s)": 0.022377 }, { "epoch": 0.8533872598584429, "grad_norm": 2.1978917121887207, "learning_rate": 5.803901575984721e-07, "loss": 0.37148886919021606, "memory(GiB)": 74.62, "step": 1055, "token_acc": 0.9269406392694064, "train_speed(iter/s)": 0.022377 }, { "epoch": 0.854196157735086, "grad_norm": 2.1480906009674072, "learning_rate": 5.74154401017824e-07, "loss": 0.37758809328079224, "memory(GiB)": 74.62, "step": 1056, "token_acc": 0.8744939271255061, "train_speed(iter/s)": 0.022378 }, { "epoch": 0.855005055611729, "grad_norm": 2.161919593811035, "learning_rate": 5.679502832378497e-07, "loss": 0.3692307472229004, "memory(GiB)": 74.62, "step": 1057, "token_acc": 0.8982035928143712, "train_speed(iter/s)": 0.022378 }, { "epoch": 0.8558139534883721, "grad_norm": 2.31783390045166, "learning_rate": 5.61777848609587e-07, "loss": 0.36903953552246094, "memory(GiB)": 74.62, "step": 1058, "token_acc": 0.8892857142857142, "train_speed(iter/s)": 0.022378 }, { "epoch": 0.8566228513650151, "grad_norm": 2.582380771636963, "learning_rate": 5.556371412575834e-07, "loss": 0.40472832322120667, "memory(GiB)": 74.62, "step": 1059, "token_acc": 0.8706896551724138, "train_speed(iter/s)": 0.022379 }, { "epoch": 0.8574317492416582, "grad_norm": 1.9625579118728638, "learning_rate": 5.495282050795763e-07, "loss": 0.3849819302558899, "memory(GiB)": 74.62, "step": 1060, "token_acc": 0.8406374501992032, "train_speed(iter/s)": 0.022379 }, { "epoch": 0.8582406471183013, "grad_norm": 2.0185904502868652, "learning_rate": 5.434510837461854e-07, "loss": 0.43619173765182495, "memory(GiB)": 74.62, "step": 1061, "token_acc": 0.8464730290456431, "train_speed(iter/s)": 0.02238 }, { "epoch": 0.8590495449949443, "grad_norm": 2.0642013549804688, "learning_rate": 5.374058207005945e-07, "loss": 0.37471503019332886, "memory(GiB)": 74.62, "step": 1062, "token_acc": 0.9219512195121952, "train_speed(iter/s)": 0.02238 }, { "epoch": 0.8598584428715874, "grad_norm": 2.187964677810669, "learning_rate": 5.313924591582453e-07, "loss": 0.3878336548805237, "memory(GiB)": 74.62, "step": 1063, "token_acc": 0.8531073446327684, "train_speed(iter/s)": 0.02238 }, { "epoch": 0.8606673407482305, "grad_norm": 3.5268666744232178, "learning_rate": 5.254110421065301e-07, "loss": 0.4011298716068268, "memory(GiB)": 74.62, "step": 1064, "token_acc": 0.8282442748091603, "train_speed(iter/s)": 0.022381 }, { "epoch": 0.8614762386248737, "grad_norm": 1.9126622676849365, "learning_rate": 5.194616123044749e-07, "loss": 0.3823421597480774, "memory(GiB)": 74.62, "step": 1065, "token_acc": 0.8555555555555555, "train_speed(iter/s)": 0.022381 }, { "epoch": 0.8622851365015167, "grad_norm": 1.9851644039154053, "learning_rate": 5.135442122824453e-07, "loss": 0.41584277153015137, "memory(GiB)": 74.62, "step": 1066, "token_acc": 0.896, "train_speed(iter/s)": 0.022382 }, { "epoch": 0.8630940343781598, "grad_norm": 2.158141613006592, "learning_rate": 5.076588843418345e-07, "loss": 0.3853064775466919, "memory(GiB)": 74.62, "step": 1067, "token_acc": 0.8201634877384196, "train_speed(iter/s)": 0.022382 }, { "epoch": 0.8639029322548029, "grad_norm": 2.003866672515869, "learning_rate": 5.018056705547652e-07, "loss": 0.3744017481803894, "memory(GiB)": 74.62, "step": 1068, "token_acc": 0.8693693693693694, "train_speed(iter/s)": 0.022382 }, { "epoch": 0.8647118301314459, "grad_norm": 3.3579702377319336, "learning_rate": 4.959846127637874e-07, "loss": 0.3795039653778076, "memory(GiB)": 74.62, "step": 1069, "token_acc": 0.8388625592417062, "train_speed(iter/s)": 0.022383 }, { "epoch": 0.865520728008089, "grad_norm": 2.1418285369873047, "learning_rate": 4.901957525815787e-07, "loss": 0.35196787118911743, "memory(GiB)": 74.62, "step": 1070, "token_acc": 0.8385650224215246, "train_speed(iter/s)": 0.022383 }, { "epoch": 0.8663296258847321, "grad_norm": 2.060997486114502, "learning_rate": 4.844391313906482e-07, "loss": 0.3312758207321167, "memory(GiB)": 74.62, "step": 1071, "token_acc": 0.8912280701754386, "train_speed(iter/s)": 0.022384 }, { "epoch": 0.8671385237613751, "grad_norm": 2.250108242034912, "learning_rate": 4.787147903430383e-07, "loss": 0.4016328752040863, "memory(GiB)": 74.62, "step": 1072, "token_acc": 0.8404669260700389, "train_speed(iter/s)": 0.022384 }, { "epoch": 0.8679474216380182, "grad_norm": 1.5963561534881592, "learning_rate": 4.730227703600354e-07, "loss": 0.3070691227912903, "memory(GiB)": 74.62, "step": 1073, "token_acc": 0.8928571428571429, "train_speed(iter/s)": 0.022384 }, { "epoch": 0.8687563195146613, "grad_norm": 2.321164846420288, "learning_rate": 4.6736311213186724e-07, "loss": 0.32245370745658875, "memory(GiB)": 74.62, "step": 1074, "token_acc": 0.8725868725868726, "train_speed(iter/s)": 0.022385 }, { "epoch": 0.8695652173913043, "grad_norm": 1.9174984693527222, "learning_rate": 4.617358561174279e-07, "loss": 0.32412296533584595, "memory(GiB)": 74.62, "step": 1075, "token_acc": 0.9473684210526315, "train_speed(iter/s)": 0.022385 }, { "epoch": 0.8703741152679474, "grad_norm": 1.674944281578064, "learning_rate": 4.561410425439744e-07, "loss": 0.299832284450531, "memory(GiB)": 74.62, "step": 1076, "token_acc": 0.9108527131782945, "train_speed(iter/s)": 0.022386 }, { "epoch": 0.8711830131445905, "grad_norm": 1.9611433744430542, "learning_rate": 4.505787114068433e-07, "loss": 0.3502030670642853, "memory(GiB)": 74.62, "step": 1077, "token_acc": 0.8602941176470589, "train_speed(iter/s)": 0.022386 }, { "epoch": 0.8719919110212335, "grad_norm": 2.2846431732177734, "learning_rate": 4.45048902469169e-07, "loss": 0.39019766449928284, "memory(GiB)": 74.62, "step": 1078, "token_acc": 0.8958333333333334, "train_speed(iter/s)": 0.022386 }, { "epoch": 0.8728008088978766, "grad_norm": 2.490588903427124, "learning_rate": 4.3955165526159306e-07, "loss": 0.37344303727149963, "memory(GiB)": 74.62, "step": 1079, "token_acc": 0.9163179916317992, "train_speed(iter/s)": 0.022387 }, { "epoch": 0.8736097067745198, "grad_norm": 5.213693141937256, "learning_rate": 4.3408700908198654e-07, "loss": 0.3260120153427124, "memory(GiB)": 74.62, "step": 1080, "token_acc": 0.8585858585858586, "train_speed(iter/s)": 0.022387 }, { "epoch": 0.8744186046511628, "grad_norm": 2.62857723236084, "learning_rate": 4.2865500299516747e-07, "loss": 0.36192968487739563, "memory(GiB)": 74.62, "step": 1081, "token_acc": 0.8915094339622641, "train_speed(iter/s)": 0.022387 }, { "epoch": 0.8752275025278059, "grad_norm": 2.0130198001861572, "learning_rate": 4.232556758326212e-07, "loss": 0.35925909876823425, "memory(GiB)": 74.62, "step": 1082, "token_acc": 0.8312236286919831, "train_speed(iter/s)": 0.022388 }, { "epoch": 0.876036400404449, "grad_norm": 1.795419454574585, "learning_rate": 4.178890661922241e-07, "loss": 0.34093332290649414, "memory(GiB)": 74.62, "step": 1083, "token_acc": 0.8543046357615894, "train_speed(iter/s)": 0.022388 }, { "epoch": 0.876845298281092, "grad_norm": 2.5592668056488037, "learning_rate": 4.125552124379628e-07, "loss": 0.412899911403656, "memory(GiB)": 74.62, "step": 1084, "token_acc": 0.85, "train_speed(iter/s)": 0.022389 }, { "epoch": 0.8776541961577351, "grad_norm": 1.8965997695922852, "learning_rate": 4.072541526996682e-07, "loss": 0.3767935633659363, "memory(GiB)": 74.62, "step": 1085, "token_acc": 0.8523676880222841, "train_speed(iter/s)": 0.022389 }, { "epoch": 0.8784630940343782, "grad_norm": 2.412139415740967, "learning_rate": 4.0198592487273426e-07, "loss": 0.3973158597946167, "memory(GiB)": 74.62, "step": 1086, "token_acc": 0.8678414096916299, "train_speed(iter/s)": 0.022389 }, { "epoch": 0.8792719919110212, "grad_norm": 1.8268601894378662, "learning_rate": 3.9675056661785563e-07, "loss": 0.35584717988967896, "memory(GiB)": 74.62, "step": 1087, "token_acc": 0.8561643835616438, "train_speed(iter/s)": 0.02239 }, { "epoch": 0.8800808897876643, "grad_norm": 2.1522209644317627, "learning_rate": 3.915481153607525e-07, "loss": 0.37817463278770447, "memory(GiB)": 74.62, "step": 1088, "token_acc": 0.8680851063829788, "train_speed(iter/s)": 0.02239 }, { "epoch": 0.8808897876643074, "grad_norm": 1.805523157119751, "learning_rate": 3.863786082919019e-07, "loss": 0.33031123876571655, "memory(GiB)": 74.62, "step": 1089, "token_acc": 0.9203539823008849, "train_speed(iter/s)": 0.02239 }, { "epoch": 0.8816986855409504, "grad_norm": 1.8276246786117554, "learning_rate": 3.8124208236627825e-07, "loss": 0.32658106088638306, "memory(GiB)": 74.62, "step": 1090, "token_acc": 0.900990099009901, "train_speed(iter/s)": 0.022391 }, { "epoch": 0.8825075834175935, "grad_norm": 2.1186046600341797, "learning_rate": 3.761385743030821e-07, "loss": 0.3983362019062042, "memory(GiB)": 74.62, "step": 1091, "token_acc": 0.9166666666666666, "train_speed(iter/s)": 0.022391 }, { "epoch": 0.8833164812942366, "grad_norm": 2.2194223403930664, "learning_rate": 3.710681205854838e-07, "loss": 0.34843602776527405, "memory(GiB)": 74.62, "step": 1092, "token_acc": 0.8144329896907216, "train_speed(iter/s)": 0.022391 }, { "epoch": 0.8841253791708796, "grad_norm": 1.7586379051208496, "learning_rate": 3.6603075746035886e-07, "loss": 0.3717504143714905, "memory(GiB)": 74.62, "step": 1093, "token_acc": 0.9043824701195219, "train_speed(iter/s)": 0.022392 }, { "epoch": 0.8849342770475227, "grad_norm": 1.730454921722412, "learning_rate": 3.6102652093802983e-07, "loss": 0.33724552392959595, "memory(GiB)": 74.62, "step": 1094, "token_acc": 0.8942857142857142, "train_speed(iter/s)": 0.022392 }, { "epoch": 0.8857431749241659, "grad_norm": 1.7972487211227417, "learning_rate": 3.5605544679200966e-07, "loss": 0.40413105487823486, "memory(GiB)": 74.62, "step": 1095, "token_acc": 0.8922413793103449, "train_speed(iter/s)": 0.022393 }, { "epoch": 0.8865520728008089, "grad_norm": 2.4188039302825928, "learning_rate": 3.511175705587433e-07, "loss": 0.4261808693408966, "memory(GiB)": 74.62, "step": 1096, "token_acc": 0.8990384615384616, "train_speed(iter/s)": 0.022393 }, { "epoch": 0.887360970677452, "grad_norm": 2.6165802478790283, "learning_rate": 3.462129275373577e-07, "loss": 0.3905704617500305, "memory(GiB)": 74.62, "step": 1097, "token_acc": 0.9346153846153846, "train_speed(iter/s)": 0.022394 }, { "epoch": 0.8881698685540951, "grad_norm": 1.8218803405761719, "learning_rate": 3.4134155278940594e-07, "loss": 0.42883560061454773, "memory(GiB)": 74.62, "step": 1098, "token_acc": 0.8745247148288974, "train_speed(iter/s)": 0.022394 }, { "epoch": 0.8889787664307381, "grad_norm": 1.979760766029358, "learning_rate": 3.3650348113861864e-07, "loss": 0.36739590764045715, "memory(GiB)": 74.62, "step": 1099, "token_acc": 0.8987341772151899, "train_speed(iter/s)": 0.022394 }, { "epoch": 0.8897876643073812, "grad_norm": 2.169462203979492, "learning_rate": 3.3169874717065564e-07, "loss": 0.43099868297576904, "memory(GiB)": 74.62, "step": 1100, "token_acc": 0.8531746031746031, "train_speed(iter/s)": 0.022395 }, { "epoch": 0.8905965621840243, "grad_norm": 3.057952642440796, "learning_rate": 3.269273852328547e-07, "loss": 0.3875833749771118, "memory(GiB)": 74.62, "step": 1101, "token_acc": 0.9, "train_speed(iter/s)": 0.022395 }, { "epoch": 0.8914054600606673, "grad_norm": 1.8207221031188965, "learning_rate": 3.2218942943399114e-07, "loss": 0.3375704884529114, "memory(GiB)": 74.62, "step": 1102, "token_acc": 0.8617511520737328, "train_speed(iter/s)": 0.022396 }, { "epoch": 0.8922143579373104, "grad_norm": 2.1824142932891846, "learning_rate": 3.174849136440294e-07, "loss": 0.36066344380378723, "memory(GiB)": 74.62, "step": 1103, "token_acc": 0.8494208494208494, "train_speed(iter/s)": 0.022396 }, { "epoch": 0.8930232558139535, "grad_norm": 2.046804428100586, "learning_rate": 3.1281387149388556e-07, "loss": 0.39939042925834656, "memory(GiB)": 74.62, "step": 1104, "token_acc": 0.8765432098765432, "train_speed(iter/s)": 0.022396 }, { "epoch": 0.8938321536905965, "grad_norm": 2.1102182865142822, "learning_rate": 3.081763363751844e-07, "loss": 0.35777053236961365, "memory(GiB)": 74.62, "step": 1105, "token_acc": 0.8803827751196173, "train_speed(iter/s)": 0.022397 }, { "epoch": 0.8946410515672396, "grad_norm": 1.6538591384887695, "learning_rate": 3.0357234144001766e-07, "loss": 0.32706207036972046, "memory(GiB)": 74.62, "step": 1106, "token_acc": 0.8989169675090253, "train_speed(iter/s)": 0.022397 }, { "epoch": 0.8954499494438827, "grad_norm": 2.0191094875335693, "learning_rate": 2.9900191960071544e-07, "loss": 0.3731483817100525, "memory(GiB)": 74.62, "step": 1107, "token_acc": 0.8875, "train_speed(iter/s)": 0.022397 }, { "epoch": 0.8962588473205257, "grad_norm": 1.9920696020126343, "learning_rate": 2.9446510352959924e-07, "loss": 0.3792566657066345, "memory(GiB)": 74.62, "step": 1108, "token_acc": 0.8431372549019608, "train_speed(iter/s)": 0.022398 }, { "epoch": 0.8970677451971688, "grad_norm": 4.2869157791137695, "learning_rate": 2.899619256587605e-07, "loss": 0.4134003520011902, "memory(GiB)": 74.62, "step": 1109, "token_acc": 0.8088888888888889, "train_speed(iter/s)": 0.022398 }, { "epoch": 0.897876643073812, "grad_norm": 1.730612874031067, "learning_rate": 2.854924181798202e-07, "loss": 0.3089058995246887, "memory(GiB)": 74.62, "step": 1110, "token_acc": 0.8550185873605948, "train_speed(iter/s)": 0.022398 }, { "epoch": 0.898685540950455, "grad_norm": 2.020568370819092, "learning_rate": 2.8105661304370256e-07, "loss": 0.33643391728401184, "memory(GiB)": 74.62, "step": 1111, "token_acc": 0.8732876712328768, "train_speed(iter/s)": 0.022399 }, { "epoch": 0.8994944388270981, "grad_norm": 2.182412624359131, "learning_rate": 2.7665454196040665e-07, "loss": 0.39632314443588257, "memory(GiB)": 74.62, "step": 1112, "token_acc": 0.8884297520661157, "train_speed(iter/s)": 0.022399 }, { "epoch": 0.9003033367037412, "grad_norm": 2.2093279361724854, "learning_rate": 2.722862363987749e-07, "loss": 0.43140286207199097, "memory(GiB)": 74.62, "step": 1113, "token_acc": 0.8701298701298701, "train_speed(iter/s)": 0.022399 }, { "epoch": 0.9011122345803843, "grad_norm": 4.873557090759277, "learning_rate": 2.6795172758627584e-07, "loss": 0.40689289569854736, "memory(GiB)": 74.62, "step": 1114, "token_acc": 0.880184331797235, "train_speed(iter/s)": 0.0224 }, { "epoch": 0.9019211324570273, "grad_norm": 2.0055012702941895, "learning_rate": 2.6365104650877716e-07, "loss": 0.3976328372955322, "memory(GiB)": 74.62, "step": 1115, "token_acc": 0.8812260536398467, "train_speed(iter/s)": 0.0224 }, { "epoch": 0.9027300303336704, "grad_norm": 1.9500057697296143, "learning_rate": 2.593842239103206e-07, "loss": 0.40250563621520996, "memory(GiB)": 74.62, "step": 1116, "token_acc": 0.8953168044077136, "train_speed(iter/s)": 0.022401 }, { "epoch": 0.9035389282103135, "grad_norm": 1.8744258880615234, "learning_rate": 2.5515129029290984e-07, "loss": 0.35562485456466675, "memory(GiB)": 74.62, "step": 1117, "token_acc": 0.8726591760299626, "train_speed(iter/s)": 0.022401 }, { "epoch": 0.9043478260869565, "grad_norm": 1.818701982498169, "learning_rate": 2.5095227591628467e-07, "loss": 0.32878684997558594, "memory(GiB)": 74.62, "step": 1118, "token_acc": 0.8952879581151832, "train_speed(iter/s)": 0.022401 }, { "epoch": 0.9051567239635996, "grad_norm": 2.0827207565307617, "learning_rate": 2.4678721079770984e-07, "loss": 0.4192107617855072, "memory(GiB)": 74.62, "step": 1119, "token_acc": 0.8461538461538461, "train_speed(iter/s)": 0.022402 }, { "epoch": 0.9059656218402427, "grad_norm": 2.060375690460205, "learning_rate": 2.4265612471176036e-07, "loss": 0.3454943895339966, "memory(GiB)": 74.62, "step": 1120, "token_acc": 0.9144981412639405, "train_speed(iter/s)": 0.022402 }, { "epoch": 0.9067745197168857, "grad_norm": 1.8084218502044678, "learning_rate": 2.385590471901045e-07, "loss": 0.31142184138298035, "memory(GiB)": 74.62, "step": 1121, "token_acc": 0.8678571428571429, "train_speed(iter/s)": 0.022402 }, { "epoch": 0.9075834175935288, "grad_norm": 2.012327194213867, "learning_rate": 2.3449600752129598e-07, "loss": 0.3716868460178375, "memory(GiB)": 74.62, "step": 1122, "token_acc": 0.8819672131147541, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.9083923154701719, "grad_norm": 2.0449485778808594, "learning_rate": 2.3046703475056554e-07, "loss": 0.3710024356842041, "memory(GiB)": 74.62, "step": 1123, "token_acc": 0.8555555555555555, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.9092012133468149, "grad_norm": 2.0092179775238037, "learning_rate": 2.2647215767961083e-07, "loss": 0.3403990864753723, "memory(GiB)": 74.62, "step": 1124, "token_acc": 0.925, "train_speed(iter/s)": 0.022403 }, { "epoch": 0.910010111223458, "grad_norm": 2.1806256771087646, "learning_rate": 2.2251140486639068e-07, "loss": 0.37321048974990845, "memory(GiB)": 74.62, "step": 1125, "token_acc": 0.9308510638297872, "train_speed(iter/s)": 0.022404 }, { "epoch": 0.9108190091001012, "grad_norm": 2.1333301067352295, "learning_rate": 2.1858480462492283e-07, "loss": 0.37797796726226807, "memory(GiB)": 74.62, "step": 1126, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022404 }, { "epoch": 0.9116279069767442, "grad_norm": 2.240083694458008, "learning_rate": 2.1469238502507926e-07, "loss": 0.3420672118663788, "memory(GiB)": 74.62, "step": 1127, "token_acc": 0.889795918367347, "train_speed(iter/s)": 0.022404 }, { "epoch": 0.9124368048533873, "grad_norm": 2.032658338546753, "learning_rate": 2.1083417389238858e-07, "loss": 0.3892640471458435, "memory(GiB)": 74.62, "step": 1128, "token_acc": 0.8831168831168831, "train_speed(iter/s)": 0.022405 }, { "epoch": 0.9132457027300304, "grad_norm": 2.2067453861236572, "learning_rate": 2.0701019880783324e-07, "loss": 0.33542943000793457, "memory(GiB)": 74.62, "step": 1129, "token_acc": 0.8740458015267175, "train_speed(iter/s)": 0.022405 }, { "epoch": 0.9140546006066734, "grad_norm": 2.052781343460083, "learning_rate": 2.0322048710765485e-07, "loss": 0.3520893454551697, "memory(GiB)": 74.62, "step": 1130, "token_acc": 0.8688524590163934, "train_speed(iter/s)": 0.022405 }, { "epoch": 0.9148634984833165, "grad_norm": 7.5011773109436035, "learning_rate": 1.9946506588315818e-07, "loss": 0.3370997905731201, "memory(GiB)": 74.62, "step": 1131, "token_acc": 0.8984771573604061, "train_speed(iter/s)": 0.022406 }, { "epoch": 0.9156723963599596, "grad_norm": 2.2244808673858643, "learning_rate": 1.957439619805196e-07, "loss": 0.3234095871448517, "memory(GiB)": 74.62, "step": 1132, "token_acc": 0.8681818181818182, "train_speed(iter/s)": 0.022406 }, { "epoch": 0.9164812942366026, "grad_norm": 1.946089506149292, "learning_rate": 1.9205720200058843e-07, "loss": 0.39126190543174744, "memory(GiB)": 74.62, "step": 1133, "token_acc": 0.909433962264151, "train_speed(iter/s)": 0.022406 }, { "epoch": 0.9172901921132457, "grad_norm": 11.597419738769531, "learning_rate": 1.8840481229870644e-07, "loss": 0.37995028495788574, "memory(GiB)": 74.62, "step": 1134, "token_acc": 0.8557046979865772, "train_speed(iter/s)": 0.022407 }, { "epoch": 0.9180990899898888, "grad_norm": 1.789217233657837, "learning_rate": 1.84786818984512e-07, "loss": 0.3505871295928955, "memory(GiB)": 74.62, "step": 1135, "token_acc": 0.9249146757679181, "train_speed(iter/s)": 0.022407 }, { "epoch": 0.9189079878665318, "grad_norm": 1.919080138206482, "learning_rate": 1.8120324792175569e-07, "loss": 0.3749197721481323, "memory(GiB)": 74.62, "step": 1136, "token_acc": 0.886435331230284, "train_speed(iter/s)": 0.022407 }, { "epoch": 0.9197168857431749, "grad_norm": 2.741631269454956, "learning_rate": 1.776541247281177e-07, "loss": 0.3757126033306122, "memory(GiB)": 74.62, "step": 1137, "token_acc": 0.8934010152284264, "train_speed(iter/s)": 0.022408 }, { "epoch": 0.920525783619818, "grad_norm": 1.856645107269287, "learning_rate": 1.7413947477501913e-07, "loss": 0.3616572320461273, "memory(GiB)": 74.62, "step": 1138, "token_acc": 0.9, "train_speed(iter/s)": 0.022408 }, { "epoch": 0.921334681496461, "grad_norm": 2.687711477279663, "learning_rate": 1.7065932318744704e-07, "loss": 0.3780667185783386, "memory(GiB)": 74.62, "step": 1139, "token_acc": 0.8723404255319149, "train_speed(iter/s)": 0.022409 }, { "epoch": 0.9221435793731041, "grad_norm": 1.6964043378829956, "learning_rate": 1.6721369484377082e-07, "loss": 0.35959312319755554, "memory(GiB)": 74.62, "step": 1140, "token_acc": 0.8790849673202614, "train_speed(iter/s)": 0.022409 }, { "epoch": 0.9229524772497473, "grad_norm": 2.040339469909668, "learning_rate": 1.6380261437556666e-07, "loss": 0.34360718727111816, "memory(GiB)": 74.62, "step": 1141, "token_acc": 0.9063829787234042, "train_speed(iter/s)": 0.022409 }, { "epoch": 0.9237613751263903, "grad_norm": 1.9790493249893188, "learning_rate": 1.6042610616743782e-07, "loss": 0.36330220103263855, "memory(GiB)": 74.62, "step": 1142, "token_acc": 0.8585858585858586, "train_speed(iter/s)": 0.02241 }, { "epoch": 0.9245702730030334, "grad_norm": 1.878999948501587, "learning_rate": 1.5708419435684463e-07, "loss": 0.3349642753601074, "memory(GiB)": 74.62, "step": 1143, "token_acc": 0.8650519031141869, "train_speed(iter/s)": 0.02241 }, { "epoch": 0.9253791708796765, "grad_norm": 1.9973299503326416, "learning_rate": 1.5377690283392977e-07, "loss": 0.3546566963195801, "memory(GiB)": 74.62, "step": 1144, "token_acc": 0.8781725888324873, "train_speed(iter/s)": 0.02241 }, { "epoch": 0.9261880687563195, "grad_norm": 1.9398893117904663, "learning_rate": 1.505042552413466e-07, "loss": 0.34872984886169434, "memory(GiB)": 74.62, "step": 1145, "token_acc": 0.8742138364779874, "train_speed(iter/s)": 0.022411 }, { "epoch": 0.9269969666329626, "grad_norm": 1.9519524574279785, "learning_rate": 1.4726627497409274e-07, "loss": 0.3644063472747803, "memory(GiB)": 74.62, "step": 1146, "token_acc": 0.8945147679324894, "train_speed(iter/s)": 0.022411 }, { "epoch": 0.9278058645096057, "grad_norm": 2.4077093601226807, "learning_rate": 1.440629851793407e-07, "loss": 0.42128363251686096, "memory(GiB)": 74.62, "step": 1147, "token_acc": 0.8775510204081632, "train_speed(iter/s)": 0.022411 }, { "epoch": 0.9286147623862487, "grad_norm": 2.0536437034606934, "learning_rate": 1.408944087562736e-07, "loss": 0.3700520396232605, "memory(GiB)": 74.62, "step": 1148, "token_acc": 0.8464566929133859, "train_speed(iter/s)": 0.022412 }, { "epoch": 0.9294236602628918, "grad_norm": 2.154677391052246, "learning_rate": 1.3776056835592132e-07, "loss": 0.3489128351211548, "memory(GiB)": 74.62, "step": 1149, "token_acc": 0.8795811518324608, "train_speed(iter/s)": 0.022412 }, { "epoch": 0.9302325581395349, "grad_norm": 1.8740899562835693, "learning_rate": 1.346614863809953e-07, "loss": 0.36078181862831116, "memory(GiB)": 74.62, "step": 1150, "token_acc": 0.8864468864468864, "train_speed(iter/s)": 0.022412 }, { "epoch": 0.9310414560161779, "grad_norm": 2.146127700805664, "learning_rate": 1.315971849857356e-07, "loss": 0.3723437190055847, "memory(GiB)": 74.62, "step": 1151, "token_acc": 0.8691275167785235, "train_speed(iter/s)": 0.022413 }, { "epoch": 0.931850353892821, "grad_norm": 1.787015438079834, "learning_rate": 1.2856768607574565e-07, "loss": 0.3393116891384125, "memory(GiB)": 74.62, "step": 1152, "token_acc": 0.9015544041450777, "train_speed(iter/s)": 0.022413 }, { "epoch": 0.9326592517694641, "grad_norm": 2.211394786834717, "learning_rate": 1.255730113078385e-07, "loss": 0.34008848667144775, "memory(GiB)": 74.62, "step": 1153, "token_acc": 0.8700787401574803, "train_speed(iter/s)": 0.022413 }, { "epoch": 0.9334681496461071, "grad_norm": 1.7942789793014526, "learning_rate": 1.2261318208988294e-07, "loss": 0.31053483486175537, "memory(GiB)": 74.62, "step": 1154, "token_acc": 0.8535825545171339, "train_speed(iter/s)": 0.022414 }, { "epoch": 0.9342770475227502, "grad_norm": 2.598997116088867, "learning_rate": 1.1968821958064702e-07, "loss": 0.4369804859161377, "memory(GiB)": 74.62, "step": 1155, "token_acc": 0.8713692946058091, "train_speed(iter/s)": 0.022414 }, { "epoch": 0.9350859453993934, "grad_norm": 1.7106472253799438, "learning_rate": 1.1679814468965211e-07, "loss": 0.3438988924026489, "memory(GiB)": 74.62, "step": 1156, "token_acc": 0.8736059479553904, "train_speed(iter/s)": 0.022414 }, { "epoch": 0.9358948432760364, "grad_norm": 1.8687455654144287, "learning_rate": 1.1394297807701737e-07, "loss": 0.3768293261528015, "memory(GiB)": 74.62, "step": 1157, "token_acc": 0.9270833333333334, "train_speed(iter/s)": 0.022415 }, { "epoch": 0.9367037411526795, "grad_norm": 1.5831663608551025, "learning_rate": 1.111227401533166e-07, "loss": 0.3412172496318817, "memory(GiB)": 74.62, "step": 1158, "token_acc": 0.875, "train_speed(iter/s)": 0.022415 }, { "epoch": 0.9375126390293226, "grad_norm": 1.8993335962295532, "learning_rate": 1.083374510794305e-07, "loss": 0.4136160910129547, "memory(GiB)": 74.62, "step": 1159, "token_acc": 0.9094488188976378, "train_speed(iter/s)": 0.022415 }, { "epoch": 0.9383215369059656, "grad_norm": 3.2496023178100586, "learning_rate": 1.0558713076640415e-07, "loss": 0.3755384087562561, "memory(GiB)": 74.62, "step": 1160, "token_acc": 0.9172932330827067, "train_speed(iter/s)": 0.022416 }, { "epoch": 0.9391304347826087, "grad_norm": 2.1333253383636475, "learning_rate": 1.028717988753014e-07, "loss": 0.3936523199081421, "memory(GiB)": 74.62, "step": 1161, "token_acc": 0.8974358974358975, "train_speed(iter/s)": 0.022416 }, { "epoch": 0.9399393326592518, "grad_norm": 2.6341114044189453, "learning_rate": 1.0019147481706626e-07, "loss": 0.40892741084098816, "memory(GiB)": 74.62, "step": 1162, "token_acc": 0.9217391304347826, "train_speed(iter/s)": 0.022417 }, { "epoch": 0.9407482305358948, "grad_norm": 1.8160382509231567, "learning_rate": 9.754617775238562e-08, "loss": 0.36974000930786133, "memory(GiB)": 74.62, "step": 1163, "token_acc": 0.8614457831325302, "train_speed(iter/s)": 0.022417 }, { "epoch": 0.9415571284125379, "grad_norm": 2.1739790439605713, "learning_rate": 9.493592659155004e-08, "loss": 0.3862905502319336, "memory(GiB)": 74.62, "step": 1164, "token_acc": 0.8700787401574803, "train_speed(iter/s)": 0.022417 }, { "epoch": 0.942366026289181, "grad_norm": 2.973860502243042, "learning_rate": 9.236073999431939e-08, "loss": 0.4268924593925476, "memory(GiB)": 74.62, "step": 1165, "token_acc": 0.8847736625514403, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.943174924165824, "grad_norm": 2.2699947357177734, "learning_rate": 8.98206363697901e-08, "loss": 0.3827816843986511, "memory(GiB)": 74.62, "step": 1166, "token_acc": 0.8765432098765432, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.9439838220424671, "grad_norm": 2.014028549194336, "learning_rate": 8.731563387626096e-08, "loss": 0.3976903259754181, "memory(GiB)": 74.62, "step": 1167, "token_acc": 0.8338658146964856, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.9447927199191102, "grad_norm": 2.3635129928588867, "learning_rate": 8.484575042110699e-08, "loss": 0.3837153911590576, "memory(GiB)": 74.62, "step": 1168, "token_acc": 0.8766666666666667, "train_speed(iter/s)": 0.022418 }, { "epoch": 0.9456016177957532, "grad_norm": 2.5257232189178467, "learning_rate": 8.241100366064902e-08, "loss": 0.37266969680786133, "memory(GiB)": 74.62, "step": 1169, "token_acc": 0.8828828828828829, "train_speed(iter/s)": 0.022419 }, { "epoch": 0.9464105156723963, "grad_norm": 2.1283090114593506, "learning_rate": 8.001141100002885e-08, "loss": 0.32720375061035156, "memory(GiB)": 74.62, "step": 1170, "token_acc": 0.8850174216027874, "train_speed(iter/s)": 0.022419 }, { "epoch": 0.9472194135490394, "grad_norm": 2.261035919189453, "learning_rate": 7.764698959308315e-08, "loss": 0.38027650117874146, "memory(GiB)": 74.62, "step": 1171, "token_acc": 0.8956521739130435, "train_speed(iter/s)": 0.022419 }, { "epoch": 0.9480283114256826, "grad_norm": 1.921704888343811, "learning_rate": 7.531775634222138e-08, "loss": 0.37682783603668213, "memory(GiB)": 74.62, "step": 1172, "token_acc": 0.8680851063829788, "train_speed(iter/s)": 0.02242 }, { "epoch": 0.9488372093023256, "grad_norm": 2.031587600708008, "learning_rate": 7.302372789830702e-08, "loss": 0.3404289484024048, "memory(GiB)": 74.62, "step": 1173, "token_acc": 0.8355555555555556, "train_speed(iter/s)": 0.02242 }, { "epoch": 0.9496461071789687, "grad_norm": 1.8540045022964478, "learning_rate": 7.076492066053486e-08, "loss": 0.3675205111503601, "memory(GiB)": 74.62, "step": 1174, "token_acc": 0.8758389261744967, "train_speed(iter/s)": 0.02242 }, { "epoch": 0.9504550050556118, "grad_norm": 2.207390546798706, "learning_rate": 6.854135077631774e-08, "loss": 0.3710861802101135, "memory(GiB)": 74.62, "step": 1175, "token_acc": 0.8367875647668394, "train_speed(iter/s)": 0.022421 }, { "epoch": 0.9512639029322548, "grad_norm": 2.1160874366760254, "learning_rate": 6.635303414116834e-08, "loss": 0.375140517950058, "memory(GiB)": 74.62, "step": 1176, "token_acc": 0.8616600790513834, "train_speed(iter/s)": 0.022421 }, { "epoch": 0.9520728008088979, "grad_norm": 1.8097771406173706, "learning_rate": 6.419998639858538e-08, "loss": 0.33210816979408264, "memory(GiB)": 74.62, "step": 1177, "token_acc": 0.9314079422382672, "train_speed(iter/s)": 0.022421 }, { "epoch": 0.952881698685541, "grad_norm": 1.6278916597366333, "learning_rate": 6.208222293994425e-08, "loss": 0.3717727065086365, "memory(GiB)": 74.62, "step": 1178, "token_acc": 0.8639455782312925, "train_speed(iter/s)": 0.022422 }, { "epoch": 0.953690596562184, "grad_norm": 2.6115875244140625, "learning_rate": 5.999975890438436e-08, "loss": 0.35759437084198, "memory(GiB)": 74.62, "step": 1179, "token_acc": 0.9078498293515358, "train_speed(iter/s)": 0.022422 }, { "epoch": 0.9544994944388271, "grad_norm": 2.0658047199249268, "learning_rate": 5.79526091787036e-08, "loss": 0.37362658977508545, "memory(GiB)": 74.62, "step": 1180, "token_acc": 0.8454545454545455, "train_speed(iter/s)": 0.022422 }, { "epoch": 0.9553083923154702, "grad_norm": 2.23612117767334, "learning_rate": 5.594078839724793e-08, "loss": 0.37239736318588257, "memory(GiB)": 74.62, "step": 1181, "token_acc": 0.855072463768116, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.9561172901921132, "grad_norm": 1.8740304708480835, "learning_rate": 5.396431094181198e-08, "loss": 0.3480920195579529, "memory(GiB)": 74.62, "step": 1182, "token_acc": 0.8709677419354839, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.9569261880687563, "grad_norm": 2.2320539951324463, "learning_rate": 5.202319094153252e-08, "loss": 0.3483563959598541, "memory(GiB)": 74.62, "step": 1183, "token_acc": 0.8866666666666667, "train_speed(iter/s)": 0.022423 }, { "epoch": 0.9577350859453994, "grad_norm": 1.7620937824249268, "learning_rate": 5.011744227278625e-08, "loss": 0.33139705657958984, "memory(GiB)": 74.62, "step": 1184, "token_acc": 0.9110169491525424, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.9585439838220424, "grad_norm": 1.869081974029541, "learning_rate": 4.824707855909605e-08, "loss": 0.3572564125061035, "memory(GiB)": 74.62, "step": 1185, "token_acc": 0.8842592592592593, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.9593528816986855, "grad_norm": 2.5178749561309814, "learning_rate": 4.6412113171028226e-08, "loss": 0.39302319288253784, "memory(GiB)": 74.62, "step": 1186, "token_acc": 0.9107142857142857, "train_speed(iter/s)": 0.022424 }, { "epoch": 0.9601617795753287, "grad_norm": 2.3168158531188965, "learning_rate": 4.461255922609986e-08, "loss": 0.3867931365966797, "memory(GiB)": 74.62, "step": 1187, "token_acc": 0.8819444444444444, "train_speed(iter/s)": 0.022425 }, { "epoch": 0.9609706774519717, "grad_norm": 2.4859671592712402, "learning_rate": 4.2848429588683295e-08, "loss": 0.3992939591407776, "memory(GiB)": 74.62, "step": 1188, "token_acc": 0.8392857142857143, "train_speed(iter/s)": 0.022425 }, { "epoch": 0.9617795753286148, "grad_norm": 3.0036697387695312, "learning_rate": 4.111973686991677e-08, "loss": 0.49971675872802734, "memory(GiB)": 74.62, "step": 1189, "token_acc": 0.8101694915254237, "train_speed(iter/s)": 0.022425 }, { "epoch": 0.9625884732052579, "grad_norm": 2.2183077335357666, "learning_rate": 3.9426493427611177e-08, "loss": 0.38460367918014526, "memory(GiB)": 74.62, "step": 1190, "token_acc": 0.8319327731092437, "train_speed(iter/s)": 0.022426 }, { "epoch": 0.9633973710819009, "grad_norm": 2.1675848960876465, "learning_rate": 3.776871136616289e-08, "loss": 0.4845053553581238, "memory(GiB)": 74.62, "step": 1191, "token_acc": 0.8, "train_speed(iter/s)": 0.022426 }, { "epoch": 0.964206268958544, "grad_norm": 1.8861103057861328, "learning_rate": 3.6146402536468285e-08, "loss": 0.40070268511772156, "memory(GiB)": 74.62, "step": 1192, "token_acc": 0.8436363636363636, "train_speed(iter/s)": 0.022426 }, { "epoch": 0.9650151668351871, "grad_norm": 2.5333354473114014, "learning_rate": 3.455957853583769e-08, "loss": 0.3965553343296051, "memory(GiB)": 74.62, "step": 1193, "token_acc": 0.8791208791208791, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.9658240647118301, "grad_norm": 2.1063308715820312, "learning_rate": 3.3008250707913246e-08, "loss": 0.35347798466682434, "memory(GiB)": 74.62, "step": 1194, "token_acc": 0.8908296943231441, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.9666329625884732, "grad_norm": 2.082961320877075, "learning_rate": 3.14924301425884e-08, "loss": 0.3923337757587433, "memory(GiB)": 74.62, "step": 1195, "token_acc": 0.8774834437086093, "train_speed(iter/s)": 0.022427 }, { "epoch": 0.9674418604651163, "grad_norm": 1.8798726797103882, "learning_rate": 3.0012127675925206e-08, "loss": 0.35899072885513306, "memory(GiB)": 74.62, "step": 1196, "token_acc": 0.8819444444444444, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.9682507583417593, "grad_norm": 13.926689147949219, "learning_rate": 2.8567353890082696e-08, "loss": 0.3928597569465637, "memory(GiB)": 74.62, "step": 1197, "token_acc": 0.8653846153846154, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.9690596562184024, "grad_norm": 1.9069607257843018, "learning_rate": 2.7158119113234738e-08, "loss": 0.344777375459671, "memory(GiB)": 74.62, "step": 1198, "token_acc": 0.9090909090909091, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.9698685540950455, "grad_norm": 2.385317087173462, "learning_rate": 2.5784433419501763e-08, "loss": 0.35486793518066406, "memory(GiB)": 74.62, "step": 1199, "token_acc": 0.8652849740932642, "train_speed(iter/s)": 0.022428 }, { "epoch": 0.9706774519716885, "grad_norm": 2.183742046356201, "learning_rate": 2.4446306628875814e-08, "loss": 0.3595341444015503, "memory(GiB)": 74.62, "step": 1200, "token_acc": 0.8879310344827587, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.9714863498483316, "grad_norm": 2.103287935256958, "learning_rate": 2.3143748307150605e-08, "loss": 0.39095747470855713, "memory(GiB)": 74.62, "step": 1201, "token_acc": 0.8861788617886179, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.9722952477249748, "grad_norm": 2.1582367420196533, "learning_rate": 2.1876767765853237e-08, "loss": 0.3016042113304138, "memory(GiB)": 74.62, "step": 1202, "token_acc": 0.8571428571428571, "train_speed(iter/s)": 0.022429 }, { "epoch": 0.9731041456016178, "grad_norm": 2.0449063777923584, "learning_rate": 2.0645374062179257e-08, "loss": 0.36447232961654663, "memory(GiB)": 74.62, "step": 1203, "token_acc": 0.8480392156862745, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.9739130434782609, "grad_norm": 3.5183372497558594, "learning_rate": 1.9449575998924387e-08, "loss": 0.43112486600875854, "memory(GiB)": 74.62, "step": 1204, "token_acc": 0.8607594936708861, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.974721941354904, "grad_norm": 2.14886736869812, "learning_rate": 1.8289382124426214e-08, "loss": 0.38468360900878906, "memory(GiB)": 74.62, "step": 1205, "token_acc": 0.8654545454545455, "train_speed(iter/s)": 0.02243 }, { "epoch": 0.975530839231547, "grad_norm": 2.688023090362549, "learning_rate": 1.7164800732498156e-08, "loss": 0.3501737713813782, "memory(GiB)": 74.62, "step": 1206, "token_acc": 0.8855421686746988, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.9763397371081901, "grad_norm": 2.0248029232025146, "learning_rate": 1.6075839862374487e-08, "loss": 0.31531471014022827, "memory(GiB)": 74.62, "step": 1207, "token_acc": 0.865979381443299, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.9771486349848332, "grad_norm": 3.5692150592803955, "learning_rate": 1.5022507298649848e-08, "loss": 0.3675447106361389, "memory(GiB)": 74.62, "step": 1208, "token_acc": 0.8636363636363636, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.9779575328614762, "grad_norm": 1.9649704694747925, "learning_rate": 1.400481057122538e-08, "loss": 0.38956940174102783, "memory(GiB)": 74.62, "step": 1209, "token_acc": 0.8914473684210527, "train_speed(iter/s)": 0.022431 }, { "epoch": 0.9787664307381193, "grad_norm": 2.3865509033203125, "learning_rate": 1.3022756955254901e-08, "loss": 0.3772105574607849, "memory(GiB)": 74.62, "step": 1210, "token_acc": 0.8963963963963963, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.9795753286147624, "grad_norm": 9.275412559509277, "learning_rate": 1.207635347108993e-08, "loss": 0.39102572202682495, "memory(GiB)": 74.62, "step": 1211, "token_acc": 0.8317757009345794, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.9803842264914054, "grad_norm": 2.0313827991485596, "learning_rate": 1.1165606884234182e-08, "loss": 0.37432482838630676, "memory(GiB)": 74.62, "step": 1212, "token_acc": 0.875, "train_speed(iter/s)": 0.022432 }, { "epoch": 0.9811931243680485, "grad_norm": 1.960199236869812, "learning_rate": 1.0290523705291932e-08, "loss": 0.3433490991592407, "memory(GiB)": 74.62, "step": 1213, "token_acc": 0.8885714285714286, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.9820020222446916, "grad_norm": 1.8676866292953491, "learning_rate": 9.451110189923063e-09, "loss": 0.3818192183971405, "memory(GiB)": 74.62, "step": 1214, "token_acc": 0.8989547038327527, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.9828109201213346, "grad_norm": 2.4343481063842773, "learning_rate": 8.647372338795867e-09, "loss": 0.4184320569038391, "memory(GiB)": 74.62, "step": 1215, "token_acc": 0.8434782608695652, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.9836198179979777, "grad_norm": 2.3009696006774902, "learning_rate": 7.8793158975482e-09, "loss": 0.40056365728378296, "memory(GiB)": 74.62, "step": 1216, "token_acc": 0.8210526315789474, "train_speed(iter/s)": 0.022433 }, { "epoch": 0.9844287158746208, "grad_norm": 4.763977527618408, "learning_rate": 7.146946356743068e-09, "loss": 0.37496888637542725, "memory(GiB)": 74.62, "step": 1217, "token_acc": 0.9244444444444444, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.985237613751264, "grad_norm": 2.2471978664398193, "learning_rate": 6.450268951830319e-09, "loss": 0.3727502226829529, "memory(GiB)": 74.62, "step": 1218, "token_acc": 0.819327731092437, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.986046511627907, "grad_norm": 1.7557698488235474, "learning_rate": 5.789288663110015e-09, "loss": 0.32791298627853394, "memory(GiB)": 74.62, "step": 1219, "token_acc": 0.8659420289855072, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.9868554095045501, "grad_norm": 2.5717544555664062, "learning_rate": 5.164010215695792e-09, "loss": 0.37463176250457764, "memory(GiB)": 74.62, "step": 1220, "token_acc": 0.8560885608856088, "train_speed(iter/s)": 0.022434 }, { "epoch": 0.9876643073811932, "grad_norm": 3.5073463916778564, "learning_rate": 4.574438079480992e-09, "loss": 0.32435593008995056, "memory(GiB)": 74.62, "step": 1221, "token_acc": 0.8685446009389671, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.9884732052578362, "grad_norm": 1.9765585660934448, "learning_rate": 4.020576469108139e-09, "loss": 0.38409414887428284, "memory(GiB)": 74.62, "step": 1222, "token_acc": 0.8888888888888888, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.9892821031344793, "grad_norm": 1.8832907676696777, "learning_rate": 3.502429343937297e-09, "loss": 0.3716433644294739, "memory(GiB)": 74.62, "step": 1223, "token_acc": 0.8876811594202898, "train_speed(iter/s)": 0.022435 }, { "epoch": 0.9900910010111224, "grad_norm": 1.9831905364990234, "learning_rate": 3.020000408018864e-09, "loss": 0.3268841505050659, "memory(GiB)": 74.62, "step": 1224, "token_acc": 0.9003831417624522, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.9908998988877654, "grad_norm": 2.281235456466675, "learning_rate": 2.573293110065822e-09, "loss": 0.33263713121414185, "memory(GiB)": 74.62, "step": 1225, "token_acc": 0.8669527896995708, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.9917087967644085, "grad_norm": 2.3608005046844482, "learning_rate": 2.162310643430976e-09, "loss": 0.39835768938064575, "memory(GiB)": 74.62, "step": 1226, "token_acc": 0.8962655601659751, "train_speed(iter/s)": 0.022436 }, { "epoch": 0.9925176946410516, "grad_norm": 2.6654913425445557, "learning_rate": 1.7870559460814173e-09, "loss": 0.4261908531188965, "memory(GiB)": 74.62, "step": 1227, "token_acc": 0.8935574229691877, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.9933265925176946, "grad_norm": 1.8069103956222534, "learning_rate": 1.447531700580207e-09, "loss": 0.3241886496543884, "memory(GiB)": 74.62, "step": 1228, "token_acc": 0.9383886255924171, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.9941354903943377, "grad_norm": 2.0414981842041016, "learning_rate": 1.1437403340652797e-09, "loss": 0.4070656895637512, "memory(GiB)": 74.62, "step": 1229, "token_acc": 0.8465753424657534, "train_speed(iter/s)": 0.022437 }, { "epoch": 0.9949443882709808, "grad_norm": 2.6518869400024414, "learning_rate": 8.756840182344573e-10, "loss": 0.3987523317337036, "memory(GiB)": 74.62, "step": 1230, "token_acc": 0.8187134502923976, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.9957532861476238, "grad_norm": 1.9646754264831543, "learning_rate": 6.433646693265738e-10, "loss": 0.32140272855758667, "memory(GiB)": 74.62, "step": 1231, "token_acc": 0.9049773755656109, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.9965621840242669, "grad_norm": 2.0284359455108643, "learning_rate": 4.4678394810981904e-10, "loss": 0.38582661747932434, "memory(GiB)": 74.62, "step": 1232, "token_acc": 0.8961937716262975, "train_speed(iter/s)": 0.022438 }, { "epoch": 0.9973710819009101, "grad_norm": 1.9221043586730957, "learning_rate": 2.8594325987119086e-10, "loss": 0.3542518615722656, "memory(GiB)": 74.62, "step": 1233, "token_acc": 0.9240506329113924, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.9981799797775531, "grad_norm": 2.5311009883880615, "learning_rate": 1.6084375440317268e-10, "loss": 0.44038695096969604, "memory(GiB)": 74.62, "step": 1234, "token_acc": 0.8537735849056604, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.9989888776541962, "grad_norm": 2.092437505722046, "learning_rate": 7.148632599707217e-11, "loss": 0.3628859221935272, "memory(GiB)": 74.62, "step": 1235, "token_acc": 0.8671328671328671, "train_speed(iter/s)": 0.022439 }, { "epoch": 0.9997977755308393, "grad_norm": 2.2749087810516357, "learning_rate": 1.787161343858035e-11, "loss": 0.4479348063468933, "memory(GiB)": 74.62, "step": 1236, "token_acc": 0.8859934853420195, "train_speed(iter/s)": 0.02244 }, { "epoch": 1.0, "grad_norm": 4.017106056213379, "learning_rate": 0.0, "loss": 0.41172629594802856, "memory(GiB)": 74.62, "step": 1237, "token_acc": 0.8541666666666666, "train_speed(iter/s)": 0.022445 }, { "epoch": 1.0, "eval_loss": 0.3615947365760803, "eval_runtime": 428.6167, "eval_samples_per_second": 3.728, "eval_steps_per_second": 0.117, "eval_token_acc": 0.8760036017108126, "step": 1237 } ], "logging_steps": 1, "max_steps": 1237, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 618, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.135344722858895e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }