luca0621's picture
SFT AMEX checkpoint 20260414_000848
b169aa7 verified
{
"best_global_step": 1237,
"best_metric": 0.36159474,
"best_model_checkpoint": "/workspace/checkpoint/gui_exp/sft_amex/v0-20260413_084132/checkpoint-1237",
"epoch": 1.0,
"eval_steps": 500,
"global_step": 1237,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0008088978766430738,
"grad_norm": 64.78370666503906,
"learning_rate": 1.6129032258064518e-07,
"loss": 1.7486257553100586,
"memory(GiB)": 60.95,
"step": 1,
"token_acc": 0.6546184738955824,
"train_speed(iter/s)": 0.017239
},
{
"epoch": 0.0016177957532861476,
"grad_norm": 68.59134674072266,
"learning_rate": 3.2258064516129035e-07,
"loss": 1.777339220046997,
"memory(GiB)": 71.88,
"step": 2,
"token_acc": 0.6341463414634146,
"train_speed(iter/s)": 0.019622
},
{
"epoch": 0.0024266936299292214,
"grad_norm": 64.73936462402344,
"learning_rate": 4.838709677419355e-07,
"loss": 1.8061851263046265,
"memory(GiB)": 71.9,
"step": 3,
"token_acc": 0.6195652173913043,
"train_speed(iter/s)": 0.020613
},
{
"epoch": 0.0032355915065722953,
"grad_norm": 65.572998046875,
"learning_rate": 6.451612903225807e-07,
"loss": 1.807295560836792,
"memory(GiB)": 72.45,
"step": 4,
"token_acc": 0.5654205607476636,
"train_speed(iter/s)": 0.021122
},
{
"epoch": 0.004044489383215369,
"grad_norm": 65.35359191894531,
"learning_rate": 8.064516129032258e-07,
"loss": 1.5166772603988647,
"memory(GiB)": 72.45,
"step": 5,
"token_acc": 0.6327433628318584,
"train_speed(iter/s)": 0.021426
},
{
"epoch": 0.004853387259858443,
"grad_norm": 57.624046325683594,
"learning_rate": 9.67741935483871e-07,
"loss": 1.5725659132003784,
"memory(GiB)": 72.45,
"step": 6,
"token_acc": 0.6591928251121076,
"train_speed(iter/s)": 0.021622
},
{
"epoch": 0.005662285136501517,
"grad_norm": 41.641319274902344,
"learning_rate": 1.1290322580645162e-06,
"loss": 1.6281558275222778,
"memory(GiB)": 72.45,
"step": 7,
"token_acc": 0.6067415730337079,
"train_speed(iter/s)": 0.02179
},
{
"epoch": 0.006471183013144591,
"grad_norm": 40.06605529785156,
"learning_rate": 1.2903225806451614e-06,
"loss": 1.6883149147033691,
"memory(GiB)": 72.45,
"step": 8,
"token_acc": 0.6423841059602649,
"train_speed(iter/s)": 0.021906
},
{
"epoch": 0.007280080889787664,
"grad_norm": 23.832304000854492,
"learning_rate": 1.4516129032258066e-06,
"loss": 1.4600856304168701,
"memory(GiB)": 73.8,
"step": 9,
"token_acc": 0.6683417085427136,
"train_speed(iter/s)": 0.022001
},
{
"epoch": 0.008088978766430738,
"grad_norm": 19.52027702331543,
"learning_rate": 1.6129032258064516e-06,
"loss": 1.178048014640808,
"memory(GiB)": 73.8,
"step": 10,
"token_acc": 0.6995515695067265,
"train_speed(iter/s)": 0.022088
},
{
"epoch": 0.008897876643073812,
"grad_norm": 22.565189361572266,
"learning_rate": 1.774193548387097e-06,
"loss": 1.225492000579834,
"memory(GiB)": 73.8,
"step": 11,
"token_acc": 0.680327868852459,
"train_speed(iter/s)": 0.022154
},
{
"epoch": 0.009706774519716885,
"grad_norm": 15.462038040161133,
"learning_rate": 1.935483870967742e-06,
"loss": 1.0573687553405762,
"memory(GiB)": 73.8,
"step": 12,
"token_acc": 0.7576923076923077,
"train_speed(iter/s)": 0.02221
},
{
"epoch": 0.010515672396359959,
"grad_norm": 14.245152473449707,
"learning_rate": 2.096774193548387e-06,
"loss": 1.0721827745437622,
"memory(GiB)": 73.8,
"step": 13,
"token_acc": 0.7405857740585774,
"train_speed(iter/s)": 0.022257
},
{
"epoch": 0.011324570273003034,
"grad_norm": 8.204596519470215,
"learning_rate": 2.2580645161290324e-06,
"loss": 0.8753397464752197,
"memory(GiB)": 73.8,
"step": 14,
"token_acc": 0.7516778523489933,
"train_speed(iter/s)": 0.022297
},
{
"epoch": 0.012133468149646108,
"grad_norm": 11.066507339477539,
"learning_rate": 2.4193548387096776e-06,
"loss": 0.9424616098403931,
"memory(GiB)": 73.8,
"step": 15,
"token_acc": 0.7411003236245954,
"train_speed(iter/s)": 0.022325
},
{
"epoch": 0.012942366026289181,
"grad_norm": 8.134406089782715,
"learning_rate": 2.580645161290323e-06,
"loss": 0.9165105819702148,
"memory(GiB)": 73.8,
"step": 16,
"token_acc": 0.7902439024390244,
"train_speed(iter/s)": 0.022352
},
{
"epoch": 0.013751263902932255,
"grad_norm": 14.990755081176758,
"learning_rate": 2.7419354838709676e-06,
"loss": 0.8677236437797546,
"memory(GiB)": 73.8,
"step": 17,
"token_acc": 0.7635658914728682,
"train_speed(iter/s)": 0.02238
},
{
"epoch": 0.014560161779575328,
"grad_norm": 5.65842342376709,
"learning_rate": 2.903225806451613e-06,
"loss": 0.7795729637145996,
"memory(GiB)": 73.8,
"step": 18,
"token_acc": 0.7739938080495357,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.015369059656218402,
"grad_norm": 5.559131145477295,
"learning_rate": 3.0645161290322584e-06,
"loss": 0.8590961694717407,
"memory(GiB)": 73.8,
"step": 19,
"token_acc": 0.75,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.016177957532861477,
"grad_norm": 4.871716499328613,
"learning_rate": 3.225806451612903e-06,
"loss": 0.7650733590126038,
"memory(GiB)": 73.8,
"step": 20,
"token_acc": 0.7865612648221344,
"train_speed(iter/s)": 0.022441
},
{
"epoch": 0.01698685540950455,
"grad_norm": 5.387275218963623,
"learning_rate": 3.3870967741935484e-06,
"loss": 0.7404652833938599,
"memory(GiB)": 73.8,
"step": 21,
"token_acc": 0.7907801418439716,
"train_speed(iter/s)": 0.022456
},
{
"epoch": 0.017795753286147624,
"grad_norm": 6.131480693817139,
"learning_rate": 3.548387096774194e-06,
"loss": 0.8067750334739685,
"memory(GiB)": 73.8,
"step": 22,
"token_acc": 0.7986111111111112,
"train_speed(iter/s)": 0.022476
},
{
"epoch": 0.018604651162790697,
"grad_norm": 5.183681488037109,
"learning_rate": 3.7096774193548392e-06,
"loss": 0.8132314682006836,
"memory(GiB)": 73.8,
"step": 23,
"token_acc": 0.7714285714285715,
"train_speed(iter/s)": 0.022492
},
{
"epoch": 0.01941354903943377,
"grad_norm": 5.063383102416992,
"learning_rate": 3.870967741935484e-06,
"loss": 0.7204439640045166,
"memory(GiB)": 73.8,
"step": 24,
"token_acc": 0.7905982905982906,
"train_speed(iter/s)": 0.022506
},
{
"epoch": 0.020222446916076844,
"grad_norm": 4.753130912780762,
"learning_rate": 4.032258064516129e-06,
"loss": 0.7673914432525635,
"memory(GiB)": 73.8,
"step": 25,
"token_acc": 0.7453416149068323,
"train_speed(iter/s)": 0.022518
},
{
"epoch": 0.021031344792719918,
"grad_norm": 4.112824440002441,
"learning_rate": 4.193548387096774e-06,
"loss": 0.6755634546279907,
"memory(GiB)": 73.8,
"step": 26,
"token_acc": 0.7714285714285715,
"train_speed(iter/s)": 0.02253
},
{
"epoch": 0.02184024266936299,
"grad_norm": 3.704129695892334,
"learning_rate": 4.35483870967742e-06,
"loss": 0.685713529586792,
"memory(GiB)": 73.8,
"step": 27,
"token_acc": 0.8447488584474886,
"train_speed(iter/s)": 0.022541
},
{
"epoch": 0.02264914054600607,
"grad_norm": 4.385001182556152,
"learning_rate": 4.516129032258065e-06,
"loss": 0.7436140179634094,
"memory(GiB)": 73.8,
"step": 28,
"token_acc": 0.749003984063745,
"train_speed(iter/s)": 0.022549
},
{
"epoch": 0.023458038422649142,
"grad_norm": 5.385667324066162,
"learning_rate": 4.67741935483871e-06,
"loss": 0.7293410301208496,
"memory(GiB)": 73.8,
"step": 29,
"token_acc": 0.8248175182481752,
"train_speed(iter/s)": 0.022558
},
{
"epoch": 0.024266936299292215,
"grad_norm": 5.816902160644531,
"learning_rate": 4.838709677419355e-06,
"loss": 0.6676285266876221,
"memory(GiB)": 73.8,
"step": 30,
"token_acc": 0.7844827586206896,
"train_speed(iter/s)": 0.022568
},
{
"epoch": 0.02507583417593529,
"grad_norm": 3.9358129501342773,
"learning_rate": 5e-06,
"loss": 0.6832848787307739,
"memory(GiB)": 73.8,
"step": 31,
"token_acc": 0.8340807174887892,
"train_speed(iter/s)": 0.022574
},
{
"epoch": 0.025884732052578362,
"grad_norm": 3.9400582313537598,
"learning_rate": 5.161290322580646e-06,
"loss": 0.6794041395187378,
"memory(GiB)": 73.8,
"step": 32,
"token_acc": 0.7857142857142857,
"train_speed(iter/s)": 0.022581
},
{
"epoch": 0.026693629929221436,
"grad_norm": 6.0499725341796875,
"learning_rate": 5.322580645161291e-06,
"loss": 0.6280096769332886,
"memory(GiB)": 73.8,
"step": 33,
"token_acc": 0.8277511961722488,
"train_speed(iter/s)": 0.022589
},
{
"epoch": 0.02750252780586451,
"grad_norm": 4.963372230529785,
"learning_rate": 5.483870967741935e-06,
"loss": 0.7461614012718201,
"memory(GiB)": 73.8,
"step": 34,
"token_acc": 0.7442922374429224,
"train_speed(iter/s)": 0.022594
},
{
"epoch": 0.028311425682507583,
"grad_norm": 4.874055862426758,
"learning_rate": 5.645161290322582e-06,
"loss": 0.6325216889381409,
"memory(GiB)": 73.8,
"step": 35,
"token_acc": 0.8239700374531835,
"train_speed(iter/s)": 0.022602
},
{
"epoch": 0.029120323559150656,
"grad_norm": 4.295459747314453,
"learning_rate": 5.806451612903226e-06,
"loss": 0.6098757982254028,
"memory(GiB)": 73.8,
"step": 36,
"token_acc": 0.85,
"train_speed(iter/s)": 0.022606
},
{
"epoch": 0.02992922143579373,
"grad_norm": 4.486640453338623,
"learning_rate": 5.967741935483872e-06,
"loss": 0.6720225811004639,
"memory(GiB)": 73.8,
"step": 37,
"token_acc": 0.7675276752767528,
"train_speed(iter/s)": 0.022613
},
{
"epoch": 0.030738119312436803,
"grad_norm": 3.9755430221557617,
"learning_rate": 6.129032258064517e-06,
"loss": 0.7007983326911926,
"memory(GiB)": 73.8,
"step": 38,
"token_acc": 0.7446808510638298,
"train_speed(iter/s)": 0.022618
},
{
"epoch": 0.03154701718907988,
"grad_norm": 3.85732102394104,
"learning_rate": 6.290322580645162e-06,
"loss": 0.6228176355361938,
"memory(GiB)": 73.8,
"step": 39,
"token_acc": 0.8116591928251121,
"train_speed(iter/s)": 0.022625
},
{
"epoch": 0.032355915065722954,
"grad_norm": 3.556612491607666,
"learning_rate": 6.451612903225806e-06,
"loss": 0.6283481121063232,
"memory(GiB)": 73.8,
"step": 40,
"token_acc": 0.8035087719298246,
"train_speed(iter/s)": 0.02263
},
{
"epoch": 0.03316481294236603,
"grad_norm": 5.600265979766846,
"learning_rate": 6.612903225806452e-06,
"loss": 0.6793509721755981,
"memory(GiB)": 73.8,
"step": 41,
"token_acc": 0.8174904942965779,
"train_speed(iter/s)": 0.022635
},
{
"epoch": 0.0339737108190091,
"grad_norm": 3.7283554077148438,
"learning_rate": 6.774193548387097e-06,
"loss": 0.6385987997055054,
"memory(GiB)": 73.8,
"step": 42,
"token_acc": 0.8125,
"train_speed(iter/s)": 0.022639
},
{
"epoch": 0.034782608695652174,
"grad_norm": 3.8624303340911865,
"learning_rate": 6.935483870967743e-06,
"loss": 0.6532889604568481,
"memory(GiB)": 73.8,
"step": 43,
"token_acc": 0.8297872340425532,
"train_speed(iter/s)": 0.022644
},
{
"epoch": 0.03559150657229525,
"grad_norm": 3.6706488132476807,
"learning_rate": 7.096774193548388e-06,
"loss": 0.579014241695404,
"memory(GiB)": 73.8,
"step": 44,
"token_acc": 0.8345070422535211,
"train_speed(iter/s)": 0.022648
},
{
"epoch": 0.03640040444893832,
"grad_norm": 3.9184775352478027,
"learning_rate": 7.258064516129033e-06,
"loss": 0.5859895348548889,
"memory(GiB)": 73.8,
"step": 45,
"token_acc": 0.8291925465838509,
"train_speed(iter/s)": 0.022651
},
{
"epoch": 0.037209302325581395,
"grad_norm": 3.94393253326416,
"learning_rate": 7.4193548387096784e-06,
"loss": 0.5704982280731201,
"memory(GiB)": 73.8,
"step": 46,
"token_acc": 0.8542713567839196,
"train_speed(iter/s)": 0.022655
},
{
"epoch": 0.03801820020222447,
"grad_norm": 4.142230987548828,
"learning_rate": 7.580645161290323e-06,
"loss": 0.623918354511261,
"memory(GiB)": 73.8,
"step": 47,
"token_acc": 0.7984790874524715,
"train_speed(iter/s)": 0.022657
},
{
"epoch": 0.03882709807886754,
"grad_norm": 4.207951545715332,
"learning_rate": 7.741935483870968e-06,
"loss": 0.5815058946609497,
"memory(GiB)": 73.8,
"step": 48,
"token_acc": 0.8186528497409327,
"train_speed(iter/s)": 0.022662
},
{
"epoch": 0.039635995955510615,
"grad_norm": 4.375429153442383,
"learning_rate": 7.903225806451613e-06,
"loss": 0.6511105895042419,
"memory(GiB)": 73.8,
"step": 49,
"token_acc": 0.809375,
"train_speed(iter/s)": 0.022666
},
{
"epoch": 0.04044489383215369,
"grad_norm": 4.1379499435424805,
"learning_rate": 8.064516129032258e-06,
"loss": 0.6755905747413635,
"memory(GiB)": 73.8,
"step": 50,
"token_acc": 0.8034934497816594,
"train_speed(iter/s)": 0.02267
},
{
"epoch": 0.04125379170879676,
"grad_norm": 4.107391357421875,
"learning_rate": 8.225806451612904e-06,
"loss": 0.558114230632782,
"memory(GiB)": 73.8,
"step": 51,
"token_acc": 0.8186528497409327,
"train_speed(iter/s)": 0.022672
},
{
"epoch": 0.042062689585439836,
"grad_norm": 3.2282044887542725,
"learning_rate": 8.387096774193549e-06,
"loss": 0.5646804571151733,
"memory(GiB)": 73.8,
"step": 52,
"token_acc": 0.7943262411347518,
"train_speed(iter/s)": 0.022674
},
{
"epoch": 0.04287158746208291,
"grad_norm": 3.679171085357666,
"learning_rate": 8.548387096774194e-06,
"loss": 0.5988277196884155,
"memory(GiB)": 73.8,
"step": 53,
"token_acc": 0.8022922636103151,
"train_speed(iter/s)": 0.022675
},
{
"epoch": 0.04368048533872598,
"grad_norm": 4.386334419250488,
"learning_rate": 8.70967741935484e-06,
"loss": 0.6635404825210571,
"memory(GiB)": 73.8,
"step": 54,
"token_acc": 0.7681159420289855,
"train_speed(iter/s)": 0.022674
},
{
"epoch": 0.044489383215369056,
"grad_norm": 5.1664557456970215,
"learning_rate": 8.870967741935484e-06,
"loss": 0.5942538976669312,
"memory(GiB)": 73.8,
"step": 55,
"token_acc": 0.86328125,
"train_speed(iter/s)": 0.022677
},
{
"epoch": 0.04529828109201214,
"grad_norm": 5.156553268432617,
"learning_rate": 9.03225806451613e-06,
"loss": 0.5873563885688782,
"memory(GiB)": 74.11,
"step": 56,
"token_acc": 0.7923875432525952,
"train_speed(iter/s)": 0.022677
},
{
"epoch": 0.04610717896865521,
"grad_norm": 3.327913999557495,
"learning_rate": 9.193548387096775e-06,
"loss": 0.5179651975631714,
"memory(GiB)": 74.11,
"step": 57,
"token_acc": 0.8286713286713286,
"train_speed(iter/s)": 0.022681
},
{
"epoch": 0.046916076845298284,
"grad_norm": 3.147554397583008,
"learning_rate": 9.35483870967742e-06,
"loss": 0.6654713153839111,
"memory(GiB)": 74.11,
"step": 58,
"token_acc": 0.8122866894197952,
"train_speed(iter/s)": 0.022683
},
{
"epoch": 0.04772497472194136,
"grad_norm": 3.951767921447754,
"learning_rate": 9.516129032258065e-06,
"loss": 0.5465582013130188,
"memory(GiB)": 74.11,
"step": 59,
"token_acc": 0.828125,
"train_speed(iter/s)": 0.022686
},
{
"epoch": 0.04853387259858443,
"grad_norm": 3.6060750484466553,
"learning_rate": 9.67741935483871e-06,
"loss": 0.6206121444702148,
"memory(GiB)": 74.11,
"step": 60,
"token_acc": 0.8258928571428571,
"train_speed(iter/s)": 0.022687
},
{
"epoch": 0.049342770475227504,
"grad_norm": 4.130661487579346,
"learning_rate": 9.838709677419356e-06,
"loss": 0.6245087385177612,
"memory(GiB)": 74.11,
"step": 61,
"token_acc": 0.8050847457627118,
"train_speed(iter/s)": 0.02269
},
{
"epoch": 0.05015166835187058,
"grad_norm": 4.408290386199951,
"learning_rate": 1e-05,
"loss": 0.6183744668960571,
"memory(GiB)": 74.11,
"step": 62,
"token_acc": 0.8229665071770335,
"train_speed(iter/s)": 0.022694
},
{
"epoch": 0.05096056622851365,
"grad_norm": 3.7502522468566895,
"learning_rate": 9.999982128386562e-06,
"loss": 0.5600206851959229,
"memory(GiB)": 74.11,
"step": 63,
"token_acc": 0.8364312267657993,
"train_speed(iter/s)": 0.022695
},
{
"epoch": 0.051769464105156725,
"grad_norm": 4.595156669616699,
"learning_rate": 9.999928513674004e-06,
"loss": 0.5526872873306274,
"memory(GiB)": 74.11,
"step": 64,
"token_acc": 0.8165467625899281,
"train_speed(iter/s)": 0.022697
},
{
"epoch": 0.0525783619817998,
"grad_norm": 4.10991907119751,
"learning_rate": 9.999839156245597e-06,
"loss": 0.4983682632446289,
"memory(GiB)": 74.11,
"step": 65,
"token_acc": 0.8742857142857143,
"train_speed(iter/s)": 0.022698
},
{
"epoch": 0.05338725985844287,
"grad_norm": 4.291178226470947,
"learning_rate": 9.99971405674013e-06,
"loss": 0.6258913278579712,
"memory(GiB)": 74.11,
"step": 66,
"token_acc": 0.8235294117647058,
"train_speed(iter/s)": 0.0227
},
{
"epoch": 0.054196157735085945,
"grad_norm": 4.950540065765381,
"learning_rate": 9.999553216051892e-06,
"loss": 0.6055471897125244,
"memory(GiB)": 74.11,
"step": 67,
"token_acc": 0.75,
"train_speed(iter/s)": 0.0227
},
{
"epoch": 0.05500505561172902,
"grad_norm": 4.7848076820373535,
"learning_rate": 9.999356635330675e-06,
"loss": 0.5771285891532898,
"memory(GiB)": 74.11,
"step": 68,
"token_acc": 0.8007518796992481,
"train_speed(iter/s)": 0.022702
},
{
"epoch": 0.05581395348837209,
"grad_norm": 4.7233567237854,
"learning_rate": 9.999124315981766e-06,
"loss": 0.5602097511291504,
"memory(GiB)": 74.11,
"step": 69,
"token_acc": 0.85,
"train_speed(iter/s)": 0.022704
},
{
"epoch": 0.056622851365015166,
"grad_norm": 3.280118227005005,
"learning_rate": 9.998856259665936e-06,
"loss": 0.5948894023895264,
"memory(GiB)": 74.11,
"step": 70,
"token_acc": 0.8597285067873304,
"train_speed(iter/s)": 0.022706
},
{
"epoch": 0.05743174924165824,
"grad_norm": 3.6923129558563232,
"learning_rate": 9.99855246829942e-06,
"loss": 0.615454912185669,
"memory(GiB)": 74.11,
"step": 71,
"token_acc": 0.7639484978540773,
"train_speed(iter/s)": 0.022708
},
{
"epoch": 0.05824064711830131,
"grad_norm": 3.9682765007019043,
"learning_rate": 9.99821294405392e-06,
"loss": 0.6003280878067017,
"memory(GiB)": 74.11,
"step": 72,
"token_acc": 0.8415492957746479,
"train_speed(iter/s)": 0.02271
},
{
"epoch": 0.059049544994944386,
"grad_norm": 3.5200328826904297,
"learning_rate": 9.99783768935657e-06,
"loss": 0.5450583100318909,
"memory(GiB)": 74.11,
"step": 73,
"token_acc": 0.8100358422939068,
"train_speed(iter/s)": 0.022712
},
{
"epoch": 0.05985844287158746,
"grad_norm": 4.187544345855713,
"learning_rate": 9.997426706889935e-06,
"loss": 0.5230978727340698,
"memory(GiB)": 74.11,
"step": 74,
"token_acc": 0.8472222222222222,
"train_speed(iter/s)": 0.022714
},
{
"epoch": 0.06066734074823053,
"grad_norm": 3.5596694946289062,
"learning_rate": 9.996979999591982e-06,
"loss": 0.5269993543624878,
"memory(GiB)": 74.11,
"step": 75,
"token_acc": 0.8168316831683168,
"train_speed(iter/s)": 0.022715
},
{
"epoch": 0.06147623862487361,
"grad_norm": 3.213773012161255,
"learning_rate": 9.996497570656063e-06,
"loss": 0.5459144711494446,
"memory(GiB)": 74.11,
"step": 76,
"token_acc": 0.7665198237885462,
"train_speed(iter/s)": 0.022716
},
{
"epoch": 0.06228513650151668,
"grad_norm": 3.1109633445739746,
"learning_rate": 9.995979423530893e-06,
"loss": 0.5678860545158386,
"memory(GiB)": 74.11,
"step": 77,
"token_acc": 0.8123076923076923,
"train_speed(iter/s)": 0.022717
},
{
"epoch": 0.06309403437815976,
"grad_norm": 3.668972969055176,
"learning_rate": 9.99542556192052e-06,
"loss": 0.5075556039810181,
"memory(GiB)": 74.11,
"step": 78,
"token_acc": 0.84,
"train_speed(iter/s)": 0.022718
},
{
"epoch": 0.06390293225480283,
"grad_norm": 4.338983535766602,
"learning_rate": 9.994835989784305e-06,
"loss": 0.5242471098899841,
"memory(GiB)": 74.11,
"step": 79,
"token_acc": 0.865,
"train_speed(iter/s)": 0.022721
},
{
"epoch": 0.06471183013144591,
"grad_norm": 4.064675807952881,
"learning_rate": 9.99421071133689e-06,
"loss": 0.6131962537765503,
"memory(GiB)": 74.11,
"step": 80,
"token_acc": 0.7767857142857143,
"train_speed(iter/s)": 0.022722
},
{
"epoch": 0.06552072800808897,
"grad_norm": 3.6171154975891113,
"learning_rate": 9.993549731048171e-06,
"loss": 0.5887628197669983,
"memory(GiB)": 74.11,
"step": 81,
"token_acc": 0.7992125984251969,
"train_speed(iter/s)": 0.022724
},
{
"epoch": 0.06632962588473205,
"grad_norm": 3.9707374572753906,
"learning_rate": 9.992853053643257e-06,
"loss": 0.5989000201225281,
"memory(GiB)": 74.11,
"step": 82,
"token_acc": 0.8346456692913385,
"train_speed(iter/s)": 0.022726
},
{
"epoch": 0.06713852376137512,
"grad_norm": 4.361082077026367,
"learning_rate": 9.992120684102453e-06,
"loss": 0.6060096025466919,
"memory(GiB)": 74.11,
"step": 83,
"token_acc": 0.8148148148148148,
"train_speed(iter/s)": 0.022728
},
{
"epoch": 0.0679474216380182,
"grad_norm": 3.9677209854125977,
"learning_rate": 9.991352627661205e-06,
"loss": 0.5200193524360657,
"memory(GiB)": 74.11,
"step": 84,
"token_acc": 0.8506787330316742,
"train_speed(iter/s)": 0.022729
},
{
"epoch": 0.06875631951466127,
"grad_norm": 3.435011863708496,
"learning_rate": 9.990548889810078e-06,
"loss": 0.6048153638839722,
"memory(GiB)": 74.11,
"step": 85,
"token_acc": 0.8391608391608392,
"train_speed(iter/s)": 0.022731
},
{
"epoch": 0.06956521739130435,
"grad_norm": 3.5457801818847656,
"learning_rate": 9.989709476294708e-06,
"loss": 0.5572282671928406,
"memory(GiB)": 74.11,
"step": 86,
"token_acc": 0.8181818181818182,
"train_speed(iter/s)": 0.022732
},
{
"epoch": 0.07037411526794742,
"grad_norm": 3.885216474533081,
"learning_rate": 9.988834393115768e-06,
"loss": 0.5753508806228638,
"memory(GiB)": 74.11,
"step": 87,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.022732
},
{
"epoch": 0.0711830131445905,
"grad_norm": 3.5327308177948,
"learning_rate": 9.987923646528911e-06,
"loss": 0.5835089683532715,
"memory(GiB)": 74.11,
"step": 88,
"token_acc": 0.8226221079691517,
"train_speed(iter/s)": 0.022731
},
{
"epoch": 0.07199191102123358,
"grad_norm": 3.0550527572631836,
"learning_rate": 9.986977243044747e-06,
"loss": 0.5215576887130737,
"memory(GiB)": 74.11,
"step": 89,
"token_acc": 0.8870292887029289,
"train_speed(iter/s)": 0.022732
},
{
"epoch": 0.07280080889787664,
"grad_norm": 3.0193593502044678,
"learning_rate": 9.985995189428775e-06,
"loss": 0.4884870648384094,
"memory(GiB)": 74.11,
"step": 90,
"token_acc": 0.8713235294117647,
"train_speed(iter/s)": 0.022732
},
{
"epoch": 0.07360970677451972,
"grad_norm": 3.2098543643951416,
"learning_rate": 9.984977492701351e-06,
"loss": 0.5010548233985901,
"memory(GiB)": 74.11,
"step": 91,
"token_acc": 0.8104575163398693,
"train_speed(iter/s)": 0.022734
},
{
"epoch": 0.07441860465116279,
"grad_norm": 3.6859188079833984,
"learning_rate": 9.983924160137627e-06,
"loss": 0.5493002533912659,
"memory(GiB)": 74.11,
"step": 92,
"token_acc": 0.7937743190661478,
"train_speed(iter/s)": 0.022734
},
{
"epoch": 0.07522750252780587,
"grad_norm": 3.2814273834228516,
"learning_rate": 9.982835199267502e-06,
"loss": 0.6033581495285034,
"memory(GiB)": 74.11,
"step": 93,
"token_acc": 0.8416666666666667,
"train_speed(iter/s)": 0.022734
},
{
"epoch": 0.07603640040444894,
"grad_norm": 3.5553441047668457,
"learning_rate": 9.981710617875575e-06,
"loss": 0.6103281378746033,
"memory(GiB)": 74.11,
"step": 94,
"token_acc": 0.7589285714285714,
"train_speed(iter/s)": 0.022736
},
{
"epoch": 0.07684529828109202,
"grad_norm": 3.5121068954467773,
"learning_rate": 9.980550424001077e-06,
"loss": 0.5484324097633362,
"memory(GiB)": 74.11,
"step": 95,
"token_acc": 0.8211678832116789,
"train_speed(iter/s)": 0.022736
},
{
"epoch": 0.07765419615773508,
"grad_norm": 2.6635591983795166,
"learning_rate": 9.979354625937821e-06,
"loss": 0.509511411190033,
"memory(GiB)": 74.11,
"step": 96,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022736
},
{
"epoch": 0.07846309403437816,
"grad_norm": 3.5615248680114746,
"learning_rate": 9.978123232234147e-06,
"loss": 0.5271934270858765,
"memory(GiB)": 74.11,
"step": 97,
"token_acc": 0.815625,
"train_speed(iter/s)": 0.022737
},
{
"epoch": 0.07927199191102123,
"grad_norm": 4.439089775085449,
"learning_rate": 9.976856251692851e-06,
"loss": 0.5473837852478027,
"memory(GiB)": 74.11,
"step": 98,
"token_acc": 0.843558282208589,
"train_speed(iter/s)": 0.022738
},
{
"epoch": 0.08008088978766431,
"grad_norm": 3.3765029907226562,
"learning_rate": 9.975553693371124e-06,
"loss": 0.572515070438385,
"memory(GiB)": 74.11,
"step": 99,
"token_acc": 0.8262411347517731,
"train_speed(iter/s)": 0.022739
},
{
"epoch": 0.08088978766430738,
"grad_norm": 3.8845911026000977,
"learning_rate": 9.974215566580499e-06,
"loss": 0.5989265441894531,
"memory(GiB)": 74.11,
"step": 100,
"token_acc": 0.8562091503267973,
"train_speed(iter/s)": 0.022741
},
{
"epoch": 0.08169868554095046,
"grad_norm": 3.336557626724243,
"learning_rate": 9.972841880886766e-06,
"loss": 0.5662233829498291,
"memory(GiB)": 74.11,
"step": 101,
"token_acc": 0.8298755186721992,
"train_speed(iter/s)": 0.022741
},
{
"epoch": 0.08250758341759352,
"grad_norm": 2.8836798667907715,
"learning_rate": 9.971432646109919e-06,
"loss": 0.44332605600357056,
"memory(GiB)": 74.11,
"step": 102,
"token_acc": 0.8586572438162544,
"train_speed(iter/s)": 0.022742
},
{
"epoch": 0.0833164812942366,
"grad_norm": 4.133236885070801,
"learning_rate": 9.969987872324076e-06,
"loss": 0.5478776693344116,
"memory(GiB)": 74.11,
"step": 103,
"token_acc": 0.8424908424908425,
"train_speed(iter/s)": 0.022743
},
{
"epoch": 0.08412537917087967,
"grad_norm": 4.5403828620910645,
"learning_rate": 9.968507569857413e-06,
"loss": 0.5256601572036743,
"memory(GiB)": 74.11,
"step": 104,
"token_acc": 0.7886178861788617,
"train_speed(iter/s)": 0.022744
},
{
"epoch": 0.08493427704752275,
"grad_norm": 3.083695888519287,
"learning_rate": 9.966991749292088e-06,
"loss": 0.560812771320343,
"memory(GiB)": 74.11,
"step": 105,
"token_acc": 0.8056537102473498,
"train_speed(iter/s)": 0.022745
},
{
"epoch": 0.08574317492416582,
"grad_norm": 2.619795083999634,
"learning_rate": 9.965440421464163e-06,
"loss": 0.5007873773574829,
"memory(GiB)": 74.11,
"step": 106,
"token_acc": 0.8132295719844358,
"train_speed(iter/s)": 0.022745
},
{
"epoch": 0.0865520728008089,
"grad_norm": 3.6254372596740723,
"learning_rate": 9.963853597463533e-06,
"loss": 0.49696582555770874,
"memory(GiB)": 74.11,
"step": 107,
"token_acc": 0.846441947565543,
"train_speed(iter/s)": 0.022747
},
{
"epoch": 0.08736097067745197,
"grad_norm": 3.388469934463501,
"learning_rate": 9.962231288633838e-06,
"loss": 0.4739895462989807,
"memory(GiB)": 74.11,
"step": 108,
"token_acc": 0.84,
"train_speed(iter/s)": 0.022748
},
{
"epoch": 0.08816986855409505,
"grad_norm": 2.8459818363189697,
"learning_rate": 9.960573506572391e-06,
"loss": 0.46099379658699036,
"memory(GiB)": 74.11,
"step": 109,
"token_acc": 0.821875,
"train_speed(iter/s)": 0.022748
},
{
"epoch": 0.08897876643073811,
"grad_norm": 3.143099546432495,
"learning_rate": 9.958880263130084e-06,
"loss": 0.48788702487945557,
"memory(GiB)": 74.11,
"step": 110,
"token_acc": 0.8125,
"train_speed(iter/s)": 0.022748
},
{
"epoch": 0.0897876643073812,
"grad_norm": 3.5926871299743652,
"learning_rate": 9.957151570411317e-06,
"loss": 0.5500156879425049,
"memory(GiB)": 74.11,
"step": 111,
"token_acc": 0.8222222222222222,
"train_speed(iter/s)": 0.022748
},
{
"epoch": 0.09059656218402427,
"grad_norm": 5.149491310119629,
"learning_rate": 9.955387440773902e-06,
"loss": 0.5181611776351929,
"memory(GiB)": 74.11,
"step": 112,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022749
},
{
"epoch": 0.09140546006066734,
"grad_norm": 4.696843147277832,
"learning_rate": 9.953587886828973e-06,
"loss": 0.5575085282325745,
"memory(GiB)": 74.11,
"step": 113,
"token_acc": 0.7924528301886793,
"train_speed(iter/s)": 0.02275
},
{
"epoch": 0.09221435793731042,
"grad_norm": 4.4397053718566895,
"learning_rate": 9.951752921440904e-06,
"loss": 0.5986132621765137,
"memory(GiB)": 74.11,
"step": 114,
"token_acc": 0.8097560975609757,
"train_speed(iter/s)": 0.022749
},
{
"epoch": 0.09302325581395349,
"grad_norm": 3.5311803817749023,
"learning_rate": 9.949882557727215e-06,
"loss": 0.47439733147621155,
"memory(GiB)": 74.11,
"step": 115,
"token_acc": 0.8576779026217228,
"train_speed(iter/s)": 0.022749
},
{
"epoch": 0.09383215369059657,
"grad_norm": 4.034605503082275,
"learning_rate": 9.947976809058468e-06,
"loss": 0.52587890625,
"memory(GiB)": 74.11,
"step": 116,
"token_acc": 0.8388888888888889,
"train_speed(iter/s)": 0.02275
},
{
"epoch": 0.09464105156723963,
"grad_norm": 2.4622230529785156,
"learning_rate": 9.946035689058189e-06,
"loss": 0.5111696720123291,
"memory(GiB)": 74.11,
"step": 117,
"token_acc": 0.8653846153846154,
"train_speed(iter/s)": 0.02275
},
{
"epoch": 0.09544994944388271,
"grad_norm": 4.2029523849487305,
"learning_rate": 9.944059211602752e-06,
"loss": 0.644461452960968,
"memory(GiB)": 74.11,
"step": 118,
"token_acc": 0.8391304347826087,
"train_speed(iter/s)": 0.02275
},
{
"epoch": 0.09625884732052578,
"grad_norm": 3.6433732509613037,
"learning_rate": 9.942047390821296e-06,
"loss": 0.529866099357605,
"memory(GiB)": 74.11,
"step": 119,
"token_acc": 0.8287671232876712,
"train_speed(iter/s)": 0.02275
},
{
"epoch": 0.09706774519716886,
"grad_norm": 2.930225133895874,
"learning_rate": 9.940000241095616e-06,
"loss": 0.53721022605896,
"memory(GiB)": 74.11,
"step": 120,
"token_acc": 0.8637873754152824,
"train_speed(iter/s)": 0.022751
},
{
"epoch": 0.09787664307381193,
"grad_norm": 3.059379816055298,
"learning_rate": 9.937917777060057e-06,
"loss": 0.5285677909851074,
"memory(GiB)": 74.11,
"step": 121,
"token_acc": 0.7914691943127962,
"train_speed(iter/s)": 0.022752
},
{
"epoch": 0.09868554095045501,
"grad_norm": 3.1179027557373047,
"learning_rate": 9.935800013601415e-06,
"loss": 0.543626606464386,
"memory(GiB)": 74.11,
"step": 122,
"token_acc": 0.8638132295719845,
"train_speed(iter/s)": 0.022751
},
{
"epoch": 0.09949443882709808,
"grad_norm": 2.9850940704345703,
"learning_rate": 9.933646965858832e-06,
"loss": 0.5759721994400024,
"memory(GiB)": 74.11,
"step": 123,
"token_acc": 0.8392857142857143,
"train_speed(iter/s)": 0.022752
},
{
"epoch": 0.10030333670374116,
"grad_norm": 3.2056992053985596,
"learning_rate": 9.931458649223683e-06,
"loss": 0.5128383636474609,
"memory(GiB)": 74.11,
"step": 124,
"token_acc": 0.8404255319148937,
"train_speed(iter/s)": 0.022753
},
{
"epoch": 0.10111223458038422,
"grad_norm": 3.4550704956054688,
"learning_rate": 9.929235079339466e-06,
"loss": 0.4931023418903351,
"memory(GiB)": 74.11,
"step": 125,
"token_acc": 0.7634069400630915,
"train_speed(iter/s)": 0.022754
},
{
"epoch": 0.1019211324570273,
"grad_norm": 4.975637912750244,
"learning_rate": 9.926976272101693e-06,
"loss": 0.5036507844924927,
"memory(GiB)": 74.11,
"step": 126,
"token_acc": 0.8422818791946308,
"train_speed(iter/s)": 0.022754
},
{
"epoch": 0.10273003033367037,
"grad_norm": 3.2330217361450195,
"learning_rate": 9.92468224365778e-06,
"loss": 0.4464947581291199,
"memory(GiB)": 74.11,
"step": 127,
"token_acc": 0.8804347826086957,
"train_speed(iter/s)": 0.022754
},
{
"epoch": 0.10353892821031345,
"grad_norm": 2.581622362136841,
"learning_rate": 9.922353010406918e-06,
"loss": 0.5149933695793152,
"memory(GiB)": 74.11,
"step": 128,
"token_acc": 0.8318181818181818,
"train_speed(iter/s)": 0.022755
},
{
"epoch": 0.10434782608695652,
"grad_norm": 2.6486399173736572,
"learning_rate": 9.919988588999971e-06,
"loss": 0.5142784118652344,
"memory(GiB)": 74.11,
"step": 129,
"token_acc": 0.8621908127208481,
"train_speed(iter/s)": 0.022756
},
{
"epoch": 0.1051567239635996,
"grad_norm": 3.3094420433044434,
"learning_rate": 9.917588996339352e-06,
"loss": 0.5297855734825134,
"memory(GiB)": 74.11,
"step": 130,
"token_acc": 0.8177339901477833,
"train_speed(iter/s)": 0.022757
},
{
"epoch": 0.10596562184024266,
"grad_norm": 2.769592046737671,
"learning_rate": 9.915154249578894e-06,
"loss": 0.5081691145896912,
"memory(GiB)": 74.11,
"step": 131,
"token_acc": 0.8755364806866953,
"train_speed(iter/s)": 0.022758
},
{
"epoch": 0.10677451971688574,
"grad_norm": 2.8748629093170166,
"learning_rate": 9.91268436612374e-06,
"loss": 0.5512316823005676,
"memory(GiB)": 74.11,
"step": 132,
"token_acc": 0.8618181818181818,
"train_speed(iter/s)": 0.022757
},
{
"epoch": 0.10758341759352881,
"grad_norm": 3.3325603008270264,
"learning_rate": 9.91017936363021e-06,
"loss": 0.48270368576049805,
"memory(GiB)": 74.11,
"step": 133,
"token_acc": 0.8526315789473684,
"train_speed(iter/s)": 0.022757
},
{
"epoch": 0.10839231547017189,
"grad_norm": 4.002824783325195,
"learning_rate": 9.907639260005682e-06,
"loss": 0.48671406507492065,
"memory(GiB)": 74.11,
"step": 134,
"token_acc": 0.8547717842323651,
"train_speed(iter/s)": 0.022758
},
{
"epoch": 0.10920121334681497,
"grad_norm": 3.655064344406128,
"learning_rate": 9.90506407340845e-06,
"loss": 0.5502010583877563,
"memory(GiB)": 74.11,
"step": 135,
"token_acc": 0.7976190476190477,
"train_speed(iter/s)": 0.022758
},
{
"epoch": 0.11001011122345804,
"grad_norm": 3.198472023010254,
"learning_rate": 9.902453822247615e-06,
"loss": 0.47892680764198303,
"memory(GiB)": 74.11,
"step": 136,
"token_acc": 0.8318965517241379,
"train_speed(iter/s)": 0.022759
},
{
"epoch": 0.11081900910010112,
"grad_norm": 2.7282052040100098,
"learning_rate": 9.899808525182935e-06,
"loss": 0.49719753861427307,
"memory(GiB)": 74.11,
"step": 137,
"token_acc": 0.8417508417508418,
"train_speed(iter/s)": 0.022759
},
{
"epoch": 0.11162790697674418,
"grad_norm": 3.089430093765259,
"learning_rate": 9.897128201124699e-06,
"loss": 0.532843291759491,
"memory(GiB)": 74.11,
"step": 138,
"token_acc": 0.8152173913043478,
"train_speed(iter/s)": 0.022759
},
{
"epoch": 0.11243680485338726,
"grad_norm": 6.901391983032227,
"learning_rate": 9.894412869233597e-06,
"loss": 0.5238447189331055,
"memory(GiB)": 74.11,
"step": 139,
"token_acc": 0.8558558558558559,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11324570273003033,
"grad_norm": 3.125302791595459,
"learning_rate": 9.89166254892057e-06,
"loss": 0.5573660135269165,
"memory(GiB)": 74.11,
"step": 140,
"token_acc": 0.8068181818181818,
"train_speed(iter/s)": 0.022761
},
{
"epoch": 0.11405460060667341,
"grad_norm": 3.38075852394104,
"learning_rate": 9.888877259846686e-06,
"loss": 0.524215817451477,
"memory(GiB)": 74.33,
"step": 141,
"token_acc": 0.8505338078291815,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11486349848331648,
"grad_norm": 3.413461446762085,
"learning_rate": 9.886057021922984e-06,
"loss": 0.49190688133239746,
"memory(GiB)": 74.33,
"step": 142,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11567239635995956,
"grad_norm": 4.181169509887695,
"learning_rate": 9.88320185531035e-06,
"loss": 0.542352557182312,
"memory(GiB)": 74.33,
"step": 143,
"token_acc": 0.8503649635036497,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11648129423660263,
"grad_norm": 2.688110828399658,
"learning_rate": 9.880311780419353e-06,
"loss": 0.5551398992538452,
"memory(GiB)": 74.33,
"step": 144,
"token_acc": 0.8007246376811594,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.1172901921132457,
"grad_norm": 3.9851884841918945,
"learning_rate": 9.877386817910118e-06,
"loss": 0.49384480714797974,
"memory(GiB)": 74.33,
"step": 145,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11809908998988877,
"grad_norm": 2.6871986389160156,
"learning_rate": 9.874426988692163e-06,
"loss": 0.5515081286430359,
"memory(GiB)": 74.33,
"step": 146,
"token_acc": 0.8006644518272426,
"train_speed(iter/s)": 0.02276
},
{
"epoch": 0.11890798786653185,
"grad_norm": 2.288706064224243,
"learning_rate": 9.871432313924255e-06,
"loss": 0.4420849084854126,
"memory(GiB)": 74.33,
"step": 147,
"token_acc": 0.8091872791519434,
"train_speed(iter/s)": 0.022761
},
{
"epoch": 0.11971688574317492,
"grad_norm": 2.6680195331573486,
"learning_rate": 9.868402815014266e-06,
"loss": 0.4678765833377838,
"memory(GiB)": 74.33,
"step": 148,
"token_acc": 0.8604651162790697,
"train_speed(iter/s)": 0.022761
},
{
"epoch": 0.120525783619818,
"grad_norm": 2.3895063400268555,
"learning_rate": 9.865338513619005e-06,
"loss": 0.4832306504249573,
"memory(GiB)": 74.33,
"step": 149,
"token_acc": 0.8480565371024735,
"train_speed(iter/s)": 0.022761
},
{
"epoch": 0.12133468149646107,
"grad_norm": 2.4143781661987305,
"learning_rate": 9.86223943164408e-06,
"loss": 0.49357208609580994,
"memory(GiB)": 74.33,
"step": 150,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022762
},
{
"epoch": 0.12214357937310415,
"grad_norm": 3.0790457725524902,
"learning_rate": 9.859105591243728e-06,
"loss": 0.4809868633747101,
"memory(GiB)": 74.33,
"step": 151,
"token_acc": 0.8617021276595744,
"train_speed(iter/s)": 0.022762
},
{
"epoch": 0.12295247724974721,
"grad_norm": 3.636885643005371,
"learning_rate": 9.85593701482066e-06,
"loss": 0.5743482112884521,
"memory(GiB)": 74.33,
"step": 152,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022763
},
{
"epoch": 0.1237613751263903,
"grad_norm": 2.7628660202026367,
"learning_rate": 9.85273372502591e-06,
"loss": 0.46740931272506714,
"memory(GiB)": 74.33,
"step": 153,
"token_acc": 0.8658008658008658,
"train_speed(iter/s)": 0.022763
},
{
"epoch": 0.12457027300303336,
"grad_norm": 3.155374765396118,
"learning_rate": 9.849495744758654e-06,
"loss": 0.5438951253890991,
"memory(GiB)": 74.33,
"step": 154,
"token_acc": 0.8550185873605948,
"train_speed(iter/s)": 0.022763
},
{
"epoch": 0.12537917087967643,
"grad_norm": 2.9564826488494873,
"learning_rate": 9.846223097166072e-06,
"loss": 0.537287175655365,
"memory(GiB)": 74.33,
"step": 155,
"token_acc": 0.8456140350877193,
"train_speed(iter/s)": 0.022764
},
{
"epoch": 0.12618806875631952,
"grad_norm": 2.997941017150879,
"learning_rate": 9.842915805643156e-06,
"loss": 0.4728841781616211,
"memory(GiB)": 74.33,
"step": 156,
"token_acc": 0.9,
"train_speed(iter/s)": 0.022764
},
{
"epoch": 0.1269969666329626,
"grad_norm": 4.7811431884765625,
"learning_rate": 9.839573893832564e-06,
"loss": 0.48365718126296997,
"memory(GiB)": 74.33,
"step": 157,
"token_acc": 0.8501742160278746,
"train_speed(iter/s)": 0.022764
},
{
"epoch": 0.12780586450960565,
"grad_norm": 2.611847400665283,
"learning_rate": 9.836197385624434e-06,
"loss": 0.4837043285369873,
"memory(GiB)": 74.33,
"step": 158,
"token_acc": 0.8952879581151832,
"train_speed(iter/s)": 0.022765
},
{
"epoch": 0.12861476238624875,
"grad_norm": 3.331645965576172,
"learning_rate": 9.83278630515623e-06,
"loss": 0.5694408416748047,
"memory(GiB)": 74.33,
"step": 159,
"token_acc": 0.8177966101694916,
"train_speed(iter/s)": 0.022765
},
{
"epoch": 0.12942366026289182,
"grad_norm": 3.4143426418304443,
"learning_rate": 9.829340676812553e-06,
"loss": 0.5614443421363831,
"memory(GiB)": 74.33,
"step": 160,
"token_acc": 0.8487972508591065,
"train_speed(iter/s)": 0.022765
},
{
"epoch": 0.13023255813953488,
"grad_norm": 2.541956901550293,
"learning_rate": 9.825860525224982e-06,
"loss": 0.48274075984954834,
"memory(GiB)": 74.33,
"step": 161,
"token_acc": 0.8207885304659498,
"train_speed(iter/s)": 0.022765
},
{
"epoch": 0.13104145601617795,
"grad_norm": 2.933729410171509,
"learning_rate": 9.822345875271884e-06,
"loss": 0.47431913018226624,
"memory(GiB)": 74.33,
"step": 162,
"token_acc": 0.8713450292397661,
"train_speed(iter/s)": 0.022766
},
{
"epoch": 0.13185035389282104,
"grad_norm": 2.8055856227874756,
"learning_rate": 9.818796752078246e-06,
"loss": 0.5554227232933044,
"memory(GiB)": 74.33,
"step": 163,
"token_acc": 0.8627450980392157,
"train_speed(iter/s)": 0.022766
},
{
"epoch": 0.1326592517694641,
"grad_norm": 2.662719488143921,
"learning_rate": 9.815213181015489e-06,
"loss": 0.4458203911781311,
"memory(GiB)": 74.33,
"step": 164,
"token_acc": 0.8825622775800712,
"train_speed(iter/s)": 0.022766
},
{
"epoch": 0.13346814964610718,
"grad_norm": 5.495974540710449,
"learning_rate": 9.811595187701296e-06,
"loss": 0.4638062119483948,
"memory(GiB)": 74.33,
"step": 165,
"token_acc": 0.8227848101265823,
"train_speed(iter/s)": 0.022766
},
{
"epoch": 0.13427704752275024,
"grad_norm": 84.01348114013672,
"learning_rate": 9.807942797999412e-06,
"loss": 0.6657401323318481,
"memory(GiB)": 74.33,
"step": 166,
"token_acc": 0.8483606557377049,
"train_speed(iter/s)": 0.022767
},
{
"epoch": 0.13508594539939334,
"grad_norm": 138.69554138183594,
"learning_rate": 9.804256038019482e-06,
"loss": 0.6723936796188354,
"memory(GiB)": 74.33,
"step": 167,
"token_acc": 0.8143712574850299,
"train_speed(iter/s)": 0.022767
},
{
"epoch": 0.1358948432760364,
"grad_norm": 11.966114044189453,
"learning_rate": 9.800534934116843e-06,
"loss": 0.5228875875473022,
"memory(GiB)": 74.33,
"step": 168,
"token_acc": 0.8411552346570397,
"train_speed(iter/s)": 0.022768
},
{
"epoch": 0.13670374115267947,
"grad_norm": 3.311744451522827,
"learning_rate": 9.796779512892346e-06,
"loss": 0.5082340240478516,
"memory(GiB)": 74.33,
"step": 169,
"token_acc": 0.8514056224899599,
"train_speed(iter/s)": 0.022768
},
{
"epoch": 0.13751263902932254,
"grad_norm": 2.891026735305786,
"learning_rate": 9.792989801192167e-06,
"loss": 0.4903358519077301,
"memory(GiB)": 74.33,
"step": 170,
"token_acc": 0.8439490445859873,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.13832153690596563,
"grad_norm": 2.643505096435547,
"learning_rate": 9.789165826107612e-06,
"loss": 0.514635443687439,
"memory(GiB)": 74.33,
"step": 171,
"token_acc": 0.8709677419354839,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.1391304347826087,
"grad_norm": 2.8423476219177246,
"learning_rate": 9.785307614974922e-06,
"loss": 0.5150923728942871,
"memory(GiB)": 74.33,
"step": 172,
"token_acc": 0.796875,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.13993933265925176,
"grad_norm": 2.4324862957000732,
"learning_rate": 9.781415195375078e-06,
"loss": 0.4808637797832489,
"memory(GiB)": 74.33,
"step": 173,
"token_acc": 0.8296529968454258,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.14074823053589483,
"grad_norm": 2.2403547763824463,
"learning_rate": 9.77748859513361e-06,
"loss": 0.4378691017627716,
"memory(GiB)": 74.33,
"step": 174,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.14155712841253792,
"grad_norm": 2.552274703979492,
"learning_rate": 9.77352784232039e-06,
"loss": 0.4910467565059662,
"memory(GiB)": 74.33,
"step": 175,
"token_acc": 0.8369565217391305,
"train_speed(iter/s)": 0.022769
},
{
"epoch": 0.142366026289181,
"grad_norm": 2.844341278076172,
"learning_rate": 9.769532965249435e-06,
"loss": 0.5578226447105408,
"memory(GiB)": 74.33,
"step": 176,
"token_acc": 0.8274509803921568,
"train_speed(iter/s)": 0.02277
},
{
"epoch": 0.14317492416582406,
"grad_norm": 2.700742483139038,
"learning_rate": 9.765503992478704e-06,
"loss": 0.4441274404525757,
"memory(GiB)": 74.33,
"step": 177,
"token_acc": 0.8543689320388349,
"train_speed(iter/s)": 0.02277
},
{
"epoch": 0.14398382204246715,
"grad_norm": 2.824364185333252,
"learning_rate": 9.761440952809897e-06,
"loss": 0.5075165033340454,
"memory(GiB)": 74.33,
"step": 178,
"token_acc": 0.8222222222222222,
"train_speed(iter/s)": 0.022771
},
{
"epoch": 0.14479271991911022,
"grad_norm": 3.220512628555298,
"learning_rate": 9.757343875288242e-06,
"loss": 0.47000789642333984,
"memory(GiB)": 74.33,
"step": 179,
"token_acc": 0.828125,
"train_speed(iter/s)": 0.022772
},
{
"epoch": 0.14560161779575329,
"grad_norm": 2.345557689666748,
"learning_rate": 9.75321278920229e-06,
"loss": 0.5143015384674072,
"memory(GiB)": 74.33,
"step": 180,
"token_acc": 0.8530465949820788,
"train_speed(iter/s)": 0.022771
},
{
"epoch": 0.14641051567239635,
"grad_norm": 3.0752451419830322,
"learning_rate": 9.749047724083717e-06,
"loss": 0.5505862236022949,
"memory(GiB)": 74.33,
"step": 181,
"token_acc": 0.8475609756097561,
"train_speed(iter/s)": 0.022772
},
{
"epoch": 0.14721941354903945,
"grad_norm": 2.662064552307129,
"learning_rate": 9.74484870970709e-06,
"loss": 0.5013206601142883,
"memory(GiB)": 74.33,
"step": 182,
"token_acc": 0.873015873015873,
"train_speed(iter/s)": 0.022772
},
{
"epoch": 0.1480283114256825,
"grad_norm": 3.027050256729126,
"learning_rate": 9.74061577608968e-06,
"loss": 0.554660439491272,
"memory(GiB)": 74.33,
"step": 183,
"token_acc": 0.8388278388278388,
"train_speed(iter/s)": 0.022772
},
{
"epoch": 0.14883720930232558,
"grad_norm": 3.55436635017395,
"learning_rate": 9.736348953491224e-06,
"loss": 0.5106396675109863,
"memory(GiB)": 74.33,
"step": 184,
"token_acc": 0.797979797979798,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.14964610717896865,
"grad_norm": 3.821077585220337,
"learning_rate": 9.732048272413725e-06,
"loss": 0.5329099297523499,
"memory(GiB)": 74.33,
"step": 185,
"token_acc": 0.8278388278388278,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.15045500505561174,
"grad_norm": 2.861586332321167,
"learning_rate": 9.727713763601226e-06,
"loss": 0.48308447003364563,
"memory(GiB)": 74.33,
"step": 186,
"token_acc": 0.8556701030927835,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.1512639029322548,
"grad_norm": 3.025512456893921,
"learning_rate": 9.723345458039595e-06,
"loss": 0.4873977601528168,
"memory(GiB)": 74.33,
"step": 187,
"token_acc": 0.8426573426573427,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.15207280080889787,
"grad_norm": 2.5745112895965576,
"learning_rate": 9.718943386956298e-06,
"loss": 0.538512110710144,
"memory(GiB)": 74.33,
"step": 188,
"token_acc": 0.8155339805825242,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.15288169868554094,
"grad_norm": 2.985320806503296,
"learning_rate": 9.714507581820181e-06,
"loss": 0.5343044400215149,
"memory(GiB)": 74.33,
"step": 189,
"token_acc": 0.7977099236641222,
"train_speed(iter/s)": 0.022773
},
{
"epoch": 0.15369059656218403,
"grad_norm": 3.339107036590576,
"learning_rate": 9.71003807434124e-06,
"loss": 0.5087035298347473,
"memory(GiB)": 74.33,
"step": 190,
"token_acc": 0.8478260869565217,
"train_speed(iter/s)": 0.022774
},
{
"epoch": 0.1544994944388271,
"grad_norm": 2.712999105453491,
"learning_rate": 9.705534896470401e-06,
"loss": 0.4998268783092499,
"memory(GiB)": 74.33,
"step": 191,
"token_acc": 0.8514056224899599,
"train_speed(iter/s)": 0.022774
},
{
"epoch": 0.15530839231547017,
"grad_norm": 3.6283011436462402,
"learning_rate": 9.700998080399287e-06,
"loss": 0.4922446608543396,
"memory(GiB)": 74.33,
"step": 192,
"token_acc": 0.810126582278481,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.15611729019211323,
"grad_norm": 2.546504020690918,
"learning_rate": 9.696427658559983e-06,
"loss": 0.5213550925254822,
"memory(GiB)": 74.33,
"step": 193,
"token_acc": 0.8381294964028777,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.15692618806875633,
"grad_norm": 3.0982861518859863,
"learning_rate": 9.691823663624817e-06,
"loss": 0.5097714066505432,
"memory(GiB)": 74.33,
"step": 194,
"token_acc": 0.8066914498141264,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1577350859453994,
"grad_norm": 2.8496217727661133,
"learning_rate": 9.687186128506116e-06,
"loss": 0.5594595074653625,
"memory(GiB)": 74.33,
"step": 195,
"token_acc": 0.8622222222222222,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.15854398382204246,
"grad_norm": 2.693981647491455,
"learning_rate": 9.682515086355973e-06,
"loss": 0.5774262547492981,
"memory(GiB)": 74.33,
"step": 196,
"token_acc": 0.7975708502024291,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.15935288169868553,
"grad_norm": 3.6492180824279785,
"learning_rate": 9.677810570566011e-06,
"loss": 0.5103310346603394,
"memory(GiB)": 74.33,
"step": 197,
"token_acc": 0.8129032258064516,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16016177957532862,
"grad_norm": 2.6552608013153076,
"learning_rate": 9.673072614767147e-06,
"loss": 0.4744953513145447,
"memory(GiB)": 74.33,
"step": 198,
"token_acc": 0.8699186991869918,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1609706774519717,
"grad_norm": 2.7724416255950928,
"learning_rate": 9.668301252829344e-06,
"loss": 0.4586220979690552,
"memory(GiB)": 74.33,
"step": 199,
"token_acc": 0.8583690987124464,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16177957532861476,
"grad_norm": 3.1484899520874023,
"learning_rate": 9.663496518861381e-06,
"loss": 0.6070712208747864,
"memory(GiB)": 74.33,
"step": 200,
"token_acc": 0.8131313131313131,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.16258847320525785,
"grad_norm": 4.5751142501831055,
"learning_rate": 9.658658447210595e-06,
"loss": 0.5579652786254883,
"memory(GiB)": 74.33,
"step": 201,
"token_acc": 0.8524590163934426,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.16339737108190092,
"grad_norm": 2.3848133087158203,
"learning_rate": 9.653787072462644e-06,
"loss": 0.47080251574516296,
"memory(GiB)": 74.33,
"step": 202,
"token_acc": 0.9058823529411765,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16420626895854398,
"grad_norm": 2.686843156814575,
"learning_rate": 9.648882429441258e-06,
"loss": 0.46535661816596985,
"memory(GiB)": 74.33,
"step": 203,
"token_acc": 0.8138528138528138,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16501516683518705,
"grad_norm": 3.4251608848571777,
"learning_rate": 9.643944553207992e-06,
"loss": 0.42402440309524536,
"memory(GiB)": 74.33,
"step": 204,
"token_acc": 0.870722433460076,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16582406471183014,
"grad_norm": 3.019339084625244,
"learning_rate": 9.63897347906197e-06,
"loss": 0.5313763618469238,
"memory(GiB)": 74.33,
"step": 205,
"token_acc": 0.8062283737024222,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1666329625884732,
"grad_norm": 2.4439475536346436,
"learning_rate": 9.633969242539643e-06,
"loss": 0.47857385873794556,
"memory(GiB)": 74.33,
"step": 206,
"token_acc": 0.8204334365325078,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16744186046511628,
"grad_norm": 2.991232395172119,
"learning_rate": 9.628931879414519e-06,
"loss": 0.5055133104324341,
"memory(GiB)": 74.33,
"step": 207,
"token_acc": 0.8614864864864865,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.16825075834175934,
"grad_norm": 2.8914828300476074,
"learning_rate": 9.623861425696919e-06,
"loss": 0.48094457387924194,
"memory(GiB)": 74.33,
"step": 208,
"token_acc": 0.8517110266159695,
"train_speed(iter/s)": 0.022774
},
{
"epoch": 0.16905965621840244,
"grad_norm": 3.07913875579834,
"learning_rate": 9.618757917633724e-06,
"loss": 0.4644262492656708,
"memory(GiB)": 74.33,
"step": 209,
"token_acc": 0.8459016393442623,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1698685540950455,
"grad_norm": 3.3538849353790283,
"learning_rate": 9.6136213917081e-06,
"loss": 0.49402916431427,
"memory(GiB)": 74.33,
"step": 210,
"token_acc": 0.8023255813953488,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17067745197168857,
"grad_norm": 2.8253116607666016,
"learning_rate": 9.608451884639249e-06,
"loss": 0.5242215394973755,
"memory(GiB)": 74.33,
"step": 211,
"token_acc": 0.8426966292134831,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17148634984833164,
"grad_norm": 3.1118881702423096,
"learning_rate": 9.603249433382145e-06,
"loss": 0.4387696385383606,
"memory(GiB)": 74.33,
"step": 212,
"token_acc": 0.8384279475982532,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.17229524772497473,
"grad_norm": 3.0564656257629395,
"learning_rate": 9.598014075127267e-06,
"loss": 0.4570474922657013,
"memory(GiB)": 74.33,
"step": 213,
"token_acc": 0.8423423423423423,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1731041456016178,
"grad_norm": 2.173403024673462,
"learning_rate": 9.592745847300334e-06,
"loss": 0.4705919027328491,
"memory(GiB)": 74.33,
"step": 214,
"token_acc": 0.8900343642611683,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.17391304347826086,
"grad_norm": 2.676457405090332,
"learning_rate": 9.587444787562038e-06,
"loss": 0.4593808650970459,
"memory(GiB)": 74.33,
"step": 215,
"token_acc": 0.8425655976676385,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.17472194135490393,
"grad_norm": 2.6276440620422363,
"learning_rate": 9.582110933807778e-06,
"loss": 0.5120923519134521,
"memory(GiB)": 74.33,
"step": 216,
"token_acc": 0.8402555910543131,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17553083923154703,
"grad_norm": 2.9223127365112305,
"learning_rate": 9.57674432416738e-06,
"loss": 0.5409821271896362,
"memory(GiB)": 74.33,
"step": 217,
"token_acc": 0.8786885245901639,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1763397371081901,
"grad_norm": 2.7943737506866455,
"learning_rate": 9.571344997004833e-06,
"loss": 0.5195801854133606,
"memory(GiB)": 74.33,
"step": 218,
"token_acc": 0.8904761904761904,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17714863498483316,
"grad_norm": 3.1022114753723145,
"learning_rate": 9.565912990918014e-06,
"loss": 0.5200923085212708,
"memory(GiB)": 74.33,
"step": 219,
"token_acc": 0.8181818181818182,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17795753286147623,
"grad_norm": 2.570176124572754,
"learning_rate": 9.560448344738409e-06,
"loss": 0.5091375112533569,
"memory(GiB)": 74.33,
"step": 220,
"token_acc": 0.823045267489712,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.17876643073811932,
"grad_norm": 3.0033743381500244,
"learning_rate": 9.554951097530833e-06,
"loss": 0.4781090021133423,
"memory(GiB)": 74.33,
"step": 221,
"token_acc": 0.8544061302681992,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1795753286147624,
"grad_norm": 2.6318182945251465,
"learning_rate": 9.549421288593157e-06,
"loss": 0.4314906597137451,
"memory(GiB)": 74.33,
"step": 222,
"token_acc": 0.8851063829787233,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.18038422649140545,
"grad_norm": 2.8283627033233643,
"learning_rate": 9.543858957456027e-06,
"loss": 0.5246187448501587,
"memory(GiB)": 74.33,
"step": 223,
"token_acc": 0.8140495867768595,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.18119312436804855,
"grad_norm": 2.760436773300171,
"learning_rate": 9.538264143882573e-06,
"loss": 0.583112359046936,
"memory(GiB)": 74.33,
"step": 224,
"token_acc": 0.8316831683168316,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.1820020222446916,
"grad_norm": 2.844444513320923,
"learning_rate": 9.532636887868132e-06,
"loss": 0.5270188450813293,
"memory(GiB)": 74.33,
"step": 225,
"token_acc": 0.8197424892703863,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.18281092012133468,
"grad_norm": 3.431413173675537,
"learning_rate": 9.526977229639967e-06,
"loss": 0.6098812818527222,
"memory(GiB)": 74.33,
"step": 226,
"token_acc": 0.8528138528138528,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.18361981799797775,
"grad_norm": 3.651771068572998,
"learning_rate": 9.521285209656964e-06,
"loss": 0.5220578908920288,
"memory(GiB)": 74.33,
"step": 227,
"token_acc": 0.8111888111888111,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.18442871587462084,
"grad_norm": 2.586838960647583,
"learning_rate": 9.515560868609353e-06,
"loss": 0.5361062288284302,
"memory(GiB)": 74.33,
"step": 228,
"token_acc": 0.8318181818181818,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.1852376137512639,
"grad_norm": 3.409284830093384,
"learning_rate": 9.509804247418421e-06,
"loss": 0.5047948360443115,
"memory(GiB)": 74.33,
"step": 229,
"token_acc": 0.83,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.18604651162790697,
"grad_norm": 2.8747854232788086,
"learning_rate": 9.504015387236215e-06,
"loss": 0.4199560880661011,
"memory(GiB)": 74.33,
"step": 230,
"token_acc": 0.8304347826086956,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.18685540950455004,
"grad_norm": 3.537949800491333,
"learning_rate": 9.498194329445235e-06,
"loss": 0.48431631922721863,
"memory(GiB)": 74.33,
"step": 231,
"token_acc": 0.8588957055214724,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.18766430738119314,
"grad_norm": 2.270864486694336,
"learning_rate": 9.492341115658167e-06,
"loss": 0.43944597244262695,
"memory(GiB)": 74.33,
"step": 232,
"token_acc": 0.8387096774193549,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1884732052578362,
"grad_norm": 2.3423984050750732,
"learning_rate": 9.486455787717556e-06,
"loss": 0.4949726462364197,
"memory(GiB)": 74.33,
"step": 233,
"token_acc": 0.8244274809160306,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.18928210313447927,
"grad_norm": 2.186225175857544,
"learning_rate": 9.480538387695526e-06,
"loss": 0.5247252583503723,
"memory(GiB)": 74.33,
"step": 234,
"token_acc": 0.8256578947368421,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.19009100101112233,
"grad_norm": 6.916714191436768,
"learning_rate": 9.474588957893471e-06,
"loss": 0.5562118291854858,
"memory(GiB)": 74.33,
"step": 235,
"token_acc": 0.815668202764977,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.19089989888776543,
"grad_norm": 2.669564962387085,
"learning_rate": 9.468607540841755e-06,
"loss": 0.4648740589618683,
"memory(GiB)": 74.33,
"step": 236,
"token_acc": 0.8404255319148937,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.1917087967644085,
"grad_norm": 2.7446367740631104,
"learning_rate": 9.462594179299408e-06,
"loss": 0.47179466485977173,
"memory(GiB)": 74.33,
"step": 237,
"token_acc": 0.8296943231441049,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.19251769464105156,
"grad_norm": 2.733185052871704,
"learning_rate": 9.456548916253816e-06,
"loss": 0.43457281589508057,
"memory(GiB)": 74.33,
"step": 238,
"token_acc": 0.8382838283828383,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19332659251769463,
"grad_norm": 2.792586326599121,
"learning_rate": 9.450471794920425e-06,
"loss": 0.5208027362823486,
"memory(GiB)": 74.33,
"step": 239,
"token_acc": 0.8494623655913979,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19413549039433772,
"grad_norm": 3.106788158416748,
"learning_rate": 9.444362858742417e-06,
"loss": 0.5069155693054199,
"memory(GiB)": 74.33,
"step": 240,
"token_acc": 0.821917808219178,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.1949443882709808,
"grad_norm": 2.545304298400879,
"learning_rate": 9.438222151390413e-06,
"loss": 0.48083266615867615,
"memory(GiB)": 74.33,
"step": 241,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19575328614762386,
"grad_norm": 2.3545124530792236,
"learning_rate": 9.432049716762151e-06,
"loss": 0.45232367515563965,
"memory(GiB)": 74.33,
"step": 242,
"token_acc": 0.8584070796460177,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19656218402426692,
"grad_norm": 2.424670934677124,
"learning_rate": 9.425845598982178e-06,
"loss": 0.46154850721359253,
"memory(GiB)": 74.33,
"step": 243,
"token_acc": 0.8481848184818482,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19737108190091002,
"grad_norm": 3.0621895790100098,
"learning_rate": 9.419609842401529e-06,
"loss": 0.5216662883758545,
"memory(GiB)": 74.33,
"step": 244,
"token_acc": 0.8381742738589212,
"train_speed(iter/s)": 0.022775
},
{
"epoch": 0.19817997977755308,
"grad_norm": 3.4800291061401367,
"learning_rate": 9.41334249159742e-06,
"loss": 0.578390896320343,
"memory(GiB)": 74.33,
"step": 245,
"token_acc": 0.8411214953271028,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19898887765419615,
"grad_norm": 2.887791633605957,
"learning_rate": 9.407043591372917e-06,
"loss": 0.45752766728401184,
"memory(GiB)": 74.33,
"step": 246,
"token_acc": 0.8452830188679246,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.19979777553083924,
"grad_norm": 2.991569995880127,
"learning_rate": 9.400713186756625e-06,
"loss": 0.47424283623695374,
"memory(GiB)": 74.33,
"step": 247,
"token_acc": 0.8492063492063492,
"train_speed(iter/s)": 0.022776
},
{
"epoch": 0.2006066734074823,
"grad_norm": 2.222763776779175,
"learning_rate": 9.394351323002362e-06,
"loss": 0.4558030366897583,
"memory(GiB)": 74.33,
"step": 248,
"token_acc": 0.8471615720524017,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20141557128412538,
"grad_norm": 2.18121075630188,
"learning_rate": 9.387958045588837e-06,
"loss": 0.47976818680763245,
"memory(GiB)": 74.33,
"step": 249,
"token_acc": 0.8878048780487805,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20222446916076844,
"grad_norm": 2.4463536739349365,
"learning_rate": 9.381533400219319e-06,
"loss": 0.42482298612594604,
"memory(GiB)": 74.33,
"step": 250,
"token_acc": 0.8661971830985915,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20303336703741154,
"grad_norm": 2.2221012115478516,
"learning_rate": 9.375077432821322e-06,
"loss": 0.4842270016670227,
"memory(GiB)": 74.33,
"step": 251,
"token_acc": 0.8290909090909091,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.2038422649140546,
"grad_norm": 2.4321460723876953,
"learning_rate": 9.368590189546268e-06,
"loss": 0.49549242854118347,
"memory(GiB)": 74.33,
"step": 252,
"token_acc": 0.8470948012232415,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20465116279069767,
"grad_norm": 2.9055986404418945,
"learning_rate": 9.362071716769158e-06,
"loss": 0.604824423789978,
"memory(GiB)": 74.33,
"step": 253,
"token_acc": 0.8354430379746836,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20546006066734074,
"grad_norm": 2.3008358478546143,
"learning_rate": 9.355522061088242e-06,
"loss": 0.43147778511047363,
"memory(GiB)": 74.33,
"step": 254,
"token_acc": 0.8907563025210085,
"train_speed(iter/s)": 0.022777
},
{
"epoch": 0.20626895854398383,
"grad_norm": 2.770148515701294,
"learning_rate": 9.348941269324686e-06,
"loss": 0.4882833957672119,
"memory(GiB)": 74.33,
"step": 255,
"token_acc": 0.8423423423423423,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.2070778564206269,
"grad_norm": 3.3866539001464844,
"learning_rate": 9.342329388522239e-06,
"loss": 0.5174039006233215,
"memory(GiB)": 74.33,
"step": 256,
"token_acc": 0.825503355704698,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.20788675429726997,
"grad_norm": 3.170250654220581,
"learning_rate": 9.335686465946888e-06,
"loss": 0.5126312971115112,
"memory(GiB)": 74.33,
"step": 257,
"token_acc": 0.8125,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.20869565217391303,
"grad_norm": 2.1758675575256348,
"learning_rate": 9.32901254908653e-06,
"loss": 0.3875027298927307,
"memory(GiB)": 74.33,
"step": 258,
"token_acc": 0.7954545454545454,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.20950455005055613,
"grad_norm": 2.4927093982696533,
"learning_rate": 9.322307685650638e-06,
"loss": 0.4708499312400818,
"memory(GiB)": 74.33,
"step": 259,
"token_acc": 0.8743718592964824,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.2103134479271992,
"grad_norm": 3.2660865783691406,
"learning_rate": 9.315571923569892e-06,
"loss": 0.48012182116508484,
"memory(GiB)": 74.33,
"step": 260,
"token_acc": 0.8479087452471483,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.21112234580384226,
"grad_norm": 2.607844829559326,
"learning_rate": 9.308805310995877e-06,
"loss": 0.4679752588272095,
"memory(GiB)": 74.33,
"step": 261,
"token_acc": 0.813953488372093,
"train_speed(iter/s)": 0.022778
},
{
"epoch": 0.21193124368048533,
"grad_norm": 2.9813013076782227,
"learning_rate": 9.302007896300697e-06,
"loss": 0.47132837772369385,
"memory(GiB)": 74.33,
"step": 262,
"token_acc": 0.8687258687258688,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.21274014155712842,
"grad_norm": 2.997264862060547,
"learning_rate": 9.295179728076666e-06,
"loss": 0.47330912947654724,
"memory(GiB)": 74.33,
"step": 263,
"token_acc": 0.8465608465608465,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.2135490394337715,
"grad_norm": 2.7569003105163574,
"learning_rate": 9.288320855135936e-06,
"loss": 0.5202451348304749,
"memory(GiB)": 74.33,
"step": 264,
"token_acc": 0.8395061728395061,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.21435793731041455,
"grad_norm": 3.455897569656372,
"learning_rate": 9.281431326510153e-06,
"loss": 0.5138571262359619,
"memory(GiB)": 74.33,
"step": 265,
"token_acc": 0.8263888888888888,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.21516683518705762,
"grad_norm": 2.402111291885376,
"learning_rate": 9.27451119145012e-06,
"loss": 0.4587266147136688,
"memory(GiB)": 74.33,
"step": 266,
"token_acc": 0.8116591928251121,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.21597573306370071,
"grad_norm": 2.7626912593841553,
"learning_rate": 9.267560499425425e-06,
"loss": 0.5164949893951416,
"memory(GiB)": 74.33,
"step": 267,
"token_acc": 0.845771144278607,
"train_speed(iter/s)": 0.022779
},
{
"epoch": 0.21678463094034378,
"grad_norm": 2.1381757259368896,
"learning_rate": 9.2605793001241e-06,
"loss": 0.47523602843284607,
"memory(GiB)": 74.33,
"step": 268,
"token_acc": 0.8202247191011236,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.21759352881698685,
"grad_norm": 3.386496067047119,
"learning_rate": 9.253567643452263e-06,
"loss": 0.5109878778457642,
"memory(GiB)": 74.33,
"step": 269,
"token_acc": 0.8279569892473119,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.21840242669362994,
"grad_norm": 3.036259889602661,
"learning_rate": 9.246525579533765e-06,
"loss": 0.47165533900260925,
"memory(GiB)": 74.33,
"step": 270,
"token_acc": 0.8557046979865772,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.219211324570273,
"grad_norm": 2.2953364849090576,
"learning_rate": 9.239453158709822e-06,
"loss": 0.452242374420166,
"memory(GiB)": 74.33,
"step": 271,
"token_acc": 0.9050445103857567,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.22002022244691607,
"grad_norm": 3.2290663719177246,
"learning_rate": 9.232350431538656e-06,
"loss": 0.5369592905044556,
"memory(GiB)": 74.33,
"step": 272,
"token_acc": 0.8627450980392157,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.22082912032355914,
"grad_norm": 2.628915786743164,
"learning_rate": 9.225217448795155e-06,
"loss": 0.46493035554885864,
"memory(GiB)": 74.33,
"step": 273,
"token_acc": 0.8185483870967742,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.22163801820020224,
"grad_norm": 2.308983325958252,
"learning_rate": 9.218054261470477e-06,
"loss": 0.462538480758667,
"memory(GiB)": 74.33,
"step": 274,
"token_acc": 0.8456375838926175,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.2224469160768453,
"grad_norm": 3.000230550765991,
"learning_rate": 9.210860920771706e-06,
"loss": 0.43489784002304077,
"memory(GiB)": 74.33,
"step": 275,
"token_acc": 0.842741935483871,
"train_speed(iter/s)": 0.02278
},
{
"epoch": 0.22325581395348837,
"grad_norm": 2.6025278568267822,
"learning_rate": 9.203637478121492e-06,
"loss": 0.46363720297813416,
"memory(GiB)": 74.33,
"step": 276,
"token_acc": 0.8724489795918368,
"train_speed(iter/s)": 0.022781
},
{
"epoch": 0.22406471183013144,
"grad_norm": 3.2257838249206543,
"learning_rate": 9.196383985157657e-06,
"loss": 0.46590667963027954,
"memory(GiB)": 74.33,
"step": 277,
"token_acc": 0.8736842105263158,
"train_speed(iter/s)": 0.022781
},
{
"epoch": 0.22487360970677453,
"grad_norm": 2.476445436477661,
"learning_rate": 9.189100493732852e-06,
"loss": 0.4720000624656677,
"memory(GiB)": 74.33,
"step": 278,
"token_acc": 0.8990825688073395,
"train_speed(iter/s)": 0.022781
},
{
"epoch": 0.2256825075834176,
"grad_norm": 1.9399663209915161,
"learning_rate": 9.181787055914175e-06,
"loss": 0.43296879529953003,
"memory(GiB)": 74.33,
"step": 279,
"token_acc": 0.8297872340425532,
"train_speed(iter/s)": 0.022782
},
{
"epoch": 0.22649140546006066,
"grad_norm": 2.530008554458618,
"learning_rate": 9.1744437239828e-06,
"loss": 0.43587636947631836,
"memory(GiB)": 74.33,
"step": 280,
"token_acc": 0.8951612903225806,
"train_speed(iter/s)": 0.022782
},
{
"epoch": 0.22730030333670373,
"grad_norm": 2.7868869304656982,
"learning_rate": 9.167070550433604e-06,
"loss": 0.3868146538734436,
"memory(GiB)": 74.33,
"step": 281,
"token_acc": 0.8425925925925926,
"train_speed(iter/s)": 0.022783
},
{
"epoch": 0.22810920121334682,
"grad_norm": 2.6715898513793945,
"learning_rate": 9.159667587974786e-06,
"loss": 0.40206730365753174,
"memory(GiB)": 74.33,
"step": 282,
"token_acc": 0.8894736842105263,
"train_speed(iter/s)": 0.022783
},
{
"epoch": 0.2289180990899899,
"grad_norm": 2.36309552192688,
"learning_rate": 9.1522348895275e-06,
"loss": 0.5806437730789185,
"memory(GiB)": 74.33,
"step": 283,
"token_acc": 0.7923728813559322,
"train_speed(iter/s)": 0.022783
},
{
"epoch": 0.22972699696663296,
"grad_norm": 2.1452529430389404,
"learning_rate": 9.144772508225477e-06,
"loss": 0.4016059339046478,
"memory(GiB)": 74.33,
"step": 284,
"token_acc": 0.872093023255814,
"train_speed(iter/s)": 0.022783
},
{
"epoch": 0.23053589484327602,
"grad_norm": 2.564225196838379,
"learning_rate": 9.137280497414628e-06,
"loss": 0.3909257650375366,
"memory(GiB)": 74.33,
"step": 285,
"token_acc": 0.8805460750853242,
"train_speed(iter/s)": 0.022783
},
{
"epoch": 0.23134479271991912,
"grad_norm": 2.211818218231201,
"learning_rate": 9.129758910652684e-06,
"loss": 0.4310418963432312,
"memory(GiB)": 74.33,
"step": 286,
"token_acc": 0.8644859813084113,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.23215369059656218,
"grad_norm": 3.1847712993621826,
"learning_rate": 9.122207801708802e-06,
"loss": 0.43590471148490906,
"memory(GiB)": 74.33,
"step": 287,
"token_acc": 0.864,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.23296258847320525,
"grad_norm": 2.477933406829834,
"learning_rate": 9.114627224563182e-06,
"loss": 0.4442121386528015,
"memory(GiB)": 74.33,
"step": 288,
"token_acc": 0.884,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.23377148634984835,
"grad_norm": 3.274622678756714,
"learning_rate": 9.10701723340668e-06,
"loss": 0.47166556119918823,
"memory(GiB)": 74.33,
"step": 289,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.2345803842264914,
"grad_norm": 3.145052671432495,
"learning_rate": 9.099377882640425e-06,
"loss": 0.46739423274993896,
"memory(GiB)": 74.33,
"step": 290,
"token_acc": 0.8502202643171806,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.23538928210313448,
"grad_norm": 2.3364012241363525,
"learning_rate": 9.09170922687543e-06,
"loss": 0.4193730354309082,
"memory(GiB)": 74.33,
"step": 291,
"token_acc": 0.8828451882845189,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.23619817997977754,
"grad_norm": 2.827242612838745,
"learning_rate": 9.08401132093219e-06,
"loss": 0.5026365518569946,
"memory(GiB)": 74.33,
"step": 292,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.23700707785642064,
"grad_norm": 3.1282265186309814,
"learning_rate": 9.076284219840306e-06,
"loss": 0.46792399883270264,
"memory(GiB)": 74.33,
"step": 293,
"token_acc": 0.8814814814814815,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.2378159757330637,
"grad_norm": 2.6595497131347656,
"learning_rate": 9.068527978838086e-06,
"loss": 0.48813870549201965,
"memory(GiB)": 74.33,
"step": 294,
"token_acc": 0.8664122137404581,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.23862487360970677,
"grad_norm": 2.2860071659088135,
"learning_rate": 9.060742653372143e-06,
"loss": 0.4249404966831207,
"memory(GiB)": 74.33,
"step": 295,
"token_acc": 0.815068493150685,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.23943377148634984,
"grad_norm": 2.8490703105926514,
"learning_rate": 9.052928299097013e-06,
"loss": 0.5840834975242615,
"memory(GiB)": 74.33,
"step": 296,
"token_acc": 0.8630705394190872,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24024266936299293,
"grad_norm": 2.5748631954193115,
"learning_rate": 9.045084971874738e-06,
"loss": 0.4933628439903259,
"memory(GiB)": 74.33,
"step": 297,
"token_acc": 0.8488372093023255,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.241051567239636,
"grad_norm": 2.2127761840820312,
"learning_rate": 9.037212727774486e-06,
"loss": 0.47793740034103394,
"memory(GiB)": 74.33,
"step": 298,
"token_acc": 0.8963730569948186,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.24186046511627907,
"grad_norm": 2.8014166355133057,
"learning_rate": 9.029311623072137e-06,
"loss": 0.4578291177749634,
"memory(GiB)": 74.33,
"step": 299,
"token_acc": 0.8131868131868132,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.24266936299292213,
"grad_norm": 2.5986998081207275,
"learning_rate": 9.021381714249888e-06,
"loss": 0.5257298350334167,
"memory(GiB)": 74.33,
"step": 300,
"token_acc": 0.8229166666666666,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.24347826086956523,
"grad_norm": 2.7166779041290283,
"learning_rate": 9.013423057995845e-06,
"loss": 0.5010583400726318,
"memory(GiB)": 74.33,
"step": 301,
"token_acc": 0.8590308370044053,
"train_speed(iter/s)": 0.022784
},
{
"epoch": 0.2442871587462083,
"grad_norm": 2.9347927570343018,
"learning_rate": 9.005435711203619e-06,
"loss": 0.4537706971168518,
"memory(GiB)": 74.33,
"step": 302,
"token_acc": 0.8659003831417624,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24509605662285136,
"grad_norm": 2.4154651165008545,
"learning_rate": 8.997419730971917e-06,
"loss": 0.39763540029525757,
"memory(GiB)": 74.33,
"step": 303,
"token_acc": 0.8690476190476191,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24590495449949443,
"grad_norm": 2.5024564266204834,
"learning_rate": 8.989375174604142e-06,
"loss": 0.5160707235336304,
"memory(GiB)": 74.33,
"step": 304,
"token_acc": 0.8614718614718615,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24671385237613752,
"grad_norm": 2.6469497680664062,
"learning_rate": 8.981302099607973e-06,
"loss": 0.4616546332836151,
"memory(GiB)": 74.33,
"step": 305,
"token_acc": 0.8442028985507246,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.2475227502527806,
"grad_norm": 2.6130266189575195,
"learning_rate": 8.973200563694964e-06,
"loss": 0.42548537254333496,
"memory(GiB)": 74.33,
"step": 306,
"token_acc": 0.852589641434263,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24833164812942365,
"grad_norm": 2.578451156616211,
"learning_rate": 8.965070624780117e-06,
"loss": 0.48335641622543335,
"memory(GiB)": 74.33,
"step": 307,
"token_acc": 0.846441947565543,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24914054600606672,
"grad_norm": 2.4299726486206055,
"learning_rate": 8.956912340981485e-06,
"loss": 0.4736361801624298,
"memory(GiB)": 74.33,
"step": 308,
"token_acc": 0.8448979591836735,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.24994944388270982,
"grad_norm": 2.816293239593506,
"learning_rate": 8.948725770619745e-06,
"loss": 0.503253698348999,
"memory(GiB)": 74.33,
"step": 309,
"token_acc": 0.8466453674121406,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.25075834175935285,
"grad_norm": 2.6718838214874268,
"learning_rate": 8.940510972217785e-06,
"loss": 0.43048620223999023,
"memory(GiB)": 74.33,
"step": 310,
"token_acc": 0.8262295081967214,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.251567239635996,
"grad_norm": 2.4307098388671875,
"learning_rate": 8.932268004500288e-06,
"loss": 0.51353919506073,
"memory(GiB)": 74.33,
"step": 311,
"token_acc": 0.8412017167381974,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.25237613751263904,
"grad_norm": 2.6662516593933105,
"learning_rate": 8.923996926393306e-06,
"loss": 0.4586646556854248,
"memory(GiB)": 74.33,
"step": 312,
"token_acc": 0.8692579505300353,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2531850353892821,
"grad_norm": 3.027970790863037,
"learning_rate": 8.915697797023841e-06,
"loss": 0.5299907326698303,
"memory(GiB)": 74.33,
"step": 313,
"token_acc": 0.8582089552238806,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2539939332659252,
"grad_norm": 2.6045422554016113,
"learning_rate": 8.907370675719428e-06,
"loss": 0.5199022889137268,
"memory(GiB)": 74.33,
"step": 314,
"token_acc": 0.8116883116883117,
"train_speed(iter/s)": 0.022785
},
{
"epoch": 0.25480283114256824,
"grad_norm": 2.7272956371307373,
"learning_rate": 8.899015622007703e-06,
"loss": 0.45891785621643066,
"memory(GiB)": 74.33,
"step": 315,
"token_acc": 0.8243243243243243,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2556117290192113,
"grad_norm": 2.200077533721924,
"learning_rate": 8.890632695615984e-06,
"loss": 0.39891767501831055,
"memory(GiB)": 74.33,
"step": 316,
"token_acc": 0.8440677966101695,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2564206268958544,
"grad_norm": 2.301032543182373,
"learning_rate": 8.882221956470838e-06,
"loss": 0.4599316716194153,
"memory(GiB)": 74.33,
"step": 317,
"token_acc": 0.8325358851674641,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2572295247724975,
"grad_norm": 2.614656448364258,
"learning_rate": 8.873783464697653e-06,
"loss": 0.459076464176178,
"memory(GiB)": 74.33,
"step": 318,
"token_acc": 0.8393939393939394,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.25803842264914056,
"grad_norm": 2.1406943798065186,
"learning_rate": 8.865317280620221e-06,
"loss": 0.39890217781066895,
"memory(GiB)": 74.33,
"step": 319,
"token_acc": 0.8304347826086956,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.25884732052578363,
"grad_norm": 2.5298852920532227,
"learning_rate": 8.856823464760284e-06,
"loss": 0.4256265163421631,
"memory(GiB)": 74.33,
"step": 320,
"token_acc": 0.8717948717948718,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2596562184024267,
"grad_norm": 2.3466522693634033,
"learning_rate": 8.84830207783712e-06,
"loss": 0.395018070936203,
"memory(GiB)": 74.33,
"step": 321,
"token_acc": 0.8884462151394422,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.26046511627906976,
"grad_norm": 2.6752617359161377,
"learning_rate": 8.839753180767108e-06,
"loss": 0.4618658423423767,
"memory(GiB)": 74.33,
"step": 322,
"token_acc": 0.8387096774193549,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.26127401415571283,
"grad_norm": 2.248332977294922,
"learning_rate": 8.831176834663275e-06,
"loss": 0.4209662675857544,
"memory(GiB)": 74.33,
"step": 323,
"token_acc": 0.8830645161290323,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2620829120323559,
"grad_norm": 2.6968088150024414,
"learning_rate": 8.82257310083488e-06,
"loss": 0.4762377440929413,
"memory(GiB)": 74.33,
"step": 324,
"token_acc": 0.8810572687224669,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.26289180990899896,
"grad_norm": 3.221013307571411,
"learning_rate": 8.813942040786964e-06,
"loss": 0.5154784917831421,
"memory(GiB)": 74.33,
"step": 325,
"token_acc": 0.8494208494208494,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2637007077856421,
"grad_norm": 1.9791827201843262,
"learning_rate": 8.805283716219917e-06,
"loss": 0.47922518849372864,
"memory(GiB)": 74.33,
"step": 326,
"token_acc": 0.8412698412698413,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.26450960566228515,
"grad_norm": 1.939926266670227,
"learning_rate": 8.79659818902903e-06,
"loss": 0.4087769389152527,
"memory(GiB)": 74.33,
"step": 327,
"token_acc": 0.8360655737704918,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2653185035389282,
"grad_norm": 2.3445236682891846,
"learning_rate": 8.787885521304056e-06,
"loss": 0.49197518825531006,
"memory(GiB)": 74.33,
"step": 328,
"token_acc": 0.8293413173652695,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2661274014155713,
"grad_norm": 2.549042224884033,
"learning_rate": 8.779145775328766e-06,
"loss": 0.4610610604286194,
"memory(GiB)": 74.33,
"step": 329,
"token_acc": 0.8407960199004975,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.26693629929221435,
"grad_norm": 7.023351192474365,
"learning_rate": 8.770379013580507e-06,
"loss": 0.5349440574645996,
"memory(GiB)": 74.33,
"step": 330,
"token_acc": 0.8619246861924686,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2677451971688574,
"grad_norm": 3.5521559715270996,
"learning_rate": 8.761585298729748e-06,
"loss": 0.46497541666030884,
"memory(GiB)": 74.33,
"step": 331,
"token_acc": 0.8870292887029289,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2685540950455005,
"grad_norm": 2.684696674346924,
"learning_rate": 8.75276469363964e-06,
"loss": 0.4779859781265259,
"memory(GiB)": 74.33,
"step": 332,
"token_acc": 0.837696335078534,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2693629929221436,
"grad_norm": 2.123192310333252,
"learning_rate": 8.743917261365557e-06,
"loss": 0.43780291080474854,
"memory(GiB)": 74.33,
"step": 333,
"token_acc": 0.8692307692307693,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2701718907987867,
"grad_norm": 2.416212558746338,
"learning_rate": 8.73504306515466e-06,
"loss": 0.43149372935295105,
"memory(GiB)": 74.33,
"step": 334,
"token_acc": 0.85,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.27098078867542974,
"grad_norm": 2.407726764678955,
"learning_rate": 8.726142168445427e-06,
"loss": 0.46393710374832153,
"memory(GiB)": 74.33,
"step": 335,
"token_acc": 0.8478260869565217,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2717896865520728,
"grad_norm": 2.2603883743286133,
"learning_rate": 8.717214634867213e-06,
"loss": 0.4834635555744171,
"memory(GiB)": 74.33,
"step": 336,
"token_acc": 0.8544303797468354,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.2725985844287159,
"grad_norm": 2.377035140991211,
"learning_rate": 8.708260528239788e-06,
"loss": 0.4176112711429596,
"memory(GiB)": 74.33,
"step": 337,
"token_acc": 0.8802083333333334,
"train_speed(iter/s)": 0.022786
},
{
"epoch": 0.27340748230535894,
"grad_norm": 2.855900526046753,
"learning_rate": 8.699279912572888e-06,
"loss": 0.4877198338508606,
"memory(GiB)": 74.33,
"step": 338,
"token_acc": 0.8592964824120602,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.274216380182002,
"grad_norm": 3.3495020866394043,
"learning_rate": 8.690272852065748e-06,
"loss": 0.44448497891426086,
"memory(GiB)": 74.33,
"step": 339,
"token_acc": 0.8760683760683761,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2750252780586451,
"grad_norm": 2.204909563064575,
"learning_rate": 8.68123941110665e-06,
"loss": 0.47281521558761597,
"memory(GiB)": 74.33,
"step": 340,
"token_acc": 0.8225108225108225,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2758341759352882,
"grad_norm": 2.295105218887329,
"learning_rate": 8.67217965427246e-06,
"loss": 0.42724931240081787,
"memory(GiB)": 74.33,
"step": 341,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.27664307381193126,
"grad_norm": 3.001664876937866,
"learning_rate": 8.663093646328166e-06,
"loss": 0.5214186310768127,
"memory(GiB)": 74.33,
"step": 342,
"token_acc": 0.8205128205128205,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.27745197168857433,
"grad_norm": 2.665395736694336,
"learning_rate": 8.653981452226418e-06,
"loss": 0.43387383222579956,
"memory(GiB)": 74.33,
"step": 343,
"token_acc": 0.908256880733945,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2782608695652174,
"grad_norm": 2.3217623233795166,
"learning_rate": 8.644843137107058e-06,
"loss": 0.5246144533157349,
"memory(GiB)": 74.33,
"step": 344,
"token_acc": 0.825925925925926,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.27906976744186046,
"grad_norm": 2.4558563232421875,
"learning_rate": 8.635678766296663e-06,
"loss": 0.48798543214797974,
"memory(GiB)": 74.33,
"step": 345,
"token_acc": 0.848780487804878,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.27987866531850353,
"grad_norm": 2.1867096424102783,
"learning_rate": 8.626488405308067e-06,
"loss": 0.5087660551071167,
"memory(GiB)": 74.33,
"step": 346,
"token_acc": 0.8311688311688312,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2806875631951466,
"grad_norm": 2.2217187881469727,
"learning_rate": 8.617272119839903e-06,
"loss": 0.43445640802383423,
"memory(GiB)": 74.33,
"step": 347,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.28149646107178966,
"grad_norm": 2.6297953128814697,
"learning_rate": 8.608029975776128e-06,
"loss": 0.4504978656768799,
"memory(GiB)": 74.33,
"step": 348,
"token_acc": 0.8523809523809524,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.2823053589484328,
"grad_norm": 3.717496156692505,
"learning_rate": 8.598762039185553e-06,
"loss": 0.45087772607803345,
"memory(GiB)": 74.33,
"step": 349,
"token_acc": 0.8565400843881856,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.28311425682507585,
"grad_norm": 2.353040933609009,
"learning_rate": 8.589468376321369e-06,
"loss": 0.4105454683303833,
"memory(GiB)": 74.33,
"step": 350,
"token_acc": 0.8566775244299675,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.2839231547017189,
"grad_norm": 2.3427672386169434,
"learning_rate": 8.580149053620674e-06,
"loss": 0.5255011320114136,
"memory(GiB)": 74.33,
"step": 351,
"token_acc": 0.8346456692913385,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.284732052578362,
"grad_norm": 2.3275554180145264,
"learning_rate": 8.570804137704005e-06,
"loss": 0.443267822265625,
"memory(GiB)": 74.33,
"step": 352,
"token_acc": 0.8314176245210728,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.28554095045500505,
"grad_norm": 2.162351608276367,
"learning_rate": 8.561433695374848e-06,
"loss": 0.4688035249710083,
"memory(GiB)": 74.33,
"step": 353,
"token_acc": 0.8375451263537906,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2863498483316481,
"grad_norm": 2.127072811126709,
"learning_rate": 8.552037793619177e-06,
"loss": 0.488004207611084,
"memory(GiB)": 74.33,
"step": 354,
"token_acc": 0.8119266055045872,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2871587462082912,
"grad_norm": 2.731759786605835,
"learning_rate": 8.542616499604958e-06,
"loss": 0.4488160312175751,
"memory(GiB)": 74.33,
"step": 355,
"token_acc": 0.8196078431372549,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.2879676440849343,
"grad_norm": 2.025136709213257,
"learning_rate": 8.533169880681682e-06,
"loss": 0.3923991024494171,
"memory(GiB)": 74.33,
"step": 356,
"token_acc": 0.8362989323843416,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.28877654196157737,
"grad_norm": 2.501194477081299,
"learning_rate": 8.523698004379878e-06,
"loss": 0.46766936779022217,
"memory(GiB)": 74.33,
"step": 357,
"token_acc": 0.8260869565217391,
"train_speed(iter/s)": 0.022787
},
{
"epoch": 0.28958543983822044,
"grad_norm": 2.192864179611206,
"learning_rate": 8.514200938410628e-06,
"loss": 0.48559021949768066,
"memory(GiB)": 74.33,
"step": 358,
"token_acc": 0.86328125,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.2903943377148635,
"grad_norm": 2.9228947162628174,
"learning_rate": 8.504678750665094e-06,
"loss": 0.5047175288200378,
"memory(GiB)": 74.33,
"step": 359,
"token_acc": 0.8647540983606558,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.29120323559150657,
"grad_norm": 2.388331174850464,
"learning_rate": 8.495131509214015e-06,
"loss": 0.4464142620563507,
"memory(GiB)": 74.33,
"step": 360,
"token_acc": 0.8411552346570397,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.29201213346814964,
"grad_norm": 3.4440038204193115,
"learning_rate": 8.485559282307237e-06,
"loss": 0.44610536098480225,
"memory(GiB)": 74.33,
"step": 361,
"token_acc": 0.892018779342723,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.2928210313447927,
"grad_norm": 2.4162344932556152,
"learning_rate": 8.475962138373212e-06,
"loss": 0.43880611658096313,
"memory(GiB)": 74.33,
"step": 362,
"token_acc": 0.8632478632478633,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.29362992922143577,
"grad_norm": 2.4398529529571533,
"learning_rate": 8.466340146018522e-06,
"loss": 0.4168269634246826,
"memory(GiB)": 74.33,
"step": 363,
"token_acc": 0.8543307086614174,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.2944388270980789,
"grad_norm": 2.5178182125091553,
"learning_rate": 8.456693374027378e-06,
"loss": 0.4725669026374817,
"memory(GiB)": 74.33,
"step": 364,
"token_acc": 0.8543689320388349,
"train_speed(iter/s)": 0.022788
},
{
"epoch": 0.29524772497472196,
"grad_norm": 2.5267229080200195,
"learning_rate": 8.44702189136113e-06,
"loss": 0.5213101506233215,
"memory(GiB)": 74.33,
"step": 365,
"token_acc": 0.839344262295082,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.296056622851365,
"grad_norm": 2.3971071243286133,
"learning_rate": 8.43732576715778e-06,
"loss": 0.4878075122833252,
"memory(GiB)": 74.33,
"step": 366,
"token_acc": 0.8620689655172413,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.2968655207280081,
"grad_norm": 3.86580753326416,
"learning_rate": 8.427605070731482e-06,
"loss": 0.38472825288772583,
"memory(GiB)": 74.33,
"step": 367,
"token_acc": 0.8538461538461538,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.29767441860465116,
"grad_norm": 2.5940558910369873,
"learning_rate": 8.417859871572045e-06,
"loss": 0.5018994808197021,
"memory(GiB)": 74.33,
"step": 368,
"token_acc": 0.8375796178343949,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.2984833164812942,
"grad_norm": 2.456550359725952,
"learning_rate": 8.408090239344442e-06,
"loss": 0.4518444240093231,
"memory(GiB)": 74.33,
"step": 369,
"token_acc": 0.8458149779735683,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.2992922143579373,
"grad_norm": 3.4539546966552734,
"learning_rate": 8.39829624388831e-06,
"loss": 0.4444255828857422,
"memory(GiB)": 74.33,
"step": 370,
"token_acc": 0.8786407766990292,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.30010111223458036,
"grad_norm": 2.5049355030059814,
"learning_rate": 8.38847795521745e-06,
"loss": 0.4359434247016907,
"memory(GiB)": 74.33,
"step": 371,
"token_acc": 0.8415841584158416,
"train_speed(iter/s)": 0.022789
},
{
"epoch": 0.3009100101112235,
"grad_norm": 2.7211098670959473,
"learning_rate": 8.378635443519327e-06,
"loss": 0.4071110785007477,
"memory(GiB)": 74.33,
"step": 372,
"token_acc": 0.8516949152542372,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.30171890798786655,
"grad_norm": 2.0721325874328613,
"learning_rate": 8.368768779154564e-06,
"loss": 0.449047714471817,
"memory(GiB)": 74.33,
"step": 373,
"token_acc": 0.8604651162790697,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.3025278058645096,
"grad_norm": 2.6694495677948,
"learning_rate": 8.358878032656446e-06,
"loss": 0.436679869890213,
"memory(GiB)": 74.33,
"step": 374,
"token_acc": 0.8672566371681416,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.3033367037411527,
"grad_norm": 2.6044750213623047,
"learning_rate": 8.348963274730413e-06,
"loss": 0.4522557556629181,
"memory(GiB)": 74.33,
"step": 375,
"token_acc": 0.8481675392670157,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.30414560161779575,
"grad_norm": 2.2683019638061523,
"learning_rate": 8.339024576253555e-06,
"loss": 0.3990349769592285,
"memory(GiB)": 74.33,
"step": 376,
"token_acc": 0.8393574297188755,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.3049544994944388,
"grad_norm": 2.6098105907440186,
"learning_rate": 8.3290620082741e-06,
"loss": 0.47003981471061707,
"memory(GiB)": 74.33,
"step": 377,
"token_acc": 0.8828125,
"train_speed(iter/s)": 0.02279
},
{
"epoch": 0.3057633973710819,
"grad_norm": 2.756648540496826,
"learning_rate": 8.319075642010914e-06,
"loss": 0.46801501512527466,
"memory(GiB)": 74.33,
"step": 378,
"token_acc": 0.8024691358024691,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.306572295247725,
"grad_norm": 2.435135841369629,
"learning_rate": 8.30906554885299e-06,
"loss": 0.45518428087234497,
"memory(GiB)": 74.33,
"step": 379,
"token_acc": 0.86328125,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.30738119312436807,
"grad_norm": 2.305549144744873,
"learning_rate": 8.299031800358933e-06,
"loss": 0.40630266070365906,
"memory(GiB)": 74.33,
"step": 380,
"token_acc": 0.8652173913043478,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.30819009100101113,
"grad_norm": 2.8813188076019287,
"learning_rate": 8.288974468256453e-06,
"loss": 0.5275822877883911,
"memory(GiB)": 74.33,
"step": 381,
"token_acc": 0.8652849740932642,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.3089989888776542,
"grad_norm": 2.2883760929107666,
"learning_rate": 8.278893624441849e-06,
"loss": 0.4657808542251587,
"memory(GiB)": 74.33,
"step": 382,
"token_acc": 0.8081632653061225,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.30980788675429727,
"grad_norm": 2.4337222576141357,
"learning_rate": 8.268789340979499e-06,
"loss": 0.4899158179759979,
"memory(GiB)": 74.33,
"step": 383,
"token_acc": 0.8776371308016878,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.31061678463094033,
"grad_norm": 2.359471082687378,
"learning_rate": 8.258661690101347e-06,
"loss": 0.4913978576660156,
"memory(GiB)": 74.33,
"step": 384,
"token_acc": 0.8454935622317596,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.3114256825075834,
"grad_norm": 2.946106433868408,
"learning_rate": 8.24851074420637e-06,
"loss": 0.3954363167285919,
"memory(GiB)": 74.33,
"step": 385,
"token_acc": 0.876984126984127,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.31223458038422647,
"grad_norm": 2.676274299621582,
"learning_rate": 8.238336575860085e-06,
"loss": 0.4366721212863922,
"memory(GiB)": 74.33,
"step": 386,
"token_acc": 0.8426229508196721,
"train_speed(iter/s)": 0.022791
},
{
"epoch": 0.3130434782608696,
"grad_norm": 2.2800793647766113,
"learning_rate": 8.228139257794012e-06,
"loss": 0.4242827892303467,
"memory(GiB)": 74.33,
"step": 387,
"token_acc": 0.8724137931034482,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.31385237613751266,
"grad_norm": 2.1262009143829346,
"learning_rate": 8.217918862905163e-06,
"loss": 0.44696488976478577,
"memory(GiB)": 74.33,
"step": 388,
"token_acc": 0.8759398496240601,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3146612740141557,
"grad_norm": 2.389130115509033,
"learning_rate": 8.207675464255519e-06,
"loss": 0.4506322741508484,
"memory(GiB)": 74.33,
"step": 389,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3154701718907988,
"grad_norm": 2.2962496280670166,
"learning_rate": 8.197409135071497e-06,
"loss": 0.416850209236145,
"memory(GiB)": 74.33,
"step": 390,
"token_acc": 0.8865248226950354,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.31627906976744186,
"grad_norm": 2.0682525634765625,
"learning_rate": 8.18711994874345e-06,
"loss": 0.423944354057312,
"memory(GiB)": 74.33,
"step": 391,
"token_acc": 0.8411552346570397,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3170879676440849,
"grad_norm": 2.43737530708313,
"learning_rate": 8.17680797882512e-06,
"loss": 0.44383469223976135,
"memory(GiB)": 74.33,
"step": 392,
"token_acc": 0.8977777777777778,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.317896865520728,
"grad_norm": 3.0157485008239746,
"learning_rate": 8.166473299033122e-06,
"loss": 0.4669773280620575,
"memory(GiB)": 74.33,
"step": 393,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.31870576339737106,
"grad_norm": 2.434302568435669,
"learning_rate": 8.15611598324642e-06,
"loss": 0.46818387508392334,
"memory(GiB)": 74.33,
"step": 394,
"token_acc": 0.7833333333333333,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3195146612740142,
"grad_norm": 2.063925266265869,
"learning_rate": 8.145736105505788e-06,
"loss": 0.45939022302627563,
"memory(GiB)": 74.33,
"step": 395,
"token_acc": 0.8424908424908425,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.32032355915065724,
"grad_norm": 2.5207791328430176,
"learning_rate": 8.135333740013294e-06,
"loss": 0.5139025449752808,
"memory(GiB)": 74.33,
"step": 396,
"token_acc": 0.8441176470588235,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3211324570273003,
"grad_norm": 2.687681198120117,
"learning_rate": 8.124908961131759e-06,
"loss": 0.4349074959754944,
"memory(GiB)": 74.33,
"step": 397,
"token_acc": 0.852017937219731,
"train_speed(iter/s)": 0.022792
},
{
"epoch": 0.3219413549039434,
"grad_norm": 2.1986069679260254,
"learning_rate": 8.114461843384229e-06,
"loss": 0.4546552300453186,
"memory(GiB)": 74.33,
"step": 398,
"token_acc": 0.8714859437751004,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.32275025278058644,
"grad_norm": 2.6796491146087646,
"learning_rate": 8.103992461453447e-06,
"loss": 0.5386300086975098,
"memory(GiB)": 74.33,
"step": 399,
"token_acc": 0.8553191489361702,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.3235591506572295,
"grad_norm": 2.465752363204956,
"learning_rate": 8.093500890181307e-06,
"loss": 0.4470570683479309,
"memory(GiB)": 74.33,
"step": 400,
"token_acc": 0.8025889967637541,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.3243680485338726,
"grad_norm": 2.695773124694824,
"learning_rate": 8.082987204568336e-06,
"loss": 0.4630998373031616,
"memory(GiB)": 74.33,
"step": 401,
"token_acc": 0.8252788104089219,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.3251769464105157,
"grad_norm": 2.6388256549835205,
"learning_rate": 8.072451479773143e-06,
"loss": 0.47690147161483765,
"memory(GiB)": 74.33,
"step": 402,
"token_acc": 0.8565400843881856,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.32598584428715877,
"grad_norm": 2.6586854457855225,
"learning_rate": 8.061893791111887e-06,
"loss": 0.5046311020851135,
"memory(GiB)": 74.33,
"step": 403,
"token_acc": 0.825,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.32679474216380183,
"grad_norm": 2.575148820877075,
"learning_rate": 8.05131421405774e-06,
"loss": 0.45166927576065063,
"memory(GiB)": 74.33,
"step": 404,
"token_acc": 0.8725490196078431,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3276036400404449,
"grad_norm": 2.7520835399627686,
"learning_rate": 8.040712824240348e-06,
"loss": 0.47704529762268066,
"memory(GiB)": 74.33,
"step": 405,
"token_acc": 0.8539682539682539,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.32841253791708797,
"grad_norm": 2.6821768283843994,
"learning_rate": 8.030089697445287e-06,
"loss": 0.44387978315353394,
"memory(GiB)": 74.33,
"step": 406,
"token_acc": 0.8506224066390041,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.32922143579373103,
"grad_norm": 2.5903446674346924,
"learning_rate": 8.019444909613524e-06,
"loss": 0.47109007835388184,
"memory(GiB)": 74.33,
"step": 407,
"token_acc": 0.8415300546448088,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3300303336703741,
"grad_norm": 1.9421981573104858,
"learning_rate": 8.00877853684087e-06,
"loss": 0.4276235103607178,
"memory(GiB)": 74.33,
"step": 408,
"token_acc": 0.8691588785046729,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.33083923154701717,
"grad_norm": 1.9274567365646362,
"learning_rate": 7.998090655377441e-06,
"loss": 0.4399895668029785,
"memory(GiB)": 74.33,
"step": 409,
"token_acc": 0.8153846153846154,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3316481294236603,
"grad_norm": 2.349695920944214,
"learning_rate": 7.987381341627116e-06,
"loss": 0.4371504485607147,
"memory(GiB)": 74.33,
"step": 410,
"token_acc": 0.8447488584474886,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.33245702730030335,
"grad_norm": 2.508023738861084,
"learning_rate": 7.976650672146977e-06,
"loss": 0.4392384886741638,
"memory(GiB)": 74.33,
"step": 411,
"token_acc": 0.845360824742268,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3332659251769464,
"grad_norm": 2.007159948348999,
"learning_rate": 7.965898723646777e-06,
"loss": 0.42986416816711426,
"memory(GiB)": 74.33,
"step": 412,
"token_acc": 0.8504273504273504,
"train_speed(iter/s)": 0.022793
},
{
"epoch": 0.3340748230535895,
"grad_norm": 2.3318965435028076,
"learning_rate": 7.955125572988381e-06,
"loss": 0.45020729303359985,
"memory(GiB)": 74.33,
"step": 413,
"token_acc": 0.8546099290780141,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.33488372093023255,
"grad_norm": 2.5200366973876953,
"learning_rate": 7.944331297185224e-06,
"loss": 0.4530584216117859,
"memory(GiB)": 74.33,
"step": 414,
"token_acc": 0.8896103896103896,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3356926188068756,
"grad_norm": 2.353825569152832,
"learning_rate": 7.933515973401756e-06,
"loss": 0.44627559185028076,
"memory(GiB)": 74.33,
"step": 415,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3365015166835187,
"grad_norm": 2.2710440158843994,
"learning_rate": 7.92267967895289e-06,
"loss": 0.4454203248023987,
"memory(GiB)": 74.33,
"step": 416,
"token_acc": 0.8151260504201681,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.33731041456016175,
"grad_norm": 2.4699690341949463,
"learning_rate": 7.911822491303453e-06,
"loss": 0.4395456910133362,
"memory(GiB)": 74.33,
"step": 417,
"token_acc": 0.8617021276595744,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.3381193124368049,
"grad_norm": 2.3089406490325928,
"learning_rate": 7.90094448806763e-06,
"loss": 0.4436686038970947,
"memory(GiB)": 74.33,
"step": 418,
"token_acc": 0.8864468864468864,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.33892821031344794,
"grad_norm": 2.105353593826294,
"learning_rate": 7.890045747008406e-06,
"loss": 0.48908939957618713,
"memory(GiB)": 74.33,
"step": 419,
"token_acc": 0.8593155893536122,
"train_speed(iter/s)": 0.022794
},
{
"epoch": 0.339737108190091,
"grad_norm": 2.435878276824951,
"learning_rate": 7.879126346037018e-06,
"loss": 0.4750370979309082,
"memory(GiB)": 74.33,
"step": 420,
"token_acc": 0.8844444444444445,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3405460060667341,
"grad_norm": 2.587909698486328,
"learning_rate": 7.868186363212392e-06,
"loss": 0.4756377339363098,
"memory(GiB)": 74.33,
"step": 421,
"token_acc": 0.8487084870848709,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.34135490394337714,
"grad_norm": 2.2281887531280518,
"learning_rate": 7.857225876740585e-06,
"loss": 0.4277176558971405,
"memory(GiB)": 74.62,
"step": 422,
"token_acc": 0.8493150684931506,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3421638018200202,
"grad_norm": 2.5752649307250977,
"learning_rate": 7.846244964974226e-06,
"loss": 0.48055747151374817,
"memory(GiB)": 74.62,
"step": 423,
"token_acc": 0.8837209302325582,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3429726996966633,
"grad_norm": 2.586489200592041,
"learning_rate": 7.835243706411961e-06,
"loss": 0.4750707745552063,
"memory(GiB)": 74.62,
"step": 424,
"token_acc": 0.8576642335766423,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3437815975733064,
"grad_norm": 2.450918674468994,
"learning_rate": 7.824222179697884e-06,
"loss": 0.5177239179611206,
"memory(GiB)": 74.62,
"step": 425,
"token_acc": 0.852589641434263,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.34459049544994946,
"grad_norm": 2.3722708225250244,
"learning_rate": 7.813180463620987e-06,
"loss": 0.46518608927726746,
"memory(GiB)": 74.62,
"step": 426,
"token_acc": 0.8423645320197044,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.34539939332659253,
"grad_norm": 2.5841665267944336,
"learning_rate": 7.802118637114575e-06,
"loss": 0.4838918149471283,
"memory(GiB)": 74.62,
"step": 427,
"token_acc": 0.8434782608695652,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3462082912032356,
"grad_norm": 2.3192875385284424,
"learning_rate": 7.791036779255726e-06,
"loss": 0.42157137393951416,
"memory(GiB)": 74.62,
"step": 428,
"token_acc": 0.8404669260700389,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.34701718907987866,
"grad_norm": 2.49680495262146,
"learning_rate": 7.779934969264714e-06,
"loss": 0.4023537039756775,
"memory(GiB)": 74.62,
"step": 429,
"token_acc": 0.8734939759036144,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.34782608695652173,
"grad_norm": 2.0230259895324707,
"learning_rate": 7.768813286504439e-06,
"loss": 0.37253955006599426,
"memory(GiB)": 74.62,
"step": 430,
"token_acc": 0.9224806201550387,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3486349848331648,
"grad_norm": 2.3140506744384766,
"learning_rate": 7.757671810479865e-06,
"loss": 0.4874904751777649,
"memory(GiB)": 74.62,
"step": 431,
"token_acc": 0.8592057761732852,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.34944388270980786,
"grad_norm": 2.2125346660614014,
"learning_rate": 7.74651062083746e-06,
"loss": 0.37930744886398315,
"memory(GiB)": 74.62,
"step": 432,
"token_acc": 0.8764940239043825,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.350252780586451,
"grad_norm": 2.240590810775757,
"learning_rate": 7.735329797364605e-06,
"loss": 0.47669389843940735,
"memory(GiB)": 74.62,
"step": 433,
"token_acc": 0.8710801393728222,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.35106167846309405,
"grad_norm": 2.510114908218384,
"learning_rate": 7.724129419989044e-06,
"loss": 0.4742322266101837,
"memory(GiB)": 74.62,
"step": 434,
"token_acc": 0.8536585365853658,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3518705763397371,
"grad_norm": 2.476958990097046,
"learning_rate": 7.712909568778302e-06,
"loss": 0.4492417871952057,
"memory(GiB)": 74.62,
"step": 435,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022795
},
{
"epoch": 0.3526794742163802,
"grad_norm": 2.098637104034424,
"learning_rate": 7.701670323939117e-06,
"loss": 0.4481479525566101,
"memory(GiB)": 74.62,
"step": 436,
"token_acc": 0.8601398601398601,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.35348837209302325,
"grad_norm": 2.2469687461853027,
"learning_rate": 7.690411765816864e-06,
"loss": 0.43956851959228516,
"memory(GiB)": 74.62,
"step": 437,
"token_acc": 0.8629032258064516,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3542972699696663,
"grad_norm": 2.8738715648651123,
"learning_rate": 7.679133974894984e-06,
"loss": 0.4626030921936035,
"memory(GiB)": 74.62,
"step": 438,
"token_acc": 0.8680851063829788,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3551061678463094,
"grad_norm": 2.638291358947754,
"learning_rate": 7.667837031794404e-06,
"loss": 0.45615172386169434,
"memory(GiB)": 74.62,
"step": 439,
"token_acc": 0.8088235294117647,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.35591506572295245,
"grad_norm": 2.2586326599121094,
"learning_rate": 7.656521017272965e-06,
"loss": 0.4124460816383362,
"memory(GiB)": 74.62,
"step": 440,
"token_acc": 0.8611111111111112,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3567239635995956,
"grad_norm": 2.374500274658203,
"learning_rate": 7.64518601222484e-06,
"loss": 0.4275168180465698,
"memory(GiB)": 74.62,
"step": 441,
"token_acc": 0.8487084870848709,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.35753286147623864,
"grad_norm": 1.9997868537902832,
"learning_rate": 7.633832097679959e-06,
"loss": 0.3909873068332672,
"memory(GiB)": 74.62,
"step": 442,
"token_acc": 0.8868613138686131,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3583417593528817,
"grad_norm": 4.926924705505371,
"learning_rate": 7.622459354803435e-06,
"loss": 0.43666255474090576,
"memory(GiB)": 74.62,
"step": 443,
"token_acc": 0.8704453441295547,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.3591506572295248,
"grad_norm": 2.317330837249756,
"learning_rate": 7.611067864894972e-06,
"loss": 0.44106507301330566,
"memory(GiB)": 74.62,
"step": 444,
"token_acc": 0.8059701492537313,
"train_speed(iter/s)": 0.022796
},
{
"epoch": 0.35995955510616784,
"grad_norm": 2.5835938453674316,
"learning_rate": 7.599657709388292e-06,
"loss": 0.46531200408935547,
"memory(GiB)": 74.62,
"step": 445,
"token_acc": 0.7931034482758621,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3607684529828109,
"grad_norm": 2.8004226684570312,
"learning_rate": 7.58822896985055e-06,
"loss": 0.5187166333198547,
"memory(GiB)": 74.62,
"step": 446,
"token_acc": 0.8099173553719008,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.36157735085945397,
"grad_norm": 2.7265071868896484,
"learning_rate": 7.5767817279817505e-06,
"loss": 0.47425639629364014,
"memory(GiB)": 74.62,
"step": 447,
"token_acc": 0.8085106382978723,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3623862487360971,
"grad_norm": 2.1328177452087402,
"learning_rate": 7.565316065614168e-06,
"loss": 0.4435673952102661,
"memory(GiB)": 74.62,
"step": 448,
"token_acc": 0.8631578947368421,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.36319514661274016,
"grad_norm": 2.4672372341156006,
"learning_rate": 7.5538320647117565e-06,
"loss": 0.41679224371910095,
"memory(GiB)": 74.62,
"step": 449,
"token_acc": 0.8908296943231441,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3640040444893832,
"grad_norm": 2.6723108291625977,
"learning_rate": 7.542329807369566e-06,
"loss": 0.5179734826087952,
"memory(GiB)": 74.62,
"step": 450,
"token_acc": 0.7644444444444445,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3648129423660263,
"grad_norm": 3.7509987354278564,
"learning_rate": 7.530809375813155e-06,
"loss": 0.4264351725578308,
"memory(GiB)": 74.62,
"step": 451,
"token_acc": 0.9,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.36562184024266936,
"grad_norm": 1.9851875305175781,
"learning_rate": 7.519270852398002e-06,
"loss": 0.4789334535598755,
"memory(GiB)": 74.62,
"step": 452,
"token_acc": 0.8250950570342205,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3664307381193124,
"grad_norm": 2.21183705329895,
"learning_rate": 7.507714319608922e-06,
"loss": 0.36344388127326965,
"memory(GiB)": 74.62,
"step": 453,
"token_acc": 0.8487394957983193,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3672396359959555,
"grad_norm": 1.613560676574707,
"learning_rate": 7.496139860059468e-06,
"loss": 0.4224799871444702,
"memory(GiB)": 74.62,
"step": 454,
"token_acc": 0.8813056379821959,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.36804853387259856,
"grad_norm": 2.4515528678894043,
"learning_rate": 7.484547556491346e-06,
"loss": 0.4368416368961334,
"memory(GiB)": 74.62,
"step": 455,
"token_acc": 0.8559322033898306,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3688574317492417,
"grad_norm": 2.2103137969970703,
"learning_rate": 7.472937491773824e-06,
"loss": 0.3967626392841339,
"memory(GiB)": 74.62,
"step": 456,
"token_acc": 0.8217821782178217,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.36966632962588475,
"grad_norm": 2.522752046585083,
"learning_rate": 7.461309748903138e-06,
"loss": 0.45169344544410706,
"memory(GiB)": 74.62,
"step": 457,
"token_acc": 0.8535714285714285,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3704752275025278,
"grad_norm": 3.0310842990875244,
"learning_rate": 7.449664411001898e-06,
"loss": 0.37837380170822144,
"memory(GiB)": 74.62,
"step": 458,
"token_acc": 0.9108527131782945,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3712841253791709,
"grad_norm": 2.2086234092712402,
"learning_rate": 7.438001561318494e-06,
"loss": 0.44610685110092163,
"memory(GiB)": 74.62,
"step": 459,
"token_acc": 0.870722433460076,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37209302325581395,
"grad_norm": 2.4862678050994873,
"learning_rate": 7.426321283226504e-06,
"loss": 0.4015771746635437,
"memory(GiB)": 74.62,
"step": 460,
"token_acc": 0.8907563025210085,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.372901921132457,
"grad_norm": 2.0166738033294678,
"learning_rate": 7.4146236602240936e-06,
"loss": 0.4152040481567383,
"memory(GiB)": 74.62,
"step": 461,
"token_acc": 0.9248826291079812,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3737108190091001,
"grad_norm": 2.448951005935669,
"learning_rate": 7.402908775933419e-06,
"loss": 0.5621334910392761,
"memory(GiB)": 74.62,
"step": 462,
"token_acc": 0.8628318584070797,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37451971688574315,
"grad_norm": 2.186652183532715,
"learning_rate": 7.391176714100038e-06,
"loss": 0.4613068699836731,
"memory(GiB)": 74.62,
"step": 463,
"token_acc": 0.8188679245283019,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37532861476238627,
"grad_norm": 2.2740073204040527,
"learning_rate": 7.379427558592296e-06,
"loss": 0.4919006824493408,
"memory(GiB)": 74.62,
"step": 464,
"token_acc": 0.8471760797342193,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37613751263902934,
"grad_norm": 2.158538579940796,
"learning_rate": 7.36766139340074e-06,
"loss": 0.42273247241973877,
"memory(GiB)": 74.62,
"step": 465,
"token_acc": 0.8622047244094488,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3769464105156724,
"grad_norm": 3.0366506576538086,
"learning_rate": 7.3558783026375156e-06,
"loss": 0.5097289085388184,
"memory(GiB)": 74.62,
"step": 466,
"token_acc": 0.9178082191780822,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37775530839231547,
"grad_norm": 2.2849361896514893,
"learning_rate": 7.344078370535757e-06,
"loss": 0.5165024995803833,
"memory(GiB)": 74.62,
"step": 467,
"token_acc": 0.8006430868167203,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.37856420626895854,
"grad_norm": 1.753194808959961,
"learning_rate": 7.3322616814489955e-06,
"loss": 0.4367058277130127,
"memory(GiB)": 74.62,
"step": 468,
"token_acc": 0.8678571428571429,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3793731041456016,
"grad_norm": 1.9058223962783813,
"learning_rate": 7.32042831985055e-06,
"loss": 0.41317999362945557,
"memory(GiB)": 74.62,
"step": 469,
"token_acc": 0.8257261410788381,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.38018200202224467,
"grad_norm": 2.459209680557251,
"learning_rate": 7.308578370332926e-06,
"loss": 0.3700507581233978,
"memory(GiB)": 74.62,
"step": 470,
"token_acc": 0.8687943262411347,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3809908998988878,
"grad_norm": 1.8641716241836548,
"learning_rate": 7.296711917607211e-06,
"loss": 0.40189939737319946,
"memory(GiB)": 74.62,
"step": 471,
"token_acc": 0.8717948717948718,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.38179979777553086,
"grad_norm": 2.2401087284088135,
"learning_rate": 7.284829046502467e-06,
"loss": 0.4430382251739502,
"memory(GiB)": 74.62,
"step": 472,
"token_acc": 0.8419243986254296,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3826086956521739,
"grad_norm": 2.416550636291504,
"learning_rate": 7.272929841965126e-06,
"loss": 0.4755879342556,
"memory(GiB)": 74.62,
"step": 473,
"token_acc": 0.8486238532110092,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.383417593528817,
"grad_norm": 2.260345935821533,
"learning_rate": 7.261014389058383e-06,
"loss": 0.44997456669807434,
"memory(GiB)": 74.62,
"step": 474,
"token_acc": 0.7671957671957672,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.38422649140546006,
"grad_norm": 2.261056661605835,
"learning_rate": 7.2490827729615835e-06,
"loss": 0.47697365283966064,
"memory(GiB)": 74.62,
"step": 475,
"token_acc": 0.8628318584070797,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3850353892821031,
"grad_norm": 2.013577461242676,
"learning_rate": 7.237135078969618e-06,
"loss": 0.3827347159385681,
"memory(GiB)": 74.62,
"step": 476,
"token_acc": 0.8478964401294499,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3858442871587462,
"grad_norm": 2.1973073482513428,
"learning_rate": 7.225171392492316e-06,
"loss": 0.40540656447410583,
"memory(GiB)": 74.62,
"step": 477,
"token_acc": 0.863013698630137,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.38665318503538926,
"grad_norm": 2.2481391429901123,
"learning_rate": 7.213191799053832e-06,
"loss": 0.4136468172073364,
"memory(GiB)": 74.62,
"step": 478,
"token_acc": 0.8339100346020761,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3874620829120324,
"grad_norm": 2.1501901149749756,
"learning_rate": 7.201196384292027e-06,
"loss": 0.4204309284687042,
"memory(GiB)": 74.62,
"step": 479,
"token_acc": 0.8870967741935484,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.38827098078867545,
"grad_norm": 2.1305158138275146,
"learning_rate": 7.189185233957868e-06,
"loss": 0.4197065830230713,
"memory(GiB)": 74.62,
"step": 480,
"token_acc": 0.8160919540229885,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3890798786653185,
"grad_norm": 2.526954174041748,
"learning_rate": 7.177158433914811e-06,
"loss": 0.4064275622367859,
"memory(GiB)": 74.62,
"step": 481,
"token_acc": 0.8907103825136612,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3898887765419616,
"grad_norm": 3.277456283569336,
"learning_rate": 7.165116070138183e-06,
"loss": 0.46176213026046753,
"memory(GiB)": 74.62,
"step": 482,
"token_acc": 0.834983498349835,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39069767441860465,
"grad_norm": 2.337390184402466,
"learning_rate": 7.153058228714573e-06,
"loss": 0.3911609649658203,
"memory(GiB)": 74.62,
"step": 483,
"token_acc": 0.8909952606635071,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3915065722952477,
"grad_norm": 2.273653745651245,
"learning_rate": 7.140984995841214e-06,
"loss": 0.43842604756355286,
"memory(GiB)": 74.62,
"step": 484,
"token_acc": 0.844559585492228,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3923154701718908,
"grad_norm": 2.842496395111084,
"learning_rate": 7.128896457825364e-06,
"loss": 0.41556501388549805,
"memory(GiB)": 74.62,
"step": 485,
"token_acc": 0.8459016393442623,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39312436804853385,
"grad_norm": 2.3521416187286377,
"learning_rate": 7.116792701083697e-06,
"loss": 0.4312630891799927,
"memory(GiB)": 74.62,
"step": 486,
"token_acc": 0.8566433566433567,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39393326592517697,
"grad_norm": 2.2411739826202393,
"learning_rate": 7.104673812141676e-06,
"loss": 0.4646815359592438,
"memory(GiB)": 74.62,
"step": 487,
"token_acc": 0.8078602620087336,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39474216380182003,
"grad_norm": 2.26692533493042,
"learning_rate": 7.09253987763294e-06,
"loss": 0.41715699434280396,
"memory(GiB)": 74.62,
"step": 488,
"token_acc": 0.8636363636363636,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3955510616784631,
"grad_norm": 2.127204179763794,
"learning_rate": 7.080390984298686e-06,
"loss": 0.39702218770980835,
"memory(GiB)": 74.62,
"step": 489,
"token_acc": 0.8631578947368421,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39635995955510617,
"grad_norm": 1.905442476272583,
"learning_rate": 7.068227218987043e-06,
"loss": 0.3825928568840027,
"memory(GiB)": 74.62,
"step": 490,
"token_acc": 0.8986784140969163,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39716885743174923,
"grad_norm": 1.9447747468948364,
"learning_rate": 7.056048668652454e-06,
"loss": 0.45161956548690796,
"memory(GiB)": 74.62,
"step": 491,
"token_acc": 0.8728813559322034,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3979777553083923,
"grad_norm": 2.295433282852173,
"learning_rate": 7.04385542035506e-06,
"loss": 0.41795414686203003,
"memory(GiB)": 74.62,
"step": 492,
"token_acc": 0.8817733990147784,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.39878665318503537,
"grad_norm": 2.265631675720215,
"learning_rate": 7.031647561260065e-06,
"loss": 0.4432828426361084,
"memory(GiB)": 74.62,
"step": 493,
"token_acc": 0.7985865724381626,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.3995955510616785,
"grad_norm": 2.9621176719665527,
"learning_rate": 7.019425178637127e-06,
"loss": 0.44883739948272705,
"memory(GiB)": 74.62,
"step": 494,
"token_acc": 0.9203539823008849,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.40040444893832156,
"grad_norm": 2.9266443252563477,
"learning_rate": 7.007188359859727e-06,
"loss": 0.48823320865631104,
"memory(GiB)": 74.62,
"step": 495,
"token_acc": 0.8736842105263158,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.4012133468149646,
"grad_norm": 3.5501937866210938,
"learning_rate": 6.994937192404539e-06,
"loss": 0.41887539625167847,
"memory(GiB)": 74.62,
"step": 496,
"token_acc": 0.8600823045267489,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.4020222446916077,
"grad_norm": 2.9611189365386963,
"learning_rate": 6.982671763850814e-06,
"loss": 0.460665225982666,
"memory(GiB)": 74.62,
"step": 497,
"token_acc": 0.8066037735849056,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.40283114256825076,
"grad_norm": 2.5562634468078613,
"learning_rate": 6.9703921618797556e-06,
"loss": 0.42445844411849976,
"memory(GiB)": 74.62,
"step": 498,
"token_acc": 0.8809523809523809,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.4036400404448938,
"grad_norm": 2.2612838745117188,
"learning_rate": 6.95809847427388e-06,
"loss": 0.4139663577079773,
"memory(GiB)": 74.62,
"step": 499,
"token_acc": 0.8612244897959184,
"train_speed(iter/s)": 0.022797
},
{
"epoch": 0.4044489383215369,
"grad_norm": 2.0981252193450928,
"learning_rate": 6.945790788916402e-06,
"loss": 0.4424452781677246,
"memory(GiB)": 74.62,
"step": 500,
"token_acc": 0.8401486988847584,
"train_speed(iter/s)": 0.022798
},
{
"epoch": 0.4044489383215369,
"eval_loss": 0.42885029315948486,
"eval_runtime": 431.8839,
"eval_samples_per_second": 3.7,
"eval_steps_per_second": 0.116,
"eval_token_acc": 0.8577324229008779,
"step": 500
},
{
"epoch": 0.40525783619817995,
"grad_norm": 2.1216652393341064,
"learning_rate": 6.9334691937905995e-06,
"loss": 0.4369218349456787,
"memory(GiB)": 74.62,
"step": 501,
"token_acc": 0.8514335360556038,
"train_speed(iter/s)": 0.022355
},
{
"epoch": 0.4060667340748231,
"grad_norm": 2.564833641052246,
"learning_rate": 6.921133776979186e-06,
"loss": 0.4658987820148468,
"memory(GiB)": 74.62,
"step": 502,
"token_acc": 0.8582089552238806,
"train_speed(iter/s)": 0.022356
},
{
"epoch": 0.40687563195146614,
"grad_norm": 1.8351505994796753,
"learning_rate": 6.908784626663681e-06,
"loss": 0.4119420647621155,
"memory(GiB)": 74.62,
"step": 503,
"token_acc": 0.8387096774193549,
"train_speed(iter/s)": 0.022357
},
{
"epoch": 0.4076845298281092,
"grad_norm": 2.2373807430267334,
"learning_rate": 6.896421831123783e-06,
"loss": 0.45484626293182373,
"memory(GiB)": 74.62,
"step": 504,
"token_acc": 0.8540772532188842,
"train_speed(iter/s)": 0.022358
},
{
"epoch": 0.4084934277047523,
"grad_norm": 2.1204137802124023,
"learning_rate": 6.884045478736732e-06,
"loss": 0.3930210471153259,
"memory(GiB)": 74.62,
"step": 505,
"token_acc": 0.9181034482758621,
"train_speed(iter/s)": 0.022359
},
{
"epoch": 0.40930232558139534,
"grad_norm": 2.195955276489258,
"learning_rate": 6.871655657976682e-06,
"loss": 0.4383777976036072,
"memory(GiB)": 74.62,
"step": 506,
"token_acc": 0.8703703703703703,
"train_speed(iter/s)": 0.022359
},
{
"epoch": 0.4101112234580384,
"grad_norm": 2.449862241744995,
"learning_rate": 6.859252457414067e-06,
"loss": 0.5421361923217773,
"memory(GiB)": 74.62,
"step": 507,
"token_acc": 0.8745247148288974,
"train_speed(iter/s)": 0.02236
},
{
"epoch": 0.4109201213346815,
"grad_norm": 2.8813657760620117,
"learning_rate": 6.8468359657149705e-06,
"loss": 0.3448445498943329,
"memory(GiB)": 74.62,
"step": 508,
"token_acc": 0.8831168831168831,
"train_speed(iter/s)": 0.022361
},
{
"epoch": 0.41172901921132454,
"grad_norm": 2.2587554454803467,
"learning_rate": 6.834406271640488e-06,
"loss": 0.40410223603248596,
"memory(GiB)": 74.62,
"step": 509,
"token_acc": 0.9575289575289575,
"train_speed(iter/s)": 0.022362
},
{
"epoch": 0.41253791708796766,
"grad_norm": 2.2055654525756836,
"learning_rate": 6.821963464046096e-06,
"loss": 0.4498205780982971,
"memory(GiB)": 74.62,
"step": 510,
"token_acc": 0.8311258278145696,
"train_speed(iter/s)": 0.022363
},
{
"epoch": 0.41334681496461073,
"grad_norm": 2.171542167663574,
"learning_rate": 6.809507631881014e-06,
"loss": 0.4186447858810425,
"memory(GiB)": 74.62,
"step": 511,
"token_acc": 0.8443708609271523,
"train_speed(iter/s)": 0.022364
},
{
"epoch": 0.4141557128412538,
"grad_norm": 2.509507417678833,
"learning_rate": 6.797038864187564e-06,
"loss": 0.4081672728061676,
"memory(GiB)": 74.62,
"step": 512,
"token_acc": 0.8518518518518519,
"train_speed(iter/s)": 0.022364
},
{
"epoch": 0.41496461071789686,
"grad_norm": 2.3102705478668213,
"learning_rate": 6.78455725010055e-06,
"loss": 0.4792659878730774,
"memory(GiB)": 74.62,
"step": 513,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022365
},
{
"epoch": 0.41577350859453993,
"grad_norm": 2.7244982719421387,
"learning_rate": 6.772062878846604e-06,
"loss": 0.41006016731262207,
"memory(GiB)": 74.62,
"step": 514,
"token_acc": 0.8380952380952381,
"train_speed(iter/s)": 0.022366
},
{
"epoch": 0.416582406471183,
"grad_norm": 2.3030154705047607,
"learning_rate": 6.75955583974355e-06,
"loss": 0.4155740737915039,
"memory(GiB)": 74.62,
"step": 515,
"token_acc": 0.8803418803418803,
"train_speed(iter/s)": 0.022367
},
{
"epoch": 0.41739130434782606,
"grad_norm": 3.1387264728546143,
"learning_rate": 6.747036222199783e-06,
"loss": 0.4403674602508545,
"memory(GiB)": 74.62,
"step": 516,
"token_acc": 0.8486486486486486,
"train_speed(iter/s)": 0.022368
},
{
"epoch": 0.4182002022244692,
"grad_norm": 2.3326053619384766,
"learning_rate": 6.7345041157136035e-06,
"loss": 0.5110398530960083,
"memory(GiB)": 74.62,
"step": 517,
"token_acc": 0.8466453674121406,
"train_speed(iter/s)": 0.022369
},
{
"epoch": 0.41900910010111225,
"grad_norm": 1.981326699256897,
"learning_rate": 6.7219596098725995e-06,
"loss": 0.3945692181587219,
"memory(GiB)": 74.62,
"step": 518,
"token_acc": 0.8426395939086294,
"train_speed(iter/s)": 0.02237
},
{
"epoch": 0.4198179979777553,
"grad_norm": 2.0242714881896973,
"learning_rate": 6.709402794352993e-06,
"loss": 0.3980899155139923,
"memory(GiB)": 74.62,
"step": 519,
"token_acc": 0.8425925925925926,
"train_speed(iter/s)": 0.022371
},
{
"epoch": 0.4206268958543984,
"grad_norm": 2.2979252338409424,
"learning_rate": 6.696833758919006e-06,
"loss": 0.4187348484992981,
"memory(GiB)": 74.62,
"step": 520,
"token_acc": 0.9004329004329005,
"train_speed(iter/s)": 0.022372
},
{
"epoch": 0.42143579373104145,
"grad_norm": 2.154912233352661,
"learning_rate": 6.684252593422214e-06,
"loss": 0.4182782471179962,
"memory(GiB)": 74.62,
"step": 521,
"token_acc": 0.896414342629482,
"train_speed(iter/s)": 0.022372
},
{
"epoch": 0.4222446916076845,
"grad_norm": 2.3540515899658203,
"learning_rate": 6.67165938780091e-06,
"loss": 0.41942286491394043,
"memory(GiB)": 74.62,
"step": 522,
"token_acc": 0.7923728813559322,
"train_speed(iter/s)": 0.022373
},
{
"epoch": 0.4230535894843276,
"grad_norm": 2.746999502182007,
"learning_rate": 6.659054232079454e-06,
"loss": 0.48690980672836304,
"memory(GiB)": 74.62,
"step": 523,
"token_acc": 0.8956521739130435,
"train_speed(iter/s)": 0.022374
},
{
"epoch": 0.42386248736097065,
"grad_norm": 2.6656594276428223,
"learning_rate": 6.646437216367634e-06,
"loss": 0.41001442074775696,
"memory(GiB)": 74.62,
"step": 524,
"token_acc": 0.871244635193133,
"train_speed(iter/s)": 0.022375
},
{
"epoch": 0.4246713852376138,
"grad_norm": 3.287884473800659,
"learning_rate": 6.633808430860021e-06,
"loss": 0.3976552486419678,
"memory(GiB)": 74.62,
"step": 525,
"token_acc": 0.8932584269662921,
"train_speed(iter/s)": 0.022376
},
{
"epoch": 0.42548028311425684,
"grad_norm": 1.8821219205856323,
"learning_rate": 6.6211679658353235e-06,
"loss": 0.40812772512435913,
"memory(GiB)": 74.62,
"step": 526,
"token_acc": 0.8380281690140845,
"train_speed(iter/s)": 0.022377
},
{
"epoch": 0.4262891809908999,
"grad_norm": 2.2975385189056396,
"learning_rate": 6.608515911655744e-06,
"loss": 0.4923143982887268,
"memory(GiB)": 74.62,
"step": 527,
"token_acc": 0.8621908127208481,
"train_speed(iter/s)": 0.022378
},
{
"epoch": 0.427098078867543,
"grad_norm": 2.0141286849975586,
"learning_rate": 6.595852358766334e-06,
"loss": 0.42522329092025757,
"memory(GiB)": 74.62,
"step": 528,
"token_acc": 0.8579234972677595,
"train_speed(iter/s)": 0.022379
},
{
"epoch": 0.42790697674418604,
"grad_norm": 2.7446937561035156,
"learning_rate": 6.583177397694338e-06,
"loss": 0.4497550129890442,
"memory(GiB)": 74.62,
"step": 529,
"token_acc": 0.8915094339622641,
"train_speed(iter/s)": 0.022379
},
{
"epoch": 0.4287158746208291,
"grad_norm": 2.207721710205078,
"learning_rate": 6.570491119048558e-06,
"loss": 0.48890426754951477,
"memory(GiB)": 74.62,
"step": 530,
"token_acc": 0.8237082066869301,
"train_speed(iter/s)": 0.02238
},
{
"epoch": 0.4295247724974722,
"grad_norm": 1.9948323965072632,
"learning_rate": 6.557793613518704e-06,
"loss": 0.39835628867149353,
"memory(GiB)": 74.62,
"step": 531,
"token_acc": 0.8313953488372093,
"train_speed(iter/s)": 0.022381
},
{
"epoch": 0.43033367037411524,
"grad_norm": 2.0337955951690674,
"learning_rate": 6.545084971874738e-06,
"loss": 0.4067310094833374,
"memory(GiB)": 74.62,
"step": 532,
"token_acc": 0.8481848184818482,
"train_speed(iter/s)": 0.022382
},
{
"epoch": 0.43114256825075836,
"grad_norm": 1.673884630203247,
"learning_rate": 6.5323652849662335e-06,
"loss": 0.4390275478363037,
"memory(GiB)": 74.62,
"step": 533,
"token_acc": 0.7947976878612717,
"train_speed(iter/s)": 0.022382
},
{
"epoch": 0.43195146612740143,
"grad_norm": 2.2995364665985107,
"learning_rate": 6.519634643721721e-06,
"loss": 0.40432244539260864,
"memory(GiB)": 74.62,
"step": 534,
"token_acc": 0.8676470588235294,
"train_speed(iter/s)": 0.022383
},
{
"epoch": 0.4327603640040445,
"grad_norm": 2.3338489532470703,
"learning_rate": 6.50689313914804e-06,
"loss": 0.4244130849838257,
"memory(GiB)": 74.62,
"step": 535,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022384
},
{
"epoch": 0.43356926188068756,
"grad_norm": 3.962207078933716,
"learning_rate": 6.494140862329688e-06,
"loss": 0.43396979570388794,
"memory(GiB)": 74.62,
"step": 536,
"token_acc": 0.8958333333333334,
"train_speed(iter/s)": 0.022385
},
{
"epoch": 0.43437815975733063,
"grad_norm": 2.2048377990722656,
"learning_rate": 6.481377904428171e-06,
"loss": 0.4214767813682556,
"memory(GiB)": 74.62,
"step": 537,
"token_acc": 0.92,
"train_speed(iter/s)": 0.022385
},
{
"epoch": 0.4351870576339737,
"grad_norm": 2.1275532245635986,
"learning_rate": 6.468604356681347e-06,
"loss": 0.47981610894203186,
"memory(GiB)": 74.62,
"step": 538,
"token_acc": 0.8615384615384616,
"train_speed(iter/s)": 0.022386
},
{
"epoch": 0.43599595551061676,
"grad_norm": 2.525294542312622,
"learning_rate": 6.4558203104027805e-06,
"loss": 0.3834857940673828,
"memory(GiB)": 74.62,
"step": 539,
"token_acc": 0.8962264150943396,
"train_speed(iter/s)": 0.022387
},
{
"epoch": 0.4368048533872599,
"grad_norm": 1.9019864797592163,
"learning_rate": 6.443025856981086e-06,
"loss": 0.4347085952758789,
"memory(GiB)": 74.62,
"step": 540,
"token_acc": 0.8483606557377049,
"train_speed(iter/s)": 0.022388
},
{
"epoch": 0.43761375126390295,
"grad_norm": 2.1029298305511475,
"learning_rate": 6.430221087879272e-06,
"loss": 0.3873569071292877,
"memory(GiB)": 74.62,
"step": 541,
"token_acc": 0.8458149779735683,
"train_speed(iter/s)": 0.022388
},
{
"epoch": 0.438422649140546,
"grad_norm": 2.2039341926574707,
"learning_rate": 6.41740609463409e-06,
"loss": 0.41179242730140686,
"memory(GiB)": 74.62,
"step": 542,
"token_acc": 0.8433179723502304,
"train_speed(iter/s)": 0.022389
},
{
"epoch": 0.4392315470171891,
"grad_norm": 1.985140085220337,
"learning_rate": 6.404580968855385e-06,
"loss": 0.3754437565803528,
"memory(GiB)": 74.62,
"step": 543,
"token_acc": 0.8695652173913043,
"train_speed(iter/s)": 0.02239
},
{
"epoch": 0.44004044489383215,
"grad_norm": 2.1291117668151855,
"learning_rate": 6.3917458022254345e-06,
"loss": 0.382461816072464,
"memory(GiB)": 74.62,
"step": 544,
"token_acc": 0.8463768115942029,
"train_speed(iter/s)": 0.022391
},
{
"epoch": 0.4408493427704752,
"grad_norm": 2.164369583129883,
"learning_rate": 6.3789006864982885e-06,
"loss": 0.41792726516723633,
"memory(GiB)": 74.62,
"step": 545,
"token_acc": 0.8883495145631068,
"train_speed(iter/s)": 0.022391
},
{
"epoch": 0.4416582406471183,
"grad_norm": 2.030388355255127,
"learning_rate": 6.366045713499129e-06,
"loss": 0.42167988419532776,
"memory(GiB)": 74.62,
"step": 546,
"token_acc": 0.8613445378151261,
"train_speed(iter/s)": 0.022392
},
{
"epoch": 0.44246713852376135,
"grad_norm": 1.9591219425201416,
"learning_rate": 6.353180975123595e-06,
"loss": 0.3823608458042145,
"memory(GiB)": 74.62,
"step": 547,
"token_acc": 0.8422818791946308,
"train_speed(iter/s)": 0.022393
},
{
"epoch": 0.44327603640040447,
"grad_norm": 2.547567367553711,
"learning_rate": 6.340306563337142e-06,
"loss": 0.4388830363750458,
"memory(GiB)": 74.62,
"step": 548,
"token_acc": 0.8425925925925926,
"train_speed(iter/s)": 0.022393
},
{
"epoch": 0.44408493427704754,
"grad_norm": 2.0034782886505127,
"learning_rate": 6.327422570174373e-06,
"loss": 0.3995330035686493,
"memory(GiB)": 74.62,
"step": 549,
"token_acc": 0.8996138996138996,
"train_speed(iter/s)": 0.022394
},
{
"epoch": 0.4448938321536906,
"grad_norm": 2.489525079727173,
"learning_rate": 6.314529087738387e-06,
"loss": 0.4121745824813843,
"memory(GiB)": 74.62,
"step": 550,
"token_acc": 0.842741935483871,
"train_speed(iter/s)": 0.022395
},
{
"epoch": 0.44570273003033367,
"grad_norm": 2.647597551345825,
"learning_rate": 6.301626208200116e-06,
"loss": 0.4198951721191406,
"memory(GiB)": 74.62,
"step": 551,
"token_acc": 0.8409090909090909,
"train_speed(iter/s)": 0.022395
},
{
"epoch": 0.44651162790697674,
"grad_norm": 3.1573736667633057,
"learning_rate": 6.2887140237976714e-06,
"loss": 0.36342883110046387,
"memory(GiB)": 74.62,
"step": 552,
"token_acc": 0.8653846153846154,
"train_speed(iter/s)": 0.022396
},
{
"epoch": 0.4473205257836198,
"grad_norm": 2.4319777488708496,
"learning_rate": 6.27579262683568e-06,
"loss": 0.4457288086414337,
"memory(GiB)": 74.62,
"step": 553,
"token_acc": 0.842741935483871,
"train_speed(iter/s)": 0.022397
},
{
"epoch": 0.44812942366026287,
"grad_norm": 2.0444133281707764,
"learning_rate": 6.2628621096846265e-06,
"loss": 0.3989095091819763,
"memory(GiB)": 74.62,
"step": 554,
"token_acc": 0.8648648648648649,
"train_speed(iter/s)": 0.022398
},
{
"epoch": 0.448938321536906,
"grad_norm": 2.0919275283813477,
"learning_rate": 6.249922564780193e-06,
"loss": 0.4167803227901459,
"memory(GiB)": 74.62,
"step": 555,
"token_acc": 0.8681318681318682,
"train_speed(iter/s)": 0.022398
},
{
"epoch": 0.44974721941354906,
"grad_norm": 2.3367862701416016,
"learning_rate": 6.236974084622598e-06,
"loss": 0.43416649103164673,
"memory(GiB)": 74.62,
"step": 556,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022399
},
{
"epoch": 0.4505561172901921,
"grad_norm": 2.491732597351074,
"learning_rate": 6.224016761775933e-06,
"loss": 0.451057493686676,
"memory(GiB)": 74.62,
"step": 557,
"token_acc": 0.8170347003154574,
"train_speed(iter/s)": 0.0224
},
{
"epoch": 0.4513650151668352,
"grad_norm": 2.020247459411621,
"learning_rate": 6.211050688867504e-06,
"loss": 0.4087960422039032,
"memory(GiB)": 74.62,
"step": 558,
"token_acc": 0.8835978835978836,
"train_speed(iter/s)": 0.0224
},
{
"epoch": 0.45217391304347826,
"grad_norm": 2.914745807647705,
"learning_rate": 6.198075958587168e-06,
"loss": 0.42803430557250977,
"memory(GiB)": 74.62,
"step": 559,
"token_acc": 0.8418079096045198,
"train_speed(iter/s)": 0.022401
},
{
"epoch": 0.4529828109201213,
"grad_norm": 2.470507860183716,
"learning_rate": 6.185092663686671e-06,
"loss": 0.4218277931213379,
"memory(GiB)": 74.62,
"step": 560,
"token_acc": 0.8411764705882353,
"train_speed(iter/s)": 0.022402
},
{
"epoch": 0.4537917087967644,
"grad_norm": 1.9057127237319946,
"learning_rate": 6.172100896978985e-06,
"loss": 0.3940941095352173,
"memory(GiB)": 74.62,
"step": 561,
"token_acc": 0.8507936507936508,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.45460060667340746,
"grad_norm": 3.1265318393707275,
"learning_rate": 6.1591007513376425e-06,
"loss": 0.4158666431903839,
"memory(GiB)": 74.62,
"step": 562,
"token_acc": 0.8809523809523809,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.4554095045500506,
"grad_norm": 2.3407959938049316,
"learning_rate": 6.146092319696073e-06,
"loss": 0.4111853241920471,
"memory(GiB)": 74.62,
"step": 563,
"token_acc": 0.8944723618090452,
"train_speed(iter/s)": 0.022404
},
{
"epoch": 0.45621840242669365,
"grad_norm": 2.639300584793091,
"learning_rate": 6.133075695046944e-06,
"loss": 0.41796183586120605,
"memory(GiB)": 74.62,
"step": 564,
"token_acc": 0.8415094339622642,
"train_speed(iter/s)": 0.022405
},
{
"epoch": 0.4570273003033367,
"grad_norm": 2.0815927982330322,
"learning_rate": 6.120050970441485e-06,
"loss": 0.4047802686691284,
"memory(GiB)": 74.62,
"step": 565,
"token_acc": 0.8901734104046243,
"train_speed(iter/s)": 0.022406
},
{
"epoch": 0.4578361981799798,
"grad_norm": 2.186722993850708,
"learning_rate": 6.107018238988838e-06,
"loss": 0.45547983050346375,
"memory(GiB)": 74.62,
"step": 566,
"token_acc": 0.8584905660377359,
"train_speed(iter/s)": 0.022406
},
{
"epoch": 0.45864509605662285,
"grad_norm": 2.1137285232543945,
"learning_rate": 6.093977593855376e-06,
"loss": 0.4355093836784363,
"memory(GiB)": 74.62,
"step": 567,
"token_acc": 0.8921933085501859,
"train_speed(iter/s)": 0.022407
},
{
"epoch": 0.4594539939332659,
"grad_norm": 2.740379810333252,
"learning_rate": 6.080929128264046e-06,
"loss": 0.5192371606826782,
"memory(GiB)": 74.62,
"step": 568,
"token_acc": 0.8766519823788547,
"train_speed(iter/s)": 0.022408
},
{
"epoch": 0.460262891809909,
"grad_norm": 2.2080211639404297,
"learning_rate": 6.067872935493703e-06,
"loss": 0.3434896767139435,
"memory(GiB)": 74.62,
"step": 569,
"token_acc": 0.9264069264069265,
"train_speed(iter/s)": 0.022408
},
{
"epoch": 0.46107178968655205,
"grad_norm": 2.196671724319458,
"learning_rate": 6.054809108878438e-06,
"loss": 0.4425520896911621,
"memory(GiB)": 74.62,
"step": 570,
"token_acc": 0.8904761904761904,
"train_speed(iter/s)": 0.022409
},
{
"epoch": 0.46188068756319517,
"grad_norm": 2.0799689292907715,
"learning_rate": 6.041737741806914e-06,
"loss": 0.4603237509727478,
"memory(GiB)": 74.62,
"step": 571,
"token_acc": 0.8606060606060606,
"train_speed(iter/s)": 0.02241
},
{
"epoch": 0.46268958543983824,
"grad_norm": 2.2659521102905273,
"learning_rate": 6.028658927721698e-06,
"loss": 0.3965636193752289,
"memory(GiB)": 74.62,
"step": 572,
"token_acc": 0.8088235294117647,
"train_speed(iter/s)": 0.02241
},
{
"epoch": 0.4634984833164813,
"grad_norm": 1.9087399244308472,
"learning_rate": 6.015572760118597e-06,
"loss": 0.3759012222290039,
"memory(GiB)": 74.62,
"step": 573,
"token_acc": 0.8742331288343558,
"train_speed(iter/s)": 0.022411
},
{
"epoch": 0.46430738119312437,
"grad_norm": 1.982033610343933,
"learning_rate": 6.002479332545982e-06,
"loss": 0.45862114429473877,
"memory(GiB)": 74.62,
"step": 574,
"token_acc": 0.8328173374613003,
"train_speed(iter/s)": 0.022411
},
{
"epoch": 0.46511627906976744,
"grad_norm": 3.0300614833831787,
"learning_rate": 5.989378738604121e-06,
"loss": 0.47833582758903503,
"memory(GiB)": 74.62,
"step": 575,
"token_acc": 0.8853211009174312,
"train_speed(iter/s)": 0.022412
},
{
"epoch": 0.4659251769464105,
"grad_norm": 2.1511874198913574,
"learning_rate": 5.976271071944517e-06,
"loss": 0.4461168348789215,
"memory(GiB)": 74.62,
"step": 576,
"token_acc": 0.8412698412698413,
"train_speed(iter/s)": 0.022413
},
{
"epoch": 0.46673407482305357,
"grad_norm": 2.324009418487549,
"learning_rate": 5.963156426269228e-06,
"loss": 0.3640004098415375,
"memory(GiB)": 74.62,
"step": 577,
"token_acc": 0.8808510638297873,
"train_speed(iter/s)": 0.022413
},
{
"epoch": 0.4675429726996967,
"grad_norm": 2.6052918434143066,
"learning_rate": 5.9500348953302055e-06,
"loss": 0.3626942038536072,
"memory(GiB)": 74.62,
"step": 578,
"token_acc": 0.8615384615384616,
"train_speed(iter/s)": 0.022414
},
{
"epoch": 0.46835187057633976,
"grad_norm": 3.0375425815582275,
"learning_rate": 5.936906572928625e-06,
"loss": 0.4241126775741577,
"memory(GiB)": 74.62,
"step": 579,
"token_acc": 0.8881987577639752,
"train_speed(iter/s)": 0.022415
},
{
"epoch": 0.4691607684529828,
"grad_norm": 2.636939764022827,
"learning_rate": 5.923771552914202e-06,
"loss": 0.4479450583457947,
"memory(GiB)": 74.62,
"step": 580,
"token_acc": 0.8616071428571429,
"train_speed(iter/s)": 0.022416
},
{
"epoch": 0.4699696663296259,
"grad_norm": 1.9995110034942627,
"learning_rate": 5.910629929184541e-06,
"loss": 0.37398701906204224,
"memory(GiB)": 74.62,
"step": 581,
"token_acc": 0.8115942028985508,
"train_speed(iter/s)": 0.022416
},
{
"epoch": 0.47077856420626896,
"grad_norm": 2.149606227874756,
"learning_rate": 5.897481795684447e-06,
"loss": 0.4055722951889038,
"memory(GiB)": 74.62,
"step": 582,
"token_acc": 0.8668941979522184,
"train_speed(iter/s)": 0.022417
},
{
"epoch": 0.471587462082912,
"grad_norm": 3.842085599899292,
"learning_rate": 5.8843272464052626e-06,
"loss": 0.38462674617767334,
"memory(GiB)": 74.62,
"step": 583,
"token_acc": 0.8869565217391304,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.4723963599595551,
"grad_norm": 2.599775552749634,
"learning_rate": 5.871166375384201e-06,
"loss": 0.4538233280181885,
"memory(GiB)": 74.62,
"step": 584,
"token_acc": 0.8263888888888888,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.47320525783619816,
"grad_norm": 2.188464403152466,
"learning_rate": 5.857999276703657e-06,
"loss": 0.39639097452163696,
"memory(GiB)": 74.62,
"step": 585,
"token_acc": 0.8488372093023255,
"train_speed(iter/s)": 0.022419
},
{
"epoch": 0.4740141557128413,
"grad_norm": 2.0777783393859863,
"learning_rate": 5.844826044490551e-06,
"loss": 0.40574946999549866,
"memory(GiB)": 74.62,
"step": 586,
"token_acc": 0.863013698630137,
"train_speed(iter/s)": 0.02242
},
{
"epoch": 0.47482305358948435,
"grad_norm": 2.120650053024292,
"learning_rate": 5.831646772915651e-06,
"loss": 0.4573715329170227,
"memory(GiB)": 74.62,
"step": 587,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022421
},
{
"epoch": 0.4756319514661274,
"grad_norm": 2.0684597492218018,
"learning_rate": 5.8184615561928924e-06,
"loss": 0.39603498578071594,
"memory(GiB)": 74.62,
"step": 588,
"token_acc": 0.8802816901408451,
"train_speed(iter/s)": 0.022421
},
{
"epoch": 0.4764408493427705,
"grad_norm": 2.653454303741455,
"learning_rate": 5.805270488578715e-06,
"loss": 0.4210537075996399,
"memory(GiB)": 74.62,
"step": 589,
"token_acc": 0.8981481481481481,
"train_speed(iter/s)": 0.022422
},
{
"epoch": 0.47724974721941354,
"grad_norm": 2.2436983585357666,
"learning_rate": 5.7920736643713835e-06,
"loss": 0.3758474290370941,
"memory(GiB)": 74.62,
"step": 590,
"token_acc": 0.8515625,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.4780586450960566,
"grad_norm": 3.6357314586639404,
"learning_rate": 5.778871177910315e-06,
"loss": 0.4624039828777313,
"memory(GiB)": 74.62,
"step": 591,
"token_acc": 0.8033898305084746,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.4788675429726997,
"grad_norm": 2.0779330730438232,
"learning_rate": 5.765663123575401e-06,
"loss": 0.4041805863380432,
"memory(GiB)": 74.62,
"step": 592,
"token_acc": 0.8620689655172413,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.47967644084934274,
"grad_norm": 2.654712200164795,
"learning_rate": 5.752449595786341e-06,
"loss": 0.3960053324699402,
"memory(GiB)": 74.62,
"step": 593,
"token_acc": 0.8228782287822878,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.48048533872598587,
"grad_norm": 2.4642553329467773,
"learning_rate": 5.7392306890019565e-06,
"loss": 0.41592419147491455,
"memory(GiB)": 74.62,
"step": 594,
"token_acc": 0.7847533632286996,
"train_speed(iter/s)": 0.022425
},
{
"epoch": 0.48129423660262893,
"grad_norm": 2.2550253868103027,
"learning_rate": 5.726006497719525e-06,
"loss": 0.46111100912094116,
"memory(GiB)": 74.62,
"step": 595,
"token_acc": 0.8361204013377926,
"train_speed(iter/s)": 0.022426
},
{
"epoch": 0.482103134479272,
"grad_norm": 2.8922863006591797,
"learning_rate": 5.712777116474103e-06,
"loss": 0.5086416006088257,
"memory(GiB)": 74.62,
"step": 596,
"token_acc": 0.8284023668639053,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.48291203235591507,
"grad_norm": 2.173737049102783,
"learning_rate": 5.699542639837844e-06,
"loss": 0.45955491065979004,
"memory(GiB)": 74.62,
"step": 597,
"token_acc": 0.8786610878661087,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.48372093023255813,
"grad_norm": 1.9948984384536743,
"learning_rate": 5.686303162419326e-06,
"loss": 0.4127792716026306,
"memory(GiB)": 74.62,
"step": 598,
"token_acc": 0.8712121212121212,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.4845298281092012,
"grad_norm": 2.446259021759033,
"learning_rate": 5.6730587788628785e-06,
"loss": 0.4015938341617584,
"memory(GiB)": 74.62,
"step": 599,
"token_acc": 0.8502202643171806,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.48533872598584427,
"grad_norm": 2.781144618988037,
"learning_rate": 5.659809583847907e-06,
"loss": 0.44586971402168274,
"memory(GiB)": 74.62,
"step": 600,
"token_acc": 0.8482490272373541,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.4861476238624874,
"grad_norm": 2.267489433288574,
"learning_rate": 5.646555672088203e-06,
"loss": 0.36807918548583984,
"memory(GiB)": 74.62,
"step": 601,
"token_acc": 0.8648648648648649,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.48695652173913045,
"grad_norm": 2.3026046752929688,
"learning_rate": 5.633297138331285e-06,
"loss": 0.4327083230018616,
"memory(GiB)": 74.62,
"step": 602,
"token_acc": 0.8597122302158273,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.4877654196157735,
"grad_norm": 2.635984420776367,
"learning_rate": 5.620034077357708e-06,
"loss": 0.44607388973236084,
"memory(GiB)": 74.62,
"step": 603,
"token_acc": 0.8711111111111111,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.4885743174924166,
"grad_norm": 2.5992751121520996,
"learning_rate": 5.60676658398039e-06,
"loss": 0.3917505145072937,
"memory(GiB)": 74.62,
"step": 604,
"token_acc": 0.9137931034482759,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.48938321536905965,
"grad_norm": 2.3977952003479004,
"learning_rate": 5.593494753043938e-06,
"loss": 0.41896378993988037,
"memory(GiB)": 74.62,
"step": 605,
"token_acc": 0.8821548821548821,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.4901921132457027,
"grad_norm": 2.1268513202667236,
"learning_rate": 5.580218679423965e-06,
"loss": 0.436327189207077,
"memory(GiB)": 74.62,
"step": 606,
"token_acc": 0.8737864077669902,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.4910010111223458,
"grad_norm": 3.2890071868896484,
"learning_rate": 5.566938458026411e-06,
"loss": 0.4408925771713257,
"memory(GiB)": 74.62,
"step": 607,
"token_acc": 0.9095744680851063,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.49180990899898885,
"grad_norm": 2.2176642417907715,
"learning_rate": 5.553654183786872e-06,
"loss": 0.46782928705215454,
"memory(GiB)": 74.62,
"step": 608,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.492618806875632,
"grad_norm": 2.8756251335144043,
"learning_rate": 5.540365951669913e-06,
"loss": 0.4359992742538452,
"memory(GiB)": 74.62,
"step": 609,
"token_acc": 0.8753993610223643,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.49342770475227504,
"grad_norm": 2.9646661281585693,
"learning_rate": 5.527073856668391e-06,
"loss": 0.4747014343738556,
"memory(GiB)": 74.62,
"step": 610,
"token_acc": 0.889795918367347,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.4942366026289181,
"grad_norm": 2.289034128189087,
"learning_rate": 5.513777993802781e-06,
"loss": 0.4281376600265503,
"memory(GiB)": 74.62,
"step": 611,
"token_acc": 0.87890625,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.4950455005055612,
"grad_norm": 2.541618585586548,
"learning_rate": 5.500478458120493e-06,
"loss": 0.45447611808776855,
"memory(GiB)": 74.62,
"step": 612,
"token_acc": 0.8346456692913385,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.49585439838220424,
"grad_norm": 3.065063953399658,
"learning_rate": 5.487175344695188e-06,
"loss": 0.4350849688053131,
"memory(GiB)": 74.62,
"step": 613,
"token_acc": 0.8583333333333333,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.4966632962588473,
"grad_norm": 1.9416303634643555,
"learning_rate": 5.47386874862611e-06,
"loss": 0.4030672311782837,
"memory(GiB)": 74.62,
"step": 614,
"token_acc": 0.8527397260273972,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.4974721941354904,
"grad_norm": 2.4637768268585205,
"learning_rate": 5.460558765037392e-06,
"loss": 0.4326108396053314,
"memory(GiB)": 74.62,
"step": 615,
"token_acc": 0.8831168831168831,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.49828109201213344,
"grad_norm": 2.7800002098083496,
"learning_rate": 5.447245489077389e-06,
"loss": 0.42490726709365845,
"memory(GiB)": 74.62,
"step": 616,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.49908998988877656,
"grad_norm": 4.720980167388916,
"learning_rate": 5.433929015917988e-06,
"loss": 0.39446377754211426,
"memory(GiB)": 74.62,
"step": 617,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.49989888776541963,
"grad_norm": 2.4783382415771484,
"learning_rate": 5.420609440753935e-06,
"loss": 0.41358453035354614,
"memory(GiB)": 74.62,
"step": 618,
"token_acc": 0.8716216216216216,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.5007077856420626,
"grad_norm": 2.4651012420654297,
"learning_rate": 5.407286858802147e-06,
"loss": 0.3854910433292389,
"memory(GiB)": 74.62,
"step": 619,
"token_acc": 0.8565217391304348,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.5015166835187057,
"grad_norm": 2.053473472595215,
"learning_rate": 5.393961365301041e-06,
"loss": 0.3815562427043915,
"memory(GiB)": 74.62,
"step": 620,
"token_acc": 0.888135593220339,
"train_speed(iter/s)": 0.022425
},
{
"epoch": 0.5023255813953489,
"grad_norm": 2.1635167598724365,
"learning_rate": 5.380633055509843e-06,
"loss": 0.45562463998794556,
"memory(GiB)": 74.62,
"step": 621,
"token_acc": 0.8426573426573427,
"train_speed(iter/s)": 0.022426
},
{
"epoch": 0.503134479271992,
"grad_norm": 2.1759238243103027,
"learning_rate": 5.367302024707911e-06,
"loss": 0.4003329873085022,
"memory(GiB)": 74.62,
"step": 622,
"token_acc": 0.8444444444444444,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.503943377148635,
"grad_norm": 2.391221284866333,
"learning_rate": 5.35396836819406e-06,
"loss": 0.4506310820579529,
"memory(GiB)": 74.62,
"step": 623,
"token_acc": 0.8243727598566308,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.5047522750252781,
"grad_norm": 2.422003746032715,
"learning_rate": 5.340632181285872e-06,
"loss": 0.3775983154773712,
"memory(GiB)": 74.62,
"step": 624,
"token_acc": 0.9178082191780822,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.5055611729019212,
"grad_norm": 2.822801113128662,
"learning_rate": 5.327293559319014e-06,
"loss": 0.46088916063308716,
"memory(GiB)": 74.62,
"step": 625,
"token_acc": 0.8203389830508474,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.5063700707785642,
"grad_norm": 2.9713943004608154,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.4233189821243286,
"memory(GiB)": 74.62,
"step": 626,
"token_acc": 0.8892988929889298,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.5071789686552073,
"grad_norm": 2.24816632270813,
"learning_rate": 5.300609391638336e-06,
"loss": 0.45002853870391846,
"memory(GiB)": 74.62,
"step": 627,
"token_acc": 0.835820895522388,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.5079878665318504,
"grad_norm": 3.1802284717559814,
"learning_rate": 5.287264036680166e-06,
"loss": 0.40955209732055664,
"memory(GiB)": 74.62,
"step": 628,
"token_acc": 0.9,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.5087967644084934,
"grad_norm": 2.9746017456054688,
"learning_rate": 5.27391662817327e-06,
"loss": 0.4412648677825928,
"memory(GiB)": 74.62,
"step": 629,
"token_acc": 0.864951768488746,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.5096056622851365,
"grad_norm": 7.995876312255859,
"learning_rate": 5.260567261533538e-06,
"loss": 0.4368639886379242,
"memory(GiB)": 74.62,
"step": 630,
"token_acc": 0.9067796610169492,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.5104145601617796,
"grad_norm": 4.124439239501953,
"learning_rate": 5.2472160321908535e-06,
"loss": 0.3601537346839905,
"memory(GiB)": 74.62,
"step": 631,
"token_acc": 0.9384615384615385,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.5112234580384226,
"grad_norm": 2.16349196434021,
"learning_rate": 5.233863035588427e-06,
"loss": 0.49298688769340515,
"memory(GiB)": 74.62,
"step": 632,
"token_acc": 0.8697318007662835,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.5120323559150657,
"grad_norm": 3.2173032760620117,
"learning_rate": 5.22050836718209e-06,
"loss": 0.3806041479110718,
"memory(GiB)": 74.62,
"step": 633,
"token_acc": 0.9253112033195021,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.5128412537917088,
"grad_norm": 2.4195048809051514,
"learning_rate": 5.207152122439635e-06,
"loss": 0.41035759449005127,
"memory(GiB)": 74.62,
"step": 634,
"token_acc": 0.86328125,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.5136501516683518,
"grad_norm": 2.598662853240967,
"learning_rate": 5.1937943968401175e-06,
"loss": 0.40409672260284424,
"memory(GiB)": 74.62,
"step": 635,
"token_acc": 0.9050279329608939,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.514459049544995,
"grad_norm": 3.158039093017578,
"learning_rate": 5.180435285873182e-06,
"loss": 0.4163573682308197,
"memory(GiB)": 74.62,
"step": 636,
"token_acc": 0.8577405857740585,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.5152679474216381,
"grad_norm": 2.9024956226348877,
"learning_rate": 5.1670748850383734e-06,
"loss": 0.43788814544677734,
"memory(GiB)": 74.62,
"step": 637,
"token_acc": 0.8318181818181818,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.5160768452982811,
"grad_norm": 5.88484001159668,
"learning_rate": 5.153713289844462e-06,
"loss": 0.43005481362342834,
"memory(GiB)": 74.62,
"step": 638,
"token_acc": 0.8546099290780141,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.5168857431749242,
"grad_norm": 2.6073086261749268,
"learning_rate": 5.140350595808751e-06,
"loss": 0.441942036151886,
"memory(GiB)": 74.62,
"step": 639,
"token_acc": 0.7777777777777778,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.5176946410515673,
"grad_norm": 2.607276439666748,
"learning_rate": 5.126986898456401e-06,
"loss": 0.40762656927108765,
"memory(GiB)": 74.62,
"step": 640,
"token_acc": 0.9018181818181819,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.5185035389282103,
"grad_norm": 3.1285383701324463,
"learning_rate": 5.113622293319749e-06,
"loss": 0.4376784861087799,
"memory(GiB)": 74.62,
"step": 641,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.5193124368048534,
"grad_norm": 2.1132287979125977,
"learning_rate": 5.1002568759376134e-06,
"loss": 0.3872153162956238,
"memory(GiB)": 74.62,
"step": 642,
"token_acc": 0.8991596638655462,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.5201213346814965,
"grad_norm": 2.294435501098633,
"learning_rate": 5.086890741854626e-06,
"loss": 0.4477715492248535,
"memory(GiB)": 74.62,
"step": 643,
"token_acc": 0.8445945945945946,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.5209302325581395,
"grad_norm": 4.424786567687988,
"learning_rate": 5.073523986620539e-06,
"loss": 0.4204040765762329,
"memory(GiB)": 74.62,
"step": 644,
"token_acc": 0.8901960784313725,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.5217391304347826,
"grad_norm": 6.769619941711426,
"learning_rate": 5.060156705789545e-06,
"loss": 0.433963418006897,
"memory(GiB)": 74.62,
"step": 645,
"token_acc": 0.8599221789883269,
"train_speed(iter/s)": 0.02244
},
{
"epoch": 0.5225480283114257,
"grad_norm": 2.297720193862915,
"learning_rate": 5.046788994919595e-06,
"loss": 0.38716062903404236,
"memory(GiB)": 74.62,
"step": 646,
"token_acc": 0.9004329004329005,
"train_speed(iter/s)": 0.022441
},
{
"epoch": 0.5233569261880687,
"grad_norm": 3.8223865032196045,
"learning_rate": 5.033420949571712e-06,
"loss": 0.3824414610862732,
"memory(GiB)": 74.62,
"step": 647,
"token_acc": 0.898989898989899,
"train_speed(iter/s)": 0.022441
},
{
"epoch": 0.5241658240647118,
"grad_norm": 2.3025248050689697,
"learning_rate": 5.020052665309312e-06,
"loss": 0.40017083287239075,
"memory(GiB)": 74.62,
"step": 648,
"token_acc": 0.8936170212765957,
"train_speed(iter/s)": 0.022442
},
{
"epoch": 0.5249747219413549,
"grad_norm": 1.8813366889953613,
"learning_rate": 5.00668423769752e-06,
"loss": 0.3807840347290039,
"memory(GiB)": 74.62,
"step": 649,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.022442
},
{
"epoch": 0.5257836198179979,
"grad_norm": 2.805870532989502,
"learning_rate": 4.993315762302483e-06,
"loss": 0.4545632302761078,
"memory(GiB)": 74.62,
"step": 650,
"token_acc": 0.8395061728395061,
"train_speed(iter/s)": 0.022443
},
{
"epoch": 0.5265925176946411,
"grad_norm": 2.4668116569519043,
"learning_rate": 4.97994733469069e-06,
"loss": 0.39456599950790405,
"memory(GiB)": 74.62,
"step": 651,
"token_acc": 0.8664122137404581,
"train_speed(iter/s)": 0.022443
},
{
"epoch": 0.5274014155712842,
"grad_norm": 2.224895715713501,
"learning_rate": 4.96657905042829e-06,
"loss": 0.3933877944946289,
"memory(GiB)": 74.62,
"step": 652,
"token_acc": 0.8654708520179372,
"train_speed(iter/s)": 0.022444
},
{
"epoch": 0.5282103134479272,
"grad_norm": 2.5314419269561768,
"learning_rate": 4.9532110050804074e-06,
"loss": 0.36528831720352173,
"memory(GiB)": 74.62,
"step": 653,
"token_acc": 0.9087591240875912,
"train_speed(iter/s)": 0.022444
},
{
"epoch": 0.5290192113245703,
"grad_norm": 2.0852181911468506,
"learning_rate": 4.939843294210456e-06,
"loss": 0.39938467741012573,
"memory(GiB)": 74.62,
"step": 654,
"token_acc": 0.8872180451127819,
"train_speed(iter/s)": 0.022445
},
{
"epoch": 0.5298281092012134,
"grad_norm": 2.4768409729003906,
"learning_rate": 4.926476013379462e-06,
"loss": 0.4587656259536743,
"memory(GiB)": 74.62,
"step": 655,
"token_acc": 0.9147540983606557,
"train_speed(iter/s)": 0.022446
},
{
"epoch": 0.5306370070778564,
"grad_norm": 3.768552303314209,
"learning_rate": 4.9131092581453745e-06,
"loss": 0.4000494182109833,
"memory(GiB)": 74.62,
"step": 656,
"token_acc": 0.8588957055214724,
"train_speed(iter/s)": 0.022446
},
{
"epoch": 0.5314459049544995,
"grad_norm": 2.7904086112976074,
"learning_rate": 4.899743124062387e-06,
"loss": 0.42587220668792725,
"memory(GiB)": 74.62,
"step": 657,
"token_acc": 0.8741258741258742,
"train_speed(iter/s)": 0.022447
},
{
"epoch": 0.5322548028311426,
"grad_norm": 2.2774369716644287,
"learning_rate": 4.886377706680253e-06,
"loss": 0.38174745440483093,
"memory(GiB)": 74.62,
"step": 658,
"token_acc": 0.8697318007662835,
"train_speed(iter/s)": 0.022447
},
{
"epoch": 0.5330637007077856,
"grad_norm": 2.049821376800537,
"learning_rate": 4.873013101543599e-06,
"loss": 0.4340623617172241,
"memory(GiB)": 74.62,
"step": 659,
"token_acc": 0.8543046357615894,
"train_speed(iter/s)": 0.022448
},
{
"epoch": 0.5338725985844287,
"grad_norm": 2.252617120742798,
"learning_rate": 4.859649404191251e-06,
"loss": 0.35842257738113403,
"memory(GiB)": 74.62,
"step": 660,
"token_acc": 0.8933333333333333,
"train_speed(iter/s)": 0.022448
},
{
"epoch": 0.5346814964610718,
"grad_norm": 2.1607117652893066,
"learning_rate": 4.84628671015554e-06,
"loss": 0.40685737133026123,
"memory(GiB)": 74.62,
"step": 661,
"token_acc": 0.8737201365187713,
"train_speed(iter/s)": 0.022449
},
{
"epoch": 0.5354903943377148,
"grad_norm": 2.924506425857544,
"learning_rate": 4.832925114961629e-06,
"loss": 0.44293731451034546,
"memory(GiB)": 74.62,
"step": 662,
"token_acc": 0.8465608465608465,
"train_speed(iter/s)": 0.02245
},
{
"epoch": 0.5362992922143579,
"grad_norm": 3.0079522132873535,
"learning_rate": 4.8195647141268196e-06,
"loss": 0.4585626423358917,
"memory(GiB)": 74.62,
"step": 663,
"token_acc": 0.8599221789883269,
"train_speed(iter/s)": 0.02245
},
{
"epoch": 0.537108190091001,
"grad_norm": 2.986860990524292,
"learning_rate": 4.8062056031598825e-06,
"loss": 0.4173978567123413,
"memory(GiB)": 74.62,
"step": 664,
"token_acc": 0.8721804511278195,
"train_speed(iter/s)": 0.022451
},
{
"epoch": 0.537917087967644,
"grad_norm": 2.1893157958984375,
"learning_rate": 4.792847877560367e-06,
"loss": 0.40209460258483887,
"memory(GiB)": 74.62,
"step": 665,
"token_acc": 0.8129770992366412,
"train_speed(iter/s)": 0.022451
},
{
"epoch": 0.5387259858442872,
"grad_norm": 2.2716012001037598,
"learning_rate": 4.779491632817911e-06,
"loss": 0.4765605926513672,
"memory(GiB)": 74.62,
"step": 666,
"token_acc": 0.8706293706293706,
"train_speed(iter/s)": 0.022452
},
{
"epoch": 0.5395348837209303,
"grad_norm": 2.23425555229187,
"learning_rate": 4.766136964411576e-06,
"loss": 0.39718160033226013,
"memory(GiB)": 74.62,
"step": 667,
"token_acc": 0.8536585365853658,
"train_speed(iter/s)": 0.022452
},
{
"epoch": 0.5403437815975733,
"grad_norm": 2.647259473800659,
"learning_rate": 4.752783967809147e-06,
"loss": 0.4938986301422119,
"memory(GiB)": 74.62,
"step": 668,
"token_acc": 0.8101694915254237,
"train_speed(iter/s)": 0.022453
},
{
"epoch": 0.5411526794742164,
"grad_norm": 2.081202507019043,
"learning_rate": 4.739432738466465e-06,
"loss": 0.4376961588859558,
"memory(GiB)": 74.62,
"step": 669,
"token_acc": 0.8683274021352313,
"train_speed(iter/s)": 0.022453
},
{
"epoch": 0.5419615773508595,
"grad_norm": 2.3195981979370117,
"learning_rate": 4.726083371826731e-06,
"loss": 0.3606075644493103,
"memory(GiB)": 74.62,
"step": 670,
"token_acc": 0.8583690987124464,
"train_speed(iter/s)": 0.022454
},
{
"epoch": 0.5427704752275025,
"grad_norm": 2.1184582710266113,
"learning_rate": 4.712735963319834e-06,
"loss": 0.4429006576538086,
"memory(GiB)": 74.62,
"step": 671,
"token_acc": 0.8438818565400844,
"train_speed(iter/s)": 0.022454
},
{
"epoch": 0.5435793731041456,
"grad_norm": 2.6941933631896973,
"learning_rate": 4.699390608361665e-06,
"loss": 0.41405189037323,
"memory(GiB)": 74.62,
"step": 672,
"token_acc": 0.8790697674418605,
"train_speed(iter/s)": 0.022455
},
{
"epoch": 0.5443882709807887,
"grad_norm": 2.466550588607788,
"learning_rate": 4.686047402353433e-06,
"loss": 0.4570333659648895,
"memory(GiB)": 74.62,
"step": 673,
"token_acc": 0.8647686832740213,
"train_speed(iter/s)": 0.022455
},
{
"epoch": 0.5451971688574317,
"grad_norm": 3.1605703830718994,
"learning_rate": 4.672706440680989e-06,
"loss": 0.3652383089065552,
"memory(GiB)": 74.62,
"step": 674,
"token_acc": 0.8957345971563981,
"train_speed(iter/s)": 0.022456
},
{
"epoch": 0.5460060667340748,
"grad_norm": 2.547511577606201,
"learning_rate": 4.65936781871413e-06,
"loss": 0.4206015467643738,
"memory(GiB)": 74.62,
"step": 675,
"token_acc": 0.88671875,
"train_speed(iter/s)": 0.022456
},
{
"epoch": 0.5468149646107179,
"grad_norm": 2.2908408641815186,
"learning_rate": 4.64603163180594e-06,
"loss": 0.42101001739501953,
"memory(GiB)": 74.62,
"step": 676,
"token_acc": 0.9054054054054054,
"train_speed(iter/s)": 0.022457
},
{
"epoch": 0.547623862487361,
"grad_norm": 2.6179423332214355,
"learning_rate": 4.6326979752920905e-06,
"loss": 0.4017224907875061,
"memory(GiB)": 74.62,
"step": 677,
"token_acc": 0.8642533936651584,
"train_speed(iter/s)": 0.022457
},
{
"epoch": 0.548432760364004,
"grad_norm": 2.2148091793060303,
"learning_rate": 4.619366944490158e-06,
"loss": 0.3605102300643921,
"memory(GiB)": 74.62,
"step": 678,
"token_acc": 0.8927038626609443,
"train_speed(iter/s)": 0.022458
},
{
"epoch": 0.5492416582406471,
"grad_norm": 2.3841159343719482,
"learning_rate": 4.60603863469896e-06,
"loss": 0.3840959370136261,
"memory(GiB)": 74.62,
"step": 679,
"token_acc": 0.8226415094339623,
"train_speed(iter/s)": 0.022458
},
{
"epoch": 0.5500505561172901,
"grad_norm": 2.1525049209594727,
"learning_rate": 4.5927131411978536e-06,
"loss": 0.41845589876174927,
"memory(GiB)": 74.62,
"step": 680,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022459
},
{
"epoch": 0.5508594539939332,
"grad_norm": 2.088181495666504,
"learning_rate": 4.579390559246066e-06,
"loss": 0.3538067936897278,
"memory(GiB)": 74.62,
"step": 681,
"token_acc": 0.8301282051282052,
"train_speed(iter/s)": 0.022459
},
{
"epoch": 0.5516683518705764,
"grad_norm": 4.506858825683594,
"learning_rate": 4.566070984082013e-06,
"loss": 0.4188098907470703,
"memory(GiB)": 74.62,
"step": 682,
"token_acc": 0.8808777429467085,
"train_speed(iter/s)": 0.02246
},
{
"epoch": 0.5524772497472195,
"grad_norm": 7.24404764175415,
"learning_rate": 4.552754510922612e-06,
"loss": 0.3949962258338928,
"memory(GiB)": 74.62,
"step": 683,
"token_acc": 0.8771929824561403,
"train_speed(iter/s)": 0.02246
},
{
"epoch": 0.5532861476238625,
"grad_norm": 2.410817861557007,
"learning_rate": 4.539441234962609e-06,
"loss": 0.36630767583847046,
"memory(GiB)": 74.62,
"step": 684,
"token_acc": 0.8398692810457516,
"train_speed(iter/s)": 0.022461
},
{
"epoch": 0.5540950455005056,
"grad_norm": 3.47383975982666,
"learning_rate": 4.526131251373892e-06,
"loss": 0.4143676161766052,
"memory(GiB)": 74.62,
"step": 685,
"token_acc": 0.8458149779735683,
"train_speed(iter/s)": 0.022461
},
{
"epoch": 0.5549039433771487,
"grad_norm": 3.989591360092163,
"learning_rate": 4.512824655304814e-06,
"loss": 0.39957284927368164,
"memory(GiB)": 74.62,
"step": 686,
"token_acc": 0.8847457627118644,
"train_speed(iter/s)": 0.022462
},
{
"epoch": 0.5557128412537917,
"grad_norm": 2.368927001953125,
"learning_rate": 4.499521541879508e-06,
"loss": 0.3500638008117676,
"memory(GiB)": 74.62,
"step": 687,
"token_acc": 0.8498402555910544,
"train_speed(iter/s)": 0.022462
},
{
"epoch": 0.5565217391304348,
"grad_norm": 2.1441452503204346,
"learning_rate": 4.48622200619722e-06,
"loss": 0.3939352035522461,
"memory(GiB)": 74.62,
"step": 688,
"token_acc": 0.9003831417624522,
"train_speed(iter/s)": 0.022463
},
{
"epoch": 0.5573306370070779,
"grad_norm": 2.4296200275421143,
"learning_rate": 4.472926143331612e-06,
"loss": 0.4165255129337311,
"memory(GiB)": 74.62,
"step": 689,
"token_acc": 0.8741935483870967,
"train_speed(iter/s)": 0.022463
},
{
"epoch": 0.5581395348837209,
"grad_norm": 2.0704715251922607,
"learning_rate": 4.459634048330089e-06,
"loss": 0.3778902292251587,
"memory(GiB)": 74.62,
"step": 690,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022464
},
{
"epoch": 0.558948432760364,
"grad_norm": 1.9288545846939087,
"learning_rate": 4.44634581621313e-06,
"loss": 0.3621513843536377,
"memory(GiB)": 74.62,
"step": 691,
"token_acc": 0.8803827751196173,
"train_speed(iter/s)": 0.022464
},
{
"epoch": 0.5597573306370071,
"grad_norm": 2.8786773681640625,
"learning_rate": 4.433061541973591e-06,
"loss": 0.46439093351364136,
"memory(GiB)": 74.62,
"step": 692,
"token_acc": 0.8423076923076923,
"train_speed(iter/s)": 0.022465
},
{
"epoch": 0.5605662285136501,
"grad_norm": 7.472469329833984,
"learning_rate": 4.419781320576037e-06,
"loss": 0.3596475124359131,
"memory(GiB)": 74.62,
"step": 693,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022465
},
{
"epoch": 0.5613751263902932,
"grad_norm": 2.2149417400360107,
"learning_rate": 4.406505246956064e-06,
"loss": 0.39849790930747986,
"memory(GiB)": 74.62,
"step": 694,
"token_acc": 0.861904761904762,
"train_speed(iter/s)": 0.022466
},
{
"epoch": 0.5621840242669363,
"grad_norm": 2.573707342147827,
"learning_rate": 4.393233416019611e-06,
"loss": 0.33962416648864746,
"memory(GiB)": 74.62,
"step": 695,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022466
},
{
"epoch": 0.5629929221435793,
"grad_norm": 2.2001919746398926,
"learning_rate": 4.379965922642294e-06,
"loss": 0.43496495485305786,
"memory(GiB)": 74.62,
"step": 696,
"token_acc": 0.8486842105263158,
"train_speed(iter/s)": 0.022467
},
{
"epoch": 0.5638018200202225,
"grad_norm": 1.9872112274169922,
"learning_rate": 4.366702861668717e-06,
"loss": 0.3653467297554016,
"memory(GiB)": 74.62,
"step": 697,
"token_acc": 0.8991935483870968,
"train_speed(iter/s)": 0.022467
},
{
"epoch": 0.5646107178968656,
"grad_norm": 2.000946521759033,
"learning_rate": 4.353444327911797e-06,
"loss": 0.4383889138698578,
"memory(GiB)": 74.62,
"step": 698,
"token_acc": 0.8419243986254296,
"train_speed(iter/s)": 0.022468
},
{
"epoch": 0.5654196157735086,
"grad_norm": 2.3316028118133545,
"learning_rate": 4.3401904161520944e-06,
"loss": 0.4090406000614166,
"memory(GiB)": 74.62,
"step": 699,
"token_acc": 0.8454106280193237,
"train_speed(iter/s)": 0.022468
},
{
"epoch": 0.5662285136501517,
"grad_norm": 2.3193917274475098,
"learning_rate": 4.3269412211371215e-06,
"loss": 0.40262287855148315,
"memory(GiB)": 74.62,
"step": 700,
"token_acc": 0.8765432098765432,
"train_speed(iter/s)": 0.022469
},
{
"epoch": 0.5670374115267948,
"grad_norm": 2.7743844985961914,
"learning_rate": 4.313696837580677e-06,
"loss": 0.40288880467414856,
"memory(GiB)": 74.62,
"step": 701,
"token_acc": 0.8993055555555556,
"train_speed(iter/s)": 0.022469
},
{
"epoch": 0.5678463094034378,
"grad_norm": 1.9505183696746826,
"learning_rate": 4.300457360162158e-06,
"loss": 0.34644150733947754,
"memory(GiB)": 74.62,
"step": 702,
"token_acc": 0.8622222222222222,
"train_speed(iter/s)": 0.02247
},
{
"epoch": 0.5686552072800809,
"grad_norm": 2.183720588684082,
"learning_rate": 4.287222883525897e-06,
"loss": 0.429502934217453,
"memory(GiB)": 74.62,
"step": 703,
"token_acc": 0.8661417322834646,
"train_speed(iter/s)": 0.022471
},
{
"epoch": 0.569464105156724,
"grad_norm": 2.0480737686157227,
"learning_rate": 4.273993502280476e-06,
"loss": 0.3910590708255768,
"memory(GiB)": 74.62,
"step": 704,
"token_acc": 0.8404255319148937,
"train_speed(iter/s)": 0.022471
},
{
"epoch": 0.570273003033367,
"grad_norm": 2.1780683994293213,
"learning_rate": 4.2607693109980435e-06,
"loss": 0.45382118225097656,
"memory(GiB)": 74.62,
"step": 705,
"token_acc": 0.8862745098039215,
"train_speed(iter/s)": 0.022472
},
{
"epoch": 0.5710819009100101,
"grad_norm": 2.0752146244049072,
"learning_rate": 4.247550404213661e-06,
"loss": 0.39520663022994995,
"memory(GiB)": 74.62,
"step": 706,
"token_acc": 0.8012422360248447,
"train_speed(iter/s)": 0.022472
},
{
"epoch": 0.5718907987866532,
"grad_norm": 2.0002593994140625,
"learning_rate": 4.2343368764246005e-06,
"loss": 0.4130653738975525,
"memory(GiB)": 74.62,
"step": 707,
"token_acc": 0.862453531598513,
"train_speed(iter/s)": 0.022473
},
{
"epoch": 0.5726996966632962,
"grad_norm": 2.031238317489624,
"learning_rate": 4.221128822089687e-06,
"loss": 0.36960452795028687,
"memory(GiB)": 74.62,
"step": 708,
"token_acc": 0.8981818181818182,
"train_speed(iter/s)": 0.022473
},
{
"epoch": 0.5735085945399393,
"grad_norm": 2.3516478538513184,
"learning_rate": 4.207926335628617e-06,
"loss": 0.43690210580825806,
"memory(GiB)": 74.62,
"step": 709,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022474
},
{
"epoch": 0.5743174924165824,
"grad_norm": 2.5592732429504395,
"learning_rate": 4.194729511421285e-06,
"loss": 0.3793370723724365,
"memory(GiB)": 74.62,
"step": 710,
"token_acc": 0.8393574297188755,
"train_speed(iter/s)": 0.022474
},
{
"epoch": 0.5751263902932254,
"grad_norm": 2.097623825073242,
"learning_rate": 4.181538443807109e-06,
"loss": 0.39188504219055176,
"memory(GiB)": 74.62,
"step": 711,
"token_acc": 0.9033613445378151,
"train_speed(iter/s)": 0.022475
},
{
"epoch": 0.5759352881698686,
"grad_norm": 1.9303717613220215,
"learning_rate": 4.1683532270843505e-06,
"loss": 0.4174485504627228,
"memory(GiB)": 74.62,
"step": 712,
"token_acc": 0.8346774193548387,
"train_speed(iter/s)": 0.022475
},
{
"epoch": 0.5767441860465117,
"grad_norm": 2.5618019104003906,
"learning_rate": 4.15517395550945e-06,
"loss": 0.3718748390674591,
"memory(GiB)": 74.62,
"step": 713,
"token_acc": 0.8719723183391004,
"train_speed(iter/s)": 0.022475
},
{
"epoch": 0.5775530839231547,
"grad_norm": 2.322850227355957,
"learning_rate": 4.1420007232963435e-06,
"loss": 0.3762381970882416,
"memory(GiB)": 74.62,
"step": 714,
"token_acc": 0.8874458874458875,
"train_speed(iter/s)": 0.022476
},
{
"epoch": 0.5783619817997978,
"grad_norm": 2.1827359199523926,
"learning_rate": 4.1288336246158e-06,
"loss": 0.40151140093803406,
"memory(GiB)": 74.62,
"step": 715,
"token_acc": 0.8538461538461538,
"train_speed(iter/s)": 0.022476
},
{
"epoch": 0.5791708796764409,
"grad_norm": 2.6647045612335205,
"learning_rate": 4.115672753594739e-06,
"loss": 0.34364283084869385,
"memory(GiB)": 74.62,
"step": 716,
"token_acc": 0.903448275862069,
"train_speed(iter/s)": 0.022477
},
{
"epoch": 0.5799797775530839,
"grad_norm": 2.086578845977783,
"learning_rate": 4.102518204315555e-06,
"loss": 0.4202456474304199,
"memory(GiB)": 74.62,
"step": 717,
"token_acc": 0.8202764976958525,
"train_speed(iter/s)": 0.022477
},
{
"epoch": 0.580788675429727,
"grad_norm": 1.952487826347351,
"learning_rate": 4.089370070815463e-06,
"loss": 0.37721166014671326,
"memory(GiB)": 74.62,
"step": 718,
"token_acc": 0.878419452887538,
"train_speed(iter/s)": 0.022478
},
{
"epoch": 0.5815975733063701,
"grad_norm": 1.9967212677001953,
"learning_rate": 4.0762284470858e-06,
"loss": 0.42397576570510864,
"memory(GiB)": 74.62,
"step": 719,
"token_acc": 0.8559322033898306,
"train_speed(iter/s)": 0.022479
},
{
"epoch": 0.5824064711830131,
"grad_norm": 2.281806707382202,
"learning_rate": 4.063093427071376e-06,
"loss": 0.3868061900138855,
"memory(GiB)": 74.62,
"step": 720,
"token_acc": 0.9313304721030042,
"train_speed(iter/s)": 0.022479
},
{
"epoch": 0.5832153690596562,
"grad_norm": 2.5271997451782227,
"learning_rate": 4.049965104669795e-06,
"loss": 0.4714341163635254,
"memory(GiB)": 74.62,
"step": 721,
"token_acc": 0.8309859154929577,
"train_speed(iter/s)": 0.022479
},
{
"epoch": 0.5840242669362993,
"grad_norm": 2.1930084228515625,
"learning_rate": 4.036843573730774e-06,
"loss": 0.4007885456085205,
"memory(GiB)": 74.62,
"step": 722,
"token_acc": 0.9045643153526971,
"train_speed(iter/s)": 0.02248
},
{
"epoch": 0.5848331648129423,
"grad_norm": 2.2075302600860596,
"learning_rate": 4.023728928055486e-06,
"loss": 0.4345509707927704,
"memory(GiB)": 74.62,
"step": 723,
"token_acc": 0.8504672897196262,
"train_speed(iter/s)": 0.02248
},
{
"epoch": 0.5856420626895854,
"grad_norm": 2.093959331512451,
"learning_rate": 4.0106212613958805e-06,
"loss": 0.39234721660614014,
"memory(GiB)": 74.62,
"step": 724,
"token_acc": 0.8838174273858921,
"train_speed(iter/s)": 0.022481
},
{
"epoch": 0.5864509605662285,
"grad_norm": 2.8163022994995117,
"learning_rate": 3.99752066745402e-06,
"loss": 0.377105712890625,
"memory(GiB)": 74.62,
"step": 725,
"token_acc": 0.8704318936877077,
"train_speed(iter/s)": 0.022481
},
{
"epoch": 0.5872598584428715,
"grad_norm": 3.310258626937866,
"learning_rate": 3.984427239881404e-06,
"loss": 0.33992162346839905,
"memory(GiB)": 74.62,
"step": 726,
"token_acc": 0.8781725888324873,
"train_speed(iter/s)": 0.022482
},
{
"epoch": 0.5880687563195146,
"grad_norm": 2.1290695667266846,
"learning_rate": 3.971341072278302e-06,
"loss": 0.3612005114555359,
"memory(GiB)": 74.62,
"step": 727,
"token_acc": 0.8576642335766423,
"train_speed(iter/s)": 0.022482
},
{
"epoch": 0.5888776541961578,
"grad_norm": 2.370741844177246,
"learning_rate": 3.958262258193089e-06,
"loss": 0.39483344554901123,
"memory(GiB)": 74.62,
"step": 728,
"token_acc": 0.8625954198473282,
"train_speed(iter/s)": 0.022482
},
{
"epoch": 0.5896865520728009,
"grad_norm": 1.9654161930084229,
"learning_rate": 3.9451908911215645e-06,
"loss": 0.3784998059272766,
"memory(GiB)": 74.62,
"step": 729,
"token_acc": 0.8663101604278075,
"train_speed(iter/s)": 0.022483
},
{
"epoch": 0.5904954499494439,
"grad_norm": 2.5404610633850098,
"learning_rate": 3.9321270645062995e-06,
"loss": 0.4317411780357361,
"memory(GiB)": 74.62,
"step": 730,
"token_acc": 0.8413793103448276,
"train_speed(iter/s)": 0.022484
},
{
"epoch": 0.591304347826087,
"grad_norm": 1.932789921760559,
"learning_rate": 3.919070871735956e-06,
"loss": 0.3979855179786682,
"memory(GiB)": 74.62,
"step": 731,
"token_acc": 0.8699551569506726,
"train_speed(iter/s)": 0.022484
},
{
"epoch": 0.59211324570273,
"grad_norm": 2.322033643722534,
"learning_rate": 3.906022406144625e-06,
"loss": 0.4147607088088989,
"memory(GiB)": 74.62,
"step": 732,
"token_acc": 0.8712871287128713,
"train_speed(iter/s)": 0.022484
},
{
"epoch": 0.5929221435793731,
"grad_norm": 2.0661261081695557,
"learning_rate": 3.892981761011164e-06,
"loss": 0.3968489170074463,
"memory(GiB)": 74.62,
"step": 733,
"token_acc": 0.8418367346938775,
"train_speed(iter/s)": 0.022485
},
{
"epoch": 0.5937310414560162,
"grad_norm": 1.8793938159942627,
"learning_rate": 3.8799490295585155e-06,
"loss": 0.34254151582717896,
"memory(GiB)": 74.62,
"step": 734,
"token_acc": 0.9105263157894737,
"train_speed(iter/s)": 0.022485
},
{
"epoch": 0.5945399393326593,
"grad_norm": 3.2460901737213135,
"learning_rate": 3.866924304953059e-06,
"loss": 0.4647367298603058,
"memory(GiB)": 74.62,
"step": 735,
"token_acc": 0.864,
"train_speed(iter/s)": 0.022486
},
{
"epoch": 0.5953488372093023,
"grad_norm": 2.1490590572357178,
"learning_rate": 3.8539076803039285e-06,
"loss": 0.4941931962966919,
"memory(GiB)": 74.62,
"step": 736,
"token_acc": 0.843065693430657,
"train_speed(iter/s)": 0.022486
},
{
"epoch": 0.5961577350859454,
"grad_norm": 2.9426324367523193,
"learning_rate": 3.840899248662358e-06,
"loss": 0.43801772594451904,
"memory(GiB)": 74.62,
"step": 737,
"token_acc": 0.7985611510791367,
"train_speed(iter/s)": 0.022487
},
{
"epoch": 0.5969666329625885,
"grad_norm": 1.8307894468307495,
"learning_rate": 3.827899103021017e-06,
"loss": 0.36532309651374817,
"memory(GiB)": 74.62,
"step": 738,
"token_acc": 0.8484848484848485,
"train_speed(iter/s)": 0.022487
},
{
"epoch": 0.5977755308392315,
"grad_norm": 1.6826763153076172,
"learning_rate": 3.814907336313329e-06,
"loss": 0.3788911998271942,
"memory(GiB)": 74.62,
"step": 739,
"token_acc": 0.8656716417910447,
"train_speed(iter/s)": 0.022488
},
{
"epoch": 0.5985844287158746,
"grad_norm": 3.5640852451324463,
"learning_rate": 3.8019240414128335e-06,
"loss": 0.3946545720100403,
"memory(GiB)": 74.62,
"step": 740,
"token_acc": 0.8245614035087719,
"train_speed(iter/s)": 0.022488
},
{
"epoch": 0.5993933265925177,
"grad_norm": 3.612060785293579,
"learning_rate": 3.7889493111324977e-06,
"loss": 0.4639260172843933,
"memory(GiB)": 74.62,
"step": 741,
"token_acc": 0.8678571428571429,
"train_speed(iter/s)": 0.022489
},
{
"epoch": 0.6002022244691607,
"grad_norm": 2.10774564743042,
"learning_rate": 3.77598323822407e-06,
"loss": 0.3779371380805969,
"memory(GiB)": 74.62,
"step": 742,
"token_acc": 0.8962264150943396,
"train_speed(iter/s)": 0.022489
},
{
"epoch": 0.6010111223458039,
"grad_norm": 2.0632522106170654,
"learning_rate": 3.763025915377403e-06,
"loss": 0.4415694773197174,
"memory(GiB)": 74.62,
"step": 743,
"token_acc": 0.8744939271255061,
"train_speed(iter/s)": 0.02249
},
{
"epoch": 0.601820020222447,
"grad_norm": 2.2084765434265137,
"learning_rate": 3.7500774352198066e-06,
"loss": 0.4385090470314026,
"memory(GiB)": 74.62,
"step": 744,
"token_acc": 0.8181818181818182,
"train_speed(iter/s)": 0.02249
},
{
"epoch": 0.60262891809909,
"grad_norm": 3.2526354789733887,
"learning_rate": 3.7371378903153747e-06,
"loss": 0.36739417910575867,
"memory(GiB)": 74.62,
"step": 745,
"token_acc": 0.8622047244094488,
"train_speed(iter/s)": 0.022491
},
{
"epoch": 0.6034378159757331,
"grad_norm": 2.1862826347351074,
"learning_rate": 3.7242073731643212e-06,
"loss": 0.39445218443870544,
"memory(GiB)": 74.62,
"step": 746,
"token_acc": 0.9465648854961832,
"train_speed(iter/s)": 0.022491
},
{
"epoch": 0.6042467138523762,
"grad_norm": 1.964879035949707,
"learning_rate": 3.711285976202331e-06,
"loss": 0.4600139558315277,
"memory(GiB)": 74.62,
"step": 747,
"token_acc": 0.8509803921568627,
"train_speed(iter/s)": 0.022491
},
{
"epoch": 0.6050556117290192,
"grad_norm": 2.6029324531555176,
"learning_rate": 3.6983737917998858e-06,
"loss": 0.38224440813064575,
"memory(GiB)": 74.62,
"step": 748,
"token_acc": 0.8801498127340824,
"train_speed(iter/s)": 0.022492
},
{
"epoch": 0.6058645096056623,
"grad_norm": 2.0742950439453125,
"learning_rate": 3.685470912261615e-06,
"loss": 0.3933752775192261,
"memory(GiB)": 74.62,
"step": 749,
"token_acc": 0.8681318681318682,
"train_speed(iter/s)": 0.022492
},
{
"epoch": 0.6066734074823054,
"grad_norm": 3.2914257049560547,
"learning_rate": 3.672577429825629e-06,
"loss": 0.39733976125717163,
"memory(GiB)": 74.62,
"step": 750,
"token_acc": 0.9066147859922179,
"train_speed(iter/s)": 0.022493
},
{
"epoch": 0.6074823053589484,
"grad_norm": 1.9089115858078003,
"learning_rate": 3.659693436662859e-06,
"loss": 0.40482792258262634,
"memory(GiB)": 74.62,
"step": 751,
"token_acc": 0.8535564853556485,
"train_speed(iter/s)": 0.022493
},
{
"epoch": 0.6082912032355915,
"grad_norm": 3.0140185356140137,
"learning_rate": 3.6468190248764063e-06,
"loss": 0.5314335823059082,
"memory(GiB)": 74.62,
"step": 752,
"token_acc": 0.8707865168539326,
"train_speed(iter/s)": 0.022493
},
{
"epoch": 0.6091001011122346,
"grad_norm": 2.3016703128814697,
"learning_rate": 3.6339542865008724e-06,
"loss": 0.3704250454902649,
"memory(GiB)": 74.62,
"step": 753,
"token_acc": 0.8878923766816144,
"train_speed(iter/s)": 0.022494
},
{
"epoch": 0.6099089989888776,
"grad_norm": 1.9638766050338745,
"learning_rate": 3.6210993135017115e-06,
"loss": 0.4164350628852844,
"memory(GiB)": 74.62,
"step": 754,
"token_acc": 0.8492462311557789,
"train_speed(iter/s)": 0.022494
},
{
"epoch": 0.6107178968655207,
"grad_norm": 2.505688428878784,
"learning_rate": 3.608254197774567e-06,
"loss": 0.40423935651779175,
"memory(GiB)": 74.62,
"step": 755,
"token_acc": 0.8679245283018868,
"train_speed(iter/s)": 0.022495
},
{
"epoch": 0.6115267947421638,
"grad_norm": 2.152834415435791,
"learning_rate": 3.595419031144615e-06,
"loss": 0.3799169957637787,
"memory(GiB)": 74.62,
"step": 756,
"token_acc": 0.8670520231213873,
"train_speed(iter/s)": 0.022495
},
{
"epoch": 0.6123356926188068,
"grad_norm": 2.534213066101074,
"learning_rate": 3.582593905365912e-06,
"loss": 0.4056301414966583,
"memory(GiB)": 74.62,
"step": 757,
"token_acc": 0.855072463768116,
"train_speed(iter/s)": 0.022496
},
{
"epoch": 0.61314459049545,
"grad_norm": 1.9786441326141357,
"learning_rate": 3.56977891212073e-06,
"loss": 0.4082239270210266,
"memory(GiB)": 74.62,
"step": 758,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022496
},
{
"epoch": 0.6139534883720931,
"grad_norm": 1.8767694234848022,
"learning_rate": 3.5569741430189163e-06,
"loss": 0.39076924324035645,
"memory(GiB)": 74.62,
"step": 759,
"token_acc": 0.8728070175438597,
"train_speed(iter/s)": 0.022496
},
{
"epoch": 0.6147623862487361,
"grad_norm": 2.0986220836639404,
"learning_rate": 3.5441796895972203e-06,
"loss": 0.4426667094230652,
"memory(GiB)": 74.62,
"step": 760,
"token_acc": 0.8986486486486487,
"train_speed(iter/s)": 0.022497
},
{
"epoch": 0.6155712841253792,
"grad_norm": 2.349647045135498,
"learning_rate": 3.5313956433186535e-06,
"loss": 0.3979909121990204,
"memory(GiB)": 74.62,
"step": 761,
"token_acc": 0.8770949720670391,
"train_speed(iter/s)": 0.022497
},
{
"epoch": 0.6163801820020223,
"grad_norm": 2.267604351043701,
"learning_rate": 3.518622095571831e-06,
"loss": 0.3654158413410187,
"memory(GiB)": 74.62,
"step": 762,
"token_acc": 0.8448979591836735,
"train_speed(iter/s)": 0.022497
},
{
"epoch": 0.6171890798786653,
"grad_norm": 2.626412868499756,
"learning_rate": 3.505859137670313e-06,
"loss": 0.3898380398750305,
"memory(GiB)": 74.62,
"step": 763,
"token_acc": 0.860655737704918,
"train_speed(iter/s)": 0.022497
},
{
"epoch": 0.6179979777553084,
"grad_norm": 2.134931802749634,
"learning_rate": 3.4931068608519626e-06,
"loss": 0.45385637879371643,
"memory(GiB)": 74.62,
"step": 764,
"token_acc": 0.8448275862068966,
"train_speed(iter/s)": 0.022498
},
{
"epoch": 0.6188068756319515,
"grad_norm": 2.1175262928009033,
"learning_rate": 3.4803653562782807e-06,
"loss": 0.44239288568496704,
"memory(GiB)": 74.62,
"step": 765,
"token_acc": 0.8226600985221675,
"train_speed(iter/s)": 0.022498
},
{
"epoch": 0.6196157735085945,
"grad_norm": 1.9157018661499023,
"learning_rate": 3.4676347150337673e-06,
"loss": 0.37729379534721375,
"memory(GiB)": 74.62,
"step": 766,
"token_acc": 0.8744588744588745,
"train_speed(iter/s)": 0.022499
},
{
"epoch": 0.6204246713852376,
"grad_norm": 2.0690548419952393,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.40089553594589233,
"memory(GiB)": 74.62,
"step": 767,
"token_acc": 0.9056603773584906,
"train_speed(iter/s)": 0.022499
},
{
"epoch": 0.6212335692618807,
"grad_norm": 2.284152030944824,
"learning_rate": 3.442206386481297e-06,
"loss": 0.37923118472099304,
"memory(GiB)": 74.62,
"step": 768,
"token_acc": 0.8286713286713286,
"train_speed(iter/s)": 0.022499
},
{
"epoch": 0.6220424671385237,
"grad_norm": 3.0014827251434326,
"learning_rate": 3.429508880951444e-06,
"loss": 0.38093435764312744,
"memory(GiB)": 74.62,
"step": 769,
"token_acc": 0.8698224852071006,
"train_speed(iter/s)": 0.0225
},
{
"epoch": 0.6228513650151668,
"grad_norm": 2.2891621589660645,
"learning_rate": 3.4168226023056638e-06,
"loss": 0.4511076509952545,
"memory(GiB)": 74.62,
"step": 770,
"token_acc": 0.7976190476190477,
"train_speed(iter/s)": 0.0225
},
{
"epoch": 0.6236602628918099,
"grad_norm": 2.116448163986206,
"learning_rate": 3.4041476412336672e-06,
"loss": 0.49026528000831604,
"memory(GiB)": 74.62,
"step": 771,
"token_acc": 0.840625,
"train_speed(iter/s)": 0.022501
},
{
"epoch": 0.6244691607684529,
"grad_norm": 1.6491224765777588,
"learning_rate": 3.391484088344257e-06,
"loss": 0.3303212523460388,
"memory(GiB)": 74.62,
"step": 772,
"token_acc": 0.8959276018099548,
"train_speed(iter/s)": 0.022501
},
{
"epoch": 0.625278058645096,
"grad_norm": 2.458468198776245,
"learning_rate": 3.3788320341646764e-06,
"loss": 0.37041348218917847,
"memory(GiB)": 74.62,
"step": 773,
"token_acc": 0.8658008658008658,
"train_speed(iter/s)": 0.022501
},
{
"epoch": 0.6260869565217392,
"grad_norm": 1.9400595426559448,
"learning_rate": 3.3661915691399814e-06,
"loss": 0.40716874599456787,
"memory(GiB)": 74.62,
"step": 774,
"token_acc": 0.8565400843881856,
"train_speed(iter/s)": 0.022502
},
{
"epoch": 0.6268958543983822,
"grad_norm": 2.076422691345215,
"learning_rate": 3.3535627836323683e-06,
"loss": 0.4028838276863098,
"memory(GiB)": 74.62,
"step": 775,
"token_acc": 0.8844444444444445,
"train_speed(iter/s)": 0.022502
},
{
"epoch": 0.6277047522750253,
"grad_norm": 2.1392087936401367,
"learning_rate": 3.340945767920547e-06,
"loss": 0.3876573443412781,
"memory(GiB)": 74.62,
"step": 776,
"token_acc": 0.9047619047619048,
"train_speed(iter/s)": 0.022502
},
{
"epoch": 0.6285136501516684,
"grad_norm": 1.980198860168457,
"learning_rate": 3.328340612199091e-06,
"loss": 0.3929121494293213,
"memory(GiB)": 74.62,
"step": 777,
"token_acc": 0.8625,
"train_speed(iter/s)": 0.022503
},
{
"epoch": 0.6293225480283114,
"grad_norm": 2.5135369300842285,
"learning_rate": 3.315747406577787e-06,
"loss": 0.4506552815437317,
"memory(GiB)": 74.62,
"step": 778,
"token_acc": 0.8625954198473282,
"train_speed(iter/s)": 0.022503
},
{
"epoch": 0.6301314459049545,
"grad_norm": 2.9397776126861572,
"learning_rate": 3.303166241080996e-06,
"loss": 0.366382896900177,
"memory(GiB)": 74.62,
"step": 779,
"token_acc": 0.9363636363636364,
"train_speed(iter/s)": 0.022504
},
{
"epoch": 0.6309403437815976,
"grad_norm": 2.5433013439178467,
"learning_rate": 3.290597205647009e-06,
"loss": 0.39890724420547485,
"memory(GiB)": 74.62,
"step": 780,
"token_acc": 0.8835341365461847,
"train_speed(iter/s)": 0.022504
},
{
"epoch": 0.6317492416582406,
"grad_norm": 1.8281358480453491,
"learning_rate": 3.2780403901274026e-06,
"loss": 0.3230600953102112,
"memory(GiB)": 74.62,
"step": 781,
"token_acc": 0.8682170542635659,
"train_speed(iter/s)": 0.022504
},
{
"epoch": 0.6325581395348837,
"grad_norm": 2.3992929458618164,
"learning_rate": 3.265495884286397e-06,
"loss": 0.3860858082771301,
"memory(GiB)": 74.62,
"step": 782,
"token_acc": 0.8675213675213675,
"train_speed(iter/s)": 0.022505
},
{
"epoch": 0.6333670374115268,
"grad_norm": 2.3929519653320312,
"learning_rate": 3.2529637778002177e-06,
"loss": 0.41789501905441284,
"memory(GiB)": 74.62,
"step": 783,
"token_acc": 0.8291666666666667,
"train_speed(iter/s)": 0.022505
},
{
"epoch": 0.6341759352881698,
"grad_norm": 2.3482816219329834,
"learning_rate": 3.2404441602564507e-06,
"loss": 0.42455971240997314,
"memory(GiB)": 74.62,
"step": 784,
"token_acc": 0.8830188679245283,
"train_speed(iter/s)": 0.022506
},
{
"epoch": 0.6349848331648129,
"grad_norm": 1.525108814239502,
"learning_rate": 3.2279371211533976e-06,
"loss": 0.3243609070777893,
"memory(GiB)": 74.62,
"step": 785,
"token_acc": 0.8814229249011858,
"train_speed(iter/s)": 0.022506
},
{
"epoch": 0.635793731041456,
"grad_norm": 2.330397367477417,
"learning_rate": 3.2154427498994517e-06,
"loss": 0.424887478351593,
"memory(GiB)": 74.62,
"step": 786,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022506
},
{
"epoch": 0.636602628918099,
"grad_norm": 1.9202159643173218,
"learning_rate": 3.202961135812437e-06,
"loss": 0.3225609064102173,
"memory(GiB)": 74.62,
"step": 787,
"token_acc": 0.9056603773584906,
"train_speed(iter/s)": 0.022507
},
{
"epoch": 0.6374115267947421,
"grad_norm": 1.9144957065582275,
"learning_rate": 3.1904923681189883e-06,
"loss": 0.3496546745300293,
"memory(GiB)": 74.62,
"step": 788,
"token_acc": 0.903010033444816,
"train_speed(iter/s)": 0.022507
},
{
"epoch": 0.6382204246713853,
"grad_norm": 2.0034921169281006,
"learning_rate": 3.1780365359539043e-06,
"loss": 0.41543805599212646,
"memory(GiB)": 74.62,
"step": 789,
"token_acc": 0.8977272727272727,
"train_speed(iter/s)": 0.022508
},
{
"epoch": 0.6390293225480284,
"grad_norm": 1.9115188121795654,
"learning_rate": 3.1655937283595116e-06,
"loss": 0.38339167833328247,
"memory(GiB)": 74.62,
"step": 790,
"token_acc": 0.8555133079847909,
"train_speed(iter/s)": 0.022508
},
{
"epoch": 0.6398382204246714,
"grad_norm": 2.29840350151062,
"learning_rate": 3.153164034285031e-06,
"loss": 0.3975831866264343,
"memory(GiB)": 74.62,
"step": 791,
"token_acc": 0.8322368421052632,
"train_speed(iter/s)": 0.022508
},
{
"epoch": 0.6406471183013145,
"grad_norm": 2.4968957901000977,
"learning_rate": 3.1407475425859348e-06,
"loss": 0.346437931060791,
"memory(GiB)": 74.62,
"step": 792,
"token_acc": 0.8744588744588745,
"train_speed(iter/s)": 0.022509
},
{
"epoch": 0.6414560161779576,
"grad_norm": 2.1374566555023193,
"learning_rate": 3.1283443420233196e-06,
"loss": 0.4348532557487488,
"memory(GiB)": 74.62,
"step": 793,
"token_acc": 0.881578947368421,
"train_speed(iter/s)": 0.022509
},
{
"epoch": 0.6422649140546006,
"grad_norm": 2.104574203491211,
"learning_rate": 3.1159545212632697e-06,
"loss": 0.3768533170223236,
"memory(GiB)": 74.62,
"step": 794,
"token_acc": 0.8533834586466166,
"train_speed(iter/s)": 0.02251
},
{
"epoch": 0.6430738119312437,
"grad_norm": 2.8082082271575928,
"learning_rate": 3.1035781688762177e-06,
"loss": 0.3694327473640442,
"memory(GiB)": 74.62,
"step": 795,
"token_acc": 0.8781512605042017,
"train_speed(iter/s)": 0.02251
},
{
"epoch": 0.6438827098078868,
"grad_norm": 2.036285161972046,
"learning_rate": 3.0912153733363203e-06,
"loss": 0.4223785996437073,
"memory(GiB)": 74.62,
"step": 796,
"token_acc": 0.9013452914798207,
"train_speed(iter/s)": 0.02251
},
{
"epoch": 0.6446916076845298,
"grad_norm": 1.9402992725372314,
"learning_rate": 3.078866223020815e-06,
"loss": 0.39007920026779175,
"memory(GiB)": 74.62,
"step": 797,
"token_acc": 0.8409090909090909,
"train_speed(iter/s)": 0.022511
},
{
"epoch": 0.6455005055611729,
"grad_norm": 2.3638556003570557,
"learning_rate": 3.066530806209402e-06,
"loss": 0.39857369661331177,
"memory(GiB)": 74.62,
"step": 798,
"token_acc": 0.8956521739130435,
"train_speed(iter/s)": 0.022511
},
{
"epoch": 0.646309403437816,
"grad_norm": 1.8377914428710938,
"learning_rate": 3.0542092110835996e-06,
"loss": 0.3549560010433197,
"memory(GiB)": 74.62,
"step": 799,
"token_acc": 0.8955823293172691,
"train_speed(iter/s)": 0.022512
},
{
"epoch": 0.647118301314459,
"grad_norm": 2.2061686515808105,
"learning_rate": 3.04190152572612e-06,
"loss": 0.43962785601615906,
"memory(GiB)": 74.62,
"step": 800,
"token_acc": 0.8395522388059702,
"train_speed(iter/s)": 0.022512
},
{
"epoch": 0.6479271991911021,
"grad_norm": 2.3892087936401367,
"learning_rate": 3.0296078381202465e-06,
"loss": 0.37227606773376465,
"memory(GiB)": 74.62,
"step": 801,
"token_acc": 0.8662420382165605,
"train_speed(iter/s)": 0.022512
},
{
"epoch": 0.6487360970677452,
"grad_norm": 2.125608444213867,
"learning_rate": 3.017328236149187e-06,
"loss": 0.43218767642974854,
"memory(GiB)": 74.62,
"step": 802,
"token_acc": 0.8796992481203008,
"train_speed(iter/s)": 0.022513
},
{
"epoch": 0.6495449949443882,
"grad_norm": 2.3993020057678223,
"learning_rate": 3.0050628075954643e-06,
"loss": 0.3682135343551636,
"memory(GiB)": 74.62,
"step": 803,
"token_acc": 0.9087136929460581,
"train_speed(iter/s)": 0.022513
},
{
"epoch": 0.6503538928210314,
"grad_norm": 2.251502513885498,
"learning_rate": 2.9928116401402753e-06,
"loss": 0.4699886441230774,
"memory(GiB)": 74.62,
"step": 804,
"token_acc": 0.8686440677966102,
"train_speed(iter/s)": 0.022513
},
{
"epoch": 0.6511627906976745,
"grad_norm": 13.69151496887207,
"learning_rate": 2.9805748213628727e-06,
"loss": 0.3267248272895813,
"memory(GiB)": 74.62,
"step": 805,
"token_acc": 0.8592057761732852,
"train_speed(iter/s)": 0.022514
},
{
"epoch": 0.6519716885743175,
"grad_norm": 2.1798858642578125,
"learning_rate": 2.968352438739936e-06,
"loss": 0.4122653901576996,
"memory(GiB)": 74.62,
"step": 806,
"token_acc": 0.8377581120943953,
"train_speed(iter/s)": 0.022514
},
{
"epoch": 0.6527805864509606,
"grad_norm": 1.9182910919189453,
"learning_rate": 2.956144579644942e-06,
"loss": 0.36671823263168335,
"memory(GiB)": 74.62,
"step": 807,
"token_acc": 0.8716216216216216,
"train_speed(iter/s)": 0.022515
},
{
"epoch": 0.6535894843276037,
"grad_norm": 2.026547908782959,
"learning_rate": 2.9439513313475464e-06,
"loss": 0.3970714807510376,
"memory(GiB)": 74.62,
"step": 808,
"token_acc": 0.9066666666666666,
"train_speed(iter/s)": 0.022515
},
{
"epoch": 0.6543983822042467,
"grad_norm": 2.1154861450195312,
"learning_rate": 2.931772781012958e-06,
"loss": 0.3996396064758301,
"memory(GiB)": 74.62,
"step": 809,
"token_acc": 0.8494623655913979,
"train_speed(iter/s)": 0.022515
},
{
"epoch": 0.6552072800808898,
"grad_norm": 2.0756337642669678,
"learning_rate": 2.9196090157013146e-06,
"loss": 0.44487231969833374,
"memory(GiB)": 74.62,
"step": 810,
"token_acc": 0.8075471698113208,
"train_speed(iter/s)": 0.022516
},
{
"epoch": 0.6560161779575329,
"grad_norm": 2.0214574337005615,
"learning_rate": 2.907460122367062e-06,
"loss": 0.3471815586090088,
"memory(GiB)": 74.62,
"step": 811,
"token_acc": 0.8540925266903915,
"train_speed(iter/s)": 0.022516
},
{
"epoch": 0.6568250758341759,
"grad_norm": 1.8203327655792236,
"learning_rate": 2.8953261878583263e-06,
"loss": 0.3285714387893677,
"memory(GiB)": 74.62,
"step": 812,
"token_acc": 0.9137931034482759,
"train_speed(iter/s)": 0.022517
},
{
"epoch": 0.657633973710819,
"grad_norm": 2.6111230850219727,
"learning_rate": 2.8832072989163048e-06,
"loss": 0.38925743103027344,
"memory(GiB)": 74.62,
"step": 813,
"token_acc": 0.8852459016393442,
"train_speed(iter/s)": 0.022517
},
{
"epoch": 0.6584428715874621,
"grad_norm": 1.8417023420333862,
"learning_rate": 2.871103542174637e-06,
"loss": 0.3698727488517761,
"memory(GiB)": 74.62,
"step": 814,
"token_acc": 0.8767605633802817,
"train_speed(iter/s)": 0.022517
},
{
"epoch": 0.6592517694641051,
"grad_norm": 2.0547242164611816,
"learning_rate": 2.859015004158789e-06,
"loss": 0.37436971068382263,
"memory(GiB)": 74.62,
"step": 815,
"token_acc": 0.8426966292134831,
"train_speed(iter/s)": 0.022518
},
{
"epoch": 0.6600606673407482,
"grad_norm": 3.1478235721588135,
"learning_rate": 2.8469417712854287e-06,
"loss": 0.4491364359855652,
"memory(GiB)": 74.62,
"step": 816,
"token_acc": 0.8157894736842105,
"train_speed(iter/s)": 0.022518
},
{
"epoch": 0.6608695652173913,
"grad_norm": 2.21091890335083,
"learning_rate": 2.834883929861818e-06,
"loss": 0.3636167049407959,
"memory(GiB)": 74.62,
"step": 817,
"token_acc": 0.9236947791164659,
"train_speed(iter/s)": 0.022518
},
{
"epoch": 0.6616784630940343,
"grad_norm": 2.1053714752197266,
"learning_rate": 2.822841566085192e-06,
"loss": 0.3697773218154907,
"memory(GiB)": 74.62,
"step": 818,
"token_acc": 0.9090909090909091,
"train_speed(iter/s)": 0.022519
},
{
"epoch": 0.6624873609706774,
"grad_norm": 1.9461814165115356,
"learning_rate": 2.8108147660421325e-06,
"loss": 0.42437541484832764,
"memory(GiB)": 74.62,
"step": 819,
"token_acc": 0.8985507246376812,
"train_speed(iter/s)": 0.022519
},
{
"epoch": 0.6632962588473206,
"grad_norm": 1.9878171682357788,
"learning_rate": 2.798803615707976e-06,
"loss": 0.40904805064201355,
"memory(GiB)": 74.62,
"step": 820,
"token_acc": 0.8475609756097561,
"train_speed(iter/s)": 0.022519
},
{
"epoch": 0.6641051567239636,
"grad_norm": 1.8959929943084717,
"learning_rate": 2.78680820094617e-06,
"loss": 0.3745640218257904,
"memory(GiB)": 74.62,
"step": 821,
"token_acc": 0.914396887159533,
"train_speed(iter/s)": 0.02252
},
{
"epoch": 0.6649140546006067,
"grad_norm": 2.005540609359741,
"learning_rate": 2.7748286075076834e-06,
"loss": 0.364071786403656,
"memory(GiB)": 74.62,
"step": 822,
"token_acc": 0.8765432098765432,
"train_speed(iter/s)": 0.02252
},
{
"epoch": 0.6657229524772498,
"grad_norm": 2.166395902633667,
"learning_rate": 2.762864921030384e-06,
"loss": 0.37051212787628174,
"memory(GiB)": 74.62,
"step": 823,
"token_acc": 0.8909774436090225,
"train_speed(iter/s)": 0.02252
},
{
"epoch": 0.6665318503538928,
"grad_norm": 1.9548283815383911,
"learning_rate": 2.750917227038419e-06,
"loss": 0.39772191643714905,
"memory(GiB)": 74.62,
"step": 824,
"token_acc": 0.8986928104575164,
"train_speed(iter/s)": 0.022521
},
{
"epoch": 0.6673407482305359,
"grad_norm": 2.373486280441284,
"learning_rate": 2.7389856109416178e-06,
"loss": 0.39033639430999756,
"memory(GiB)": 74.62,
"step": 825,
"token_acc": 0.8876404494382022,
"train_speed(iter/s)": 0.022521
},
{
"epoch": 0.668149646107179,
"grad_norm": 1.9656351804733276,
"learning_rate": 2.7270701580348737e-06,
"loss": 0.4327496588230133,
"memory(GiB)": 74.62,
"step": 826,
"token_acc": 0.8840579710144928,
"train_speed(iter/s)": 0.022522
},
{
"epoch": 0.668958543983822,
"grad_norm": 1.7876020669937134,
"learning_rate": 2.715170953497532e-06,
"loss": 0.4038127064704895,
"memory(GiB)": 74.62,
"step": 827,
"token_acc": 0.8581081081081081,
"train_speed(iter/s)": 0.022522
},
{
"epoch": 0.6697674418604651,
"grad_norm": 2.269183397293091,
"learning_rate": 2.703288082392791e-06,
"loss": 0.3742678165435791,
"memory(GiB)": 74.62,
"step": 828,
"token_acc": 0.9116279069767442,
"train_speed(iter/s)": 0.022522
},
{
"epoch": 0.6705763397371082,
"grad_norm": 2.3092498779296875,
"learning_rate": 2.691421629667076e-06,
"loss": 0.3477456867694855,
"memory(GiB)": 74.62,
"step": 829,
"token_acc": 0.8858447488584474,
"train_speed(iter/s)": 0.022523
},
{
"epoch": 0.6713852376137512,
"grad_norm": 2.0374417304992676,
"learning_rate": 2.6795716801494538e-06,
"loss": 0.3951851725578308,
"memory(GiB)": 74.62,
"step": 830,
"token_acc": 0.8655913978494624,
"train_speed(iter/s)": 0.022523
},
{
"epoch": 0.6721941354903943,
"grad_norm": 2.6279661655426025,
"learning_rate": 2.6677383185510053e-06,
"loss": 0.37477776408195496,
"memory(GiB)": 74.62,
"step": 831,
"token_acc": 0.8745519713261649,
"train_speed(iter/s)": 0.022523
},
{
"epoch": 0.6730030333670374,
"grad_norm": 2.128077268600464,
"learning_rate": 2.6559216294642446e-06,
"loss": 0.34244000911712646,
"memory(GiB)": 74.62,
"step": 832,
"token_acc": 0.8764478764478765,
"train_speed(iter/s)": 0.022524
},
{
"epoch": 0.6738119312436804,
"grad_norm": 1.9825257062911987,
"learning_rate": 2.6441216973624857e-06,
"loss": 0.36798208951950073,
"memory(GiB)": 74.62,
"step": 833,
"token_acc": 0.9363957597173145,
"train_speed(iter/s)": 0.022524
},
{
"epoch": 0.6746208291203235,
"grad_norm": 2.1210215091705322,
"learning_rate": 2.6323386065992596e-06,
"loss": 0.3946457505226135,
"memory(GiB)": 74.62,
"step": 834,
"token_acc": 0.8380681818181818,
"train_speed(iter/s)": 0.022524
},
{
"epoch": 0.6754297269969667,
"grad_norm": 1.8778958320617676,
"learning_rate": 2.6205724414077064e-06,
"loss": 0.3758698105812073,
"memory(GiB)": 74.62,
"step": 835,
"token_acc": 0.8895705521472392,
"train_speed(iter/s)": 0.022525
},
{
"epoch": 0.6762386248736098,
"grad_norm": 1.917371153831482,
"learning_rate": 2.6088232858999644e-06,
"loss": 0.3301732540130615,
"memory(GiB)": 74.62,
"step": 836,
"token_acc": 0.9437229437229437,
"train_speed(iter/s)": 0.022525
},
{
"epoch": 0.6770475227502528,
"grad_norm": 2.223240613937378,
"learning_rate": 2.5970912240665815e-06,
"loss": 0.4553636908531189,
"memory(GiB)": 74.62,
"step": 837,
"token_acc": 0.8589211618257261,
"train_speed(iter/s)": 0.022525
},
{
"epoch": 0.6778564206268959,
"grad_norm": 3.028218984603882,
"learning_rate": 2.585376339775908e-06,
"loss": 0.46183380484580994,
"memory(GiB)": 74.62,
"step": 838,
"token_acc": 0.8557213930348259,
"train_speed(iter/s)": 0.022526
},
{
"epoch": 0.678665318503539,
"grad_norm": 1.9921714067459106,
"learning_rate": 2.573678716773496e-06,
"loss": 0.38901880383491516,
"memory(GiB)": 74.62,
"step": 839,
"token_acc": 0.8819188191881919,
"train_speed(iter/s)": 0.022526
},
{
"epoch": 0.679474216380182,
"grad_norm": 2.3916425704956055,
"learning_rate": 2.5619984386815073e-06,
"loss": 0.4160255193710327,
"memory(GiB)": 74.62,
"step": 840,
"token_acc": 0.8577981651376146,
"train_speed(iter/s)": 0.022526
},
{
"epoch": 0.6802831142568251,
"grad_norm": 2.2416515350341797,
"learning_rate": 2.550335588998103e-06,
"loss": 0.46858906745910645,
"memory(GiB)": 74.62,
"step": 841,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022526
},
{
"epoch": 0.6810920121334681,
"grad_norm": 1.9555854797363281,
"learning_rate": 2.5386902510968627e-06,
"loss": 0.4036467969417572,
"memory(GiB)": 74.62,
"step": 842,
"token_acc": 0.8421052631578947,
"train_speed(iter/s)": 0.022527
},
{
"epoch": 0.6819009100101112,
"grad_norm": 4.791243553161621,
"learning_rate": 2.527062508226176e-06,
"loss": 0.37610989809036255,
"memory(GiB)": 74.62,
"step": 843,
"token_acc": 0.8712871287128713,
"train_speed(iter/s)": 0.022527
},
{
"epoch": 0.6827098078867543,
"grad_norm": 1.9034098386764526,
"learning_rate": 2.5154524435086537e-06,
"loss": 0.3707886040210724,
"memory(GiB)": 74.62,
"step": 844,
"token_acc": 0.8761904761904762,
"train_speed(iter/s)": 0.022527
},
{
"epoch": 0.6835187057633973,
"grad_norm": 2.0733163356781006,
"learning_rate": 2.5038601399405337e-06,
"loss": 0.4223529100418091,
"memory(GiB)": 74.62,
"step": 845,
"token_acc": 0.844106463878327,
"train_speed(iter/s)": 0.022528
},
{
"epoch": 0.6843276036400404,
"grad_norm": 1.9344497919082642,
"learning_rate": 2.492285680391079e-06,
"loss": 0.38297271728515625,
"memory(GiB)": 74.62,
"step": 846,
"token_acc": 0.8446215139442231,
"train_speed(iter/s)": 0.022528
},
{
"epoch": 0.6851365015166835,
"grad_norm": 2.1887664794921875,
"learning_rate": 2.4807291476019996e-06,
"loss": 0.3631531000137329,
"memory(GiB)": 74.62,
"step": 847,
"token_acc": 0.8963963963963963,
"train_speed(iter/s)": 0.022529
},
{
"epoch": 0.6859453993933265,
"grad_norm": 2.7835731506347656,
"learning_rate": 2.4691906241868473e-06,
"loss": 0.4326528310775757,
"memory(GiB)": 74.62,
"step": 848,
"token_acc": 0.8395522388059702,
"train_speed(iter/s)": 0.022529
},
{
"epoch": 0.6867542972699696,
"grad_norm": 2.0431745052337646,
"learning_rate": 2.4576701926304357e-06,
"loss": 0.34864187240600586,
"memory(GiB)": 74.62,
"step": 849,
"token_acc": 0.899581589958159,
"train_speed(iter/s)": 0.022529
},
{
"epoch": 0.6875631951466128,
"grad_norm": 5.60698938369751,
"learning_rate": 2.4461679352882443e-06,
"loss": 0.4054935574531555,
"memory(GiB)": 74.62,
"step": 850,
"token_acc": 0.8398058252427184,
"train_speed(iter/s)": 0.022529
},
{
"epoch": 0.6883720930232559,
"grad_norm": 1.8018779754638672,
"learning_rate": 2.434683934385833e-06,
"loss": 0.32462042570114136,
"memory(GiB)": 74.62,
"step": 851,
"token_acc": 0.8726591760299626,
"train_speed(iter/s)": 0.02253
},
{
"epoch": 0.6891809908998989,
"grad_norm": 3.250086545944214,
"learning_rate": 2.4232182720182524e-06,
"loss": 0.3477787375450134,
"memory(GiB)": 74.62,
"step": 852,
"token_acc": 0.8364312267657993,
"train_speed(iter/s)": 0.02253
},
{
"epoch": 0.689989888776542,
"grad_norm": 2.627101421356201,
"learning_rate": 2.4117710301494527e-06,
"loss": 0.38884738087654114,
"memory(GiB)": 74.62,
"step": 853,
"token_acc": 0.9395348837209302,
"train_speed(iter/s)": 0.022531
},
{
"epoch": 0.6907987866531851,
"grad_norm": 2.8539373874664307,
"learning_rate": 2.40034229061171e-06,
"loss": 0.40084555745124817,
"memory(GiB)": 74.62,
"step": 854,
"token_acc": 0.8843283582089553,
"train_speed(iter/s)": 0.022531
},
{
"epoch": 0.6916076845298281,
"grad_norm": 2.5881996154785156,
"learning_rate": 2.3889321351050286e-06,
"loss": 0.36527204513549805,
"memory(GiB)": 74.62,
"step": 855,
"token_acc": 0.911504424778761,
"train_speed(iter/s)": 0.022531
},
{
"epoch": 0.6924165824064712,
"grad_norm": 2.2433817386627197,
"learning_rate": 2.377540645196565e-06,
"loss": 0.4530036151409149,
"memory(GiB)": 74.62,
"step": 856,
"token_acc": 0.8681672025723473,
"train_speed(iter/s)": 0.022532
},
{
"epoch": 0.6932254802831143,
"grad_norm": 2.251718759536743,
"learning_rate": 2.3661679023200422e-06,
"loss": 0.44757646322250366,
"memory(GiB)": 74.62,
"step": 857,
"token_acc": 0.9019607843137255,
"train_speed(iter/s)": 0.022532
},
{
"epoch": 0.6940343781597573,
"grad_norm": 1.987608790397644,
"learning_rate": 2.354813987775163e-06,
"loss": 0.34107983112335205,
"memory(GiB)": 74.62,
"step": 858,
"token_acc": 0.871875,
"train_speed(iter/s)": 0.022532
},
{
"epoch": 0.6948432760364004,
"grad_norm": 2.4668984413146973,
"learning_rate": 2.343478982727039e-06,
"loss": 0.4043659269809723,
"memory(GiB)": 74.62,
"step": 859,
"token_acc": 0.8947368421052632,
"train_speed(iter/s)": 0.022532
},
{
"epoch": 0.6956521739130435,
"grad_norm": 1.9259587526321411,
"learning_rate": 2.3321629682055984e-06,
"loss": 0.378429114818573,
"memory(GiB)": 74.62,
"step": 860,
"token_acc": 0.848297213622291,
"train_speed(iter/s)": 0.022533
},
{
"epoch": 0.6964610717896865,
"grad_norm": 1.9682130813598633,
"learning_rate": 2.320866025105016e-06,
"loss": 0.34357139468193054,
"memory(GiB)": 74.62,
"step": 861,
"token_acc": 0.8348348348348348,
"train_speed(iter/s)": 0.022533
},
{
"epoch": 0.6972699696663296,
"grad_norm": 2.785592794418335,
"learning_rate": 2.309588234183137e-06,
"loss": 0.3498800992965698,
"memory(GiB)": 74.62,
"step": 862,
"token_acc": 0.8847736625514403,
"train_speed(iter/s)": 0.022534
},
{
"epoch": 0.6980788675429727,
"grad_norm": 2.4636342525482178,
"learning_rate": 2.298329676060884e-06,
"loss": 0.39585980772972107,
"memory(GiB)": 74.62,
"step": 863,
"token_acc": 0.865546218487395,
"train_speed(iter/s)": 0.022534
},
{
"epoch": 0.6988877654196157,
"grad_norm": 1.8095598220825195,
"learning_rate": 2.287090431221701e-06,
"loss": 0.37628334760665894,
"memory(GiB)": 74.62,
"step": 864,
"token_acc": 0.8954703832752613,
"train_speed(iter/s)": 0.022534
},
{
"epoch": 0.6996966632962589,
"grad_norm": 1.9140504598617554,
"learning_rate": 2.275870580010958e-06,
"loss": 0.3849208354949951,
"memory(GiB)": 74.62,
"step": 865,
"token_acc": 0.910958904109589,
"train_speed(iter/s)": 0.022534
},
{
"epoch": 0.700505561172902,
"grad_norm": 1.7582415342330933,
"learning_rate": 2.264670202635396e-06,
"loss": 0.3840162754058838,
"memory(GiB)": 74.62,
"step": 866,
"token_acc": 0.8550185873605948,
"train_speed(iter/s)": 0.022535
},
{
"epoch": 0.701314459049545,
"grad_norm": 1.8664969205856323,
"learning_rate": 2.2534893791625408e-06,
"loss": 0.3248283565044403,
"memory(GiB)": 74.62,
"step": 867,
"token_acc": 0.896,
"train_speed(iter/s)": 0.022535
},
{
"epoch": 0.7021233569261881,
"grad_norm": 1.9030721187591553,
"learning_rate": 2.242328189520134e-06,
"loss": 0.35055387020111084,
"memory(GiB)": 74.62,
"step": 868,
"token_acc": 0.8939393939393939,
"train_speed(iter/s)": 0.022535
},
{
"epoch": 0.7029322548028312,
"grad_norm": 2.2921035289764404,
"learning_rate": 2.2311867134955637e-06,
"loss": 0.41889488697052,
"memory(GiB)": 74.62,
"step": 869,
"token_acc": 0.8565573770491803,
"train_speed(iter/s)": 0.022535
},
{
"epoch": 0.7037411526794742,
"grad_norm": 2.5671067237854004,
"learning_rate": 2.2200650307352883e-06,
"loss": 0.3641519844532013,
"memory(GiB)": 74.62,
"step": 870,
"token_acc": 0.8726415094339622,
"train_speed(iter/s)": 0.022536
},
{
"epoch": 0.7045500505561173,
"grad_norm": 2.0666255950927734,
"learning_rate": 2.2089632207442763e-06,
"loss": 0.34707674384117126,
"memory(GiB)": 74.62,
"step": 871,
"token_acc": 0.898876404494382,
"train_speed(iter/s)": 0.022536
},
{
"epoch": 0.7053589484327604,
"grad_norm": 2.3214352130889893,
"learning_rate": 2.197881362885426e-06,
"loss": 0.30853113532066345,
"memory(GiB)": 74.62,
"step": 872,
"token_acc": 0.8477611940298507,
"train_speed(iter/s)": 0.022536
},
{
"epoch": 0.7061678463094034,
"grad_norm": 2.3969626426696777,
"learning_rate": 2.1868195363790147e-06,
"loss": 0.44838905334472656,
"memory(GiB)": 74.62,
"step": 873,
"token_acc": 0.8190954773869347,
"train_speed(iter/s)": 0.022537
},
{
"epoch": 0.7069767441860465,
"grad_norm": 2.3142099380493164,
"learning_rate": 2.1757778203021163e-06,
"loss": 0.4084170460700989,
"memory(GiB)": 74.62,
"step": 874,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022537
},
{
"epoch": 0.7077856420626896,
"grad_norm": 2.4327192306518555,
"learning_rate": 2.1647562935880405e-06,
"loss": 0.4108632802963257,
"memory(GiB)": 74.62,
"step": 875,
"token_acc": 0.8553054662379421,
"train_speed(iter/s)": 0.022537
},
{
"epoch": 0.7085945399393326,
"grad_norm": 1.7217832803726196,
"learning_rate": 2.153755035025777e-06,
"loss": 0.3645017743110657,
"memory(GiB)": 74.62,
"step": 876,
"token_acc": 0.825925925925926,
"train_speed(iter/s)": 0.022538
},
{
"epoch": 0.7094034378159757,
"grad_norm": 1.7630640268325806,
"learning_rate": 2.1427741232594185e-06,
"loss": 0.3739239573478699,
"memory(GiB)": 74.62,
"step": 877,
"token_acc": 0.8757961783439491,
"train_speed(iter/s)": 0.022538
},
{
"epoch": 0.7102123356926188,
"grad_norm": 1.9821792840957642,
"learning_rate": 2.1318136367876098e-06,
"loss": 0.3128720223903656,
"memory(GiB)": 74.62,
"step": 878,
"token_acc": 0.8744769874476988,
"train_speed(iter/s)": 0.022538
},
{
"epoch": 0.7110212335692618,
"grad_norm": 1.9988818168640137,
"learning_rate": 2.120873653962983e-06,
"loss": 0.39012840390205383,
"memory(GiB)": 74.62,
"step": 879,
"token_acc": 0.865814696485623,
"train_speed(iter/s)": 0.022538
},
{
"epoch": 0.7118301314459049,
"grad_norm": 2.3474910259246826,
"learning_rate": 2.109954252991595e-06,
"loss": 0.3977096676826477,
"memory(GiB)": 74.62,
"step": 880,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022539
},
{
"epoch": 0.7126390293225481,
"grad_norm": 1.7941343784332275,
"learning_rate": 2.0990555119323737e-06,
"loss": 0.37561237812042236,
"memory(GiB)": 74.62,
"step": 881,
"token_acc": 0.8759124087591241,
"train_speed(iter/s)": 0.022539
},
{
"epoch": 0.7134479271991911,
"grad_norm": 2.288217782974243,
"learning_rate": 2.0881775086965494e-06,
"loss": 0.3414373993873596,
"memory(GiB)": 74.62,
"step": 882,
"token_acc": 0.8681318681318682,
"train_speed(iter/s)": 0.022539
},
{
"epoch": 0.7142568250758342,
"grad_norm": 1.7807132005691528,
"learning_rate": 2.0773203210471115e-06,
"loss": 0.3832324147224426,
"memory(GiB)": 74.62,
"step": 883,
"token_acc": 0.8442028985507246,
"train_speed(iter/s)": 0.02254
},
{
"epoch": 0.7150657229524773,
"grad_norm": 1.990700602531433,
"learning_rate": 2.0664840265982457e-06,
"loss": 0.4304344952106476,
"memory(GiB)": 74.62,
"step": 884,
"token_acc": 0.8095238095238095,
"train_speed(iter/s)": 0.02254
},
{
"epoch": 0.7158746208291203,
"grad_norm": 1.9708170890808105,
"learning_rate": 2.0556687028147765e-06,
"loss": 0.4029080867767334,
"memory(GiB)": 74.62,
"step": 885,
"token_acc": 0.8530612244897959,
"train_speed(iter/s)": 0.02254
},
{
"epoch": 0.7166835187057634,
"grad_norm": 2.2865779399871826,
"learning_rate": 2.0448744270116206e-06,
"loss": 0.390356183052063,
"memory(GiB)": 74.62,
"step": 886,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.022541
},
{
"epoch": 0.7174924165824065,
"grad_norm": 2.5284066200256348,
"learning_rate": 2.0341012763532243e-06,
"loss": 0.40166282653808594,
"memory(GiB)": 74.62,
"step": 887,
"token_acc": 0.8840579710144928,
"train_speed(iter/s)": 0.022541
},
{
"epoch": 0.7183013144590495,
"grad_norm": 6.747030258178711,
"learning_rate": 2.023349327853025e-06,
"loss": 0.38176417350769043,
"memory(GiB)": 74.62,
"step": 888,
"token_acc": 0.8652482269503546,
"train_speed(iter/s)": 0.022541
},
{
"epoch": 0.7191102123356926,
"grad_norm": 2.049042224884033,
"learning_rate": 2.0126186583728856e-06,
"loss": 0.3778286576271057,
"memory(GiB)": 74.62,
"step": 889,
"token_acc": 0.8429319371727748,
"train_speed(iter/s)": 0.022542
},
{
"epoch": 0.7199191102123357,
"grad_norm": 2.2993712425231934,
"learning_rate": 2.001909344622559e-06,
"loss": 0.4231566786766052,
"memory(GiB)": 74.62,
"step": 890,
"token_acc": 0.8865979381443299,
"train_speed(iter/s)": 0.022542
},
{
"epoch": 0.7207280080889787,
"grad_norm": 2.244127035140991,
"learning_rate": 1.9912214631591314e-06,
"loss": 0.3927876651287079,
"memory(GiB)": 74.62,
"step": 891,
"token_acc": 0.8867924528301887,
"train_speed(iter/s)": 0.022542
},
{
"epoch": 0.7215369059656218,
"grad_norm": 1.9843049049377441,
"learning_rate": 1.9805550903864775e-06,
"loss": 0.39008790254592896,
"memory(GiB)": 74.62,
"step": 892,
"token_acc": 0.8244274809160306,
"train_speed(iter/s)": 0.022543
},
{
"epoch": 0.7223458038422649,
"grad_norm": 2.253777027130127,
"learning_rate": 1.9699103025547145e-06,
"loss": 0.3611776828765869,
"memory(GiB)": 74.62,
"step": 893,
"token_acc": 0.84765625,
"train_speed(iter/s)": 0.022543
},
{
"epoch": 0.7231547017189079,
"grad_norm": 2.2141964435577393,
"learning_rate": 1.9592871757596532e-06,
"loss": 0.4213542640209198,
"memory(GiB)": 74.62,
"step": 894,
"token_acc": 0.8754716981132076,
"train_speed(iter/s)": 0.022543
},
{
"epoch": 0.723963599595551,
"grad_norm": 1.9213643074035645,
"learning_rate": 1.9486857859422607e-06,
"loss": 0.4320271611213684,
"memory(GiB)": 74.62,
"step": 895,
"token_acc": 0.8327526132404182,
"train_speed(iter/s)": 0.022543
},
{
"epoch": 0.7247724974721942,
"grad_norm": 2.10569167137146,
"learning_rate": 1.9381062088881142e-06,
"loss": 0.3284885883331299,
"memory(GiB)": 74.62,
"step": 896,
"token_acc": 0.8831615120274914,
"train_speed(iter/s)": 0.022543
},
{
"epoch": 0.7255813953488373,
"grad_norm": 1.6468027830123901,
"learning_rate": 1.9275485202268574e-06,
"loss": 0.35665562748908997,
"memory(GiB)": 74.62,
"step": 897,
"token_acc": 0.9037037037037037,
"train_speed(iter/s)": 0.022544
},
{
"epoch": 0.7263902932254803,
"grad_norm": 1.961858868598938,
"learning_rate": 1.917012795431665e-06,
"loss": 0.3552227020263672,
"memory(GiB)": 74.62,
"step": 898,
"token_acc": 0.8705882352941177,
"train_speed(iter/s)": 0.022544
},
{
"epoch": 0.7271991911021234,
"grad_norm": 2.2594661712646484,
"learning_rate": 1.9064991098186935e-06,
"loss": 0.42378872632980347,
"memory(GiB)": 74.62,
"step": 899,
"token_acc": 0.8449612403100775,
"train_speed(iter/s)": 0.022544
},
{
"epoch": 0.7280080889787665,
"grad_norm": 2.3480887413024902,
"learning_rate": 1.8960075385465547e-06,
"loss": 0.38160020112991333,
"memory(GiB)": 74.62,
"step": 900,
"token_acc": 0.8577235772357723,
"train_speed(iter/s)": 0.022545
},
{
"epoch": 0.7288169868554095,
"grad_norm": 2.0713682174682617,
"learning_rate": 1.8855381566157727e-06,
"loss": 0.3788355588912964,
"memory(GiB)": 74.62,
"step": 901,
"token_acc": 0.9140271493212669,
"train_speed(iter/s)": 0.022545
},
{
"epoch": 0.7296258847320526,
"grad_norm": 1.8822578191757202,
"learning_rate": 1.875091038868243e-06,
"loss": 0.38564032316207886,
"memory(GiB)": 74.62,
"step": 902,
"token_acc": 0.8618181818181818,
"train_speed(iter/s)": 0.022545
},
{
"epoch": 0.7304347826086957,
"grad_norm": 2.0705273151397705,
"learning_rate": 1.8646662599867072e-06,
"loss": 0.4137299060821533,
"memory(GiB)": 74.62,
"step": 903,
"token_acc": 0.8893617021276595,
"train_speed(iter/s)": 0.022545
},
{
"epoch": 0.7312436804853387,
"grad_norm": 2.7392282485961914,
"learning_rate": 1.8542638944942127e-06,
"loss": 0.41165363788604736,
"memory(GiB)": 74.62,
"step": 904,
"token_acc": 0.873015873015873,
"train_speed(iter/s)": 0.022546
},
{
"epoch": 0.7320525783619818,
"grad_norm": 2.251229763031006,
"learning_rate": 1.8438840167535826e-06,
"loss": 0.39759790897369385,
"memory(GiB)": 74.62,
"step": 905,
"token_acc": 0.8949416342412452,
"train_speed(iter/s)": 0.022546
},
{
"epoch": 0.7328614762386249,
"grad_norm": 2.1164135932922363,
"learning_rate": 1.8335267009668794e-06,
"loss": 0.36323827505111694,
"memory(GiB)": 74.62,
"step": 906,
"token_acc": 0.9142857142857143,
"train_speed(iter/s)": 0.022546
},
{
"epoch": 0.7336703741152679,
"grad_norm": 2.421180009841919,
"learning_rate": 1.8231920211748822e-06,
"loss": 0.35361167788505554,
"memory(GiB)": 74.62,
"step": 907,
"token_acc": 0.8603603603603603,
"train_speed(iter/s)": 0.022547
},
{
"epoch": 0.734479271991911,
"grad_norm": 2.0135669708251953,
"learning_rate": 1.8128800512565514e-06,
"loss": 0.37238454818725586,
"memory(GiB)": 74.62,
"step": 908,
"token_acc": 0.8442906574394463,
"train_speed(iter/s)": 0.022547
},
{
"epoch": 0.735288169868554,
"grad_norm": 3.3785688877105713,
"learning_rate": 1.8025908649285033e-06,
"loss": 0.41406381130218506,
"memory(GiB)": 74.62,
"step": 909,
"token_acc": 0.8631178707224335,
"train_speed(iter/s)": 0.022548
},
{
"epoch": 0.7360970677451971,
"grad_norm": 2.393422842025757,
"learning_rate": 1.7923245357444847e-06,
"loss": 0.3641640543937683,
"memory(GiB)": 74.62,
"step": 910,
"token_acc": 0.9076923076923077,
"train_speed(iter/s)": 0.022548
},
{
"epoch": 0.7369059656218403,
"grad_norm": 2.425569534301758,
"learning_rate": 1.7820811370948371e-06,
"loss": 0.35734257102012634,
"memory(GiB)": 74.62,
"step": 911,
"token_acc": 0.8491379310344828,
"train_speed(iter/s)": 0.022548
},
{
"epoch": 0.7377148634984834,
"grad_norm": 2.3215856552124023,
"learning_rate": 1.771860742205988e-06,
"loss": 0.4984077513217926,
"memory(GiB)": 74.62,
"step": 912,
"token_acc": 0.8333333333333334,
"train_speed(iter/s)": 0.022548
},
{
"epoch": 0.7385237613751264,
"grad_norm": 2.343384265899658,
"learning_rate": 1.7616634241399177e-06,
"loss": 0.3875213861465454,
"memory(GiB)": 74.62,
"step": 913,
"token_acc": 0.8376383763837638,
"train_speed(iter/s)": 0.022548
},
{
"epoch": 0.7393326592517695,
"grad_norm": 1.9467604160308838,
"learning_rate": 1.7514892557936309e-06,
"loss": 0.3730790615081787,
"memory(GiB)": 74.62,
"step": 914,
"token_acc": 0.8776223776223776,
"train_speed(iter/s)": 0.022549
},
{
"epoch": 0.7401415571284126,
"grad_norm": 2.171644687652588,
"learning_rate": 1.7413383098986563e-06,
"loss": 0.3576545715332031,
"memory(GiB)": 74.62,
"step": 915,
"token_acc": 0.8874172185430463,
"train_speed(iter/s)": 0.022549
},
{
"epoch": 0.7409504550050556,
"grad_norm": 2.274996519088745,
"learning_rate": 1.7312106590205014e-06,
"loss": 0.3864002227783203,
"memory(GiB)": 74.62,
"step": 916,
"token_acc": 0.9036144578313253,
"train_speed(iter/s)": 0.022549
},
{
"epoch": 0.7417593528816987,
"grad_norm": 1.9122254848480225,
"learning_rate": 1.7211063755581524e-06,
"loss": 0.36603063344955444,
"memory(GiB)": 74.62,
"step": 917,
"token_acc": 0.8798798798798799,
"train_speed(iter/s)": 0.022549
},
{
"epoch": 0.7425682507583418,
"grad_norm": 2.126805067062378,
"learning_rate": 1.7110255317435503e-06,
"loss": 0.38110482692718506,
"memory(GiB)": 74.62,
"step": 918,
"token_acc": 0.8325581395348837,
"train_speed(iter/s)": 0.02255
},
{
"epoch": 0.7433771486349848,
"grad_norm": 2.5423946380615234,
"learning_rate": 1.7009681996410693e-06,
"loss": 0.3204044699668884,
"memory(GiB)": 74.62,
"step": 919,
"token_acc": 0.8575757575757575,
"train_speed(iter/s)": 0.02255
},
{
"epoch": 0.7441860465116279,
"grad_norm": 2.0116817951202393,
"learning_rate": 1.6909344511470116e-06,
"loss": 0.3353678286075592,
"memory(GiB)": 74.62,
"step": 920,
"token_acc": 0.9169811320754717,
"train_speed(iter/s)": 0.02255
},
{
"epoch": 0.744994944388271,
"grad_norm": 2.245171546936035,
"learning_rate": 1.6809243579890865e-06,
"loss": 0.401151180267334,
"memory(GiB)": 74.62,
"step": 921,
"token_acc": 0.823321554770318,
"train_speed(iter/s)": 0.022551
},
{
"epoch": 0.745803842264914,
"grad_norm": 2.26719331741333,
"learning_rate": 1.6709379917259028e-06,
"loss": 0.4471093714237213,
"memory(GiB)": 74.62,
"step": 922,
"token_acc": 0.8719723183391004,
"train_speed(iter/s)": 0.022551
},
{
"epoch": 0.7466127401415571,
"grad_norm": 2.297231435775757,
"learning_rate": 1.6609754237464475e-06,
"loss": 0.4046187400817871,
"memory(GiB)": 74.62,
"step": 923,
"token_acc": 0.835,
"train_speed(iter/s)": 0.022551
},
{
"epoch": 0.7474216380182002,
"grad_norm": 2.375325918197632,
"learning_rate": 1.651036725269588e-06,
"loss": 0.3570983409881592,
"memory(GiB)": 74.62,
"step": 924,
"token_acc": 0.883177570093458,
"train_speed(iter/s)": 0.022552
},
{
"epoch": 0.7482305358948432,
"grad_norm": 1.6737430095672607,
"learning_rate": 1.6411219673435564e-06,
"loss": 0.33470356464385986,
"memory(GiB)": 74.62,
"step": 925,
"token_acc": 0.9057377049180327,
"train_speed(iter/s)": 0.022552
},
{
"epoch": 0.7490394337714863,
"grad_norm": 1.9846247434616089,
"learning_rate": 1.6312312208454373e-06,
"loss": 0.37246596813201904,
"memory(GiB)": 74.62,
"step": 926,
"token_acc": 0.8854166666666666,
"train_speed(iter/s)": 0.022552
},
{
"epoch": 0.7498483316481295,
"grad_norm": 1.810141921043396,
"learning_rate": 1.6213645564806751e-06,
"loss": 0.3770504593849182,
"memory(GiB)": 74.62,
"step": 927,
"token_acc": 0.9072164948453608,
"train_speed(iter/s)": 0.022553
},
{
"epoch": 0.7506572295247725,
"grad_norm": 2.10400128364563,
"learning_rate": 1.6115220447825503e-06,
"loss": 0.376004159450531,
"memory(GiB)": 74.62,
"step": 928,
"token_acc": 0.9196428571428571,
"train_speed(iter/s)": 0.022553
},
{
"epoch": 0.7514661274014156,
"grad_norm": 2.000704050064087,
"learning_rate": 1.6017037561116899e-06,
"loss": 0.3456549048423767,
"memory(GiB)": 74.62,
"step": 929,
"token_acc": 0.8565573770491803,
"train_speed(iter/s)": 0.022553
},
{
"epoch": 0.7522750252780587,
"grad_norm": 1.5539889335632324,
"learning_rate": 1.59190976065556e-06,
"loss": 0.33691200613975525,
"memory(GiB)": 74.62,
"step": 930,
"token_acc": 0.8848920863309353,
"train_speed(iter/s)": 0.022554
},
{
"epoch": 0.7530839231547017,
"grad_norm": 1.6070380210876465,
"learning_rate": 1.582140128427957e-06,
"loss": 0.39200344681739807,
"memory(GiB)": 74.62,
"step": 931,
"token_acc": 0.8892857142857142,
"train_speed(iter/s)": 0.022554
},
{
"epoch": 0.7538928210313448,
"grad_norm": 1.8517992496490479,
"learning_rate": 1.5723949292685193e-06,
"loss": 0.34315165877342224,
"memory(GiB)": 74.62,
"step": 932,
"token_acc": 0.8774193548387097,
"train_speed(iter/s)": 0.022554
},
{
"epoch": 0.7547017189079879,
"grad_norm": 2.0841267108917236,
"learning_rate": 1.5626742328422195e-06,
"loss": 0.3751834034919739,
"memory(GiB)": 74.62,
"step": 933,
"token_acc": 0.9234234234234234,
"train_speed(iter/s)": 0.022554
},
{
"epoch": 0.7555106167846309,
"grad_norm": 2.080343008041382,
"learning_rate": 1.552978108638869e-06,
"loss": 0.37340766191482544,
"memory(GiB)": 74.62,
"step": 934,
"token_acc": 0.9068627450980392,
"train_speed(iter/s)": 0.022555
},
{
"epoch": 0.756319514661274,
"grad_norm": 2.0687668323516846,
"learning_rate": 1.543306625972623e-06,
"loss": 0.4011552929878235,
"memory(GiB)": 74.62,
"step": 935,
"token_acc": 0.9128440366972477,
"train_speed(iter/s)": 0.022555
},
{
"epoch": 0.7571284125379171,
"grad_norm": 1.9438579082489014,
"learning_rate": 1.5336598539814784e-06,
"loss": 0.389544278383255,
"memory(GiB)": 74.62,
"step": 936,
"token_acc": 0.8618181818181818,
"train_speed(iter/s)": 0.022555
},
{
"epoch": 0.7579373104145601,
"grad_norm": 2.186204671859741,
"learning_rate": 1.5240378616267887e-06,
"loss": 0.34044280648231506,
"memory(GiB)": 74.62,
"step": 937,
"token_acc": 0.8879310344827587,
"train_speed(iter/s)": 0.022555
},
{
"epoch": 0.7587462082912032,
"grad_norm": 2.069333076477051,
"learning_rate": 1.514440717692765e-06,
"loss": 0.41251152753829956,
"memory(GiB)": 74.62,
"step": 938,
"token_acc": 0.8347107438016529,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7595551061678463,
"grad_norm": 1.9282809495925903,
"learning_rate": 1.5048684907859873e-06,
"loss": 0.4127691984176636,
"memory(GiB)": 74.62,
"step": 939,
"token_acc": 0.842443729903537,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7603640040444893,
"grad_norm": 2.28041410446167,
"learning_rate": 1.495321249334908e-06,
"loss": 0.42502281069755554,
"memory(GiB)": 74.62,
"step": 940,
"token_acc": 0.8642857142857143,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7611729019211324,
"grad_norm": 1.8921377658843994,
"learning_rate": 1.485799061589372e-06,
"loss": 0.4206182658672333,
"memory(GiB)": 74.62,
"step": 941,
"token_acc": 0.8811475409836066,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7619817997977756,
"grad_norm": 1.8928072452545166,
"learning_rate": 1.4763019956201252e-06,
"loss": 0.3889954090118408,
"memory(GiB)": 74.62,
"step": 942,
"token_acc": 0.8821752265861027,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7627906976744186,
"grad_norm": 3.128412961959839,
"learning_rate": 1.4668301193183198e-06,
"loss": 0.38851073384284973,
"memory(GiB)": 74.62,
"step": 943,
"token_acc": 0.8724137931034482,
"train_speed(iter/s)": 0.022556
},
{
"epoch": 0.7635995955510617,
"grad_norm": 1.9432473182678223,
"learning_rate": 1.4573835003950438e-06,
"loss": 0.38765308260917664,
"memory(GiB)": 74.62,
"step": 944,
"token_acc": 0.8867924528301887,
"train_speed(iter/s)": 0.022557
},
{
"epoch": 0.7644084934277048,
"grad_norm": 2.4022583961486816,
"learning_rate": 1.4479622063808242e-06,
"loss": 0.43059998750686646,
"memory(GiB)": 74.62,
"step": 945,
"token_acc": 0.8793774319066148,
"train_speed(iter/s)": 0.022557
},
{
"epoch": 0.7652173913043478,
"grad_norm": 2.3695461750030518,
"learning_rate": 1.4385663046251514e-06,
"loss": 0.4037495255470276,
"memory(GiB)": 74.62,
"step": 946,
"token_acc": 0.8772727272727273,
"train_speed(iter/s)": 0.022557
},
{
"epoch": 0.7660262891809909,
"grad_norm": 1.9513347148895264,
"learning_rate": 1.4291958622959972e-06,
"loss": 0.35621780157089233,
"memory(GiB)": 74.62,
"step": 947,
"token_acc": 0.9019607843137255,
"train_speed(iter/s)": 0.022557
},
{
"epoch": 0.766835187057634,
"grad_norm": 2.0191597938537598,
"learning_rate": 1.4198509463793275e-06,
"loss": 0.38198453187942505,
"memory(GiB)": 74.62,
"step": 948,
"token_acc": 0.8981132075471698,
"train_speed(iter/s)": 0.022558
},
{
"epoch": 0.767644084934277,
"grad_norm": 1.8823531866073608,
"learning_rate": 1.4105316236786332e-06,
"loss": 0.39389660954475403,
"memory(GiB)": 74.62,
"step": 949,
"token_acc": 0.8419689119170984,
"train_speed(iter/s)": 0.022558
},
{
"epoch": 0.7684529828109201,
"grad_norm": 2.254852771759033,
"learning_rate": 1.4012379608144477e-06,
"loss": 0.40055525302886963,
"memory(GiB)": 74.62,
"step": 950,
"token_acc": 0.8776371308016878,
"train_speed(iter/s)": 0.022558
},
{
"epoch": 0.7692618806875632,
"grad_norm": 2.2618825435638428,
"learning_rate": 1.3919700242238715e-06,
"loss": 0.4659748673439026,
"memory(GiB)": 74.62,
"step": 951,
"token_acc": 0.804635761589404,
"train_speed(iter/s)": 0.022559
},
{
"epoch": 0.7700707785642062,
"grad_norm": 1.884406328201294,
"learning_rate": 1.382727880160098e-06,
"loss": 0.34575021266937256,
"memory(GiB)": 74.62,
"step": 952,
"token_acc": 0.8618421052631579,
"train_speed(iter/s)": 0.022559
},
{
"epoch": 0.7708796764408493,
"grad_norm": 2.369433641433716,
"learning_rate": 1.3735115946919342e-06,
"loss": 0.35355186462402344,
"memory(GiB)": 74.62,
"step": 953,
"token_acc": 0.9132231404958677,
"train_speed(iter/s)": 0.022559
},
{
"epoch": 0.7716885743174924,
"grad_norm": 1.9989012479782104,
"learning_rate": 1.3643212337033396e-06,
"loss": 0.35935360193252563,
"memory(GiB)": 74.62,
"step": 954,
"token_acc": 0.8350877192982457,
"train_speed(iter/s)": 0.02256
},
{
"epoch": 0.7724974721941354,
"grad_norm": 2.4037156105041504,
"learning_rate": 1.3551568628929434e-06,
"loss": 0.41261640191078186,
"memory(GiB)": 74.62,
"step": 955,
"token_acc": 0.849112426035503,
"train_speed(iter/s)": 0.02256
},
{
"epoch": 0.7733063700707785,
"grad_norm": 3.044893741607666,
"learning_rate": 1.346018547773582e-06,
"loss": 0.46254298090934753,
"memory(GiB)": 74.62,
"step": 956,
"token_acc": 0.8521400778210116,
"train_speed(iter/s)": 0.02256
},
{
"epoch": 0.7741152679474217,
"grad_norm": 2.8486831188201904,
"learning_rate": 1.3369063536718347e-06,
"loss": 0.39035511016845703,
"memory(GiB)": 74.62,
"step": 957,
"token_acc": 0.8973509933774835,
"train_speed(iter/s)": 0.02256
},
{
"epoch": 0.7749241658240648,
"grad_norm": 2.8728833198547363,
"learning_rate": 1.3278203457275401e-06,
"loss": 0.4135955572128296,
"memory(GiB)": 74.62,
"step": 958,
"token_acc": 0.8804347826086957,
"train_speed(iter/s)": 0.02256
},
{
"epoch": 0.7757330637007078,
"grad_norm": 2.2483723163604736,
"learning_rate": 1.3187605888933508e-06,
"loss": 0.3800261616706848,
"memory(GiB)": 74.62,
"step": 959,
"token_acc": 0.8988095238095238,
"train_speed(iter/s)": 0.022561
},
{
"epoch": 0.7765419615773509,
"grad_norm": 2.3790223598480225,
"learning_rate": 1.3097271479342526e-06,
"loss": 0.4093528389930725,
"memory(GiB)": 74.62,
"step": 960,
"token_acc": 0.8419243986254296,
"train_speed(iter/s)": 0.022561
},
{
"epoch": 0.777350859453994,
"grad_norm": 2.5826141834259033,
"learning_rate": 1.3007200874271126e-06,
"loss": 0.30737096071243286,
"memory(GiB)": 74.62,
"step": 961,
"token_acc": 0.8328173374613003,
"train_speed(iter/s)": 0.022561
},
{
"epoch": 0.778159757330637,
"grad_norm": 1.8254023790359497,
"learning_rate": 1.2917394717602123e-06,
"loss": 0.3649098575115204,
"memory(GiB)": 74.62,
"step": 962,
"token_acc": 0.8901960784313725,
"train_speed(iter/s)": 0.022562
},
{
"epoch": 0.7789686552072801,
"grad_norm": 1.9518779516220093,
"learning_rate": 1.2827853651327883e-06,
"loss": 0.3445701599121094,
"memory(GiB)": 74.62,
"step": 963,
"token_acc": 0.87890625,
"train_speed(iter/s)": 0.022562
},
{
"epoch": 0.7797775530839232,
"grad_norm": 2.0577752590179443,
"learning_rate": 1.2738578315545751e-06,
"loss": 0.3813546299934387,
"memory(GiB)": 74.62,
"step": 964,
"token_acc": 0.8389830508474576,
"train_speed(iter/s)": 0.022562
},
{
"epoch": 0.7805864509605662,
"grad_norm": 2.2759878635406494,
"learning_rate": 1.2649569348453416e-06,
"loss": 0.4146482050418854,
"memory(GiB)": 74.62,
"step": 965,
"token_acc": 0.8581314878892734,
"train_speed(iter/s)": 0.022562
},
{
"epoch": 0.7813953488372093,
"grad_norm": 2.162762403488159,
"learning_rate": 1.2560827386344444e-06,
"loss": 0.43926411867141724,
"memory(GiB)": 74.62,
"step": 966,
"token_acc": 0.908256880733945,
"train_speed(iter/s)": 0.022563
},
{
"epoch": 0.7822042467138524,
"grad_norm": 2.6653337478637695,
"learning_rate": 1.2472353063603626e-06,
"loss": 0.3915598690509796,
"memory(GiB)": 74.62,
"step": 967,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022563
},
{
"epoch": 0.7830131445904954,
"grad_norm": 1.9902511835098267,
"learning_rate": 1.238414701270252e-06,
"loss": 0.3581811189651489,
"memory(GiB)": 74.62,
"step": 968,
"token_acc": 0.8584615384615385,
"train_speed(iter/s)": 0.022563
},
{
"epoch": 0.7838220424671385,
"grad_norm": 2.0768163204193115,
"learning_rate": 1.229620986419494e-06,
"loss": 0.40156054496765137,
"memory(GiB)": 74.62,
"step": 969,
"token_acc": 0.8660436137071651,
"train_speed(iter/s)": 0.022563
},
{
"epoch": 0.7846309403437816,
"grad_norm": 2.0157761573791504,
"learning_rate": 1.2208542246712346e-06,
"loss": 0.3723048269748688,
"memory(GiB)": 74.62,
"step": 970,
"token_acc": 0.9051724137931034,
"train_speed(iter/s)": 0.022564
},
{
"epoch": 0.7854398382204246,
"grad_norm": 2.2510571479797363,
"learning_rate": 1.2121144786959466e-06,
"loss": 0.39407879114151,
"memory(GiB)": 74.62,
"step": 971,
"token_acc": 0.8828125,
"train_speed(iter/s)": 0.022564
},
{
"epoch": 0.7862487360970677,
"grad_norm": 1.9419714212417603,
"learning_rate": 1.2034018109709716e-06,
"loss": 0.3809299170970917,
"memory(GiB)": 74.62,
"step": 972,
"token_acc": 0.8663967611336032,
"train_speed(iter/s)": 0.022564
},
{
"epoch": 0.7870576339737109,
"grad_norm": 2.204801321029663,
"learning_rate": 1.1947162837800842e-06,
"loss": 0.41355523467063904,
"memory(GiB)": 74.62,
"step": 973,
"token_acc": 0.8683274021352313,
"train_speed(iter/s)": 0.022565
},
{
"epoch": 0.7878665318503539,
"grad_norm": 2.461207866668701,
"learning_rate": 1.1860579592130366e-06,
"loss": 0.407459557056427,
"memory(GiB)": 74.62,
"step": 974,
"token_acc": 0.8583333333333333,
"train_speed(iter/s)": 0.022565
},
{
"epoch": 0.788675429726997,
"grad_norm": 1.8681888580322266,
"learning_rate": 1.177426899165121e-06,
"loss": 0.33745524287223816,
"memory(GiB)": 74.62,
"step": 975,
"token_acc": 0.9018691588785047,
"train_speed(iter/s)": 0.022565
},
{
"epoch": 0.7894843276036401,
"grad_norm": 1.9317001104354858,
"learning_rate": 1.1688231653367271e-06,
"loss": 0.36072519421577454,
"memory(GiB)": 74.62,
"step": 976,
"token_acc": 0.8922413793103449,
"train_speed(iter/s)": 0.022565
},
{
"epoch": 0.7902932254802831,
"grad_norm": 1.4545793533325195,
"learning_rate": 1.1602468192328936e-06,
"loss": 0.3215617537498474,
"memory(GiB)": 74.62,
"step": 977,
"token_acc": 0.9003436426116839,
"train_speed(iter/s)": 0.022566
},
{
"epoch": 0.7911021233569262,
"grad_norm": 2.098681926727295,
"learning_rate": 1.1516979221628804e-06,
"loss": 0.36492764949798584,
"memory(GiB)": 74.62,
"step": 978,
"token_acc": 0.8724489795918368,
"train_speed(iter/s)": 0.022566
},
{
"epoch": 0.7919110212335693,
"grad_norm": 1.852514386177063,
"learning_rate": 1.1431765352397167e-06,
"loss": 0.3920031785964966,
"memory(GiB)": 74.62,
"step": 979,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022566
},
{
"epoch": 0.7927199191102123,
"grad_norm": 2.011186122894287,
"learning_rate": 1.13468271937978e-06,
"loss": 0.3568735718727112,
"memory(GiB)": 74.62,
"step": 980,
"token_acc": 0.9057971014492754,
"train_speed(iter/s)": 0.022566
},
{
"epoch": 0.7935288169868554,
"grad_norm": 2.8331449031829834,
"learning_rate": 1.1262165353023474e-06,
"loss": 0.3684077560901642,
"memory(GiB)": 74.62,
"step": 981,
"token_acc": 0.8900343642611683,
"train_speed(iter/s)": 0.022567
},
{
"epoch": 0.7943377148634985,
"grad_norm": 2.3888087272644043,
"learning_rate": 1.1177780435291641e-06,
"loss": 0.3318890929222107,
"memory(GiB)": 74.62,
"step": 982,
"token_acc": 0.8755760368663594,
"train_speed(iter/s)": 0.022567
},
{
"epoch": 0.7951466127401415,
"grad_norm": 1.8067930936813354,
"learning_rate": 1.1093673043840182e-06,
"loss": 0.32926511764526367,
"memory(GiB)": 74.62,
"step": 983,
"token_acc": 0.8690095846645367,
"train_speed(iter/s)": 0.022567
},
{
"epoch": 0.7959555106167846,
"grad_norm": 2.238401412963867,
"learning_rate": 1.100984377992298e-06,
"loss": 0.39484211802482605,
"memory(GiB)": 74.62,
"step": 984,
"token_acc": 0.8741496598639455,
"train_speed(iter/s)": 0.022567
},
{
"epoch": 0.7967644084934277,
"grad_norm": 2.389265298843384,
"learning_rate": 1.0926293242805735e-06,
"loss": 0.45714324712753296,
"memory(GiB)": 74.62,
"step": 985,
"token_acc": 0.8478260869565217,
"train_speed(iter/s)": 0.022568
},
{
"epoch": 0.7975733063700707,
"grad_norm": 2.173175811767578,
"learning_rate": 1.0843022029761596e-06,
"loss": 0.37196797132492065,
"memory(GiB)": 74.62,
"step": 986,
"token_acc": 0.8589211618257261,
"train_speed(iter/s)": 0.022568
},
{
"epoch": 0.7983822042467138,
"grad_norm": 2.0259294509887695,
"learning_rate": 1.0760030736066952e-06,
"loss": 0.44243717193603516,
"memory(GiB)": 74.62,
"step": 987,
"token_acc": 0.8244274809160306,
"train_speed(iter/s)": 0.022568
},
{
"epoch": 0.799191102123357,
"grad_norm": 2.151653528213501,
"learning_rate": 1.0677319954997129e-06,
"loss": 0.39491477608680725,
"memory(GiB)": 74.62,
"step": 988,
"token_acc": 0.9015544041450777,
"train_speed(iter/s)": 0.022568
},
{
"epoch": 0.8,
"grad_norm": 2.1169228553771973,
"learning_rate": 1.0594890277822151e-06,
"loss": 0.3383401334285736,
"memory(GiB)": 74.62,
"step": 989,
"token_acc": 0.8840125391849529,
"train_speed(iter/s)": 0.022569
},
{
"epoch": 0.8008088978766431,
"grad_norm": 2.4547696113586426,
"learning_rate": 1.0512742293802558e-06,
"loss": 0.38963425159454346,
"memory(GiB)": 74.62,
"step": 990,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 0.022569
},
{
"epoch": 0.8016177957532862,
"grad_norm": 1.8448153734207153,
"learning_rate": 1.0430876590185162e-06,
"loss": 0.36352628469467163,
"memory(GiB)": 74.62,
"step": 991,
"token_acc": 0.92,
"train_speed(iter/s)": 0.022569
},
{
"epoch": 0.8024266936299292,
"grad_norm": 1.883742094039917,
"learning_rate": 1.0349293752198842e-06,
"loss": 0.37957262992858887,
"memory(GiB)": 74.62,
"step": 992,
"token_acc": 0.887240356083086,
"train_speed(iter/s)": 0.022569
},
{
"epoch": 0.8032355915065723,
"grad_norm": 2.0374629497528076,
"learning_rate": 1.0267994363050387e-06,
"loss": 0.3739085793495178,
"memory(GiB)": 74.62,
"step": 993,
"token_acc": 0.834061135371179,
"train_speed(iter/s)": 0.02257
},
{
"epoch": 0.8040444893832154,
"grad_norm": 2.806663751602173,
"learning_rate": 1.0186979003920273e-06,
"loss": 0.31939688324928284,
"memory(GiB)": 74.62,
"step": 994,
"token_acc": 0.8784313725490196,
"train_speed(iter/s)": 0.02257
},
{
"epoch": 0.8048533872598584,
"grad_norm": 2.3647608757019043,
"learning_rate": 1.0106248253958607e-06,
"loss": 0.37592533230781555,
"memory(GiB)": 74.62,
"step": 995,
"token_acc": 0.900355871886121,
"train_speed(iter/s)": 0.02257
},
{
"epoch": 0.8056622851365015,
"grad_norm": 2.112464427947998,
"learning_rate": 1.0025802690280851e-06,
"loss": 0.3363335132598877,
"memory(GiB)": 74.62,
"step": 996,
"token_acc": 0.9078014184397163,
"train_speed(iter/s)": 0.02257
},
{
"epoch": 0.8064711830131446,
"grad_norm": 2.177457809448242,
"learning_rate": 9.945642887963842e-07,
"loss": 0.38282421231269836,
"memory(GiB)": 74.62,
"step": 997,
"token_acc": 0.8685446009389671,
"train_speed(iter/s)": 0.022571
},
{
"epoch": 0.8072800808897876,
"grad_norm": 2.463026523590088,
"learning_rate": 9.86576942004156e-07,
"loss": 0.3649854063987732,
"memory(GiB)": 74.62,
"step": 998,
"token_acc": 0.8651685393258427,
"train_speed(iter/s)": 0.022571
},
{
"epoch": 0.8080889787664307,
"grad_norm": 2.1493732929229736,
"learning_rate": 9.78618285750112e-07,
"loss": 0.4093163013458252,
"memory(GiB)": 74.62,
"step": 999,
"token_acc": 0.8106060606060606,
"train_speed(iter/s)": 0.022571
},
{
"epoch": 0.8088978766430738,
"grad_norm": 1.8683381080627441,
"learning_rate": 9.70688376927864e-07,
"loss": 0.3501003682613373,
"memory(GiB)": 74.62,
"step": 1000,
"token_acc": 0.8641975308641975,
"train_speed(iter/s)": 0.022571
},
{
"epoch": 0.8088978766430738,
"eval_loss": 0.36755362153053284,
"eval_runtime": 428.8026,
"eval_samples_per_second": 3.727,
"eval_steps_per_second": 0.117,
"eval_token_acc": 0.8743528175883545,
"step": 1000
},
{
"epoch": 0.8097067745197168,
"grad_norm": 6.642233371734619,
"learning_rate": 9.627872722255154e-07,
"loss": 0.3149925470352173,
"memory(GiB)": 74.62,
"step": 1001,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022354
},
{
"epoch": 0.8105156723963599,
"grad_norm": 2.2048041820526123,
"learning_rate": 9.549150281252633e-07,
"loss": 0.4250641167163849,
"memory(GiB)": 74.62,
"step": 1002,
"token_acc": 0.8754448398576512,
"train_speed(iter/s)": 0.022354
},
{
"epoch": 0.8113245702730031,
"grad_norm": 1.8660839796066284,
"learning_rate": 9.470717009029889e-07,
"loss": 0.32009008526802063,
"memory(GiB)": 74.62,
"step": 1003,
"token_acc": 0.89,
"train_speed(iter/s)": 0.022354
},
{
"epoch": 0.8121334681496462,
"grad_norm": 1.9054193496704102,
"learning_rate": 9.39257346627857e-07,
"loss": 0.3357139825820923,
"memory(GiB)": 74.62,
"step": 1004,
"token_acc": 0.9012875536480687,
"train_speed(iter/s)": 0.022355
},
{
"epoch": 0.8129423660262892,
"grad_norm": 3.9081051349639893,
"learning_rate": 9.314720211619166e-07,
"loss": 0.38648778200149536,
"memory(GiB)": 74.62,
"step": 1005,
"token_acc": 0.9029850746268657,
"train_speed(iter/s)": 0.022355
},
{
"epoch": 0.8137512639029323,
"grad_norm": 2.1040167808532715,
"learning_rate": 9.237157801596958e-07,
"loss": 0.3301439881324768,
"memory(GiB)": 74.62,
"step": 1006,
"token_acc": 0.8925925925925926,
"train_speed(iter/s)": 0.022356
},
{
"epoch": 0.8145601617795754,
"grad_norm": 1.6679681539535522,
"learning_rate": 9.159886790678124e-07,
"loss": 0.37634193897247314,
"memory(GiB)": 74.62,
"step": 1007,
"token_acc": 0.8092485549132948,
"train_speed(iter/s)": 0.022356
},
{
"epoch": 0.8153690596562184,
"grad_norm": 2.3380022048950195,
"learning_rate": 9.082907731245733e-07,
"loss": 0.4119229316711426,
"memory(GiB)": 74.62,
"step": 1008,
"token_acc": 0.8262548262548263,
"train_speed(iter/s)": 0.022357
},
{
"epoch": 0.8161779575328615,
"grad_norm": 1.9643757343292236,
"learning_rate": 9.006221173595741e-07,
"loss": 0.355658620595932,
"memory(GiB)": 74.62,
"step": 1009,
"token_acc": 0.926530612244898,
"train_speed(iter/s)": 0.022357
},
{
"epoch": 0.8169868554095046,
"grad_norm": 1.6694306135177612,
"learning_rate": 8.929827665933211e-07,
"loss": 0.3310469388961792,
"memory(GiB)": 74.62,
"step": 1010,
"token_acc": 0.9036697247706422,
"train_speed(iter/s)": 0.022358
},
{
"epoch": 0.8177957532861476,
"grad_norm": 3.9332058429718018,
"learning_rate": 8.853727754368191e-07,
"loss": 0.3335992693901062,
"memory(GiB)": 74.62,
"step": 1011,
"token_acc": 0.9045226130653267,
"train_speed(iter/s)": 0.022358
},
{
"epoch": 0.8186046511627907,
"grad_norm": 2.0935213565826416,
"learning_rate": 8.777921982911996e-07,
"loss": 0.3944769501686096,
"memory(GiB)": 74.62,
"step": 1012,
"token_acc": 0.8847583643122676,
"train_speed(iter/s)": 0.022359
},
{
"epoch": 0.8194135490394338,
"grad_norm": 1.8062115907669067,
"learning_rate": 8.702410893473173e-07,
"loss": 0.3291887938976288,
"memory(GiB)": 74.62,
"step": 1013,
"token_acc": 0.8565217391304348,
"train_speed(iter/s)": 0.022359
},
{
"epoch": 0.8202224469160768,
"grad_norm": 2.1609699726104736,
"learning_rate": 8.627195025853735e-07,
"loss": 0.2895755469799042,
"memory(GiB)": 74.62,
"step": 1014,
"token_acc": 0.9145299145299145,
"train_speed(iter/s)": 0.02236
},
{
"epoch": 0.8210313447927199,
"grad_norm": 2.0408060550689697,
"learning_rate": 8.552274917745246e-07,
"loss": 0.3750014901161194,
"memory(GiB)": 74.62,
"step": 1015,
"token_acc": 0.8930817610062893,
"train_speed(iter/s)": 0.02236
},
{
"epoch": 0.821840242669363,
"grad_norm": 2.2596545219421387,
"learning_rate": 8.477651104724994e-07,
"loss": 0.3800932466983795,
"memory(GiB)": 74.62,
"step": 1016,
"token_acc": 0.8439306358381503,
"train_speed(iter/s)": 0.022361
},
{
"epoch": 0.822649140546006,
"grad_norm": 1.547613263130188,
"learning_rate": 8.40332412025216e-07,
"loss": 0.3251078128814697,
"memory(GiB)": 74.62,
"step": 1017,
"token_acc": 0.9111969111969112,
"train_speed(iter/s)": 0.022361
},
{
"epoch": 0.8234580384226491,
"grad_norm": 2.1521153450012207,
"learning_rate": 8.329294495663981e-07,
"loss": 0.38296395540237427,
"memory(GiB)": 74.62,
"step": 1018,
"token_acc": 0.8782894736842105,
"train_speed(iter/s)": 0.022361
},
{
"epoch": 0.8242669362992923,
"grad_norm": 2.0719387531280518,
"learning_rate": 8.255562760172004e-07,
"loss": 0.3523367643356323,
"memory(GiB)": 74.62,
"step": 1019,
"token_acc": 0.8269896193771626,
"train_speed(iter/s)": 0.022362
},
{
"epoch": 0.8250758341759353,
"grad_norm": 2.2503058910369873,
"learning_rate": 8.18212944085826e-07,
"loss": 0.37082982063293457,
"memory(GiB)": 74.62,
"step": 1020,
"token_acc": 0.8984126984126984,
"train_speed(iter/s)": 0.022362
},
{
"epoch": 0.8258847320525784,
"grad_norm": 2.572887659072876,
"learning_rate": 8.108995062671482e-07,
"loss": 0.44092637300491333,
"memory(GiB)": 74.62,
"step": 1021,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022363
},
{
"epoch": 0.8266936299292215,
"grad_norm": 2.121467113494873,
"learning_rate": 8.036160148423449e-07,
"loss": 0.3986678123474121,
"memory(GiB)": 74.62,
"step": 1022,
"token_acc": 0.9176470588235294,
"train_speed(iter/s)": 0.022363
},
{
"epoch": 0.8275025278058645,
"grad_norm": 1.8472903966903687,
"learning_rate": 7.963625218785099e-07,
"loss": 0.35733747482299805,
"memory(GiB)": 74.62,
"step": 1023,
"token_acc": 0.9130434782608695,
"train_speed(iter/s)": 0.022364
},
{
"epoch": 0.8283114256825076,
"grad_norm": 1.9388898611068726,
"learning_rate": 7.891390792282927e-07,
"loss": 0.2967044711112976,
"memory(GiB)": 74.62,
"step": 1024,
"token_acc": 0.9016393442622951,
"train_speed(iter/s)": 0.022364
},
{
"epoch": 0.8291203235591507,
"grad_norm": 1.7470604181289673,
"learning_rate": 7.819457385295254e-07,
"loss": 0.31166231632232666,
"memory(GiB)": 74.62,
"step": 1025,
"token_acc": 0.9090909090909091,
"train_speed(iter/s)": 0.022364
},
{
"epoch": 0.8299292214357937,
"grad_norm": 2.2927539348602295,
"learning_rate": 7.747825512048462e-07,
"loss": 0.3713032007217407,
"memory(GiB)": 74.62,
"step": 1026,
"token_acc": 0.8540540540540541,
"train_speed(iter/s)": 0.022365
},
{
"epoch": 0.8307381193124368,
"grad_norm": 2.0093610286712646,
"learning_rate": 7.676495684613433e-07,
"loss": 0.3384319543838501,
"memory(GiB)": 74.62,
"step": 1027,
"token_acc": 0.9325842696629213,
"train_speed(iter/s)": 0.022365
},
{
"epoch": 0.8315470171890799,
"grad_norm": 1.9166637659072876,
"learning_rate": 7.605468412901801e-07,
"loss": 0.4422561824321747,
"memory(GiB)": 74.62,
"step": 1028,
"token_acc": 0.8424657534246576,
"train_speed(iter/s)": 0.022366
},
{
"epoch": 0.8323559150657229,
"grad_norm": 2.4499292373657227,
"learning_rate": 7.534744204662348e-07,
"loss": 0.42556819319725037,
"memory(GiB)": 74.62,
"step": 1029,
"token_acc": 0.8181818181818182,
"train_speed(iter/s)": 0.022366
},
{
"epoch": 0.833164812942366,
"grad_norm": 2.4436521530151367,
"learning_rate": 7.464323565477372e-07,
"loss": 0.46478235721588135,
"memory(GiB)": 74.62,
"step": 1030,
"token_acc": 0.8811881188118812,
"train_speed(iter/s)": 0.022367
},
{
"epoch": 0.8339737108190091,
"grad_norm": 1.8678390979766846,
"learning_rate": 7.394206998759013e-07,
"loss": 0.34241783618927,
"memory(GiB)": 74.62,
"step": 1031,
"token_acc": 0.8908450704225352,
"train_speed(iter/s)": 0.022367
},
{
"epoch": 0.8347826086956521,
"grad_norm": 2.002629041671753,
"learning_rate": 7.324395005745772e-07,
"loss": 0.3532907962799072,
"memory(GiB)": 74.62,
"step": 1032,
"token_acc": 0.8291814946619217,
"train_speed(iter/s)": 0.022368
},
{
"epoch": 0.8355915065722952,
"grad_norm": 2.4754257202148438,
"learning_rate": 7.254888085498812e-07,
"loss": 0.39124253392219543,
"memory(GiB)": 74.62,
"step": 1033,
"token_acc": 0.8664122137404581,
"train_speed(iter/s)": 0.022368
},
{
"epoch": 0.8364004044489384,
"grad_norm": 2.009551763534546,
"learning_rate": 7.185686734898478e-07,
"loss": 0.3519361913204193,
"memory(GiB)": 74.62,
"step": 1034,
"token_acc": 0.8725490196078431,
"train_speed(iter/s)": 0.022368
},
{
"epoch": 0.8372093023255814,
"grad_norm": 2.077303886413574,
"learning_rate": 7.116791448640664e-07,
"loss": 0.3848615884780884,
"memory(GiB)": 74.62,
"step": 1035,
"token_acc": 0.8981481481481481,
"train_speed(iter/s)": 0.022369
},
{
"epoch": 0.8380182002022245,
"grad_norm": 1.8623238801956177,
"learning_rate": 7.048202719233344e-07,
"loss": 0.3747529983520508,
"memory(GiB)": 74.62,
"step": 1036,
"token_acc": 0.9141914191419142,
"train_speed(iter/s)": 0.022369
},
{
"epoch": 0.8388270980788676,
"grad_norm": 1.8672590255737305,
"learning_rate": 6.979921036993042e-07,
"loss": 0.3627777099609375,
"memory(GiB)": 74.62,
"step": 1037,
"token_acc": 0.8225352112676056,
"train_speed(iter/s)": 0.02237
},
{
"epoch": 0.8396359959555106,
"grad_norm": 2.0797042846679688,
"learning_rate": 6.911946890041254e-07,
"loss": 0.4054332375526428,
"memory(GiB)": 74.62,
"step": 1038,
"token_acc": 0.8860759493670886,
"train_speed(iter/s)": 0.02237
},
{
"epoch": 0.8404448938321537,
"grad_norm": 2.2241296768188477,
"learning_rate": 6.844280764301075e-07,
"loss": 0.33668115735054016,
"memory(GiB)": 74.62,
"step": 1039,
"token_acc": 0.9083665338645418,
"train_speed(iter/s)": 0.022371
},
{
"epoch": 0.8412537917087968,
"grad_norm": 1.7550405263900757,
"learning_rate": 6.776923143493636e-07,
"loss": 0.3522379696369171,
"memory(GiB)": 74.62,
"step": 1040,
"token_acc": 0.8508771929824561,
"train_speed(iter/s)": 0.022371
},
{
"epoch": 0.8420626895854398,
"grad_norm": 1.8860352039337158,
"learning_rate": 6.709874509134684e-07,
"loss": 0.4433209300041199,
"memory(GiB)": 74.62,
"step": 1041,
"token_acc": 0.8392282958199357,
"train_speed(iter/s)": 0.022371
},
{
"epoch": 0.8428715874620829,
"grad_norm": 2.263840913772583,
"learning_rate": 6.643135340531137e-07,
"loss": 0.3951689302921295,
"memory(GiB)": 74.62,
"step": 1042,
"token_acc": 0.8928571428571429,
"train_speed(iter/s)": 0.022372
},
{
"epoch": 0.843680485338726,
"grad_norm": 2.3143765926361084,
"learning_rate": 6.576706114777626e-07,
"loss": 0.39435216784477234,
"memory(GiB)": 74.62,
"step": 1043,
"token_acc": 0.8184523809523809,
"train_speed(iter/s)": 0.022372
},
{
"epoch": 0.844489383215369,
"grad_norm": 2.4204423427581787,
"learning_rate": 6.510587306753135e-07,
"loss": 0.38613927364349365,
"memory(GiB)": 74.62,
"step": 1044,
"token_acc": 0.8405797101449275,
"train_speed(iter/s)": 0.022373
},
{
"epoch": 0.8452982810920121,
"grad_norm": 1.9565153121948242,
"learning_rate": 6.444779389117579e-07,
"loss": 0.3638315498828888,
"memory(GiB)": 74.62,
"step": 1045,
"token_acc": 0.8671328671328671,
"train_speed(iter/s)": 0.022373
},
{
"epoch": 0.8461071789686552,
"grad_norm": 1.82338547706604,
"learning_rate": 6.379282832308414e-07,
"loss": 0.3477684557437897,
"memory(GiB)": 74.62,
"step": 1046,
"token_acc": 0.8731343283582089,
"train_speed(iter/s)": 0.022374
},
{
"epoch": 0.8469160768452982,
"grad_norm": 2.053645610809326,
"learning_rate": 6.314098104537325e-07,
"loss": 0.359966516494751,
"memory(GiB)": 74.62,
"step": 1047,
"token_acc": 0.84,
"train_speed(iter/s)": 0.022374
},
{
"epoch": 0.8477249747219413,
"grad_norm": 2.145159959793091,
"learning_rate": 6.249225671786785e-07,
"loss": 0.3331785202026367,
"memory(GiB)": 74.62,
"step": 1048,
"token_acc": 0.9153846153846154,
"train_speed(iter/s)": 0.022375
},
{
"epoch": 0.8485338725985845,
"grad_norm": 2.2616126537323,
"learning_rate": 6.184665997806832e-07,
"loss": 0.3494233191013336,
"memory(GiB)": 74.62,
"step": 1049,
"token_acc": 0.8663594470046083,
"train_speed(iter/s)": 0.022375
},
{
"epoch": 0.8493427704752275,
"grad_norm": 2.032336711883545,
"learning_rate": 6.120419544111655e-07,
"loss": 0.35964176058769226,
"memory(GiB)": 74.62,
"step": 1050,
"token_acc": 0.893687707641196,
"train_speed(iter/s)": 0.022375
},
{
"epoch": 0.8501516683518706,
"grad_norm": 1.9737732410430908,
"learning_rate": 6.056486769976388e-07,
"loss": 0.37345680594444275,
"memory(GiB)": 74.62,
"step": 1051,
"token_acc": 0.8767123287671232,
"train_speed(iter/s)": 0.022376
},
{
"epoch": 0.8509605662285137,
"grad_norm": 3.4677176475524902,
"learning_rate": 5.992868132433755e-07,
"loss": 0.3770935535430908,
"memory(GiB)": 74.62,
"step": 1052,
"token_acc": 0.8894230769230769,
"train_speed(iter/s)": 0.022376
},
{
"epoch": 0.8517694641051567,
"grad_norm": 2.0082759857177734,
"learning_rate": 5.929564086270834e-07,
"loss": 0.40682828426361084,
"memory(GiB)": 74.62,
"step": 1053,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022376
},
{
"epoch": 0.8525783619817998,
"grad_norm": 1.6112618446350098,
"learning_rate": 5.866575084025816e-07,
"loss": 0.3442041873931885,
"memory(GiB)": 74.62,
"step": 1054,
"token_acc": 0.86,
"train_speed(iter/s)": 0.022377
},
{
"epoch": 0.8533872598584429,
"grad_norm": 2.1978917121887207,
"learning_rate": 5.803901575984721e-07,
"loss": 0.37148886919021606,
"memory(GiB)": 74.62,
"step": 1055,
"token_acc": 0.9269406392694064,
"train_speed(iter/s)": 0.022377
},
{
"epoch": 0.854196157735086,
"grad_norm": 2.1480906009674072,
"learning_rate": 5.74154401017824e-07,
"loss": 0.37758809328079224,
"memory(GiB)": 74.62,
"step": 1056,
"token_acc": 0.8744939271255061,
"train_speed(iter/s)": 0.022378
},
{
"epoch": 0.855005055611729,
"grad_norm": 2.161919593811035,
"learning_rate": 5.679502832378497e-07,
"loss": 0.3692307472229004,
"memory(GiB)": 74.62,
"step": 1057,
"token_acc": 0.8982035928143712,
"train_speed(iter/s)": 0.022378
},
{
"epoch": 0.8558139534883721,
"grad_norm": 2.31783390045166,
"learning_rate": 5.61777848609587e-07,
"loss": 0.36903953552246094,
"memory(GiB)": 74.62,
"step": 1058,
"token_acc": 0.8892857142857142,
"train_speed(iter/s)": 0.022378
},
{
"epoch": 0.8566228513650151,
"grad_norm": 2.582380771636963,
"learning_rate": 5.556371412575834e-07,
"loss": 0.40472832322120667,
"memory(GiB)": 74.62,
"step": 1059,
"token_acc": 0.8706896551724138,
"train_speed(iter/s)": 0.022379
},
{
"epoch": 0.8574317492416582,
"grad_norm": 1.9625579118728638,
"learning_rate": 5.495282050795763e-07,
"loss": 0.3849819302558899,
"memory(GiB)": 74.62,
"step": 1060,
"token_acc": 0.8406374501992032,
"train_speed(iter/s)": 0.022379
},
{
"epoch": 0.8582406471183013,
"grad_norm": 2.0185904502868652,
"learning_rate": 5.434510837461854e-07,
"loss": 0.43619173765182495,
"memory(GiB)": 74.62,
"step": 1061,
"token_acc": 0.8464730290456431,
"train_speed(iter/s)": 0.02238
},
{
"epoch": 0.8590495449949443,
"grad_norm": 2.0642013549804688,
"learning_rate": 5.374058207005945e-07,
"loss": 0.37471503019332886,
"memory(GiB)": 74.62,
"step": 1062,
"token_acc": 0.9219512195121952,
"train_speed(iter/s)": 0.02238
},
{
"epoch": 0.8598584428715874,
"grad_norm": 2.187964677810669,
"learning_rate": 5.313924591582453e-07,
"loss": 0.3878336548805237,
"memory(GiB)": 74.62,
"step": 1063,
"token_acc": 0.8531073446327684,
"train_speed(iter/s)": 0.02238
},
{
"epoch": 0.8606673407482305,
"grad_norm": 3.5268666744232178,
"learning_rate": 5.254110421065301e-07,
"loss": 0.4011298716068268,
"memory(GiB)": 74.62,
"step": 1064,
"token_acc": 0.8282442748091603,
"train_speed(iter/s)": 0.022381
},
{
"epoch": 0.8614762386248737,
"grad_norm": 1.9126622676849365,
"learning_rate": 5.194616123044749e-07,
"loss": 0.3823421597480774,
"memory(GiB)": 74.62,
"step": 1065,
"token_acc": 0.8555555555555555,
"train_speed(iter/s)": 0.022381
},
{
"epoch": 0.8622851365015167,
"grad_norm": 1.9851644039154053,
"learning_rate": 5.135442122824453e-07,
"loss": 0.41584277153015137,
"memory(GiB)": 74.62,
"step": 1066,
"token_acc": 0.896,
"train_speed(iter/s)": 0.022382
},
{
"epoch": 0.8630940343781598,
"grad_norm": 2.158141613006592,
"learning_rate": 5.076588843418345e-07,
"loss": 0.3853064775466919,
"memory(GiB)": 74.62,
"step": 1067,
"token_acc": 0.8201634877384196,
"train_speed(iter/s)": 0.022382
},
{
"epoch": 0.8639029322548029,
"grad_norm": 2.003866672515869,
"learning_rate": 5.018056705547652e-07,
"loss": 0.3744017481803894,
"memory(GiB)": 74.62,
"step": 1068,
"token_acc": 0.8693693693693694,
"train_speed(iter/s)": 0.022382
},
{
"epoch": 0.8647118301314459,
"grad_norm": 3.3579702377319336,
"learning_rate": 4.959846127637874e-07,
"loss": 0.3795039653778076,
"memory(GiB)": 74.62,
"step": 1069,
"token_acc": 0.8388625592417062,
"train_speed(iter/s)": 0.022383
},
{
"epoch": 0.865520728008089,
"grad_norm": 2.1418285369873047,
"learning_rate": 4.901957525815787e-07,
"loss": 0.35196787118911743,
"memory(GiB)": 74.62,
"step": 1070,
"token_acc": 0.8385650224215246,
"train_speed(iter/s)": 0.022383
},
{
"epoch": 0.8663296258847321,
"grad_norm": 2.060997486114502,
"learning_rate": 4.844391313906482e-07,
"loss": 0.3312758207321167,
"memory(GiB)": 74.62,
"step": 1071,
"token_acc": 0.8912280701754386,
"train_speed(iter/s)": 0.022384
},
{
"epoch": 0.8671385237613751,
"grad_norm": 2.250108242034912,
"learning_rate": 4.787147903430383e-07,
"loss": 0.4016328752040863,
"memory(GiB)": 74.62,
"step": 1072,
"token_acc": 0.8404669260700389,
"train_speed(iter/s)": 0.022384
},
{
"epoch": 0.8679474216380182,
"grad_norm": 1.5963561534881592,
"learning_rate": 4.730227703600354e-07,
"loss": 0.3070691227912903,
"memory(GiB)": 74.62,
"step": 1073,
"token_acc": 0.8928571428571429,
"train_speed(iter/s)": 0.022384
},
{
"epoch": 0.8687563195146613,
"grad_norm": 2.321164846420288,
"learning_rate": 4.6736311213186724e-07,
"loss": 0.32245370745658875,
"memory(GiB)": 74.62,
"step": 1074,
"token_acc": 0.8725868725868726,
"train_speed(iter/s)": 0.022385
},
{
"epoch": 0.8695652173913043,
"grad_norm": 1.9174984693527222,
"learning_rate": 4.617358561174279e-07,
"loss": 0.32412296533584595,
"memory(GiB)": 74.62,
"step": 1075,
"token_acc": 0.9473684210526315,
"train_speed(iter/s)": 0.022385
},
{
"epoch": 0.8703741152679474,
"grad_norm": 1.674944281578064,
"learning_rate": 4.561410425439744e-07,
"loss": 0.299832284450531,
"memory(GiB)": 74.62,
"step": 1076,
"token_acc": 0.9108527131782945,
"train_speed(iter/s)": 0.022386
},
{
"epoch": 0.8711830131445905,
"grad_norm": 1.9611433744430542,
"learning_rate": 4.505787114068433e-07,
"loss": 0.3502030670642853,
"memory(GiB)": 74.62,
"step": 1077,
"token_acc": 0.8602941176470589,
"train_speed(iter/s)": 0.022386
},
{
"epoch": 0.8719919110212335,
"grad_norm": 2.2846431732177734,
"learning_rate": 4.45048902469169e-07,
"loss": 0.39019766449928284,
"memory(GiB)": 74.62,
"step": 1078,
"token_acc": 0.8958333333333334,
"train_speed(iter/s)": 0.022386
},
{
"epoch": 0.8728008088978766,
"grad_norm": 2.490588903427124,
"learning_rate": 4.3955165526159306e-07,
"loss": 0.37344303727149963,
"memory(GiB)": 74.62,
"step": 1079,
"token_acc": 0.9163179916317992,
"train_speed(iter/s)": 0.022387
},
{
"epoch": 0.8736097067745198,
"grad_norm": 5.213693141937256,
"learning_rate": 4.3408700908198654e-07,
"loss": 0.3260120153427124,
"memory(GiB)": 74.62,
"step": 1080,
"token_acc": 0.8585858585858586,
"train_speed(iter/s)": 0.022387
},
{
"epoch": 0.8744186046511628,
"grad_norm": 2.62857723236084,
"learning_rate": 4.2865500299516747e-07,
"loss": 0.36192968487739563,
"memory(GiB)": 74.62,
"step": 1081,
"token_acc": 0.8915094339622641,
"train_speed(iter/s)": 0.022387
},
{
"epoch": 0.8752275025278059,
"grad_norm": 2.0130198001861572,
"learning_rate": 4.232556758326212e-07,
"loss": 0.35925909876823425,
"memory(GiB)": 74.62,
"step": 1082,
"token_acc": 0.8312236286919831,
"train_speed(iter/s)": 0.022388
},
{
"epoch": 0.876036400404449,
"grad_norm": 1.795419454574585,
"learning_rate": 4.178890661922241e-07,
"loss": 0.34093332290649414,
"memory(GiB)": 74.62,
"step": 1083,
"token_acc": 0.8543046357615894,
"train_speed(iter/s)": 0.022388
},
{
"epoch": 0.876845298281092,
"grad_norm": 2.5592668056488037,
"learning_rate": 4.125552124379628e-07,
"loss": 0.412899911403656,
"memory(GiB)": 74.62,
"step": 1084,
"token_acc": 0.85,
"train_speed(iter/s)": 0.022389
},
{
"epoch": 0.8776541961577351,
"grad_norm": 1.8965997695922852,
"learning_rate": 4.072541526996682e-07,
"loss": 0.3767935633659363,
"memory(GiB)": 74.62,
"step": 1085,
"token_acc": 0.8523676880222841,
"train_speed(iter/s)": 0.022389
},
{
"epoch": 0.8784630940343782,
"grad_norm": 2.412139415740967,
"learning_rate": 4.0198592487273426e-07,
"loss": 0.3973158597946167,
"memory(GiB)": 74.62,
"step": 1086,
"token_acc": 0.8678414096916299,
"train_speed(iter/s)": 0.022389
},
{
"epoch": 0.8792719919110212,
"grad_norm": 1.8268601894378662,
"learning_rate": 3.9675056661785563e-07,
"loss": 0.35584717988967896,
"memory(GiB)": 74.62,
"step": 1087,
"token_acc": 0.8561643835616438,
"train_speed(iter/s)": 0.02239
},
{
"epoch": 0.8800808897876643,
"grad_norm": 2.1522209644317627,
"learning_rate": 3.915481153607525e-07,
"loss": 0.37817463278770447,
"memory(GiB)": 74.62,
"step": 1088,
"token_acc": 0.8680851063829788,
"train_speed(iter/s)": 0.02239
},
{
"epoch": 0.8808897876643074,
"grad_norm": 1.805523157119751,
"learning_rate": 3.863786082919019e-07,
"loss": 0.33031123876571655,
"memory(GiB)": 74.62,
"step": 1089,
"token_acc": 0.9203539823008849,
"train_speed(iter/s)": 0.02239
},
{
"epoch": 0.8816986855409504,
"grad_norm": 1.8276246786117554,
"learning_rate": 3.8124208236627825e-07,
"loss": 0.32658106088638306,
"memory(GiB)": 74.62,
"step": 1090,
"token_acc": 0.900990099009901,
"train_speed(iter/s)": 0.022391
},
{
"epoch": 0.8825075834175935,
"grad_norm": 2.1186046600341797,
"learning_rate": 3.761385743030821e-07,
"loss": 0.3983362019062042,
"memory(GiB)": 74.62,
"step": 1091,
"token_acc": 0.9166666666666666,
"train_speed(iter/s)": 0.022391
},
{
"epoch": 0.8833164812942366,
"grad_norm": 2.2194223403930664,
"learning_rate": 3.710681205854838e-07,
"loss": 0.34843602776527405,
"memory(GiB)": 74.62,
"step": 1092,
"token_acc": 0.8144329896907216,
"train_speed(iter/s)": 0.022391
},
{
"epoch": 0.8841253791708796,
"grad_norm": 1.7586379051208496,
"learning_rate": 3.6603075746035886e-07,
"loss": 0.3717504143714905,
"memory(GiB)": 74.62,
"step": 1093,
"token_acc": 0.9043824701195219,
"train_speed(iter/s)": 0.022392
},
{
"epoch": 0.8849342770475227,
"grad_norm": 1.730454921722412,
"learning_rate": 3.6102652093802983e-07,
"loss": 0.33724552392959595,
"memory(GiB)": 74.62,
"step": 1094,
"token_acc": 0.8942857142857142,
"train_speed(iter/s)": 0.022392
},
{
"epoch": 0.8857431749241659,
"grad_norm": 1.7972487211227417,
"learning_rate": 3.5605544679200966e-07,
"loss": 0.40413105487823486,
"memory(GiB)": 74.62,
"step": 1095,
"token_acc": 0.8922413793103449,
"train_speed(iter/s)": 0.022393
},
{
"epoch": 0.8865520728008089,
"grad_norm": 2.4188039302825928,
"learning_rate": 3.511175705587433e-07,
"loss": 0.4261808693408966,
"memory(GiB)": 74.62,
"step": 1096,
"token_acc": 0.8990384615384616,
"train_speed(iter/s)": 0.022393
},
{
"epoch": 0.887360970677452,
"grad_norm": 2.6165802478790283,
"learning_rate": 3.462129275373577e-07,
"loss": 0.3905704617500305,
"memory(GiB)": 74.62,
"step": 1097,
"token_acc": 0.9346153846153846,
"train_speed(iter/s)": 0.022394
},
{
"epoch": 0.8881698685540951,
"grad_norm": 1.8218803405761719,
"learning_rate": 3.4134155278940594e-07,
"loss": 0.42883560061454773,
"memory(GiB)": 74.62,
"step": 1098,
"token_acc": 0.8745247148288974,
"train_speed(iter/s)": 0.022394
},
{
"epoch": 0.8889787664307381,
"grad_norm": 1.979760766029358,
"learning_rate": 3.3650348113861864e-07,
"loss": 0.36739590764045715,
"memory(GiB)": 74.62,
"step": 1099,
"token_acc": 0.8987341772151899,
"train_speed(iter/s)": 0.022394
},
{
"epoch": 0.8897876643073812,
"grad_norm": 2.169462203979492,
"learning_rate": 3.3169874717065564e-07,
"loss": 0.43099868297576904,
"memory(GiB)": 74.62,
"step": 1100,
"token_acc": 0.8531746031746031,
"train_speed(iter/s)": 0.022395
},
{
"epoch": 0.8905965621840243,
"grad_norm": 3.057952642440796,
"learning_rate": 3.269273852328547e-07,
"loss": 0.3875833749771118,
"memory(GiB)": 74.62,
"step": 1101,
"token_acc": 0.9,
"train_speed(iter/s)": 0.022395
},
{
"epoch": 0.8914054600606673,
"grad_norm": 1.8207221031188965,
"learning_rate": 3.2218942943399114e-07,
"loss": 0.3375704884529114,
"memory(GiB)": 74.62,
"step": 1102,
"token_acc": 0.8617511520737328,
"train_speed(iter/s)": 0.022396
},
{
"epoch": 0.8922143579373104,
"grad_norm": 2.1824142932891846,
"learning_rate": 3.174849136440294e-07,
"loss": 0.36066344380378723,
"memory(GiB)": 74.62,
"step": 1103,
"token_acc": 0.8494208494208494,
"train_speed(iter/s)": 0.022396
},
{
"epoch": 0.8930232558139535,
"grad_norm": 2.046804428100586,
"learning_rate": 3.1281387149388556e-07,
"loss": 0.39939042925834656,
"memory(GiB)": 74.62,
"step": 1104,
"token_acc": 0.8765432098765432,
"train_speed(iter/s)": 0.022396
},
{
"epoch": 0.8938321536905965,
"grad_norm": 2.1102182865142822,
"learning_rate": 3.081763363751844e-07,
"loss": 0.35777053236961365,
"memory(GiB)": 74.62,
"step": 1105,
"token_acc": 0.8803827751196173,
"train_speed(iter/s)": 0.022397
},
{
"epoch": 0.8946410515672396,
"grad_norm": 1.6538591384887695,
"learning_rate": 3.0357234144001766e-07,
"loss": 0.32706207036972046,
"memory(GiB)": 74.62,
"step": 1106,
"token_acc": 0.8989169675090253,
"train_speed(iter/s)": 0.022397
},
{
"epoch": 0.8954499494438827,
"grad_norm": 2.0191094875335693,
"learning_rate": 2.9900191960071544e-07,
"loss": 0.3731483817100525,
"memory(GiB)": 74.62,
"step": 1107,
"token_acc": 0.8875,
"train_speed(iter/s)": 0.022397
},
{
"epoch": 0.8962588473205257,
"grad_norm": 1.9920696020126343,
"learning_rate": 2.9446510352959924e-07,
"loss": 0.3792566657066345,
"memory(GiB)": 74.62,
"step": 1108,
"token_acc": 0.8431372549019608,
"train_speed(iter/s)": 0.022398
},
{
"epoch": 0.8970677451971688,
"grad_norm": 4.2869157791137695,
"learning_rate": 2.899619256587605e-07,
"loss": 0.4134003520011902,
"memory(GiB)": 74.62,
"step": 1109,
"token_acc": 0.8088888888888889,
"train_speed(iter/s)": 0.022398
},
{
"epoch": 0.897876643073812,
"grad_norm": 1.730612874031067,
"learning_rate": 2.854924181798202e-07,
"loss": 0.3089058995246887,
"memory(GiB)": 74.62,
"step": 1110,
"token_acc": 0.8550185873605948,
"train_speed(iter/s)": 0.022398
},
{
"epoch": 0.898685540950455,
"grad_norm": 2.020568370819092,
"learning_rate": 2.8105661304370256e-07,
"loss": 0.33643391728401184,
"memory(GiB)": 74.62,
"step": 1111,
"token_acc": 0.8732876712328768,
"train_speed(iter/s)": 0.022399
},
{
"epoch": 0.8994944388270981,
"grad_norm": 2.182412624359131,
"learning_rate": 2.7665454196040665e-07,
"loss": 0.39632314443588257,
"memory(GiB)": 74.62,
"step": 1112,
"token_acc": 0.8884297520661157,
"train_speed(iter/s)": 0.022399
},
{
"epoch": 0.9003033367037412,
"grad_norm": 2.2093279361724854,
"learning_rate": 2.722862363987749e-07,
"loss": 0.43140286207199097,
"memory(GiB)": 74.62,
"step": 1113,
"token_acc": 0.8701298701298701,
"train_speed(iter/s)": 0.022399
},
{
"epoch": 0.9011122345803843,
"grad_norm": 4.873557090759277,
"learning_rate": 2.6795172758627584e-07,
"loss": 0.40689289569854736,
"memory(GiB)": 74.62,
"step": 1114,
"token_acc": 0.880184331797235,
"train_speed(iter/s)": 0.0224
},
{
"epoch": 0.9019211324570273,
"grad_norm": 2.0055012702941895,
"learning_rate": 2.6365104650877716e-07,
"loss": 0.3976328372955322,
"memory(GiB)": 74.62,
"step": 1115,
"token_acc": 0.8812260536398467,
"train_speed(iter/s)": 0.0224
},
{
"epoch": 0.9027300303336704,
"grad_norm": 1.9500057697296143,
"learning_rate": 2.593842239103206e-07,
"loss": 0.40250563621520996,
"memory(GiB)": 74.62,
"step": 1116,
"token_acc": 0.8953168044077136,
"train_speed(iter/s)": 0.022401
},
{
"epoch": 0.9035389282103135,
"grad_norm": 1.8744258880615234,
"learning_rate": 2.5515129029290984e-07,
"loss": 0.35562485456466675,
"memory(GiB)": 74.62,
"step": 1117,
"token_acc": 0.8726591760299626,
"train_speed(iter/s)": 0.022401
},
{
"epoch": 0.9043478260869565,
"grad_norm": 1.818701982498169,
"learning_rate": 2.5095227591628467e-07,
"loss": 0.32878684997558594,
"memory(GiB)": 74.62,
"step": 1118,
"token_acc": 0.8952879581151832,
"train_speed(iter/s)": 0.022401
},
{
"epoch": 0.9051567239635996,
"grad_norm": 2.0827207565307617,
"learning_rate": 2.4678721079770984e-07,
"loss": 0.4192107617855072,
"memory(GiB)": 74.62,
"step": 1119,
"token_acc": 0.8461538461538461,
"train_speed(iter/s)": 0.022402
},
{
"epoch": 0.9059656218402427,
"grad_norm": 2.060375690460205,
"learning_rate": 2.4265612471176036e-07,
"loss": 0.3454943895339966,
"memory(GiB)": 74.62,
"step": 1120,
"token_acc": 0.9144981412639405,
"train_speed(iter/s)": 0.022402
},
{
"epoch": 0.9067745197168857,
"grad_norm": 1.8084218502044678,
"learning_rate": 2.385590471901045e-07,
"loss": 0.31142184138298035,
"memory(GiB)": 74.62,
"step": 1121,
"token_acc": 0.8678571428571429,
"train_speed(iter/s)": 0.022402
},
{
"epoch": 0.9075834175935288,
"grad_norm": 2.012327194213867,
"learning_rate": 2.3449600752129598e-07,
"loss": 0.3716868460178375,
"memory(GiB)": 74.62,
"step": 1122,
"token_acc": 0.8819672131147541,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.9083923154701719,
"grad_norm": 2.0449485778808594,
"learning_rate": 2.3046703475056554e-07,
"loss": 0.3710024356842041,
"memory(GiB)": 74.62,
"step": 1123,
"token_acc": 0.8555555555555555,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.9092012133468149,
"grad_norm": 2.0092179775238037,
"learning_rate": 2.2647215767961083e-07,
"loss": 0.3403990864753723,
"memory(GiB)": 74.62,
"step": 1124,
"token_acc": 0.925,
"train_speed(iter/s)": 0.022403
},
{
"epoch": 0.910010111223458,
"grad_norm": 2.1806256771087646,
"learning_rate": 2.2251140486639068e-07,
"loss": 0.37321048974990845,
"memory(GiB)": 74.62,
"step": 1125,
"token_acc": 0.9308510638297872,
"train_speed(iter/s)": 0.022404
},
{
"epoch": 0.9108190091001012,
"grad_norm": 2.1333301067352295,
"learning_rate": 2.1858480462492283e-07,
"loss": 0.37797796726226807,
"memory(GiB)": 74.62,
"step": 1126,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022404
},
{
"epoch": 0.9116279069767442,
"grad_norm": 2.240083694458008,
"learning_rate": 2.1469238502507926e-07,
"loss": 0.3420672118663788,
"memory(GiB)": 74.62,
"step": 1127,
"token_acc": 0.889795918367347,
"train_speed(iter/s)": 0.022404
},
{
"epoch": 0.9124368048533873,
"grad_norm": 2.032658338546753,
"learning_rate": 2.1083417389238858e-07,
"loss": 0.3892640471458435,
"memory(GiB)": 74.62,
"step": 1128,
"token_acc": 0.8831168831168831,
"train_speed(iter/s)": 0.022405
},
{
"epoch": 0.9132457027300304,
"grad_norm": 2.2067453861236572,
"learning_rate": 2.0701019880783324e-07,
"loss": 0.33542943000793457,
"memory(GiB)": 74.62,
"step": 1129,
"token_acc": 0.8740458015267175,
"train_speed(iter/s)": 0.022405
},
{
"epoch": 0.9140546006066734,
"grad_norm": 2.052781343460083,
"learning_rate": 2.0322048710765485e-07,
"loss": 0.3520893454551697,
"memory(GiB)": 74.62,
"step": 1130,
"token_acc": 0.8688524590163934,
"train_speed(iter/s)": 0.022405
},
{
"epoch": 0.9148634984833165,
"grad_norm": 7.5011773109436035,
"learning_rate": 1.9946506588315818e-07,
"loss": 0.3370997905731201,
"memory(GiB)": 74.62,
"step": 1131,
"token_acc": 0.8984771573604061,
"train_speed(iter/s)": 0.022406
},
{
"epoch": 0.9156723963599596,
"grad_norm": 2.2244808673858643,
"learning_rate": 1.957439619805196e-07,
"loss": 0.3234095871448517,
"memory(GiB)": 74.62,
"step": 1132,
"token_acc": 0.8681818181818182,
"train_speed(iter/s)": 0.022406
},
{
"epoch": 0.9164812942366026,
"grad_norm": 1.946089506149292,
"learning_rate": 1.9205720200058843e-07,
"loss": 0.39126190543174744,
"memory(GiB)": 74.62,
"step": 1133,
"token_acc": 0.909433962264151,
"train_speed(iter/s)": 0.022406
},
{
"epoch": 0.9172901921132457,
"grad_norm": 11.597419738769531,
"learning_rate": 1.8840481229870644e-07,
"loss": 0.37995028495788574,
"memory(GiB)": 74.62,
"step": 1134,
"token_acc": 0.8557046979865772,
"train_speed(iter/s)": 0.022407
},
{
"epoch": 0.9180990899898888,
"grad_norm": 1.789217233657837,
"learning_rate": 1.84786818984512e-07,
"loss": 0.3505871295928955,
"memory(GiB)": 74.62,
"step": 1135,
"token_acc": 0.9249146757679181,
"train_speed(iter/s)": 0.022407
},
{
"epoch": 0.9189079878665318,
"grad_norm": 1.919080138206482,
"learning_rate": 1.8120324792175569e-07,
"loss": 0.3749197721481323,
"memory(GiB)": 74.62,
"step": 1136,
"token_acc": 0.886435331230284,
"train_speed(iter/s)": 0.022407
},
{
"epoch": 0.9197168857431749,
"grad_norm": 2.741631269454956,
"learning_rate": 1.776541247281177e-07,
"loss": 0.3757126033306122,
"memory(GiB)": 74.62,
"step": 1137,
"token_acc": 0.8934010152284264,
"train_speed(iter/s)": 0.022408
},
{
"epoch": 0.920525783619818,
"grad_norm": 1.856645107269287,
"learning_rate": 1.7413947477501913e-07,
"loss": 0.3616572320461273,
"memory(GiB)": 74.62,
"step": 1138,
"token_acc": 0.9,
"train_speed(iter/s)": 0.022408
},
{
"epoch": 0.921334681496461,
"grad_norm": 2.687711477279663,
"learning_rate": 1.7065932318744704e-07,
"loss": 0.3780667185783386,
"memory(GiB)": 74.62,
"step": 1139,
"token_acc": 0.8723404255319149,
"train_speed(iter/s)": 0.022409
},
{
"epoch": 0.9221435793731041,
"grad_norm": 1.6964043378829956,
"learning_rate": 1.6721369484377082e-07,
"loss": 0.35959312319755554,
"memory(GiB)": 74.62,
"step": 1140,
"token_acc": 0.8790849673202614,
"train_speed(iter/s)": 0.022409
},
{
"epoch": 0.9229524772497473,
"grad_norm": 2.040339469909668,
"learning_rate": 1.6380261437556666e-07,
"loss": 0.34360718727111816,
"memory(GiB)": 74.62,
"step": 1141,
"token_acc": 0.9063829787234042,
"train_speed(iter/s)": 0.022409
},
{
"epoch": 0.9237613751263903,
"grad_norm": 1.9790493249893188,
"learning_rate": 1.6042610616743782e-07,
"loss": 0.36330220103263855,
"memory(GiB)": 74.62,
"step": 1142,
"token_acc": 0.8585858585858586,
"train_speed(iter/s)": 0.02241
},
{
"epoch": 0.9245702730030334,
"grad_norm": 1.878999948501587,
"learning_rate": 1.5708419435684463e-07,
"loss": 0.3349642753601074,
"memory(GiB)": 74.62,
"step": 1143,
"token_acc": 0.8650519031141869,
"train_speed(iter/s)": 0.02241
},
{
"epoch": 0.9253791708796765,
"grad_norm": 1.9973299503326416,
"learning_rate": 1.5377690283392977e-07,
"loss": 0.3546566963195801,
"memory(GiB)": 74.62,
"step": 1144,
"token_acc": 0.8781725888324873,
"train_speed(iter/s)": 0.02241
},
{
"epoch": 0.9261880687563195,
"grad_norm": 1.9398893117904663,
"learning_rate": 1.505042552413466e-07,
"loss": 0.34872984886169434,
"memory(GiB)": 74.62,
"step": 1145,
"token_acc": 0.8742138364779874,
"train_speed(iter/s)": 0.022411
},
{
"epoch": 0.9269969666329626,
"grad_norm": 1.9519524574279785,
"learning_rate": 1.4726627497409274e-07,
"loss": 0.3644063472747803,
"memory(GiB)": 74.62,
"step": 1146,
"token_acc": 0.8945147679324894,
"train_speed(iter/s)": 0.022411
},
{
"epoch": 0.9278058645096057,
"grad_norm": 2.4077093601226807,
"learning_rate": 1.440629851793407e-07,
"loss": 0.42128363251686096,
"memory(GiB)": 74.62,
"step": 1147,
"token_acc": 0.8775510204081632,
"train_speed(iter/s)": 0.022411
},
{
"epoch": 0.9286147623862487,
"grad_norm": 2.0536437034606934,
"learning_rate": 1.408944087562736e-07,
"loss": 0.3700520396232605,
"memory(GiB)": 74.62,
"step": 1148,
"token_acc": 0.8464566929133859,
"train_speed(iter/s)": 0.022412
},
{
"epoch": 0.9294236602628918,
"grad_norm": 2.154677391052246,
"learning_rate": 1.3776056835592132e-07,
"loss": 0.3489128351211548,
"memory(GiB)": 74.62,
"step": 1149,
"token_acc": 0.8795811518324608,
"train_speed(iter/s)": 0.022412
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.8740899562835693,
"learning_rate": 1.346614863809953e-07,
"loss": 0.36078181862831116,
"memory(GiB)": 74.62,
"step": 1150,
"token_acc": 0.8864468864468864,
"train_speed(iter/s)": 0.022412
},
{
"epoch": 0.9310414560161779,
"grad_norm": 2.146127700805664,
"learning_rate": 1.315971849857356e-07,
"loss": 0.3723437190055847,
"memory(GiB)": 74.62,
"step": 1151,
"token_acc": 0.8691275167785235,
"train_speed(iter/s)": 0.022413
},
{
"epoch": 0.931850353892821,
"grad_norm": 1.787015438079834,
"learning_rate": 1.2856768607574565e-07,
"loss": 0.3393116891384125,
"memory(GiB)": 74.62,
"step": 1152,
"token_acc": 0.9015544041450777,
"train_speed(iter/s)": 0.022413
},
{
"epoch": 0.9326592517694641,
"grad_norm": 2.211394786834717,
"learning_rate": 1.255730113078385e-07,
"loss": 0.34008848667144775,
"memory(GiB)": 74.62,
"step": 1153,
"token_acc": 0.8700787401574803,
"train_speed(iter/s)": 0.022413
},
{
"epoch": 0.9334681496461071,
"grad_norm": 1.7942789793014526,
"learning_rate": 1.2261318208988294e-07,
"loss": 0.31053483486175537,
"memory(GiB)": 74.62,
"step": 1154,
"token_acc": 0.8535825545171339,
"train_speed(iter/s)": 0.022414
},
{
"epoch": 0.9342770475227502,
"grad_norm": 2.598997116088867,
"learning_rate": 1.1968821958064702e-07,
"loss": 0.4369804859161377,
"memory(GiB)": 74.62,
"step": 1155,
"token_acc": 0.8713692946058091,
"train_speed(iter/s)": 0.022414
},
{
"epoch": 0.9350859453993934,
"grad_norm": 1.7106472253799438,
"learning_rate": 1.1679814468965211e-07,
"loss": 0.3438988924026489,
"memory(GiB)": 74.62,
"step": 1156,
"token_acc": 0.8736059479553904,
"train_speed(iter/s)": 0.022414
},
{
"epoch": 0.9358948432760364,
"grad_norm": 1.8687455654144287,
"learning_rate": 1.1394297807701737e-07,
"loss": 0.3768293261528015,
"memory(GiB)": 74.62,
"step": 1157,
"token_acc": 0.9270833333333334,
"train_speed(iter/s)": 0.022415
},
{
"epoch": 0.9367037411526795,
"grad_norm": 1.5831663608551025,
"learning_rate": 1.111227401533166e-07,
"loss": 0.3412172496318817,
"memory(GiB)": 74.62,
"step": 1158,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022415
},
{
"epoch": 0.9375126390293226,
"grad_norm": 1.8993335962295532,
"learning_rate": 1.083374510794305e-07,
"loss": 0.4136160910129547,
"memory(GiB)": 74.62,
"step": 1159,
"token_acc": 0.9094488188976378,
"train_speed(iter/s)": 0.022415
},
{
"epoch": 0.9383215369059656,
"grad_norm": 3.2496023178100586,
"learning_rate": 1.0558713076640415e-07,
"loss": 0.3755384087562561,
"memory(GiB)": 74.62,
"step": 1160,
"token_acc": 0.9172932330827067,
"train_speed(iter/s)": 0.022416
},
{
"epoch": 0.9391304347826087,
"grad_norm": 2.1333253383636475,
"learning_rate": 1.028717988753014e-07,
"loss": 0.3936523199081421,
"memory(GiB)": 74.62,
"step": 1161,
"token_acc": 0.8974358974358975,
"train_speed(iter/s)": 0.022416
},
{
"epoch": 0.9399393326592518,
"grad_norm": 2.6341114044189453,
"learning_rate": 1.0019147481706626e-07,
"loss": 0.40892741084098816,
"memory(GiB)": 74.62,
"step": 1162,
"token_acc": 0.9217391304347826,
"train_speed(iter/s)": 0.022417
},
{
"epoch": 0.9407482305358948,
"grad_norm": 1.8160382509231567,
"learning_rate": 9.754617775238562e-08,
"loss": 0.36974000930786133,
"memory(GiB)": 74.62,
"step": 1163,
"token_acc": 0.8614457831325302,
"train_speed(iter/s)": 0.022417
},
{
"epoch": 0.9415571284125379,
"grad_norm": 2.1739790439605713,
"learning_rate": 9.493592659155004e-08,
"loss": 0.3862905502319336,
"memory(GiB)": 74.62,
"step": 1164,
"token_acc": 0.8700787401574803,
"train_speed(iter/s)": 0.022417
},
{
"epoch": 0.942366026289181,
"grad_norm": 2.973860502243042,
"learning_rate": 9.236073999431939e-08,
"loss": 0.4268924593925476,
"memory(GiB)": 74.62,
"step": 1165,
"token_acc": 0.8847736625514403,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.943174924165824,
"grad_norm": 2.2699947357177734,
"learning_rate": 8.98206363697901e-08,
"loss": 0.3827816843986511,
"memory(GiB)": 74.62,
"step": 1166,
"token_acc": 0.8765432098765432,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.9439838220424671,
"grad_norm": 2.014028549194336,
"learning_rate": 8.731563387626096e-08,
"loss": 0.3976903259754181,
"memory(GiB)": 74.62,
"step": 1167,
"token_acc": 0.8338658146964856,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.9447927199191102,
"grad_norm": 2.3635129928588867,
"learning_rate": 8.484575042110699e-08,
"loss": 0.3837153911590576,
"memory(GiB)": 74.62,
"step": 1168,
"token_acc": 0.8766666666666667,
"train_speed(iter/s)": 0.022418
},
{
"epoch": 0.9456016177957532,
"grad_norm": 2.5257232189178467,
"learning_rate": 8.241100366064902e-08,
"loss": 0.37266969680786133,
"memory(GiB)": 74.62,
"step": 1169,
"token_acc": 0.8828828828828829,
"train_speed(iter/s)": 0.022419
},
{
"epoch": 0.9464105156723963,
"grad_norm": 2.1283090114593506,
"learning_rate": 8.001141100002885e-08,
"loss": 0.32720375061035156,
"memory(GiB)": 74.62,
"step": 1170,
"token_acc": 0.8850174216027874,
"train_speed(iter/s)": 0.022419
},
{
"epoch": 0.9472194135490394,
"grad_norm": 2.261035919189453,
"learning_rate": 7.764698959308315e-08,
"loss": 0.38027650117874146,
"memory(GiB)": 74.62,
"step": 1171,
"token_acc": 0.8956521739130435,
"train_speed(iter/s)": 0.022419
},
{
"epoch": 0.9480283114256826,
"grad_norm": 1.921704888343811,
"learning_rate": 7.531775634222138e-08,
"loss": 0.37682783603668213,
"memory(GiB)": 74.62,
"step": 1172,
"token_acc": 0.8680851063829788,
"train_speed(iter/s)": 0.02242
},
{
"epoch": 0.9488372093023256,
"grad_norm": 2.031587600708008,
"learning_rate": 7.302372789830702e-08,
"loss": 0.3404289484024048,
"memory(GiB)": 74.62,
"step": 1173,
"token_acc": 0.8355555555555556,
"train_speed(iter/s)": 0.02242
},
{
"epoch": 0.9496461071789687,
"grad_norm": 1.8540045022964478,
"learning_rate": 7.076492066053486e-08,
"loss": 0.3675205111503601,
"memory(GiB)": 74.62,
"step": 1174,
"token_acc": 0.8758389261744967,
"train_speed(iter/s)": 0.02242
},
{
"epoch": 0.9504550050556118,
"grad_norm": 2.207390546798706,
"learning_rate": 6.854135077631774e-08,
"loss": 0.3710861802101135,
"memory(GiB)": 74.62,
"step": 1175,
"token_acc": 0.8367875647668394,
"train_speed(iter/s)": 0.022421
},
{
"epoch": 0.9512639029322548,
"grad_norm": 2.1160874366760254,
"learning_rate": 6.635303414116834e-08,
"loss": 0.375140517950058,
"memory(GiB)": 74.62,
"step": 1176,
"token_acc": 0.8616600790513834,
"train_speed(iter/s)": 0.022421
},
{
"epoch": 0.9520728008088979,
"grad_norm": 1.8097771406173706,
"learning_rate": 6.419998639858538e-08,
"loss": 0.33210816979408264,
"memory(GiB)": 74.62,
"step": 1177,
"token_acc": 0.9314079422382672,
"train_speed(iter/s)": 0.022421
},
{
"epoch": 0.952881698685541,
"grad_norm": 1.6278916597366333,
"learning_rate": 6.208222293994425e-08,
"loss": 0.3717727065086365,
"memory(GiB)": 74.62,
"step": 1178,
"token_acc": 0.8639455782312925,
"train_speed(iter/s)": 0.022422
},
{
"epoch": 0.953690596562184,
"grad_norm": 2.6115875244140625,
"learning_rate": 5.999975890438436e-08,
"loss": 0.35759437084198,
"memory(GiB)": 74.62,
"step": 1179,
"token_acc": 0.9078498293515358,
"train_speed(iter/s)": 0.022422
},
{
"epoch": 0.9544994944388271,
"grad_norm": 2.0658047199249268,
"learning_rate": 5.79526091787036e-08,
"loss": 0.37362658977508545,
"memory(GiB)": 74.62,
"step": 1180,
"token_acc": 0.8454545454545455,
"train_speed(iter/s)": 0.022422
},
{
"epoch": 0.9553083923154702,
"grad_norm": 2.23612117767334,
"learning_rate": 5.594078839724793e-08,
"loss": 0.37239736318588257,
"memory(GiB)": 74.62,
"step": 1181,
"token_acc": 0.855072463768116,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.9561172901921132,
"grad_norm": 1.8740304708480835,
"learning_rate": 5.396431094181198e-08,
"loss": 0.3480920195579529,
"memory(GiB)": 74.62,
"step": 1182,
"token_acc": 0.8709677419354839,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.9569261880687563,
"grad_norm": 2.2320539951324463,
"learning_rate": 5.202319094153252e-08,
"loss": 0.3483563959598541,
"memory(GiB)": 74.62,
"step": 1183,
"token_acc": 0.8866666666666667,
"train_speed(iter/s)": 0.022423
},
{
"epoch": 0.9577350859453994,
"grad_norm": 1.7620937824249268,
"learning_rate": 5.011744227278625e-08,
"loss": 0.33139705657958984,
"memory(GiB)": 74.62,
"step": 1184,
"token_acc": 0.9110169491525424,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.9585439838220424,
"grad_norm": 1.869081974029541,
"learning_rate": 4.824707855909605e-08,
"loss": 0.3572564125061035,
"memory(GiB)": 74.62,
"step": 1185,
"token_acc": 0.8842592592592593,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.9593528816986855,
"grad_norm": 2.5178749561309814,
"learning_rate": 4.6412113171028226e-08,
"loss": 0.39302319288253784,
"memory(GiB)": 74.62,
"step": 1186,
"token_acc": 0.9107142857142857,
"train_speed(iter/s)": 0.022424
},
{
"epoch": 0.9601617795753287,
"grad_norm": 2.3168158531188965,
"learning_rate": 4.461255922609986e-08,
"loss": 0.3867931365966797,
"memory(GiB)": 74.62,
"step": 1187,
"token_acc": 0.8819444444444444,
"train_speed(iter/s)": 0.022425
},
{
"epoch": 0.9609706774519717,
"grad_norm": 2.4859671592712402,
"learning_rate": 4.2848429588683295e-08,
"loss": 0.3992939591407776,
"memory(GiB)": 74.62,
"step": 1188,
"token_acc": 0.8392857142857143,
"train_speed(iter/s)": 0.022425
},
{
"epoch": 0.9617795753286148,
"grad_norm": 3.0036697387695312,
"learning_rate": 4.111973686991677e-08,
"loss": 0.49971675872802734,
"memory(GiB)": 74.62,
"step": 1189,
"token_acc": 0.8101694915254237,
"train_speed(iter/s)": 0.022425
},
{
"epoch": 0.9625884732052579,
"grad_norm": 2.2183077335357666,
"learning_rate": 3.9426493427611177e-08,
"loss": 0.38460367918014526,
"memory(GiB)": 74.62,
"step": 1190,
"token_acc": 0.8319327731092437,
"train_speed(iter/s)": 0.022426
},
{
"epoch": 0.9633973710819009,
"grad_norm": 2.1675848960876465,
"learning_rate": 3.776871136616289e-08,
"loss": 0.4845053553581238,
"memory(GiB)": 74.62,
"step": 1191,
"token_acc": 0.8,
"train_speed(iter/s)": 0.022426
},
{
"epoch": 0.964206268958544,
"grad_norm": 1.8861103057861328,
"learning_rate": 3.6146402536468285e-08,
"loss": 0.40070268511772156,
"memory(GiB)": 74.62,
"step": 1192,
"token_acc": 0.8436363636363636,
"train_speed(iter/s)": 0.022426
},
{
"epoch": 0.9650151668351871,
"grad_norm": 2.5333354473114014,
"learning_rate": 3.455957853583769e-08,
"loss": 0.3965553343296051,
"memory(GiB)": 74.62,
"step": 1193,
"token_acc": 0.8791208791208791,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.9658240647118301,
"grad_norm": 2.1063308715820312,
"learning_rate": 3.3008250707913246e-08,
"loss": 0.35347798466682434,
"memory(GiB)": 74.62,
"step": 1194,
"token_acc": 0.8908296943231441,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.9666329625884732,
"grad_norm": 2.082961320877075,
"learning_rate": 3.14924301425884e-08,
"loss": 0.3923337757587433,
"memory(GiB)": 74.62,
"step": 1195,
"token_acc": 0.8774834437086093,
"train_speed(iter/s)": 0.022427
},
{
"epoch": 0.9674418604651163,
"grad_norm": 1.8798726797103882,
"learning_rate": 3.0012127675925206e-08,
"loss": 0.35899072885513306,
"memory(GiB)": 74.62,
"step": 1196,
"token_acc": 0.8819444444444444,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.9682507583417593,
"grad_norm": 13.926689147949219,
"learning_rate": 2.8567353890082696e-08,
"loss": 0.3928597569465637,
"memory(GiB)": 74.62,
"step": 1197,
"token_acc": 0.8653846153846154,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.9690596562184024,
"grad_norm": 1.9069607257843018,
"learning_rate": 2.7158119113234738e-08,
"loss": 0.344777375459671,
"memory(GiB)": 74.62,
"step": 1198,
"token_acc": 0.9090909090909091,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.9698685540950455,
"grad_norm": 2.385317087173462,
"learning_rate": 2.5784433419501763e-08,
"loss": 0.35486793518066406,
"memory(GiB)": 74.62,
"step": 1199,
"token_acc": 0.8652849740932642,
"train_speed(iter/s)": 0.022428
},
{
"epoch": 0.9706774519716885,
"grad_norm": 2.183742046356201,
"learning_rate": 2.4446306628875814e-08,
"loss": 0.3595341444015503,
"memory(GiB)": 74.62,
"step": 1200,
"token_acc": 0.8879310344827587,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.9714863498483316,
"grad_norm": 2.103287935256958,
"learning_rate": 2.3143748307150605e-08,
"loss": 0.39095747470855713,
"memory(GiB)": 74.62,
"step": 1201,
"token_acc": 0.8861788617886179,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.9722952477249748,
"grad_norm": 2.1582367420196533,
"learning_rate": 2.1876767765853237e-08,
"loss": 0.3016042113304138,
"memory(GiB)": 74.62,
"step": 1202,
"token_acc": 0.8571428571428571,
"train_speed(iter/s)": 0.022429
},
{
"epoch": 0.9731041456016178,
"grad_norm": 2.0449063777923584,
"learning_rate": 2.0645374062179257e-08,
"loss": 0.36447232961654663,
"memory(GiB)": 74.62,
"step": 1203,
"token_acc": 0.8480392156862745,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.9739130434782609,
"grad_norm": 3.5183372497558594,
"learning_rate": 1.9449575998924387e-08,
"loss": 0.43112486600875854,
"memory(GiB)": 74.62,
"step": 1204,
"token_acc": 0.8607594936708861,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.974721941354904,
"grad_norm": 2.14886736869812,
"learning_rate": 1.8289382124426214e-08,
"loss": 0.38468360900878906,
"memory(GiB)": 74.62,
"step": 1205,
"token_acc": 0.8654545454545455,
"train_speed(iter/s)": 0.02243
},
{
"epoch": 0.975530839231547,
"grad_norm": 2.688023090362549,
"learning_rate": 1.7164800732498156e-08,
"loss": 0.3501737713813782,
"memory(GiB)": 74.62,
"step": 1206,
"token_acc": 0.8855421686746988,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.9763397371081901,
"grad_norm": 2.0248029232025146,
"learning_rate": 1.6075839862374487e-08,
"loss": 0.31531471014022827,
"memory(GiB)": 74.62,
"step": 1207,
"token_acc": 0.865979381443299,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.9771486349848332,
"grad_norm": 3.5692150592803955,
"learning_rate": 1.5022507298649848e-08,
"loss": 0.3675447106361389,
"memory(GiB)": 74.62,
"step": 1208,
"token_acc": 0.8636363636363636,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.9779575328614762,
"grad_norm": 1.9649704694747925,
"learning_rate": 1.400481057122538e-08,
"loss": 0.38956940174102783,
"memory(GiB)": 74.62,
"step": 1209,
"token_acc": 0.8914473684210527,
"train_speed(iter/s)": 0.022431
},
{
"epoch": 0.9787664307381193,
"grad_norm": 2.3865509033203125,
"learning_rate": 1.3022756955254901e-08,
"loss": 0.3772105574607849,
"memory(GiB)": 74.62,
"step": 1210,
"token_acc": 0.8963963963963963,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.9795753286147624,
"grad_norm": 9.275412559509277,
"learning_rate": 1.207635347108993e-08,
"loss": 0.39102572202682495,
"memory(GiB)": 74.62,
"step": 1211,
"token_acc": 0.8317757009345794,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.9803842264914054,
"grad_norm": 2.0313827991485596,
"learning_rate": 1.1165606884234182e-08,
"loss": 0.37432482838630676,
"memory(GiB)": 74.62,
"step": 1212,
"token_acc": 0.875,
"train_speed(iter/s)": 0.022432
},
{
"epoch": 0.9811931243680485,
"grad_norm": 1.960199236869812,
"learning_rate": 1.0290523705291932e-08,
"loss": 0.3433490991592407,
"memory(GiB)": 74.62,
"step": 1213,
"token_acc": 0.8885714285714286,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.9820020222446916,
"grad_norm": 1.8676866292953491,
"learning_rate": 9.451110189923063e-09,
"loss": 0.3818192183971405,
"memory(GiB)": 74.62,
"step": 1214,
"token_acc": 0.8989547038327527,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.9828109201213346,
"grad_norm": 2.4343481063842773,
"learning_rate": 8.647372338795867e-09,
"loss": 0.4184320569038391,
"memory(GiB)": 74.62,
"step": 1215,
"token_acc": 0.8434782608695652,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.9836198179979777,
"grad_norm": 2.3009696006774902,
"learning_rate": 7.8793158975482e-09,
"loss": 0.40056365728378296,
"memory(GiB)": 74.62,
"step": 1216,
"token_acc": 0.8210526315789474,
"train_speed(iter/s)": 0.022433
},
{
"epoch": 0.9844287158746208,
"grad_norm": 4.763977527618408,
"learning_rate": 7.146946356743068e-09,
"loss": 0.37496888637542725,
"memory(GiB)": 74.62,
"step": 1217,
"token_acc": 0.9244444444444444,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.985237613751264,
"grad_norm": 2.2471978664398193,
"learning_rate": 6.450268951830319e-09,
"loss": 0.3727502226829529,
"memory(GiB)": 74.62,
"step": 1218,
"token_acc": 0.819327731092437,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.986046511627907,
"grad_norm": 1.7557698488235474,
"learning_rate": 5.789288663110015e-09,
"loss": 0.32791298627853394,
"memory(GiB)": 74.62,
"step": 1219,
"token_acc": 0.8659420289855072,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.9868554095045501,
"grad_norm": 2.5717544555664062,
"learning_rate": 5.164010215695792e-09,
"loss": 0.37463176250457764,
"memory(GiB)": 74.62,
"step": 1220,
"token_acc": 0.8560885608856088,
"train_speed(iter/s)": 0.022434
},
{
"epoch": 0.9876643073811932,
"grad_norm": 3.5073463916778564,
"learning_rate": 4.574438079480992e-09,
"loss": 0.32435593008995056,
"memory(GiB)": 74.62,
"step": 1221,
"token_acc": 0.8685446009389671,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.9884732052578362,
"grad_norm": 1.9765585660934448,
"learning_rate": 4.020576469108139e-09,
"loss": 0.38409414887428284,
"memory(GiB)": 74.62,
"step": 1222,
"token_acc": 0.8888888888888888,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.9892821031344793,
"grad_norm": 1.8832907676696777,
"learning_rate": 3.502429343937297e-09,
"loss": 0.3716433644294739,
"memory(GiB)": 74.62,
"step": 1223,
"token_acc": 0.8876811594202898,
"train_speed(iter/s)": 0.022435
},
{
"epoch": 0.9900910010111224,
"grad_norm": 1.9831905364990234,
"learning_rate": 3.020000408018864e-09,
"loss": 0.3268841505050659,
"memory(GiB)": 74.62,
"step": 1224,
"token_acc": 0.9003831417624522,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.9908998988877654,
"grad_norm": 2.281235456466675,
"learning_rate": 2.573293110065822e-09,
"loss": 0.33263713121414185,
"memory(GiB)": 74.62,
"step": 1225,
"token_acc": 0.8669527896995708,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.9917087967644085,
"grad_norm": 2.3608005046844482,
"learning_rate": 2.162310643430976e-09,
"loss": 0.39835768938064575,
"memory(GiB)": 74.62,
"step": 1226,
"token_acc": 0.8962655601659751,
"train_speed(iter/s)": 0.022436
},
{
"epoch": 0.9925176946410516,
"grad_norm": 2.6654913425445557,
"learning_rate": 1.7870559460814173e-09,
"loss": 0.4261908531188965,
"memory(GiB)": 74.62,
"step": 1227,
"token_acc": 0.8935574229691877,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.9933265925176946,
"grad_norm": 1.8069103956222534,
"learning_rate": 1.447531700580207e-09,
"loss": 0.3241886496543884,
"memory(GiB)": 74.62,
"step": 1228,
"token_acc": 0.9383886255924171,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.9941354903943377,
"grad_norm": 2.0414981842041016,
"learning_rate": 1.1437403340652797e-09,
"loss": 0.4070656895637512,
"memory(GiB)": 74.62,
"step": 1229,
"token_acc": 0.8465753424657534,
"train_speed(iter/s)": 0.022437
},
{
"epoch": 0.9949443882709808,
"grad_norm": 2.6518869400024414,
"learning_rate": 8.756840182344573e-10,
"loss": 0.3987523317337036,
"memory(GiB)": 74.62,
"step": 1230,
"token_acc": 0.8187134502923976,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.9957532861476238,
"grad_norm": 1.9646754264831543,
"learning_rate": 6.433646693265738e-10,
"loss": 0.32140272855758667,
"memory(GiB)": 74.62,
"step": 1231,
"token_acc": 0.9049773755656109,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.9965621840242669,
"grad_norm": 2.0284359455108643,
"learning_rate": 4.4678394810981904e-10,
"loss": 0.38582661747932434,
"memory(GiB)": 74.62,
"step": 1232,
"token_acc": 0.8961937716262975,
"train_speed(iter/s)": 0.022438
},
{
"epoch": 0.9973710819009101,
"grad_norm": 1.9221043586730957,
"learning_rate": 2.8594325987119086e-10,
"loss": 0.3542518615722656,
"memory(GiB)": 74.62,
"step": 1233,
"token_acc": 0.9240506329113924,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.9981799797775531,
"grad_norm": 2.5311009883880615,
"learning_rate": 1.6084375440317268e-10,
"loss": 0.44038695096969604,
"memory(GiB)": 74.62,
"step": 1234,
"token_acc": 0.8537735849056604,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.9989888776541962,
"grad_norm": 2.092437505722046,
"learning_rate": 7.148632599707217e-11,
"loss": 0.3628859221935272,
"memory(GiB)": 74.62,
"step": 1235,
"token_acc": 0.8671328671328671,
"train_speed(iter/s)": 0.022439
},
{
"epoch": 0.9997977755308393,
"grad_norm": 2.2749087810516357,
"learning_rate": 1.787161343858035e-11,
"loss": 0.4479348063468933,
"memory(GiB)": 74.62,
"step": 1236,
"token_acc": 0.8859934853420195,
"train_speed(iter/s)": 0.02244
},
{
"epoch": 1.0,
"grad_norm": 4.017106056213379,
"learning_rate": 0.0,
"loss": 0.41172629594802856,
"memory(GiB)": 74.62,
"step": 1237,
"token_acc": 0.8541666666666666,
"train_speed(iter/s)": 0.022445
},
{
"epoch": 1.0,
"eval_loss": 0.3615947365760803,
"eval_runtime": 428.6167,
"eval_samples_per_second": 3.728,
"eval_steps_per_second": 0.117,
"eval_token_acc": 0.8760036017108126,
"step": 1237
}
],
"logging_steps": 1,
"max_steps": 1237,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 618,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.135344722858895e+19,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}