{
"best_metric": 0.35549462,
"best_model_checkpoint": "/home/ubuntu/output/v3-20250315-011617/checkpoint-1400",
"epoch": 0.9408602150537635,
"eval_steps": 100,
"global_step": 1400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006720430107526882,
"grad_norm": 6.624470388540664,
"learning_rate": 4.4642857142857145e-08,
"loss": 0.8636100888252258,
"memory(GiB)": 30.88,
"step": 1,
"token_acc": 0.7959770114942529,
"train_speed(iter/s)": 0.075305
},
{
"epoch": 0.003360215053763441,
"grad_norm": 5.470006090474637,
"learning_rate": 2.2321428571428574e-07,
"loss": 0.8933483362197876,
"memory(GiB)": 34.75,
"step": 5,
"token_acc": 0.8101460415065335,
"train_speed(iter/s)": 0.130654
},
{
"epoch": 0.006720430107526882,
"grad_norm": 7.203568128298078,
"learning_rate": 4.4642857142857147e-07,
"loss": 0.8624147415161133,
"memory(GiB)": 36.25,
"step": 10,
"token_acc": 0.8137791870856155,
"train_speed(iter/s)": 0.135384
},
{
"epoch": 0.010080645161290322,
"grad_norm": 6.286014373217196,
"learning_rate": 6.696428571428571e-07,
"loss": 0.8743539810180664,
"memory(GiB)": 36.41,
"step": 15,
"token_acc": 0.7762515262515263,
"train_speed(iter/s)": 0.140929
},
{
"epoch": 0.013440860215053764,
"grad_norm": 6.507242651795415,
"learning_rate": 8.928571428571429e-07,
"loss": 0.7428807258605957,
"memory(GiB)": 36.41,
"step": 20,
"token_acc": 0.8280786849905686,
"train_speed(iter/s)": 0.148933
},
{
"epoch": 0.016801075268817203,
"grad_norm": 4.90720329584475,
"learning_rate": 1.1160714285714287e-06,
"loss": 0.7442412853240967,
"memory(GiB)": 36.98,
"step": 25,
"token_acc": 0.847274158630191,
"train_speed(iter/s)": 0.148937
},
{
"epoch": 0.020161290322580645,
"grad_norm": 2.96004802436193,
"learning_rate": 1.3392857142857143e-06,
"loss": 0.6135346412658691,
"memory(GiB)": 36.98,
"step": 30,
"token_acc": 0.8366533864541833,
"train_speed(iter/s)": 0.151917
},
{
"epoch": 0.023521505376344086,
"grad_norm": 2.5106205597924416,
"learning_rate": 1.5625e-06,
"loss": 0.5674124717712402,
"memory(GiB)": 36.98,
"step": 35,
"token_acc": 0.8213552361396304,
"train_speed(iter/s)": 0.153037
},
{
"epoch": 0.026881720430107527,
"grad_norm": 2.1951186534941787,
"learning_rate": 1.7857142857142859e-06,
"loss": 0.5761374473571778,
"memory(GiB)": 36.98,
"step": 40,
"token_acc": 0.8286891204264599,
"train_speed(iter/s)": 0.154552
},
{
"epoch": 0.03024193548387097,
"grad_norm": 2.133434174676425,
"learning_rate": 2.0089285714285715e-06,
"loss": 0.5460317611694336,
"memory(GiB)": 36.98,
"step": 45,
"token_acc": 0.7985267034990792,
"train_speed(iter/s)": 0.153801
},
{
"epoch": 0.033602150537634407,
"grad_norm": 1.984334647250237,
"learning_rate": 2.2321428571428573e-06,
"loss": 0.5067720413208008,
"memory(GiB)": 36.98,
"step": 50,
"token_acc": 0.8156975549619889,
"train_speed(iter/s)": 0.154771
},
{
"epoch": 0.03696236559139785,
"grad_norm": 1.9203518870194811,
"learning_rate": 2.455357142857143e-06,
"loss": 0.4443554401397705,
"memory(GiB)": 36.98,
"step": 55,
"token_acc": 0.8660345246778507,
"train_speed(iter/s)": 0.156385
},
{
"epoch": 0.04032258064516129,
"grad_norm": 1.8799641787273145,
"learning_rate": 2.6785714285714285e-06,
"loss": 0.4721069812774658,
"memory(GiB)": 36.98,
"step": 60,
"token_acc": 0.8127896200185357,
"train_speed(iter/s)": 0.158928
},
{
"epoch": 0.043682795698924734,
"grad_norm": 2.013534899867525,
"learning_rate": 2.9017857142857148e-06,
"loss": 0.45479617118835447,
"memory(GiB)": 36.98,
"step": 65,
"token_acc": 0.8543247344461306,
"train_speed(iter/s)": 0.159897
},
{
"epoch": 0.04704301075268817,
"grad_norm": 1.9289442808657369,
"learning_rate": 3.125e-06,
"loss": 0.4564558982849121,
"memory(GiB)": 36.98,
"step": 70,
"token_acc": 0.8599615014436959,
"train_speed(iter/s)": 0.160729
},
{
"epoch": 0.05040322580645161,
"grad_norm": 1.647464466961927,
"learning_rate": 3.3482142857142855e-06,
"loss": 0.434113073348999,
"memory(GiB)": 36.98,
"step": 75,
"token_acc": 0.8652792990142387,
"train_speed(iter/s)": 0.161226
},
{
"epoch": 0.053763440860215055,
"grad_norm": 1.715327717999067,
"learning_rate": 3.5714285714285718e-06,
"loss": 0.44471006393432616,
"memory(GiB)": 36.98,
"step": 80,
"token_acc": 0.8362068965517241,
"train_speed(iter/s)": 0.16071
},
{
"epoch": 0.05712365591397849,
"grad_norm": 2.014035980278717,
"learning_rate": 3.794642857142857e-06,
"loss": 0.4432626247406006,
"memory(GiB)": 36.98,
"step": 85,
"token_acc": 0.8698553948832035,
"train_speed(iter/s)": 0.161156
},
{
"epoch": 0.06048387096774194,
"grad_norm": 1.5564040192565838,
"learning_rate": 4.017857142857143e-06,
"loss": 0.43194828033447263,
"memory(GiB)": 36.98,
"step": 90,
"token_acc": 0.8666666666666667,
"train_speed(iter/s)": 0.161467
},
{
"epoch": 0.06384408602150538,
"grad_norm": 1.61832820761015,
"learning_rate": 4.241071428571429e-06,
"loss": 0.4064196586608887,
"memory(GiB)": 36.98,
"step": 95,
"token_acc": 0.8643278484942565,
"train_speed(iter/s)": 0.161074
},
{
"epoch": 0.06720430107526881,
"grad_norm": 1.5539812424163364,
"learning_rate": 4.464285714285715e-06,
"loss": 0.426595401763916,
"memory(GiB)": 36.98,
"step": 100,
"token_acc": 0.8269726663228468,
"train_speed(iter/s)": 0.162108
},
{
"epoch": 0.06720430107526881,
"eval_loss": 0.4180561602115631,
"eval_runtime": 22.2875,
"eval_samples_per_second": 21.133,
"eval_steps_per_second": 2.647,
"eval_token_acc": 0.8593293797924896,
"step": 100
},
{
"epoch": 0.07056451612903226,
"grad_norm": 1.70607185424576,
"learning_rate": 4.6875000000000004e-06,
"loss": 0.37628803253173826,
"memory(GiB)": 36.98,
"step": 105,
"token_acc": 0.8750300264232524,
"train_speed(iter/s)": 0.145711
},
{
"epoch": 0.0739247311827957,
"grad_norm": 1.8191153432807592,
"learning_rate": 4.910714285714286e-06,
"loss": 0.4305882453918457,
"memory(GiB)": 36.98,
"step": 110,
"token_acc": 0.83150800336984,
"train_speed(iter/s)": 0.146623
},
{
"epoch": 0.07728494623655914,
"grad_norm": 1.516959275850232,
"learning_rate": 5.133928571428571e-06,
"loss": 0.3796091079711914,
"memory(GiB)": 36.98,
"step": 115,
"token_acc": 0.8756284153005465,
"train_speed(iter/s)": 0.147653
},
{
"epoch": 0.08064516129032258,
"grad_norm": 1.695400164810879,
"learning_rate": 5.357142857142857e-06,
"loss": 0.40662312507629395,
"memory(GiB)": 36.98,
"step": 120,
"token_acc": 0.8457357859531772,
"train_speed(iter/s)": 0.148073
},
{
"epoch": 0.08400537634408602,
"grad_norm": 1.8736717783155716,
"learning_rate": 5.580357142857144e-06,
"loss": 0.4302112102508545,
"memory(GiB)": 36.98,
"step": 125,
"token_acc": 0.8765765765765766,
"train_speed(iter/s)": 0.148479
},
{
"epoch": 0.08736559139784947,
"grad_norm": 1.7896300849207585,
"learning_rate": 5.8035714285714295e-06,
"loss": 0.40099687576293946,
"memory(GiB)": 36.98,
"step": 130,
"token_acc": 0.8720675633406318,
"train_speed(iter/s)": 0.14916
},
{
"epoch": 0.0907258064516129,
"grad_norm": 1.3724965970536878,
"learning_rate": 6.0267857142857145e-06,
"loss": 0.3895423889160156,
"memory(GiB)": 36.98,
"step": 135,
"token_acc": 0.8717873831775701,
"train_speed(iter/s)": 0.149285
},
{
"epoch": 0.09408602150537634,
"grad_norm": 1.5030342275560018,
"learning_rate": 6.25e-06,
"loss": 0.36336331367492675,
"memory(GiB)": 36.98,
"step": 140,
"token_acc": 0.8672401767030923,
"train_speed(iter/s)": 0.149903
},
{
"epoch": 0.09744623655913978,
"grad_norm": 2.1533049159645983,
"learning_rate": 6.473214285714287e-06,
"loss": 0.4152632713317871,
"memory(GiB)": 36.98,
"step": 145,
"token_acc": 0.850445481544336,
"train_speed(iter/s)": 0.149889
},
{
"epoch": 0.10080645161290322,
"grad_norm": 1.618692560488842,
"learning_rate": 6.696428571428571e-06,
"loss": 0.3959961891174316,
"memory(GiB)": 36.98,
"step": 150,
"token_acc": 0.8774609640190089,
"train_speed(iter/s)": 0.150655
},
{
"epoch": 0.10416666666666667,
"grad_norm": 1.6433698117562718,
"learning_rate": 6.919642857142858e-06,
"loss": 0.3835208654403687,
"memory(GiB)": 36.98,
"step": 155,
"token_acc": 0.8653895935801191,
"train_speed(iter/s)": 0.151188
},
{
"epoch": 0.10752688172043011,
"grad_norm": 1.482923442532512,
"learning_rate": 7.1428571428571436e-06,
"loss": 0.374590539932251,
"memory(GiB)": 36.98,
"step": 160,
"token_acc": 0.8786279683377308,
"train_speed(iter/s)": 0.151943
},
{
"epoch": 0.11088709677419355,
"grad_norm": 1.4090449360939217,
"learning_rate": 7.366071428571429e-06,
"loss": 0.36885733604431153,
"memory(GiB)": 36.98,
"step": 165,
"token_acc": 0.85635602555629,
"train_speed(iter/s)": 0.152229
},
{
"epoch": 0.11424731182795698,
"grad_norm": 1.9132620037814079,
"learning_rate": 7.589285714285714e-06,
"loss": 0.4086275577545166,
"memory(GiB)": 36.98,
"step": 170,
"token_acc": 0.8498422712933754,
"train_speed(iter/s)": 0.152742
},
{
"epoch": 0.11760752688172044,
"grad_norm": 1.92320190057799,
"learning_rate": 7.8125e-06,
"loss": 0.3889497756958008,
"memory(GiB)": 36.98,
"step": 175,
"token_acc": 0.9108433734939759,
"train_speed(iter/s)": 0.152976
},
{
"epoch": 0.12096774193548387,
"grad_norm": 1.6746635838154391,
"learning_rate": 8.035714285714286e-06,
"loss": 0.4047250270843506,
"memory(GiB)": 36.98,
"step": 180,
"token_acc": 0.8924731182795699,
"train_speed(iter/s)": 0.153229
},
{
"epoch": 0.12432795698924731,
"grad_norm": 1.6017724626754042,
"learning_rate": 8.258928571428572e-06,
"loss": 0.3953080654144287,
"memory(GiB)": 36.98,
"step": 185,
"token_acc": 0.8757406188281764,
"train_speed(iter/s)": 0.153785
},
{
"epoch": 0.12768817204301075,
"grad_norm": 1.416612700811783,
"learning_rate": 8.482142857142858e-06,
"loss": 0.38227014541625975,
"memory(GiB)": 36.98,
"step": 190,
"token_acc": 0.8174425126191811,
"train_speed(iter/s)": 0.153823
},
{
"epoch": 0.1310483870967742,
"grad_norm": 1.607816186735269,
"learning_rate": 8.705357142857143e-06,
"loss": 0.3862867832183838,
"memory(GiB)": 36.98,
"step": 195,
"token_acc": 0.8581641406920217,
"train_speed(iter/s)": 0.154023
},
{
"epoch": 0.13440860215053763,
"grad_norm": 1.4260899797117483,
"learning_rate": 8.92857142857143e-06,
"loss": 0.3993819713592529,
"memory(GiB)": 36.98,
"step": 200,
"token_acc": 0.8897238399398836,
"train_speed(iter/s)": 0.154175
},
{
"epoch": 0.13440860215053763,
"eval_loss": 0.3838532567024231,
"eval_runtime": 22.251,
"eval_samples_per_second": 21.168,
"eval_steps_per_second": 2.652,
"eval_token_acc": 0.8661689381004819,
"step": 200
},
{
"epoch": 0.13776881720430106,
"grad_norm": 1.6111939807098428,
"learning_rate": 9.151785714285715e-06,
"loss": 0.39362945556640627,
"memory(GiB)": 36.98,
"step": 205,
"token_acc": 0.870816083081889,
"train_speed(iter/s)": 0.146456
},
{
"epoch": 0.14112903225806453,
"grad_norm": 1.45575497509253,
"learning_rate": 9.375000000000001e-06,
"loss": 0.37442193031311033,
"memory(GiB)": 36.98,
"step": 210,
"token_acc": 0.83461210571185,
"train_speed(iter/s)": 0.146913
},
{
"epoch": 0.14448924731182797,
"grad_norm": 1.5444085134752388,
"learning_rate": 9.598214285714287e-06,
"loss": 0.38316497802734373,
"memory(GiB)": 36.98,
"step": 215,
"token_acc": 0.8777614138438881,
"train_speed(iter/s)": 0.147416
},
{
"epoch": 0.1478494623655914,
"grad_norm": 1.3520701605942982,
"learning_rate": 9.821428571428573e-06,
"loss": 0.3548079013824463,
"memory(GiB)": 36.98,
"step": 220,
"token_acc": 0.8824049513704686,
"train_speed(iter/s)": 0.147708
},
{
"epoch": 0.15120967741935484,
"grad_norm": 1.6545075322794685,
"learning_rate": 9.999998627513692e-06,
"loss": 0.38989009857177737,
"memory(GiB)": 36.98,
"step": 225,
"token_acc": 0.8820522529344945,
"train_speed(iter/s)": 0.147715
},
{
"epoch": 0.15456989247311828,
"grad_norm": 1.7618846612441343,
"learning_rate": 9.999950590571983e-06,
"loss": 0.37784652709960936,
"memory(GiB)": 36.98,
"step": 230,
"token_acc": 0.8712837837837838,
"train_speed(iter/s)": 0.148241
},
{
"epoch": 0.15793010752688172,
"grad_norm": 1.639163396629249,
"learning_rate": 9.999833930068294e-06,
"loss": 0.4089940547943115,
"memory(GiB)": 36.98,
"step": 235,
"token_acc": 0.8786279683377308,
"train_speed(iter/s)": 0.148602
},
{
"epoch": 0.16129032258064516,
"grad_norm": 1.6330896141172941,
"learning_rate": 9.999648647603774e-06,
"loss": 0.3805581569671631,
"memory(GiB)": 36.98,
"step": 240,
"token_acc": 0.8617309697601668,
"train_speed(iter/s)": 0.149149
},
{
"epoch": 0.1646505376344086,
"grad_norm": 1.3404402756331484,
"learning_rate": 9.999394745721398e-06,
"loss": 0.3552532196044922,
"memory(GiB)": 36.98,
"step": 245,
"token_acc": 0.8820645161290323,
"train_speed(iter/s)": 0.149569
},
{
"epoch": 0.16801075268817203,
"grad_norm": 1.6681551792745373,
"learning_rate": 9.99907222790593e-06,
"loss": 0.39254608154296877,
"memory(GiB)": 36.98,
"step": 250,
"token_acc": 0.8685992955838526,
"train_speed(iter/s)": 0.149851
},
{
"epoch": 0.17137096774193547,
"grad_norm": 1.5895776155869723,
"learning_rate": 9.998681098583875e-06,
"loss": 0.3646648645401001,
"memory(GiB)": 36.98,
"step": 255,
"token_acc": 0.8523304107060452,
"train_speed(iter/s)": 0.150089
},
{
"epoch": 0.17473118279569894,
"grad_norm": 1.6243707032073051,
"learning_rate": 9.998221363123425e-06,
"loss": 0.37679805755615237,
"memory(GiB)": 36.98,
"step": 260,
"token_acc": 0.875,
"train_speed(iter/s)": 0.150347
},
{
"epoch": 0.17809139784946237,
"grad_norm": 1.355667950869519,
"learning_rate": 9.997693027834384e-06,
"loss": 0.4236710548400879,
"memory(GiB)": 36.98,
"step": 265,
"token_acc": 0.8643803585346843,
"train_speed(iter/s)": 0.150219
},
{
"epoch": 0.1814516129032258,
"grad_norm": 1.6123089970883007,
"learning_rate": 9.997096099968065e-06,
"loss": 0.3853868246078491,
"memory(GiB)": 36.98,
"step": 270,
"token_acc": 0.8670948616600791,
"train_speed(iter/s)": 0.150287
},
{
"epoch": 0.18481182795698925,
"grad_norm": 1.5728048169270692,
"learning_rate": 9.996430587717219e-06,
"loss": 0.39868209362030027,
"memory(GiB)": 36.98,
"step": 275,
"token_acc": 0.8995027479717351,
"train_speed(iter/s)": 0.150527
},
{
"epoch": 0.1881720430107527,
"grad_norm": 1.6115770374072542,
"learning_rate": 9.995696500215899e-06,
"loss": 0.38219125270843507,
"memory(GiB)": 36.98,
"step": 280,
"token_acc": 0.8556105610561056,
"train_speed(iter/s)": 0.150658
},
{
"epoch": 0.19153225806451613,
"grad_norm": 1.7888055041840647,
"learning_rate": 9.994893847539341e-06,
"loss": 0.39760608673095704,
"memory(GiB)": 36.98,
"step": 285,
"token_acc": 0.8500110448420588,
"train_speed(iter/s)": 0.150648
},
{
"epoch": 0.19489247311827956,
"grad_norm": 1.7509518458716071,
"learning_rate": 9.994022640703837e-06,
"loss": 0.3869561910629272,
"memory(GiB)": 36.98,
"step": 290,
"token_acc": 0.8347354138398915,
"train_speed(iter/s)": 0.150683
},
{
"epoch": 0.198252688172043,
"grad_norm": 1.548716222125624,
"learning_rate": 9.993082891666564e-06,
"loss": 0.38693482875823976,
"memory(GiB)": 36.98,
"step": 295,
"token_acc": 0.882627817482133,
"train_speed(iter/s)": 0.150731
},
{
"epoch": 0.20161290322580644,
"grad_norm": 1.4513252709739255,
"learning_rate": 9.992074613325435e-06,
"loss": 0.3878211498260498,
"memory(GiB)": 36.98,
"step": 300,
"token_acc": 0.8645980253878702,
"train_speed(iter/s)": 0.150735
},
{
"epoch": 0.20161290322580644,
"eval_loss": 0.3748473823070526,
"eval_runtime": 22.1986,
"eval_samples_per_second": 21.218,
"eval_steps_per_second": 2.658,
"eval_token_acc": 0.8676136306234404,
"step": 300
},
{
"epoch": 0.2049731182795699,
"grad_norm": 1.553685980316146,
"learning_rate": 9.990997819518916e-06,
"loss": 0.3740631103515625,
"memory(GiB)": 36.98,
"step": 305,
"token_acc": 0.8838088271153377,
"train_speed(iter/s)": 0.145632
},
{
"epoch": 0.20833333333333334,
"grad_norm": 1.4595129136479905,
"learning_rate": 9.989852525025845e-06,
"loss": 0.39213879108428956,
"memory(GiB)": 36.98,
"step": 310,
"token_acc": 0.8785211267605634,
"train_speed(iter/s)": 0.146129
},
{
"epoch": 0.21169354838709678,
"grad_norm": 1.6424371283556445,
"learning_rate": 9.988638745565207e-06,
"loss": 0.3821288585662842,
"memory(GiB)": 36.98,
"step": 315,
"token_acc": 0.855286827903258,
"train_speed(iter/s)": 0.146459
},
{
"epoch": 0.21505376344086022,
"grad_norm": 1.4969668062939072,
"learning_rate": 9.987356497795944e-06,
"loss": 0.3804674863815308,
"memory(GiB)": 36.98,
"step": 320,
"token_acc": 0.8544133476856836,
"train_speed(iter/s)": 0.146629
},
{
"epoch": 0.21841397849462366,
"grad_norm": 1.7200733273955053,
"learning_rate": 9.986005799316711e-06,
"loss": 0.38283023834228513,
"memory(GiB)": 36.98,
"step": 325,
"token_acc": 0.8894173602853745,
"train_speed(iter/s)": 0.146936
},
{
"epoch": 0.2217741935483871,
"grad_norm": 1.3976699612621017,
"learning_rate": 9.984586668665641e-06,
"loss": 0.3722895622253418,
"memory(GiB)": 36.98,
"step": 330,
"token_acc": 0.8618727366787378,
"train_speed(iter/s)": 0.147204
},
{
"epoch": 0.22513440860215053,
"grad_norm": 1.81408532653209,
"learning_rate": 9.983099125320083e-06,
"loss": 0.36228926181793214,
"memory(GiB)": 36.98,
"step": 335,
"token_acc": 0.8845631507469444,
"train_speed(iter/s)": 0.147325
},
{
"epoch": 0.22849462365591397,
"grad_norm": 1.6305131775402135,
"learning_rate": 9.981543189696349e-06,
"loss": 0.3938841104507446,
"memory(GiB)": 36.98,
"step": 340,
"token_acc": 0.86302780638517,
"train_speed(iter/s)": 0.147585
},
{
"epoch": 0.2318548387096774,
"grad_norm": 1.3804298809031184,
"learning_rate": 9.979918883149412e-06,
"loss": 0.3789535999298096,
"memory(GiB)": 36.98,
"step": 345,
"token_acc": 0.8524590163934426,
"train_speed(iter/s)": 0.148072
},
{
"epoch": 0.23521505376344087,
"grad_norm": 1.566906969179067,
"learning_rate": 9.97822622797264e-06,
"loss": 0.35543198585510255,
"memory(GiB)": 36.98,
"step": 350,
"token_acc": 0.8679446219382322,
"train_speed(iter/s)": 0.148335
},
{
"epoch": 0.2385752688172043,
"grad_norm": 1.4075206244947833,
"learning_rate": 9.976465247397463e-06,
"loss": 0.3800350666046143,
"memory(GiB)": 36.98,
"step": 355,
"token_acc": 0.8800525796910943,
"train_speed(iter/s)": 0.148683
},
{
"epoch": 0.24193548387096775,
"grad_norm": 1.515030938288751,
"learning_rate": 9.97463596559307e-06,
"loss": 0.3830380439758301,
"memory(GiB)": 36.98,
"step": 360,
"token_acc": 0.8495917490330898,
"train_speed(iter/s)": 0.148913
},
{
"epoch": 0.2452956989247312,
"grad_norm": 1.4289246071399373,
"learning_rate": 9.97273840766608e-06,
"loss": 0.3855442047119141,
"memory(GiB)": 36.98,
"step": 365,
"token_acc": 0.8720626631853786,
"train_speed(iter/s)": 0.149089
},
{
"epoch": 0.24865591397849462,
"grad_norm": 1.2845230370748955,
"learning_rate": 9.970772599660188e-06,
"loss": 0.3582731246948242,
"memory(GiB)": 36.98,
"step": 370,
"token_acc": 0.8702564102564102,
"train_speed(iter/s)": 0.149282
},
{
"epoch": 0.25201612903225806,
"grad_norm": 1.3569524106626425,
"learning_rate": 9.968738568555806e-06,
"loss": 0.37152538299560545,
"memory(GiB)": 36.98,
"step": 375,
"token_acc": 0.8608198284080076,
"train_speed(iter/s)": 0.149486
},
{
"epoch": 0.2553763440860215,
"grad_norm": 1.3704888639008825,
"learning_rate": 9.966636342269706e-06,
"loss": 0.37766404151916505,
"memory(GiB)": 36.98,
"step": 380,
"token_acc": 0.8613061797752809,
"train_speed(iter/s)": 0.14955
},
{
"epoch": 0.25873655913978494,
"grad_norm": 1.4312626718198491,
"learning_rate": 9.964465949654621e-06,
"loss": 0.3510261058807373,
"memory(GiB)": 36.98,
"step": 385,
"token_acc": 0.8919360104472739,
"train_speed(iter/s)": 0.149815
},
{
"epoch": 0.2620967741935484,
"grad_norm": 1.5012892071738868,
"learning_rate": 9.96222742049886e-06,
"loss": 0.394317102432251,
"memory(GiB)": 36.98,
"step": 390,
"token_acc": 0.8867279894875164,
"train_speed(iter/s)": 0.150039
},
{
"epoch": 0.2654569892473118,
"grad_norm": 1.2294914710999452,
"learning_rate": 9.959920785525896e-06,
"loss": 0.37460813522338865,
"memory(GiB)": 36.98,
"step": 395,
"token_acc": 0.8766404199475065,
"train_speed(iter/s)": 0.150189
},
{
"epoch": 0.26881720430107525,
"grad_norm": 1.312969727779233,
"learning_rate": 9.957546076393944e-06,
"loss": 0.3609046220779419,
"memory(GiB)": 36.98,
"step": 400,
"token_acc": 0.8716331401295602,
"train_speed(iter/s)": 0.150494
},
{
"epoch": 0.26881720430107525,
"eval_loss": 0.37320902943611145,
"eval_runtime": 22.1942,
"eval_samples_per_second": 21.222,
"eval_steps_per_second": 2.658,
"eval_token_acc": 0.8664013012335452,
"step": 400
},
{
"epoch": 0.2721774193548387,
"grad_norm": 1.4201872276695326,
"learning_rate": 9.955103325695526e-06,
"loss": 0.3812046766281128,
"memory(GiB)": 36.98,
"step": 405,
"token_acc": 0.8722784634586049,
"train_speed(iter/s)": 0.146726
},
{
"epoch": 0.27553763440860213,
"grad_norm": 1.68708660262874,
"learning_rate": 9.952592566957024e-06,
"loss": 0.3877915382385254,
"memory(GiB)": 36.98,
"step": 410,
"token_acc": 0.8642187798814305,
"train_speed(iter/s)": 0.146883
},
{
"epoch": 0.27889784946236557,
"grad_norm": 1.1600519908390228,
"learning_rate": 9.95001383463822e-06,
"loss": 0.3642563343048096,
"memory(GiB)": 36.98,
"step": 415,
"token_acc": 0.8828267082359055,
"train_speed(iter/s)": 0.146962
},
{
"epoch": 0.28225806451612906,
"grad_norm": 1.2285637627216721,
"learning_rate": 9.947367164131823e-06,
"loss": 0.37247591018676757,
"memory(GiB)": 36.98,
"step": 420,
"token_acc": 0.8826968755709849,
"train_speed(iter/s)": 0.147109
},
{
"epoch": 0.2856182795698925,
"grad_norm": 1.259887568682347,
"learning_rate": 9.944652591762982e-06,
"loss": 0.36926772594451907,
"memory(GiB)": 36.98,
"step": 425,
"token_acc": 0.8709743171626579,
"train_speed(iter/s)": 0.147357
},
{
"epoch": 0.28897849462365593,
"grad_norm": 1.3616426835834692,
"learning_rate": 9.941870154788793e-06,
"loss": 0.3907612323760986,
"memory(GiB)": 36.98,
"step": 430,
"token_acc": 0.8674786845310597,
"train_speed(iter/s)": 0.147389
},
{
"epoch": 0.2923387096774194,
"grad_norm": 1.4245606118453644,
"learning_rate": 9.939019891397778e-06,
"loss": 0.3910489320755005,
"memory(GiB)": 36.98,
"step": 435,
"token_acc": 0.8651339608979001,
"train_speed(iter/s)": 0.14749
},
{
"epoch": 0.2956989247311828,
"grad_norm": 1.2523176077069897,
"learning_rate": 9.936101840709373e-06,
"loss": 0.3995755672454834,
"memory(GiB)": 36.98,
"step": 440,
"token_acc": 0.8203216947822676,
"train_speed(iter/s)": 0.147702
},
{
"epoch": 0.29905913978494625,
"grad_norm": 1.3331291522700441,
"learning_rate": 9.933116042773375e-06,
"loss": 0.3527373313903809,
"memory(GiB)": 36.98,
"step": 445,
"token_acc": 0.8523316062176166,
"train_speed(iter/s)": 0.147868
},
{
"epoch": 0.3024193548387097,
"grad_norm": 1.408211021179238,
"learning_rate": 9.93006253856941e-06,
"loss": 0.37166690826416016,
"memory(GiB)": 36.98,
"step": 450,
"token_acc": 0.8806776331942058,
"train_speed(iter/s)": 0.148099
},
{
"epoch": 0.3057795698924731,
"grad_norm": 1.665951694594511,
"learning_rate": 9.92694137000636e-06,
"loss": 0.39462215900421144,
"memory(GiB)": 36.98,
"step": 455,
"token_acc": 0.8611589213998853,
"train_speed(iter/s)": 0.148142
},
{
"epoch": 0.30913978494623656,
"grad_norm": 1.3584787225981534,
"learning_rate": 9.923752579921787e-06,
"loss": 0.3762697696685791,
"memory(GiB)": 36.98,
"step": 460,
"token_acc": 0.8588405797101449,
"train_speed(iter/s)": 0.148298
},
{
"epoch": 0.3125,
"grad_norm": 1.3251616436516978,
"learning_rate": 9.92049621208135e-06,
"loss": 0.3711697101593018,
"memory(GiB)": 36.98,
"step": 465,
"token_acc": 0.8663532572196104,
"train_speed(iter/s)": 0.148468
},
{
"epoch": 0.31586021505376344,
"grad_norm": 1.2515144595137409,
"learning_rate": 9.917172311178203e-06,
"loss": 0.360841178894043,
"memory(GiB)": 36.98,
"step": 470,
"token_acc": 0.8898142216527867,
"train_speed(iter/s)": 0.148629
},
{
"epoch": 0.3192204301075269,
"grad_norm": 1.1833628858332188,
"learning_rate": 9.913780922832383e-06,
"loss": 0.3361430883407593,
"memory(GiB)": 36.98,
"step": 475,
"token_acc": 0.8739758095981272,
"train_speed(iter/s)": 0.148749
},
{
"epoch": 0.3225806451612903,
"grad_norm": 1.5056179038816182,
"learning_rate": 9.910322093590177e-06,
"loss": 0.39411134719848634,
"memory(GiB)": 36.98,
"step": 480,
"token_acc": 0.8696789536266349,
"train_speed(iter/s)": 0.148862
},
{
"epoch": 0.32594086021505375,
"grad_norm": 1.3378379217665801,
"learning_rate": 9.90679587092349e-06,
"loss": 0.3966991662979126,
"memory(GiB)": 36.98,
"step": 485,
"token_acc": 0.8551621688536767,
"train_speed(iter/s)": 0.148869
},
{
"epoch": 0.3293010752688172,
"grad_norm": 1.0480697431497084,
"learning_rate": 9.903202303229199e-06,
"loss": 0.3763158321380615,
"memory(GiB)": 36.98,
"step": 490,
"token_acc": 0.8711036225779275,
"train_speed(iter/s)": 0.148966
},
{
"epoch": 0.3326612903225806,
"grad_norm": 1.3180202797254805,
"learning_rate": 9.899541439828464e-06,
"loss": 0.3638334274291992,
"memory(GiB)": 36.98,
"step": 495,
"token_acc": 0.881887165951831,
"train_speed(iter/s)": 0.149291
},
{
"epoch": 0.33602150537634407,
"grad_norm": 1.3597315112923483,
"learning_rate": 9.895813330966086e-06,
"loss": 0.3699483394622803,
"memory(GiB)": 36.98,
"step": 500,
"token_acc": 0.8665288306914771,
"train_speed(iter/s)": 0.149428
},
{
"epoch": 0.33602150537634407,
"eval_loss": 0.36967214941978455,
"eval_runtime": 22.2237,
"eval_samples_per_second": 21.194,
"eval_steps_per_second": 2.655,
"eval_token_acc": 0.8679470212056616,
"step": 500
},
{
"epoch": 0.3393817204301075,
"grad_norm": 1.5431659281776462,
"learning_rate": 9.892018027809793e-06,
"loss": 0.37493338584899905,
"memory(GiB)": 36.98,
"step": 505,
"token_acc": 0.8687758281959217,
"train_speed(iter/s)": 0.146352
},
{
"epoch": 0.34274193548387094,
"grad_norm": 1.3449879580767712,
"learning_rate": 9.88815558244954e-06,
"loss": 0.39549927711486815,
"memory(GiB)": 36.98,
"step": 510,
"token_acc": 0.8586065573770492,
"train_speed(iter/s)": 0.146469
},
{
"epoch": 0.34610215053763443,
"grad_norm": 1.573469085654153,
"learning_rate": 9.884226047896803e-06,
"loss": 0.35512893199920653,
"memory(GiB)": 36.98,
"step": 515,
"token_acc": 0.8800461361014994,
"train_speed(iter/s)": 0.146625
},
{
"epoch": 0.34946236559139787,
"grad_norm": 1.306949068152781,
"learning_rate": 9.880229478083849e-06,
"loss": 0.35407171249389646,
"memory(GiB)": 36.98,
"step": 520,
"token_acc": 0.8720520466251016,
"train_speed(iter/s)": 0.146718
},
{
"epoch": 0.3528225806451613,
"grad_norm": 1.377421152139763,
"learning_rate": 9.87616592786299e-06,
"loss": 0.39537363052368163,
"memory(GiB)": 36.98,
"step": 525,
"token_acc": 0.8787520680690144,
"train_speed(iter/s)": 0.146839
},
{
"epoch": 0.35618279569892475,
"grad_norm": 1.4366698685832746,
"learning_rate": 9.872035453005836e-06,
"loss": 0.37119576930999754,
"memory(GiB)": 36.98,
"step": 530,
"token_acc": 0.8672721945502491,
"train_speed(iter/s)": 0.147073
},
{
"epoch": 0.3595430107526882,
"grad_norm": 1.2355637712340461,
"learning_rate": 9.867838110202525e-06,
"loss": 0.3660418510437012,
"memory(GiB)": 36.98,
"step": 535,
"token_acc": 0.8535449735449735,
"train_speed(iter/s)": 0.147263
},
{
"epoch": 0.3629032258064516,
"grad_norm": 1.2495337501258665,
"learning_rate": 9.863573957060953e-06,
"loss": 0.36377530097961425,
"memory(GiB)": 36.98,
"step": 540,
"token_acc": 0.8842736561528209,
"train_speed(iter/s)": 0.14738
},
{
"epoch": 0.36626344086021506,
"grad_norm": 1.305490979194108,
"learning_rate": 9.859243052105967e-06,
"loss": 0.38242766857147215,
"memory(GiB)": 36.98,
"step": 545,
"token_acc": 0.858521717043434,
"train_speed(iter/s)": 0.14751
},
{
"epoch": 0.3696236559139785,
"grad_norm": 1.3434721486823473,
"learning_rate": 9.854845454778585e-06,
"loss": 0.37967212200164796,
"memory(GiB)": 36.98,
"step": 550,
"token_acc": 0.8517232829159435,
"train_speed(iter/s)": 0.147798
},
{
"epoch": 0.37298387096774194,
"grad_norm": 1.5896521776380288,
"learning_rate": 9.85038122543516e-06,
"loss": 0.40446958541870115,
"memory(GiB)": 36.98,
"step": 555,
"token_acc": 0.8549883990719258,
"train_speed(iter/s)": 0.147841
},
{
"epoch": 0.3763440860215054,
"grad_norm": 1.5541190326682846,
"learning_rate": 9.845850425346563e-06,
"loss": 0.3877220869064331,
"memory(GiB)": 36.98,
"step": 560,
"token_acc": 0.8556073818698033,
"train_speed(iter/s)": 0.148023
},
{
"epoch": 0.3797043010752688,
"grad_norm": 1.2488745915944466,
"learning_rate": 9.841253116697333e-06,
"loss": 0.36992673873901366,
"memory(GiB)": 36.98,
"step": 565,
"token_acc": 0.8745377707342842,
"train_speed(iter/s)": 0.148144
},
{
"epoch": 0.38306451612903225,
"grad_norm": 1.4733618409553315,
"learning_rate": 9.836589362584837e-06,
"loss": 0.37825746536254884,
"memory(GiB)": 36.98,
"step": 570,
"token_acc": 0.8752980448259419,
"train_speed(iter/s)": 0.148207
},
{
"epoch": 0.3864247311827957,
"grad_norm": 1.2525075479382355,
"learning_rate": 9.831859227018387e-06,
"loss": 0.39865090847015383,
"memory(GiB)": 36.98,
"step": 575,
"token_acc": 0.8661578555472822,
"train_speed(iter/s)": 0.148276
},
{
"epoch": 0.3897849462365591,
"grad_norm": 1.449301845425235,
"learning_rate": 9.827062774918377e-06,
"loss": 0.3785409927368164,
"memory(GiB)": 36.98,
"step": 580,
"token_acc": 0.8519668737060041,
"train_speed(iter/s)": 0.148282
},
{
"epoch": 0.39314516129032256,
"grad_norm": 1.2168801605641335,
"learning_rate": 9.822200072115385e-06,
"loss": 0.3902196645736694,
"memory(GiB)": 36.98,
"step": 585,
"token_acc": 0.8548924568312632,
"train_speed(iter/s)": 0.148421
},
{
"epoch": 0.396505376344086,
"grad_norm": 1.3081294024585297,
"learning_rate": 9.817271185349263e-06,
"loss": 0.3606643438339233,
"memory(GiB)": 36.98,
"step": 590,
"token_acc": 0.8854581673306773,
"train_speed(iter/s)": 0.148553
},
{
"epoch": 0.39986559139784944,
"grad_norm": 1.1879061305415564,
"learning_rate": 9.812276182268236e-06,
"loss": 0.36301345825195314,
"memory(GiB)": 36.98,
"step": 595,
"token_acc": 0.8823529411764706,
"train_speed(iter/s)": 0.148614
},
{
"epoch": 0.4032258064516129,
"grad_norm": 1.3201174180172996,
"learning_rate": 9.807215131427966e-06,
"loss": 0.35914902687072753,
"memory(GiB)": 36.98,
"step": 600,
"token_acc": 0.8685085699667434,
"train_speed(iter/s)": 0.14863
},
{
"epoch": 0.4032258064516129,
"eval_loss": 0.3670656979084015,
"eval_runtime": 22.3702,
"eval_samples_per_second": 21.055,
"eval_steps_per_second": 2.637,
"eval_token_acc": 0.8686845215845145,
"step": 600
},
{
"epoch": 0.40658602150537637,
"grad_norm": 1.0706504342347392,
"learning_rate": 9.802088102290598e-06,
"loss": 0.36201565265655516,
"memory(GiB)": 36.98,
"step": 605,
"token_acc": 0.8745827060286705,
"train_speed(iter/s)": 0.145719
},
{
"epoch": 0.4099462365591398,
"grad_norm": 1.3426300690915438,
"learning_rate": 9.796895165223835e-06,
"loss": 0.3961642265319824,
"memory(GiB)": 36.98,
"step": 610,
"token_acc": 0.8796818031156778,
"train_speed(iter/s)": 0.145743
},
{
"epoch": 0.41330645161290325,
"grad_norm": 1.2789042611843648,
"learning_rate": 9.791636391499944e-06,
"loss": 0.38369901180267335,
"memory(GiB)": 36.98,
"step": 615,
"token_acc": 0.8649926144756278,
"train_speed(iter/s)": 0.145891
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.713547331206706,
"learning_rate": 9.786311853294799e-06,
"loss": 0.37184116840362547,
"memory(GiB)": 36.98,
"step": 620,
"token_acc": 0.8365116279069768,
"train_speed(iter/s)": 0.14596
},
{
"epoch": 0.4200268817204301,
"grad_norm": 1.4766281192277437,
"learning_rate": 9.780921623686873e-06,
"loss": 0.3832437515258789,
"memory(GiB)": 36.98,
"step": 625,
"token_acc": 0.8809590973201693,
"train_speed(iter/s)": 0.146172
},
{
"epoch": 0.42338709677419356,
"grad_norm": 1.2467031836603142,
"learning_rate": 9.775465776656257e-06,
"loss": 0.3634549856185913,
"memory(GiB)": 36.98,
"step": 630,
"token_acc": 0.8876265744628303,
"train_speed(iter/s)": 0.14642
},
{
"epoch": 0.426747311827957,
"grad_norm": 1.2190081005378173,
"learning_rate": 9.769944387083613e-06,
"loss": 0.35646920204162597,
"memory(GiB)": 36.98,
"step": 635,
"token_acc": 0.9050476526650194,
"train_speed(iter/s)": 0.146633
},
{
"epoch": 0.43010752688172044,
"grad_norm": 1.4111684796366029,
"learning_rate": 9.764357530749178e-06,
"loss": 0.368475079536438,
"memory(GiB)": 36.98,
"step": 640,
"token_acc": 0.8574933551420977,
"train_speed(iter/s)": 0.146775
},
{
"epoch": 0.4334677419354839,
"grad_norm": 1.2502663586719176,
"learning_rate": 9.758705284331704e-06,
"loss": 0.36438665390014646,
"memory(GiB)": 36.98,
"step": 645,
"token_acc": 0.8530818278427205,
"train_speed(iter/s)": 0.146859
},
{
"epoch": 0.4368279569892473,
"grad_norm": 1.2436432744079327,
"learning_rate": 9.752987725407416e-06,
"loss": 0.35638880729675293,
"memory(GiB)": 36.98,
"step": 650,
"token_acc": 0.8638570167696381,
"train_speed(iter/s)": 0.146953
},
{
"epoch": 0.44018817204301075,
"grad_norm": 1.3569390750268104,
"learning_rate": 9.747204932448942e-06,
"loss": 0.37936763763427733,
"memory(GiB)": 36.98,
"step": 655,
"token_acc": 0.8516341406066306,
"train_speed(iter/s)": 0.147038
},
{
"epoch": 0.4435483870967742,
"grad_norm": 1.5388545697441205,
"learning_rate": 9.741356984824234e-06,
"loss": 0.3363116502761841,
"memory(GiB)": 36.98,
"step": 660,
"token_acc": 0.8699498672957829,
"train_speed(iter/s)": 0.147318
},
{
"epoch": 0.4469086021505376,
"grad_norm": 1.3962981286379317,
"learning_rate": 9.73544396279549e-06,
"loss": 0.34099550247192384,
"memory(GiB)": 36.98,
"step": 665,
"token_acc": 0.8706088992974239,
"train_speed(iter/s)": 0.147444
},
{
"epoch": 0.45026881720430106,
"grad_norm": 1.4093270418166992,
"learning_rate": 9.72946594751803e-06,
"loss": 0.37921915054321287,
"memory(GiB)": 36.98,
"step": 670,
"token_acc": 0.8673262873847538,
"train_speed(iter/s)": 0.147527
},
{
"epoch": 0.4536290322580645,
"grad_norm": 1.1728008554779097,
"learning_rate": 9.723423021039211e-06,
"loss": 0.3977491855621338,
"memory(GiB)": 36.98,
"step": 675,
"token_acc": 0.8671311271460562,
"train_speed(iter/s)": 0.147683
},
{
"epoch": 0.45698924731182794,
"grad_norm": 1.1840245257190896,
"learning_rate": 9.717315266297277e-06,
"loss": 0.3696840763092041,
"memory(GiB)": 36.98,
"step": 680,
"token_acc": 0.8644768856447689,
"train_speed(iter/s)": 0.147724
},
{
"epoch": 0.4603494623655914,
"grad_norm": 1.2426389912018956,
"learning_rate": 9.711142767120238e-06,
"loss": 0.3768123149871826,
"memory(GiB)": 36.98,
"step": 685,
"token_acc": 0.885691231845078,
"train_speed(iter/s)": 0.1478
},
{
"epoch": 0.4637096774193548,
"grad_norm": 1.2302759474202611,
"learning_rate": 9.704905608224706e-06,
"loss": 0.3753058433532715,
"memory(GiB)": 36.98,
"step": 690,
"token_acc": 0.8787061994609164,
"train_speed(iter/s)": 0.147897
},
{
"epoch": 0.46706989247311825,
"grad_norm": 1.2541456992823259,
"learning_rate": 9.698603875214737e-06,
"loss": 0.38683831691741943,
"memory(GiB)": 36.98,
"step": 695,
"token_acc": 0.8655941878567722,
"train_speed(iter/s)": 0.147816
},
{
"epoch": 0.47043010752688175,
"grad_norm": 1.169233647864971,
"learning_rate": 9.692237654580658e-06,
"loss": 0.379689884185791,
"memory(GiB)": 36.98,
"step": 700,
"token_acc": 0.8643190056965303,
"train_speed(iter/s)": 0.147882
},
{
"epoch": 0.47043010752688175,
"eval_loss": 0.36553341150283813,
"eval_runtime": 22.2168,
"eval_samples_per_second": 21.2,
"eval_steps_per_second": 2.656,
"eval_token_acc": 0.8678156855217563,
"step": 700
},
{
"epoch": 0.4737903225806452,
"grad_norm": 1.2629929939158209,
"learning_rate": 9.685807033697883e-06,
"loss": 0.37116250991821287,
"memory(GiB)": 36.98,
"step": 705,
"token_acc": 0.8659203980099502,
"train_speed(iter/s)": 0.143693
},
{
"epoch": 0.4771505376344086,
"grad_norm": 1.2566121281394267,
"learning_rate": 9.679312100825703e-06,
"loss": 0.3787511348724365,
"memory(GiB)": 36.98,
"step": 710,
"token_acc": 0.8787093735280264,
"train_speed(iter/s)": 0.14381
},
{
"epoch": 0.48051075268817206,
"grad_norm": 1.4778551662573782,
"learning_rate": 9.672752945106088e-06,
"loss": 0.39328994750976565,
"memory(GiB)": 36.98,
"step": 715,
"token_acc": 0.885866802236909,
"train_speed(iter/s)": 0.144022
},
{
"epoch": 0.4838709677419355,
"grad_norm": 1.2521632439474264,
"learning_rate": 9.66612965656245e-06,
"loss": 0.3512038469314575,
"memory(GiB)": 36.98,
"step": 720,
"token_acc": 0.8695306284805091,
"train_speed(iter/s)": 0.144128
},
{
"epoch": 0.48723118279569894,
"grad_norm": 1.5687563338452333,
"learning_rate": 9.65944232609842e-06,
"loss": 0.38547964096069337,
"memory(GiB)": 36.98,
"step": 725,
"token_acc": 0.8300395256916996,
"train_speed(iter/s)": 0.14415
},
{
"epoch": 0.4905913978494624,
"grad_norm": 1.5100891343298852,
"learning_rate": 9.652691045496591e-06,
"loss": 0.38148338794708253,
"memory(GiB)": 36.98,
"step": 730,
"token_acc": 0.8454720616570327,
"train_speed(iter/s)": 0.144282
},
{
"epoch": 0.4939516129032258,
"grad_norm": 1.2871381068183412,
"learning_rate": 9.645875907417266e-06,
"loss": 0.3543083667755127,
"memory(GiB)": 36.98,
"step": 735,
"token_acc": 0.877696190913263,
"train_speed(iter/s)": 0.144319
},
{
"epoch": 0.49731182795698925,
"grad_norm": 1.5160670685722115,
"learning_rate": 9.638997005397174e-06,
"loss": 0.3613048791885376,
"memory(GiB)": 36.98,
"step": 740,
"token_acc": 0.8726815466834329,
"train_speed(iter/s)": 0.144365
},
{
"epoch": 0.5006720430107527,
"grad_norm": 1.135167693746094,
"learning_rate": 9.632054433848205e-06,
"loss": 0.3598623514175415,
"memory(GiB)": 36.98,
"step": 745,
"token_acc": 0.8829736211031175,
"train_speed(iter/s)": 0.144478
},
{
"epoch": 0.5040322580645161,
"grad_norm": 1.187612792246231,
"learning_rate": 9.625048288056098e-06,
"loss": 0.3665189266204834,
"memory(GiB)": 36.98,
"step": 750,
"token_acc": 0.8640932850824287,
"train_speed(iter/s)": 0.144599
},
{
"epoch": 0.5073924731182796,
"grad_norm": 1.1980021625067683,
"learning_rate": 9.617978664179135e-06,
"loss": 0.3561270236968994,
"memory(GiB)": 36.98,
"step": 755,
"token_acc": 0.8723776223776224,
"train_speed(iter/s)": 0.144765
},
{
"epoch": 0.510752688172043,
"grad_norm": 1.232472450356955,
"learning_rate": 9.610845659246833e-06,
"loss": 0.3396260976791382,
"memory(GiB)": 36.98,
"step": 760,
"token_acc": 0.8916548797736916,
"train_speed(iter/s)": 0.144853
},
{
"epoch": 0.5141129032258065,
"grad_norm": 1.3225586989381914,
"learning_rate": 9.6036493711586e-06,
"loss": 0.38932969570159914,
"memory(GiB)": 36.98,
"step": 765,
"token_acc": 0.874160048869884,
"train_speed(iter/s)": 0.145003
},
{
"epoch": 0.5174731182795699,
"grad_norm": 1.4950056939211132,
"learning_rate": 9.596389898682396e-06,
"loss": 0.3547043323516846,
"memory(GiB)": 36.98,
"step": 770,
"token_acc": 0.8626444159178434,
"train_speed(iter/s)": 0.145082
},
{
"epoch": 0.5208333333333334,
"grad_norm": 1.1802965236685845,
"learning_rate": 9.589067341453375e-06,
"loss": 0.34569859504699707,
"memory(GiB)": 36.98,
"step": 775,
"token_acc": 0.8858312858312858,
"train_speed(iter/s)": 0.145186
},
{
"epoch": 0.5241935483870968,
"grad_norm": 1.115277934031332,
"learning_rate": 9.581681799972528e-06,
"loss": 0.36436331272125244,
"memory(GiB)": 36.98,
"step": 780,
"token_acc": 0.8587719298245614,
"train_speed(iter/s)": 0.145311
},
{
"epoch": 0.5275537634408602,
"grad_norm": 1.2521450260761797,
"learning_rate": 9.574233375605284e-06,
"loss": 0.368036675453186,
"memory(GiB)": 36.98,
"step": 785,
"token_acc": 0.8453327276856689,
"train_speed(iter/s)": 0.145343
},
{
"epoch": 0.5309139784946236,
"grad_norm": 1.3001720572785291,
"learning_rate": 9.566722170580138e-06,
"loss": 0.37065854072570803,
"memory(GiB)": 36.98,
"step": 790,
"token_acc": 0.8767784552845529,
"train_speed(iter/s)": 0.145526
},
{
"epoch": 0.5342741935483871,
"grad_norm": 1.2400854128619574,
"learning_rate": 9.559148287987236e-06,
"loss": 0.36444659233093263,
"memory(GiB)": 36.98,
"step": 795,
"token_acc": 0.8533872598584429,
"train_speed(iter/s)": 0.145651
},
{
"epoch": 0.5376344086021505,
"grad_norm": 1.3328229333855304,
"learning_rate": 9.551511831776966e-06,
"loss": 0.3739351749420166,
"memory(GiB)": 36.98,
"step": 800,
"token_acc": 0.8744081172491545,
"train_speed(iter/s)": 0.145742
},
{
"epoch": 0.5376344086021505,
"eval_loss": 0.3648330569267273,
"eval_runtime": 22.3843,
"eval_samples_per_second": 21.042,
"eval_steps_per_second": 2.636,
"eval_token_acc": 0.8687956517785882,
"step": 800
},
{
"epoch": 0.540994623655914,
"grad_norm": 1.1730739182895107,
"learning_rate": 9.543812906758529e-06,
"loss": 0.3657143831253052,
"memory(GiB)": 36.98,
"step": 805,
"token_acc": 0.8716752216518898,
"train_speed(iter/s)": 0.140525
},
{
"epoch": 0.5443548387096774,
"grad_norm": 1.4833023676939197,
"learning_rate": 9.536051618598503e-06,
"loss": 0.3627838373184204,
"memory(GiB)": 36.98,
"step": 810,
"token_acc": 0.873323994396638,
"train_speed(iter/s)": 0.140671
},
{
"epoch": 0.5477150537634409,
"grad_norm": 1.127923630638402,
"learning_rate": 9.528228073819385e-06,
"loss": 0.3465721845626831,
"memory(GiB)": 36.98,
"step": 815,
"token_acc": 0.8859081097748998,
"train_speed(iter/s)": 0.140707
},
{
"epoch": 0.5510752688172043,
"grad_norm": 1.3463540160197853,
"learning_rate": 9.520342379798141e-06,
"loss": 0.3924789190292358,
"memory(GiB)": 36.98,
"step": 820,
"token_acc": 0.8654133094598627,
"train_speed(iter/s)": 0.140898
},
{
"epoch": 0.5544354838709677,
"grad_norm": 1.4061087313801883,
"learning_rate": 9.51239464476472e-06,
"loss": 0.3804301738739014,
"memory(GiB)": 36.98,
"step": 825,
"token_acc": 0.8837538316434803,
"train_speed(iter/s)": 0.140894
},
{
"epoch": 0.5577956989247311,
"grad_norm": 1.1715838130476242,
"learning_rate": 9.50438497780058e-06,
"loss": 0.34037508964538576,
"memory(GiB)": 36.98,
"step": 830,
"token_acc": 0.8872146118721461,
"train_speed(iter/s)": 0.140962
},
{
"epoch": 0.5611559139784946,
"grad_norm": 1.315222425795007,
"learning_rate": 9.496313488837183e-06,
"loss": 0.36748337745666504,
"memory(GiB)": 36.98,
"step": 835,
"token_acc": 0.8590819348469891,
"train_speed(iter/s)": 0.141068
},
{
"epoch": 0.5645161290322581,
"grad_norm": 1.2776429071415132,
"learning_rate": 9.488180288654485e-06,
"loss": 0.37864868640899657,
"memory(GiB)": 36.98,
"step": 840,
"token_acc": 0.8681633627895868,
"train_speed(iter/s)": 0.141264
},
{
"epoch": 0.5678763440860215,
"grad_norm": 1.396507233078662,
"learning_rate": 9.479985488879426e-06,
"loss": 0.3637056589126587,
"memory(GiB)": 36.98,
"step": 845,
"token_acc": 0.8704206241519674,
"train_speed(iter/s)": 0.141334
},
{
"epoch": 0.571236559139785,
"grad_norm": 1.1602905232651854,
"learning_rate": 9.471729201984385e-06,
"loss": 0.35836124420166016,
"memory(GiB)": 36.98,
"step": 850,
"token_acc": 0.8477945744460551,
"train_speed(iter/s)": 0.141429
},
{
"epoch": 0.5745967741935484,
"grad_norm": 1.2802615859096695,
"learning_rate": 9.463411541285648e-06,
"loss": 0.3767895221710205,
"memory(GiB)": 36.98,
"step": 855,
"token_acc": 0.842360942433811,
"train_speed(iter/s)": 0.141522
},
{
"epoch": 0.5779569892473119,
"grad_norm": 1.152936856157292,
"learning_rate": 9.45503262094184e-06,
"loss": 0.34860782623291015,
"memory(GiB)": 36.98,
"step": 860,
"token_acc": 0.890283046396265,
"train_speed(iter/s)": 0.141556
},
{
"epoch": 0.5813172043010753,
"grad_norm": 1.3209240471576482,
"learning_rate": 9.446592555952372e-06,
"loss": 0.3571927547454834,
"memory(GiB)": 36.98,
"step": 865,
"token_acc": 0.864054003375211,
"train_speed(iter/s)": 0.141632
},
{
"epoch": 0.5846774193548387,
"grad_norm": 1.3540565356974807,
"learning_rate": 9.438091462155854e-06,
"loss": 0.39406092166900636,
"memory(GiB)": 36.98,
"step": 870,
"token_acc": 0.8713701871370187,
"train_speed(iter/s)": 0.141743
},
{
"epoch": 0.5880376344086021,
"grad_norm": 1.1536152243383244,
"learning_rate": 9.429529456228503e-06,
"loss": 0.3671466827392578,
"memory(GiB)": 36.98,
"step": 875,
"token_acc": 0.8574139976275208,
"train_speed(iter/s)": 0.141896
},
{
"epoch": 0.5913978494623656,
"grad_norm": 1.111417750503976,
"learning_rate": 9.420906655682553e-06,
"loss": 0.4034374713897705,
"memory(GiB)": 36.98,
"step": 880,
"token_acc": 0.8493527953731754,
"train_speed(iter/s)": 0.141998
},
{
"epoch": 0.594758064516129,
"grad_norm": 1.1371173509292516,
"learning_rate": 9.412223178864628e-06,
"loss": 0.3718876838684082,
"memory(GiB)": 36.98,
"step": 885,
"token_acc": 0.865809922295278,
"train_speed(iter/s)": 0.141971
},
{
"epoch": 0.5981182795698925,
"grad_norm": 1.4351101955941779,
"learning_rate": 9.403479144954129e-06,
"loss": 0.38196277618408203,
"memory(GiB)": 36.98,
"step": 890,
"token_acc": 0.8545710267229255,
"train_speed(iter/s)": 0.142118
},
{
"epoch": 0.6014784946236559,
"grad_norm": 1.406812942939455,
"learning_rate": 9.394674673961592e-06,
"loss": 0.4024368762969971,
"memory(GiB)": 36.98,
"step": 895,
"token_acc": 0.8243031016882607,
"train_speed(iter/s)": 0.142274
},
{
"epoch": 0.6048387096774194,
"grad_norm": 1.2905388696557576,
"learning_rate": 9.385809886727044e-06,
"loss": 0.36548285484313964,
"memory(GiB)": 36.98,
"step": 900,
"token_acc": 0.8452690972222222,
"train_speed(iter/s)": 0.142383
},
{
"epoch": 0.6048387096774194,
"eval_loss": 0.3623374104499817,
"eval_runtime": 22.2664,
"eval_samples_per_second": 21.153,
"eval_steps_per_second": 2.65,
"eval_token_acc": 0.8692300698099674,
"step": 900
},
{
"epoch": 0.6081989247311828,
"grad_norm": 1.1835524274220626,
"learning_rate": 9.376884904918342e-06,
"loss": 0.39243431091308595,
"memory(GiB)": 36.98,
"step": 905,
"token_acc": 0.8737482819556254,
"train_speed(iter/s)": 0.138228
},
{
"epoch": 0.6115591397849462,
"grad_norm": 1.2772878229089082,
"learning_rate": 9.367899851029506e-06,
"loss": 0.3636767387390137,
"memory(GiB)": 36.98,
"step": 910,
"token_acc": 0.8840648854961832,
"train_speed(iter/s)": 0.138378
},
{
"epoch": 0.6149193548387096,
"grad_norm": 1.3603322316139836,
"learning_rate": 9.358854848379034e-06,
"loss": 0.3802945613861084,
"memory(GiB)": 36.98,
"step": 915,
"token_acc": 0.8738487301144292,
"train_speed(iter/s)": 0.138443
},
{
"epoch": 0.6182795698924731,
"grad_norm": 1.352541823908552,
"learning_rate": 9.349750021108212e-06,
"loss": 0.3642538785934448,
"memory(GiB)": 36.98,
"step": 920,
"token_acc": 0.854089709762533,
"train_speed(iter/s)": 0.138538
},
{
"epoch": 0.6216397849462365,
"grad_norm": 0.9306176011999149,
"learning_rate": 9.340585494179412e-06,
"loss": 0.35754919052124023,
"memory(GiB)": 36.98,
"step": 925,
"token_acc": 0.9050704225352113,
"train_speed(iter/s)": 0.13871
},
{
"epoch": 0.625,
"grad_norm": 1.1564276522228065,
"learning_rate": 9.331361393374373e-06,
"loss": 0.3721582889556885,
"memory(GiB)": 36.98,
"step": 930,
"token_acc": 0.8556311413454271,
"train_speed(iter/s)": 0.138847
},
{
"epoch": 0.6283602150537635,
"grad_norm": 1.198966805981438,
"learning_rate": 9.322077845292476e-06,
"loss": 0.3688213348388672,
"memory(GiB)": 36.98,
"step": 935,
"token_acc": 0.859504132231405,
"train_speed(iter/s)": 0.139024
},
{
"epoch": 0.6317204301075269,
"grad_norm": 1.0471663636235522,
"learning_rate": 9.31273497734901e-06,
"loss": 0.3661798477172852,
"memory(GiB)": 36.98,
"step": 940,
"token_acc": 0.87492762015055,
"train_speed(iter/s)": 0.139147
},
{
"epoch": 0.6350806451612904,
"grad_norm": 1.1434543165368571,
"learning_rate": 9.303332917773412e-06,
"loss": 0.378633451461792,
"memory(GiB)": 36.98,
"step": 945,
"token_acc": 0.8751429224788475,
"train_speed(iter/s)": 0.139348
},
{
"epoch": 0.6384408602150538,
"grad_norm": 1.4342447965175822,
"learning_rate": 9.293871795607527e-06,
"loss": 0.3892825603485107,
"memory(GiB)": 36.98,
"step": 950,
"token_acc": 0.8776422764227643,
"train_speed(iter/s)": 0.13948
},
{
"epoch": 0.6418010752688172,
"grad_norm": 1.2379295477470091,
"learning_rate": 9.284351740703817e-06,
"loss": 0.3719215154647827,
"memory(GiB)": 36.98,
"step": 955,
"token_acc": 0.8418898174995736,
"train_speed(iter/s)": 0.139663
},
{
"epoch": 0.6451612903225806,
"grad_norm": 1.1402428987562707,
"learning_rate": 9.274772883723587e-06,
"loss": 0.3425445079803467,
"memory(GiB)": 36.98,
"step": 960,
"token_acc": 0.8607703281027104,
"train_speed(iter/s)": 0.139765
},
{
"epoch": 0.6485215053763441,
"grad_norm": 1.2256536961952824,
"learning_rate": 9.265135356135195e-06,
"loss": 0.35796480178833007,
"memory(GiB)": 36.98,
"step": 965,
"token_acc": 0.8771626297577855,
"train_speed(iter/s)": 0.139872
},
{
"epoch": 0.6518817204301075,
"grad_norm": 1.1798596505310708,
"learning_rate": 9.25543929021224e-06,
"loss": 0.3797889232635498,
"memory(GiB)": 36.98,
"step": 970,
"token_acc": 0.86508155930329,
"train_speed(iter/s)": 0.13994
},
{
"epoch": 0.655241935483871,
"grad_norm": 1.3153118732048075,
"learning_rate": 9.245684819031757e-06,
"loss": 0.3661204814910889,
"memory(GiB)": 36.98,
"step": 975,
"token_acc": 0.8765514184397163,
"train_speed(iter/s)": 0.140006
},
{
"epoch": 0.6586021505376344,
"grad_norm": 0.9790199677836383,
"learning_rate": 9.235872076472378e-06,
"loss": 0.35305283069610593,
"memory(GiB)": 36.98,
"step": 980,
"token_acc": 0.8554982135768162,
"train_speed(iter/s)": 0.140074
},
{
"epoch": 0.6619623655913979,
"grad_norm": 1.342098542147494,
"learning_rate": 9.226001197212505e-06,
"loss": 0.35271439552307127,
"memory(GiB)": 36.98,
"step": 985,
"token_acc": 0.8696958317686819,
"train_speed(iter/s)": 0.140211
},
{
"epoch": 0.6653225806451613,
"grad_norm": 1.168743571907197,
"learning_rate": 9.216072316728453e-06,
"loss": 0.3402960538864136,
"memory(GiB)": 36.98,
"step": 990,
"token_acc": 0.8855403348554034,
"train_speed(iter/s)": 0.140343
},
{
"epoch": 0.6686827956989247,
"grad_norm": 1.3186904670113182,
"learning_rate": 9.2060855712926e-06,
"loss": 0.3960063934326172,
"memory(GiB)": 36.98,
"step": 995,
"token_acc": 0.860874062580812,
"train_speed(iter/s)": 0.140371
},
{
"epoch": 0.6720430107526881,
"grad_norm": 1.3153391708535236,
"learning_rate": 9.196041097971509e-06,
"loss": 0.3522847890853882,
"memory(GiB)": 36.98,
"step": 1000,
"token_acc": 0.8593790764414296,
"train_speed(iter/s)": 0.140526
},
{
"epoch": 0.6720430107526881,
"eval_loss": 0.3627634346485138,
"eval_runtime": 22.2222,
"eval_samples_per_second": 21.195,
"eval_steps_per_second": 2.655,
"eval_token_acc": 0.8697251043108413,
"step": 1000
},
{
"epoch": 0.6754032258064516,
"grad_norm": 1.6648875853015621,
"learning_rate": 9.185939034624048e-06,
"loss": 0.3742126226425171,
"memory(GiB)": 36.98,
"step": 1005,
"token_acc": 0.8709945104046981,
"train_speed(iter/s)": 0.136625
},
{
"epoch": 0.678763440860215,
"grad_norm": 0.9496792330923782,
"learning_rate": 9.175779519899504e-06,
"loss": 0.36241965293884276,
"memory(GiB)": 36.98,
"step": 1010,
"token_acc": 0.878876404494382,
"train_speed(iter/s)": 0.136756
},
{
"epoch": 0.6821236559139785,
"grad_norm": 1.159148138819774,
"learning_rate": 9.165562693235671e-06,
"loss": 0.34667220115661623,
"memory(GiB)": 36.98,
"step": 1015,
"token_acc": 0.8697904933543591,
"train_speed(iter/s)": 0.136863
},
{
"epoch": 0.6854838709677419,
"grad_norm": 1.5210774238674614,
"learning_rate": 9.155288694856942e-06,
"loss": 0.3921034574508667,
"memory(GiB)": 36.98,
"step": 1020,
"token_acc": 0.8514568158168574,
"train_speed(iter/s)": 0.136995
},
{
"epoch": 0.6888440860215054,
"grad_norm": 1.4036032954503623,
"learning_rate": 9.144957665772383e-06,
"loss": 0.3707906246185303,
"memory(GiB)": 36.98,
"step": 1025,
"token_acc": 0.8549953022236142,
"train_speed(iter/s)": 0.137102
},
{
"epoch": 0.6922043010752689,
"grad_norm": 1.2363135604055617,
"learning_rate": 9.134569747773799e-06,
"loss": 0.3501997709274292,
"memory(GiB)": 36.98,
"step": 1030,
"token_acc": 0.8955597248280175,
"train_speed(iter/s)": 0.137218
},
{
"epoch": 0.6955645161290323,
"grad_norm": 1.2031587722298478,
"learning_rate": 9.124125083433785e-06,
"loss": 0.35233426094055176,
"memory(GiB)": 36.98,
"step": 1035,
"token_acc": 0.8718586713112715,
"train_speed(iter/s)": 0.137257
},
{
"epoch": 0.6989247311827957,
"grad_norm": 1.194003310180007,
"learning_rate": 9.113623816103775e-06,
"loss": 0.38490521907806396,
"memory(GiB)": 36.98,
"step": 1040,
"token_acc": 0.8675508720930233,
"train_speed(iter/s)": 0.137309
},
{
"epoch": 0.7022849462365591,
"grad_norm": 1.1443958062714952,
"learning_rate": 9.103066089912062e-06,
"loss": 0.37537660598754885,
"memory(GiB)": 36.98,
"step": 1045,
"token_acc": 0.8820490744726647,
"train_speed(iter/s)": 0.137442
},
{
"epoch": 0.7056451612903226,
"grad_norm": 1.4564894479548467,
"learning_rate": 9.092452049761837e-06,
"loss": 0.36408531665802,
"memory(GiB)": 36.98,
"step": 1050,
"token_acc": 0.8774496995035276,
"train_speed(iter/s)": 0.137483
},
{
"epoch": 0.709005376344086,
"grad_norm": 1.2480131437713693,
"learning_rate": 9.081781841329186e-06,
"loss": 0.3492724895477295,
"memory(GiB)": 36.98,
"step": 1055,
"token_acc": 0.8737158606865447,
"train_speed(iter/s)": 0.137587
},
{
"epoch": 0.7123655913978495,
"grad_norm": 1.1879780995611289,
"learning_rate": 9.071055611061102e-06,
"loss": 0.3583995819091797,
"memory(GiB)": 36.98,
"step": 1060,
"token_acc": 0.8709024686361797,
"train_speed(iter/s)": 0.137746
},
{
"epoch": 0.7157258064516129,
"grad_norm": 1.167689141740901,
"learning_rate": 9.06027350617346e-06,
"loss": 0.3558779716491699,
"memory(GiB)": 36.98,
"step": 1065,
"token_acc": 0.8789541639767592,
"train_speed(iter/s)": 0.137819
},
{
"epoch": 0.7190860215053764,
"grad_norm": 1.3103844035853496,
"learning_rate": 9.049435674649012e-06,
"loss": 0.3494231700897217,
"memory(GiB)": 36.98,
"step": 1070,
"token_acc": 0.875264910687254,
"train_speed(iter/s)": 0.13791
},
{
"epoch": 0.7224462365591398,
"grad_norm": 1.447881789596697,
"learning_rate": 9.038542265235353e-06,
"loss": 0.37793238162994386,
"memory(GiB)": 36.98,
"step": 1075,
"token_acc": 0.8793576184880533,
"train_speed(iter/s)": 0.137992
},
{
"epoch": 0.7258064516129032,
"grad_norm": 1.1420570268637051,
"learning_rate": 9.027593427442867e-06,
"loss": 0.3852741241455078,
"memory(GiB)": 36.98,
"step": 1080,
"token_acc": 0.8560224089635854,
"train_speed(iter/s)": 0.138133
},
{
"epoch": 0.7291666666666666,
"grad_norm": 0.9932383409822715,
"learning_rate": 9.01658931154269e-06,
"loss": 0.37255063056945803,
"memory(GiB)": 36.98,
"step": 1085,
"token_acc": 0.8839645447219984,
"train_speed(iter/s)": 0.138268
},
{
"epoch": 0.7325268817204301,
"grad_norm": 1.189078778252946,
"learning_rate": 9.005530068564641e-06,
"loss": 0.365185284614563,
"memory(GiB)": 36.98,
"step": 1090,
"token_acc": 0.8813782219884272,
"train_speed(iter/s)": 0.138348
},
{
"epoch": 0.7358870967741935,
"grad_norm": 1.157742839318529,
"learning_rate": 8.994415850295148e-06,
"loss": 0.35784361362457273,
"memory(GiB)": 36.98,
"step": 1095,
"token_acc": 0.8647392875580795,
"train_speed(iter/s)": 0.138449
},
{
"epoch": 0.739247311827957,
"grad_norm": 1.220404878981132,
"learning_rate": 8.98324680927517e-06,
"loss": 0.373700475692749,
"memory(GiB)": 36.98,
"step": 1100,
"token_acc": 0.8641396933560477,
"train_speed(iter/s)": 0.138546
},
{
"epoch": 0.739247311827957,
"eval_loss": 0.35993626713752747,
"eval_runtime": 22.2705,
"eval_samples_per_second": 21.149,
"eval_steps_per_second": 2.649,
"eval_token_acc": 0.8690179121667357,
"step": 1100
},
{
"epoch": 0.7426075268817204,
"grad_norm": 1.027962609432086,
"learning_rate": 8.972023098798095e-06,
"loss": 0.40328526496887207,
"memory(GiB)": 36.98,
"step": 1105,
"token_acc": 0.8727120885239056,
"train_speed(iter/s)": 0.135213
},
{
"epoch": 0.7459677419354839,
"grad_norm": 1.1113471488531306,
"learning_rate": 8.960744872907645e-06,
"loss": 0.38501567840576173,
"memory(GiB)": 36.98,
"step": 1110,
"token_acc": 0.8484187568157033,
"train_speed(iter/s)": 0.135318
},
{
"epoch": 0.7493279569892473,
"grad_norm": 1.1306811125471197,
"learning_rate": 8.949412286395755e-06,
"loss": 0.36622910499572753,
"memory(GiB)": 36.98,
"step": 1115,
"token_acc": 0.8693091732729332,
"train_speed(iter/s)": 0.135432
},
{
"epoch": 0.7526881720430108,
"grad_norm": 1.129836101156622,
"learning_rate": 8.938025494800454e-06,
"loss": 0.39587087631225587,
"memory(GiB)": 36.98,
"step": 1120,
"token_acc": 0.8733031674208145,
"train_speed(iter/s)": 0.1355
},
{
"epoch": 0.7560483870967742,
"grad_norm": 1.13107139794073,
"learning_rate": 8.926584654403725e-06,
"loss": 0.3494365692138672,
"memory(GiB)": 36.98,
"step": 1125,
"token_acc": 0.8773690078037905,
"train_speed(iter/s)": 0.13563
},
{
"epoch": 0.7594086021505376,
"grad_norm": 1.2782802692395279,
"learning_rate": 8.915089922229365e-06,
"loss": 0.3868433952331543,
"memory(GiB)": 36.98,
"step": 1130,
"token_acc": 0.8728874077600571,
"train_speed(iter/s)": 0.135686
},
{
"epoch": 0.7627688172043011,
"grad_norm": 1.1154758878343582,
"learning_rate": 8.903541456040825e-06,
"loss": 0.3632142782211304,
"memory(GiB)": 36.98,
"step": 1135,
"token_acc": 0.8736906962415281,
"train_speed(iter/s)": 0.13579
},
{
"epoch": 0.7661290322580645,
"grad_norm": 1.2723668159978256,
"learning_rate": 8.891939414339048e-06,
"loss": 0.36965441703796387,
"memory(GiB)": 36.98,
"step": 1140,
"token_acc": 0.8772372069573985,
"train_speed(iter/s)": 0.135898
},
{
"epoch": 0.769489247311828,
"grad_norm": 1.154588616850164,
"learning_rate": 8.880283956360297e-06,
"loss": 0.37631180286407473,
"memory(GiB)": 36.98,
"step": 1145,
"token_acc": 0.8772019402604033,
"train_speed(iter/s)": 0.135982
},
{
"epoch": 0.7728494623655914,
"grad_norm": 1.170600321406014,
"learning_rate": 8.868575242073954e-06,
"loss": 0.37340292930603025,
"memory(GiB)": 36.98,
"step": 1150,
"token_acc": 0.8703907539900936,
"train_speed(iter/s)": 0.136084
},
{
"epoch": 0.7762096774193549,
"grad_norm": 1.231802273714498,
"learning_rate": 8.856813432180349e-06,
"loss": 0.35198609828948973,
"memory(GiB)": 36.98,
"step": 1155,
"token_acc": 0.887806411062225,
"train_speed(iter/s)": 0.136204
},
{
"epoch": 0.7795698924731183,
"grad_norm": 1.1688665900475064,
"learning_rate": 8.844998688108535e-06,
"loss": 0.3763306140899658,
"memory(GiB)": 36.98,
"step": 1160,
"token_acc": 0.873224618621778,
"train_speed(iter/s)": 0.136285
},
{
"epoch": 0.7829301075268817,
"grad_norm": 1.2175946582909498,
"learning_rate": 8.833131172014075e-06,
"loss": 0.3766140937805176,
"memory(GiB)": 36.98,
"step": 1165,
"token_acc": 0.8513879485443466,
"train_speed(iter/s)": 0.136437
},
{
"epoch": 0.7862903225806451,
"grad_norm": 1.0227148148425746,
"learning_rate": 8.821211046776828e-06,
"loss": 0.3676277160644531,
"memory(GiB)": 36.98,
"step": 1170,
"token_acc": 0.8948380010982976,
"train_speed(iter/s)": 0.136557
},
{
"epoch": 0.7896505376344086,
"grad_norm": 1.1429501203864452,
"learning_rate": 8.809238475998699e-06,
"loss": 0.36184470653533934,
"memory(GiB)": 36.98,
"step": 1175,
"token_acc": 0.8679896462467644,
"train_speed(iter/s)": 0.136615
},
{
"epoch": 0.793010752688172,
"grad_norm": 1.2033359899326155,
"learning_rate": 8.797213624001403e-06,
"loss": 0.3503121852874756,
"memory(GiB)": 36.98,
"step": 1180,
"token_acc": 0.8806179775280899,
"train_speed(iter/s)": 0.136717
},
{
"epoch": 0.7963709677419355,
"grad_norm": 1.0758929439541713,
"learning_rate": 8.785136655824208e-06,
"loss": 0.3926861763000488,
"memory(GiB)": 36.98,
"step": 1185,
"token_acc": 0.8738359867828177,
"train_speed(iter/s)": 0.136848
},
{
"epoch": 0.7997311827956989,
"grad_norm": 1.3930864885101153,
"learning_rate": 8.773007737221661e-06,
"loss": 0.36632988452911375,
"memory(GiB)": 36.98,
"step": 1190,
"token_acc": 0.8565597667638484,
"train_speed(iter/s)": 0.13695
},
{
"epoch": 0.8030913978494624,
"grad_norm": 1.2851603723997174,
"learning_rate": 8.760827034661333e-06,
"loss": 0.37504141330718993,
"memory(GiB)": 36.98,
"step": 1195,
"token_acc": 0.8499542543458372,
"train_speed(iter/s)": 0.137062
},
{
"epoch": 0.8064516129032258,
"grad_norm": 1.1404438471085583,
"learning_rate": 8.748594715321512e-06,
"loss": 0.37414093017578126,
"memory(GiB)": 36.98,
"step": 1200,
"token_acc": 0.8452418096723869,
"train_speed(iter/s)": 0.137153
},
{
"epoch": 0.8064516129032258,
"eval_loss": 0.3591736853122711,
"eval_runtime": 22.3034,
"eval_samples_per_second": 21.118,
"eval_steps_per_second": 2.645,
"eval_token_acc": 0.8695634603921886,
"step": 1200
},
{
"epoch": 0.8098118279569892,
"grad_norm": 1.120175429790956,
"learning_rate": 8.736310947088925e-06,
"loss": 0.364569354057312,
"memory(GiB)": 36.98,
"step": 1205,
"token_acc": 0.8810635780559188,
"train_speed(iter/s)": 0.134072
},
{
"epoch": 0.8131720430107527,
"grad_norm": 1.1584760304462782,
"learning_rate": 8.723975898556418e-06,
"loss": 0.3694889545440674,
"memory(GiB)": 36.98,
"step": 1210,
"token_acc": 0.8554807103490508,
"train_speed(iter/s)": 0.134198
},
{
"epoch": 0.8165322580645161,
"grad_norm": 1.422435654534571,
"learning_rate": 8.711589739020666e-06,
"loss": 0.4119666576385498,
"memory(GiB)": 36.98,
"step": 1215,
"token_acc": 0.8824604141291108,
"train_speed(iter/s)": 0.134323
},
{
"epoch": 0.8198924731182796,
"grad_norm": 1.3248122307528065,
"learning_rate": 8.699152638479817e-06,
"loss": 0.35853800773620603,
"memory(GiB)": 36.98,
"step": 1220,
"token_acc": 0.8483437779767233,
"train_speed(iter/s)": 0.134403
},
{
"epoch": 0.823252688172043,
"grad_norm": 1.280124494652183,
"learning_rate": 8.686664767631194e-06,
"loss": 0.3697690486907959,
"memory(GiB)": 36.98,
"step": 1225,
"token_acc": 0.8864232398628202,
"train_speed(iter/s)": 0.134529
},
{
"epoch": 0.8266129032258065,
"grad_norm": 1.451060096605754,
"learning_rate": 8.67412629786892e-06,
"loss": 0.3914652347564697,
"memory(GiB)": 36.98,
"step": 1230,
"token_acc": 0.8521779425393883,
"train_speed(iter/s)": 0.134562
},
{
"epoch": 0.8299731182795699,
"grad_norm": 1.0228658030093372,
"learning_rate": 8.661537401281592e-06,
"loss": 0.3691814422607422,
"memory(GiB)": 36.98,
"step": 1235,
"token_acc": 0.8398544131028207,
"train_speed(iter/s)": 0.134623
},
{
"epoch": 0.8333333333333334,
"grad_norm": 1.2177785216419144,
"learning_rate": 8.6488982506499e-06,
"loss": 0.3742197275161743,
"memory(GiB)": 36.98,
"step": 1240,
"token_acc": 0.8786339754816113,
"train_speed(iter/s)": 0.134723
},
{
"epoch": 0.8366935483870968,
"grad_norm": 1.2600614569860304,
"learning_rate": 8.636209019444266e-06,
"loss": 0.3671316146850586,
"memory(GiB)": 36.98,
"step": 1245,
"token_acc": 0.8633489700183561,
"train_speed(iter/s)": 0.134806
},
{
"epoch": 0.8400537634408602,
"grad_norm": 1.2097597846514752,
"learning_rate": 8.623469881822459e-06,
"loss": 0.35306849479675295,
"memory(GiB)": 36.98,
"step": 1250,
"token_acc": 0.867431315017338,
"train_speed(iter/s)": 0.134879
},
{
"epoch": 0.8434139784946236,
"grad_norm": 1.3489602392669622,
"learning_rate": 8.610681012627206e-06,
"loss": 0.3992285966873169,
"memory(GiB)": 36.98,
"step": 1255,
"token_acc": 0.8585302686953706,
"train_speed(iter/s)": 0.135015
},
{
"epoch": 0.8467741935483871,
"grad_norm": 1.1284388647627097,
"learning_rate": 8.597842587383797e-06,
"loss": 0.35171847343444823,
"memory(GiB)": 36.98,
"step": 1260,
"token_acc": 0.883495145631068,
"train_speed(iter/s)": 0.135129
},
{
"epoch": 0.8501344086021505,
"grad_norm": 1.1397663299803389,
"learning_rate": 8.584954782297664e-06,
"loss": 0.3620689153671265,
"memory(GiB)": 36.98,
"step": 1265,
"token_acc": 0.8496460824993898,
"train_speed(iter/s)": 0.135164
},
{
"epoch": 0.853494623655914,
"grad_norm": 1.567282299034232,
"learning_rate": 8.572017774251975e-06,
"loss": 0.3426519870758057,
"memory(GiB)": 36.98,
"step": 1270,
"token_acc": 0.8790885703785373,
"train_speed(iter/s)": 0.13528
},
{
"epoch": 0.8568548387096774,
"grad_norm": 1.024146079054877,
"learning_rate": 8.559031740805197e-06,
"loss": 0.3614107608795166,
"memory(GiB)": 36.98,
"step": 1275,
"token_acc": 0.8775425487754255,
"train_speed(iter/s)": 0.135367
},
{
"epoch": 0.8602150537634409,
"grad_norm": 1.2877357462438697,
"learning_rate": 8.545996860188668e-06,
"loss": 0.36385910511016845,
"memory(GiB)": 36.98,
"step": 1280,
"token_acc": 0.8840579710144928,
"train_speed(iter/s)": 0.135448
},
{
"epoch": 0.8635752688172043,
"grad_norm": 0.9919794513523525,
"learning_rate": 8.53291331130414e-06,
"loss": 0.3696147441864014,
"memory(GiB)": 36.98,
"step": 1285,
"token_acc": 0.8672872340425531,
"train_speed(iter/s)": 0.135528
},
{
"epoch": 0.8669354838709677,
"grad_norm": 1.2702352256267997,
"learning_rate": 8.519781273721337e-06,
"loss": 0.37546935081481936,
"memory(GiB)": 36.98,
"step": 1290,
"token_acc": 0.8561872909698997,
"train_speed(iter/s)": 0.135644
},
{
"epoch": 0.8702956989247311,
"grad_norm": 1.1454414254082632,
"learning_rate": 8.506600927675479e-06,
"loss": 0.3705191373825073,
"memory(GiB)": 36.98,
"step": 1295,
"token_acc": 0.8777910685805422,
"train_speed(iter/s)": 0.135768
},
{
"epoch": 0.8736559139784946,
"grad_norm": 1.0281894853631854,
"learning_rate": 8.493372454064809e-06,
"loss": 0.3680076599121094,
"memory(GiB)": 36.98,
"step": 1300,
"token_acc": 0.8766331658291457,
"train_speed(iter/s)": 0.135882
},
{
"epoch": 0.8736559139784946,
"eval_loss": 0.35564836859703064,
"eval_runtime": 22.2867,
"eval_samples_per_second": 21.134,
"eval_steps_per_second": 2.647,
"eval_token_acc": 0.8703110635159573,
"step": 1300
},
{
"epoch": 0.8770161290322581,
"grad_norm": 1.1119653997133574,
"learning_rate": 8.480096034448118e-06,
"loss": 0.38275277614593506,
"memory(GiB)": 36.98,
"step": 1305,
"token_acc": 0.879605039155601,
"train_speed(iter/s)": 0.133094
},
{
"epoch": 0.8803763440860215,
"grad_norm": 1.3654028882674394,
"learning_rate": 8.46677185104225e-06,
"loss": 0.3956867218017578,
"memory(GiB)": 36.98,
"step": 1310,
"token_acc": 0.8529342997428104,
"train_speed(iter/s)": 0.133217
},
{
"epoch": 0.883736559139785,
"grad_norm": 1.446764466047648,
"learning_rate": 8.453400086719595e-06,
"loss": 0.3735336780548096,
"memory(GiB)": 36.98,
"step": 1315,
"token_acc": 0.8798710024187046,
"train_speed(iter/s)": 0.13335
},
{
"epoch": 0.8870967741935484,
"grad_norm": 1.4121926531149445,
"learning_rate": 8.439980925005587e-06,
"loss": 0.3659780502319336,
"memory(GiB)": 36.98,
"step": 1320,
"token_acc": 0.883887957864496,
"train_speed(iter/s)": 0.133469
},
{
"epoch": 0.8904569892473119,
"grad_norm": 1.164980050744471,
"learning_rate": 8.426514550076179e-06,
"loss": 0.3616935729980469,
"memory(GiB)": 36.98,
"step": 1325,
"token_acc": 0.8850448872345085,
"train_speed(iter/s)": 0.133589
},
{
"epoch": 0.8938172043010753,
"grad_norm": 1.0691472658135925,
"learning_rate": 8.413001146755322e-06,
"loss": 0.35316460132598876,
"memory(GiB)": 36.98,
"step": 1330,
"token_acc": 0.8866596268919394,
"train_speed(iter/s)": 0.133666
},
{
"epoch": 0.8971774193548387,
"grad_norm": 1.0994724914400242,
"learning_rate": 8.399440900512426e-06,
"loss": 0.3949526309967041,
"memory(GiB)": 36.98,
"step": 1335,
"token_acc": 0.8785682916117699,
"train_speed(iter/s)": 0.133743
},
{
"epoch": 0.9005376344086021,
"grad_norm": 1.1861769349247893,
"learning_rate": 8.385833997459804e-06,
"loss": 0.3643625259399414,
"memory(GiB)": 36.98,
"step": 1340,
"token_acc": 0.9060324825986079,
"train_speed(iter/s)": 0.133823
},
{
"epoch": 0.9038978494623656,
"grad_norm": 1.1612362502540743,
"learning_rate": 8.372180624350135e-06,
"loss": 0.3484476089477539,
"memory(GiB)": 36.98,
"step": 1345,
"token_acc": 0.891963426371511,
"train_speed(iter/s)": 0.133922
},
{
"epoch": 0.907258064516129,
"grad_norm": 1.3879852587828685,
"learning_rate": 8.358480968573891e-06,
"loss": 0.3419426441192627,
"memory(GiB)": 36.98,
"step": 1350,
"token_acc": 0.884161752316765,
"train_speed(iter/s)": 0.133959
},
{
"epoch": 0.9106182795698925,
"grad_norm": 1.2332875378045212,
"learning_rate": 8.344735218156765e-06,
"loss": 0.3734895706176758,
"memory(GiB)": 36.98,
"step": 1355,
"token_acc": 0.8606686111734351,
"train_speed(iter/s)": 0.134034
},
{
"epoch": 0.9139784946236559,
"grad_norm": 1.1771957299877707,
"learning_rate": 8.330943561757092e-06,
"loss": 0.3600280284881592,
"memory(GiB)": 36.98,
"step": 1360,
"token_acc": 0.8553880513679508,
"train_speed(iter/s)": 0.134132
},
{
"epoch": 0.9173387096774194,
"grad_norm": 1.0255370794800907,
"learning_rate": 8.31710618866326e-06,
"loss": 0.35778398513793946,
"memory(GiB)": 36.98,
"step": 1365,
"token_acc": 0.9062277580071174,
"train_speed(iter/s)": 0.134223
},
{
"epoch": 0.9206989247311828,
"grad_norm": 1.1122296528253035,
"learning_rate": 8.303223288791111e-06,
"loss": 0.3720524311065674,
"memory(GiB)": 36.98,
"step": 1370,
"token_acc": 0.8743216223936018,
"train_speed(iter/s)": 0.134348
},
{
"epoch": 0.9240591397849462,
"grad_norm": 1.3439572837247773,
"learning_rate": 8.289295052681338e-06,
"loss": 0.3542968273162842,
"memory(GiB)": 36.98,
"step": 1375,
"token_acc": 0.8697758933979406,
"train_speed(iter/s)": 0.134431
},
{
"epoch": 0.9274193548387096,
"grad_norm": 1.3514644861757483,
"learning_rate": 8.275321671496862e-06,
"loss": 0.35950050354003904,
"memory(GiB)": 36.98,
"step": 1380,
"token_acc": 0.8709907341411262,
"train_speed(iter/s)": 0.134551
},
{
"epoch": 0.9307795698924731,
"grad_norm": 1.0141397599722326,
"learning_rate": 8.26130333702022e-06,
"loss": 0.3641530990600586,
"memory(GiB)": 36.98,
"step": 1385,
"token_acc": 0.8553926118433265,
"train_speed(iter/s)": 0.134571
},
{
"epoch": 0.9341397849462365,
"grad_norm": 1.2175959834253216,
"learning_rate": 8.247240241650918e-06,
"loss": 0.3760999202728271,
"memory(GiB)": 36.98,
"step": 1390,
"token_acc": 0.8769605191995673,
"train_speed(iter/s)": 0.134653
},
{
"epoch": 0.9375,
"grad_norm": 1.294717847990162,
"learning_rate": 8.233132578402808e-06,
"loss": 0.3714743614196777,
"memory(GiB)": 36.98,
"step": 1395,
"token_acc": 0.8819672131147541,
"train_speed(iter/s)": 0.134765
},
{
"epoch": 0.9408602150537635,
"grad_norm": 1.2980876103734693,
"learning_rate": 8.218980540901417e-06,
"loss": 0.3365382194519043,
"memory(GiB)": 36.98,
"step": 1400,
"token_acc": 0.8853006681514477,
"train_speed(iter/s)": 0.13485
},
{
"epoch": 0.9408602150537635,
"eval_loss": 0.3554946184158325,
"eval_runtime": 22.3352,
"eval_samples_per_second": 21.088,
"eval_steps_per_second": 2.642,
"eval_token_acc": 0.8695735631371043,
"step": 1400
}
],
"logging_steps": 5,
"max_steps": 4464,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 171818621501440.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}