shulijia's picture
Training in progress, step 2000, checkpoint
87c8dde verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.849759401176261,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014257708073427196,
"grad_norm": 80.31977081298828,
"learning_rate": 4.265402843601896e-07,
"loss": 1.7215,
"mean_token_accuracy": 0.4386619367171079,
"num_tokens": 81920.0,
"step": 10
},
{
"epoch": 0.02851541614685439,
"grad_norm": 5.226085186004639,
"learning_rate": 9.004739336492892e-07,
"loss": 0.4476,
"mean_token_accuracy": 0.7448018610477447,
"num_tokens": 163840.0,
"step": 20
},
{
"epoch": 0.04277312422028159,
"grad_norm": 1.4328534603118896,
"learning_rate": 1.3744075829383887e-06,
"loss": 0.293,
"mean_token_accuracy": 0.7609466724097729,
"num_tokens": 245760.0,
"step": 30
},
{
"epoch": 0.05703083229370878,
"grad_norm": 1.468935251235962,
"learning_rate": 1.8483412322274883e-06,
"loss": 0.2499,
"mean_token_accuracy": 0.7796599786728621,
"num_tokens": 327680.0,
"step": 40
},
{
"epoch": 0.07128854036713599,
"grad_norm": 1.6373568773269653,
"learning_rate": 2.322274881516588e-06,
"loss": 0.225,
"mean_token_accuracy": 0.7888087052851915,
"num_tokens": 409600.0,
"step": 50
},
{
"epoch": 0.08554624844056317,
"grad_norm": 1.3717502355575562,
"learning_rate": 2.7962085308056874e-06,
"loss": 0.2301,
"mean_token_accuracy": 0.7796599812805652,
"num_tokens": 491520.0,
"step": 60
},
{
"epoch": 0.09980395651399038,
"grad_norm": 1.4575324058532715,
"learning_rate": 3.2701421800947867e-06,
"loss": 0.2165,
"mean_token_accuracy": 0.7822773970663548,
"num_tokens": 573440.0,
"step": 70
},
{
"epoch": 0.11406166458741757,
"grad_norm": 1.7066882848739624,
"learning_rate": 3.7440758293838865e-06,
"loss": 0.2484,
"mean_token_accuracy": 0.7653864935040474,
"num_tokens": 655360.0,
"step": 80
},
{
"epoch": 0.12831937266084478,
"grad_norm": 1.3280051946640015,
"learning_rate": 4.218009478672986e-06,
"loss": 0.2092,
"mean_token_accuracy": 0.8033390413969755,
"num_tokens": 737280.0,
"step": 90
},
{
"epoch": 0.14257708073427197,
"grad_norm": 1.1487761735916138,
"learning_rate": 4.691943127962086e-06,
"loss": 0.2303,
"step": 100
},
{
"epoch": 0.14257708073427197,
"eval_loss": 0.4363424479961395,
"eval_mean_token_accuracy": 0.9056754631873889,
"eval_num_tokens": 819200.0,
"eval_runtime": 41.3394,
"eval_samples_per_second": 30.165,
"eval_steps_per_second": 1.887,
"step": 100
},
{
"epoch": 0.15683478880769916,
"grad_norm": 1.2161787748336792,
"learning_rate": 5.165876777251185e-06,
"loss": 0.1977,
"mean_token_accuracy": 0.7886374732479453,
"num_tokens": 901120.0,
"step": 110
},
{
"epoch": 0.17109249688112635,
"grad_norm": 1.2622631788253784,
"learning_rate": 5.639810426540285e-06,
"loss": 0.272,
"mean_token_accuracy": 0.7660469669848681,
"num_tokens": 983040.0,
"step": 120
},
{
"epoch": 0.18535020495455357,
"grad_norm": 1.6876330375671387,
"learning_rate": 6.1137440758293845e-06,
"loss": 0.2425,
"mean_token_accuracy": 0.7557363010942936,
"num_tokens": 1064960.0,
"step": 130
},
{
"epoch": 0.19960791302798075,
"grad_norm": 1.7104541063308716,
"learning_rate": 6.587677725118484e-06,
"loss": 0.2536,
"mean_token_accuracy": 0.7589774951338768,
"num_tokens": 1146880.0,
"step": 140
},
{
"epoch": 0.21386562110140794,
"grad_norm": 1.7957197427749634,
"learning_rate": 7.061611374407583e-06,
"loss": 0.2307,
"mean_token_accuracy": 0.7757460869848728,
"num_tokens": 1228800.0,
"step": 150
},
{
"epoch": 0.22812332917483513,
"grad_norm": 1.3486217260360718,
"learning_rate": 7.535545023696683e-06,
"loss": 0.2096,
"mean_token_accuracy": 0.8065435424447059,
"num_tokens": 1310720.0,
"step": 160
},
{
"epoch": 0.24238103724826235,
"grad_norm": 1.7759106159210205,
"learning_rate": 8.009478672985783e-06,
"loss": 0.2394,
"mean_token_accuracy": 0.7681506846100092,
"num_tokens": 1392640.0,
"step": 170
},
{
"epoch": 0.25663874532168957,
"grad_norm": 1.4754517078399658,
"learning_rate": 8.483412322274883e-06,
"loss": 0.2284,
"mean_token_accuracy": 0.7852617405354977,
"num_tokens": 1474560.0,
"step": 180
},
{
"epoch": 0.27089645339511675,
"grad_norm": 1.4712748527526855,
"learning_rate": 8.957345971563981e-06,
"loss": 0.1902,
"mean_token_accuracy": 0.7985200572758913,
"num_tokens": 1556480.0,
"step": 190
},
{
"epoch": 0.28515416146854394,
"grad_norm": 1.8678069114685059,
"learning_rate": 9.431279620853082e-06,
"loss": 0.2482,
"step": 200
},
{
"epoch": 0.28515416146854394,
"eval_loss": 0.4305071532726288,
"eval_mean_token_accuracy": 0.9062988207890437,
"eval_num_tokens": 1638400.0,
"eval_runtime": 41.308,
"eval_samples_per_second": 30.188,
"eval_steps_per_second": 1.888,
"step": 200
},
{
"epoch": 0.29941186954197113,
"grad_norm": 1.591244101524353,
"learning_rate": 9.905213270142182e-06,
"loss": 0.219,
"mean_token_accuracy": 0.7843933456577361,
"num_tokens": 1720320.0,
"step": 210
},
{
"epoch": 0.3136695776153983,
"grad_norm": 2.2734687328338623,
"learning_rate": 9.95778364116095e-06,
"loss": 0.2099,
"mean_token_accuracy": 0.7872798424214125,
"num_tokens": 1802240.0,
"step": 220
},
{
"epoch": 0.3279272856888255,
"grad_norm": 1.6370080709457397,
"learning_rate": 9.905013192612138e-06,
"loss": 0.192,
"mean_token_accuracy": 0.7989359103143215,
"num_tokens": 1884160.0,
"step": 230
},
{
"epoch": 0.3421849937622527,
"grad_norm": 1.4441734552383423,
"learning_rate": 9.852242744063325e-06,
"loss": 0.2143,
"mean_token_accuracy": 0.808133564144373,
"num_tokens": 1966080.0,
"step": 240
},
{
"epoch": 0.3564427018356799,
"grad_norm": 1.1315490007400513,
"learning_rate": 9.799472295514513e-06,
"loss": 0.1918,
"mean_token_accuracy": 0.7888331696391105,
"num_tokens": 2048000.0,
"step": 250
},
{
"epoch": 0.37070040990910713,
"grad_norm": 0.9603747725486755,
"learning_rate": 9.7467018469657e-06,
"loss": 0.1924,
"mean_token_accuracy": 0.8039628200232982,
"num_tokens": 2129920.0,
"step": 260
},
{
"epoch": 0.3849581179825343,
"grad_norm": 1.666515827178955,
"learning_rate": 9.693931398416887e-06,
"loss": 0.1782,
"mean_token_accuracy": 0.803094419836998,
"num_tokens": 2211840.0,
"step": 270
},
{
"epoch": 0.3992158260559615,
"grad_norm": 1.8663837909698486,
"learning_rate": 9.641160949868074e-06,
"loss": 0.2374,
"mean_token_accuracy": 0.7656678084284068,
"num_tokens": 2293760.0,
"step": 280
},
{
"epoch": 0.4134735341293887,
"grad_norm": 1.5346232652664185,
"learning_rate": 9.588390501319263e-06,
"loss": 0.2,
"mean_token_accuracy": 0.7928816072642804,
"num_tokens": 2375680.0,
"step": 290
},
{
"epoch": 0.4277312422028159,
"grad_norm": 1.631354570388794,
"learning_rate": 9.53562005277045e-06,
"loss": 0.2238,
"step": 300
},
{
"epoch": 0.4277312422028159,
"eval_loss": 0.42917218804359436,
"eval_mean_token_accuracy": 0.9063309125411205,
"eval_num_tokens": 2457600.0,
"eval_runtime": 41.3603,
"eval_samples_per_second": 30.15,
"eval_steps_per_second": 1.886,
"step": 300
},
{
"epoch": 0.4419889502762431,
"grad_norm": 1.4803297519683838,
"learning_rate": 9.482849604221636e-06,
"loss": 0.2441,
"mean_token_accuracy": 0.7797455977648496,
"num_tokens": 2539520.0,
"step": 310
},
{
"epoch": 0.45624665834967026,
"grad_norm": 1.4234380722045898,
"learning_rate": 9.430079155672825e-06,
"loss": 0.2552,
"mean_token_accuracy": 0.7678449124097824,
"num_tokens": 2621440.0,
"step": 320
},
{
"epoch": 0.4705043664230975,
"grad_norm": 1.5810368061065674,
"learning_rate": 9.37730870712401e-06,
"loss": 0.225,
"mean_token_accuracy": 0.7840631131082774,
"num_tokens": 2703360.0,
"step": 330
},
{
"epoch": 0.4847620744965247,
"grad_norm": 1.540651559829712,
"learning_rate": 9.324538258575199e-06,
"loss": 0.1879,
"mean_token_accuracy": 0.7923556782305241,
"num_tokens": 2785280.0,
"step": 340
},
{
"epoch": 0.4990197825699519,
"grad_norm": 1.7700861692428589,
"learning_rate": 9.271767810026386e-06,
"loss": 0.228,
"mean_token_accuracy": 0.7943493168801069,
"num_tokens": 2867200.0,
"step": 350
},
{
"epoch": 0.5132774906433791,
"grad_norm": 1.2500240802764893,
"learning_rate": 9.218997361477573e-06,
"loss": 0.2557,
"mean_token_accuracy": 0.7765655562281608,
"num_tokens": 2949120.0,
"step": 360
},
{
"epoch": 0.5275351987168063,
"grad_norm": 1.4669880867004395,
"learning_rate": 9.166226912928761e-06,
"loss": 0.2077,
"mean_token_accuracy": 0.7948140885680914,
"num_tokens": 3031040.0,
"step": 370
},
{
"epoch": 0.5417929067902335,
"grad_norm": 1.4171335697174072,
"learning_rate": 9.113456464379948e-06,
"loss": 0.2263,
"mean_token_accuracy": 0.7792441301047802,
"num_tokens": 3112960.0,
"step": 380
},
{
"epoch": 0.5560506148636607,
"grad_norm": 1.6694875955581665,
"learning_rate": 9.060686015831135e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.8029109600931406,
"num_tokens": 3194880.0,
"step": 390
},
{
"epoch": 0.5703083229370879,
"grad_norm": 1.242436170578003,
"learning_rate": 9.007915567282322e-06,
"loss": 0.2046,
"step": 400
},
{
"epoch": 0.5703083229370879,
"eval_loss": 0.4270094931125641,
"eval_mean_token_accuracy": 0.9065381089846293,
"eval_num_tokens": 3276800.0,
"eval_runtime": 41.4607,
"eval_samples_per_second": 30.077,
"eval_steps_per_second": 1.881,
"step": 400
},
{
"epoch": 0.5845660310105151,
"grad_norm": 1.7970621585845947,
"learning_rate": 8.95514511873351e-06,
"loss": 0.2489,
"mean_token_accuracy": 0.7863931017927825,
"num_tokens": 3358720.0,
"step": 410
},
{
"epoch": 0.5988237390839423,
"grad_norm": 1.1633917093276978,
"learning_rate": 8.902374670184698e-06,
"loss": 0.2433,
"mean_token_accuracy": 0.7638698622584343,
"num_tokens": 3440640.0,
"step": 420
},
{
"epoch": 0.6130814471573695,
"grad_norm": 1.1505024433135986,
"learning_rate": 8.849604221635884e-06,
"loss": 0.2055,
"mean_token_accuracy": 0.7901785716414451,
"num_tokens": 3522560.0,
"step": 430
},
{
"epoch": 0.6273391552307966,
"grad_norm": 1.0602478981018066,
"learning_rate": 8.796833773087073e-06,
"loss": 0.2245,
"mean_token_accuracy": 0.7706947140395641,
"num_tokens": 3604480.0,
"step": 440
},
{
"epoch": 0.6415968633042238,
"grad_norm": 1.772160291671753,
"learning_rate": 8.744063324538258e-06,
"loss": 0.1874,
"mean_token_accuracy": 0.8206702545285225,
"num_tokens": 3686400.0,
"step": 450
},
{
"epoch": 0.655854571377651,
"grad_norm": 1.3542793989181519,
"learning_rate": 8.691292875989447e-06,
"loss": 0.1848,
"mean_token_accuracy": 0.804293054714799,
"num_tokens": 3768320.0,
"step": 460
},
{
"epoch": 0.6701122794510782,
"grad_norm": 1.0653384923934937,
"learning_rate": 8.638522427440634e-06,
"loss": 0.235,
"mean_token_accuracy": 0.7650929525494575,
"num_tokens": 3850240.0,
"step": 470
},
{
"epoch": 0.6843699875245054,
"grad_norm": 1.6029295921325684,
"learning_rate": 8.58575197889182e-06,
"loss": 0.1925,
"mean_token_accuracy": 0.8006971623748541,
"num_tokens": 3932160.0,
"step": 480
},
{
"epoch": 0.6986276955979326,
"grad_norm": 1.4445295333862305,
"learning_rate": 8.53298153034301e-06,
"loss": 0.2081,
"mean_token_accuracy": 0.7856042090803385,
"num_tokens": 4014080.0,
"step": 490
},
{
"epoch": 0.7128854036713598,
"grad_norm": 1.2714146375656128,
"learning_rate": 8.480211081794196e-06,
"loss": 0.1993,
"step": 500
},
{
"epoch": 0.7128854036713598,
"eval_loss": 0.4256138503551483,
"eval_mean_token_accuracy": 0.9067788590223361,
"eval_num_tokens": 4096000.0,
"eval_runtime": 41.1978,
"eval_samples_per_second": 30.269,
"eval_steps_per_second": 1.893,
"step": 500
},
{
"epoch": 0.7271431117447871,
"grad_norm": 1.3898621797561646,
"learning_rate": 8.427440633245383e-06,
"loss": 0.2309,
"mean_token_accuracy": 0.7859711354598403,
"num_tokens": 4177920.0,
"step": 510
},
{
"epoch": 0.7414008198182143,
"grad_norm": 1.7414770126342773,
"learning_rate": 8.37467018469657e-06,
"loss": 0.184,
"mean_token_accuracy": 0.8112646777182817,
"num_tokens": 4259840.0,
"step": 520
},
{
"epoch": 0.7556585278916415,
"grad_norm": 1.3587582111358643,
"learning_rate": 8.321899736147759e-06,
"loss": 0.1971,
"mean_token_accuracy": 0.8050636004656553,
"num_tokens": 4341760.0,
"step": 530
},
{
"epoch": 0.7699162359650686,
"grad_norm": 1.324808120727539,
"learning_rate": 8.269129287598946e-06,
"loss": 0.2105,
"mean_token_accuracy": 0.7933341484516859,
"num_tokens": 4423680.0,
"step": 540
},
{
"epoch": 0.7841739440384958,
"grad_norm": 1.2644426822662354,
"learning_rate": 8.216358839050133e-06,
"loss": 0.2382,
"mean_token_accuracy": 0.7831457942724228,
"num_tokens": 4505600.0,
"step": 550
},
{
"epoch": 0.798431652111923,
"grad_norm": 1.3145424127578735,
"learning_rate": 8.16358839050132e-06,
"loss": 0.2081,
"mean_token_accuracy": 0.7934319969266653,
"num_tokens": 4587520.0,
"step": 560
},
{
"epoch": 0.8126893601853502,
"grad_norm": 1.205351710319519,
"learning_rate": 8.110817941952506e-06,
"loss": 0.2078,
"mean_token_accuracy": 0.7768101781606674,
"num_tokens": 4669440.0,
"step": 570
},
{
"epoch": 0.8269470682587774,
"grad_norm": 1.2244083881378174,
"learning_rate": 8.058047493403695e-06,
"loss": 0.2068,
"mean_token_accuracy": 0.7885518573224545,
"num_tokens": 4751360.0,
"step": 580
},
{
"epoch": 0.8412047763322046,
"grad_norm": 1.6164205074310303,
"learning_rate": 8.005277044854882e-06,
"loss": 0.1852,
"mean_token_accuracy": 0.7931751444935798,
"num_tokens": 4833280.0,
"step": 590
},
{
"epoch": 0.8554624844056318,
"grad_norm": 1.507039189338684,
"learning_rate": 7.952506596306069e-06,
"loss": 0.2409,
"step": 600
},
{
"epoch": 0.8554624844056318,
"eval_loss": 0.42402541637420654,
"eval_mean_token_accuracy": 0.9070860980412899,
"eval_num_tokens": 4915200.0,
"eval_runtime": 41.2209,
"eval_samples_per_second": 30.252,
"eval_steps_per_second": 1.892,
"step": 600
},
{
"epoch": 0.869720192479059,
"grad_norm": 1.43611478805542,
"learning_rate": 7.899736147757256e-06,
"loss": 0.1694,
"mean_token_accuracy": 0.7737157551571727,
"num_tokens": 4997120.0,
"step": 610
},
{
"epoch": 0.8839779005524862,
"grad_norm": 1.1565167903900146,
"learning_rate": 7.846965699208444e-06,
"loss": 0.1831,
"mean_token_accuracy": 0.8027274951338768,
"num_tokens": 5079040.0,
"step": 620
},
{
"epoch": 0.8982356086259133,
"grad_norm": 1.421459674835205,
"learning_rate": 7.794195250659631e-06,
"loss": 0.1821,
"mean_token_accuracy": 0.8165117412805557,
"num_tokens": 5160960.0,
"step": 630
},
{
"epoch": 0.9124933166993405,
"grad_norm": 1.2855138778686523,
"learning_rate": 7.741424802110818e-06,
"loss": 0.1795,
"mean_token_accuracy": 0.8006971597671508,
"num_tokens": 5242880.0,
"step": 640
},
{
"epoch": 0.9267510247727678,
"grad_norm": 1.2338076829910278,
"learning_rate": 7.688654353562007e-06,
"loss": 0.1988,
"mean_token_accuracy": 0.7919031333178281,
"num_tokens": 5324800.0,
"step": 650
},
{
"epoch": 0.941008732846195,
"grad_norm": 1.3217254877090454,
"learning_rate": 7.635883905013192e-06,
"loss": 0.2178,
"mean_token_accuracy": 0.7840019542723894,
"num_tokens": 5406720.0,
"step": 660
},
{
"epoch": 0.9552664409196222,
"grad_norm": 1.0036381483078003,
"learning_rate": 7.583113456464381e-06,
"loss": 0.2076,
"mean_token_accuracy": 0.80275196172297,
"num_tokens": 5488640.0,
"step": 670
},
{
"epoch": 0.9695241489930494,
"grad_norm": 1.1931681632995605,
"learning_rate": 7.5303430079155685e-06,
"loss": 0.1649,
"mean_token_accuracy": 0.8191291578114033,
"num_tokens": 5570560.0,
"step": 680
},
{
"epoch": 0.9837818570664766,
"grad_norm": 1.2504717111587524,
"learning_rate": 7.4775725593667545e-06,
"loss": 0.214,
"mean_token_accuracy": 0.7973703525960445,
"num_tokens": 5652480.0,
"step": 690
},
{
"epoch": 0.9980395651399038,
"grad_norm": 1.1769760847091675,
"learning_rate": 7.424802110817942e-06,
"loss": 0.1853,
"step": 700
},
{
"epoch": 0.9980395651399038,
"eval_loss": 0.42192551493644714,
"eval_mean_token_accuracy": 0.9072852402161329,
"eval_num_tokens": 5734400.0,
"eval_runtime": 41.2475,
"eval_samples_per_second": 30.232,
"eval_steps_per_second": 1.891,
"step": 700
},
{
"epoch": 1.0114061664587417,
"grad_norm": 1.4164894819259644,
"learning_rate": 7.37203166226913e-06,
"loss": 0.1644,
"mean_token_accuracy": 0.7994192252236028,
"num_tokens": 5810688.0,
"step": 710
},
{
"epoch": 1.025663874532169,
"grad_norm": 1.4920251369476318,
"learning_rate": 7.319261213720317e-06,
"loss": 0.2075,
"mean_token_accuracy": 0.771135026961565,
"num_tokens": 5892608.0,
"step": 720
},
{
"epoch": 1.039921582605596,
"grad_norm": 1.4728736877441406,
"learning_rate": 7.266490765171505e-06,
"loss": 0.2103,
"mean_token_accuracy": 0.7777886509895324,
"num_tokens": 5974528.0,
"step": 730
},
{
"epoch": 1.0541792906790233,
"grad_norm": 1.3879398107528687,
"learning_rate": 7.2137203166226925e-06,
"loss": 0.1873,
"mean_token_accuracy": 0.810420742072165,
"num_tokens": 6056448.0,
"step": 740
},
{
"epoch": 1.0684369987524505,
"grad_norm": 1.2428431510925293,
"learning_rate": 7.160949868073879e-06,
"loss": 0.179,
"mean_token_accuracy": 0.8042074333876371,
"num_tokens": 6138368.0,
"step": 750
},
{
"epoch": 1.0826947068258777,
"grad_norm": 1.2117047309875488,
"learning_rate": 7.108179419525066e-06,
"loss": 0.1715,
"mean_token_accuracy": 0.7988258298486471,
"num_tokens": 6220288.0,
"step": 760
},
{
"epoch": 1.0969524148993048,
"grad_norm": 1.0141360759735107,
"learning_rate": 7.055408970976254e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.7959515653550625,
"num_tokens": 6302208.0,
"step": 770
},
{
"epoch": 1.1112101229727323,
"grad_norm": 1.3603819608688354,
"learning_rate": 7.002638522427441e-06,
"loss": 0.1843,
"mean_token_accuracy": 0.7814090006053448,
"num_tokens": 6384128.0,
"step": 780
},
{
"epoch": 1.1254678310461594,
"grad_norm": 1.2628884315490723,
"learning_rate": 6.949868073878628e-06,
"loss": 0.1975,
"mean_token_accuracy": 0.7898238770663738,
"num_tokens": 6466048.0,
"step": 790
},
{
"epoch": 1.1397255391195866,
"grad_norm": 1.3789145946502686,
"learning_rate": 6.897097625329816e-06,
"loss": 0.188,
"step": 800
},
{
"epoch": 1.1397255391195866,
"eval_loss": 0.42303529381752014,
"eval_mean_token_accuracy": 0.9073893580681238,
"eval_num_tokens": 6547968.0,
"eval_runtime": 41.1713,
"eval_samples_per_second": 30.288,
"eval_steps_per_second": 1.895,
"step": 800
},
{
"epoch": 1.1539832471930138,
"grad_norm": 1.688471794128418,
"learning_rate": 6.844327176781003e-06,
"loss": 0.1963,
"mean_token_accuracy": 0.7926553322002292,
"num_tokens": 6629888.0,
"step": 810
},
{
"epoch": 1.168240955266441,
"grad_norm": 1.4517184495925903,
"learning_rate": 6.7915567282321904e-06,
"loss": 0.2068,
"mean_token_accuracy": 0.7929794482886792,
"num_tokens": 6711808.0,
"step": 820
},
{
"epoch": 1.1824986633398682,
"grad_norm": 1.250712275505066,
"learning_rate": 6.738786279683378e-06,
"loss": 0.2249,
"mean_token_accuracy": 0.7757338518276811,
"num_tokens": 6793728.0,
"step": 830
},
{
"epoch": 1.1967563714132954,
"grad_norm": 1.5452476739883423,
"learning_rate": 6.686015831134564e-06,
"loss": 0.1976,
"mean_token_accuracy": 0.7758683927357197,
"num_tokens": 6875648.0,
"step": 840
},
{
"epoch": 1.2110140794867226,
"grad_norm": 1.7196730375289917,
"learning_rate": 6.633245382585752e-06,
"loss": 0.1876,
"mean_token_accuracy": 0.7862769093364477,
"num_tokens": 6957568.0,
"step": 850
},
{
"epoch": 1.2252717875601498,
"grad_norm": 1.0415942668914795,
"learning_rate": 6.58047493403694e-06,
"loss": 0.1789,
"mean_token_accuracy": 0.8055528394877911,
"num_tokens": 7039488.0,
"step": 860
},
{
"epoch": 1.239529495633577,
"grad_norm": 1.075972080230713,
"learning_rate": 6.527704485488127e-06,
"loss": 0.1741,
"mean_token_accuracy": 0.788759783655405,
"num_tokens": 7121408.0,
"step": 870
},
{
"epoch": 1.2537872037070041,
"grad_norm": 1.3330209255218506,
"learning_rate": 6.4749340369393145e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.792331212386489,
"num_tokens": 7203328.0,
"step": 880
},
{
"epoch": 1.2680449117804313,
"grad_norm": 1.4352701902389526,
"learning_rate": 6.422163588390502e-06,
"loss": 0.201,
"mean_token_accuracy": 0.7732142828404903,
"num_tokens": 7285248.0,
"step": 890
},
{
"epoch": 1.2823026198538585,
"grad_norm": 1.189164638519287,
"learning_rate": 6.36939313984169e-06,
"loss": 0.1709,
"step": 900
},
{
"epoch": 1.2823026198538585,
"eval_loss": 0.4221397936344147,
"eval_mean_token_accuracy": 0.9075833811209753,
"eval_num_tokens": 7367168.0,
"eval_runtime": 41.1163,
"eval_samples_per_second": 30.329,
"eval_steps_per_second": 1.897,
"step": 900
},
{
"epoch": 1.2965603279272857,
"grad_norm": 1.0476454496383667,
"learning_rate": 6.316622691292876e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.8043847857043147,
"num_tokens": 7449088.0,
"step": 910
},
{
"epoch": 1.310818036000713,
"grad_norm": 1.4576106071472168,
"learning_rate": 6.263852242744064e-06,
"loss": 0.1771,
"mean_token_accuracy": 0.7940190762281418,
"num_tokens": 7531008.0,
"step": 920
},
{
"epoch": 1.32507574407414,
"grad_norm": 1.3801617622375488,
"learning_rate": 6.211081794195252e-06,
"loss": 0.1683,
"mean_token_accuracy": 0.8146893348544836,
"num_tokens": 7612928.0,
"step": 930
},
{
"epoch": 1.3393334521475673,
"grad_norm": 1.3410853147506714,
"learning_rate": 6.1583113456464385e-06,
"loss": 0.1822,
"mean_token_accuracy": 0.7949853200465441,
"num_tokens": 7694848.0,
"step": 940
},
{
"epoch": 1.3535911602209945,
"grad_norm": 1.3625820875167847,
"learning_rate": 6.105540897097626e-06,
"loss": 0.1851,
"mean_token_accuracy": 0.7908635035157203,
"num_tokens": 7776768.0,
"step": 950
},
{
"epoch": 1.3678488682944216,
"grad_norm": 1.2172579765319824,
"learning_rate": 6.052770448548814e-06,
"loss": 0.1819,
"mean_token_accuracy": 0.7890900187194347,
"num_tokens": 7858688.0,
"step": 960
},
{
"epoch": 1.3821065763678488,
"grad_norm": 1.2454630136489868,
"learning_rate": 6e-06,
"loss": 0.1972,
"mean_token_accuracy": 0.791903131455183,
"num_tokens": 7940608.0,
"step": 970
},
{
"epoch": 1.396364284441276,
"grad_norm": 1.0972909927368164,
"learning_rate": 5.947229551451188e-06,
"loss": 0.1745,
"mean_token_accuracy": 0.795584636926651,
"num_tokens": 8022528.0,
"step": 980
},
{
"epoch": 1.4106219925147032,
"grad_norm": 1.3013101816177368,
"learning_rate": 5.894459102902376e-06,
"loss": 0.2051,
"mean_token_accuracy": 0.7778742641210556,
"num_tokens": 8104448.0,
"step": 990
},
{
"epoch": 1.4248797005881304,
"grad_norm": 1.4143636226654053,
"learning_rate": 5.841688654353563e-06,
"loss": 0.228,
"step": 1000
},
{
"epoch": 1.4248797005881304,
"eval_loss": 0.4206378161907196,
"eval_mean_token_accuracy": 0.9077556622334015,
"eval_num_tokens": 8186368.0,
"eval_runtime": 41.158,
"eval_samples_per_second": 30.298,
"eval_steps_per_second": 1.895,
"step": 1000
},
{
"epoch": 1.4391374086615576,
"grad_norm": 1.8749654293060303,
"learning_rate": 5.7889182058047495e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.7847602725028991,
"num_tokens": 8268288.0,
"step": 1010
},
{
"epoch": 1.4533951167349848,
"grad_norm": 1.5693446397781372,
"learning_rate": 5.736147757255937e-06,
"loss": 0.187,
"mean_token_accuracy": 0.7941780813038349,
"num_tokens": 8350208.0,
"step": 1020
},
{
"epoch": 1.467652824808412,
"grad_norm": 1.1207451820373535,
"learning_rate": 5.683377308707124e-06,
"loss": 0.1705,
"mean_token_accuracy": 0.7942636970430612,
"num_tokens": 8432128.0,
"step": 1030
},
{
"epoch": 1.4819105328818392,
"grad_norm": 1.1814815998077393,
"learning_rate": 5.630606860158312e-06,
"loss": 0.1723,
"mean_token_accuracy": 0.8012964788824319,
"num_tokens": 8514048.0,
"step": 1040
},
{
"epoch": 1.4961682409552663,
"grad_norm": 1.3927719593048096,
"learning_rate": 5.5778364116095e-06,
"loss": 0.2018,
"mean_token_accuracy": 0.7847480427473783,
"num_tokens": 8595968.0,
"step": 1050
},
{
"epoch": 1.5104259490286935,
"grad_norm": 1.4986419677734375,
"learning_rate": 5.525065963060686e-06,
"loss": 0.1803,
"mean_token_accuracy": 0.7783757321536541,
"num_tokens": 8677888.0,
"step": 1060
},
{
"epoch": 1.5246836571021207,
"grad_norm": 1.8012514114379883,
"learning_rate": 5.472295514511874e-06,
"loss": 0.1863,
"mean_token_accuracy": 0.8155821930617094,
"num_tokens": 8759808.0,
"step": 1070
},
{
"epoch": 1.538941365175548,
"grad_norm": 1.2655534744262695,
"learning_rate": 5.419525065963061e-06,
"loss": 0.1967,
"mean_token_accuracy": 0.7537671256810426,
"num_tokens": 8841728.0,
"step": 1080
},
{
"epoch": 1.553199073248975,
"grad_norm": 1.3260008096694946,
"learning_rate": 5.366754617414248e-06,
"loss": 0.199,
"mean_token_accuracy": 0.7909124296158552,
"num_tokens": 8923648.0,
"step": 1090
},
{
"epoch": 1.5674567813224023,
"grad_norm": 1.490972876548767,
"learning_rate": 5.313984168865436e-06,
"loss": 0.1835,
"step": 1100
},
{
"epoch": 1.5674567813224023,
"eval_loss": 0.4212629497051239,
"eval_mean_token_accuracy": 0.9076559314360986,
"eval_num_tokens": 9005568.0,
"eval_runtime": 41.1762,
"eval_samples_per_second": 30.284,
"eval_steps_per_second": 1.894,
"step": 1100
},
{
"epoch": 1.5817144893958295,
"grad_norm": 1.5151809453964233,
"learning_rate": 5.261213720316624e-06,
"loss": 0.1791,
"mean_token_accuracy": 0.8009601265192032,
"num_tokens": 9087488.0,
"step": 1110
},
{
"epoch": 1.5959721974692567,
"grad_norm": 1.244946837425232,
"learning_rate": 5.20844327176781e-06,
"loss": 0.1692,
"mean_token_accuracy": 0.8105185899883509,
"num_tokens": 9169408.0,
"step": 1120
},
{
"epoch": 1.6102299055426839,
"grad_norm": 1.328723669052124,
"learning_rate": 5.155672823218998e-06,
"loss": 0.19,
"mean_token_accuracy": 0.7765777885913849,
"num_tokens": 9251328.0,
"step": 1130
},
{
"epoch": 1.624487613616111,
"grad_norm": 1.1754485368728638,
"learning_rate": 5.102902374670185e-06,
"loss": 0.166,
"mean_token_accuracy": 0.8148972604423761,
"num_tokens": 9333248.0,
"step": 1140
},
{
"epoch": 1.6387453216895385,
"grad_norm": 1.2302050590515137,
"learning_rate": 5.050131926121372e-06,
"loss": 0.2001,
"mean_token_accuracy": 0.7948018629103899,
"num_tokens": 9415168.0,
"step": 1150
},
{
"epoch": 1.6530030297629656,
"grad_norm": 1.1300264596939087,
"learning_rate": 4.99736147757256e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.8071673218160867,
"num_tokens": 9497088.0,
"step": 1160
},
{
"epoch": 1.6672607378363928,
"grad_norm": 1.0087612867355347,
"learning_rate": 4.944591029023747e-06,
"loss": 0.1486,
"mean_token_accuracy": 0.8172700595110655,
"num_tokens": 9579008.0,
"step": 1170
},
{
"epoch": 1.68151844590982,
"grad_norm": 1.4350190162658691,
"learning_rate": 4.891820580474935e-06,
"loss": 0.186,
"mean_token_accuracy": 0.8002446169033647,
"num_tokens": 9660928.0,
"step": 1180
},
{
"epoch": 1.6957761539832472,
"grad_norm": 1.69225013256073,
"learning_rate": 4.839050131926122e-06,
"loss": 0.1788,
"mean_token_accuracy": 0.7850782759487629,
"num_tokens": 9742848.0,
"step": 1190
},
{
"epoch": 1.7100338620566744,
"grad_norm": 1.1749773025512695,
"learning_rate": 4.786279683377309e-06,
"loss": 0.1869,
"step": 1200
},
{
"epoch": 1.7100338620566744,
"eval_loss": 0.4190373420715332,
"eval_mean_token_accuracy": 0.9080827587690109,
"eval_num_tokens": 9824768.0,
"eval_runtime": 41.1505,
"eval_samples_per_second": 30.303,
"eval_steps_per_second": 1.895,
"step": 1200
},
{
"epoch": 1.7242915701301016,
"grad_norm": 1.0468658208847046,
"learning_rate": 4.733509234828496e-06,
"loss": 0.1812,
"mean_token_accuracy": 0.8027397247031332,
"num_tokens": 9906688.0,
"step": 1210
},
{
"epoch": 1.7385492782035288,
"grad_norm": 1.4152122735977173,
"learning_rate": 4.680738786279684e-06,
"loss": 0.1816,
"mean_token_accuracy": 0.7942759301513433,
"num_tokens": 9988608.0,
"step": 1220
},
{
"epoch": 1.752806986276956,
"grad_norm": 1.4354445934295654,
"learning_rate": 4.627968337730871e-06,
"loss": 0.2029,
"mean_token_accuracy": 0.7708537172526121,
"num_tokens": 10070528.0,
"step": 1230
},
{
"epoch": 1.7670646943503832,
"grad_norm": 1.2206268310546875,
"learning_rate": 4.575197889182059e-06,
"loss": 0.2083,
"mean_token_accuracy": 0.7663405101746321,
"num_tokens": 10152448.0,
"step": 1240
},
{
"epoch": 1.7813224024238103,
"grad_norm": 1.3205043077468872,
"learning_rate": 4.522427440633246e-06,
"loss": 0.1929,
"mean_token_accuracy": 0.7970156516879797,
"num_tokens": 10234368.0,
"step": 1250
},
{
"epoch": 1.7955801104972375,
"grad_norm": 1.1927738189697266,
"learning_rate": 4.469656992084433e-06,
"loss": 0.1527,
"mean_token_accuracy": 0.8137230888009072,
"num_tokens": 10316288.0,
"step": 1260
},
{
"epoch": 1.8098378185706647,
"grad_norm": 1.4011446237564087,
"learning_rate": 4.4168865435356204e-06,
"loss": 0.1938,
"mean_token_accuracy": 0.7995963796973229,
"num_tokens": 10398208.0,
"step": 1270
},
{
"epoch": 1.824095526644092,
"grad_norm": 1.3237054347991943,
"learning_rate": 4.364116094986807e-06,
"loss": 0.166,
"mean_token_accuracy": 0.794483856856823,
"num_tokens": 10480128.0,
"step": 1280
},
{
"epoch": 1.8383532347175193,
"grad_norm": 1.1077933311462402,
"learning_rate": 4.311345646437995e-06,
"loss": 0.1773,
"mean_token_accuracy": 0.7855797458440066,
"num_tokens": 10562048.0,
"step": 1290
},
{
"epoch": 1.8526109427909465,
"grad_norm": 1.12204909324646,
"learning_rate": 4.258575197889183e-06,
"loss": 0.1679,
"step": 1300
},
{
"epoch": 1.8526109427909465,
"eval_loss": 0.41882508993148804,
"eval_mean_token_accuracy": 0.9083497478411748,
"eval_num_tokens": 10643968.0,
"eval_runtime": 41.3617,
"eval_samples_per_second": 30.149,
"eval_steps_per_second": 1.886,
"step": 1300
},
{
"epoch": 1.8668686508643737,
"grad_norm": 1.453183650970459,
"learning_rate": 4.20580474934037e-06,
"loss": 0.1681,
"mean_token_accuracy": 0.8072040120139718,
"num_tokens": 10725888.0,
"step": 1310
},
{
"epoch": 1.881126358937801,
"grad_norm": 1.229581356048584,
"learning_rate": 4.153034300791557e-06,
"loss": 0.2113,
"mean_token_accuracy": 0.7945083156228065,
"num_tokens": 10807808.0,
"step": 1320
},
{
"epoch": 1.895384067011228,
"grad_norm": 1.193543553352356,
"learning_rate": 4.1002638522427445e-06,
"loss": 0.1821,
"mean_token_accuracy": 0.7945450108498335,
"num_tokens": 10889728.0,
"step": 1330
},
{
"epoch": 1.9096417750846553,
"grad_norm": 1.3757144212722778,
"learning_rate": 4.047493403693931e-06,
"loss": 0.198,
"mean_token_accuracy": 0.7713062632828951,
"num_tokens": 10971648.0,
"step": 1340
},
{
"epoch": 1.9238994831580825,
"grad_norm": 0.997105062007904,
"learning_rate": 3.994722955145119e-06,
"loss": 0.1507,
"mean_token_accuracy": 0.8220768094062805,
"num_tokens": 11053568.0,
"step": 1350
},
{
"epoch": 1.9381571912315096,
"grad_norm": 1.5941100120544434,
"learning_rate": 3.941952506596307e-06,
"loss": 0.1748,
"mean_token_accuracy": 0.7959882594645024,
"num_tokens": 11135488.0,
"step": 1360
},
{
"epoch": 1.9524148993049368,
"grad_norm": 1.271546721458435,
"learning_rate": 3.889182058047494e-06,
"loss": 0.1581,
"mean_token_accuracy": 0.8203400176018476,
"num_tokens": 11217408.0,
"step": 1370
},
{
"epoch": 1.966672607378364,
"grad_norm": 1.631945252418518,
"learning_rate": 3.836411609498681e-06,
"loss": 0.2008,
"mean_token_accuracy": 0.7798923674970866,
"num_tokens": 11299328.0,
"step": 1380
},
{
"epoch": 1.9809303154517912,
"grad_norm": 1.08231520652771,
"learning_rate": 3.7836411609498686e-06,
"loss": 0.1557,
"mean_token_accuracy": 0.7915484357625246,
"num_tokens": 11381248.0,
"step": 1390
},
{
"epoch": 1.9951880235252184,
"grad_norm": 1.283751130104065,
"learning_rate": 3.730870712401056e-06,
"loss": 0.1926,
"step": 1400
},
{
"epoch": 1.9951880235252184,
"eval_loss": 0.417749285697937,
"eval_mean_token_accuracy": 0.908511886994044,
"eval_num_tokens": 11463168.0,
"eval_runtime": 41.1617,
"eval_samples_per_second": 30.295,
"eval_steps_per_second": 1.895,
"step": 1400
},
{
"epoch": 2.0085546248440562,
"grad_norm": 1.2342406511306763,
"learning_rate": 3.678100263852243e-06,
"loss": 0.1856,
"mean_token_accuracy": 0.7944889839618436,
"num_tokens": 11539456.0,
"step": 1410
},
{
"epoch": 2.0228123329174834,
"grad_norm": 1.6273552179336548,
"learning_rate": 3.6253298153034306e-06,
"loss": 0.1697,
"mean_token_accuracy": 0.7902274955064058,
"num_tokens": 11621376.0,
"step": 1420
},
{
"epoch": 2.0370700409909106,
"grad_norm": 0.9989307522773743,
"learning_rate": 3.5725593667546175e-06,
"loss": 0.195,
"mean_token_accuracy": 0.7881115455180406,
"num_tokens": 11703296.0,
"step": 1430
},
{
"epoch": 2.051327749064338,
"grad_norm": 1.885137915611267,
"learning_rate": 3.519788918205805e-06,
"loss": 0.1554,
"mean_token_accuracy": 0.8048434421420098,
"num_tokens": 11785216.0,
"step": 1440
},
{
"epoch": 2.065585457137765,
"grad_norm": 1.6031672954559326,
"learning_rate": 3.4670184696569926e-06,
"loss": 0.1584,
"mean_token_accuracy": 0.794055774062872,
"num_tokens": 11867136.0,
"step": 1450
},
{
"epoch": 2.079843165211192,
"grad_norm": 1.0749027729034424,
"learning_rate": 3.4142480211081795e-06,
"loss": 0.1678,
"mean_token_accuracy": 0.8086961843073368,
"num_tokens": 11949056.0,
"step": 1460
},
{
"epoch": 2.0941008732846194,
"grad_norm": 1.521986484527588,
"learning_rate": 3.361477572559367e-06,
"loss": 0.1919,
"mean_token_accuracy": 0.7812744613736868,
"num_tokens": 12030976.0,
"step": 1470
},
{
"epoch": 2.1083585813580465,
"grad_norm": 1.4633926153182983,
"learning_rate": 3.3087071240105546e-06,
"loss": 0.148,
"mean_token_accuracy": 0.8082069471478462,
"num_tokens": 12112896.0,
"step": 1480
},
{
"epoch": 2.1226162894314737,
"grad_norm": 1.4096753597259521,
"learning_rate": 3.2559366754617416e-06,
"loss": 0.2094,
"mean_token_accuracy": 0.7721991192549467,
"num_tokens": 12194816.0,
"step": 1490
},
{
"epoch": 2.136873997504901,
"grad_norm": 1.5162105560302734,
"learning_rate": 3.203166226912929e-06,
"loss": 0.1665,
"step": 1500
},
{
"epoch": 2.136873997504901,
"eval_loss": 0.4229465126991272,
"eval_mean_token_accuracy": 0.9079398543406756,
"eval_num_tokens": 12276736.0,
"eval_runtime": 41.2908,
"eval_samples_per_second": 30.2,
"eval_steps_per_second": 1.889,
"step": 1500
},
{
"epoch": 2.151131705578328,
"grad_norm": 1.1753822565078735,
"learning_rate": 3.1503957783641167e-06,
"loss": 0.1709,
"mean_token_accuracy": 0.7948324346914888,
"num_tokens": 12358656.0,
"step": 1510
},
{
"epoch": 2.1653894136517553,
"grad_norm": 1.3583290576934814,
"learning_rate": 3.0976253298153036e-06,
"loss": 0.1516,
"mean_token_accuracy": 0.7987769071012736,
"num_tokens": 12440576.0,
"step": 1520
},
{
"epoch": 2.1796471217251825,
"grad_norm": 1.6773642301559448,
"learning_rate": 3.044854881266491e-06,
"loss": 0.1582,
"mean_token_accuracy": 0.8161937419325114,
"num_tokens": 12522496.0,
"step": 1530
},
{
"epoch": 2.1939048297986097,
"grad_norm": 1.700421929359436,
"learning_rate": 2.9920844327176783e-06,
"loss": 0.1651,
"mean_token_accuracy": 0.7837084148079156,
"num_tokens": 12604416.0,
"step": 1540
},
{
"epoch": 2.208162537872037,
"grad_norm": 1.278611183166504,
"learning_rate": 2.9393139841688656e-06,
"loss": 0.1459,
"mean_token_accuracy": 0.8016634039580822,
"num_tokens": 12686336.0,
"step": 1550
},
{
"epoch": 2.2224202459454645,
"grad_norm": 1.3623602390289307,
"learning_rate": 2.8865435356200525e-06,
"loss": 0.1754,
"mean_token_accuracy": 0.7952054802328348,
"num_tokens": 12768256.0,
"step": 1560
},
{
"epoch": 2.2366779540188917,
"grad_norm": 1.1797006130218506,
"learning_rate": 2.8337730870712403e-06,
"loss": 0.1854,
"mean_token_accuracy": 0.7857632093131542,
"num_tokens": 12850176.0,
"step": 1570
},
{
"epoch": 2.250935662092319,
"grad_norm": 1.2017779350280762,
"learning_rate": 2.7810026385224277e-06,
"loss": 0.1482,
"mean_token_accuracy": 0.8103106629103423,
"num_tokens": 12932096.0,
"step": 1580
},
{
"epoch": 2.265193370165746,
"grad_norm": 1.1322146654129028,
"learning_rate": 2.7282321899736154e-06,
"loss": 0.1539,
"mean_token_accuracy": 0.8084882564842701,
"num_tokens": 13014016.0,
"step": 1590
},
{
"epoch": 2.2794510782391733,
"grad_norm": 1.2803654670715332,
"learning_rate": 2.6754617414248023e-06,
"loss": 0.1495,
"step": 1600
},
{
"epoch": 2.2794510782391733,
"eval_loss": 0.423663467168808,
"eval_mean_token_accuracy": 0.9079369283639468,
"eval_num_tokens": 13095936.0,
"eval_runtime": 41.2866,
"eval_samples_per_second": 30.203,
"eval_steps_per_second": 1.889,
"step": 1600
},
{
"epoch": 2.2937087863126004,
"grad_norm": 1.110379934310913,
"learning_rate": 2.6226912928759897e-06,
"loss": 0.157,
"mean_token_accuracy": 0.8001467704772949,
"num_tokens": 13177856.0,
"step": 1610
},
{
"epoch": 2.3079664943860276,
"grad_norm": 1.2236034870147705,
"learning_rate": 2.5699208443271775e-06,
"loss": 0.1566,
"mean_token_accuracy": 0.807118396833539,
"num_tokens": 13259776.0,
"step": 1620
},
{
"epoch": 2.322224202459455,
"grad_norm": 1.439042329788208,
"learning_rate": 2.5171503957783644e-06,
"loss": 0.1979,
"mean_token_accuracy": 0.7804794508963824,
"num_tokens": 13341696.0,
"step": 1630
},
{
"epoch": 2.336481910532882,
"grad_norm": 1.3598966598510742,
"learning_rate": 2.4643799472295517e-06,
"loss": 0.1514,
"mean_token_accuracy": 0.8212695695459843,
"num_tokens": 13423616.0,
"step": 1640
},
{
"epoch": 2.350739618606309,
"grad_norm": 1.401573896408081,
"learning_rate": 2.411609498680739e-06,
"loss": 0.1588,
"mean_token_accuracy": 0.8089774928987026,
"num_tokens": 13505536.0,
"step": 1650
},
{
"epoch": 2.3649973266797364,
"grad_norm": 1.6068435907363892,
"learning_rate": 2.3588390501319264e-06,
"loss": 0.1647,
"mean_token_accuracy": 0.8134907066822052,
"num_tokens": 13587456.0,
"step": 1660
},
{
"epoch": 2.3792550347531636,
"grad_norm": 1.2568259239196777,
"learning_rate": 2.3060686015831133e-06,
"loss": 0.1664,
"mean_token_accuracy": 0.7954256378114224,
"num_tokens": 13669376.0,
"step": 1670
},
{
"epoch": 2.3935127428265908,
"grad_norm": 1.6980928182601929,
"learning_rate": 2.253298153034301e-06,
"loss": 0.1707,
"mean_token_accuracy": 0.7994985327124595,
"num_tokens": 13751296.0,
"step": 1680
},
{
"epoch": 2.407770450900018,
"grad_norm": 1.6247879266738892,
"learning_rate": 2.2005277044854884e-06,
"loss": 0.1579,
"mean_token_accuracy": 0.7971624247729778,
"num_tokens": 13833216.0,
"step": 1690
},
{
"epoch": 2.422028158973445,
"grad_norm": 1.6872649192810059,
"learning_rate": 2.1477572559366753e-06,
"loss": 0.1703,
"step": 1700
},
{
"epoch": 2.422028158973445,
"eval_loss": 0.4227621853351593,
"eval_mean_token_accuracy": 0.9080967650963709,
"eval_num_tokens": 13915136.0,
"eval_runtime": 41.2123,
"eval_samples_per_second": 30.258,
"eval_steps_per_second": 1.893,
"step": 1700
},
{
"epoch": 2.4362858670468723,
"grad_norm": 1.6167148351669312,
"learning_rate": 2.094986807387863e-06,
"loss": 0.1801,
"mean_token_accuracy": 0.7794765178114176,
"num_tokens": 13997056.0,
"step": 1710
},
{
"epoch": 2.4505435751202995,
"grad_norm": 1.2795140743255615,
"learning_rate": 2.0422163588390505e-06,
"loss": 0.1466,
"mean_token_accuracy": 0.8015288673341274,
"num_tokens": 14078976.0,
"step": 1720
},
{
"epoch": 2.4648012831937267,
"grad_norm": 1.2836272716522217,
"learning_rate": 1.989445910290238e-06,
"loss": 0.1587,
"mean_token_accuracy": 0.7941046960651874,
"num_tokens": 14160896.0,
"step": 1730
},
{
"epoch": 2.479058991267154,
"grad_norm": 1.1510287523269653,
"learning_rate": 1.9366754617414247e-06,
"loss": 0.1807,
"mean_token_accuracy": 0.7942025430500508,
"num_tokens": 14242816.0,
"step": 1740
},
{
"epoch": 2.493316699340581,
"grad_norm": 1.2959060668945312,
"learning_rate": 1.8839050131926123e-06,
"loss": 0.187,
"mean_token_accuracy": 0.7789016582071782,
"num_tokens": 14324736.0,
"step": 1750
},
{
"epoch": 2.5075744074140083,
"grad_norm": 1.0948452949523926,
"learning_rate": 1.8311345646437998e-06,
"loss": 0.1995,
"mean_token_accuracy": 0.761827296577394,
"num_tokens": 14406656.0,
"step": 1760
},
{
"epoch": 2.5218321154874355,
"grad_norm": 1.3183213472366333,
"learning_rate": 1.778364116094987e-06,
"loss": 0.1709,
"mean_token_accuracy": 0.7887353241443634,
"num_tokens": 14488576.0,
"step": 1770
},
{
"epoch": 2.5360898235608627,
"grad_norm": 1.2092057466506958,
"learning_rate": 1.7255936675461743e-06,
"loss": 0.1325,
"mean_token_accuracy": 0.8213796474039554,
"num_tokens": 14570496.0,
"step": 1780
},
{
"epoch": 2.55034753163429,
"grad_norm": 1.418562889099121,
"learning_rate": 1.6728232189973616e-06,
"loss": 0.1827,
"mean_token_accuracy": 0.7853595890104771,
"num_tokens": 14652416.0,
"step": 1790
},
{
"epoch": 2.564605239707717,
"grad_norm": 1.0960406064987183,
"learning_rate": 1.6200527704485488e-06,
"loss": 0.1758,
"step": 1800
},
{
"epoch": 2.564605239707717,
"eval_loss": 0.4227621257305145,
"eval_mean_token_accuracy": 0.9082627732020158,
"eval_num_tokens": 14734336.0,
"eval_runtime": 41.1309,
"eval_samples_per_second": 30.318,
"eval_steps_per_second": 1.896,
"step": 1800
},
{
"epoch": 2.578862947781144,
"grad_norm": 1.5267870426177979,
"learning_rate": 1.5672823218997363e-06,
"loss": 0.1732,
"mean_token_accuracy": 0.7900256833992898,
"num_tokens": 14816256.0,
"step": 1810
},
{
"epoch": 2.5931206558545714,
"grad_norm": 2.303779125213623,
"learning_rate": 1.5145118733509237e-06,
"loss": 0.1717,
"mean_token_accuracy": 0.8003057725727558,
"num_tokens": 14898176.0,
"step": 1820
},
{
"epoch": 2.6073783639279986,
"grad_norm": 1.3814704418182373,
"learning_rate": 1.4617414248021108e-06,
"loss": 0.1691,
"mean_token_accuracy": 0.8011741682887077,
"num_tokens": 14980096.0,
"step": 1830
},
{
"epoch": 2.621636072001426,
"grad_norm": 1.4888346195220947,
"learning_rate": 1.4089709762532984e-06,
"loss": 0.1665,
"mean_token_accuracy": 0.7911203544586897,
"num_tokens": 15062016.0,
"step": 1840
},
{
"epoch": 2.635893780074853,
"grad_norm": 1.7252527475357056,
"learning_rate": 1.3562005277044857e-06,
"loss": 0.1462,
"mean_token_accuracy": 0.8204623281955719,
"num_tokens": 15143936.0,
"step": 1850
},
{
"epoch": 2.65015148814828,
"grad_norm": 1.3731549978256226,
"learning_rate": 1.3034300791556728e-06,
"loss": 0.1469,
"mean_token_accuracy": 0.8153620343655348,
"num_tokens": 15225856.0,
"step": 1860
},
{
"epoch": 2.6644091962217074,
"grad_norm": 1.1390541791915894,
"learning_rate": 1.2506596306068602e-06,
"loss": 0.1511,
"mean_token_accuracy": 0.7933586105704308,
"num_tokens": 15307776.0,
"step": 1870
},
{
"epoch": 2.6786669042951345,
"grad_norm": 1.3843096494674683,
"learning_rate": 1.1978891820580475e-06,
"loss": 0.1743,
"mean_token_accuracy": 0.7874510768800974,
"num_tokens": 15389696.0,
"step": 1880
},
{
"epoch": 2.6929246123685617,
"grad_norm": 1.4261775016784668,
"learning_rate": 1.1451187335092349e-06,
"loss": 0.1775,
"mean_token_accuracy": 0.7992783728986979,
"num_tokens": 15471616.0,
"step": 1890
},
{
"epoch": 2.707182320441989,
"grad_norm": 1.4358237981796265,
"learning_rate": 1.0923482849604222e-06,
"loss": 0.1488,
"step": 1900
},
{
"epoch": 2.707182320441989,
"eval_loss": 0.4216897487640381,
"eval_mean_token_accuracy": 0.9083614570972247,
"eval_num_tokens": 15553536.0,
"eval_runtime": 41.1549,
"eval_samples_per_second": 30.3,
"eval_steps_per_second": 1.895,
"step": 1900
},
{
"epoch": 2.721440028515416,
"grad_norm": 1.4193668365478516,
"learning_rate": 1.0395778364116096e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.8027458423748612,
"num_tokens": 15635456.0,
"step": 1910
},
{
"epoch": 2.7356977365888433,
"grad_norm": 1.3984283208847046,
"learning_rate": 9.86807387862797e-07,
"loss": 0.1751,
"mean_token_accuracy": 0.7997309185564518,
"num_tokens": 15717376.0,
"step": 1920
},
{
"epoch": 2.7499554446622705,
"grad_norm": 1.2041066884994507,
"learning_rate": 9.340369393139842e-07,
"loss": 0.2063,
"mean_token_accuracy": 0.770768103376031,
"num_tokens": 15799296.0,
"step": 1930
},
{
"epoch": 2.7642131527356977,
"grad_norm": 1.4668165445327759,
"learning_rate": 8.812664907651716e-07,
"loss": 0.1496,
"mean_token_accuracy": 0.7937133066356182,
"num_tokens": 15881216.0,
"step": 1940
},
{
"epoch": 2.778470860809125,
"grad_norm": 1.1798230409622192,
"learning_rate": 8.284960422163589e-07,
"loss": 0.1696,
"mean_token_accuracy": 0.7978228941559792,
"num_tokens": 15963136.0,
"step": 1950
},
{
"epoch": 2.792728568882552,
"grad_norm": 1.4253802299499512,
"learning_rate": 7.757255936675462e-07,
"loss": 0.1602,
"mean_token_accuracy": 0.8014432441443204,
"num_tokens": 16045056.0,
"step": 1960
},
{
"epoch": 2.8069862769559792,
"grad_norm": 1.3596400022506714,
"learning_rate": 7.229551451187336e-07,
"loss": 0.1672,
"mean_token_accuracy": 0.808916338160634,
"num_tokens": 16126976.0,
"step": 1970
},
{
"epoch": 2.8212439850294064,
"grad_norm": 1.4225387573242188,
"learning_rate": 6.701846965699208e-07,
"loss": 0.1767,
"mean_token_accuracy": 0.7800391383469105,
"num_tokens": 16208896.0,
"step": 1980
},
{
"epoch": 2.8355016931028336,
"grad_norm": 1.8448420763015747,
"learning_rate": 6.174142480211082e-07,
"loss": 0.1846,
"mean_token_accuracy": 0.792747063189745,
"num_tokens": 16290816.0,
"step": 1990
},
{
"epoch": 2.849759401176261,
"grad_norm": 1.4115536212921143,
"learning_rate": 5.646437994722955e-07,
"loss": 0.1398,
"step": 2000
},
{
"epoch": 2.849759401176261,
"eval_loss": 0.42159923911094666,
"eval_mean_token_accuracy": 0.9084225067725549,
"eval_num_tokens": 16372736.0,
"eval_runtime": 41.1801,
"eval_samples_per_second": 30.282,
"eval_steps_per_second": 1.894,
"step": 2000
}
],
"logging_steps": 10,
"max_steps": 2106,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.32699442420777e+16,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}