{
"best_global_step": 72000,
"best_metric": 3.5336711406707764,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_last_to_drop_5039/checkpoint-40000",
"epoch": 26.814387314911976,
"eval_steps": 1000,
"global_step": 92000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014573860324122653,
"grad_norm": 0.6621675491333008,
"learning_rate": 0.000294,
"loss": 8.4935,
"step": 50
},
{
"epoch": 0.029147720648245307,
"grad_norm": 0.49097710847854614,
"learning_rate": 0.0005939999999999999,
"loss": 6.7519,
"step": 100
},
{
"epoch": 0.04372158097236796,
"grad_norm": 0.44661810994148254,
"learning_rate": 0.0005998285214348206,
"loss": 6.373,
"step": 150
},
{
"epoch": 0.05829544129649061,
"grad_norm": 0.4845108091831207,
"learning_rate": 0.0005996535433070866,
"loss": 6.1463,
"step": 200
},
{
"epoch": 0.07286930162061327,
"grad_norm": 0.7079914212226868,
"learning_rate": 0.0005994785651793525,
"loss": 6.0021,
"step": 250
},
{
"epoch": 0.08744316194473592,
"grad_norm": 0.5005303621292114,
"learning_rate": 0.0005993035870516185,
"loss": 5.8895,
"step": 300
},
{
"epoch": 0.10201702226885857,
"grad_norm": 0.47492167353630066,
"learning_rate": 0.0005991286089238845,
"loss": 5.7663,
"step": 350
},
{
"epoch": 0.11659088259298123,
"grad_norm": 0.4829168915748596,
"learning_rate": 0.0005989536307961504,
"loss": 5.6455,
"step": 400
},
{
"epoch": 0.1311647429171039,
"grad_norm": 0.5034458041191101,
"learning_rate": 0.0005987786526684164,
"loss": 5.5263,
"step": 450
},
{
"epoch": 0.14573860324122653,
"grad_norm": 0.4625563621520996,
"learning_rate": 0.0005986036745406824,
"loss": 5.4249,
"step": 500
},
{
"epoch": 0.1603124635653492,
"grad_norm": 0.4806194603443146,
"learning_rate": 0.0005984286964129484,
"loss": 5.3596,
"step": 550
},
{
"epoch": 0.17488632388947184,
"grad_norm": 0.4784989058971405,
"learning_rate": 0.0005982537182852143,
"loss": 5.2812,
"step": 600
},
{
"epoch": 0.1894601842135945,
"grad_norm": 0.4541167616844177,
"learning_rate": 0.0005980787401574803,
"loss": 5.2211,
"step": 650
},
{
"epoch": 0.20403404453771715,
"grad_norm": 0.49250224232673645,
"learning_rate": 0.0005979037620297463,
"loss": 5.1392,
"step": 700
},
{
"epoch": 0.2186079048618398,
"grad_norm": 0.439749538898468,
"learning_rate": 0.0005977287839020123,
"loss": 5.0797,
"step": 750
},
{
"epoch": 0.23318176518596245,
"grad_norm": 0.5406767725944519,
"learning_rate": 0.0005975538057742782,
"loss": 5.0266,
"step": 800
},
{
"epoch": 0.24775562551008512,
"grad_norm": 0.44189009070396423,
"learning_rate": 0.0005973788276465442,
"loss": 4.9857,
"step": 850
},
{
"epoch": 0.2623294858342078,
"grad_norm": 0.3949568271636963,
"learning_rate": 0.0005972038495188102,
"loss": 4.9285,
"step": 900
},
{
"epoch": 0.2769033461583304,
"grad_norm": 0.43237313628196716,
"learning_rate": 0.000597028871391076,
"loss": 4.8772,
"step": 950
},
{
"epoch": 0.29147720648245307,
"grad_norm": 0.5117595195770264,
"learning_rate": 0.000596853893263342,
"loss": 4.8306,
"step": 1000
},
{
"epoch": 0.29147720648245307,
"eval_accuracy": 0.25472762160242746,
"eval_loss": 4.7552924156188965,
"eval_runtime": 53.5229,
"eval_samples_per_second": 310.652,
"eval_steps_per_second": 19.431,
"step": 1000
},
{
"epoch": 0.30605106680657573,
"grad_norm": 0.43191710114479065,
"learning_rate": 0.000596678915135608,
"loss": 4.7805,
"step": 1050
},
{
"epoch": 0.3206249271306984,
"grad_norm": 0.427901953458786,
"learning_rate": 0.0005965039370078739,
"loss": 4.7407,
"step": 1100
},
{
"epoch": 0.335198787454821,
"grad_norm": 0.6437082290649414,
"learning_rate": 0.0005963289588801399,
"loss": 4.7037,
"step": 1150
},
{
"epoch": 0.3497726477789437,
"grad_norm": 0.43874093890190125,
"learning_rate": 0.0005961539807524059,
"loss": 4.6684,
"step": 1200
},
{
"epoch": 0.36434650810306635,
"grad_norm": 0.4586627185344696,
"learning_rate": 0.0005959790026246719,
"loss": 4.6442,
"step": 1250
},
{
"epoch": 0.378920368427189,
"grad_norm": 0.4979628324508667,
"learning_rate": 0.0005958040244969378,
"loss": 4.6031,
"step": 1300
},
{
"epoch": 0.3934942287513116,
"grad_norm": 0.4893154203891754,
"learning_rate": 0.0005956290463692038,
"loss": 4.5684,
"step": 1350
},
{
"epoch": 0.4080680890754343,
"grad_norm": 0.4149429202079773,
"learning_rate": 0.0005954540682414698,
"loss": 4.556,
"step": 1400
},
{
"epoch": 0.42264194939955696,
"grad_norm": 0.43906036019325256,
"learning_rate": 0.0005952790901137357,
"loss": 4.5275,
"step": 1450
},
{
"epoch": 0.4372158097236796,
"grad_norm": 0.47645220160484314,
"learning_rate": 0.0005951041119860017,
"loss": 4.5114,
"step": 1500
},
{
"epoch": 0.45178967004780224,
"grad_norm": 0.4202033281326294,
"learning_rate": 0.0005949291338582677,
"loss": 4.5,
"step": 1550
},
{
"epoch": 0.4663635303719249,
"grad_norm": 0.4314590096473694,
"learning_rate": 0.0005947541557305336,
"loss": 4.4597,
"step": 1600
},
{
"epoch": 0.4809373906960476,
"grad_norm": 0.4086189568042755,
"learning_rate": 0.0005945791776027996,
"loss": 4.4403,
"step": 1650
},
{
"epoch": 0.49551125102017024,
"grad_norm": 0.3905881643295288,
"learning_rate": 0.0005944041994750656,
"loss": 4.4181,
"step": 1700
},
{
"epoch": 0.5100851113442929,
"grad_norm": 0.40394750237464905,
"learning_rate": 0.0005942292213473315,
"loss": 4.4135,
"step": 1750
},
{
"epoch": 0.5246589716684156,
"grad_norm": 0.4248270094394684,
"learning_rate": 0.0005940542432195975,
"loss": 4.3972,
"step": 1800
},
{
"epoch": 0.5392328319925381,
"grad_norm": 0.40690281987190247,
"learning_rate": 0.0005938792650918635,
"loss": 4.3909,
"step": 1850
},
{
"epoch": 0.5538066923166608,
"grad_norm": 0.3957778513431549,
"learning_rate": 0.0005937042869641295,
"loss": 4.3735,
"step": 1900
},
{
"epoch": 0.5683805526407835,
"grad_norm": 0.4087545871734619,
"learning_rate": 0.0005935293088363953,
"loss": 4.3596,
"step": 1950
},
{
"epoch": 0.5829544129649061,
"grad_norm": 0.3965914845466614,
"learning_rate": 0.0005933543307086613,
"loss": 4.3462,
"step": 2000
},
{
"epoch": 0.5829544129649061,
"eval_accuracy": 0.2984550980845175,
"eval_loss": 4.287600517272949,
"eval_runtime": 53.0696,
"eval_samples_per_second": 313.306,
"eval_steps_per_second": 19.597,
"step": 2000
},
{
"epoch": 0.5975282732890288,
"grad_norm": 0.3693578541278839,
"learning_rate": 0.0005931793525809273,
"loss": 4.339,
"step": 2050
},
{
"epoch": 0.6121021336131515,
"grad_norm": 0.4273644983768463,
"learning_rate": 0.0005930043744531933,
"loss": 4.336,
"step": 2100
},
{
"epoch": 0.6266759939372741,
"grad_norm": 0.40753233432769775,
"learning_rate": 0.0005928293963254592,
"loss": 4.3252,
"step": 2150
},
{
"epoch": 0.6412498542613968,
"grad_norm": 0.3944893181324005,
"learning_rate": 0.0005926544181977252,
"loss": 4.3,
"step": 2200
},
{
"epoch": 0.6558237145855194,
"grad_norm": 0.40525901317596436,
"learning_rate": 0.0005924794400699912,
"loss": 4.2763,
"step": 2250
},
{
"epoch": 0.670397574909642,
"grad_norm": 0.3596903085708618,
"learning_rate": 0.0005923044619422571,
"loss": 4.2768,
"step": 2300
},
{
"epoch": 0.6849714352337647,
"grad_norm": 0.4436907172203064,
"learning_rate": 0.0005921294838145231,
"loss": 4.2644,
"step": 2350
},
{
"epoch": 0.6995452955578874,
"grad_norm": 0.3655913472175598,
"learning_rate": 0.0005919545056867891,
"loss": 4.2584,
"step": 2400
},
{
"epoch": 0.71411915588201,
"grad_norm": 0.3963913321495056,
"learning_rate": 0.0005917795275590551,
"loss": 4.2567,
"step": 2450
},
{
"epoch": 0.7286930162061327,
"grad_norm": 0.3979296386241913,
"learning_rate": 0.000591604549431321,
"loss": 4.2257,
"step": 2500
},
{
"epoch": 0.7432668765302554,
"grad_norm": 0.4119713306427002,
"learning_rate": 0.000591429571303587,
"loss": 4.2345,
"step": 2550
},
{
"epoch": 0.757840736854378,
"grad_norm": 0.41694387793540955,
"learning_rate": 0.000591254593175853,
"loss": 4.2196,
"step": 2600
},
{
"epoch": 0.7724145971785007,
"grad_norm": 0.35006240010261536,
"learning_rate": 0.000591079615048119,
"loss": 4.2018,
"step": 2650
},
{
"epoch": 0.7869884575026233,
"grad_norm": 0.365438312292099,
"learning_rate": 0.0005909046369203849,
"loss": 4.2114,
"step": 2700
},
{
"epoch": 0.8015623178267459,
"grad_norm": 0.3674423396587372,
"learning_rate": 0.0005907296587926509,
"loss": 4.2021,
"step": 2750
},
{
"epoch": 0.8161361781508686,
"grad_norm": 0.3676803410053253,
"learning_rate": 0.0005905546806649169,
"loss": 4.2025,
"step": 2800
},
{
"epoch": 0.8307100384749913,
"grad_norm": 0.429167240858078,
"learning_rate": 0.0005903797025371829,
"loss": 4.1849,
"step": 2850
},
{
"epoch": 0.8452838987991139,
"grad_norm": 0.3721235990524292,
"learning_rate": 0.0005902047244094488,
"loss": 4.1595,
"step": 2900
},
{
"epoch": 0.8598577591232366,
"grad_norm": 0.3640214204788208,
"learning_rate": 0.0005900297462817148,
"loss": 4.1594,
"step": 2950
},
{
"epoch": 0.8744316194473593,
"grad_norm": 0.35601043701171875,
"learning_rate": 0.0005898547681539808,
"loss": 4.1483,
"step": 3000
},
{
"epoch": 0.8744316194473593,
"eval_accuracy": 0.3144607061087188,
"eval_loss": 4.105985164642334,
"eval_runtime": 53.2638,
"eval_samples_per_second": 312.163,
"eval_steps_per_second": 19.525,
"step": 3000
},
{
"epoch": 0.8890054797714819,
"grad_norm": 0.35206031799316406,
"learning_rate": 0.0005896797900262466,
"loss": 4.1486,
"step": 3050
},
{
"epoch": 0.9035793400956045,
"grad_norm": 0.36506882309913635,
"learning_rate": 0.0005895048118985126,
"loss": 4.1418,
"step": 3100
},
{
"epoch": 0.9181532004197271,
"grad_norm": 0.3510189950466156,
"learning_rate": 0.0005893298337707786,
"loss": 4.147,
"step": 3150
},
{
"epoch": 0.9327270607438498,
"grad_norm": 0.3484358787536621,
"learning_rate": 0.0005891548556430446,
"loss": 4.1269,
"step": 3200
},
{
"epoch": 0.9473009210679725,
"grad_norm": 0.3678065836429596,
"learning_rate": 0.0005889798775153105,
"loss": 4.1302,
"step": 3250
},
{
"epoch": 0.9618747813920951,
"grad_norm": 0.3914882242679596,
"learning_rate": 0.0005888048993875765,
"loss": 4.1129,
"step": 3300
},
{
"epoch": 0.9764486417162178,
"grad_norm": 0.3623497188091278,
"learning_rate": 0.0005886299212598425,
"loss": 4.1136,
"step": 3350
},
{
"epoch": 0.9910225020403405,
"grad_norm": 0.3361571431159973,
"learning_rate": 0.0005884549431321084,
"loss": 4.1018,
"step": 3400
},
{
"epoch": 1.0055380669231666,
"grad_norm": 0.3415493071079254,
"learning_rate": 0.0005882799650043744,
"loss": 4.0828,
"step": 3450
},
{
"epoch": 1.0201119272472892,
"grad_norm": 0.34337010979652405,
"learning_rate": 0.0005881049868766404,
"loss": 4.0262,
"step": 3500
},
{
"epoch": 1.034685787571412,
"grad_norm": 0.3571913242340088,
"learning_rate": 0.0005879300087489063,
"loss": 4.0256,
"step": 3550
},
{
"epoch": 1.0492596478955345,
"grad_norm": 0.36647820472717285,
"learning_rate": 0.0005877550306211723,
"loss": 4.0044,
"step": 3600
},
{
"epoch": 1.0638335082196573,
"grad_norm": 0.359706312417984,
"learning_rate": 0.0005875800524934383,
"loss": 4.0143,
"step": 3650
},
{
"epoch": 1.0784073685437798,
"grad_norm": 0.35660111904144287,
"learning_rate": 0.0005874050743657042,
"loss": 4.0265,
"step": 3700
},
{
"epoch": 1.0929812288679026,
"grad_norm": 0.3381880521774292,
"learning_rate": 0.0005872300962379702,
"loss": 4.0087,
"step": 3750
},
{
"epoch": 1.1075550891920252,
"grad_norm": 0.3747643530368805,
"learning_rate": 0.0005870551181102362,
"loss": 4.01,
"step": 3800
},
{
"epoch": 1.122128949516148,
"grad_norm": 0.3946177661418915,
"learning_rate": 0.0005868801399825022,
"loss": 3.9809,
"step": 3850
},
{
"epoch": 1.1367028098402705,
"grad_norm": 0.3495027720928192,
"learning_rate": 0.0005867051618547681,
"loss": 3.9893,
"step": 3900
},
{
"epoch": 1.151276670164393,
"grad_norm": 0.356650173664093,
"learning_rate": 0.0005865301837270341,
"loss": 4.0052,
"step": 3950
},
{
"epoch": 1.1658505304885158,
"grad_norm": 0.37137117981910706,
"learning_rate": 0.0005863552055993001,
"loss": 4.003,
"step": 4000
},
{
"epoch": 1.1658505304885158,
"eval_accuracy": 0.32443093231166104,
"eval_loss": 3.994969129562378,
"eval_runtime": 53.0558,
"eval_samples_per_second": 313.387,
"eval_steps_per_second": 19.602,
"step": 4000
},
{
"epoch": 1.1804243908126384,
"grad_norm": 0.34678372740745544,
"learning_rate": 0.0005861802274715659,
"loss": 4.0064,
"step": 4050
},
{
"epoch": 1.1949982511367612,
"grad_norm": 0.33129486441612244,
"learning_rate": 0.0005860052493438319,
"loss": 3.9905,
"step": 4100
},
{
"epoch": 1.2095721114608837,
"grad_norm": 0.3196367621421814,
"learning_rate": 0.0005858302712160979,
"loss": 4.0104,
"step": 4150
},
{
"epoch": 1.2241459717850065,
"grad_norm": 0.3991895318031311,
"learning_rate": 0.0005856552930883638,
"loss": 3.9833,
"step": 4200
},
{
"epoch": 1.238719832109129,
"grad_norm": 0.3395180106163025,
"learning_rate": 0.0005854803149606298,
"loss": 3.984,
"step": 4250
},
{
"epoch": 1.2532936924332518,
"grad_norm": 0.32869836688041687,
"learning_rate": 0.0005853053368328958,
"loss": 3.98,
"step": 4300
},
{
"epoch": 1.2678675527573744,
"grad_norm": 0.3420329689979553,
"learning_rate": 0.0005851303587051618,
"loss": 3.9745,
"step": 4350
},
{
"epoch": 1.282441413081497,
"grad_norm": 0.34595686197280884,
"learning_rate": 0.0005849553805774277,
"loss": 3.9838,
"step": 4400
},
{
"epoch": 1.2970152734056197,
"grad_norm": 0.3202042877674103,
"learning_rate": 0.0005847804024496937,
"loss": 3.9751,
"step": 4450
},
{
"epoch": 1.3115891337297423,
"grad_norm": 0.33933225274086,
"learning_rate": 0.0005846054243219597,
"loss": 3.9762,
"step": 4500
},
{
"epoch": 1.326162994053865,
"grad_norm": 0.3334672749042511,
"learning_rate": 0.0005844304461942257,
"loss": 3.9556,
"step": 4550
},
{
"epoch": 1.3407368543779876,
"grad_norm": 0.33130043745040894,
"learning_rate": 0.0005842554680664916,
"loss": 3.9444,
"step": 4600
},
{
"epoch": 1.3553107147021102,
"grad_norm": 0.33543631434440613,
"learning_rate": 0.0005840804899387576,
"loss": 3.9586,
"step": 4650
},
{
"epoch": 1.369884575026233,
"grad_norm": 0.3611527979373932,
"learning_rate": 0.0005839055118110236,
"loss": 3.9639,
"step": 4700
},
{
"epoch": 1.3844584353503557,
"grad_norm": 0.32615354657173157,
"learning_rate": 0.0005837305336832896,
"loss": 3.9483,
"step": 4750
},
{
"epoch": 1.3990322956744783,
"grad_norm": 0.3202286958694458,
"learning_rate": 0.0005835555555555555,
"loss": 3.9413,
"step": 4800
},
{
"epoch": 1.4136061559986008,
"grad_norm": 0.32702603936195374,
"learning_rate": 0.0005833805774278215,
"loss": 3.9485,
"step": 4850
},
{
"epoch": 1.4281800163227236,
"grad_norm": 0.33720141649246216,
"learning_rate": 0.0005832055993000875,
"loss": 3.9391,
"step": 4900
},
{
"epoch": 1.4427538766468462,
"grad_norm": 0.34604305028915405,
"learning_rate": 0.0005830306211723534,
"loss": 3.9409,
"step": 4950
},
{
"epoch": 1.457327736970969,
"grad_norm": 0.33776533603668213,
"learning_rate": 0.0005828556430446194,
"loss": 3.9556,
"step": 5000
},
{
"epoch": 1.457327736970969,
"eval_accuracy": 0.3308636590309987,
"eval_loss": 3.919948101043701,
"eval_runtime": 53.1821,
"eval_samples_per_second": 312.643,
"eval_steps_per_second": 19.555,
"step": 5000
},
{
"epoch": 1.4719015972950915,
"grad_norm": 0.33289724588394165,
"learning_rate": 0.0005826806649168854,
"loss": 3.9443,
"step": 5050
},
{
"epoch": 1.486475457619214,
"grad_norm": 0.35393133759498596,
"learning_rate": 0.0005825056867891514,
"loss": 3.927,
"step": 5100
},
{
"epoch": 1.5010493179433368,
"grad_norm": 0.3489775061607361,
"learning_rate": 0.0005823307086614172,
"loss": 3.9243,
"step": 5150
},
{
"epoch": 1.5156231782674596,
"grad_norm": 0.33480021357536316,
"learning_rate": 0.0005821557305336832,
"loss": 3.9314,
"step": 5200
},
{
"epoch": 1.5301970385915822,
"grad_norm": 0.32323822379112244,
"learning_rate": 0.0005819807524059492,
"loss": 3.9173,
"step": 5250
},
{
"epoch": 1.5447708989157047,
"grad_norm": 0.3644621670246124,
"learning_rate": 0.0005818057742782152,
"loss": 3.9221,
"step": 5300
},
{
"epoch": 1.5593447592398273,
"grad_norm": 0.32539132237434387,
"learning_rate": 0.0005816307961504811,
"loss": 3.9161,
"step": 5350
},
{
"epoch": 1.57391861956395,
"grad_norm": 0.32493916153907776,
"learning_rate": 0.0005814558180227471,
"loss": 3.9183,
"step": 5400
},
{
"epoch": 1.5884924798880729,
"grad_norm": 0.3585509955883026,
"learning_rate": 0.0005812808398950131,
"loss": 3.9221,
"step": 5450
},
{
"epoch": 1.6030663402121954,
"grad_norm": 0.32815247774124146,
"learning_rate": 0.0005811058617672791,
"loss": 3.9087,
"step": 5500
},
{
"epoch": 1.617640200536318,
"grad_norm": 0.3581700623035431,
"learning_rate": 0.000580930883639545,
"loss": 3.9166,
"step": 5550
},
{
"epoch": 1.6322140608604407,
"grad_norm": 0.3333188593387604,
"learning_rate": 0.000580755905511811,
"loss": 3.8997,
"step": 5600
},
{
"epoch": 1.6467879211845635,
"grad_norm": 0.3736306130886078,
"learning_rate": 0.000580580927384077,
"loss": 3.908,
"step": 5650
},
{
"epoch": 1.661361781508686,
"grad_norm": 0.3163699805736542,
"learning_rate": 0.0005804059492563429,
"loss": 3.9041,
"step": 5700
},
{
"epoch": 1.6759356418328086,
"grad_norm": 0.33947551250457764,
"learning_rate": 0.0005802309711286089,
"loss": 3.8887,
"step": 5750
},
{
"epoch": 1.6905095021569312,
"grad_norm": 0.3375563621520996,
"learning_rate": 0.0005800559930008749,
"loss": 3.8895,
"step": 5800
},
{
"epoch": 1.705083362481054,
"grad_norm": 0.34993767738342285,
"learning_rate": 0.0005798810148731408,
"loss": 3.8793,
"step": 5850
},
{
"epoch": 1.7196572228051767,
"grad_norm": 0.30305829644203186,
"learning_rate": 0.0005797060367454068,
"loss": 3.8957,
"step": 5900
},
{
"epoch": 1.7342310831292993,
"grad_norm": 0.34079498052597046,
"learning_rate": 0.0005795310586176728,
"loss": 3.8875,
"step": 5950
},
{
"epoch": 1.7488049434534219,
"grad_norm": 0.31197983026504517,
"learning_rate": 0.0005793560804899387,
"loss": 3.8834,
"step": 6000
},
{
"epoch": 1.7488049434534219,
"eval_accuracy": 0.3360885796650039,
"eval_loss": 3.8626701831817627,
"eval_runtime": 53.2105,
"eval_samples_per_second": 312.476,
"eval_steps_per_second": 19.545,
"step": 6000
},
{
"epoch": 1.7633788037775446,
"grad_norm": 0.35590726137161255,
"learning_rate": 0.0005791811023622047,
"loss": 3.8794,
"step": 6050
},
{
"epoch": 1.7779526641016674,
"grad_norm": 0.32964015007019043,
"learning_rate": 0.0005790061242344707,
"loss": 3.8809,
"step": 6100
},
{
"epoch": 1.79252652442579,
"grad_norm": 0.326513409614563,
"learning_rate": 0.0005788311461067365,
"loss": 3.8919,
"step": 6150
},
{
"epoch": 1.8071003847499125,
"grad_norm": 0.3079835772514343,
"learning_rate": 0.0005786561679790025,
"loss": 3.8739,
"step": 6200
},
{
"epoch": 1.821674245074035,
"grad_norm": 0.32225608825683594,
"learning_rate": 0.0005784811898512685,
"loss": 3.8682,
"step": 6250
},
{
"epoch": 1.8362481053981579,
"grad_norm": 0.3490994870662689,
"learning_rate": 0.0005783062117235344,
"loss": 3.8679,
"step": 6300
},
{
"epoch": 1.8508219657222806,
"grad_norm": 0.3336406648159027,
"learning_rate": 0.0005781312335958004,
"loss": 3.8754,
"step": 6350
},
{
"epoch": 1.8653958260464032,
"grad_norm": 0.3177735209465027,
"learning_rate": 0.0005779562554680664,
"loss": 3.8614,
"step": 6400
},
{
"epoch": 1.8799696863705257,
"grad_norm": 0.3288932740688324,
"learning_rate": 0.0005777812773403324,
"loss": 3.8722,
"step": 6450
},
{
"epoch": 1.8945435466946485,
"grad_norm": 0.32330477237701416,
"learning_rate": 0.0005776062992125983,
"loss": 3.8629,
"step": 6500
},
{
"epoch": 1.909117407018771,
"grad_norm": 0.329217791557312,
"learning_rate": 0.0005774313210848643,
"loss": 3.8653,
"step": 6550
},
{
"epoch": 1.9236912673428939,
"grad_norm": 0.3131251633167267,
"learning_rate": 0.0005772563429571303,
"loss": 3.8552,
"step": 6600
},
{
"epoch": 1.9382651276670164,
"grad_norm": 0.3294188976287842,
"learning_rate": 0.0005770813648293962,
"loss": 3.8606,
"step": 6650
},
{
"epoch": 1.952838987991139,
"grad_norm": 0.3348577916622162,
"learning_rate": 0.0005769063867016622,
"loss": 3.8644,
"step": 6700
},
{
"epoch": 1.9674128483152618,
"grad_norm": 0.3283770680427551,
"learning_rate": 0.0005767314085739282,
"loss": 3.849,
"step": 6750
},
{
"epoch": 1.9819867086393845,
"grad_norm": 0.3316490948200226,
"learning_rate": 0.0005765564304461942,
"loss": 3.8496,
"step": 6800
},
{
"epoch": 1.996560568963507,
"grad_norm": 0.30883973836898804,
"learning_rate": 0.0005763814523184601,
"loss": 3.8497,
"step": 6850
},
{
"epoch": 2.011076133846333,
"grad_norm": 0.32743337750434875,
"learning_rate": 0.0005762064741907261,
"loss": 3.7759,
"step": 6900
},
{
"epoch": 2.0256499941704558,
"grad_norm": 0.32366102933883667,
"learning_rate": 0.0005760314960629921,
"loss": 3.7522,
"step": 6950
},
{
"epoch": 2.0402238544945783,
"grad_norm": 0.3358854353427887,
"learning_rate": 0.0005758565179352581,
"loss": 3.7614,
"step": 7000
},
{
"epoch": 2.0402238544945783,
"eval_accuracy": 0.3401457111761609,
"eval_loss": 3.820042371749878,
"eval_runtime": 53.1073,
"eval_samples_per_second": 313.083,
"eval_steps_per_second": 19.583,
"step": 7000
},
{
"epoch": 2.0547977148187013,
"grad_norm": 0.32670819759368896,
"learning_rate": 0.000575681539807524,
"loss": 3.7503,
"step": 7050
},
{
"epoch": 2.069371575142824,
"grad_norm": 0.31555989384651184,
"learning_rate": 0.00057550656167979,
"loss": 3.7607,
"step": 7100
},
{
"epoch": 2.0839454354669464,
"grad_norm": 0.3106895089149475,
"learning_rate": 0.000575331583552056,
"loss": 3.7461,
"step": 7150
},
{
"epoch": 2.098519295791069,
"grad_norm": 0.33727002143859863,
"learning_rate": 0.000575156605424322,
"loss": 3.7591,
"step": 7200
},
{
"epoch": 2.113093156115192,
"grad_norm": 0.32135823369026184,
"learning_rate": 0.0005749816272965878,
"loss": 3.7584,
"step": 7250
},
{
"epoch": 2.1276670164393146,
"grad_norm": 0.3297559916973114,
"learning_rate": 0.0005748066491688538,
"loss": 3.7566,
"step": 7300
},
{
"epoch": 2.142240876763437,
"grad_norm": 0.34436535835266113,
"learning_rate": 0.0005746316710411198,
"loss": 3.7664,
"step": 7350
},
{
"epoch": 2.1568147370875597,
"grad_norm": 0.3044912815093994,
"learning_rate": 0.0005744566929133858,
"loss": 3.7561,
"step": 7400
},
{
"epoch": 2.171388597411682,
"grad_norm": 0.3473377525806427,
"learning_rate": 0.0005742817147856517,
"loss": 3.7588,
"step": 7450
},
{
"epoch": 2.185962457735805,
"grad_norm": 0.3503433167934418,
"learning_rate": 0.0005741067366579177,
"loss": 3.775,
"step": 7500
},
{
"epoch": 2.2005363180599278,
"grad_norm": 0.3245205581188202,
"learning_rate": 0.0005739317585301837,
"loss": 3.7608,
"step": 7550
},
{
"epoch": 2.2151101783840503,
"grad_norm": 0.313618004322052,
"learning_rate": 0.0005737567804024496,
"loss": 3.7689,
"step": 7600
},
{
"epoch": 2.229684038708173,
"grad_norm": 0.325967013835907,
"learning_rate": 0.0005735818022747156,
"loss": 3.7553,
"step": 7650
},
{
"epoch": 2.244257899032296,
"grad_norm": 0.31503552198410034,
"learning_rate": 0.0005734068241469816,
"loss": 3.7574,
"step": 7700
},
{
"epoch": 2.2588317593564184,
"grad_norm": 0.33212393522262573,
"learning_rate": 0.0005732318460192476,
"loss": 3.7583,
"step": 7750
},
{
"epoch": 2.273405619680541,
"grad_norm": 0.33172062039375305,
"learning_rate": 0.0005730568678915135,
"loss": 3.7637,
"step": 7800
},
{
"epoch": 2.2879794800046636,
"grad_norm": 0.33467426896095276,
"learning_rate": 0.0005728818897637795,
"loss": 3.7787,
"step": 7850
},
{
"epoch": 2.302553340328786,
"grad_norm": 0.3073655068874359,
"learning_rate": 0.0005727069116360455,
"loss": 3.7648,
"step": 7900
},
{
"epoch": 2.317127200652909,
"grad_norm": 0.3209739625453949,
"learning_rate": 0.0005725319335083115,
"loss": 3.7622,
"step": 7950
},
{
"epoch": 2.3317010609770317,
"grad_norm": 0.32014501094818115,
"learning_rate": 0.0005723569553805774,
"loss": 3.757,
"step": 8000
},
{
"epoch": 2.3317010609770317,
"eval_accuracy": 0.34350807760042285,
"eval_loss": 3.789243459701538,
"eval_runtime": 53.1145,
"eval_samples_per_second": 313.041,
"eval_steps_per_second": 19.58,
"step": 8000
},
{
"epoch": 2.3462749213011542,
"grad_norm": 0.32058241963386536,
"learning_rate": 0.0005721819772528434,
"loss": 3.7583,
"step": 8050
},
{
"epoch": 2.360848781625277,
"grad_norm": 0.3153894543647766,
"learning_rate": 0.0005720069991251094,
"loss": 3.7588,
"step": 8100
},
{
"epoch": 2.3754226419494,
"grad_norm": 0.3164069652557373,
"learning_rate": 0.0005718320209973753,
"loss": 3.7473,
"step": 8150
},
{
"epoch": 2.3899965022735223,
"grad_norm": 0.301470011472702,
"learning_rate": 0.0005716570428696413,
"loss": 3.7519,
"step": 8200
},
{
"epoch": 2.404570362597645,
"grad_norm": 0.31935805082321167,
"learning_rate": 0.0005714820647419073,
"loss": 3.7565,
"step": 8250
},
{
"epoch": 2.4191442229217675,
"grad_norm": 0.3229046165943146,
"learning_rate": 0.0005713070866141731,
"loss": 3.76,
"step": 8300
},
{
"epoch": 2.43371808324589,
"grad_norm": 0.3205743432044983,
"learning_rate": 0.0005711321084864391,
"loss": 3.7615,
"step": 8350
},
{
"epoch": 2.448291943570013,
"grad_norm": 0.33511775732040405,
"learning_rate": 0.0005709571303587051,
"loss": 3.7459,
"step": 8400
},
{
"epoch": 2.4628658038941356,
"grad_norm": 0.30935677886009216,
"learning_rate": 0.000570782152230971,
"loss": 3.7575,
"step": 8450
},
{
"epoch": 2.477439664218258,
"grad_norm": 0.31650465726852417,
"learning_rate": 0.000570607174103237,
"loss": 3.7435,
"step": 8500
},
{
"epoch": 2.4920135245423807,
"grad_norm": 0.32589587569236755,
"learning_rate": 0.000570432195975503,
"loss": 3.7407,
"step": 8550
},
{
"epoch": 2.5065873848665037,
"grad_norm": 0.33884397149086,
"learning_rate": 0.0005702572178477689,
"loss": 3.7479,
"step": 8600
},
{
"epoch": 2.5211612451906262,
"grad_norm": 0.3378530740737915,
"learning_rate": 0.0005700822397200349,
"loss": 3.7527,
"step": 8650
},
{
"epoch": 2.535735105514749,
"grad_norm": 0.31704697012901306,
"learning_rate": 0.0005699072615923009,
"loss": 3.7519,
"step": 8700
},
{
"epoch": 2.5503089658388713,
"grad_norm": 0.316654235124588,
"learning_rate": 0.0005697322834645668,
"loss": 3.7468,
"step": 8750
},
{
"epoch": 2.564882826162994,
"grad_norm": 0.3165382146835327,
"learning_rate": 0.0005695573053368328,
"loss": 3.7497,
"step": 8800
},
{
"epoch": 2.5794566864871165,
"grad_norm": 0.3091493248939514,
"learning_rate": 0.0005693823272090988,
"loss": 3.7419,
"step": 8850
},
{
"epoch": 2.5940305468112395,
"grad_norm": 0.30668315291404724,
"learning_rate": 0.0005692073490813648,
"loss": 3.7545,
"step": 8900
},
{
"epoch": 2.608604407135362,
"grad_norm": 0.3360009789466858,
"learning_rate": 0.0005690323709536307,
"loss": 3.7493,
"step": 8950
},
{
"epoch": 2.6231782674594846,
"grad_norm": 0.30944499373435974,
"learning_rate": 0.0005688573928258967,
"loss": 3.7486,
"step": 9000
},
{
"epoch": 2.6231782674594846,
"eval_accuracy": 0.3460370319324768,
"eval_loss": 3.757467031478882,
"eval_runtime": 53.2431,
"eval_samples_per_second": 312.284,
"eval_steps_per_second": 19.533,
"step": 9000
},
{
"epoch": 2.6377521277836076,
"grad_norm": 0.3274962902069092,
"learning_rate": 0.0005686824146981627,
"loss": 3.7323,
"step": 9050
},
{
"epoch": 2.65232598810773,
"grad_norm": 0.3120983839035034,
"learning_rate": 0.0005685074365704287,
"loss": 3.7383,
"step": 9100
},
{
"epoch": 2.6668998484318527,
"grad_norm": 0.31694895029067993,
"learning_rate": 0.0005683324584426946,
"loss": 3.7308,
"step": 9150
},
{
"epoch": 2.6814737087559752,
"grad_norm": 0.33465129137039185,
"learning_rate": 0.0005681574803149606,
"loss": 3.7403,
"step": 9200
},
{
"epoch": 2.696047569080098,
"grad_norm": 0.33591049909591675,
"learning_rate": 0.0005679825021872266,
"loss": 3.7318,
"step": 9250
},
{
"epoch": 2.7106214294042204,
"grad_norm": 0.3214552104473114,
"learning_rate": 0.0005678075240594926,
"loss": 3.7361,
"step": 9300
},
{
"epoch": 2.7251952897283434,
"grad_norm": 0.3201562166213989,
"learning_rate": 0.0005676325459317584,
"loss": 3.7222,
"step": 9350
},
{
"epoch": 2.739769150052466,
"grad_norm": 0.32140442728996277,
"learning_rate": 0.0005674575678040244,
"loss": 3.7411,
"step": 9400
},
{
"epoch": 2.7543430103765885,
"grad_norm": 0.3104303181171417,
"learning_rate": 0.0005672825896762904,
"loss": 3.7268,
"step": 9450
},
{
"epoch": 2.7689168707007115,
"grad_norm": 0.30419158935546875,
"learning_rate": 0.0005671076115485563,
"loss": 3.7381,
"step": 9500
},
{
"epoch": 2.783490731024834,
"grad_norm": 0.3095656931400299,
"learning_rate": 0.0005669326334208223,
"loss": 3.7364,
"step": 9550
},
{
"epoch": 2.7980645913489566,
"grad_norm": 0.33916303515434265,
"learning_rate": 0.0005667576552930883,
"loss": 3.7464,
"step": 9600
},
{
"epoch": 2.812638451673079,
"grad_norm": 0.3232407569885254,
"learning_rate": 0.0005665826771653543,
"loss": 3.7295,
"step": 9650
},
{
"epoch": 2.8272123119972017,
"grad_norm": 0.32287347316741943,
"learning_rate": 0.0005664076990376202,
"loss": 3.7335,
"step": 9700
},
{
"epoch": 2.8417861723213242,
"grad_norm": 0.30645373463630676,
"learning_rate": 0.0005662327209098862,
"loss": 3.717,
"step": 9750
},
{
"epoch": 2.8563600326454472,
"grad_norm": 0.3092345595359802,
"learning_rate": 0.0005660577427821522,
"loss": 3.7239,
"step": 9800
},
{
"epoch": 2.87093389296957,
"grad_norm": 0.3197495937347412,
"learning_rate": 0.0005658827646544182,
"loss": 3.739,
"step": 9850
},
{
"epoch": 2.8855077532936924,
"grad_norm": 0.32873114943504333,
"learning_rate": 0.0005657077865266841,
"loss": 3.7426,
"step": 9900
},
{
"epoch": 2.9000816136178154,
"grad_norm": 0.31242650747299194,
"learning_rate": 0.0005655328083989501,
"loss": 3.7344,
"step": 9950
},
{
"epoch": 2.914655473941938,
"grad_norm": 0.30841541290283203,
"learning_rate": 0.0005653578302712161,
"loss": 3.7276,
"step": 10000
},
{
"epoch": 2.914655473941938,
"eval_accuracy": 0.34864237158409617,
"eval_loss": 3.731783151626587,
"eval_runtime": 53.4879,
"eval_samples_per_second": 310.856,
"eval_steps_per_second": 19.444,
"step": 10000
},
{
"epoch": 2.9292293342660605,
"grad_norm": 0.2998133897781372,
"learning_rate": 0.0005651828521434821,
"loss": 3.7182,
"step": 10050
},
{
"epoch": 2.943803194590183,
"grad_norm": 0.2998456656932831,
"learning_rate": 0.000565007874015748,
"loss": 3.7475,
"step": 10100
},
{
"epoch": 2.9583770549143056,
"grad_norm": 0.308724969625473,
"learning_rate": 0.000564832895888014,
"loss": 3.7205,
"step": 10150
},
{
"epoch": 2.972950915238428,
"grad_norm": 0.3019901216030121,
"learning_rate": 0.00056465791776028,
"loss": 3.7242,
"step": 10200
},
{
"epoch": 2.987524775562551,
"grad_norm": 0.30775097012519836,
"learning_rate": 0.0005644829396325459,
"loss": 3.7263,
"step": 10250
},
{
"epoch": 3.0020403404453773,
"grad_norm": 0.31498757004737854,
"learning_rate": 0.0005643079615048119,
"loss": 3.7102,
"step": 10300
},
{
"epoch": 3.0166142007695,
"grad_norm": 0.3100306987762451,
"learning_rate": 0.0005641329833770779,
"loss": 3.6121,
"step": 10350
},
{
"epoch": 3.0311880610936224,
"grad_norm": 0.32588666677474976,
"learning_rate": 0.0005639580052493437,
"loss": 3.6233,
"step": 10400
},
{
"epoch": 3.045761921417745,
"grad_norm": 0.3099152147769928,
"learning_rate": 0.0005637830271216097,
"loss": 3.6062,
"step": 10450
},
{
"epoch": 3.060335781741868,
"grad_norm": 0.33343327045440674,
"learning_rate": 0.0005636080489938757,
"loss": 3.6271,
"step": 10500
},
{
"epoch": 3.0749096420659905,
"grad_norm": 0.3227072060108185,
"learning_rate": 0.0005634330708661417,
"loss": 3.6244,
"step": 10550
},
{
"epoch": 3.089483502390113,
"grad_norm": 0.3116724491119385,
"learning_rate": 0.0005632580927384076,
"loss": 3.6408,
"step": 10600
},
{
"epoch": 3.1040573627142356,
"grad_norm": 0.3207854628562927,
"learning_rate": 0.0005630831146106736,
"loss": 3.6246,
"step": 10650
},
{
"epoch": 3.1186312230383586,
"grad_norm": 0.33564624190330505,
"learning_rate": 0.0005629081364829396,
"loss": 3.6297,
"step": 10700
},
{
"epoch": 3.133205083362481,
"grad_norm": 0.3260132074356079,
"learning_rate": 0.0005627331583552055,
"loss": 3.6283,
"step": 10750
},
{
"epoch": 3.1477789436866037,
"grad_norm": 0.319987416267395,
"learning_rate": 0.0005625581802274715,
"loss": 3.6464,
"step": 10800
},
{
"epoch": 3.1623528040107263,
"grad_norm": 0.3306788206100464,
"learning_rate": 0.0005623832020997375,
"loss": 3.6449,
"step": 10850
},
{
"epoch": 3.176926664334849,
"grad_norm": 0.31286585330963135,
"learning_rate": 0.0005622082239720034,
"loss": 3.6245,
"step": 10900
},
{
"epoch": 3.191500524658972,
"grad_norm": 0.3328874111175537,
"learning_rate": 0.0005620332458442694,
"loss": 3.6358,
"step": 10950
},
{
"epoch": 3.2060743849830944,
"grad_norm": 0.31454306840896606,
"learning_rate": 0.0005618582677165354,
"loss": 3.6398,
"step": 11000
},
{
"epoch": 3.2060743849830944,
"eval_accuracy": 0.3506840605494305,
"eval_loss": 3.7200303077697754,
"eval_runtime": 53.1789,
"eval_samples_per_second": 312.662,
"eval_steps_per_second": 19.557,
"step": 11000
},
{
"epoch": 3.220648245307217,
"grad_norm": 0.34588032960891724,
"learning_rate": 0.0005616832895888013,
"loss": 3.6533,
"step": 11050
},
{
"epoch": 3.2352221056313395,
"grad_norm": 0.313573956489563,
"learning_rate": 0.0005615083114610673,
"loss": 3.6573,
"step": 11100
},
{
"epoch": 3.249795965955462,
"grad_norm": 0.31849417090415955,
"learning_rate": 0.0005613333333333333,
"loss": 3.6237,
"step": 11150
},
{
"epoch": 3.264369826279585,
"grad_norm": 0.32497328519821167,
"learning_rate": 0.0005611583552055992,
"loss": 3.6429,
"step": 11200
},
{
"epoch": 3.2789436866037076,
"grad_norm": 0.3148914873600006,
"learning_rate": 0.0005609833770778652,
"loss": 3.6336,
"step": 11250
},
{
"epoch": 3.29351754692783,
"grad_norm": 0.3106619417667389,
"learning_rate": 0.0005608083989501312,
"loss": 3.6433,
"step": 11300
},
{
"epoch": 3.3080914072519527,
"grad_norm": 0.31093135476112366,
"learning_rate": 0.0005606334208223972,
"loss": 3.6351,
"step": 11350
},
{
"epoch": 3.3226652675760757,
"grad_norm": 0.308928906917572,
"learning_rate": 0.000560458442694663,
"loss": 3.6378,
"step": 11400
},
{
"epoch": 3.3372391279001983,
"grad_norm": 0.3089756965637207,
"learning_rate": 0.000560283464566929,
"loss": 3.6515,
"step": 11450
},
{
"epoch": 3.351812988224321,
"grad_norm": 0.3176999092102051,
"learning_rate": 0.000560108486439195,
"loss": 3.6379,
"step": 11500
},
{
"epoch": 3.3663868485484434,
"grad_norm": 0.3044137954711914,
"learning_rate": 0.000559933508311461,
"loss": 3.6399,
"step": 11550
},
{
"epoch": 3.380960708872566,
"grad_norm": 0.31266817450523376,
"learning_rate": 0.0005597585301837269,
"loss": 3.6518,
"step": 11600
},
{
"epoch": 3.395534569196689,
"grad_norm": 0.32951512932777405,
"learning_rate": 0.0005595835520559929,
"loss": 3.6541,
"step": 11650
},
{
"epoch": 3.4101084295208115,
"grad_norm": 0.3169146478176117,
"learning_rate": 0.0005594085739282589,
"loss": 3.6457,
"step": 11700
},
{
"epoch": 3.424682289844934,
"grad_norm": 0.31720760464668274,
"learning_rate": 0.0005592335958005249,
"loss": 3.6483,
"step": 11750
},
{
"epoch": 3.4392561501690566,
"grad_norm": 0.3167465925216675,
"learning_rate": 0.0005590586176727908,
"loss": 3.6523,
"step": 11800
},
{
"epoch": 3.4538300104931796,
"grad_norm": 0.308098703622818,
"learning_rate": 0.0005588836395450568,
"loss": 3.6401,
"step": 11850
},
{
"epoch": 3.468403870817302,
"grad_norm": 0.3026895821094513,
"learning_rate": 0.0005587086614173228,
"loss": 3.6505,
"step": 11900
},
{
"epoch": 3.4829777311414247,
"grad_norm": 0.3005221486091614,
"learning_rate": 0.0005585336832895888,
"loss": 3.6451,
"step": 11950
},
{
"epoch": 3.4975515914655473,
"grad_norm": 0.3093535304069519,
"learning_rate": 0.0005583587051618547,
"loss": 3.6512,
"step": 12000
},
{
"epoch": 3.4975515914655473,
"eval_accuracy": 0.35233946812984374,
"eval_loss": 3.7018749713897705,
"eval_runtime": 53.3009,
"eval_samples_per_second": 311.946,
"eval_steps_per_second": 19.512,
"step": 12000
},
{
"epoch": 3.51212545178967,
"grad_norm": 0.3283046782016754,
"learning_rate": 0.0005581837270341207,
"loss": 3.6413,
"step": 12050
},
{
"epoch": 3.526699312113793,
"grad_norm": 0.32031574845314026,
"learning_rate": 0.0005580087489063867,
"loss": 3.6525,
"step": 12100
},
{
"epoch": 3.5412731724379154,
"grad_norm": 0.30100202560424805,
"learning_rate": 0.0005578337707786526,
"loss": 3.6395,
"step": 12150
},
{
"epoch": 3.555847032762038,
"grad_norm": 0.32140886783599854,
"learning_rate": 0.0005576587926509186,
"loss": 3.6548,
"step": 12200
},
{
"epoch": 3.5704208930861605,
"grad_norm": 0.31598982214927673,
"learning_rate": 0.0005574838145231846,
"loss": 3.6401,
"step": 12250
},
{
"epoch": 3.5849947534102835,
"grad_norm": 0.33419525623321533,
"learning_rate": 0.0005573088363954506,
"loss": 3.6517,
"step": 12300
},
{
"epoch": 3.599568613734406,
"grad_norm": 0.32463476061820984,
"learning_rate": 0.0005571338582677165,
"loss": 3.6566,
"step": 12350
},
{
"epoch": 3.6141424740585286,
"grad_norm": 0.30612531304359436,
"learning_rate": 0.0005569588801399825,
"loss": 3.6432,
"step": 12400
},
{
"epoch": 3.628716334382651,
"grad_norm": 0.3223789930343628,
"learning_rate": 0.0005567839020122485,
"loss": 3.6425,
"step": 12450
},
{
"epoch": 3.6432901947067737,
"grad_norm": 0.30926087498664856,
"learning_rate": 0.0005566089238845145,
"loss": 3.6481,
"step": 12500
},
{
"epoch": 3.6578640550308967,
"grad_norm": 0.33441799879074097,
"learning_rate": 0.0005564339457567803,
"loss": 3.6521,
"step": 12550
},
{
"epoch": 3.6724379153550193,
"grad_norm": 0.3189132511615753,
"learning_rate": 0.0005562589676290463,
"loss": 3.6557,
"step": 12600
},
{
"epoch": 3.687011775679142,
"grad_norm": 0.30546867847442627,
"learning_rate": 0.0005560839895013123,
"loss": 3.653,
"step": 12650
},
{
"epoch": 3.7015856360032644,
"grad_norm": 0.3111048936843872,
"learning_rate": 0.0005559090113735782,
"loss": 3.6362,
"step": 12700
},
{
"epoch": 3.7161594963273874,
"grad_norm": 0.3029773235321045,
"learning_rate": 0.0005557340332458442,
"loss": 3.6503,
"step": 12750
},
{
"epoch": 3.73073335665151,
"grad_norm": 0.31810298562049866,
"learning_rate": 0.0005555590551181102,
"loss": 3.6596,
"step": 12800
},
{
"epoch": 3.7453072169756325,
"grad_norm": 0.3288934528827667,
"learning_rate": 0.0005553840769903761,
"loss": 3.6478,
"step": 12850
},
{
"epoch": 3.759881077299755,
"grad_norm": 0.3125859200954437,
"learning_rate": 0.0005552090988626421,
"loss": 3.6452,
"step": 12900
},
{
"epoch": 3.7744549376238776,
"grad_norm": 0.3162294626235962,
"learning_rate": 0.0005550341207349081,
"loss": 3.6445,
"step": 12950
},
{
"epoch": 3.7890287979480006,
"grad_norm": 0.3025747537612915,
"learning_rate": 0.000554859142607174,
"loss": 3.6577,
"step": 13000
},
{
"epoch": 3.7890287979480006,
"eval_accuracy": 0.35399511110415394,
"eval_loss": 3.682143449783325,
"eval_runtime": 53.2805,
"eval_samples_per_second": 312.065,
"eval_steps_per_second": 19.519,
"step": 13000
},
{
"epoch": 3.803602658272123,
"grad_norm": 0.3304448425769806,
"learning_rate": 0.00055468416447944,
"loss": 3.6377,
"step": 13050
},
{
"epoch": 3.8181765185962457,
"grad_norm": 0.29563280940055847,
"learning_rate": 0.000554509186351706,
"loss": 3.6455,
"step": 13100
},
{
"epoch": 3.8327503789203683,
"grad_norm": 0.30134159326553345,
"learning_rate": 0.000554334208223972,
"loss": 3.6413,
"step": 13150
},
{
"epoch": 3.8473242392444913,
"grad_norm": 0.3190068304538727,
"learning_rate": 0.0005541592300962379,
"loss": 3.6308,
"step": 13200
},
{
"epoch": 3.861898099568614,
"grad_norm": 0.30791667103767395,
"learning_rate": 0.0005539842519685039,
"loss": 3.6437,
"step": 13250
},
{
"epoch": 3.8764719598927364,
"grad_norm": 0.3044162094593048,
"learning_rate": 0.0005538092738407699,
"loss": 3.6238,
"step": 13300
},
{
"epoch": 3.891045820216859,
"grad_norm": 0.31529924273490906,
"learning_rate": 0.0005536342957130358,
"loss": 3.6585,
"step": 13350
},
{
"epoch": 3.9056196805409815,
"grad_norm": 0.2981610596179962,
"learning_rate": 0.0005534593175853018,
"loss": 3.6456,
"step": 13400
},
{
"epoch": 3.920193540865104,
"grad_norm": 0.3133598566055298,
"learning_rate": 0.0005532843394575678,
"loss": 3.6483,
"step": 13450
},
{
"epoch": 3.934767401189227,
"grad_norm": 0.3186839520931244,
"learning_rate": 0.0005531093613298337,
"loss": 3.6346,
"step": 13500
},
{
"epoch": 3.9493412615133496,
"grad_norm": 0.3375805914402008,
"learning_rate": 0.0005529343832020997,
"loss": 3.6408,
"step": 13550
},
{
"epoch": 3.963915121837472,
"grad_norm": 0.30909714102745056,
"learning_rate": 0.0005527594050743656,
"loss": 3.6381,
"step": 13600
},
{
"epoch": 3.978488982161595,
"grad_norm": 0.30542704463005066,
"learning_rate": 0.0005525844269466316,
"loss": 3.6544,
"step": 13650
},
{
"epoch": 3.9930628424857177,
"grad_norm": 0.31719449162483215,
"learning_rate": 0.0005524094488188975,
"loss": 3.6531,
"step": 13700
},
{
"epoch": 4.007578407368544,
"grad_norm": 0.32333192229270935,
"learning_rate": 0.0005522344706911635,
"loss": 3.5804,
"step": 13750
},
{
"epoch": 4.022152267692666,
"grad_norm": 0.33345669507980347,
"learning_rate": 0.0005520594925634295,
"loss": 3.5267,
"step": 13800
},
{
"epoch": 4.036726128016789,
"grad_norm": 0.34269770979881287,
"learning_rate": 0.0005518845144356954,
"loss": 3.5349,
"step": 13850
},
{
"epoch": 4.0512999883409115,
"grad_norm": 0.3315916955471039,
"learning_rate": 0.0005517095363079614,
"loss": 3.5392,
"step": 13900
},
{
"epoch": 4.065873848665034,
"grad_norm": 0.31645020842552185,
"learning_rate": 0.0005515345581802274,
"loss": 3.563,
"step": 13950
},
{
"epoch": 4.080447708989157,
"grad_norm": 0.3305814564228058,
"learning_rate": 0.0005513595800524934,
"loss": 3.5496,
"step": 14000
},
{
"epoch": 4.080447708989157,
"eval_accuracy": 0.3554473737514855,
"eval_loss": 3.676452875137329,
"eval_runtime": 53.3246,
"eval_samples_per_second": 311.807,
"eval_steps_per_second": 19.503,
"step": 14000
},
{
"epoch": 4.09502156931328,
"grad_norm": 0.30775004625320435,
"learning_rate": 0.0005511846019247593,
"loss": 3.5461,
"step": 14050
},
{
"epoch": 4.109595429637403,
"grad_norm": 0.32936856150627136,
"learning_rate": 0.0005510096237970253,
"loss": 3.539,
"step": 14100
},
{
"epoch": 4.124169289961525,
"grad_norm": 0.3322300314903259,
"learning_rate": 0.0005508346456692913,
"loss": 3.5399,
"step": 14150
},
{
"epoch": 4.138743150285648,
"grad_norm": 0.34817028045654297,
"learning_rate": 0.0005506596675415573,
"loss": 3.5356,
"step": 14200
},
{
"epoch": 4.15331701060977,
"grad_norm": 0.31786543130874634,
"learning_rate": 0.0005504846894138232,
"loss": 3.5594,
"step": 14250
},
{
"epoch": 4.167890870933893,
"grad_norm": 0.3049430549144745,
"learning_rate": 0.0005503097112860892,
"loss": 3.5576,
"step": 14300
},
{
"epoch": 4.182464731258015,
"grad_norm": 0.32783564925193787,
"learning_rate": 0.0005501347331583552,
"loss": 3.5633,
"step": 14350
},
{
"epoch": 4.197038591582138,
"grad_norm": 0.33510342240333557,
"learning_rate": 0.0005499597550306212,
"loss": 3.5555,
"step": 14400
},
{
"epoch": 4.2116124519062605,
"grad_norm": 0.301746666431427,
"learning_rate": 0.0005497847769028871,
"loss": 3.5529,
"step": 14450
},
{
"epoch": 4.226186312230384,
"grad_norm": 0.33126580715179443,
"learning_rate": 0.0005496097987751531,
"loss": 3.5658,
"step": 14500
},
{
"epoch": 4.2407601725545065,
"grad_norm": 0.3298133313655853,
"learning_rate": 0.0005494348206474191,
"loss": 3.561,
"step": 14550
},
{
"epoch": 4.255334032878629,
"grad_norm": 0.33059850335121155,
"learning_rate": 0.0005492598425196851,
"loss": 3.5577,
"step": 14600
},
{
"epoch": 4.269907893202752,
"grad_norm": 0.32975366711616516,
"learning_rate": 0.000549084864391951,
"loss": 3.5681,
"step": 14650
},
{
"epoch": 4.284481753526874,
"grad_norm": 0.33336469531059265,
"learning_rate": 0.000548909886264217,
"loss": 3.572,
"step": 14700
},
{
"epoch": 4.299055613850997,
"grad_norm": 0.3046822249889374,
"learning_rate": 0.000548734908136483,
"loss": 3.5711,
"step": 14750
},
{
"epoch": 4.313629474175119,
"grad_norm": 0.3244114816188812,
"learning_rate": 0.0005485599300087488,
"loss": 3.5777,
"step": 14800
},
{
"epoch": 4.328203334499242,
"grad_norm": 0.3168890178203583,
"learning_rate": 0.0005483849518810148,
"loss": 3.5727,
"step": 14850
},
{
"epoch": 4.342777194823364,
"grad_norm": 0.3299658000469208,
"learning_rate": 0.0005482099737532808,
"loss": 3.5626,
"step": 14900
},
{
"epoch": 4.357351055147488,
"grad_norm": 0.29865631461143494,
"learning_rate": 0.0005480349956255468,
"loss": 3.5794,
"step": 14950
},
{
"epoch": 4.37192491547161,
"grad_norm": 0.33295169472694397,
"learning_rate": 0.0005478600174978127,
"loss": 3.5622,
"step": 15000
},
{
"epoch": 4.37192491547161,
"eval_accuracy": 0.35635163940668024,
"eval_loss": 3.6663150787353516,
"eval_runtime": 53.2482,
"eval_samples_per_second": 312.255,
"eval_steps_per_second": 19.531,
"step": 15000
},
{
"epoch": 4.386498775795733,
"grad_norm": 0.3243556618690491,
"learning_rate": 0.0005476850393700787,
"loss": 3.5707,
"step": 15050
},
{
"epoch": 4.4010726361198556,
"grad_norm": 0.3029650151729584,
"learning_rate": 0.0005475100612423447,
"loss": 3.5766,
"step": 15100
},
{
"epoch": 4.415646496443978,
"grad_norm": 0.3167254626750946,
"learning_rate": 0.0005473350831146106,
"loss": 3.5817,
"step": 15150
},
{
"epoch": 4.430220356768101,
"grad_norm": 0.32114019989967346,
"learning_rate": 0.0005471601049868766,
"loss": 3.5813,
"step": 15200
},
{
"epoch": 4.444794217092223,
"grad_norm": 0.3149368464946747,
"learning_rate": 0.0005469851268591426,
"loss": 3.5623,
"step": 15250
},
{
"epoch": 4.459368077416346,
"grad_norm": 0.3121349513530731,
"learning_rate": 0.0005468101487314085,
"loss": 3.5768,
"step": 15300
},
{
"epoch": 4.473941937740468,
"grad_norm": 0.32959362864494324,
"learning_rate": 0.0005466351706036745,
"loss": 3.5778,
"step": 15350
},
{
"epoch": 4.488515798064592,
"grad_norm": 0.349617063999176,
"learning_rate": 0.0005464601924759405,
"loss": 3.5847,
"step": 15400
},
{
"epoch": 4.503089658388714,
"grad_norm": 0.326031357049942,
"learning_rate": 0.0005462852143482064,
"loss": 3.5799,
"step": 15450
},
{
"epoch": 4.517663518712837,
"grad_norm": 0.33209681510925293,
"learning_rate": 0.0005461102362204724,
"loss": 3.572,
"step": 15500
},
{
"epoch": 4.5322373790369594,
"grad_norm": 0.30135366320610046,
"learning_rate": 0.0005459352580927384,
"loss": 3.5844,
"step": 15550
},
{
"epoch": 4.546811239361082,
"grad_norm": 0.3455754518508911,
"learning_rate": 0.0005457602799650043,
"loss": 3.5838,
"step": 15600
},
{
"epoch": 4.561385099685205,
"grad_norm": 0.30380716919898987,
"learning_rate": 0.0005455853018372703,
"loss": 3.5783,
"step": 15650
},
{
"epoch": 4.575958960009327,
"grad_norm": 0.33048829436302185,
"learning_rate": 0.0005454103237095363,
"loss": 3.5857,
"step": 15700
},
{
"epoch": 4.59053282033345,
"grad_norm": 0.32799360156059265,
"learning_rate": 0.0005452353455818022,
"loss": 3.5731,
"step": 15750
},
{
"epoch": 4.605106680657572,
"grad_norm": 0.3069393038749695,
"learning_rate": 0.0005450603674540681,
"loss": 3.5851,
"step": 15800
},
{
"epoch": 4.619680540981696,
"grad_norm": 0.3098676800727844,
"learning_rate": 0.0005448853893263341,
"loss": 3.5782,
"step": 15850
},
{
"epoch": 4.634254401305818,
"grad_norm": 0.3361780643463135,
"learning_rate": 0.0005447104111986001,
"loss": 3.5892,
"step": 15900
},
{
"epoch": 4.648828261629941,
"grad_norm": 0.3276433050632477,
"learning_rate": 0.000544535433070866,
"loss": 3.5825,
"step": 15950
},
{
"epoch": 4.663402121954063,
"grad_norm": 0.3088076412677765,
"learning_rate": 0.000544360454943132,
"loss": 3.5784,
"step": 16000
},
{
"epoch": 4.663402121954063,
"eval_accuracy": 0.3573661871026036,
"eval_loss": 3.6518714427948,
"eval_runtime": 53.2511,
"eval_samples_per_second": 312.238,
"eval_steps_per_second": 19.53,
"step": 16000
},
{
"epoch": 4.677975982278186,
"grad_norm": 0.3108406960964203,
"learning_rate": 0.000544185476815398,
"loss": 3.596,
"step": 16050
},
{
"epoch": 4.6925498426023085,
"grad_norm": 0.31319963932037354,
"learning_rate": 0.000544010498687664,
"loss": 3.5762,
"step": 16100
},
{
"epoch": 4.707123702926431,
"grad_norm": 0.3112225830554962,
"learning_rate": 0.0005438355205599299,
"loss": 3.5807,
"step": 16150
},
{
"epoch": 4.721697563250554,
"grad_norm": 0.32752206921577454,
"learning_rate": 0.0005436605424321959,
"loss": 3.5715,
"step": 16200
},
{
"epoch": 4.736271423574676,
"grad_norm": 0.3147103786468506,
"learning_rate": 0.0005434855643044619,
"loss": 3.579,
"step": 16250
},
{
"epoch": 4.7508452838988,
"grad_norm": 0.3242517113685608,
"learning_rate": 0.0005433105861767279,
"loss": 3.5785,
"step": 16300
},
{
"epoch": 4.765419144222922,
"grad_norm": 0.29891932010650635,
"learning_rate": 0.0005431356080489938,
"loss": 3.5741,
"step": 16350
},
{
"epoch": 4.779993004547045,
"grad_norm": 0.3357846736907959,
"learning_rate": 0.0005429606299212598,
"loss": 3.5738,
"step": 16400
},
{
"epoch": 4.794566864871167,
"grad_norm": 0.3278196454048157,
"learning_rate": 0.0005427856517935258,
"loss": 3.5824,
"step": 16450
},
{
"epoch": 4.80914072519529,
"grad_norm": 0.29816699028015137,
"learning_rate": 0.0005426106736657917,
"loss": 3.5886,
"step": 16500
},
{
"epoch": 4.823714585519412,
"grad_norm": 0.31529319286346436,
"learning_rate": 0.0005424356955380577,
"loss": 3.5942,
"step": 16550
},
{
"epoch": 4.838288445843535,
"grad_norm": 0.31740570068359375,
"learning_rate": 0.0005422607174103237,
"loss": 3.5816,
"step": 16600
},
{
"epoch": 4.8528623061676575,
"grad_norm": 0.31539270281791687,
"learning_rate": 0.0005420857392825897,
"loss": 3.5812,
"step": 16650
},
{
"epoch": 4.86743616649178,
"grad_norm": 0.3055557608604431,
"learning_rate": 0.0005419107611548556,
"loss": 3.583,
"step": 16700
},
{
"epoch": 4.8820100268159035,
"grad_norm": 0.3405109643936157,
"learning_rate": 0.0005417357830271216,
"loss": 3.5824,
"step": 16750
},
{
"epoch": 4.896583887140026,
"grad_norm": 0.3146877586841583,
"learning_rate": 0.0005415608048993876,
"loss": 3.5799,
"step": 16800
},
{
"epoch": 4.911157747464149,
"grad_norm": 0.32447442412376404,
"learning_rate": 0.0005413858267716535,
"loss": 3.5765,
"step": 16850
},
{
"epoch": 4.925731607788271,
"grad_norm": 0.3202003240585327,
"learning_rate": 0.0005412108486439194,
"loss": 3.5972,
"step": 16900
},
{
"epoch": 4.940305468112394,
"grad_norm": 0.3180946409702301,
"learning_rate": 0.0005410358705161854,
"loss": 3.5874,
"step": 16950
},
{
"epoch": 4.954879328436516,
"grad_norm": 0.3090243637561798,
"learning_rate": 0.0005408608923884514,
"loss": 3.5766,
"step": 17000
},
{
"epoch": 4.954879328436516,
"eval_accuracy": 0.35883657508000155,
"eval_loss": 3.636709690093994,
"eval_runtime": 53.2502,
"eval_samples_per_second": 312.243,
"eval_steps_per_second": 19.53,
"step": 17000
},
{
"epoch": 4.969453188760639,
"grad_norm": 0.30562207102775574,
"learning_rate": 0.0005406859142607174,
"loss": 3.5694,
"step": 17050
},
{
"epoch": 4.984027049084761,
"grad_norm": 0.31890225410461426,
"learning_rate": 0.0005405109361329833,
"loss": 3.5781,
"step": 17100
},
{
"epoch": 4.998600909408884,
"grad_norm": 0.3138265013694763,
"learning_rate": 0.0005403359580052493,
"loss": 3.5757,
"step": 17150
},
{
"epoch": 5.01311647429171,
"grad_norm": 0.3236299753189087,
"learning_rate": 0.0005401609798775153,
"loss": 3.4683,
"step": 17200
},
{
"epoch": 5.0276903346158335,
"grad_norm": 0.3176940381526947,
"learning_rate": 0.0005399860017497813,
"loss": 3.474,
"step": 17250
},
{
"epoch": 5.042264194939956,
"grad_norm": 0.3468603491783142,
"learning_rate": 0.0005398110236220472,
"loss": 3.4827,
"step": 17300
},
{
"epoch": 5.056838055264079,
"grad_norm": 0.3197671175003052,
"learning_rate": 0.0005396360454943132,
"loss": 3.4756,
"step": 17350
},
{
"epoch": 5.071411915588201,
"grad_norm": 0.3230718672275543,
"learning_rate": 0.0005394610673665792,
"loss": 3.484,
"step": 17400
},
{
"epoch": 5.085985775912324,
"grad_norm": 0.3250696361064911,
"learning_rate": 0.0005392860892388451,
"loss": 3.4808,
"step": 17450
},
{
"epoch": 5.100559636236446,
"grad_norm": 0.3313814103603363,
"learning_rate": 0.0005391111111111111,
"loss": 3.4877,
"step": 17500
},
{
"epoch": 5.115133496560569,
"grad_norm": 0.3151894807815552,
"learning_rate": 0.0005389361329833771,
"loss": 3.4858,
"step": 17550
},
{
"epoch": 5.129707356884691,
"grad_norm": 0.31763774156570435,
"learning_rate": 0.000538761154855643,
"loss": 3.4894,
"step": 17600
},
{
"epoch": 5.144281217208814,
"grad_norm": 0.31315064430236816,
"learning_rate": 0.000538586176727909,
"loss": 3.4958,
"step": 17650
},
{
"epoch": 5.158855077532937,
"grad_norm": 0.3251068890094757,
"learning_rate": 0.000538411198600175,
"loss": 3.4978,
"step": 17700
},
{
"epoch": 5.17342893785706,
"grad_norm": 0.3382989466190338,
"learning_rate": 0.0005382362204724409,
"loss": 3.4971,
"step": 17750
},
{
"epoch": 5.1880027981811825,
"grad_norm": 0.3223947286605835,
"learning_rate": 0.0005380612423447069,
"loss": 3.5032,
"step": 17800
},
{
"epoch": 5.202576658505305,
"grad_norm": 0.31984540820121765,
"learning_rate": 0.0005378862642169729,
"loss": 3.5002,
"step": 17850
},
{
"epoch": 5.217150518829428,
"grad_norm": 0.2947542071342468,
"learning_rate": 0.0005377112860892387,
"loss": 3.5097,
"step": 17900
},
{
"epoch": 5.23172437915355,
"grad_norm": 0.3217833936214447,
"learning_rate": 0.0005375363079615047,
"loss": 3.5042,
"step": 17950
},
{
"epoch": 5.246298239477673,
"grad_norm": 0.32693716883659363,
"learning_rate": 0.0005373613298337707,
"loss": 3.5103,
"step": 18000
},
{
"epoch": 5.246298239477673,
"eval_accuracy": 0.3593892799500777,
"eval_loss": 3.6392245292663574,
"eval_runtime": 53.2938,
"eval_samples_per_second": 311.987,
"eval_steps_per_second": 19.514,
"step": 18000
},
{
"epoch": 5.260872099801795,
"grad_norm": 0.3177185654640198,
"learning_rate": 0.0005371863517060366,
"loss": 3.5056,
"step": 18050
},
{
"epoch": 5.275445960125918,
"grad_norm": 0.338419109582901,
"learning_rate": 0.0005370113735783026,
"loss": 3.5179,
"step": 18100
},
{
"epoch": 5.290019820450041,
"grad_norm": 0.32331186532974243,
"learning_rate": 0.0005368363954505686,
"loss": 3.5053,
"step": 18150
},
{
"epoch": 5.304593680774164,
"grad_norm": 0.32105499505996704,
"learning_rate": 0.0005366614173228346,
"loss": 3.5151,
"step": 18200
},
{
"epoch": 5.319167541098286,
"grad_norm": 0.35447439551353455,
"learning_rate": 0.0005364864391951005,
"loss": 3.5032,
"step": 18250
},
{
"epoch": 5.333741401422409,
"grad_norm": 0.325785756111145,
"learning_rate": 0.0005363114610673665,
"loss": 3.5201,
"step": 18300
},
{
"epoch": 5.3483152617465315,
"grad_norm": 0.3268141746520996,
"learning_rate": 0.0005361364829396325,
"loss": 3.5109,
"step": 18350
},
{
"epoch": 5.362889122070654,
"grad_norm": 0.31733396649360657,
"learning_rate": 0.0005359615048118984,
"loss": 3.5147,
"step": 18400
},
{
"epoch": 5.377462982394777,
"grad_norm": 0.317416250705719,
"learning_rate": 0.0005357865266841644,
"loss": 3.5209,
"step": 18450
},
{
"epoch": 5.392036842718899,
"grad_norm": 0.3161477744579315,
"learning_rate": 0.0005356115485564304,
"loss": 3.5195,
"step": 18500
},
{
"epoch": 5.406610703043022,
"grad_norm": 0.3308257758617401,
"learning_rate": 0.0005354365704286964,
"loss": 3.5269,
"step": 18550
},
{
"epoch": 5.421184563367145,
"grad_norm": 0.3476756811141968,
"learning_rate": 0.0005352615923009623,
"loss": 3.5241,
"step": 18600
},
{
"epoch": 5.435758423691268,
"grad_norm": 0.3217465579509735,
"learning_rate": 0.0005350866141732283,
"loss": 3.5084,
"step": 18650
},
{
"epoch": 5.45033228401539,
"grad_norm": 0.31799283623695374,
"learning_rate": 0.0005349116360454943,
"loss": 3.5273,
"step": 18700
},
{
"epoch": 5.464906144339513,
"grad_norm": 0.3282622992992401,
"learning_rate": 0.0005347366579177603,
"loss": 3.5296,
"step": 18750
},
{
"epoch": 5.479480004663635,
"grad_norm": 0.29494890570640564,
"learning_rate": 0.0005345616797900262,
"loss": 3.5258,
"step": 18800
},
{
"epoch": 5.494053864987758,
"grad_norm": 0.3265093266963959,
"learning_rate": 0.0005343867016622922,
"loss": 3.5197,
"step": 18850
},
{
"epoch": 5.5086277253118805,
"grad_norm": 0.3251166045665741,
"learning_rate": 0.0005342117235345582,
"loss": 3.5268,
"step": 18900
},
{
"epoch": 5.523201585636003,
"grad_norm": 0.3073839545249939,
"learning_rate": 0.0005340367454068242,
"loss": 3.5234,
"step": 18950
},
{
"epoch": 5.537775445960126,
"grad_norm": 0.3181171417236328,
"learning_rate": 0.00053386176727909,
"loss": 3.5274,
"step": 19000
},
{
"epoch": 5.537775445960126,
"eval_accuracy": 0.3603505109283382,
"eval_loss": 3.629598617553711,
"eval_runtime": 53.4392,
"eval_samples_per_second": 311.138,
"eval_steps_per_second": 19.461,
"step": 19000
},
{
"epoch": 5.552349306284249,
"grad_norm": 0.31103068590164185,
"learning_rate": 0.000533686789151356,
"loss": 3.5327,
"step": 19050
},
{
"epoch": 5.566923166608372,
"grad_norm": 0.33358463644981384,
"learning_rate": 0.000533511811023622,
"loss": 3.5186,
"step": 19100
},
{
"epoch": 5.581497026932494,
"grad_norm": 0.3126921057701111,
"learning_rate": 0.000533336832895888,
"loss": 3.5254,
"step": 19150
},
{
"epoch": 5.596070887256617,
"grad_norm": 0.3349047303199768,
"learning_rate": 0.0005331618547681539,
"loss": 3.5188,
"step": 19200
},
{
"epoch": 5.610644747580739,
"grad_norm": 0.3327292203903198,
"learning_rate": 0.0005329868766404199,
"loss": 3.5282,
"step": 19250
},
{
"epoch": 5.625218607904862,
"grad_norm": 0.29962432384490967,
"learning_rate": 0.0005328118985126859,
"loss": 3.5257,
"step": 19300
},
{
"epoch": 5.639792468228984,
"grad_norm": 0.31677988171577454,
"learning_rate": 0.0005326369203849518,
"loss": 3.5326,
"step": 19350
},
{
"epoch": 5.654366328553107,
"grad_norm": 0.3229268193244934,
"learning_rate": 0.0005324619422572178,
"loss": 3.5268,
"step": 19400
},
{
"epoch": 5.6689401888772295,
"grad_norm": 0.3216641843318939,
"learning_rate": 0.0005322869641294838,
"loss": 3.5276,
"step": 19450
},
{
"epoch": 5.683514049201353,
"grad_norm": 0.3101942539215088,
"learning_rate": 0.0005321119860017498,
"loss": 3.5275,
"step": 19500
},
{
"epoch": 5.698087909525475,
"grad_norm": 0.3119480013847351,
"learning_rate": 0.0005319370078740157,
"loss": 3.5369,
"step": 19550
},
{
"epoch": 5.712661769849598,
"grad_norm": 0.32610809803009033,
"learning_rate": 0.0005317620297462817,
"loss": 3.5344,
"step": 19600
},
{
"epoch": 5.727235630173721,
"grad_norm": 0.32529962062835693,
"learning_rate": 0.0005315870516185477,
"loss": 3.5331,
"step": 19650
},
{
"epoch": 5.741809490497843,
"grad_norm": 0.3158641457557678,
"learning_rate": 0.0005314120734908137,
"loss": 3.5367,
"step": 19700
},
{
"epoch": 5.756383350821966,
"grad_norm": 0.32387575507164,
"learning_rate": 0.0005312370953630796,
"loss": 3.533,
"step": 19750
},
{
"epoch": 5.770957211146088,
"grad_norm": 0.3478480875492096,
"learning_rate": 0.0005310621172353456,
"loss": 3.5447,
"step": 19800
},
{
"epoch": 5.785531071470211,
"grad_norm": 0.33817481994628906,
"learning_rate": 0.0005308871391076116,
"loss": 3.5409,
"step": 19850
},
{
"epoch": 5.800104931794333,
"grad_norm": 0.31714707612991333,
"learning_rate": 0.0005307121609798775,
"loss": 3.5228,
"step": 19900
},
{
"epoch": 5.814678792118456,
"grad_norm": 0.3329344093799591,
"learning_rate": 0.0005305371828521435,
"loss": 3.5459,
"step": 19950
},
{
"epoch": 5.8292526524425785,
"grad_norm": 0.3378346264362335,
"learning_rate": 0.0005303622047244095,
"loss": 3.5278,
"step": 20000
},
{
"epoch": 5.8292526524425785,
"eval_accuracy": 0.36104056813729396,
"eval_loss": 3.616792917251587,
"eval_runtime": 53.2411,
"eval_samples_per_second": 312.296,
"eval_steps_per_second": 19.534,
"step": 20000
},
{
"epoch": 5.843826512766702,
"grad_norm": 0.324442982673645,
"learning_rate": 0.0005301872265966753,
"loss": 3.5276,
"step": 20050
},
{
"epoch": 5.8584003730908245,
"grad_norm": 0.3171297311782837,
"learning_rate": 0.0005300122484689413,
"loss": 3.5281,
"step": 20100
},
{
"epoch": 5.872974233414947,
"grad_norm": 0.35564813017845154,
"learning_rate": 0.0005298372703412073,
"loss": 3.5407,
"step": 20150
},
{
"epoch": 5.88754809373907,
"grad_norm": 0.3331674337387085,
"learning_rate": 0.0005296622922134732,
"loss": 3.5253,
"step": 20200
},
{
"epoch": 5.902121954063192,
"grad_norm": 0.31681913137435913,
"learning_rate": 0.0005294873140857392,
"loss": 3.536,
"step": 20250
},
{
"epoch": 5.916695814387315,
"grad_norm": 0.3327937722206116,
"learning_rate": 0.0005293123359580052,
"loss": 3.5207,
"step": 20300
},
{
"epoch": 5.931269674711437,
"grad_norm": 0.339324414730072,
"learning_rate": 0.0005291373578302711,
"loss": 3.5475,
"step": 20350
},
{
"epoch": 5.94584353503556,
"grad_norm": 0.3185977637767792,
"learning_rate": 0.0005289623797025371,
"loss": 3.5357,
"step": 20400
},
{
"epoch": 5.960417395359682,
"grad_norm": 0.3692864179611206,
"learning_rate": 0.0005287874015748031,
"loss": 3.5374,
"step": 20450
},
{
"epoch": 5.974991255683806,
"grad_norm": 0.30453935265541077,
"learning_rate": 0.000528612423447069,
"loss": 3.5386,
"step": 20500
},
{
"epoch": 5.989565116007928,
"grad_norm": 0.32170945405960083,
"learning_rate": 0.000528437445319335,
"loss": 3.5519,
"step": 20550
},
{
"epoch": 6.0040806808907545,
"grad_norm": 0.3116176128387451,
"learning_rate": 0.000528262467191601,
"loss": 3.5049,
"step": 20600
},
{
"epoch": 6.018654541214877,
"grad_norm": 0.3333321809768677,
"learning_rate": 0.000528087489063867,
"loss": 3.423,
"step": 20650
},
{
"epoch": 6.033228401539,
"grad_norm": 0.33255186676979065,
"learning_rate": 0.0005279125109361329,
"loss": 3.4219,
"step": 20700
},
{
"epoch": 6.047802261863122,
"grad_norm": 0.3399498164653778,
"learning_rate": 0.0005277375328083989,
"loss": 3.4307,
"step": 20750
},
{
"epoch": 6.062376122187245,
"grad_norm": 0.3460189402103424,
"learning_rate": 0.0005275625546806649,
"loss": 3.4177,
"step": 20800
},
{
"epoch": 6.076949982511367,
"grad_norm": 0.3405349552631378,
"learning_rate": 0.0005273875765529309,
"loss": 3.4299,
"step": 20850
},
{
"epoch": 6.09152384283549,
"grad_norm": 0.32472336292266846,
"learning_rate": 0.0005272125984251968,
"loss": 3.4404,
"step": 20900
},
{
"epoch": 6.106097703159613,
"grad_norm": 0.3217617869377136,
"learning_rate": 0.0005270376202974628,
"loss": 3.4387,
"step": 20950
},
{
"epoch": 6.120671563483736,
"grad_norm": 0.36554691195487976,
"learning_rate": 0.0005268626421697288,
"loss": 3.4431,
"step": 21000
},
{
"epoch": 6.120671563483736,
"eval_accuracy": 0.3611621490850769,
"eval_loss": 3.6243624687194824,
"eval_runtime": 53.2667,
"eval_samples_per_second": 312.146,
"eval_steps_per_second": 19.524,
"step": 21000
},
{
"epoch": 6.135245423807858,
"grad_norm": 0.32540804147720337,
"learning_rate": 0.0005266876640419946,
"loss": 3.4545,
"step": 21050
},
{
"epoch": 6.149819284131981,
"grad_norm": 0.31584632396698,
"learning_rate": 0.0005265126859142606,
"loss": 3.4606,
"step": 21100
},
{
"epoch": 6.1643931444561035,
"grad_norm": 0.3380563259124756,
"learning_rate": 0.0005263377077865266,
"loss": 3.4468,
"step": 21150
},
{
"epoch": 6.178967004780226,
"grad_norm": 0.3270926773548126,
"learning_rate": 0.0005261627296587926,
"loss": 3.4511,
"step": 21200
},
{
"epoch": 6.193540865104349,
"grad_norm": 0.33158567547798157,
"learning_rate": 0.0005259877515310585,
"loss": 3.4604,
"step": 21250
},
{
"epoch": 6.208114725428471,
"grad_norm": 0.3384026885032654,
"learning_rate": 0.0005258127734033245,
"loss": 3.4532,
"step": 21300
},
{
"epoch": 6.222688585752594,
"grad_norm": 0.3033508360385895,
"learning_rate": 0.0005256377952755905,
"loss": 3.4695,
"step": 21350
},
{
"epoch": 6.237262446076717,
"grad_norm": 0.3265267610549927,
"learning_rate": 0.0005254628171478565,
"loss": 3.4575,
"step": 21400
},
{
"epoch": 6.25183630640084,
"grad_norm": 0.3173014223575592,
"learning_rate": 0.0005252878390201224,
"loss": 3.4492,
"step": 21450
},
{
"epoch": 6.266410166724962,
"grad_norm": 0.3339494466781616,
"learning_rate": 0.0005251128608923884,
"loss": 3.4624,
"step": 21500
},
{
"epoch": 6.280984027049085,
"grad_norm": 0.3087364435195923,
"learning_rate": 0.0005249378827646544,
"loss": 3.4773,
"step": 21550
},
{
"epoch": 6.295557887373207,
"grad_norm": 0.3332245349884033,
"learning_rate": 0.0005247629046369204,
"loss": 3.4686,
"step": 21600
},
{
"epoch": 6.31013174769733,
"grad_norm": 0.3170703649520874,
"learning_rate": 0.0005245879265091863,
"loss": 3.4716,
"step": 21650
},
{
"epoch": 6.3247056080214525,
"grad_norm": 0.3388248682022095,
"learning_rate": 0.0005244129483814523,
"loss": 3.4911,
"step": 21700
},
{
"epoch": 6.339279468345575,
"grad_norm": 0.32150861620903015,
"learning_rate": 0.0005242379702537183,
"loss": 3.4705,
"step": 21750
},
{
"epoch": 6.353853328669698,
"grad_norm": 0.29872483015060425,
"learning_rate": 0.0005240629921259843,
"loss": 3.467,
"step": 21800
},
{
"epoch": 6.368427188993821,
"grad_norm": 0.302783727645874,
"learning_rate": 0.0005238880139982502,
"loss": 3.4742,
"step": 21850
},
{
"epoch": 6.383001049317944,
"grad_norm": 0.3234187960624695,
"learning_rate": 0.0005237130358705162,
"loss": 3.4823,
"step": 21900
},
{
"epoch": 6.397574909642066,
"grad_norm": 0.3224688470363617,
"learning_rate": 0.0005235380577427822,
"loss": 3.4926,
"step": 21950
},
{
"epoch": 6.412148769966189,
"grad_norm": 0.35340335965156555,
"learning_rate": 0.0005233630796150481,
"loss": 3.4695,
"step": 22000
},
{
"epoch": 6.412148769966189,
"eval_accuracy": 0.36229804233488616,
"eval_loss": 3.6155829429626465,
"eval_runtime": 53.1967,
"eval_samples_per_second": 312.557,
"eval_steps_per_second": 19.55,
"step": 22000
},
{
"epoch": 6.426722630290311,
"grad_norm": 0.3472863733768463,
"learning_rate": 0.0005231881014873141,
"loss": 3.4776,
"step": 22050
},
{
"epoch": 6.441296490614434,
"grad_norm": 0.3207915127277374,
"learning_rate": 0.00052301312335958,
"loss": 3.4821,
"step": 22100
},
{
"epoch": 6.455870350938556,
"grad_norm": 0.33775514364242554,
"learning_rate": 0.0005228381452318459,
"loss": 3.475,
"step": 22150
},
{
"epoch": 6.470444211262679,
"grad_norm": 0.32991212606430054,
"learning_rate": 0.0005226631671041119,
"loss": 3.48,
"step": 22200
},
{
"epoch": 6.4850180715868015,
"grad_norm": 0.3161657154560089,
"learning_rate": 0.0005224881889763779,
"loss": 3.4863,
"step": 22250
},
{
"epoch": 6.499591931910924,
"grad_norm": 0.3155244290828705,
"learning_rate": 0.0005223132108486439,
"loss": 3.4772,
"step": 22300
},
{
"epoch": 6.5141657922350475,
"grad_norm": 0.332363098859787,
"learning_rate": 0.0005221382327209098,
"loss": 3.4868,
"step": 22350
},
{
"epoch": 6.52873965255917,
"grad_norm": 0.32883933186531067,
"learning_rate": 0.0005219632545931758,
"loss": 3.4962,
"step": 22400
},
{
"epoch": 6.543313512883293,
"grad_norm": 0.30881384015083313,
"learning_rate": 0.0005217882764654418,
"loss": 3.4725,
"step": 22450
},
{
"epoch": 6.557887373207415,
"grad_norm": 0.31679660081863403,
"learning_rate": 0.0005216132983377077,
"loss": 3.4787,
"step": 22500
},
{
"epoch": 6.572461233531538,
"grad_norm": 0.3190597593784332,
"learning_rate": 0.0005214383202099737,
"loss": 3.4981,
"step": 22550
},
{
"epoch": 6.58703509385566,
"grad_norm": 0.35668647289276123,
"learning_rate": 0.0005212633420822397,
"loss": 3.4779,
"step": 22600
},
{
"epoch": 6.601608954179783,
"grad_norm": 0.33341237902641296,
"learning_rate": 0.0005210883639545056,
"loss": 3.4834,
"step": 22650
},
{
"epoch": 6.616182814503905,
"grad_norm": 0.31415843963623047,
"learning_rate": 0.0005209133858267716,
"loss": 3.4832,
"step": 22700
},
{
"epoch": 6.630756674828028,
"grad_norm": 0.3208731710910797,
"learning_rate": 0.0005207384076990376,
"loss": 3.4914,
"step": 22750
},
{
"epoch": 6.645330535152151,
"grad_norm": 0.314189076423645,
"learning_rate": 0.0005205634295713035,
"loss": 3.4995,
"step": 22800
},
{
"epoch": 6.659904395476274,
"grad_norm": 0.3122125566005707,
"learning_rate": 0.0005203884514435695,
"loss": 3.4772,
"step": 22850
},
{
"epoch": 6.6744782558003966,
"grad_norm": 0.3228355348110199,
"learning_rate": 0.0005202134733158355,
"loss": 3.4793,
"step": 22900
},
{
"epoch": 6.689052116124519,
"grad_norm": 0.3040376901626587,
"learning_rate": 0.0005200384951881014,
"loss": 3.491,
"step": 22950
},
{
"epoch": 6.703625976448642,
"grad_norm": 0.3368416726589203,
"learning_rate": 0.0005198635170603674,
"loss": 3.489,
"step": 23000
},
{
"epoch": 6.703625976448642,
"eval_accuracy": 0.3630768430429981,
"eval_loss": 3.6057634353637695,
"eval_runtime": 53.2283,
"eval_samples_per_second": 312.372,
"eval_steps_per_second": 19.538,
"step": 23000
},
{
"epoch": 6.718199836772764,
"grad_norm": 0.33878281712532043,
"learning_rate": 0.0005196885389326334,
"loss": 3.4806,
"step": 23050
},
{
"epoch": 6.732773697096887,
"grad_norm": 0.30997946858406067,
"learning_rate": 0.0005195135608048994,
"loss": 3.4829,
"step": 23100
},
{
"epoch": 6.747347557421009,
"grad_norm": 0.34856128692626953,
"learning_rate": 0.0005193385826771652,
"loss": 3.4915,
"step": 23150
},
{
"epoch": 6.761921417745132,
"grad_norm": 0.33696675300598145,
"learning_rate": 0.0005191636045494312,
"loss": 3.4904,
"step": 23200
},
{
"epoch": 6.776495278069255,
"grad_norm": 0.338143527507782,
"learning_rate": 0.0005189886264216972,
"loss": 3.5018,
"step": 23250
},
{
"epoch": 6.791069138393378,
"grad_norm": 0.31334543228149414,
"learning_rate": 0.0005188136482939632,
"loss": 3.5003,
"step": 23300
},
{
"epoch": 6.8056429987175004,
"grad_norm": 0.33318978548049927,
"learning_rate": 0.0005186386701662291,
"loss": 3.5064,
"step": 23350
},
{
"epoch": 6.820216859041623,
"grad_norm": 0.32178571820259094,
"learning_rate": 0.0005184636920384951,
"loss": 3.4924,
"step": 23400
},
{
"epoch": 6.834790719365746,
"grad_norm": 0.3570283353328705,
"learning_rate": 0.0005182887139107611,
"loss": 3.4984,
"step": 23450
},
{
"epoch": 6.849364579689868,
"grad_norm": 0.3409295380115509,
"learning_rate": 0.0005181137357830271,
"loss": 3.4973,
"step": 23500
},
{
"epoch": 6.863938440013991,
"grad_norm": 0.3298083543777466,
"learning_rate": 0.000517938757655293,
"loss": 3.4946,
"step": 23550
},
{
"epoch": 6.878512300338113,
"grad_norm": 0.3032434582710266,
"learning_rate": 0.000517763779527559,
"loss": 3.484,
"step": 23600
},
{
"epoch": 6.893086160662236,
"grad_norm": 0.3227110505104065,
"learning_rate": 0.000517588801399825,
"loss": 3.5059,
"step": 23650
},
{
"epoch": 6.907660020986359,
"grad_norm": 0.32623517513275146,
"learning_rate": 0.0005174138232720909,
"loss": 3.493,
"step": 23700
},
{
"epoch": 6.922233881310482,
"grad_norm": 0.3259028196334839,
"learning_rate": 0.0005172388451443569,
"loss": 3.4902,
"step": 23750
},
{
"epoch": 6.936807741634604,
"grad_norm": 0.3084574043750763,
"learning_rate": 0.0005170638670166229,
"loss": 3.5085,
"step": 23800
},
{
"epoch": 6.951381601958727,
"grad_norm": 0.33005186915397644,
"learning_rate": 0.0005168888888888889,
"loss": 3.4998,
"step": 23850
},
{
"epoch": 6.9659554622828495,
"grad_norm": 0.3244883418083191,
"learning_rate": 0.0005167139107611548,
"loss": 3.4995,
"step": 23900
},
{
"epoch": 6.980529322606972,
"grad_norm": 0.3078853189945221,
"learning_rate": 0.0005165389326334208,
"loss": 3.4915,
"step": 23950
},
{
"epoch": 6.995103182931095,
"grad_norm": 0.3137339651584625,
"learning_rate": 0.0005163639545056868,
"loss": 3.5016,
"step": 24000
},
{
"epoch": 6.995103182931095,
"eval_accuracy": 0.3637403007415967,
"eval_loss": 3.5967910289764404,
"eval_runtime": 53.4115,
"eval_samples_per_second": 311.3,
"eval_steps_per_second": 19.471,
"step": 24000
},
{
"epoch": 7.009618747813921,
"grad_norm": 0.3230603337287903,
"learning_rate": 0.0005161889763779528,
"loss": 3.4228,
"step": 24050
},
{
"epoch": 7.024192608138043,
"grad_norm": 0.32526594400405884,
"learning_rate": 0.0005160139982502187,
"loss": 3.3865,
"step": 24100
},
{
"epoch": 7.038766468462167,
"grad_norm": 0.3268178105354309,
"learning_rate": 0.0005158390201224847,
"loss": 3.383,
"step": 24150
},
{
"epoch": 7.053340328786289,
"grad_norm": 0.3226619362831116,
"learning_rate": 0.0005156640419947507,
"loss": 3.376,
"step": 24200
},
{
"epoch": 7.067914189110412,
"grad_norm": 0.3219951093196869,
"learning_rate": 0.0005154890638670167,
"loss": 3.4019,
"step": 24250
},
{
"epoch": 7.082488049434534,
"grad_norm": 0.3288451135158539,
"learning_rate": 0.0005153140857392825,
"loss": 3.4092,
"step": 24300
},
{
"epoch": 7.097061909758657,
"grad_norm": 0.3031003475189209,
"learning_rate": 0.0005151391076115485,
"loss": 3.4013,
"step": 24350
},
{
"epoch": 7.1116357700827795,
"grad_norm": 0.34017258882522583,
"learning_rate": 0.0005149641294838145,
"loss": 3.4129,
"step": 24400
},
{
"epoch": 7.126209630406902,
"grad_norm": 0.3318200409412384,
"learning_rate": 0.0005147891513560804,
"loss": 3.4126,
"step": 24450
},
{
"epoch": 7.140783490731025,
"grad_norm": 0.3410212993621826,
"learning_rate": 0.0005146141732283464,
"loss": 3.4194,
"step": 24500
},
{
"epoch": 7.155357351055147,
"grad_norm": 0.3241153955459595,
"learning_rate": 0.0005144391951006124,
"loss": 3.4164,
"step": 24550
},
{
"epoch": 7.16993121137927,
"grad_norm": 0.326886922121048,
"learning_rate": 0.0005142642169728783,
"loss": 3.4167,
"step": 24600
},
{
"epoch": 7.184505071703393,
"grad_norm": 0.33190369606018066,
"learning_rate": 0.0005140892388451443,
"loss": 3.4221,
"step": 24650
},
{
"epoch": 7.199078932027516,
"grad_norm": 0.331858366727829,
"learning_rate": 0.0005139142607174103,
"loss": 3.4321,
"step": 24700
},
{
"epoch": 7.213652792351638,
"grad_norm": 0.35590073466300964,
"learning_rate": 0.0005137392825896762,
"loss": 3.4218,
"step": 24750
},
{
"epoch": 7.228226652675761,
"grad_norm": 0.3808642625808716,
"learning_rate": 0.0005135643044619422,
"loss": 3.4253,
"step": 24800
},
{
"epoch": 7.242800512999883,
"grad_norm": 0.32019373774528503,
"learning_rate": 0.0005133893263342082,
"loss": 3.427,
"step": 24850
},
{
"epoch": 7.257374373324006,
"grad_norm": 0.3348955512046814,
"learning_rate": 0.0005132143482064742,
"loss": 3.4321,
"step": 24900
},
{
"epoch": 7.2719482336481285,
"grad_norm": 0.32612475752830505,
"learning_rate": 0.0005130393700787401,
"loss": 3.4371,
"step": 24950
},
{
"epoch": 7.286522093972251,
"grad_norm": 0.336725115776062,
"learning_rate": 0.0005128643919510061,
"loss": 3.429,
"step": 25000
},
{
"epoch": 7.286522093972251,
"eval_accuracy": 0.3636725072992705,
"eval_loss": 3.6023082733154297,
"eval_runtime": 53.3475,
"eval_samples_per_second": 311.674,
"eval_steps_per_second": 19.495,
"step": 25000
},
{
"epoch": 7.301095954296374,
"grad_norm": 0.31937819719314575,
"learning_rate": 0.0005126894138232721,
"loss": 3.4346,
"step": 25050
},
{
"epoch": 7.315669814620497,
"grad_norm": 0.3397108316421509,
"learning_rate": 0.000512514435695538,
"loss": 3.4382,
"step": 25100
},
{
"epoch": 7.33024367494462,
"grad_norm": 0.3680172860622406,
"learning_rate": 0.000512339457567804,
"loss": 3.4343,
"step": 25150
},
{
"epoch": 7.344817535268742,
"grad_norm": 0.33497798442840576,
"learning_rate": 0.00051216447944007,
"loss": 3.4407,
"step": 25200
},
{
"epoch": 7.359391395592865,
"grad_norm": 0.3387533724308014,
"learning_rate": 0.0005119895013123358,
"loss": 3.4499,
"step": 25250
},
{
"epoch": 7.373965255916987,
"grad_norm": 0.3436633050441742,
"learning_rate": 0.0005118145231846018,
"loss": 3.4469,
"step": 25300
},
{
"epoch": 7.38853911624111,
"grad_norm": 0.33034539222717285,
"learning_rate": 0.0005116395450568678,
"loss": 3.4447,
"step": 25350
},
{
"epoch": 7.403112976565232,
"grad_norm": 0.3452779948711395,
"learning_rate": 0.0005114645669291338,
"loss": 3.4325,
"step": 25400
},
{
"epoch": 7.417686836889355,
"grad_norm": 0.3401397168636322,
"learning_rate": 0.0005112895888013997,
"loss": 3.4428,
"step": 25450
},
{
"epoch": 7.4322606972134775,
"grad_norm": 0.32042157649993896,
"learning_rate": 0.0005111146106736657,
"loss": 3.4451,
"step": 25500
},
{
"epoch": 7.446834557537601,
"grad_norm": 0.3398337960243225,
"learning_rate": 0.0005109396325459317,
"loss": 3.4465,
"step": 25550
},
{
"epoch": 7.4614084178617235,
"grad_norm": 0.34229588508605957,
"learning_rate": 0.0005107646544181976,
"loss": 3.4414,
"step": 25600
},
{
"epoch": 7.475982278185846,
"grad_norm": 0.3290734887123108,
"learning_rate": 0.0005105896762904636,
"loss": 3.4602,
"step": 25650
},
{
"epoch": 7.490556138509969,
"grad_norm": 0.31352612376213074,
"learning_rate": 0.0005104146981627296,
"loss": 3.4439,
"step": 25700
},
{
"epoch": 7.505129998834091,
"grad_norm": 0.34663698077201843,
"learning_rate": 0.0005102397200349956,
"loss": 3.4551,
"step": 25750
},
{
"epoch": 7.519703859158214,
"grad_norm": 0.32665345072746277,
"learning_rate": 0.0005100647419072615,
"loss": 3.4437,
"step": 25800
},
{
"epoch": 7.534277719482336,
"grad_norm": 0.33366599678993225,
"learning_rate": 0.0005098897637795275,
"loss": 3.4361,
"step": 25850
},
{
"epoch": 7.548851579806459,
"grad_norm": 0.34481877088546753,
"learning_rate": 0.0005097147856517935,
"loss": 3.4411,
"step": 25900
},
{
"epoch": 7.563425440130581,
"grad_norm": 0.3386363983154297,
"learning_rate": 0.0005095398075240595,
"loss": 3.4539,
"step": 25950
},
{
"epoch": 7.577999300454705,
"grad_norm": 0.3457847535610199,
"learning_rate": 0.0005093648293963254,
"loss": 3.4487,
"step": 26000
},
{
"epoch": 7.577999300454705,
"eval_accuracy": 0.36446955103439727,
"eval_loss": 3.5960311889648438,
"eval_runtime": 53.3428,
"eval_samples_per_second": 311.701,
"eval_steps_per_second": 19.497,
"step": 26000
},
{
"epoch": 7.592573160778827,
"grad_norm": 0.33010855317115784,
"learning_rate": 0.0005091898512685914,
"loss": 3.4511,
"step": 26050
},
{
"epoch": 7.60714702110295,
"grad_norm": 0.33151471614837646,
"learning_rate": 0.0005090148731408574,
"loss": 3.4468,
"step": 26100
},
{
"epoch": 7.6217208814270725,
"grad_norm": 0.327346533536911,
"learning_rate": 0.0005088398950131234,
"loss": 3.458,
"step": 26150
},
{
"epoch": 7.636294741751195,
"grad_norm": 0.3349602520465851,
"learning_rate": 0.0005086649168853893,
"loss": 3.4536,
"step": 26200
},
{
"epoch": 7.650868602075318,
"grad_norm": 0.34411633014678955,
"learning_rate": 0.0005084899387576553,
"loss": 3.455,
"step": 26250
},
{
"epoch": 7.66544246239944,
"grad_norm": 0.333551824092865,
"learning_rate": 0.0005083149606299213,
"loss": 3.4646,
"step": 26300
},
{
"epoch": 7.680016322723563,
"grad_norm": 0.3610248863697052,
"learning_rate": 0.0005081399825021873,
"loss": 3.4596,
"step": 26350
},
{
"epoch": 7.694590183047685,
"grad_norm": 0.3180255591869354,
"learning_rate": 0.0005079650043744531,
"loss": 3.4569,
"step": 26400
},
{
"epoch": 7.709164043371809,
"grad_norm": 0.32871150970458984,
"learning_rate": 0.0005077900262467191,
"loss": 3.4467,
"step": 26450
},
{
"epoch": 7.723737903695931,
"grad_norm": 0.32751843333244324,
"learning_rate": 0.0005076150481189851,
"loss": 3.4441,
"step": 26500
},
{
"epoch": 7.738311764020054,
"grad_norm": 0.3273012042045593,
"learning_rate": 0.000507440069991251,
"loss": 3.4676,
"step": 26550
},
{
"epoch": 7.752885624344176,
"grad_norm": 0.32619708776474,
"learning_rate": 0.000507265091863517,
"loss": 3.4606,
"step": 26600
},
{
"epoch": 7.767459484668299,
"grad_norm": 0.36359405517578125,
"learning_rate": 0.000507090113735783,
"loss": 3.4586,
"step": 26650
},
{
"epoch": 7.7820333449924215,
"grad_norm": 0.3292996883392334,
"learning_rate": 0.000506915135608049,
"loss": 3.465,
"step": 26700
},
{
"epoch": 7.796607205316544,
"grad_norm": 0.3375926911830902,
"learning_rate": 0.0005067401574803149,
"loss": 3.4535,
"step": 26750
},
{
"epoch": 7.811181065640667,
"grad_norm": 0.31585800647735596,
"learning_rate": 0.0005065651793525809,
"loss": 3.4642,
"step": 26800
},
{
"epoch": 7.825754925964789,
"grad_norm": 0.3204030692577362,
"learning_rate": 0.0005063902012248469,
"loss": 3.4499,
"step": 26850
},
{
"epoch": 7.840328786288913,
"grad_norm": 0.3427492082118988,
"learning_rate": 0.0005062152230971128,
"loss": 3.4637,
"step": 26900
},
{
"epoch": 7.854902646613035,
"grad_norm": 0.34627190232276917,
"learning_rate": 0.0005060402449693788,
"loss": 3.4664,
"step": 26950
},
{
"epoch": 7.869476506937158,
"grad_norm": 0.3362285792827606,
"learning_rate": 0.0005058652668416448,
"loss": 3.4749,
"step": 27000
},
{
"epoch": 7.869476506937158,
"eval_accuracy": 0.36517149563514983,
"eval_loss": 3.586765766143799,
"eval_runtime": 53.4947,
"eval_samples_per_second": 310.816,
"eval_steps_per_second": 19.441,
"step": 27000
},
{
"epoch": 7.88405036726128,
"grad_norm": 0.3395719826221466,
"learning_rate": 0.0005056902887139107,
"loss": 3.4635,
"step": 27050
},
{
"epoch": 7.898624227585403,
"grad_norm": 0.34437862038612366,
"learning_rate": 0.0005055153105861767,
"loss": 3.4604,
"step": 27100
},
{
"epoch": 7.913198087909525,
"grad_norm": 0.32952070236206055,
"learning_rate": 0.0005053403324584427,
"loss": 3.4611,
"step": 27150
},
{
"epoch": 7.927771948233648,
"grad_norm": 0.3154241740703583,
"learning_rate": 0.0005051653543307086,
"loss": 3.4679,
"step": 27200
},
{
"epoch": 7.9423458085577705,
"grad_norm": 0.3297794461250305,
"learning_rate": 0.0005049903762029746,
"loss": 3.4629,
"step": 27250
},
{
"epoch": 7.956919668881893,
"grad_norm": 0.3234383463859558,
"learning_rate": 0.0005048153980752406,
"loss": 3.4671,
"step": 27300
},
{
"epoch": 7.9714935292060165,
"grad_norm": 0.33262306451797485,
"learning_rate": 0.0005046404199475064,
"loss": 3.4733,
"step": 27350
},
{
"epoch": 7.986067389530139,
"grad_norm": 0.3423873484134674,
"learning_rate": 0.0005044654418197724,
"loss": 3.4691,
"step": 27400
},
{
"epoch": 8.000582954412964,
"grad_norm": 0.3569919466972351,
"learning_rate": 0.0005042904636920384,
"loss": 3.4555,
"step": 27450
},
{
"epoch": 8.015156814737088,
"grad_norm": 0.3198401927947998,
"learning_rate": 0.0005041154855643044,
"loss": 3.3531,
"step": 27500
},
{
"epoch": 8.02973067506121,
"grad_norm": 0.32315894961357117,
"learning_rate": 0.0005039405074365703,
"loss": 3.3566,
"step": 27550
},
{
"epoch": 8.044304535385333,
"grad_norm": 0.33363547921180725,
"learning_rate": 0.0005037655293088363,
"loss": 3.3581,
"step": 27600
},
{
"epoch": 8.058878395709456,
"grad_norm": 0.3445553779602051,
"learning_rate": 0.0005035905511811023,
"loss": 3.3566,
"step": 27650
},
{
"epoch": 8.073452256033578,
"grad_norm": 0.3616427183151245,
"learning_rate": 0.0005034155730533682,
"loss": 3.377,
"step": 27700
},
{
"epoch": 8.088026116357701,
"grad_norm": 0.37287381291389465,
"learning_rate": 0.0005032405949256342,
"loss": 3.3849,
"step": 27750
},
{
"epoch": 8.102599976681823,
"grad_norm": 0.339351087808609,
"learning_rate": 0.0005030656167979002,
"loss": 3.3738,
"step": 27800
},
{
"epoch": 8.117173837005947,
"grad_norm": 0.32417935132980347,
"learning_rate": 0.0005028906386701662,
"loss": 3.37,
"step": 27850
},
{
"epoch": 8.131747697330068,
"grad_norm": 0.34558215737342834,
"learning_rate": 0.0005027156605424321,
"loss": 3.387,
"step": 27900
},
{
"epoch": 8.146321557654192,
"grad_norm": 0.3561769127845764,
"learning_rate": 0.0005025406824146981,
"loss": 3.3842,
"step": 27950
},
{
"epoch": 8.160895417978313,
"grad_norm": 0.33284032344818115,
"learning_rate": 0.0005023657042869641,
"loss": 3.3917,
"step": 28000
},
{
"epoch": 8.160895417978313,
"eval_accuracy": 0.36516772933279834,
"eval_loss": 3.596513271331787,
"eval_runtime": 53.192,
"eval_samples_per_second": 312.585,
"eval_steps_per_second": 19.552,
"step": 28000
},
{
"epoch": 8.175469278302437,
"grad_norm": 0.3167806565761566,
"learning_rate": 0.0005021907261592301,
"loss": 3.3807,
"step": 28050
},
{
"epoch": 8.19004313862656,
"grad_norm": 0.3251250386238098,
"learning_rate": 0.000502015748031496,
"loss": 3.3931,
"step": 28100
},
{
"epoch": 8.204616998950682,
"grad_norm": 0.33859023451805115,
"learning_rate": 0.000501840769903762,
"loss": 3.3845,
"step": 28150
},
{
"epoch": 8.219190859274805,
"grad_norm": 0.34484195709228516,
"learning_rate": 0.000501665791776028,
"loss": 3.3959,
"step": 28200
},
{
"epoch": 8.233764719598927,
"grad_norm": 0.3301757872104645,
"learning_rate": 0.0005014908136482939,
"loss": 3.3935,
"step": 28250
},
{
"epoch": 8.24833857992305,
"grad_norm": 0.35394570231437683,
"learning_rate": 0.0005013158355205599,
"loss": 3.3925,
"step": 28300
},
{
"epoch": 8.262912440247172,
"grad_norm": 0.32209792733192444,
"learning_rate": 0.0005011408573928259,
"loss": 3.4074,
"step": 28350
},
{
"epoch": 8.277486300571296,
"grad_norm": 0.36506566405296326,
"learning_rate": 0.0005009658792650919,
"loss": 3.4031,
"step": 28400
},
{
"epoch": 8.292060160895417,
"grad_norm": 0.3617514371871948,
"learning_rate": 0.0005007909011373577,
"loss": 3.405,
"step": 28450
},
{
"epoch": 8.30663402121954,
"grad_norm": 0.33067309856414795,
"learning_rate": 0.0005006159230096237,
"loss": 3.4053,
"step": 28500
},
{
"epoch": 8.321207881543664,
"grad_norm": 0.3240738809108734,
"learning_rate": 0.0005004409448818897,
"loss": 3.4026,
"step": 28550
},
{
"epoch": 8.335781741867786,
"grad_norm": 0.31030556559562683,
"learning_rate": 0.0005002659667541557,
"loss": 3.4,
"step": 28600
},
{
"epoch": 8.35035560219191,
"grad_norm": 0.3443307876586914,
"learning_rate": 0.0005000909886264216,
"loss": 3.4171,
"step": 28650
},
{
"epoch": 8.36492946251603,
"grad_norm": 0.34475865960121155,
"learning_rate": 0.0004999160104986876,
"loss": 3.4069,
"step": 28700
},
{
"epoch": 8.379503322840154,
"grad_norm": 0.33189693093299866,
"learning_rate": 0.0004997410323709536,
"loss": 3.4186,
"step": 28750
},
{
"epoch": 8.394077183164276,
"grad_norm": 0.3285791277885437,
"learning_rate": 0.0004995660542432196,
"loss": 3.4207,
"step": 28800
},
{
"epoch": 8.4086510434884,
"grad_norm": 0.35239142179489136,
"learning_rate": 0.0004993910761154855,
"loss": 3.4052,
"step": 28850
},
{
"epoch": 8.423224903812521,
"grad_norm": 0.34239187836647034,
"learning_rate": 0.0004992160979877515,
"loss": 3.4027,
"step": 28900
},
{
"epoch": 8.437798764136645,
"grad_norm": 0.34864264726638794,
"learning_rate": 0.0004990411198600175,
"loss": 3.4147,
"step": 28950
},
{
"epoch": 8.452372624460768,
"grad_norm": 0.3308429718017578,
"learning_rate": 0.0004988661417322835,
"loss": 3.4029,
"step": 29000
},
{
"epoch": 8.452372624460768,
"eval_accuracy": 0.36567417930212065,
"eval_loss": 3.587951421737671,
"eval_runtime": 53.2307,
"eval_samples_per_second": 312.358,
"eval_steps_per_second": 19.538,
"step": 29000
},
{
"epoch": 8.46694648478489,
"grad_norm": 0.3271889388561249,
"learning_rate": 0.0004986911636045494,
"loss": 3.4046,
"step": 29050
},
{
"epoch": 8.481520345109013,
"grad_norm": 0.34479963779449463,
"learning_rate": 0.0004985161854768154,
"loss": 3.4121,
"step": 29100
},
{
"epoch": 8.496094205433135,
"grad_norm": 0.33550480008125305,
"learning_rate": 0.0004983412073490814,
"loss": 3.4261,
"step": 29150
},
{
"epoch": 8.510668065757258,
"grad_norm": 0.3231935203075409,
"learning_rate": 0.0004981662292213473,
"loss": 3.4109,
"step": 29200
},
{
"epoch": 8.52524192608138,
"grad_norm": 0.3306845426559448,
"learning_rate": 0.0004979912510936133,
"loss": 3.4212,
"step": 29250
},
{
"epoch": 8.539815786405503,
"grad_norm": 0.3305637538433075,
"learning_rate": 0.0004978162729658793,
"loss": 3.4335,
"step": 29300
},
{
"epoch": 8.554389646729625,
"grad_norm": 0.3217015862464905,
"learning_rate": 0.0004976412948381452,
"loss": 3.414,
"step": 29350
},
{
"epoch": 8.568963507053748,
"grad_norm": 0.32402655482292175,
"learning_rate": 0.0004974663167104112,
"loss": 3.4238,
"step": 29400
},
{
"epoch": 8.583537367377872,
"grad_norm": 0.3909885883331299,
"learning_rate": 0.0004972913385826772,
"loss": 3.4278,
"step": 29450
},
{
"epoch": 8.598111227701994,
"grad_norm": 0.33252859115600586,
"learning_rate": 0.000497116360454943,
"loss": 3.4168,
"step": 29500
},
{
"epoch": 8.612685088026117,
"grad_norm": 0.3216465413570404,
"learning_rate": 0.000496941382327209,
"loss": 3.4269,
"step": 29550
},
{
"epoch": 8.627258948350239,
"grad_norm": 0.31453585624694824,
"learning_rate": 0.000496766404199475,
"loss": 3.4184,
"step": 29600
},
{
"epoch": 8.641832808674362,
"grad_norm": 0.32268640398979187,
"learning_rate": 0.0004965914260717409,
"loss": 3.4316,
"step": 29650
},
{
"epoch": 8.656406668998484,
"grad_norm": 0.33388710021972656,
"learning_rate": 0.0004964164479440069,
"loss": 3.4366,
"step": 29700
},
{
"epoch": 8.670980529322607,
"grad_norm": 0.3239559829235077,
"learning_rate": 0.0004962414698162729,
"loss": 3.4147,
"step": 29750
},
{
"epoch": 8.685554389646729,
"grad_norm": 0.32564789056777954,
"learning_rate": 0.0004960664916885388,
"loss": 3.4322,
"step": 29800
},
{
"epoch": 8.700128249970852,
"grad_norm": 0.3407617211341858,
"learning_rate": 0.0004958915135608048,
"loss": 3.4322,
"step": 29850
},
{
"epoch": 8.714702110294976,
"grad_norm": 0.31778067350387573,
"learning_rate": 0.0004957165354330708,
"loss": 3.4334,
"step": 29900
},
{
"epoch": 8.729275970619097,
"grad_norm": 0.34262266755104065,
"learning_rate": 0.0004955415573053368,
"loss": 3.4418,
"step": 29950
},
{
"epoch": 8.74384983094322,
"grad_norm": 0.33617204427719116,
"learning_rate": 0.0004953665791776027,
"loss": 3.4309,
"step": 30000
},
{
"epoch": 8.74384983094322,
"eval_accuracy": 0.36643509007406316,
"eval_loss": 3.5787062644958496,
"eval_runtime": 53.2083,
"eval_samples_per_second": 312.489,
"eval_steps_per_second": 19.546,
"step": 30000
},
{
"epoch": 8.758423691267343,
"grad_norm": 0.31523507833480835,
"learning_rate": 0.0004951916010498687,
"loss": 3.44,
"step": 30050
},
{
"epoch": 8.772997551591466,
"grad_norm": 0.3123234510421753,
"learning_rate": 0.0004950166229221347,
"loss": 3.4403,
"step": 30100
},
{
"epoch": 8.787571411915588,
"grad_norm": 0.32988399267196655,
"learning_rate": 0.0004948416447944006,
"loss": 3.4256,
"step": 30150
},
{
"epoch": 8.802145272239711,
"grad_norm": 0.37780627608299255,
"learning_rate": 0.0004946666666666666,
"loss": 3.4506,
"step": 30200
},
{
"epoch": 8.816719132563833,
"grad_norm": 0.3315257132053375,
"learning_rate": 0.0004944916885389326,
"loss": 3.4441,
"step": 30250
},
{
"epoch": 8.831292992887956,
"grad_norm": 0.3197008967399597,
"learning_rate": 0.0004943167104111986,
"loss": 3.4346,
"step": 30300
},
{
"epoch": 8.84586685321208,
"grad_norm": 0.3454113006591797,
"learning_rate": 0.0004941417322834645,
"loss": 3.4251,
"step": 30350
},
{
"epoch": 8.860440713536201,
"grad_norm": 0.3199872672557831,
"learning_rate": 0.0004939667541557305,
"loss": 3.4346,
"step": 30400
},
{
"epoch": 8.875014573860325,
"grad_norm": 0.3247497081756592,
"learning_rate": 0.0004937917760279965,
"loss": 3.4313,
"step": 30450
},
{
"epoch": 8.889588434184446,
"grad_norm": 0.31507542729377747,
"learning_rate": 0.0004936167979002625,
"loss": 3.4333,
"step": 30500
},
{
"epoch": 8.90416229450857,
"grad_norm": 0.30630066990852356,
"learning_rate": 0.0004934418197725284,
"loss": 3.4309,
"step": 30550
},
{
"epoch": 8.918736154832692,
"grad_norm": 0.31729817390441895,
"learning_rate": 0.0004932668416447943,
"loss": 3.4386,
"step": 30600
},
{
"epoch": 8.933310015156815,
"grad_norm": 0.3511722683906555,
"learning_rate": 0.0004930918635170603,
"loss": 3.4362,
"step": 30650
},
{
"epoch": 8.947883875480937,
"grad_norm": 0.33442434668540955,
"learning_rate": 0.0004929168853893263,
"loss": 3.4393,
"step": 30700
},
{
"epoch": 8.96245773580506,
"grad_norm": 0.34247684478759766,
"learning_rate": 0.0004927419072615922,
"loss": 3.4459,
"step": 30750
},
{
"epoch": 8.977031596129184,
"grad_norm": 0.3286040723323822,
"learning_rate": 0.0004925669291338582,
"loss": 3.4431,
"step": 30800
},
{
"epoch": 8.991605456453305,
"grad_norm": 0.321585088968277,
"learning_rate": 0.0004923919510061242,
"loss": 3.4367,
"step": 30850
},
{
"epoch": 9.006121021336131,
"grad_norm": 0.331307590007782,
"learning_rate": 0.0004922169728783901,
"loss": 3.3862,
"step": 30900
},
{
"epoch": 9.020694881660255,
"grad_norm": 0.37021103501319885,
"learning_rate": 0.0004920419947506561,
"loss": 3.3223,
"step": 30950
},
{
"epoch": 9.035268741984376,
"grad_norm": 0.33175399899482727,
"learning_rate": 0.0004918670166229221,
"loss": 3.3335,
"step": 31000
},
{
"epoch": 9.035268741984376,
"eval_accuracy": 0.36600290687923365,
"eval_loss": 3.586287498474121,
"eval_runtime": 53.3438,
"eval_samples_per_second": 311.695,
"eval_steps_per_second": 19.496,
"step": 31000
},
{
"epoch": 9.0498426023085,
"grad_norm": 0.3926156163215637,
"learning_rate": 0.0004916920384951881,
"loss": 3.3284,
"step": 31050
},
{
"epoch": 9.064416462632622,
"grad_norm": 0.3375106751918793,
"learning_rate": 0.000491517060367454,
"loss": 3.3468,
"step": 31100
},
{
"epoch": 9.078990322956745,
"grad_norm": 0.3336063623428345,
"learning_rate": 0.00049134208223972,
"loss": 3.3463,
"step": 31150
},
{
"epoch": 9.093564183280867,
"grad_norm": 0.3396972715854645,
"learning_rate": 0.000491167104111986,
"loss": 3.3374,
"step": 31200
},
{
"epoch": 9.10813804360499,
"grad_norm": 0.3279450833797455,
"learning_rate": 0.000490992125984252,
"loss": 3.3375,
"step": 31250
},
{
"epoch": 9.122711903929114,
"grad_norm": 0.36093869805336,
"learning_rate": 0.0004908171478565179,
"loss": 3.3489,
"step": 31300
},
{
"epoch": 9.137285764253235,
"grad_norm": 0.3507830798625946,
"learning_rate": 0.0004906421697287839,
"loss": 3.3481,
"step": 31350
},
{
"epoch": 9.151859624577359,
"grad_norm": 0.33186662197113037,
"learning_rate": 0.0004904671916010499,
"loss": 3.3527,
"step": 31400
},
{
"epoch": 9.16643348490148,
"grad_norm": 0.34464725852012634,
"learning_rate": 0.0004902922134733158,
"loss": 3.3577,
"step": 31450
},
{
"epoch": 9.181007345225604,
"grad_norm": 0.3341642916202545,
"learning_rate": 0.0004901172353455818,
"loss": 3.3541,
"step": 31500
},
{
"epoch": 9.195581205549725,
"grad_norm": 0.3413945436477661,
"learning_rate": 0.0004899422572178478,
"loss": 3.3549,
"step": 31550
},
{
"epoch": 9.210155065873849,
"grad_norm": 0.344498872756958,
"learning_rate": 0.0004897672790901138,
"loss": 3.3726,
"step": 31600
},
{
"epoch": 9.22472892619797,
"grad_norm": 0.3580266833305359,
"learning_rate": 0.0004895923009623796,
"loss": 3.3629,
"step": 31650
},
{
"epoch": 9.239302786522094,
"grad_norm": 0.33636781573295593,
"learning_rate": 0.0004894173228346456,
"loss": 3.3797,
"step": 31700
},
{
"epoch": 9.253876646846217,
"grad_norm": 0.3473851978778839,
"learning_rate": 0.0004892423447069116,
"loss": 3.3811,
"step": 31750
},
{
"epoch": 9.26845050717034,
"grad_norm": 0.3385968506336212,
"learning_rate": 0.0004890673665791775,
"loss": 3.3737,
"step": 31800
},
{
"epoch": 9.283024367494463,
"grad_norm": 0.34464573860168457,
"learning_rate": 0.0004888923884514435,
"loss": 3.3843,
"step": 31850
},
{
"epoch": 9.297598227818584,
"grad_norm": 0.3294459581375122,
"learning_rate": 0.0004887174103237095,
"loss": 3.3717,
"step": 31900
},
{
"epoch": 9.312172088142708,
"grad_norm": 0.32553818821907043,
"learning_rate": 0.0004885424321959754,
"loss": 3.3829,
"step": 31950
},
{
"epoch": 9.32674594846683,
"grad_norm": 0.35752755403518677,
"learning_rate": 0.0004883674540682414,
"loss": 3.3802,
"step": 32000
},
{
"epoch": 9.32674594846683,
"eval_accuracy": 0.36622535411186646,
"eval_loss": 3.584019899368286,
"eval_runtime": 53.3181,
"eval_samples_per_second": 311.845,
"eval_steps_per_second": 19.506,
"step": 32000
},
{
"epoch": 9.341319808790953,
"grad_norm": 0.34649282693862915,
"learning_rate": 0.00048819247594050736,
"loss": 3.3942,
"step": 32050
},
{
"epoch": 9.355893669115074,
"grad_norm": 0.3347167670726776,
"learning_rate": 0.00048801749781277336,
"loss": 3.3805,
"step": 32100
},
{
"epoch": 9.370467529439198,
"grad_norm": 0.32364505529403687,
"learning_rate": 0.00048784251968503936,
"loss": 3.3936,
"step": 32150
},
{
"epoch": 9.385041389763321,
"grad_norm": 0.3570871353149414,
"learning_rate": 0.0004876675415573053,
"loss": 3.3723,
"step": 32200
},
{
"epoch": 9.399615250087443,
"grad_norm": 0.3575366735458374,
"learning_rate": 0.00048749256342957124,
"loss": 3.3838,
"step": 32250
},
{
"epoch": 9.414189110411566,
"grad_norm": 0.35830157995224,
"learning_rate": 0.00048731758530183724,
"loss": 3.3849,
"step": 32300
},
{
"epoch": 9.428762970735688,
"grad_norm": 0.33467957377433777,
"learning_rate": 0.0004871426071741032,
"loss": 3.4113,
"step": 32350
},
{
"epoch": 9.443336831059812,
"grad_norm": 0.34144988656044006,
"learning_rate": 0.0004869676290463692,
"loss": 3.3857,
"step": 32400
},
{
"epoch": 9.457910691383933,
"grad_norm": 0.33660706877708435,
"learning_rate": 0.0004867926509186351,
"loss": 3.3921,
"step": 32450
},
{
"epoch": 9.472484551708057,
"grad_norm": 0.3639011085033417,
"learning_rate": 0.00048661767279090107,
"loss": 3.399,
"step": 32500
},
{
"epoch": 9.487058412032178,
"grad_norm": 0.35932692885398865,
"learning_rate": 0.00048644269466316707,
"loss": 3.3896,
"step": 32550
},
{
"epoch": 9.501632272356302,
"grad_norm": 0.34504228830337524,
"learning_rate": 0.00048626771653543306,
"loss": 3.3977,
"step": 32600
},
{
"epoch": 9.516206132680423,
"grad_norm": 0.357723593711853,
"learning_rate": 0.00048609273840769895,
"loss": 3.4023,
"step": 32650
},
{
"epoch": 9.530779993004547,
"grad_norm": 0.34201788902282715,
"learning_rate": 0.00048591776027996495,
"loss": 3.4017,
"step": 32700
},
{
"epoch": 9.54535385332867,
"grad_norm": 0.32081034779548645,
"learning_rate": 0.00048574278215223095,
"loss": 3.3882,
"step": 32750
},
{
"epoch": 9.559927713652792,
"grad_norm": 0.350654661655426,
"learning_rate": 0.0004855678040244969,
"loss": 3.3947,
"step": 32800
},
{
"epoch": 9.574501573976915,
"grad_norm": 0.3591080904006958,
"learning_rate": 0.00048539282589676283,
"loss": 3.3897,
"step": 32850
},
{
"epoch": 9.589075434301037,
"grad_norm": 0.314141184091568,
"learning_rate": 0.00048521784776902883,
"loss": 3.4047,
"step": 32900
},
{
"epoch": 9.60364929462516,
"grad_norm": 0.36194196343421936,
"learning_rate": 0.00048504286964129483,
"loss": 3.4103,
"step": 32950
},
{
"epoch": 9.618223154949282,
"grad_norm": 0.33191657066345215,
"learning_rate": 0.0004848678915135607,
"loss": 3.4027,
"step": 33000
},
{
"epoch": 9.618223154949282,
"eval_accuracy": 0.36700933348571163,
"eval_loss": 3.573310375213623,
"eval_runtime": 53.1509,
"eval_samples_per_second": 312.826,
"eval_steps_per_second": 19.567,
"step": 33000
},
{
"epoch": 9.632797015273406,
"grad_norm": 0.32011061906814575,
"learning_rate": 0.0004846929133858267,
"loss": 3.3887,
"step": 33050
},
{
"epoch": 9.647370875597527,
"grad_norm": 0.3385699391365051,
"learning_rate": 0.0004845179352580927,
"loss": 3.4071,
"step": 33100
},
{
"epoch": 9.66194473592165,
"grad_norm": 0.3338579535484314,
"learning_rate": 0.0004843429571303587,
"loss": 3.4152,
"step": 33150
},
{
"epoch": 9.676518596245774,
"grad_norm": 0.3355548679828644,
"learning_rate": 0.0004841679790026246,
"loss": 3.4062,
"step": 33200
},
{
"epoch": 9.691092456569896,
"grad_norm": 0.33254146575927734,
"learning_rate": 0.0004839930008748906,
"loss": 3.4033,
"step": 33250
},
{
"epoch": 9.70566631689402,
"grad_norm": 0.3543528914451599,
"learning_rate": 0.0004838180227471566,
"loss": 3.4011,
"step": 33300
},
{
"epoch": 9.720240177218141,
"grad_norm": 0.31879448890686035,
"learning_rate": 0.00048364304461942254,
"loss": 3.4044,
"step": 33350
},
{
"epoch": 9.734814037542264,
"grad_norm": 0.3408379852771759,
"learning_rate": 0.0004834680664916885,
"loss": 3.4031,
"step": 33400
},
{
"epoch": 9.749387897866386,
"grad_norm": 0.3313541114330292,
"learning_rate": 0.0004832930883639545,
"loss": 3.4065,
"step": 33450
},
{
"epoch": 9.76396175819051,
"grad_norm": 0.3503141701221466,
"learning_rate": 0.0004831181102362204,
"loss": 3.4135,
"step": 33500
},
{
"epoch": 9.778535618514631,
"grad_norm": 0.34659284353256226,
"learning_rate": 0.00048294313210848637,
"loss": 3.4085,
"step": 33550
},
{
"epoch": 9.793109478838755,
"grad_norm": 0.35435524582862854,
"learning_rate": 0.00048276815398075237,
"loss": 3.403,
"step": 33600
},
{
"epoch": 9.807683339162878,
"grad_norm": 0.35643336176872253,
"learning_rate": 0.0004825931758530183,
"loss": 3.3999,
"step": 33650
},
{
"epoch": 9.822257199487,
"grad_norm": 0.33353114128112793,
"learning_rate": 0.0004824181977252843,
"loss": 3.4155,
"step": 33700
},
{
"epoch": 9.836831059811123,
"grad_norm": 0.30645281076431274,
"learning_rate": 0.00048224321959755025,
"loss": 3.4046,
"step": 33750
},
{
"epoch": 9.851404920135245,
"grad_norm": 0.35525768995285034,
"learning_rate": 0.0004820682414698162,
"loss": 3.403,
"step": 33800
},
{
"epoch": 9.865978780459368,
"grad_norm": 0.34258151054382324,
"learning_rate": 0.0004818932633420822,
"loss": 3.4106,
"step": 33850
},
{
"epoch": 9.88055264078349,
"grad_norm": 0.34737566113471985,
"learning_rate": 0.0004817182852143482,
"loss": 3.4212,
"step": 33900
},
{
"epoch": 9.895126501107613,
"grad_norm": 0.3499617874622345,
"learning_rate": 0.0004815433070866141,
"loss": 3.4168,
"step": 33950
},
{
"epoch": 9.909700361431735,
"grad_norm": 0.3705946207046509,
"learning_rate": 0.0004813683289588801,
"loss": 3.4163,
"step": 34000
},
{
"epoch": 9.909700361431735,
"eval_accuracy": 0.3675073092747432,
"eval_loss": 3.5654964447021484,
"eval_runtime": 53.6957,
"eval_samples_per_second": 309.652,
"eval_steps_per_second": 19.368,
"step": 34000
},
{
"epoch": 9.924274221755859,
"grad_norm": 0.3173365294933319,
"learning_rate": 0.0004811933508311461,
"loss": 3.4174,
"step": 34050
},
{
"epoch": 9.938848082079982,
"grad_norm": 0.3493649661540985,
"learning_rate": 0.00048101837270341207,
"loss": 3.4197,
"step": 34100
},
{
"epoch": 9.953421942404104,
"grad_norm": 0.32634416222572327,
"learning_rate": 0.00048084339457567796,
"loss": 3.4278,
"step": 34150
},
{
"epoch": 9.967995802728227,
"grad_norm": 0.36151570081710815,
"learning_rate": 0.00048066841644794396,
"loss": 3.4142,
"step": 34200
},
{
"epoch": 9.982569663052349,
"grad_norm": 0.3287253677845001,
"learning_rate": 0.00048049343832020996,
"loss": 3.4161,
"step": 34250
},
{
"epoch": 9.997143523376472,
"grad_norm": 0.33584117889404297,
"learning_rate": 0.00048031846019247595,
"loss": 3.419,
"step": 34300
},
{
"epoch": 10.011659088259298,
"grad_norm": 0.32926878333091736,
"learning_rate": 0.00048014348206474184,
"loss": 3.3171,
"step": 34350
},
{
"epoch": 10.02623294858342,
"grad_norm": 0.3427960276603699,
"learning_rate": 0.00047996850393700784,
"loss": 3.3109,
"step": 34400
},
{
"epoch": 10.040806808907544,
"grad_norm": 0.3409363031387329,
"learning_rate": 0.00047979352580927384,
"loss": 3.2946,
"step": 34450
},
{
"epoch": 10.055380669231667,
"grad_norm": 0.3273547887802124,
"learning_rate": 0.00047961854768153973,
"loss": 3.3174,
"step": 34500
},
{
"epoch": 10.069954529555789,
"grad_norm": 0.35705727338790894,
"learning_rate": 0.0004794435695538057,
"loss": 3.3145,
"step": 34550
},
{
"epoch": 10.084528389879912,
"grad_norm": 0.35857057571411133,
"learning_rate": 0.0004792685914260717,
"loss": 3.3145,
"step": 34600
},
{
"epoch": 10.099102250204034,
"grad_norm": 0.3458826541900635,
"learning_rate": 0.00047909361329833767,
"loss": 3.3264,
"step": 34650
},
{
"epoch": 10.113676110528157,
"grad_norm": 0.33831337094306946,
"learning_rate": 0.0004789186351706036,
"loss": 3.3239,
"step": 34700
},
{
"epoch": 10.128249970852279,
"grad_norm": 0.3277437388896942,
"learning_rate": 0.0004787436570428696,
"loss": 3.3232,
"step": 34750
},
{
"epoch": 10.142823831176402,
"grad_norm": 0.3412516415119171,
"learning_rate": 0.00047856867891513555,
"loss": 3.323,
"step": 34800
},
{
"epoch": 10.157397691500524,
"grad_norm": 0.3519359827041626,
"learning_rate": 0.00047839370078740155,
"loss": 3.3357,
"step": 34850
},
{
"epoch": 10.171971551824647,
"grad_norm": 0.3262736201286316,
"learning_rate": 0.0004782187226596675,
"loss": 3.3331,
"step": 34900
},
{
"epoch": 10.18654541214877,
"grad_norm": 0.32696694135665894,
"learning_rate": 0.00047804374453193344,
"loss": 3.3521,
"step": 34950
},
{
"epoch": 10.201119272472893,
"grad_norm": 0.34408679604530334,
"learning_rate": 0.00047786876640419943,
"loss": 3.339,
"step": 35000
},
{
"epoch": 10.201119272472893,
"eval_accuracy": 0.3673739586321119,
"eval_loss": 3.5782310962677,
"eval_runtime": 53.0964,
"eval_samples_per_second": 313.147,
"eval_steps_per_second": 19.587,
"step": 35000
},
{
"epoch": 10.215693132797016,
"grad_norm": 0.3292979598045349,
"learning_rate": 0.00047769378827646543,
"loss": 3.3516,
"step": 35050
},
{
"epoch": 10.230266993121138,
"grad_norm": 0.3297373950481415,
"learning_rate": 0.0004775188101487313,
"loss": 3.3392,
"step": 35100
},
{
"epoch": 10.244840853445261,
"grad_norm": 0.3371959924697876,
"learning_rate": 0.0004773438320209973,
"loss": 3.3412,
"step": 35150
},
{
"epoch": 10.259414713769383,
"grad_norm": 0.32268255949020386,
"learning_rate": 0.0004771688538932633,
"loss": 3.3511,
"step": 35200
},
{
"epoch": 10.273988574093506,
"grad_norm": 0.3661893606185913,
"learning_rate": 0.0004769938757655293,
"loss": 3.3514,
"step": 35250
},
{
"epoch": 10.288562434417628,
"grad_norm": 0.33462610840797424,
"learning_rate": 0.0004768188976377952,
"loss": 3.3512,
"step": 35300
},
{
"epoch": 10.303136294741751,
"grad_norm": 0.3414505422115326,
"learning_rate": 0.0004766439195100612,
"loss": 3.3589,
"step": 35350
},
{
"epoch": 10.317710155065875,
"grad_norm": 0.35679128766059875,
"learning_rate": 0.0004764689413823272,
"loss": 3.3633,
"step": 35400
},
{
"epoch": 10.332284015389996,
"grad_norm": 0.3407164216041565,
"learning_rate": 0.0004762939632545931,
"loss": 3.3417,
"step": 35450
},
{
"epoch": 10.34685787571412,
"grad_norm": 0.32294556498527527,
"learning_rate": 0.0004761189851268591,
"loss": 3.3474,
"step": 35500
},
{
"epoch": 10.361431736038242,
"grad_norm": 0.33205491304397583,
"learning_rate": 0.0004759440069991251,
"loss": 3.366,
"step": 35550
},
{
"epoch": 10.376005596362365,
"grad_norm": 0.3272695541381836,
"learning_rate": 0.0004757690288713911,
"loss": 3.3662,
"step": 35600
},
{
"epoch": 10.390579456686487,
"grad_norm": 0.364637166261673,
"learning_rate": 0.00047559405074365697,
"loss": 3.3581,
"step": 35650
},
{
"epoch": 10.40515331701061,
"grad_norm": 0.331093430519104,
"learning_rate": 0.00047541907261592297,
"loss": 3.3676,
"step": 35700
},
{
"epoch": 10.419727177334732,
"grad_norm": 0.3462320864200592,
"learning_rate": 0.00047524409448818897,
"loss": 3.3619,
"step": 35750
},
{
"epoch": 10.434301037658855,
"grad_norm": 0.35206523537635803,
"learning_rate": 0.0004750691163604549,
"loss": 3.366,
"step": 35800
},
{
"epoch": 10.448874897982979,
"grad_norm": 0.3616088926792145,
"learning_rate": 0.00047489413823272085,
"loss": 3.3777,
"step": 35850
},
{
"epoch": 10.4634487583071,
"grad_norm": 0.33343666791915894,
"learning_rate": 0.00047471916010498685,
"loss": 3.3674,
"step": 35900
},
{
"epoch": 10.478022618631224,
"grad_norm": 0.3323806822299957,
"learning_rate": 0.0004745441819772528,
"loss": 3.3683,
"step": 35950
},
{
"epoch": 10.492596478955345,
"grad_norm": 0.3593173325061798,
"learning_rate": 0.0004743692038495188,
"loss": 3.3697,
"step": 36000
},
{
"epoch": 10.492596478955345,
"eval_accuracy": 0.3675707479299755,
"eval_loss": 3.5710971355438232,
"eval_runtime": 53.2847,
"eval_samples_per_second": 312.041,
"eval_steps_per_second": 19.518,
"step": 36000
},
{
"epoch": 10.507170339279469,
"grad_norm": 0.33613282442092896,
"learning_rate": 0.00047419422572178474,
"loss": 3.3624,
"step": 36050
},
{
"epoch": 10.52174419960359,
"grad_norm": 0.322963684797287,
"learning_rate": 0.0004740192475940507,
"loss": 3.3662,
"step": 36100
},
{
"epoch": 10.536318059927714,
"grad_norm": 0.3449420630931854,
"learning_rate": 0.0004738442694663167,
"loss": 3.3818,
"step": 36150
},
{
"epoch": 10.550891920251836,
"grad_norm": 0.33825379610061646,
"learning_rate": 0.0004736692913385827,
"loss": 3.3729,
"step": 36200
},
{
"epoch": 10.565465780575959,
"grad_norm": 0.34755024313926697,
"learning_rate": 0.00047349431321084856,
"loss": 3.3839,
"step": 36250
},
{
"epoch": 10.580039640900083,
"grad_norm": 0.363221138715744,
"learning_rate": 0.00047331933508311456,
"loss": 3.3833,
"step": 36300
},
{
"epoch": 10.594613501224204,
"grad_norm": 0.38004592061042786,
"learning_rate": 0.00047314435695538056,
"loss": 3.3847,
"step": 36350
},
{
"epoch": 10.609187361548328,
"grad_norm": 0.33482077717781067,
"learning_rate": 0.00047296937882764645,
"loss": 3.3693,
"step": 36400
},
{
"epoch": 10.62376122187245,
"grad_norm": 0.38688504695892334,
"learning_rate": 0.00047279440069991245,
"loss": 3.3994,
"step": 36450
},
{
"epoch": 10.638335082196573,
"grad_norm": 0.8661547303199768,
"learning_rate": 0.00047261942257217844,
"loss": 3.3747,
"step": 36500
},
{
"epoch": 10.652908942520694,
"grad_norm": 0.3520547151565552,
"learning_rate": 0.00047244444444444444,
"loss": 3.3939,
"step": 36550
},
{
"epoch": 10.667482802844818,
"grad_norm": 0.3169161081314087,
"learning_rate": 0.00047226946631671033,
"loss": 3.3853,
"step": 36600
},
{
"epoch": 10.68205666316894,
"grad_norm": 0.3585412800312042,
"learning_rate": 0.00047209448818897633,
"loss": 3.4039,
"step": 36650
},
{
"epoch": 10.696630523493063,
"grad_norm": 0.36897537112236023,
"learning_rate": 0.0004719195100612423,
"loss": 3.3879,
"step": 36700
},
{
"epoch": 10.711204383817186,
"grad_norm": 0.32157233357429504,
"learning_rate": 0.00047174453193350827,
"loss": 3.3873,
"step": 36750
},
{
"epoch": 10.725778244141308,
"grad_norm": 0.3530696630477905,
"learning_rate": 0.0004715695538057742,
"loss": 3.3923,
"step": 36800
},
{
"epoch": 10.740352104465432,
"grad_norm": 0.34365373849868774,
"learning_rate": 0.0004713945756780402,
"loss": 3.3944,
"step": 36850
},
{
"epoch": 10.754925964789553,
"grad_norm": 0.34925419092178345,
"learning_rate": 0.0004712195975503062,
"loss": 3.3658,
"step": 36900
},
{
"epoch": 10.769499825113677,
"grad_norm": 0.31599247455596924,
"learning_rate": 0.00047104461942257215,
"loss": 3.3862,
"step": 36950
},
{
"epoch": 10.784073685437798,
"grad_norm": 0.34453409910202026,
"learning_rate": 0.0004708696412948381,
"loss": 3.3836,
"step": 37000
},
{
"epoch": 10.784073685437798,
"eval_accuracy": 0.36834907785029347,
"eval_loss": 3.565258502960205,
"eval_runtime": 53.3013,
"eval_samples_per_second": 311.944,
"eval_steps_per_second": 19.512,
"step": 37000
},
{
"epoch": 10.798647545761922,
"grad_norm": 0.3643403947353363,
"learning_rate": 0.0004706946631671041,
"loss": 3.3842,
"step": 37050
},
{
"epoch": 10.813221406086043,
"grad_norm": 0.3264981210231781,
"learning_rate": 0.00047051968503937004,
"loss": 3.3748,
"step": 37100
},
{
"epoch": 10.827795266410167,
"grad_norm": 0.3335154056549072,
"learning_rate": 0.000470344706911636,
"loss": 3.3895,
"step": 37150
},
{
"epoch": 10.84236912673429,
"grad_norm": 0.33110693097114563,
"learning_rate": 0.000470169728783902,
"loss": 3.3914,
"step": 37200
},
{
"epoch": 10.856942987058412,
"grad_norm": 0.33247503638267517,
"learning_rate": 0.0004699947506561679,
"loss": 3.3862,
"step": 37250
},
{
"epoch": 10.871516847382535,
"grad_norm": 0.3364832103252411,
"learning_rate": 0.0004698197725284339,
"loss": 3.3992,
"step": 37300
},
{
"epoch": 10.886090707706657,
"grad_norm": 0.3212383985519409,
"learning_rate": 0.00046964479440069986,
"loss": 3.4022,
"step": 37350
},
{
"epoch": 10.90066456803078,
"grad_norm": 0.33531829714775085,
"learning_rate": 0.0004694698162729658,
"loss": 3.3974,
"step": 37400
},
{
"epoch": 10.915238428354902,
"grad_norm": 0.3326956331729889,
"learning_rate": 0.0004692948381452318,
"loss": 3.3913,
"step": 37450
},
{
"epoch": 10.929812288679026,
"grad_norm": 0.3450769782066345,
"learning_rate": 0.0004691198600174978,
"loss": 3.3981,
"step": 37500
},
{
"epoch": 10.944386149003147,
"grad_norm": 0.3500250279903412,
"learning_rate": 0.0004689448818897637,
"loss": 3.3898,
"step": 37550
},
{
"epoch": 10.95896000932727,
"grad_norm": 0.3329240381717682,
"learning_rate": 0.0004687699037620297,
"loss": 3.3965,
"step": 37600
},
{
"epoch": 10.973533869651392,
"grad_norm": 0.35290616750717163,
"learning_rate": 0.0004685949256342957,
"loss": 3.4036,
"step": 37650
},
{
"epoch": 10.988107729975516,
"grad_norm": 0.3301716148853302,
"learning_rate": 0.0004684199475065617,
"loss": 3.3869,
"step": 37700
},
{
"epoch": 11.002623294858342,
"grad_norm": 0.35138195753097534,
"learning_rate": 0.0004682449693788276,
"loss": 3.3837,
"step": 37750
},
{
"epoch": 11.017197155182465,
"grad_norm": 0.325611412525177,
"learning_rate": 0.00046806999125109357,
"loss": 3.2862,
"step": 37800
},
{
"epoch": 11.031771015506587,
"grad_norm": 0.36736229062080383,
"learning_rate": 0.00046789501312335957,
"loss": 3.2805,
"step": 37850
},
{
"epoch": 11.04634487583071,
"grad_norm": 0.3821553885936737,
"learning_rate": 0.0004677200349956255,
"loss": 3.296,
"step": 37900
},
{
"epoch": 11.060918736154832,
"grad_norm": 0.34444135427474976,
"learning_rate": 0.00046754505686789146,
"loss": 3.2868,
"step": 37950
},
{
"epoch": 11.075492596478956,
"grad_norm": 0.34698426723480225,
"learning_rate": 0.00046737007874015745,
"loss": 3.2929,
"step": 38000
},
{
"epoch": 11.075492596478956,
"eval_accuracy": 0.3682230244184682,
"eval_loss": 3.5735981464385986,
"eval_runtime": 53.1976,
"eval_samples_per_second": 312.552,
"eval_steps_per_second": 19.55,
"step": 38000
},
{
"epoch": 11.090066456803077,
"grad_norm": 0.3469058573246002,
"learning_rate": 0.0004671951006124234,
"loss": 3.2903,
"step": 38050
},
{
"epoch": 11.1046403171272,
"grad_norm": 0.3458442687988281,
"learning_rate": 0.00046702012248468934,
"loss": 3.3083,
"step": 38100
},
{
"epoch": 11.119214177451322,
"grad_norm": 0.36308300495147705,
"learning_rate": 0.00046684514435695534,
"loss": 3.3161,
"step": 38150
},
{
"epoch": 11.133788037775446,
"grad_norm": 0.3724916875362396,
"learning_rate": 0.00046667016622922134,
"loss": 3.3028,
"step": 38200
},
{
"epoch": 11.14836189809957,
"grad_norm": 0.34953588247299194,
"learning_rate": 0.0004664951881014873,
"loss": 3.3074,
"step": 38250
},
{
"epoch": 11.162935758423691,
"grad_norm": 0.3273851275444031,
"learning_rate": 0.0004663202099737532,
"loss": 3.3194,
"step": 38300
},
{
"epoch": 11.177509618747814,
"grad_norm": 0.3394640386104584,
"learning_rate": 0.0004661452318460192,
"loss": 3.3129,
"step": 38350
},
{
"epoch": 11.192083479071936,
"grad_norm": 0.3485426902770996,
"learning_rate": 0.00046597025371828516,
"loss": 3.3283,
"step": 38400
},
{
"epoch": 11.20665733939606,
"grad_norm": 0.34133514761924744,
"learning_rate": 0.00046579527559055116,
"loss": 3.3129,
"step": 38450
},
{
"epoch": 11.221231199720181,
"grad_norm": 0.33524519205093384,
"learning_rate": 0.0004656202974628171,
"loss": 3.3224,
"step": 38500
},
{
"epoch": 11.235805060044305,
"grad_norm": 0.3436400294303894,
"learning_rate": 0.00046544531933508305,
"loss": 3.3293,
"step": 38550
},
{
"epoch": 11.250378920368426,
"grad_norm": null,
"learning_rate": 0.00046527034120734905,
"loss": 3.3313,
"step": 38600
},
{
"epoch": 11.26495278069255,
"grad_norm": 0.3648681044578552,
"learning_rate": 0.00046509536307961504,
"loss": 3.32,
"step": 38650
},
{
"epoch": 11.279526641016673,
"grad_norm": 0.3386353850364685,
"learning_rate": 0.00046492038495188093,
"loss": 3.3257,
"step": 38700
},
{
"epoch": 11.294100501340795,
"grad_norm": 0.4027436077594757,
"learning_rate": 0.00046474540682414693,
"loss": 3.325,
"step": 38750
},
{
"epoch": 11.308674361664918,
"grad_norm": 0.36172330379486084,
"learning_rate": 0.00046457042869641293,
"loss": 3.331,
"step": 38800
},
{
"epoch": 11.32324822198904,
"grad_norm": 0.36298421025276184,
"learning_rate": 0.0004643954505686789,
"loss": 3.3357,
"step": 38850
},
{
"epoch": 11.337822082313163,
"grad_norm": 0.36087754368782043,
"learning_rate": 0.0004642204724409448,
"loss": 3.3367,
"step": 38900
},
{
"epoch": 11.352395942637285,
"grad_norm": 0.3267645537853241,
"learning_rate": 0.0004640454943132108,
"loss": 3.3455,
"step": 38950
},
{
"epoch": 11.366969802961409,
"grad_norm": 0.3678348660469055,
"learning_rate": 0.0004638705161854768,
"loss": 3.3442,
"step": 39000
},
{
"epoch": 11.366969802961409,
"eval_accuracy": 0.36826857313753114,
"eval_loss": 3.570142984390259,
"eval_runtime": 53.2477,
"eval_samples_per_second": 312.257,
"eval_steps_per_second": 19.531,
"step": 39000
},
{
"epoch": 11.38154366328553,
"grad_norm": 0.326987624168396,
"learning_rate": 0.0004636955380577427,
"loss": 3.3472,
"step": 39050
},
{
"epoch": 11.396117523609654,
"grad_norm": 0.3479270339012146,
"learning_rate": 0.0004635205599300087,
"loss": 3.334,
"step": 39100
},
{
"epoch": 11.410691383933777,
"grad_norm": 0.3902701437473297,
"learning_rate": 0.0004633455818022747,
"loss": 3.3365,
"step": 39150
},
{
"epoch": 11.425265244257899,
"grad_norm": 0.33477523922920227,
"learning_rate": 0.00046317060367454064,
"loss": 3.3543,
"step": 39200
},
{
"epoch": 11.439839104582022,
"grad_norm": 0.3593793511390686,
"learning_rate": 0.0004629956255468066,
"loss": 3.3543,
"step": 39250
},
{
"epoch": 11.454412964906144,
"grad_norm": 0.31894612312316895,
"learning_rate": 0.0004628206474190726,
"loss": 3.3334,
"step": 39300
},
{
"epoch": 11.468986825230267,
"grad_norm": 0.34372231364250183,
"learning_rate": 0.0004626456692913385,
"loss": 3.3548,
"step": 39350
},
{
"epoch": 11.483560685554389,
"grad_norm": 0.35167017579078674,
"learning_rate": 0.0004624706911636045,
"loss": 3.3532,
"step": 39400
},
{
"epoch": 11.498134545878512,
"grad_norm": 0.34937718510627747,
"learning_rate": 0.00046229571303587046,
"loss": 3.3543,
"step": 39450
},
{
"epoch": 11.512708406202634,
"grad_norm": 0.342166006565094,
"learning_rate": 0.00046212073490813646,
"loss": 3.3563,
"step": 39500
},
{
"epoch": 11.527282266526758,
"grad_norm": 0.3739171028137207,
"learning_rate": 0.0004619457567804024,
"loss": 3.3604,
"step": 39550
},
{
"epoch": 11.541856126850881,
"grad_norm": 0.35929858684539795,
"learning_rate": 0.0004617707786526684,
"loss": 3.3605,
"step": 39600
},
{
"epoch": 11.556429987175003,
"grad_norm": 0.3883790969848633,
"learning_rate": 0.00046159580052493435,
"loss": 3.3472,
"step": 39650
},
{
"epoch": 11.571003847499126,
"grad_norm": 0.3464301824569702,
"learning_rate": 0.0004614208223972003,
"loss": 3.3535,
"step": 39700
},
{
"epoch": 11.585577707823248,
"grad_norm": 0.3334825038909912,
"learning_rate": 0.0004612458442694663,
"loss": 3.3482,
"step": 39750
},
{
"epoch": 11.600151568147371,
"grad_norm": 0.3654293119907379,
"learning_rate": 0.0004610708661417323,
"loss": 3.3473,
"step": 39800
},
{
"epoch": 11.614725428471493,
"grad_norm": 0.3614482879638672,
"learning_rate": 0.0004608958880139982,
"loss": 3.3627,
"step": 39850
},
{
"epoch": 11.629299288795616,
"grad_norm": 0.3462158441543579,
"learning_rate": 0.00046072090988626417,
"loss": 3.3557,
"step": 39900
},
{
"epoch": 11.643873149119738,
"grad_norm": 0.35264861583709717,
"learning_rate": 0.00046054593175853017,
"loss": 3.361,
"step": 39950
},
{
"epoch": 11.658447009443861,
"grad_norm": 0.3280007243156433,
"learning_rate": 0.00046037095363079606,
"loss": 3.3643,
"step": 40000
},
{
"epoch": 11.658447009443861,
"eval_accuracy": 0.36889142538890307,
"eval_loss": 3.5624794960021973,
"eval_runtime": 53.3577,
"eval_samples_per_second": 311.614,
"eval_steps_per_second": 19.491,
"step": 40000
},
{
"epoch": 11.673020869767985,
"grad_norm": 0.34470710158348083,
"learning_rate": 0.00046019597550306206,
"loss": 3.3638,
"step": 40050
},
{
"epoch": 11.687594730092107,
"grad_norm": 0.3455049693584442,
"learning_rate": 0.00046002099737532806,
"loss": 3.3741,
"step": 40100
},
{
"epoch": 11.70216859041623,
"grad_norm": 0.353929340839386,
"learning_rate": 0.00045984601924759405,
"loss": 3.3658,
"step": 40150
},
{
"epoch": 11.716742450740352,
"grad_norm": 0.34111636877059937,
"learning_rate": 0.00045967104111985994,
"loss": 3.3672,
"step": 40200
},
{
"epoch": 11.731316311064475,
"grad_norm": 0.3463384509086609,
"learning_rate": 0.00045949606299212594,
"loss": 3.3819,
"step": 40250
},
{
"epoch": 11.745890171388597,
"grad_norm": 0.3586461842060089,
"learning_rate": 0.00045932108486439194,
"loss": 3.3647,
"step": 40300
},
{
"epoch": 11.76046403171272,
"grad_norm": 0.3610433042049408,
"learning_rate": 0.0004591461067366579,
"loss": 3.3772,
"step": 40350
},
{
"epoch": 11.775037892036842,
"grad_norm": 0.363849937915802,
"learning_rate": 0.0004589711286089238,
"loss": 3.3714,
"step": 40400
},
{
"epoch": 11.789611752360965,
"grad_norm": 0.3719967305660248,
"learning_rate": 0.0004587961504811898,
"loss": 3.3634,
"step": 40450
},
{
"epoch": 11.804185612685089,
"grad_norm": 0.3673126995563507,
"learning_rate": 0.00045862117235345577,
"loss": 3.3663,
"step": 40500
},
{
"epoch": 11.81875947300921,
"grad_norm": 0.3462914228439331,
"learning_rate": 0.00045844619422572176,
"loss": 3.3805,
"step": 40550
},
{
"epoch": 11.833333333333334,
"grad_norm": 0.3419555425643921,
"learning_rate": 0.0004582712160979877,
"loss": 3.3786,
"step": 40600
},
{
"epoch": 11.847907193657456,
"grad_norm": 0.35096514225006104,
"learning_rate": 0.00045809623797025365,
"loss": 3.3566,
"step": 40650
},
{
"epoch": 11.862481053981579,
"grad_norm": 0.3647240698337555,
"learning_rate": 0.00045792125984251965,
"loss": 3.3779,
"step": 40700
},
{
"epoch": 11.8770549143057,
"grad_norm": 0.35765257477760315,
"learning_rate": 0.0004577462817147856,
"loss": 3.3644,
"step": 40750
},
{
"epoch": 11.891628774629824,
"grad_norm": 0.3552255630493164,
"learning_rate": 0.0004575713035870516,
"loss": 3.3735,
"step": 40800
},
{
"epoch": 11.906202634953946,
"grad_norm": 0.3263258635997772,
"learning_rate": 0.00045739632545931753,
"loss": 3.3768,
"step": 40850
},
{
"epoch": 11.92077649527807,
"grad_norm": 0.3834480345249176,
"learning_rate": 0.00045722134733158353,
"loss": 3.3691,
"step": 40900
},
{
"epoch": 11.935350355602193,
"grad_norm": 0.3409174084663391,
"learning_rate": 0.0004570463692038495,
"loss": 3.3798,
"step": 40950
},
{
"epoch": 11.949924215926314,
"grad_norm": 0.338615745306015,
"learning_rate": 0.0004568713910761154,
"loss": 3.3835,
"step": 41000
},
{
"epoch": 11.949924215926314,
"eval_accuracy": 0.3692655839881305,
"eval_loss": 3.555158853530884,
"eval_runtime": 53.1961,
"eval_samples_per_second": 312.56,
"eval_steps_per_second": 19.55,
"step": 41000
},
{
"epoch": 11.964498076250438,
"grad_norm": 0.34794795513153076,
"learning_rate": 0.0004566964129483814,
"loss": 3.3811,
"step": 41050
},
{
"epoch": 11.97907193657456,
"grad_norm": 0.3606904149055481,
"learning_rate": 0.0004565214348206474,
"loss": 3.3831,
"step": 41100
},
{
"epoch": 11.993645796898683,
"grad_norm": 0.37181276082992554,
"learning_rate": 0.0004563464566929133,
"loss": 3.3904,
"step": 41150
},
{
"epoch": 12.008161361781509,
"grad_norm": 0.3550557494163513,
"learning_rate": 0.0004561714785651793,
"loss": 3.3194,
"step": 41200
},
{
"epoch": 12.02273522210563,
"grad_norm": 0.37992751598358154,
"learning_rate": 0.0004559965004374453,
"loss": 3.2675,
"step": 41250
},
{
"epoch": 12.037309082429754,
"grad_norm": 0.33115464448928833,
"learning_rate": 0.0004558215223097113,
"loss": 3.2753,
"step": 41300
},
{
"epoch": 12.051882942753876,
"grad_norm": 0.3398646116256714,
"learning_rate": 0.0004556465441819772,
"loss": 3.2684,
"step": 41350
},
{
"epoch": 12.066456803078,
"grad_norm": 0.335304319858551,
"learning_rate": 0.0004554715660542432,
"loss": 3.2729,
"step": 41400
},
{
"epoch": 12.081030663402123,
"grad_norm": 0.3528042137622833,
"learning_rate": 0.0004552965879265092,
"loss": 3.286,
"step": 41450
},
{
"epoch": 12.095604523726244,
"grad_norm": 0.3518418073654175,
"learning_rate": 0.0004551216097987751,
"loss": 3.2895,
"step": 41500
},
{
"epoch": 12.110178384050368,
"grad_norm": 0.33296331763267517,
"learning_rate": 0.00045494663167104107,
"loss": 3.2851,
"step": 41550
},
{
"epoch": 12.12475224437449,
"grad_norm": 0.3649348318576813,
"learning_rate": 0.00045477165354330706,
"loss": 3.2921,
"step": 41600
},
{
"epoch": 12.139326104698613,
"grad_norm": 0.37086284160614014,
"learning_rate": 0.000454596675415573,
"loss": 3.299,
"step": 41650
},
{
"epoch": 12.153899965022735,
"grad_norm": 0.3619695007801056,
"learning_rate": 0.00045442169728783895,
"loss": 3.291,
"step": 41700
},
{
"epoch": 12.168473825346858,
"grad_norm": 0.3611195981502533,
"learning_rate": 0.00045424671916010495,
"loss": 3.2959,
"step": 41750
},
{
"epoch": 12.18304768567098,
"grad_norm": 0.33316633105278015,
"learning_rate": 0.0004540717410323709,
"loss": 3.3016,
"step": 41800
},
{
"epoch": 12.197621545995103,
"grad_norm": 0.3570798337459564,
"learning_rate": 0.0004538967629046369,
"loss": 3.3126,
"step": 41850
},
{
"epoch": 12.212195406319227,
"grad_norm": 0.37914368510246277,
"learning_rate": 0.00045372178477690283,
"loss": 3.3036,
"step": 41900
},
{
"epoch": 12.226769266643348,
"grad_norm": 0.3518165647983551,
"learning_rate": 0.0004535468066491688,
"loss": 3.3045,
"step": 41950
},
{
"epoch": 12.241343126967472,
"grad_norm": 0.35031190514564514,
"learning_rate": 0.0004533718285214348,
"loss": 3.3082,
"step": 42000
},
{
"epoch": 12.241343126967472,
"eval_accuracy": 0.3688768309672912,
"eval_loss": 3.5698370933532715,
"eval_runtime": 53.2744,
"eval_samples_per_second": 312.101,
"eval_steps_per_second": 19.522,
"step": 42000
},
{
"epoch": 12.255916987291593,
"grad_norm": 0.35693952441215515,
"learning_rate": 0.00045319685039370077,
"loss": 3.3034,
"step": 42050
},
{
"epoch": 12.270490847615717,
"grad_norm": 0.34432175755500793,
"learning_rate": 0.0004530218722659667,
"loss": 3.3106,
"step": 42100
},
{
"epoch": 12.285064707939839,
"grad_norm": 0.3476402461528778,
"learning_rate": 0.00045284689413823266,
"loss": 3.3177,
"step": 42150
},
{
"epoch": 12.299638568263962,
"grad_norm": 0.33668053150177,
"learning_rate": 0.00045267191601049866,
"loss": 3.3185,
"step": 42200
},
{
"epoch": 12.314212428588084,
"grad_norm": 0.3379969596862793,
"learning_rate": 0.00045249693788276465,
"loss": 3.3094,
"step": 42250
},
{
"epoch": 12.328786288912207,
"grad_norm": 0.3845365345478058,
"learning_rate": 0.00045232195975503054,
"loss": 3.3214,
"step": 42300
},
{
"epoch": 12.34336014923633,
"grad_norm": 0.3697710335254669,
"learning_rate": 0.00045214698162729654,
"loss": 3.3116,
"step": 42350
},
{
"epoch": 12.357934009560452,
"grad_norm": 0.33476316928863525,
"learning_rate": 0.00045197200349956254,
"loss": 3.3326,
"step": 42400
},
{
"epoch": 12.372507869884576,
"grad_norm": 0.3501075208187103,
"learning_rate": 0.00045179702537182854,
"loss": 3.3258,
"step": 42450
},
{
"epoch": 12.387081730208697,
"grad_norm": 0.3357802927494049,
"learning_rate": 0.0004516220472440944,
"loss": 3.315,
"step": 42500
},
{
"epoch": 12.40165559053282,
"grad_norm": 0.3776220977306366,
"learning_rate": 0.0004514470691163604,
"loss": 3.335,
"step": 42550
},
{
"epoch": 12.416229450856942,
"grad_norm": 0.3859766125679016,
"learning_rate": 0.0004512720909886264,
"loss": 3.3257,
"step": 42600
},
{
"epoch": 12.430803311181066,
"grad_norm": 0.33255526423454285,
"learning_rate": 0.0004510971128608923,
"loss": 3.3273,
"step": 42650
},
{
"epoch": 12.445377171505188,
"grad_norm": 0.36508461833000183,
"learning_rate": 0.0004509221347331583,
"loss": 3.3366,
"step": 42700
},
{
"epoch": 12.459951031829311,
"grad_norm": 0.3575635254383087,
"learning_rate": 0.0004507471566054243,
"loss": 3.3386,
"step": 42750
},
{
"epoch": 12.474524892153434,
"grad_norm": 0.35500335693359375,
"learning_rate": 0.00045057217847769025,
"loss": 3.3267,
"step": 42800
},
{
"epoch": 12.489098752477556,
"grad_norm": 0.3265441656112671,
"learning_rate": 0.0004503972003499562,
"loss": 3.3363,
"step": 42850
},
{
"epoch": 12.50367261280168,
"grad_norm": 0.3403628468513489,
"learning_rate": 0.0004502222222222222,
"loss": 3.3262,
"step": 42900
},
{
"epoch": 12.518246473125801,
"grad_norm": 0.343070387840271,
"learning_rate": 0.00045004724409448813,
"loss": 3.337,
"step": 42950
},
{
"epoch": 12.532820333449925,
"grad_norm": 0.34135547280311584,
"learning_rate": 0.00044987226596675413,
"loss": 3.3344,
"step": 43000
},
{
"epoch": 12.532820333449925,
"eval_accuracy": 0.3695046264904994,
"eval_loss": 3.561704158782959,
"eval_runtime": 53.3031,
"eval_samples_per_second": 311.933,
"eval_steps_per_second": 19.511,
"step": 43000
},
{
"epoch": 12.547394193774046,
"grad_norm": 0.34756624698638916,
"learning_rate": 0.0004496972878390201,
"loss": 3.3327,
"step": 43050
},
{
"epoch": 12.56196805409817,
"grad_norm": 0.36216285824775696,
"learning_rate": 0.000449522309711286,
"loss": 3.3483,
"step": 43100
},
{
"epoch": 12.576541914422291,
"grad_norm": 0.3356451690196991,
"learning_rate": 0.000449347331583552,
"loss": 3.334,
"step": 43150
},
{
"epoch": 12.591115774746415,
"grad_norm": 0.3678639531135559,
"learning_rate": 0.000449172353455818,
"loss": 3.3387,
"step": 43200
},
{
"epoch": 12.605689635070538,
"grad_norm": 0.3456084132194519,
"learning_rate": 0.0004489973753280839,
"loss": 3.3369,
"step": 43250
},
{
"epoch": 12.62026349539466,
"grad_norm": 0.3634917736053467,
"learning_rate": 0.0004488223972003499,
"loss": 3.3413,
"step": 43300
},
{
"epoch": 12.634837355718783,
"grad_norm": 0.35174956917762756,
"learning_rate": 0.0004486474190726159,
"loss": 3.3357,
"step": 43350
},
{
"epoch": 12.649411216042905,
"grad_norm": 0.3444618284702301,
"learning_rate": 0.0004484724409448819,
"loss": 3.344,
"step": 43400
},
{
"epoch": 12.663985076367029,
"grad_norm": 0.36315611004829407,
"learning_rate": 0.0004482974628171478,
"loss": 3.3515,
"step": 43450
},
{
"epoch": 12.67855893669115,
"grad_norm": 0.3482201099395752,
"learning_rate": 0.0004481224846894138,
"loss": 3.3423,
"step": 43500
},
{
"epoch": 12.693132797015274,
"grad_norm": 0.33746659755706787,
"learning_rate": 0.0004479475065616798,
"loss": 3.3552,
"step": 43550
},
{
"epoch": 12.707706657339395,
"grad_norm": 0.32812613248825073,
"learning_rate": 0.00044777252843394567,
"loss": 3.3375,
"step": 43600
},
{
"epoch": 12.722280517663519,
"grad_norm": 0.36320027709007263,
"learning_rate": 0.00044759755030621167,
"loss": 3.3518,
"step": 43650
},
{
"epoch": 12.736854377987642,
"grad_norm": 0.3756552040576935,
"learning_rate": 0.00044742257217847767,
"loss": 3.3594,
"step": 43700
},
{
"epoch": 12.751428238311764,
"grad_norm": 0.3401622474193573,
"learning_rate": 0.00044724759405074366,
"loss": 3.3632,
"step": 43750
},
{
"epoch": 12.766002098635887,
"grad_norm": 0.34506717324256897,
"learning_rate": 0.00044707261592300955,
"loss": 3.3487,
"step": 43800
},
{
"epoch": 12.780575958960009,
"grad_norm": 0.3645191490650177,
"learning_rate": 0.00044689763779527555,
"loss": 3.3469,
"step": 43850
},
{
"epoch": 12.795149819284132,
"grad_norm": 0.35409530997276306,
"learning_rate": 0.00044672265966754155,
"loss": 3.3453,
"step": 43900
},
{
"epoch": 12.809723679608254,
"grad_norm": 0.3320823907852173,
"learning_rate": 0.0004465476815398075,
"loss": 3.3636,
"step": 43950
},
{
"epoch": 12.824297539932378,
"grad_norm": 0.34940165281295776,
"learning_rate": 0.00044637270341207344,
"loss": 3.3591,
"step": 44000
},
{
"epoch": 12.824297539932378,
"eval_accuracy": 0.36982064279717625,
"eval_loss": 3.5522236824035645,
"eval_runtime": 53.3823,
"eval_samples_per_second": 311.47,
"eval_steps_per_second": 19.482,
"step": 44000
},
{
"epoch": 12.8388714002565,
"grad_norm": 0.36628594994544983,
"learning_rate": 0.00044619772528433943,
"loss": 3.3656,
"step": 44050
},
{
"epoch": 12.853445260580623,
"grad_norm": 0.3620845675468445,
"learning_rate": 0.0004460227471566054,
"loss": 3.3576,
"step": 44100
},
{
"epoch": 12.868019120904744,
"grad_norm": 0.34851986169815063,
"learning_rate": 0.0004458477690288714,
"loss": 3.3474,
"step": 44150
},
{
"epoch": 12.882592981228868,
"grad_norm": 0.37146246433258057,
"learning_rate": 0.0004456727909011373,
"loss": 3.3361,
"step": 44200
},
{
"epoch": 12.897166841552991,
"grad_norm": 0.3710069954395294,
"learning_rate": 0.00044549781277340326,
"loss": 3.3563,
"step": 44250
},
{
"epoch": 12.911740701877113,
"grad_norm": 0.3436692953109741,
"learning_rate": 0.00044532283464566926,
"loss": 3.3659,
"step": 44300
},
{
"epoch": 12.926314562201236,
"grad_norm": 0.3440268635749817,
"learning_rate": 0.0004451478565179352,
"loss": 3.3578,
"step": 44350
},
{
"epoch": 12.940888422525358,
"grad_norm": 0.3370705246925354,
"learning_rate": 0.00044497287839020115,
"loss": 3.3602,
"step": 44400
},
{
"epoch": 12.955462282849481,
"grad_norm": 0.3349348306655884,
"learning_rate": 0.00044479790026246714,
"loss": 3.3533,
"step": 44450
},
{
"epoch": 12.970036143173603,
"grad_norm": 0.3440280854701996,
"learning_rate": 0.00044462292213473314,
"loss": 3.3597,
"step": 44500
},
{
"epoch": 12.984610003497727,
"grad_norm": 0.37189579010009766,
"learning_rate": 0.00044444794400699903,
"loss": 3.3446,
"step": 44550
},
{
"epoch": 12.999183863821848,
"grad_norm": 0.3803972899913788,
"learning_rate": 0.00044427296587926503,
"loss": 3.3488,
"step": 44600
},
{
"epoch": 13.013699428704676,
"grad_norm": 0.3448667824268341,
"learning_rate": 0.000444097987751531,
"loss": 3.2586,
"step": 44650
},
{
"epoch": 13.028273289028798,
"grad_norm": 0.3495483696460724,
"learning_rate": 0.000443923009623797,
"loss": 3.2472,
"step": 44700
},
{
"epoch": 13.042847149352921,
"grad_norm": 0.3531075119972229,
"learning_rate": 0.0004437480314960629,
"loss": 3.2499,
"step": 44750
},
{
"epoch": 13.057421009677043,
"grad_norm": 0.33505311608314514,
"learning_rate": 0.0004435730533683289,
"loss": 3.2616,
"step": 44800
},
{
"epoch": 13.071994870001166,
"grad_norm": 0.37032684683799744,
"learning_rate": 0.0004433980752405949,
"loss": 3.2688,
"step": 44850
},
{
"epoch": 13.086568730325288,
"grad_norm": 0.3590488135814667,
"learning_rate": 0.0004432230971128609,
"loss": 3.264,
"step": 44900
},
{
"epoch": 13.101142590649411,
"grad_norm": 0.3663865029811859,
"learning_rate": 0.0004430481189851268,
"loss": 3.2836,
"step": 44950
},
{
"epoch": 13.115716450973533,
"grad_norm": 0.32889941334724426,
"learning_rate": 0.0004428731408573928,
"loss": 3.271,
"step": 45000
},
{
"epoch": 13.115716450973533,
"eval_accuracy": 0.36953016672832023,
"eval_loss": 3.5666751861572266,
"eval_runtime": 53.3385,
"eval_samples_per_second": 311.726,
"eval_steps_per_second": 19.498,
"step": 45000
},
{
"epoch": 13.130290311297657,
"grad_norm": 0.32199689745903015,
"learning_rate": 0.0004426981627296588,
"loss": 3.2712,
"step": 45050
},
{
"epoch": 13.14486417162178,
"grad_norm": 0.3411182761192322,
"learning_rate": 0.00044252318460192473,
"loss": 3.274,
"step": 45100
},
{
"epoch": 13.159438031945902,
"grad_norm": 0.3830653429031372,
"learning_rate": 0.0004423482064741907,
"loss": 3.2699,
"step": 45150
},
{
"epoch": 13.174011892270025,
"grad_norm": 0.3356707990169525,
"learning_rate": 0.0004421732283464567,
"loss": 3.276,
"step": 45200
},
{
"epoch": 13.188585752594147,
"grad_norm": 0.36504417657852173,
"learning_rate": 0.0004419982502187226,
"loss": 3.2904,
"step": 45250
},
{
"epoch": 13.20315961291827,
"grad_norm": 0.35199934244155884,
"learning_rate": 0.00044182327209098856,
"loss": 3.2868,
"step": 45300
},
{
"epoch": 13.217733473242392,
"grad_norm": 0.3373820185661316,
"learning_rate": 0.00044164829396325456,
"loss": 3.2815,
"step": 45350
},
{
"epoch": 13.232307333566515,
"grad_norm": 0.3519679009914398,
"learning_rate": 0.0004414733158355205,
"loss": 3.294,
"step": 45400
},
{
"epoch": 13.246881193890637,
"grad_norm": 0.3462173342704773,
"learning_rate": 0.0004412983377077865,
"loss": 3.2805,
"step": 45450
},
{
"epoch": 13.26145505421476,
"grad_norm": 0.3485550880432129,
"learning_rate": 0.00044112335958005244,
"loss": 3.2979,
"step": 45500
},
{
"epoch": 13.276028914538884,
"grad_norm": 0.3813554346561432,
"learning_rate": 0.0004409483814523184,
"loss": 3.2948,
"step": 45550
},
{
"epoch": 13.290602774863006,
"grad_norm": 0.36145317554473877,
"learning_rate": 0.0004407734033245844,
"loss": 3.3003,
"step": 45600
},
{
"epoch": 13.305176635187129,
"grad_norm": 0.4004374146461487,
"learning_rate": 0.0004405984251968504,
"loss": 3.2865,
"step": 45650
},
{
"epoch": 13.31975049551125,
"grad_norm": 0.3874744176864624,
"learning_rate": 0.0004404234470691163,
"loss": 3.2975,
"step": 45700
},
{
"epoch": 13.334324355835374,
"grad_norm": 0.3737644553184509,
"learning_rate": 0.00044024846894138227,
"loss": 3.312,
"step": 45750
},
{
"epoch": 13.348898216159496,
"grad_norm": 0.3732014298439026,
"learning_rate": 0.00044007349081364827,
"loss": 3.3073,
"step": 45800
},
{
"epoch": 13.36347207648362,
"grad_norm": 0.33516380190849304,
"learning_rate": 0.00043989851268591427,
"loss": 3.2945,
"step": 45850
},
{
"epoch": 13.378045936807741,
"grad_norm": 0.3793085217475891,
"learning_rate": 0.00043972353455818016,
"loss": 3.3081,
"step": 45900
},
{
"epoch": 13.392619797131864,
"grad_norm": 0.3479321599006653,
"learning_rate": 0.00043954855643044615,
"loss": 3.3064,
"step": 45950
},
{
"epoch": 13.407193657455988,
"grad_norm": 0.3450932204723358,
"learning_rate": 0.00043937357830271215,
"loss": 3.3146,
"step": 46000
},
{
"epoch": 13.407193657455988,
"eval_accuracy": 0.36981722958567026,
"eval_loss": 3.5598223209381104,
"eval_runtime": 53.3903,
"eval_samples_per_second": 311.424,
"eval_steps_per_second": 19.479,
"step": 46000
},
{
"epoch": 13.42176751778011,
"grad_norm": 0.35616064071655273,
"learning_rate": 0.0004391986001749781,
"loss": 3.329,
"step": 46050
},
{
"epoch": 13.436341378104233,
"grad_norm": 0.3846951127052307,
"learning_rate": 0.00043902362204724404,
"loss": 3.3084,
"step": 46100
},
{
"epoch": 13.450915238428355,
"grad_norm": 0.335750550031662,
"learning_rate": 0.00043884864391951004,
"loss": 3.3181,
"step": 46150
},
{
"epoch": 13.465489098752478,
"grad_norm": 0.3532959818840027,
"learning_rate": 0.00043867366579177603,
"loss": 3.3181,
"step": 46200
},
{
"epoch": 13.4800629590766,
"grad_norm": 0.35488593578338623,
"learning_rate": 0.0004384986876640419,
"loss": 3.3174,
"step": 46250
},
{
"epoch": 13.494636819400723,
"grad_norm": 0.3597154915332794,
"learning_rate": 0.0004383237095363079,
"loss": 3.3166,
"step": 46300
},
{
"epoch": 13.509210679724845,
"grad_norm": 0.37795230746269226,
"learning_rate": 0.0004381487314085739,
"loss": 3.3249,
"step": 46350
},
{
"epoch": 13.523784540048968,
"grad_norm": 0.3656541407108307,
"learning_rate": 0.00043797375328083986,
"loss": 3.3218,
"step": 46400
},
{
"epoch": 13.538358400373092,
"grad_norm": 0.36606359481811523,
"learning_rate": 0.0004377987751531058,
"loss": 3.3175,
"step": 46450
},
{
"epoch": 13.552932260697213,
"grad_norm": 0.352518230676651,
"learning_rate": 0.0004376237970253718,
"loss": 3.3238,
"step": 46500
},
{
"epoch": 13.567506121021337,
"grad_norm": 0.32122334837913513,
"learning_rate": 0.00043744881889763775,
"loss": 3.3217,
"step": 46550
},
{
"epoch": 13.582079981345458,
"grad_norm": 0.3561324179172516,
"learning_rate": 0.00043727384076990374,
"loss": 3.3216,
"step": 46600
},
{
"epoch": 13.596653841669582,
"grad_norm": 0.389505535364151,
"learning_rate": 0.0004370988626421697,
"loss": 3.3181,
"step": 46650
},
{
"epoch": 13.611227701993704,
"grad_norm": 0.34612900018692017,
"learning_rate": 0.00043692388451443563,
"loss": 3.3226,
"step": 46700
},
{
"epoch": 13.625801562317827,
"grad_norm": 0.350142240524292,
"learning_rate": 0.00043674890638670163,
"loss": 3.3378,
"step": 46750
},
{
"epoch": 13.640375422641949,
"grad_norm": 0.3523232638835907,
"learning_rate": 0.0004365739282589676,
"loss": 3.3313,
"step": 46800
},
{
"epoch": 13.654949282966072,
"grad_norm": 0.36117634177207947,
"learning_rate": 0.0004363989501312335,
"loss": 3.3417,
"step": 46850
},
{
"epoch": 13.669523143290196,
"grad_norm": 0.34388113021850586,
"learning_rate": 0.0004362239720034995,
"loss": 3.3294,
"step": 46900
},
{
"epoch": 13.684097003614317,
"grad_norm": 0.3423570990562439,
"learning_rate": 0.0004360489938757655,
"loss": 3.3376,
"step": 46950
},
{
"epoch": 13.69867086393844,
"grad_norm": 0.34624671936035156,
"learning_rate": 0.0004358740157480315,
"loss": 3.3277,
"step": 47000
},
{
"epoch": 13.69867086393844,
"eval_accuracy": 0.37011971074327155,
"eval_loss": 3.553891658782959,
"eval_runtime": 53.1879,
"eval_samples_per_second": 312.609,
"eval_steps_per_second": 19.553,
"step": 47000
},
{
"epoch": 13.713244724262562,
"grad_norm": 0.35453182458877563,
"learning_rate": 0.0004356990376202974,
"loss": 3.3401,
"step": 47050
},
{
"epoch": 13.727818584586686,
"grad_norm": 0.3806355893611908,
"learning_rate": 0.0004355240594925634,
"loss": 3.3272,
"step": 47100
},
{
"epoch": 13.742392444910807,
"grad_norm": 0.3244573473930359,
"learning_rate": 0.0004353490813648294,
"loss": 3.332,
"step": 47150
},
{
"epoch": 13.756966305234931,
"grad_norm": 0.35886478424072266,
"learning_rate": 0.0004351741032370953,
"loss": 3.3388,
"step": 47200
},
{
"epoch": 13.771540165559053,
"grad_norm": 0.3819633722305298,
"learning_rate": 0.0004349991251093613,
"loss": 3.3411,
"step": 47250
},
{
"epoch": 13.786114025883176,
"grad_norm": 0.36600205302238464,
"learning_rate": 0.0004348241469816273,
"loss": 3.334,
"step": 47300
},
{
"epoch": 13.8006878862073,
"grad_norm": 0.36532509326934814,
"learning_rate": 0.0004346491688538932,
"loss": 3.3333,
"step": 47350
},
{
"epoch": 13.815261746531421,
"grad_norm": 0.3700556457042694,
"learning_rate": 0.00043447419072615916,
"loss": 3.3347,
"step": 47400
},
{
"epoch": 13.829835606855545,
"grad_norm": 0.32723188400268555,
"learning_rate": 0.00043429921259842516,
"loss": 3.3258,
"step": 47450
},
{
"epoch": 13.844409467179666,
"grad_norm": 0.36504605412483215,
"learning_rate": 0.00043412423447069116,
"loss": 3.3328,
"step": 47500
},
{
"epoch": 13.85898332750379,
"grad_norm": 0.34605199098587036,
"learning_rate": 0.0004339492563429571,
"loss": 3.3427,
"step": 47550
},
{
"epoch": 13.873557187827911,
"grad_norm": 0.40003854036331177,
"learning_rate": 0.00043377427821522305,
"loss": 3.336,
"step": 47600
},
{
"epoch": 13.888131048152035,
"grad_norm": 0.33703961968421936,
"learning_rate": 0.00043359930008748904,
"loss": 3.3425,
"step": 47650
},
{
"epoch": 13.902704908476156,
"grad_norm": 0.3614450693130493,
"learning_rate": 0.000433424321959755,
"loss": 3.3337,
"step": 47700
},
{
"epoch": 13.91727876880028,
"grad_norm": 0.3742247223854065,
"learning_rate": 0.000433249343832021,
"loss": 3.3439,
"step": 47750
},
{
"epoch": 13.931852629124403,
"grad_norm": 0.3677355945110321,
"learning_rate": 0.00043307436570428693,
"loss": 3.3504,
"step": 47800
},
{
"epoch": 13.946426489448525,
"grad_norm": 0.34420591592788696,
"learning_rate": 0.00043289938757655287,
"loss": 3.3484,
"step": 47850
},
{
"epoch": 13.961000349772648,
"grad_norm": 0.36050641536712646,
"learning_rate": 0.00043272440944881887,
"loss": 3.3386,
"step": 47900
},
{
"epoch": 13.97557421009677,
"grad_norm": 0.3552097678184509,
"learning_rate": 0.0004325494313210848,
"loss": 3.3387,
"step": 47950
},
{
"epoch": 13.990148070420894,
"grad_norm": 0.3638666570186615,
"learning_rate": 0.00043237445319335076,
"loss": 3.3397,
"step": 48000
},
{
"epoch": 13.990148070420894,
"eval_accuracy": 0.3705688422986826,
"eval_loss": 3.5461502075195312,
"eval_runtime": 53.3477,
"eval_samples_per_second": 311.672,
"eval_steps_per_second": 19.495,
"step": 48000
},
{
"epoch": 14.00466363530372,
"grad_norm": 0.37190598249435425,
"learning_rate": 0.00043219947506561676,
"loss": 3.3138,
"step": 48050
},
{
"epoch": 14.019237495627841,
"grad_norm": 0.3626306653022766,
"learning_rate": 0.00043202449693788275,
"loss": 3.2264,
"step": 48100
},
{
"epoch": 14.033811355951965,
"grad_norm": 0.36090075969696045,
"learning_rate": 0.00043184951881014864,
"loss": 3.2424,
"step": 48150
},
{
"epoch": 14.048385216276086,
"grad_norm": 0.35671380162239075,
"learning_rate": 0.00043167454068241464,
"loss": 3.2551,
"step": 48200
},
{
"epoch": 14.06295907660021,
"grad_norm": 0.3807269334793091,
"learning_rate": 0.00043149956255468064,
"loss": 3.2405,
"step": 48250
},
{
"epoch": 14.077532936924333,
"grad_norm": 0.35869401693344116,
"learning_rate": 0.00043132458442694664,
"loss": 3.2448,
"step": 48300
},
{
"epoch": 14.092106797248455,
"grad_norm": 0.3535849452018738,
"learning_rate": 0.0004311496062992125,
"loss": 3.2553,
"step": 48350
},
{
"epoch": 14.106680657572578,
"grad_norm": 0.37184974551200867,
"learning_rate": 0.0004309746281714785,
"loss": 3.2573,
"step": 48400
},
{
"epoch": 14.1212545178967,
"grad_norm": 0.36269503831863403,
"learning_rate": 0.0004307996500437445,
"loss": 3.2596,
"step": 48450
},
{
"epoch": 14.135828378220824,
"grad_norm": 0.3423750102519989,
"learning_rate": 0.00043062467191601046,
"loss": 3.2674,
"step": 48500
},
{
"epoch": 14.150402238544945,
"grad_norm": 0.34411948919296265,
"learning_rate": 0.0004304496937882764,
"loss": 3.2583,
"step": 48550
},
{
"epoch": 14.164976098869069,
"grad_norm": 0.37814998626708984,
"learning_rate": 0.0004302747156605424,
"loss": 3.2656,
"step": 48600
},
{
"epoch": 14.17954995919319,
"grad_norm": 0.34621769189834595,
"learning_rate": 0.00043009973753280835,
"loss": 3.2716,
"step": 48650
},
{
"epoch": 14.194123819517314,
"grad_norm": 0.3595483899116516,
"learning_rate": 0.00042992475940507435,
"loss": 3.2708,
"step": 48700
},
{
"epoch": 14.208697679841436,
"grad_norm": 0.3458899259567261,
"learning_rate": 0.0004297497812773403,
"loss": 3.2623,
"step": 48750
},
{
"epoch": 14.223271540165559,
"grad_norm": 0.3649323880672455,
"learning_rate": 0.0004295748031496063,
"loss": 3.2657,
"step": 48800
},
{
"epoch": 14.237845400489682,
"grad_norm": 0.3653738796710968,
"learning_rate": 0.00042939982502187223,
"loss": 3.2688,
"step": 48850
},
{
"epoch": 14.252419260813804,
"grad_norm": 0.3623703420162201,
"learning_rate": 0.0004292248468941382,
"loss": 3.2699,
"step": 48900
},
{
"epoch": 14.266993121137928,
"grad_norm": 0.3507397770881653,
"learning_rate": 0.00042904986876640417,
"loss": 3.2814,
"step": 48950
},
{
"epoch": 14.28156698146205,
"grad_norm": 0.3713599741458893,
"learning_rate": 0.0004288748906386701,
"loss": 3.2894,
"step": 49000
},
{
"epoch": 14.28156698146205,
"eval_accuracy": 0.3698350018248912,
"eval_loss": 3.562056541442871,
"eval_runtime": 53.1481,
"eval_samples_per_second": 312.843,
"eval_steps_per_second": 19.568,
"step": 49000
},
{
"epoch": 14.296140841786173,
"grad_norm": 0.3673607110977173,
"learning_rate": 0.0004286999125109361,
"loss": 3.2879,
"step": 49050
},
{
"epoch": 14.310714702110294,
"grad_norm": 0.3480517268180847,
"learning_rate": 0.00042852493438320206,
"loss": 3.2797,
"step": 49100
},
{
"epoch": 14.325288562434418,
"grad_norm": 0.3668559491634369,
"learning_rate": 0.000428349956255468,
"loss": 3.288,
"step": 49150
},
{
"epoch": 14.33986242275854,
"grad_norm": 0.35511183738708496,
"learning_rate": 0.000428174978127734,
"loss": 3.2788,
"step": 49200
},
{
"epoch": 14.354436283082663,
"grad_norm": 0.3590772747993469,
"learning_rate": 0.000428,
"loss": 3.2956,
"step": 49250
},
{
"epoch": 14.369010143406786,
"grad_norm": 0.34500300884246826,
"learning_rate": 0.0004278250218722659,
"loss": 3.2888,
"step": 49300
},
{
"epoch": 14.383584003730908,
"grad_norm": 0.36788177490234375,
"learning_rate": 0.0004276500437445319,
"loss": 3.2891,
"step": 49350
},
{
"epoch": 14.398157864055031,
"grad_norm": 0.363623708486557,
"learning_rate": 0.0004274750656167979,
"loss": 3.2917,
"step": 49400
},
{
"epoch": 14.412731724379153,
"grad_norm": 0.35550540685653687,
"learning_rate": 0.0004273000874890639,
"loss": 3.2989,
"step": 49450
},
{
"epoch": 14.427305584703277,
"grad_norm": 0.39245107769966125,
"learning_rate": 0.00042712510936132977,
"loss": 3.2925,
"step": 49500
},
{
"epoch": 14.441879445027398,
"grad_norm": 0.38620251417160034,
"learning_rate": 0.00042695013123359576,
"loss": 3.2867,
"step": 49550
},
{
"epoch": 14.456453305351522,
"grad_norm": 0.35340312123298645,
"learning_rate": 0.00042677515310586176,
"loss": 3.3107,
"step": 49600
},
{
"epoch": 14.471027165675643,
"grad_norm": 0.35049957036972046,
"learning_rate": 0.0004266001749781277,
"loss": 3.3003,
"step": 49650
},
{
"epoch": 14.485601025999767,
"grad_norm": 0.3536100387573242,
"learning_rate": 0.00042642519685039365,
"loss": 3.2945,
"step": 49700
},
{
"epoch": 14.50017488632389,
"grad_norm": 0.3375909626483917,
"learning_rate": 0.00042625021872265965,
"loss": 3.2997,
"step": 49750
},
{
"epoch": 14.514748746648012,
"grad_norm": 0.34976649284362793,
"learning_rate": 0.0004260752405949256,
"loss": 3.3008,
"step": 49800
},
{
"epoch": 14.529322606972135,
"grad_norm": 0.3607953190803528,
"learning_rate": 0.00042590026246719153,
"loss": 3.3118,
"step": 49850
},
{
"epoch": 14.543896467296257,
"grad_norm": 0.3413010239601135,
"learning_rate": 0.00042572528433945753,
"loss": 3.3113,
"step": 49900
},
{
"epoch": 14.55847032762038,
"grad_norm": 0.34926819801330566,
"learning_rate": 0.0004255503062117235,
"loss": 3.3018,
"step": 49950
},
{
"epoch": 14.573044187944502,
"grad_norm": 0.37419041991233826,
"learning_rate": 0.00042537532808398947,
"loss": 3.3027,
"step": 50000
},
{
"epoch": 14.573044187944502,
"eval_accuracy": 0.3705401242432528,
"eval_loss": 3.5543155670166016,
"eval_runtime": 53.2259,
"eval_samples_per_second": 312.386,
"eval_steps_per_second": 19.539,
"step": 50000
},
{
"epoch": 14.587618048268626,
"grad_norm": 0.3565591275691986,
"learning_rate": 0.0004252003499562554,
"loss": 3.3013,
"step": 50050
},
{
"epoch": 14.602191908592747,
"grad_norm": 0.36905860900878906,
"learning_rate": 0.0004250253718285214,
"loss": 3.2943,
"step": 50100
},
{
"epoch": 14.61676576891687,
"grad_norm": 0.3611801564693451,
"learning_rate": 0.00042485039370078736,
"loss": 3.3154,
"step": 50150
},
{
"epoch": 14.631339629240994,
"grad_norm": 0.36640042066574097,
"learning_rate": 0.00042467541557305335,
"loss": 3.3241,
"step": 50200
},
{
"epoch": 14.645913489565116,
"grad_norm": 0.34255263209342957,
"learning_rate": 0.0004245004374453193,
"loss": 3.3169,
"step": 50250
},
{
"epoch": 14.66048734988924,
"grad_norm": 0.3561849296092987,
"learning_rate": 0.00042432545931758524,
"loss": 3.3116,
"step": 50300
},
{
"epoch": 14.67506121021336,
"grad_norm": 0.44158047437667847,
"learning_rate": 0.00042415048118985124,
"loss": 3.3122,
"step": 50350
},
{
"epoch": 14.689635070537484,
"grad_norm": 0.37477970123291016,
"learning_rate": 0.00042397550306211724,
"loss": 3.325,
"step": 50400
},
{
"epoch": 14.704208930861606,
"grad_norm": 0.34694111347198486,
"learning_rate": 0.0004238005249343831,
"loss": 3.3128,
"step": 50450
},
{
"epoch": 14.71878279118573,
"grad_norm": 0.3817085027694702,
"learning_rate": 0.0004236255468066491,
"loss": 3.3179,
"step": 50500
},
{
"epoch": 14.733356651509851,
"grad_norm": 0.3483661115169525,
"learning_rate": 0.0004234505686789151,
"loss": 3.3184,
"step": 50550
},
{
"epoch": 14.747930511833975,
"grad_norm": 0.3755340576171875,
"learning_rate": 0.0004232755905511811,
"loss": 3.322,
"step": 50600
},
{
"epoch": 14.762504372158098,
"grad_norm": 0.3704056441783905,
"learning_rate": 0.000423100612423447,
"loss": 3.315,
"step": 50650
},
{
"epoch": 14.77707823248222,
"grad_norm": 0.35658955574035645,
"learning_rate": 0.000422925634295713,
"loss": 3.326,
"step": 50700
},
{
"epoch": 14.791652092806343,
"grad_norm": 0.35138851404190063,
"learning_rate": 0.000422750656167979,
"loss": 3.3246,
"step": 50750
},
{
"epoch": 14.806225953130465,
"grad_norm": 0.36114072799682617,
"learning_rate": 0.0004225756780402449,
"loss": 3.3263,
"step": 50800
},
{
"epoch": 14.820799813454588,
"grad_norm": 0.3541187047958374,
"learning_rate": 0.0004224006999125109,
"loss": 3.3278,
"step": 50850
},
{
"epoch": 14.83537367377871,
"grad_norm": 0.3514479696750641,
"learning_rate": 0.0004222257217847769,
"loss": 3.315,
"step": 50900
},
{
"epoch": 14.849947534102833,
"grad_norm": 0.33424443006515503,
"learning_rate": 0.00042205074365704283,
"loss": 3.3185,
"step": 50950
},
{
"epoch": 14.864521394426955,
"grad_norm": 0.3738822042942047,
"learning_rate": 0.0004218757655293088,
"loss": 3.3197,
"step": 51000
},
{
"epoch": 14.864521394426955,
"eval_accuracy": 0.37113108062158584,
"eval_loss": 3.5453720092773438,
"eval_runtime": 53.2808,
"eval_samples_per_second": 312.064,
"eval_steps_per_second": 19.519,
"step": 51000
},
{
"epoch": 14.879095254751078,
"grad_norm": 0.3561355471611023,
"learning_rate": 0.0004217007874015748,
"loss": 3.3336,
"step": 51050
},
{
"epoch": 14.893669115075202,
"grad_norm": 0.34328049421310425,
"learning_rate": 0.0004215258092738407,
"loss": 3.3355,
"step": 51100
},
{
"epoch": 14.908242975399324,
"grad_norm": 0.3354352116584778,
"learning_rate": 0.0004213508311461067,
"loss": 3.3321,
"step": 51150
},
{
"epoch": 14.922816835723447,
"grad_norm": 0.35511845350265503,
"learning_rate": 0.00042117585301837266,
"loss": 3.3387,
"step": 51200
},
{
"epoch": 14.937390696047569,
"grad_norm": 0.34938520193099976,
"learning_rate": 0.0004210008748906386,
"loss": 3.3265,
"step": 51250
},
{
"epoch": 14.951964556371692,
"grad_norm": 0.3668470084667206,
"learning_rate": 0.0004208258967629046,
"loss": 3.3209,
"step": 51300
},
{
"epoch": 14.966538416695814,
"grad_norm": 0.34431192278862,
"learning_rate": 0.0004206509186351706,
"loss": 3.3524,
"step": 51350
},
{
"epoch": 14.981112277019937,
"grad_norm": 0.3520943820476532,
"learning_rate": 0.00042047594050743654,
"loss": 3.3261,
"step": 51400
},
{
"epoch": 14.995686137344059,
"grad_norm": 0.3720589876174927,
"learning_rate": 0.0004203009623797025,
"loss": 3.3256,
"step": 51450
},
{
"epoch": 15.010201702226885,
"grad_norm": 0.3676832914352417,
"learning_rate": 0.0004201259842519685,
"loss": 3.244,
"step": 51500
},
{
"epoch": 15.024775562551008,
"grad_norm": 0.3591252267360687,
"learning_rate": 0.0004199510061242344,
"loss": 3.2082,
"step": 51550
},
{
"epoch": 15.039349422875132,
"grad_norm": 0.36459583044052124,
"learning_rate": 0.00041977602799650037,
"loss": 3.2133,
"step": 51600
},
{
"epoch": 15.053923283199254,
"grad_norm": 0.3644467890262604,
"learning_rate": 0.00041960104986876637,
"loss": 3.2377,
"step": 51650
},
{
"epoch": 15.068497143523377,
"grad_norm": 0.3457610011100769,
"learning_rate": 0.00041942607174103236,
"loss": 3.2313,
"step": 51700
},
{
"epoch": 15.083071003847499,
"grad_norm": 0.39632362127304077,
"learning_rate": 0.00041925109361329825,
"loss": 3.2374,
"step": 51750
},
{
"epoch": 15.097644864171622,
"grad_norm": 0.36543601751327515,
"learning_rate": 0.00041907611548556425,
"loss": 3.239,
"step": 51800
},
{
"epoch": 15.112218724495744,
"grad_norm": 0.3530130088329315,
"learning_rate": 0.00041890113735783025,
"loss": 3.2509,
"step": 51850
},
{
"epoch": 15.126792584819867,
"grad_norm": 0.3841630816459656,
"learning_rate": 0.00041872615923009625,
"loss": 3.2485,
"step": 51900
},
{
"epoch": 15.141366445143989,
"grad_norm": 0.3543168008327484,
"learning_rate": 0.00041855118110236214,
"loss": 3.2474,
"step": 51950
},
{
"epoch": 15.155940305468112,
"grad_norm": 0.3882521390914917,
"learning_rate": 0.00041837620297462813,
"loss": 3.2525,
"step": 52000
},
{
"epoch": 15.155940305468112,
"eval_accuracy": 0.37026000550586324,
"eval_loss": 3.563176155090332,
"eval_runtime": 53.3952,
"eval_samples_per_second": 311.395,
"eval_steps_per_second": 19.477,
"step": 52000
},
{
"epoch": 15.170514165792236,
"grad_norm": 0.3743119239807129,
"learning_rate": 0.00041820122484689413,
"loss": 3.2451,
"step": 52050
},
{
"epoch": 15.185088026116357,
"grad_norm": 0.35586288571357727,
"learning_rate": 0.0004180262467191601,
"loss": 3.2701,
"step": 52100
},
{
"epoch": 15.19966188644048,
"grad_norm": 0.3539133667945862,
"learning_rate": 0.000417851268591426,
"loss": 3.258,
"step": 52150
},
{
"epoch": 15.214235746764603,
"grad_norm": 0.37858128547668457,
"learning_rate": 0.000417676290463692,
"loss": 3.2425,
"step": 52200
},
{
"epoch": 15.228809607088726,
"grad_norm": 0.35201990604400635,
"learning_rate": 0.00041750131233595796,
"loss": 3.2686,
"step": 52250
},
{
"epoch": 15.243383467412848,
"grad_norm": 0.36704257130622864,
"learning_rate": 0.00041732633420822396,
"loss": 3.2593,
"step": 52300
},
{
"epoch": 15.257957327736971,
"grad_norm": 0.33171460032463074,
"learning_rate": 0.0004171513560804899,
"loss": 3.2692,
"step": 52350
},
{
"epoch": 15.272531188061093,
"grad_norm": 0.36244454979896545,
"learning_rate": 0.00041697637795275584,
"loss": 3.2672,
"step": 52400
},
{
"epoch": 15.287105048385216,
"grad_norm": 0.34771937131881714,
"learning_rate": 0.00041680139982502184,
"loss": 3.2883,
"step": 52450
},
{
"epoch": 15.30167890870934,
"grad_norm": 0.35113075375556946,
"learning_rate": 0.0004166264216972878,
"loss": 3.2633,
"step": 52500
},
{
"epoch": 15.316252769033461,
"grad_norm": 0.3820761442184448,
"learning_rate": 0.00041645144356955373,
"loss": 3.289,
"step": 52550
},
{
"epoch": 15.330826629357585,
"grad_norm": 0.35176438093185425,
"learning_rate": 0.0004162764654418197,
"loss": 3.2792,
"step": 52600
},
{
"epoch": 15.345400489681706,
"grad_norm": 0.34542304277420044,
"learning_rate": 0.0004161014873140857,
"loss": 3.2843,
"step": 52650
},
{
"epoch": 15.35997435000583,
"grad_norm": 0.36796221137046814,
"learning_rate": 0.00041592650918635167,
"loss": 3.2766,
"step": 52700
},
{
"epoch": 15.374548210329952,
"grad_norm": 0.364033967256546,
"learning_rate": 0.0004157515310586176,
"loss": 3.2701,
"step": 52750
},
{
"epoch": 15.389122070654075,
"grad_norm": 0.34179043769836426,
"learning_rate": 0.0004155765529308836,
"loss": 3.2855,
"step": 52800
},
{
"epoch": 15.403695930978197,
"grad_norm": 0.3775258958339691,
"learning_rate": 0.0004154015748031496,
"loss": 3.2618,
"step": 52850
},
{
"epoch": 15.41826979130232,
"grad_norm": 0.3358531892299652,
"learning_rate": 0.0004152265966754155,
"loss": 3.2783,
"step": 52900
},
{
"epoch": 15.432843651626444,
"grad_norm": 0.40009891986846924,
"learning_rate": 0.0004150516185476815,
"loss": 3.2933,
"step": 52950
},
{
"epoch": 15.447417511950565,
"grad_norm": 0.34463268518447876,
"learning_rate": 0.0004148766404199475,
"loss": 3.2921,
"step": 53000
},
{
"epoch": 15.447417511950565,
"eval_accuracy": 0.3706980735481169,
"eval_loss": 3.5539512634277344,
"eval_runtime": 53.3188,
"eval_samples_per_second": 311.841,
"eval_steps_per_second": 19.505,
"step": 53000
},
{
"epoch": 15.461991372274689,
"grad_norm": 0.37157610058784485,
"learning_rate": 0.0004147016622922135,
"loss": 3.2756,
"step": 53050
},
{
"epoch": 15.47656523259881,
"grad_norm": 0.3813033699989319,
"learning_rate": 0.0004145266841644794,
"loss": 3.2895,
"step": 53100
},
{
"epoch": 15.491139092922934,
"grad_norm": 0.3535122573375702,
"learning_rate": 0.0004143517060367454,
"loss": 3.2833,
"step": 53150
},
{
"epoch": 15.505712953247055,
"grad_norm": 0.38189417123794556,
"learning_rate": 0.0004141767279090114,
"loss": 3.2982,
"step": 53200
},
{
"epoch": 15.520286813571179,
"grad_norm": 0.3614920973777771,
"learning_rate": 0.0004140017497812773,
"loss": 3.2903,
"step": 53250
},
{
"epoch": 15.5348606738953,
"grad_norm": 0.35769498348236084,
"learning_rate": 0.00041382677165354326,
"loss": 3.2955,
"step": 53300
},
{
"epoch": 15.549434534219424,
"grad_norm": 0.3598308265209198,
"learning_rate": 0.00041365179352580926,
"loss": 3.3013,
"step": 53350
},
{
"epoch": 15.564008394543547,
"grad_norm": 0.3613493740558624,
"learning_rate": 0.0004134768153980752,
"loss": 3.2847,
"step": 53400
},
{
"epoch": 15.57858225486767,
"grad_norm": 0.3721916675567627,
"learning_rate": 0.00041330183727034114,
"loss": 3.2993,
"step": 53450
},
{
"epoch": 15.593156115191793,
"grad_norm": 0.3966820538043976,
"learning_rate": 0.00041312685914260714,
"loss": 3.2931,
"step": 53500
},
{
"epoch": 15.607729975515914,
"grad_norm": 0.35601505637168884,
"learning_rate": 0.0004129518810148731,
"loss": 3.2995,
"step": 53550
},
{
"epoch": 15.622303835840038,
"grad_norm": 0.36930301785469055,
"learning_rate": 0.0004127769028871391,
"loss": 3.2891,
"step": 53600
},
{
"epoch": 15.63687769616416,
"grad_norm": 0.3665529191493988,
"learning_rate": 0.00041260192475940503,
"loss": 3.2997,
"step": 53650
},
{
"epoch": 15.651451556488283,
"grad_norm": 0.34720444679260254,
"learning_rate": 0.00041242694663167097,
"loss": 3.3025,
"step": 53700
},
{
"epoch": 15.666025416812404,
"grad_norm": 0.33357468247413635,
"learning_rate": 0.00041225196850393697,
"loss": 3.2858,
"step": 53750
},
{
"epoch": 15.680599277136528,
"grad_norm": 0.36720114946365356,
"learning_rate": 0.00041207699037620297,
"loss": 3.2931,
"step": 53800
},
{
"epoch": 15.695173137460651,
"grad_norm": 0.3451690673828125,
"learning_rate": 0.00041190201224846886,
"loss": 3.3067,
"step": 53850
},
{
"epoch": 15.709746997784773,
"grad_norm": 0.3749679625034332,
"learning_rate": 0.00041172703412073485,
"loss": 3.2982,
"step": 53900
},
{
"epoch": 15.724320858108896,
"grad_norm": 0.33904799818992615,
"learning_rate": 0.00041155205599300085,
"loss": 3.3089,
"step": 53950
},
{
"epoch": 15.738894718433018,
"grad_norm": 0.354440301656723,
"learning_rate": 0.00041137707786526685,
"loss": 3.3025,
"step": 54000
},
{
"epoch": 15.738894718433018,
"eval_accuracy": 0.3713644736704276,
"eval_loss": 3.5475549697875977,
"eval_runtime": 53.2281,
"eval_samples_per_second": 312.372,
"eval_steps_per_second": 19.539,
"step": 54000
},
{
"epoch": 15.753468578757142,
"grad_norm": 0.36913421750068665,
"learning_rate": 0.00041120209973753274,
"loss": 3.3109,
"step": 54050
},
{
"epoch": 15.768042439081263,
"grad_norm": 0.3467569649219513,
"learning_rate": 0.00041102712160979874,
"loss": 3.2863,
"step": 54100
},
{
"epoch": 15.782616299405387,
"grad_norm": 0.3800930380821228,
"learning_rate": 0.00041085214348206473,
"loss": 3.3074,
"step": 54150
},
{
"epoch": 15.797190159729508,
"grad_norm": 0.3963550329208374,
"learning_rate": 0.0004106771653543306,
"loss": 3.317,
"step": 54200
},
{
"epoch": 15.811764020053632,
"grad_norm": 0.4001551568508148,
"learning_rate": 0.0004105021872265966,
"loss": 3.302,
"step": 54250
},
{
"epoch": 15.826337880377755,
"grad_norm": 0.39264172315597534,
"learning_rate": 0.0004103272090988626,
"loss": 3.3056,
"step": 54300
},
{
"epoch": 15.840911740701877,
"grad_norm": 0.3680339455604553,
"learning_rate": 0.0004101522309711286,
"loss": 3.3132,
"step": 54350
},
{
"epoch": 15.855485601026,
"grad_norm": 0.3351523280143738,
"learning_rate": 0.0004099772528433945,
"loss": 3.318,
"step": 54400
},
{
"epoch": 15.870059461350122,
"grad_norm": 0.3352740705013275,
"learning_rate": 0.0004098022747156605,
"loss": 3.3053,
"step": 54450
},
{
"epoch": 15.884633321674245,
"grad_norm": 0.3434459865093231,
"learning_rate": 0.0004096272965879265,
"loss": 3.3175,
"step": 54500
},
{
"epoch": 15.899207181998367,
"grad_norm": 0.34822505712509155,
"learning_rate": 0.00040945231846019244,
"loss": 3.3014,
"step": 54550
},
{
"epoch": 15.91378104232249,
"grad_norm": 0.37124574184417725,
"learning_rate": 0.0004092773403324584,
"loss": 3.3199,
"step": 54600
},
{
"epoch": 15.928354902646612,
"grad_norm": 0.36846932768821716,
"learning_rate": 0.0004091023622047244,
"loss": 3.3047,
"step": 54650
},
{
"epoch": 15.942928762970736,
"grad_norm": 0.36084410548210144,
"learning_rate": 0.00040892738407699033,
"loss": 3.3154,
"step": 54700
},
{
"epoch": 15.95750262329486,
"grad_norm": 0.37809786200523376,
"learning_rate": 0.0004087524059492563,
"loss": 3.3178,
"step": 54750
},
{
"epoch": 15.97207648361898,
"grad_norm": 0.3330834209918976,
"learning_rate": 0.00040857742782152227,
"loss": 3.3083,
"step": 54800
},
{
"epoch": 15.986650343943104,
"grad_norm": 0.35704752802848816,
"learning_rate": 0.0004084024496937882,
"loss": 3.3223,
"step": 54850
},
{
"epoch": 16.00116590882593,
"grad_norm": 0.3622002601623535,
"learning_rate": 0.0004082274715660542,
"loss": 3.3095,
"step": 54900
},
{
"epoch": 16.015739769150052,
"grad_norm": 0.33063775300979614,
"learning_rate": 0.0004080524934383202,
"loss": 3.1916,
"step": 54950
},
{
"epoch": 16.030313629474175,
"grad_norm": 0.37001755833625793,
"learning_rate": 0.0004078775153105861,
"loss": 3.2054,
"step": 55000
},
{
"epoch": 16.030313629474175,
"eval_accuracy": 0.37094406017044634,
"eval_loss": 3.557634115219116,
"eval_runtime": 53.1562,
"eval_samples_per_second": 312.795,
"eval_steps_per_second": 19.565,
"step": 55000
},
{
"epoch": 16.0448874897983,
"grad_norm": 0.33795633912086487,
"learning_rate": 0.0004077025371828521,
"loss": 3.2113,
"step": 55050
},
{
"epoch": 16.05946135012242,
"grad_norm": 0.37001416087150574,
"learning_rate": 0.0004075275590551181,
"loss": 3.2144,
"step": 55100
},
{
"epoch": 16.074035210446542,
"grad_norm": 0.3721146881580353,
"learning_rate": 0.000407352580927384,
"loss": 3.2185,
"step": 55150
},
{
"epoch": 16.088609070770666,
"grad_norm": 0.3787801265716553,
"learning_rate": 0.00040717760279965,
"loss": 3.2339,
"step": 55200
},
{
"epoch": 16.10318293109479,
"grad_norm": 0.38708654046058655,
"learning_rate": 0.000407002624671916,
"loss": 3.2246,
"step": 55250
},
{
"epoch": 16.117756791418913,
"grad_norm": 0.37298810482025146,
"learning_rate": 0.000406827646544182,
"loss": 3.2327,
"step": 55300
},
{
"epoch": 16.132330651743032,
"grad_norm": 0.3699816167354584,
"learning_rate": 0.00040665266841644786,
"loss": 3.2237,
"step": 55350
},
{
"epoch": 16.146904512067156,
"grad_norm": 0.37289005517959595,
"learning_rate": 0.00040647769028871386,
"loss": 3.2419,
"step": 55400
},
{
"epoch": 16.16147837239128,
"grad_norm": 0.41655048727989197,
"learning_rate": 0.00040630271216097986,
"loss": 3.2349,
"step": 55450
},
{
"epoch": 16.176052232715403,
"grad_norm": 0.36438512802124023,
"learning_rate": 0.00040612773403324586,
"loss": 3.2401,
"step": 55500
},
{
"epoch": 16.190626093039523,
"grad_norm": 0.38352882862091064,
"learning_rate": 0.00040595275590551175,
"loss": 3.2357,
"step": 55550
},
{
"epoch": 16.205199953363646,
"grad_norm": 0.33786365389823914,
"learning_rate": 0.00040577777777777774,
"loss": 3.2352,
"step": 55600
},
{
"epoch": 16.21977381368777,
"grad_norm": 0.3919704854488373,
"learning_rate": 0.00040560279965004374,
"loss": 3.259,
"step": 55650
},
{
"epoch": 16.234347674011893,
"grad_norm": 0.3714604079723358,
"learning_rate": 0.0004054278215223097,
"loss": 3.2443,
"step": 55700
},
{
"epoch": 16.248921534336016,
"grad_norm": 0.3812396824359894,
"learning_rate": 0.00040525284339457563,
"loss": 3.2373,
"step": 55750
},
{
"epoch": 16.263495394660136,
"grad_norm": 0.37564516067504883,
"learning_rate": 0.0004050778652668416,
"loss": 3.2481,
"step": 55800
},
{
"epoch": 16.27806925498426,
"grad_norm": 0.38240739703178406,
"learning_rate": 0.00040490288713910757,
"loss": 3.2488,
"step": 55850
},
{
"epoch": 16.292643115308383,
"grad_norm": 0.387246698141098,
"learning_rate": 0.00040472790901137357,
"loss": 3.2542,
"step": 55900
},
{
"epoch": 16.307216975632507,
"grad_norm": 0.37336498498916626,
"learning_rate": 0.0004045529308836395,
"loss": 3.2679,
"step": 55950
},
{
"epoch": 16.321790835956627,
"grad_norm": 0.3574172854423523,
"learning_rate": 0.00040437795275590546,
"loss": 3.2648,
"step": 56000
},
{
"epoch": 16.321790835956627,
"eval_accuracy": 0.37116627200918223,
"eval_loss": 3.556175947189331,
"eval_runtime": 53.2307,
"eval_samples_per_second": 312.358,
"eval_steps_per_second": 19.538,
"step": 56000
},
{
"epoch": 16.33636469628075,
"grad_norm": 0.3797742426395416,
"learning_rate": 0.00040420297462817145,
"loss": 3.2606,
"step": 56050
},
{
"epoch": 16.350938556604873,
"grad_norm": 0.3520547151565552,
"learning_rate": 0.0004040279965004374,
"loss": 3.2682,
"step": 56100
},
{
"epoch": 16.365512416928997,
"grad_norm": 0.3968909978866577,
"learning_rate": 0.00040385301837270334,
"loss": 3.2687,
"step": 56150
},
{
"epoch": 16.38008627725312,
"grad_norm": 0.4448312819004059,
"learning_rate": 0.00040367804024496934,
"loss": 3.2677,
"step": 56200
},
{
"epoch": 16.39466013757724,
"grad_norm": 0.36881738901138306,
"learning_rate": 0.00040350306211723534,
"loss": 3.26,
"step": 56250
},
{
"epoch": 16.409233997901364,
"grad_norm": 0.3703789710998535,
"learning_rate": 0.0004033280839895012,
"loss": 3.2568,
"step": 56300
},
{
"epoch": 16.423807858225487,
"grad_norm": 0.38579708337783813,
"learning_rate": 0.0004031531058617672,
"loss": 3.265,
"step": 56350
},
{
"epoch": 16.43838171854961,
"grad_norm": 0.35912278294563293,
"learning_rate": 0.0004029781277340332,
"loss": 3.2769,
"step": 56400
},
{
"epoch": 16.45295557887373,
"grad_norm": 0.36150550842285156,
"learning_rate": 0.0004028031496062992,
"loss": 3.2593,
"step": 56450
},
{
"epoch": 16.467529439197854,
"grad_norm": 0.37937209010124207,
"learning_rate": 0.0004026281714785651,
"loss": 3.273,
"step": 56500
},
{
"epoch": 16.482103299521977,
"grad_norm": 0.3345341980457306,
"learning_rate": 0.0004024531933508311,
"loss": 3.274,
"step": 56550
},
{
"epoch": 16.4966771598461,
"grad_norm": 0.36195069551467896,
"learning_rate": 0.0004022782152230971,
"loss": 3.2831,
"step": 56600
},
{
"epoch": 16.511251020170224,
"grad_norm": 0.3733668625354767,
"learning_rate": 0.0004021032370953631,
"loss": 3.2811,
"step": 56650
},
{
"epoch": 16.525824880494344,
"grad_norm": 0.3509543240070343,
"learning_rate": 0.000401928258967629,
"loss": 3.2788,
"step": 56700
},
{
"epoch": 16.540398740818468,
"grad_norm": 0.3814429044723511,
"learning_rate": 0.000401753280839895,
"loss": 3.2759,
"step": 56750
},
{
"epoch": 16.55497260114259,
"grad_norm": 0.38439804315567017,
"learning_rate": 0.000401578302712161,
"loss": 3.2847,
"step": 56800
},
{
"epoch": 16.569546461466715,
"grad_norm": 0.37428414821624756,
"learning_rate": 0.00040140332458442693,
"loss": 3.2765,
"step": 56850
},
{
"epoch": 16.584120321790834,
"grad_norm": 0.36497533321380615,
"learning_rate": 0.00040122834645669287,
"loss": 3.2783,
"step": 56900
},
{
"epoch": 16.598694182114958,
"grad_norm": 0.3993648588657379,
"learning_rate": 0.00040105336832895887,
"loss": 3.2852,
"step": 56950
},
{
"epoch": 16.61326804243908,
"grad_norm": 0.3501555025577545,
"learning_rate": 0.0004008783902012248,
"loss": 3.2897,
"step": 57000
},
{
"epoch": 16.61326804243908,
"eval_accuracy": 0.3714454491709839,
"eval_loss": 3.550541639328003,
"eval_runtime": 53.2304,
"eval_samples_per_second": 312.359,
"eval_steps_per_second": 19.538,
"step": 57000
},
{
"epoch": 16.627841902763205,
"grad_norm": 0.3381880521774292,
"learning_rate": 0.00040070341207349076,
"loss": 3.2955,
"step": 57050
},
{
"epoch": 16.642415763087328,
"grad_norm": 0.35812264680862427,
"learning_rate": 0.00040052843394575675,
"loss": 3.2785,
"step": 57100
},
{
"epoch": 16.656989623411448,
"grad_norm": 0.3534041941165924,
"learning_rate": 0.0004003534558180227,
"loss": 3.2976,
"step": 57150
},
{
"epoch": 16.67156348373557,
"grad_norm": 0.3645527958869934,
"learning_rate": 0.0004001784776902887,
"loss": 3.2784,
"step": 57200
},
{
"epoch": 16.686137344059695,
"grad_norm": 0.3515307307243347,
"learning_rate": 0.00040000349956255464,
"loss": 3.2845,
"step": 57250
},
{
"epoch": 16.70071120438382,
"grad_norm": 0.3494560122489929,
"learning_rate": 0.0003998285214348206,
"loss": 3.2923,
"step": 57300
},
{
"epoch": 16.71528506470794,
"grad_norm": 0.3853073716163635,
"learning_rate": 0.0003996535433070866,
"loss": 3.3093,
"step": 57350
},
{
"epoch": 16.72985892503206,
"grad_norm": 0.357208788394928,
"learning_rate": 0.0003994785651793526,
"loss": 3.3005,
"step": 57400
},
{
"epoch": 16.744432785356185,
"grad_norm": 0.38168585300445557,
"learning_rate": 0.00039930358705161847,
"loss": 3.2879,
"step": 57450
},
{
"epoch": 16.75900664568031,
"grad_norm": 0.3707534968852997,
"learning_rate": 0.00039912860892388446,
"loss": 3.2997,
"step": 57500
},
{
"epoch": 16.773580506004432,
"grad_norm": 0.3866535425186157,
"learning_rate": 0.00039895363079615046,
"loss": 3.2849,
"step": 57550
},
{
"epoch": 16.788154366328552,
"grad_norm": 0.3614501655101776,
"learning_rate": 0.00039877865266841646,
"loss": 3.2929,
"step": 57600
},
{
"epoch": 16.802728226652675,
"grad_norm": 0.35415467619895935,
"learning_rate": 0.00039860367454068235,
"loss": 3.3009,
"step": 57650
},
{
"epoch": 16.8173020869768,
"grad_norm": 0.35893917083740234,
"learning_rate": 0.00039842869641294835,
"loss": 3.2887,
"step": 57700
},
{
"epoch": 16.831875947300922,
"grad_norm": 0.3701547384262085,
"learning_rate": 0.00039825371828521434,
"loss": 3.2951,
"step": 57750
},
{
"epoch": 16.846449807625042,
"grad_norm": 0.3377358615398407,
"learning_rate": 0.00039807874015748023,
"loss": 3.3029,
"step": 57800
},
{
"epoch": 16.861023667949166,
"grad_norm": 0.36560627818107605,
"learning_rate": 0.00039790376202974623,
"loss": 3.2871,
"step": 57850
},
{
"epoch": 16.87559752827329,
"grad_norm": 0.3592166006565094,
"learning_rate": 0.00039772878390201223,
"loss": 3.3013,
"step": 57900
},
{
"epoch": 16.890171388597413,
"grad_norm": 0.362232506275177,
"learning_rate": 0.0003975538057742782,
"loss": 3.3032,
"step": 57950
},
{
"epoch": 16.904745248921536,
"grad_norm": 0.3759749233722687,
"learning_rate": 0.0003973788276465441,
"loss": 3.2938,
"step": 58000
},
{
"epoch": 16.904745248921536,
"eval_accuracy": 0.3717375729971187,
"eval_loss": 3.543179512023926,
"eval_runtime": 53.2256,
"eval_samples_per_second": 312.387,
"eval_steps_per_second": 19.539,
"step": 58000
},
{
"epoch": 16.919319109245656,
"grad_norm": 0.37933215498924255,
"learning_rate": 0.0003972038495188101,
"loss": 3.3026,
"step": 58050
},
{
"epoch": 16.93389296956978,
"grad_norm": 0.3669482469558716,
"learning_rate": 0.0003970288713910761,
"loss": 3.3075,
"step": 58100
},
{
"epoch": 16.948466829893903,
"grad_norm": 0.41558346152305603,
"learning_rate": 0.00039685389326334205,
"loss": 3.2956,
"step": 58150
},
{
"epoch": 16.963040690218026,
"grad_norm": 0.36796823143959045,
"learning_rate": 0.000396678915135608,
"loss": 3.3017,
"step": 58200
},
{
"epoch": 16.977614550542146,
"grad_norm": 0.4123929440975189,
"learning_rate": 0.000396503937007874,
"loss": 3.3156,
"step": 58250
},
{
"epoch": 16.99218841086627,
"grad_norm": 0.37400203943252563,
"learning_rate": 0.00039632895888013994,
"loss": 3.3063,
"step": 58300
},
{
"epoch": 17.006703975749097,
"grad_norm": 0.3638571798801422,
"learning_rate": 0.00039615398075240594,
"loss": 3.251,
"step": 58350
},
{
"epoch": 17.021277836073217,
"grad_norm": 0.37552201747894287,
"learning_rate": 0.0003959790026246719,
"loss": 3.196,
"step": 58400
},
{
"epoch": 17.03585169639734,
"grad_norm": 0.39781540632247925,
"learning_rate": 0.0003958040244969378,
"loss": 3.1953,
"step": 58450
},
{
"epoch": 17.050425556721464,
"grad_norm": 0.37113073468208313,
"learning_rate": 0.0003956290463692038,
"loss": 3.1901,
"step": 58500
},
{
"epoch": 17.064999417045588,
"grad_norm": 0.37127241492271423,
"learning_rate": 0.0003954540682414698,
"loss": 3.201,
"step": 58550
},
{
"epoch": 17.07957327736971,
"grad_norm": 0.3953086733818054,
"learning_rate": 0.0003952790901137357,
"loss": 3.218,
"step": 58600
},
{
"epoch": 17.09414713769383,
"grad_norm": 0.3666759431362152,
"learning_rate": 0.0003951041119860017,
"loss": 3.214,
"step": 58650
},
{
"epoch": 17.108720998017954,
"grad_norm": 0.35260671377182007,
"learning_rate": 0.0003949291338582677,
"loss": 3.2224,
"step": 58700
},
{
"epoch": 17.123294858342078,
"grad_norm": 0.36087656021118164,
"learning_rate": 0.0003947541557305336,
"loss": 3.2197,
"step": 58750
},
{
"epoch": 17.1378687186662,
"grad_norm": 0.41437220573425293,
"learning_rate": 0.0003945791776027996,
"loss": 3.2232,
"step": 58800
},
{
"epoch": 17.15244257899032,
"grad_norm": 0.3841547966003418,
"learning_rate": 0.0003944041994750656,
"loss": 3.2088,
"step": 58850
},
{
"epoch": 17.167016439314445,
"grad_norm": 0.38770514726638794,
"learning_rate": 0.0003942292213473316,
"loss": 3.2212,
"step": 58900
},
{
"epoch": 17.181590299638568,
"grad_norm": 0.43151119351387024,
"learning_rate": 0.0003940542432195975,
"loss": 3.2283,
"step": 58950
},
{
"epoch": 17.19616415996269,
"grad_norm": 0.35698238015174866,
"learning_rate": 0.0003938792650918635,
"loss": 3.2236,
"step": 59000
},
{
"epoch": 17.19616415996269,
"eval_accuracy": 0.37119805018527263,
"eval_loss": 3.561183452606201,
"eval_runtime": 53.3167,
"eval_samples_per_second": 311.853,
"eval_steps_per_second": 19.506,
"step": 59000
},
{
"epoch": 17.210738020286815,
"grad_norm": 0.36982160806655884,
"learning_rate": 0.00039370428696412947,
"loss": 3.2417,
"step": 59050
},
{
"epoch": 17.225311880610935,
"grad_norm": 0.3661283552646637,
"learning_rate": 0.0003935293088363954,
"loss": 3.2369,
"step": 59100
},
{
"epoch": 17.23988574093506,
"grad_norm": 0.37872856855392456,
"learning_rate": 0.00039335433070866136,
"loss": 3.2424,
"step": 59150
},
{
"epoch": 17.25445960125918,
"grad_norm": 0.37743332982063293,
"learning_rate": 0.00039317935258092736,
"loss": 3.2531,
"step": 59200
},
{
"epoch": 17.269033461583305,
"grad_norm": 0.36602476239204407,
"learning_rate": 0.00039300437445319335,
"loss": 3.2485,
"step": 59250
},
{
"epoch": 17.283607321907425,
"grad_norm": 0.38170450925827026,
"learning_rate": 0.0003928293963254593,
"loss": 3.2447,
"step": 59300
},
{
"epoch": 17.29818118223155,
"grad_norm": 0.3535911440849304,
"learning_rate": 0.00039265441819772524,
"loss": 3.2355,
"step": 59350
},
{
"epoch": 17.312755042555672,
"grad_norm": 0.37537553906440735,
"learning_rate": 0.00039247944006999124,
"loss": 3.2474,
"step": 59400
},
{
"epoch": 17.327328902879795,
"grad_norm": 0.3766328990459442,
"learning_rate": 0.0003923044619422572,
"loss": 3.2498,
"step": 59450
},
{
"epoch": 17.34190276320392,
"grad_norm": 0.4029710292816162,
"learning_rate": 0.0003921294838145232,
"loss": 3.2516,
"step": 59500
},
{
"epoch": 17.35647662352804,
"grad_norm": 0.3706645965576172,
"learning_rate": 0.0003919545056867891,
"loss": 3.255,
"step": 59550
},
{
"epoch": 17.371050483852162,
"grad_norm": 0.376094788312912,
"learning_rate": 0.00039177952755905507,
"loss": 3.2536,
"step": 59600
},
{
"epoch": 17.385624344176286,
"grad_norm": 0.3890036642551422,
"learning_rate": 0.00039160454943132106,
"loss": 3.2558,
"step": 59650
},
{
"epoch": 17.40019820450041,
"grad_norm": 0.3617980182170868,
"learning_rate": 0.000391429571303587,
"loss": 3.2597,
"step": 59700
},
{
"epoch": 17.41477206482453,
"grad_norm": 0.36625897884368896,
"learning_rate": 0.00039125459317585295,
"loss": 3.26,
"step": 59750
},
{
"epoch": 17.429345925148652,
"grad_norm": 0.40066832304000854,
"learning_rate": 0.00039107961504811895,
"loss": 3.2589,
"step": 59800
},
{
"epoch": 17.443919785472776,
"grad_norm": 0.36895522475242615,
"learning_rate": 0.00039090463692038495,
"loss": 3.2513,
"step": 59850
},
{
"epoch": 17.4584936457969,
"grad_norm": 0.3618486821651459,
"learning_rate": 0.00039072965879265084,
"loss": 3.2505,
"step": 59900
},
{
"epoch": 17.473067506121023,
"grad_norm": 0.40019845962524414,
"learning_rate": 0.00039055468066491683,
"loss": 3.2599,
"step": 59950
},
{
"epoch": 17.487641366445143,
"grad_norm": 0.3406798541545868,
"learning_rate": 0.00039037970253718283,
"loss": 3.261,
"step": 60000
},
{
"epoch": 17.487641366445143,
"eval_accuracy": 0.3714183788728328,
"eval_loss": 3.5528781414031982,
"eval_runtime": 53.3399,
"eval_samples_per_second": 311.718,
"eval_steps_per_second": 19.498,
"step": 60000
},
{
"epoch": 17.502215226769266,
"grad_norm": 0.3532922565937042,
"learning_rate": 0.00039020472440944883,
"loss": 3.2684,
"step": 60050
},
{
"epoch": 17.51678908709339,
"grad_norm": 0.3447803556919098,
"learning_rate": 0.0003900297462817147,
"loss": 3.2626,
"step": 60100
},
{
"epoch": 17.531362947417513,
"grad_norm": 0.36045655608177185,
"learning_rate": 0.0003898547681539807,
"loss": 3.2602,
"step": 60150
},
{
"epoch": 17.545936807741633,
"grad_norm": 0.37568119168281555,
"learning_rate": 0.0003896797900262467,
"loss": 3.2687,
"step": 60200
},
{
"epoch": 17.560510668065756,
"grad_norm": 0.348172664642334,
"learning_rate": 0.00038950481189851266,
"loss": 3.278,
"step": 60250
},
{
"epoch": 17.57508452838988,
"grad_norm": 0.3671204447746277,
"learning_rate": 0.0003893298337707786,
"loss": 3.2537,
"step": 60300
},
{
"epoch": 17.589658388714003,
"grad_norm": 0.34588101506233215,
"learning_rate": 0.0003891548556430446,
"loss": 3.2723,
"step": 60350
},
{
"epoch": 17.604232249038127,
"grad_norm": 0.3869750201702118,
"learning_rate": 0.00038897987751531054,
"loss": 3.2739,
"step": 60400
},
{
"epoch": 17.618806109362247,
"grad_norm": 0.3896041214466095,
"learning_rate": 0.00038880489938757654,
"loss": 3.2726,
"step": 60450
},
{
"epoch": 17.63337996968637,
"grad_norm": 0.3611544966697693,
"learning_rate": 0.0003886299212598425,
"loss": 3.2795,
"step": 60500
},
{
"epoch": 17.647953830010493,
"grad_norm": 0.3854808509349823,
"learning_rate": 0.0003884549431321085,
"loss": 3.2688,
"step": 60550
},
{
"epoch": 17.662527690334617,
"grad_norm": 0.3506561517715454,
"learning_rate": 0.0003882799650043744,
"loss": 3.2754,
"step": 60600
},
{
"epoch": 17.677101550658737,
"grad_norm": 0.3766607940196991,
"learning_rate": 0.00038810498687664037,
"loss": 3.2741,
"step": 60650
},
{
"epoch": 17.69167541098286,
"grad_norm": 0.3881302773952484,
"learning_rate": 0.00038793000874890637,
"loss": 3.2766,
"step": 60700
},
{
"epoch": 17.706249271306984,
"grad_norm": 0.3620976209640503,
"learning_rate": 0.0003877550306211723,
"loss": 3.278,
"step": 60750
},
{
"epoch": 17.720823131631107,
"grad_norm": 0.3486970365047455,
"learning_rate": 0.0003875800524934383,
"loss": 3.2781,
"step": 60800
},
{
"epoch": 17.73539699195523,
"grad_norm": 0.34223899245262146,
"learning_rate": 0.00038740507436570425,
"loss": 3.2754,
"step": 60850
},
{
"epoch": 17.74997085227935,
"grad_norm": 0.36822447180747986,
"learning_rate": 0.0003872300962379702,
"loss": 3.272,
"step": 60900
},
{
"epoch": 17.764544712603474,
"grad_norm": 0.34223830699920654,
"learning_rate": 0.0003870551181102362,
"loss": 3.2766,
"step": 60950
},
{
"epoch": 17.779118572927597,
"grad_norm": 0.35672491788864136,
"learning_rate": 0.0003868801399825022,
"loss": 3.2844,
"step": 61000
},
{
"epoch": 17.779118572927597,
"eval_accuracy": 0.3721375072280638,
"eval_loss": 3.5414795875549316,
"eval_runtime": 53.2321,
"eval_samples_per_second": 312.349,
"eval_steps_per_second": 19.537,
"step": 61000
},
{
"epoch": 17.79369243325172,
"grad_norm": 0.35861504077911377,
"learning_rate": 0.0003867051618547681,
"loss": 3.2845,
"step": 61050
},
{
"epoch": 17.80826629357584,
"grad_norm": 0.38925936818122864,
"learning_rate": 0.0003865301837270341,
"loss": 3.2804,
"step": 61100
},
{
"epoch": 17.822840153899964,
"grad_norm": 0.38915199041366577,
"learning_rate": 0.0003863552055993001,
"loss": 3.2986,
"step": 61150
},
{
"epoch": 17.837414014224088,
"grad_norm": 0.3660210371017456,
"learning_rate": 0.00038618022747156607,
"loss": 3.3001,
"step": 61200
},
{
"epoch": 17.85198787454821,
"grad_norm": 0.366853266954422,
"learning_rate": 0.00038600524934383196,
"loss": 3.2869,
"step": 61250
},
{
"epoch": 17.866561734872334,
"grad_norm": 0.3562902808189392,
"learning_rate": 0.00038583027121609796,
"loss": 3.2901,
"step": 61300
},
{
"epoch": 17.881135595196454,
"grad_norm": 0.3574168384075165,
"learning_rate": 0.00038565529308836396,
"loss": 3.2821,
"step": 61350
},
{
"epoch": 17.895709455520578,
"grad_norm": 0.3829963207244873,
"learning_rate": 0.00038548031496062984,
"loss": 3.2883,
"step": 61400
},
{
"epoch": 17.9102833158447,
"grad_norm": 0.3606160283088684,
"learning_rate": 0.00038530533683289584,
"loss": 3.2845,
"step": 61450
},
{
"epoch": 17.924857176168825,
"grad_norm": 0.3578321635723114,
"learning_rate": 0.00038513035870516184,
"loss": 3.2801,
"step": 61500
},
{
"epoch": 17.939431036492945,
"grad_norm": 0.3657797574996948,
"learning_rate": 0.0003849553805774278,
"loss": 3.2871,
"step": 61550
},
{
"epoch": 17.954004896817068,
"grad_norm": 0.3794346749782562,
"learning_rate": 0.00038478040244969373,
"loss": 3.2791,
"step": 61600
},
{
"epoch": 17.96857875714119,
"grad_norm": 0.36017322540283203,
"learning_rate": 0.0003846054243219597,
"loss": 3.2846,
"step": 61650
},
{
"epoch": 17.983152617465315,
"grad_norm": 0.36378708481788635,
"learning_rate": 0.00038443044619422567,
"loss": 3.2921,
"step": 61700
},
{
"epoch": 17.99772647778944,
"grad_norm": 0.3741123378276825,
"learning_rate": 0.00038425546806649167,
"loss": 3.2942,
"step": 61750
},
{
"epoch": 18.012242042672263,
"grad_norm": 0.3481272757053375,
"learning_rate": 0.0003840804899387576,
"loss": 3.2076,
"step": 61800
},
{
"epoch": 18.026815902996386,
"grad_norm": 0.3690752685070038,
"learning_rate": 0.0003839055118110236,
"loss": 3.1866,
"step": 61850
},
{
"epoch": 18.04138976332051,
"grad_norm": 0.37555626034736633,
"learning_rate": 0.00038373053368328955,
"loss": 3.1798,
"step": 61900
},
{
"epoch": 18.05596362364463,
"grad_norm": 0.4028767943382263,
"learning_rate": 0.00038355555555555555,
"loss": 3.1654,
"step": 61950
},
{
"epoch": 18.070537483968753,
"grad_norm": 0.3782511353492737,
"learning_rate": 0.0003833805774278215,
"loss": 3.2023,
"step": 62000
},
{
"epoch": 18.070537483968753,
"eval_accuracy": 0.3714931164351195,
"eval_loss": 3.555772542953491,
"eval_runtime": 53.4272,
"eval_samples_per_second": 311.208,
"eval_steps_per_second": 19.466,
"step": 62000
},
{
"epoch": 18.085111344292876,
"grad_norm": 0.3724522590637207,
"learning_rate": 0.00038320559930008744,
"loss": 3.2086,
"step": 62050
},
{
"epoch": 18.099685204617,
"grad_norm": 0.3868659436702728,
"learning_rate": 0.00038303062117235343,
"loss": 3.2042,
"step": 62100
},
{
"epoch": 18.114259064941123,
"grad_norm": 0.37153860926628113,
"learning_rate": 0.00038285564304461943,
"loss": 3.1851,
"step": 62150
},
{
"epoch": 18.128832925265243,
"grad_norm": 0.37508624792099,
"learning_rate": 0.0003826806649168853,
"loss": 3.2067,
"step": 62200
},
{
"epoch": 18.143406785589367,
"grad_norm": 0.38075685501098633,
"learning_rate": 0.0003825056867891513,
"loss": 3.2042,
"step": 62250
},
{
"epoch": 18.15798064591349,
"grad_norm": 0.36698952317237854,
"learning_rate": 0.0003823307086614173,
"loss": 3.2204,
"step": 62300
},
{
"epoch": 18.172554506237613,
"grad_norm": 0.36773785948753357,
"learning_rate": 0.0003821557305336832,
"loss": 3.2278,
"step": 62350
},
{
"epoch": 18.187128366561733,
"grad_norm": 0.4154503047466278,
"learning_rate": 0.0003819807524059492,
"loss": 3.2207,
"step": 62400
},
{
"epoch": 18.201702226885857,
"grad_norm": 0.36895978450775146,
"learning_rate": 0.0003818057742782152,
"loss": 3.2159,
"step": 62450
},
{
"epoch": 18.21627608720998,
"grad_norm": 0.3558099865913391,
"learning_rate": 0.0003816307961504812,
"loss": 3.2119,
"step": 62500
},
{
"epoch": 18.230849947534104,
"grad_norm": 0.36864882707595825,
"learning_rate": 0.0003814558180227471,
"loss": 3.2354,
"step": 62550
},
{
"epoch": 18.245423807858227,
"grad_norm": 0.3729044795036316,
"learning_rate": 0.0003812808398950131,
"loss": 3.2194,
"step": 62600
},
{
"epoch": 18.259997668182347,
"grad_norm": 0.36386367678642273,
"learning_rate": 0.0003811058617672791,
"loss": 3.2238,
"step": 62650
},
{
"epoch": 18.27457152850647,
"grad_norm": 0.37404847145080566,
"learning_rate": 0.000380930883639545,
"loss": 3.2283,
"step": 62700
},
{
"epoch": 18.289145388830594,
"grad_norm": 0.38174042105674744,
"learning_rate": 0.00038075590551181097,
"loss": 3.2369,
"step": 62750
},
{
"epoch": 18.303719249154717,
"grad_norm": 0.37085598707199097,
"learning_rate": 0.00038058092738407697,
"loss": 3.2269,
"step": 62800
},
{
"epoch": 18.318293109478837,
"grad_norm": 0.37157735228538513,
"learning_rate": 0.0003804059492563429,
"loss": 3.2349,
"step": 62850
},
{
"epoch": 18.33286696980296,
"grad_norm": 0.42219650745391846,
"learning_rate": 0.0003802309711286089,
"loss": 3.2473,
"step": 62900
},
{
"epoch": 18.347440830127084,
"grad_norm": 0.3765549957752228,
"learning_rate": 0.00038005599300087485,
"loss": 3.2328,
"step": 62950
},
{
"epoch": 18.362014690451208,
"grad_norm": 0.3612719178199768,
"learning_rate": 0.0003798810148731408,
"loss": 3.2502,
"step": 63000
},
{
"epoch": 18.362014690451208,
"eval_accuracy": 0.37206123960544685,
"eval_loss": 3.550529956817627,
"eval_runtime": 53.2833,
"eval_samples_per_second": 312.049,
"eval_steps_per_second": 19.518,
"step": 63000
},
{
"epoch": 18.37658855077533,
"grad_norm": 0.3705293536186218,
"learning_rate": 0.0003797060367454068,
"loss": 3.2415,
"step": 63050
},
{
"epoch": 18.39116241109945,
"grad_norm": 0.406534880399704,
"learning_rate": 0.0003795310586176728,
"loss": 3.2368,
"step": 63100
},
{
"epoch": 18.405736271423574,
"grad_norm": 0.4121636748313904,
"learning_rate": 0.00037935608048993873,
"loss": 3.2456,
"step": 63150
},
{
"epoch": 18.420310131747698,
"grad_norm": 0.35870444774627686,
"learning_rate": 0.0003791811023622047,
"loss": 3.2387,
"step": 63200
},
{
"epoch": 18.43488399207182,
"grad_norm": 0.3755796551704407,
"learning_rate": 0.0003790061242344707,
"loss": 3.2539,
"step": 63250
},
{
"epoch": 18.44945785239594,
"grad_norm": 0.3582393229007721,
"learning_rate": 0.0003788311461067366,
"loss": 3.2442,
"step": 63300
},
{
"epoch": 18.464031712720065,
"grad_norm": 0.37926796078681946,
"learning_rate": 0.00037865616797900256,
"loss": 3.2481,
"step": 63350
},
{
"epoch": 18.478605573044188,
"grad_norm": 0.3608848452568054,
"learning_rate": 0.00037848118985126856,
"loss": 3.2444,
"step": 63400
},
{
"epoch": 18.49317943336831,
"grad_norm": 0.40005603432655334,
"learning_rate": 0.00037830621172353456,
"loss": 3.2428,
"step": 63450
},
{
"epoch": 18.507753293692435,
"grad_norm": 0.3767496347427368,
"learning_rate": 0.00037813123359580045,
"loss": 3.2603,
"step": 63500
},
{
"epoch": 18.522327154016555,
"grad_norm": 0.36244866251945496,
"learning_rate": 0.00037795625546806644,
"loss": 3.2525,
"step": 63550
},
{
"epoch": 18.53690101434068,
"grad_norm": 0.3706715703010559,
"learning_rate": 0.00037778127734033244,
"loss": 3.2555,
"step": 63600
},
{
"epoch": 18.5514748746648,
"grad_norm": 0.3899987041950226,
"learning_rate": 0.00037760629921259844,
"loss": 3.2556,
"step": 63650
},
{
"epoch": 18.566048734988925,
"grad_norm": 0.38272711634635925,
"learning_rate": 0.00037743132108486433,
"loss": 3.2559,
"step": 63700
},
{
"epoch": 18.580622595313045,
"grad_norm": 0.38435348868370056,
"learning_rate": 0.0003772563429571303,
"loss": 3.2729,
"step": 63750
},
{
"epoch": 18.59519645563717,
"grad_norm": 0.3879915177822113,
"learning_rate": 0.0003770813648293963,
"loss": 3.2638,
"step": 63800
},
{
"epoch": 18.609770315961292,
"grad_norm": 0.3937206268310547,
"learning_rate": 0.00037690638670166227,
"loss": 3.2614,
"step": 63850
},
{
"epoch": 18.624344176285415,
"grad_norm": 0.37411096692085266,
"learning_rate": 0.0003767314085739282,
"loss": 3.2676,
"step": 63900
},
{
"epoch": 18.63891803660954,
"grad_norm": 0.368111252784729,
"learning_rate": 0.0003765564304461942,
"loss": 3.2642,
"step": 63950
},
{
"epoch": 18.65349189693366,
"grad_norm": 0.35708072781562805,
"learning_rate": 0.00037638145231846015,
"loss": 3.2564,
"step": 64000
},
{
"epoch": 18.65349189693366,
"eval_accuracy": 0.37207807026907996,
"eval_loss": 3.546232223510742,
"eval_runtime": 53.2641,
"eval_samples_per_second": 312.162,
"eval_steps_per_second": 19.525,
"step": 64000
},
{
"epoch": 18.668065757257782,
"grad_norm": 0.3970872461795807,
"learning_rate": 0.00037620647419072615,
"loss": 3.2605,
"step": 64050
},
{
"epoch": 18.682639617581906,
"grad_norm": 0.35976195335388184,
"learning_rate": 0.0003760314960629921,
"loss": 3.2583,
"step": 64100
},
{
"epoch": 18.69721347790603,
"grad_norm": 0.3930548131465912,
"learning_rate": 0.00037585651793525804,
"loss": 3.2628,
"step": 64150
},
{
"epoch": 18.71178733823015,
"grad_norm": 0.40853869915008545,
"learning_rate": 0.00037568153980752404,
"loss": 3.265,
"step": 64200
},
{
"epoch": 18.726361198554272,
"grad_norm": 0.3586309850215912,
"learning_rate": 0.00037550656167979,
"loss": 3.2652,
"step": 64250
},
{
"epoch": 18.740935058878396,
"grad_norm": 0.34977707266807556,
"learning_rate": 0.0003753315835520559,
"loss": 3.2499,
"step": 64300
},
{
"epoch": 18.75550891920252,
"grad_norm": 0.3699282109737396,
"learning_rate": 0.0003751566054243219,
"loss": 3.2593,
"step": 64350
},
{
"epoch": 18.770082779526643,
"grad_norm": 0.3860742747783661,
"learning_rate": 0.0003749816272965879,
"loss": 3.2671,
"step": 64400
},
{
"epoch": 18.784656639850763,
"grad_norm": 0.35206711292266846,
"learning_rate": 0.00037480664916885386,
"loss": 3.2757,
"step": 64450
},
{
"epoch": 18.799230500174886,
"grad_norm": 0.40201523900032043,
"learning_rate": 0.0003746316710411198,
"loss": 3.2733,
"step": 64500
},
{
"epoch": 18.81380436049901,
"grad_norm": 0.35394608974456787,
"learning_rate": 0.0003744566929133858,
"loss": 3.2763,
"step": 64550
},
{
"epoch": 18.828378220823133,
"grad_norm": 0.364339679479599,
"learning_rate": 0.0003742817147856518,
"loss": 3.2755,
"step": 64600
},
{
"epoch": 18.842952081147253,
"grad_norm": 0.3828820288181305,
"learning_rate": 0.0003741067366579177,
"loss": 3.2687,
"step": 64650
},
{
"epoch": 18.857525941471376,
"grad_norm": 0.37939324975013733,
"learning_rate": 0.0003739317585301837,
"loss": 3.2795,
"step": 64700
},
{
"epoch": 18.8720998017955,
"grad_norm": 0.39449378848075867,
"learning_rate": 0.0003737567804024497,
"loss": 3.2844,
"step": 64750
},
{
"epoch": 18.886673662119623,
"grad_norm": 0.3925759494304657,
"learning_rate": 0.0003735818022747157,
"loss": 3.268,
"step": 64800
},
{
"epoch": 18.901247522443747,
"grad_norm": 0.3670575022697449,
"learning_rate": 0.00037340682414698157,
"loss": 3.2804,
"step": 64850
},
{
"epoch": 18.915821382767867,
"grad_norm": 0.38824647665023804,
"learning_rate": 0.00037323184601924757,
"loss": 3.272,
"step": 64900
},
{
"epoch": 18.93039524309199,
"grad_norm": 0.36675015091896057,
"learning_rate": 0.00037305686789151357,
"loss": 3.2985,
"step": 64950
},
{
"epoch": 18.944969103416113,
"grad_norm": 0.3709295690059662,
"learning_rate": 0.00037288188976377946,
"loss": 3.2816,
"step": 65000
},
{
"epoch": 18.944969103416113,
"eval_accuracy": 0.3725047217073308,
"eval_loss": 3.5384469032287598,
"eval_runtime": 53.2486,
"eval_samples_per_second": 312.252,
"eval_steps_per_second": 19.531,
"step": 65000
},
{
"epoch": 18.959542963740237,
"grad_norm": 0.3714747726917267,
"learning_rate": 0.00037270691163604545,
"loss": 3.2805,
"step": 65050
},
{
"epoch": 18.974116824064357,
"grad_norm": 0.36215534806251526,
"learning_rate": 0.00037253193350831145,
"loss": 3.2795,
"step": 65100
},
{
"epoch": 18.98869068438848,
"grad_norm": 0.3677482008934021,
"learning_rate": 0.0003723569553805774,
"loss": 3.2835,
"step": 65150
},
{
"epoch": 19.003206249271308,
"grad_norm": 0.3807225823402405,
"learning_rate": 0.00037218197725284334,
"loss": 3.2563,
"step": 65200
},
{
"epoch": 19.017780109595428,
"grad_norm": 0.3954845070838928,
"learning_rate": 0.00037200699912510934,
"loss": 3.1662,
"step": 65250
},
{
"epoch": 19.03235396991955,
"grad_norm": 0.3760719299316406,
"learning_rate": 0.0003718320209973753,
"loss": 3.1775,
"step": 65300
},
{
"epoch": 19.046927830243675,
"grad_norm": 0.3775408864021301,
"learning_rate": 0.0003716570428696413,
"loss": 3.1867,
"step": 65350
},
{
"epoch": 19.0615016905678,
"grad_norm": 0.3811887502670288,
"learning_rate": 0.0003714820647419072,
"loss": 3.1738,
"step": 65400
},
{
"epoch": 19.07607555089192,
"grad_norm": 0.36342817544937134,
"learning_rate": 0.00037130708661417316,
"loss": 3.1776,
"step": 65450
},
{
"epoch": 19.09064941121604,
"grad_norm": 0.3706771433353424,
"learning_rate": 0.00037113210848643916,
"loss": 3.1833,
"step": 65500
},
{
"epoch": 19.105223271540165,
"grad_norm": 0.3676753640174866,
"learning_rate": 0.00037095713035870516,
"loss": 3.1831,
"step": 65550
},
{
"epoch": 19.11979713186429,
"grad_norm": 0.3650805354118347,
"learning_rate": 0.00037078215223097105,
"loss": 3.2063,
"step": 65600
},
{
"epoch": 19.134370992188412,
"grad_norm": 0.3826095759868622,
"learning_rate": 0.00037060717410323705,
"loss": 3.191,
"step": 65650
},
{
"epoch": 19.148944852512532,
"grad_norm": 0.39902034401893616,
"learning_rate": 0.00037043219597550304,
"loss": 3.1941,
"step": 65700
},
{
"epoch": 19.163518712836655,
"grad_norm": 0.4051700234413147,
"learning_rate": 0.00037025721784776904,
"loss": 3.2042,
"step": 65750
},
{
"epoch": 19.17809257316078,
"grad_norm": 0.3963977098464966,
"learning_rate": 0.00037008223972003493,
"loss": 3.1994,
"step": 65800
},
{
"epoch": 19.192666433484902,
"grad_norm": 0.4009312093257904,
"learning_rate": 0.00036990726159230093,
"loss": 3.206,
"step": 65850
},
{
"epoch": 19.207240293809026,
"grad_norm": 0.4113180637359619,
"learning_rate": 0.0003697322834645669,
"loss": 3.214,
"step": 65900
},
{
"epoch": 19.221814154133146,
"grad_norm": 0.37608832120895386,
"learning_rate": 0.0003695573053368328,
"loss": 3.2084,
"step": 65950
},
{
"epoch": 19.23638801445727,
"grad_norm": 0.3920981287956238,
"learning_rate": 0.0003693823272090988,
"loss": 3.2194,
"step": 66000
},
{
"epoch": 19.23638801445727,
"eval_accuracy": 0.37180807346926,
"eval_loss": 3.55633544921875,
"eval_runtime": 53.3372,
"eval_samples_per_second": 311.733,
"eval_steps_per_second": 19.499,
"step": 66000
},
{
"epoch": 19.250961874781392,
"grad_norm": 0.358112633228302,
"learning_rate": 0.0003692073490813648,
"loss": 3.2184,
"step": 66050
},
{
"epoch": 19.265535735105516,
"grad_norm": 0.38964906334877014,
"learning_rate": 0.0003690323709536308,
"loss": 3.2143,
"step": 66100
},
{
"epoch": 19.280109595429636,
"grad_norm": 0.39134010672569275,
"learning_rate": 0.0003688573928258967,
"loss": 3.2149,
"step": 66150
},
{
"epoch": 19.29468345575376,
"grad_norm": 0.4015119969844818,
"learning_rate": 0.0003686824146981627,
"loss": 3.23,
"step": 66200
},
{
"epoch": 19.309257316077883,
"grad_norm": 0.3915894329547882,
"learning_rate": 0.0003685074365704287,
"loss": 3.2236,
"step": 66250
},
{
"epoch": 19.323831176402006,
"grad_norm": 0.35716143250465393,
"learning_rate": 0.00036833245844269464,
"loss": 3.2368,
"step": 66300
},
{
"epoch": 19.33840503672613,
"grad_norm": 0.40372762084007263,
"learning_rate": 0.0003681574803149606,
"loss": 3.2335,
"step": 66350
},
{
"epoch": 19.35297889705025,
"grad_norm": 0.3974008858203888,
"learning_rate": 0.0003679825021872266,
"loss": 3.2347,
"step": 66400
},
{
"epoch": 19.367552757374373,
"grad_norm": 0.35377946496009827,
"learning_rate": 0.0003678075240594925,
"loss": 3.2272,
"step": 66450
},
{
"epoch": 19.382126617698496,
"grad_norm": 0.3589801788330078,
"learning_rate": 0.0003676325459317585,
"loss": 3.2309,
"step": 66500
},
{
"epoch": 19.39670047802262,
"grad_norm": 0.3724297881126404,
"learning_rate": 0.00036745756780402446,
"loss": 3.245,
"step": 66550
},
{
"epoch": 19.41127433834674,
"grad_norm": 0.4001477360725403,
"learning_rate": 0.0003672825896762904,
"loss": 3.2319,
"step": 66600
},
{
"epoch": 19.425848198670863,
"grad_norm": 0.3749985694885254,
"learning_rate": 0.0003671076115485564,
"loss": 3.2322,
"step": 66650
},
{
"epoch": 19.440422058994987,
"grad_norm": 0.37100356817245483,
"learning_rate": 0.0003669326334208224,
"loss": 3.2318,
"step": 66700
},
{
"epoch": 19.45499591931911,
"grad_norm": 0.36847561597824097,
"learning_rate": 0.0003667576552930883,
"loss": 3.226,
"step": 66750
},
{
"epoch": 19.469569779643233,
"grad_norm": 0.4181145429611206,
"learning_rate": 0.0003665826771653543,
"loss": 3.2353,
"step": 66800
},
{
"epoch": 19.484143639967353,
"grad_norm": 0.3576458990573883,
"learning_rate": 0.0003664076990376203,
"loss": 3.2464,
"step": 66850
},
{
"epoch": 19.498717500291477,
"grad_norm": 0.39416128396987915,
"learning_rate": 0.0003662327209098862,
"loss": 3.2355,
"step": 66900
},
{
"epoch": 19.5132913606156,
"grad_norm": 0.40888020396232605,
"learning_rate": 0.0003660577427821522,
"loss": 3.2547,
"step": 66950
},
{
"epoch": 19.527865220939724,
"grad_norm": 0.38048359751701355,
"learning_rate": 0.00036588276465441817,
"loss": 3.2408,
"step": 67000
},
{
"epoch": 19.527865220939724,
"eval_accuracy": 0.37236913482267836,
"eval_loss": 3.5473415851593018,
"eval_runtime": 53.2303,
"eval_samples_per_second": 312.359,
"eval_steps_per_second": 19.538,
"step": 67000
},
{
"epoch": 19.542439081263844,
"grad_norm": 0.3875918388366699,
"learning_rate": 0.00036570778652668417,
"loss": 3.2431,
"step": 67050
},
{
"epoch": 19.557012941587967,
"grad_norm": 0.3789691627025604,
"learning_rate": 0.00036553280839895006,
"loss": 3.2548,
"step": 67100
},
{
"epoch": 19.57158680191209,
"grad_norm": 0.3792179822921753,
"learning_rate": 0.00036535783027121606,
"loss": 3.2544,
"step": 67150
},
{
"epoch": 19.586160662236214,
"grad_norm": 0.3668409585952759,
"learning_rate": 0.00036518285214348205,
"loss": 3.2473,
"step": 67200
},
{
"epoch": 19.600734522560337,
"grad_norm": 0.3650484085083008,
"learning_rate": 0.00036500787401574805,
"loss": 3.2405,
"step": 67250
},
{
"epoch": 19.615308382884457,
"grad_norm": 0.3609619140625,
"learning_rate": 0.00036483289588801394,
"loss": 3.249,
"step": 67300
},
{
"epoch": 19.62988224320858,
"grad_norm": 0.3582451045513153,
"learning_rate": 0.00036465791776027994,
"loss": 3.2368,
"step": 67350
},
{
"epoch": 19.644456103532704,
"grad_norm": 0.37356671690940857,
"learning_rate": 0.00036448293963254594,
"loss": 3.2464,
"step": 67400
},
{
"epoch": 19.659029963856828,
"grad_norm": 0.42684417963027954,
"learning_rate": 0.0003643079615048119,
"loss": 3.2604,
"step": 67450
},
{
"epoch": 19.673603824180947,
"grad_norm": 0.4055967330932617,
"learning_rate": 0.0003641329833770778,
"loss": 3.26,
"step": 67500
},
{
"epoch": 19.68817768450507,
"grad_norm": 0.34891846776008606,
"learning_rate": 0.0003639580052493438,
"loss": 3.2648,
"step": 67550
},
{
"epoch": 19.702751544829194,
"grad_norm": 0.3491284251213074,
"learning_rate": 0.00036378302712160976,
"loss": 3.2526,
"step": 67600
},
{
"epoch": 19.717325405153318,
"grad_norm": 0.39875558018684387,
"learning_rate": 0.00036360804899387576,
"loss": 3.2551,
"step": 67650
},
{
"epoch": 19.73189926547744,
"grad_norm": 0.344468355178833,
"learning_rate": 0.0003634330708661417,
"loss": 3.2551,
"step": 67700
},
{
"epoch": 19.74647312580156,
"grad_norm": 0.3823426067829132,
"learning_rate": 0.00036325809273840765,
"loss": 3.2577,
"step": 67750
},
{
"epoch": 19.761046986125685,
"grad_norm": 0.4144747257232666,
"learning_rate": 0.00036308311461067365,
"loss": 3.2515,
"step": 67800
},
{
"epoch": 19.775620846449808,
"grad_norm": 0.3994449973106384,
"learning_rate": 0.0003629081364829396,
"loss": 3.245,
"step": 67850
},
{
"epoch": 19.79019470677393,
"grad_norm": 0.38475626707077026,
"learning_rate": 0.00036273315835520553,
"loss": 3.2694,
"step": 67900
},
{
"epoch": 19.80476856709805,
"grad_norm": 0.379226952791214,
"learning_rate": 0.00036255818022747153,
"loss": 3.2528,
"step": 67950
},
{
"epoch": 19.819342427422175,
"grad_norm": 0.42125728726387024,
"learning_rate": 0.00036238320209973753,
"loss": 3.2572,
"step": 68000
},
{
"epoch": 19.819342427422175,
"eval_accuracy": 0.3727095143976912,
"eval_loss": 3.5435328483581543,
"eval_runtime": 53.3288,
"eval_samples_per_second": 311.783,
"eval_steps_per_second": 19.502,
"step": 68000
},
{
"epoch": 19.8339162877463,
"grad_norm": 0.3799435496330261,
"learning_rate": 0.0003622082239720034,
"loss": 3.2545,
"step": 68050
},
{
"epoch": 19.84849014807042,
"grad_norm": 0.3758162260055542,
"learning_rate": 0.0003620332458442694,
"loss": 3.2655,
"step": 68100
},
{
"epoch": 19.863064008394545,
"grad_norm": 0.4051867723464966,
"learning_rate": 0.0003618582677165354,
"loss": 3.2621,
"step": 68150
},
{
"epoch": 19.877637868718665,
"grad_norm": 0.36065474152565,
"learning_rate": 0.0003616832895888014,
"loss": 3.258,
"step": 68200
},
{
"epoch": 19.89221172904279,
"grad_norm": 0.3851832449436188,
"learning_rate": 0.0003615083114610673,
"loss": 3.2575,
"step": 68250
},
{
"epoch": 19.906785589366912,
"grad_norm": 0.390201210975647,
"learning_rate": 0.0003613333333333333,
"loss": 3.2591,
"step": 68300
},
{
"epoch": 19.921359449691035,
"grad_norm": 0.36681830883026123,
"learning_rate": 0.0003611583552055993,
"loss": 3.2666,
"step": 68350
},
{
"epoch": 19.935933310015155,
"grad_norm": 0.38141345977783203,
"learning_rate": 0.00036098337707786524,
"loss": 3.2674,
"step": 68400
},
{
"epoch": 19.95050717033928,
"grad_norm": 0.3752776086330414,
"learning_rate": 0.0003608083989501312,
"loss": 3.2611,
"step": 68450
},
{
"epoch": 19.965081030663402,
"grad_norm": 0.40742582082748413,
"learning_rate": 0.0003606334208223972,
"loss": 3.2726,
"step": 68500
},
{
"epoch": 19.979654890987526,
"grad_norm": 0.37358659505844116,
"learning_rate": 0.0003604584426946632,
"loss": 3.2807,
"step": 68550
},
{
"epoch": 19.99422875131165,
"grad_norm": 0.37046629190444946,
"learning_rate": 0.00036028346456692907,
"loss": 3.2751,
"step": 68600
},
{
"epoch": 20.008744316194473,
"grad_norm": 0.3958808481693268,
"learning_rate": 0.00036010848643919507,
"loss": 3.2098,
"step": 68650
},
{
"epoch": 20.023318176518597,
"grad_norm": 0.38605043292045593,
"learning_rate": 0.00035993350831146106,
"loss": 3.1574,
"step": 68700
},
{
"epoch": 20.03789203684272,
"grad_norm": 0.3664279878139496,
"learning_rate": 0.000359758530183727,
"loss": 3.1624,
"step": 68750
},
{
"epoch": 20.05246589716684,
"grad_norm": 0.4011380970478058,
"learning_rate": 0.00035958355205599295,
"loss": 3.1759,
"step": 68800
},
{
"epoch": 20.067039757490964,
"grad_norm": 0.4068359136581421,
"learning_rate": 0.00035940857392825895,
"loss": 3.1689,
"step": 68850
},
{
"epoch": 20.081613617815087,
"grad_norm": 0.41633710265159607,
"learning_rate": 0.0003592335958005249,
"loss": 3.1759,
"step": 68900
},
{
"epoch": 20.09618747813921,
"grad_norm": 0.38761988282203674,
"learning_rate": 0.0003590586176727909,
"loss": 3.1788,
"step": 68950
},
{
"epoch": 20.110761338463334,
"grad_norm": 0.36621686816215515,
"learning_rate": 0.00035888363954505683,
"loss": 3.2013,
"step": 69000
},
{
"epoch": 20.110761338463334,
"eval_accuracy": 0.3719890913760268,
"eval_loss": 3.554094076156616,
"eval_runtime": 53.1883,
"eval_samples_per_second": 312.606,
"eval_steps_per_second": 19.553,
"step": 69000
},
{
"epoch": 20.125335198787454,
"grad_norm": 0.38778454065322876,
"learning_rate": 0.0003587086614173228,
"loss": 3.1886,
"step": 69050
},
{
"epoch": 20.139909059111577,
"grad_norm": 0.39453768730163574,
"learning_rate": 0.0003585336832895888,
"loss": 3.1855,
"step": 69100
},
{
"epoch": 20.1544829194357,
"grad_norm": 0.3769824802875519,
"learning_rate": 0.00035835870516185477,
"loss": 3.1867,
"step": 69150
},
{
"epoch": 20.169056779759824,
"grad_norm": 0.40915030241012573,
"learning_rate": 0.00035818372703412066,
"loss": 3.1816,
"step": 69200
},
{
"epoch": 20.183630640083944,
"grad_norm": 0.40823474526405334,
"learning_rate": 0.00035800874890638666,
"loss": 3.1942,
"step": 69250
},
{
"epoch": 20.198204500408067,
"grad_norm": 0.40230390429496765,
"learning_rate": 0.00035783377077865266,
"loss": 3.2091,
"step": 69300
},
{
"epoch": 20.21277836073219,
"grad_norm": 0.4020078480243683,
"learning_rate": 0.00035765879265091865,
"loss": 3.1972,
"step": 69350
},
{
"epoch": 20.227352221056314,
"grad_norm": 0.3979397416114807,
"learning_rate": 0.00035748381452318454,
"loss": 3.1991,
"step": 69400
},
{
"epoch": 20.241926081380438,
"grad_norm": 0.35688549280166626,
"learning_rate": 0.00035730883639545054,
"loss": 3.2108,
"step": 69450
},
{
"epoch": 20.256499941704558,
"grad_norm": 0.3942968249320984,
"learning_rate": 0.00035713385826771654,
"loss": 3.2102,
"step": 69500
},
{
"epoch": 20.27107380202868,
"grad_norm": 0.4229877293109894,
"learning_rate": 0.00035695888013998243,
"loss": 3.1885,
"step": 69550
},
{
"epoch": 20.285647662352805,
"grad_norm": 0.36158671975135803,
"learning_rate": 0.0003567839020122484,
"loss": 3.2207,
"step": 69600
},
{
"epoch": 20.300221522676928,
"grad_norm": 0.40409842133522034,
"learning_rate": 0.0003566089238845144,
"loss": 3.2079,
"step": 69650
},
{
"epoch": 20.314795383001048,
"grad_norm": 0.385179340839386,
"learning_rate": 0.00035643394575678037,
"loss": 3.2049,
"step": 69700
},
{
"epoch": 20.32936924332517,
"grad_norm": 0.4204241931438446,
"learning_rate": 0.0003562589676290463,
"loss": 3.2025,
"step": 69750
},
{
"epoch": 20.343943103649295,
"grad_norm": 0.38813483715057373,
"learning_rate": 0.0003560839895013123,
"loss": 3.2236,
"step": 69800
},
{
"epoch": 20.35851696397342,
"grad_norm": 0.35402917861938477,
"learning_rate": 0.0003559090113735783,
"loss": 3.212,
"step": 69850
},
{
"epoch": 20.37309082429754,
"grad_norm": 0.40892571210861206,
"learning_rate": 0.00035573403324584425,
"loss": 3.2152,
"step": 69900
},
{
"epoch": 20.38766468462166,
"grad_norm": 0.3680976927280426,
"learning_rate": 0.0003555590551181102,
"loss": 3.2247,
"step": 69950
},
{
"epoch": 20.402238544945785,
"grad_norm": 0.39239105582237244,
"learning_rate": 0.0003553840769903762,
"loss": 3.2131,
"step": 70000
},
{
"epoch": 20.402238544945785,
"eval_accuracy": 0.372465057835692,
"eval_loss": 3.5467369556427,
"eval_runtime": 53.4137,
"eval_samples_per_second": 311.287,
"eval_steps_per_second": 19.471,
"step": 70000
},
{
"epoch": 20.41681240526991,
"grad_norm": 0.38115379214286804,
"learning_rate": 0.00035520909886264213,
"loss": 3.2223,
"step": 70050
},
{
"epoch": 20.431386265594032,
"grad_norm": 0.38737982511520386,
"learning_rate": 0.00035503412073490813,
"loss": 3.2343,
"step": 70100
},
{
"epoch": 20.445960125918152,
"grad_norm": 0.3989051282405853,
"learning_rate": 0.0003548591426071741,
"loss": 3.2201,
"step": 70150
},
{
"epoch": 20.460533986242275,
"grad_norm": 0.3883934020996094,
"learning_rate": 0.00035468416447944,
"loss": 3.2349,
"step": 70200
},
{
"epoch": 20.4751078465664,
"grad_norm": 0.39728063344955444,
"learning_rate": 0.000354509186351706,
"loss": 3.2319,
"step": 70250
},
{
"epoch": 20.489681706890522,
"grad_norm": 0.38918405771255493,
"learning_rate": 0.000354334208223972,
"loss": 3.2266,
"step": 70300
},
{
"epoch": 20.504255567214642,
"grad_norm": 0.397217333316803,
"learning_rate": 0.0003541592300962379,
"loss": 3.2362,
"step": 70350
},
{
"epoch": 20.518829427538765,
"grad_norm": 0.39398711919784546,
"learning_rate": 0.0003539842519685039,
"loss": 3.2298,
"step": 70400
},
{
"epoch": 20.53340328786289,
"grad_norm": 0.3995719850063324,
"learning_rate": 0.0003538092738407699,
"loss": 3.2338,
"step": 70450
},
{
"epoch": 20.547977148187012,
"grad_norm": 0.4090834856033325,
"learning_rate": 0.0003536342957130358,
"loss": 3.2183,
"step": 70500
},
{
"epoch": 20.562551008511136,
"grad_norm": 0.38448628783226013,
"learning_rate": 0.0003534593175853018,
"loss": 3.226,
"step": 70550
},
{
"epoch": 20.577124868835256,
"grad_norm": 0.39190101623535156,
"learning_rate": 0.0003532843394575678,
"loss": 3.2334,
"step": 70600
},
{
"epoch": 20.59169872915938,
"grad_norm": 0.39090031385421753,
"learning_rate": 0.0003531093613298338,
"loss": 3.248,
"step": 70650
},
{
"epoch": 20.606272589483503,
"grad_norm": 0.41073334217071533,
"learning_rate": 0.00035293438320209967,
"loss": 3.2492,
"step": 70700
},
{
"epoch": 20.620846449807626,
"grad_norm": 0.3776114583015442,
"learning_rate": 0.00035275940507436567,
"loss": 3.2373,
"step": 70750
},
{
"epoch": 20.63542031013175,
"grad_norm": 0.4018295109272003,
"learning_rate": 0.00035258442694663166,
"loss": 3.2414,
"step": 70800
},
{
"epoch": 20.64999417045587,
"grad_norm": 0.35556280612945557,
"learning_rate": 0.0003524094488188976,
"loss": 3.2356,
"step": 70850
},
{
"epoch": 20.664568030779993,
"grad_norm": 0.3811604380607605,
"learning_rate": 0.00035223447069116355,
"loss": 3.2301,
"step": 70900
},
{
"epoch": 20.679141891104116,
"grad_norm": 0.3758731484413147,
"learning_rate": 0.00035205949256342955,
"loss": 3.2462,
"step": 70950
},
{
"epoch": 20.69371575142824,
"grad_norm": 0.403804749250412,
"learning_rate": 0.0003518845144356955,
"loss": 3.2408,
"step": 71000
},
{
"epoch": 20.69371575142824,
"eval_accuracy": 0.3727085728221033,
"eval_loss": 3.5426127910614014,
"eval_runtime": 53.2562,
"eval_samples_per_second": 312.208,
"eval_steps_per_second": 19.528,
"step": 71000
},
{
"epoch": 20.70828961175236,
"grad_norm": 0.41654902696609497,
"learning_rate": 0.0003517095363079615,
"loss": 3.2509,
"step": 71050
},
{
"epoch": 20.722863472076483,
"grad_norm": 0.3992166817188263,
"learning_rate": 0.00035153455818022743,
"loss": 3.2378,
"step": 71100
},
{
"epoch": 20.737437332400606,
"grad_norm": 0.4075424075126648,
"learning_rate": 0.00035135958005249343,
"loss": 3.2402,
"step": 71150
},
{
"epoch": 20.75201119272473,
"grad_norm": 0.404988557100296,
"learning_rate": 0.0003511846019247594,
"loss": 3.251,
"step": 71200
},
{
"epoch": 20.76658505304885,
"grad_norm": 0.37374556064605713,
"learning_rate": 0.0003510096237970253,
"loss": 3.2411,
"step": 71250
},
{
"epoch": 20.781158913372973,
"grad_norm": 0.379692018032074,
"learning_rate": 0.0003508346456692913,
"loss": 3.2491,
"step": 71300
},
{
"epoch": 20.795732773697097,
"grad_norm": 0.3699359595775604,
"learning_rate": 0.00035065966754155726,
"loss": 3.2541,
"step": 71350
},
{
"epoch": 20.81030663402122,
"grad_norm": 0.3997096121311188,
"learning_rate": 0.00035048468941382326,
"loss": 3.2405,
"step": 71400
},
{
"epoch": 20.824880494345344,
"grad_norm": 0.40189990401268005,
"learning_rate": 0.0003503097112860892,
"loss": 3.2496,
"step": 71450
},
{
"epoch": 20.839454354669463,
"grad_norm": 0.3730684518814087,
"learning_rate": 0.00035013473315835514,
"loss": 3.2441,
"step": 71500
},
{
"epoch": 20.854028214993587,
"grad_norm": 0.38036414980888367,
"learning_rate": 0.00034995975503062114,
"loss": 3.2635,
"step": 71550
},
{
"epoch": 20.86860207531771,
"grad_norm": 0.3716338872909546,
"learning_rate": 0.00034978477690288714,
"loss": 3.2611,
"step": 71600
},
{
"epoch": 20.883175935641834,
"grad_norm": 0.39651766419410706,
"learning_rate": 0.00034960979877515303,
"loss": 3.2585,
"step": 71650
},
{
"epoch": 20.897749795965957,
"grad_norm": 0.41758492588996887,
"learning_rate": 0.000349434820647419,
"loss": 3.2526,
"step": 71700
},
{
"epoch": 20.912323656290077,
"grad_norm": 0.37373974919319153,
"learning_rate": 0.000349259842519685,
"loss": 3.2588,
"step": 71750
},
{
"epoch": 20.9268975166142,
"grad_norm": 0.36950206756591797,
"learning_rate": 0.000349084864391951,
"loss": 3.2514,
"step": 71800
},
{
"epoch": 20.941471376938324,
"grad_norm": 0.3997727930545807,
"learning_rate": 0.0003489098862642169,
"loss": 3.2569,
"step": 71850
},
{
"epoch": 20.956045237262448,
"grad_norm": 0.391072541475296,
"learning_rate": 0.0003487349081364829,
"loss": 3.2528,
"step": 71900
},
{
"epoch": 20.970619097586567,
"grad_norm": 0.3755815029144287,
"learning_rate": 0.0003485599300087489,
"loss": 3.2651,
"step": 71950
},
{
"epoch": 20.98519295791069,
"grad_norm": 0.4253823161125183,
"learning_rate": 0.00034838495188101485,
"loss": 3.2684,
"step": 72000
},
{
"epoch": 20.98519295791069,
"eval_accuracy": 0.37319101261393506,
"eval_loss": 3.5336711406707764,
"eval_runtime": 53.3881,
"eval_samples_per_second": 311.436,
"eval_steps_per_second": 19.48,
"step": 72000
},
{
"epoch": 20.999766818234814,
"grad_norm": 0.38008999824523926,
"learning_rate": 0.0003482099737532808,
"loss": 3.2663,
"step": 72050
},
{
"epoch": 21.01428238311764,
"grad_norm": 0.38966473937034607,
"learning_rate": 0.0003480349956255468,
"loss": 3.1576,
"step": 72100
},
{
"epoch": 21.028856243441762,
"grad_norm": 0.35641705989837646,
"learning_rate": 0.00034786001749781274,
"loss": 3.1555,
"step": 72150
},
{
"epoch": 21.043430103765886,
"grad_norm": 0.450893372297287,
"learning_rate": 0.0003476850393700787,
"loss": 3.1528,
"step": 72200
},
{
"epoch": 21.05800396409001,
"grad_norm": 0.3972832262516022,
"learning_rate": 0.0003475100612423447,
"loss": 3.1585,
"step": 72250
},
{
"epoch": 21.072577824414132,
"grad_norm": 0.391781210899353,
"learning_rate": 0.0003473350831146106,
"loss": 3.1495,
"step": 72300
},
{
"epoch": 21.087151684738252,
"grad_norm": 0.3778936564922333,
"learning_rate": 0.0003471601049868766,
"loss": 3.169,
"step": 72350
},
{
"epoch": 21.101725545062376,
"grad_norm": 0.3655568063259125,
"learning_rate": 0.00034698512685914256,
"loss": 3.1737,
"step": 72400
},
{
"epoch": 21.1162994053865,
"grad_norm": 0.3923577070236206,
"learning_rate": 0.00034681014873140856,
"loss": 3.1725,
"step": 72450
},
{
"epoch": 21.130873265710623,
"grad_norm": 0.3961803913116455,
"learning_rate": 0.0003466351706036745,
"loss": 3.1745,
"step": 72500
},
{
"epoch": 21.145447126034743,
"grad_norm": 0.4180135428905487,
"learning_rate": 0.0003464601924759405,
"loss": 3.1888,
"step": 72550
},
{
"epoch": 21.160020986358866,
"grad_norm": 0.3757016956806183,
"learning_rate": 0.00034628521434820644,
"loss": 3.1843,
"step": 72600
},
{
"epoch": 21.17459484668299,
"grad_norm": 0.38864660263061523,
"learning_rate": 0.0003461102362204724,
"loss": 3.2,
"step": 72650
},
{
"epoch": 21.189168707007113,
"grad_norm": 0.39377427101135254,
"learning_rate": 0.0003459352580927384,
"loss": 3.1873,
"step": 72700
},
{
"epoch": 21.203742567331236,
"grad_norm": 0.41590413451194763,
"learning_rate": 0.0003457602799650044,
"loss": 3.2033,
"step": 72750
},
{
"epoch": 21.218316427655356,
"grad_norm": 0.38666868209838867,
"learning_rate": 0.00034558530183727027,
"loss": 3.1979,
"step": 72800
},
{
"epoch": 21.23289028797948,
"grad_norm": 0.403987854719162,
"learning_rate": 0.00034541032370953627,
"loss": 3.1999,
"step": 72850
},
{
"epoch": 21.247464148303603,
"grad_norm": 0.3956553637981415,
"learning_rate": 0.00034523534558180227,
"loss": 3.1968,
"step": 72900
},
{
"epoch": 21.262038008627727,
"grad_norm": 0.43393418192863464,
"learning_rate": 0.00034506036745406826,
"loss": 3.1981,
"step": 72950
},
{
"epoch": 21.276611868951846,
"grad_norm": 0.38441672921180725,
"learning_rate": 0.00034488538932633415,
"loss": 3.2104,
"step": 73000
},
{
"epoch": 21.276611868951846,
"eval_accuracy": 0.3723660747020178,
"eval_loss": 3.5544207096099854,
"eval_runtime": 53.2491,
"eval_samples_per_second": 312.249,
"eval_steps_per_second": 19.531,
"step": 73000
},
{
"epoch": 21.29118572927597,
"grad_norm": 0.38110947608947754,
"learning_rate": 0.00034471041119860015,
"loss": 3.19,
"step": 73050
},
{
"epoch": 21.305759589600093,
"grad_norm": 0.39781099557876587,
"learning_rate": 0.00034453543307086615,
"loss": 3.1996,
"step": 73100
},
{
"epoch": 21.320333449924217,
"grad_norm": 0.3817616105079651,
"learning_rate": 0.00034436045494313204,
"loss": 3.1977,
"step": 73150
},
{
"epoch": 21.33490731024834,
"grad_norm": 0.37882694602012634,
"learning_rate": 0.00034418547681539804,
"loss": 3.2149,
"step": 73200
},
{
"epoch": 21.34948117057246,
"grad_norm": 0.40080127120018005,
"learning_rate": 0.00034401049868766403,
"loss": 3.2046,
"step": 73250
},
{
"epoch": 21.364055030896584,
"grad_norm": 0.3883497416973114,
"learning_rate": 0.00034383552055993,
"loss": 3.215,
"step": 73300
},
{
"epoch": 21.378628891220707,
"grad_norm": 0.39146512746810913,
"learning_rate": 0.0003436605424321959,
"loss": 3.2035,
"step": 73350
},
{
"epoch": 21.39320275154483,
"grad_norm": 0.44710758328437805,
"learning_rate": 0.0003434855643044619,
"loss": 3.2015,
"step": 73400
},
{
"epoch": 21.40777661186895,
"grad_norm": 0.43041878938674927,
"learning_rate": 0.00034331058617672786,
"loss": 3.2143,
"step": 73450
},
{
"epoch": 21.422350472193074,
"grad_norm": 0.4076734781265259,
"learning_rate": 0.00034313560804899386,
"loss": 3.2244,
"step": 73500
},
{
"epoch": 21.436924332517197,
"grad_norm": 0.4158762991428375,
"learning_rate": 0.0003429606299212598,
"loss": 3.2085,
"step": 73550
},
{
"epoch": 21.45149819284132,
"grad_norm": 0.4004541039466858,
"learning_rate": 0.00034278565179352575,
"loss": 3.2125,
"step": 73600
},
{
"epoch": 21.466072053165444,
"grad_norm": 0.37133657932281494,
"learning_rate": 0.00034261067366579174,
"loss": 3.2192,
"step": 73650
},
{
"epoch": 21.480645913489564,
"grad_norm": 0.39652833342552185,
"learning_rate": 0.00034243569553805774,
"loss": 3.2189,
"step": 73700
},
{
"epoch": 21.495219773813687,
"grad_norm": 0.3788989782333374,
"learning_rate": 0.0003422607174103237,
"loss": 3.2197,
"step": 73750
},
{
"epoch": 21.50979363413781,
"grad_norm": 0.405619740486145,
"learning_rate": 0.00034208573928258963,
"loss": 3.2189,
"step": 73800
},
{
"epoch": 21.524367494461934,
"grad_norm": 0.39048507809638977,
"learning_rate": 0.0003419107611548556,
"loss": 3.2186,
"step": 73850
},
{
"epoch": 21.538941354786054,
"grad_norm": 0.40843072533607483,
"learning_rate": 0.0003417357830271216,
"loss": 3.2242,
"step": 73900
},
{
"epoch": 21.553515215110178,
"grad_norm": 0.3887636065483093,
"learning_rate": 0.0003415608048993875,
"loss": 3.217,
"step": 73950
},
{
"epoch": 21.5680890754343,
"grad_norm": 0.3871009051799774,
"learning_rate": 0.0003413858267716535,
"loss": 3.2185,
"step": 74000
},
{
"epoch": 21.5680890754343,
"eval_accuracy": 0.3726364245926832,
"eval_loss": 3.5461220741271973,
"eval_runtime": 53.2502,
"eval_samples_per_second": 312.243,
"eval_steps_per_second": 19.53,
"step": 74000
},
{
"epoch": 21.582662935758425,
"grad_norm": 0.4204442799091339,
"learning_rate": 0.0003412108486439195,
"loss": 3.2321,
"step": 74050
},
{
"epoch": 21.597236796082548,
"grad_norm": 0.3793659806251526,
"learning_rate": 0.0003410358705161854,
"loss": 3.2254,
"step": 74100
},
{
"epoch": 21.611810656406668,
"grad_norm": 0.40363553166389465,
"learning_rate": 0.0003408608923884514,
"loss": 3.217,
"step": 74150
},
{
"epoch": 21.62638451673079,
"grad_norm": 0.3986867070198059,
"learning_rate": 0.0003406859142607174,
"loss": 3.2277,
"step": 74200
},
{
"epoch": 21.640958377054915,
"grad_norm": 0.37159422039985657,
"learning_rate": 0.0003405109361329834,
"loss": 3.2243,
"step": 74250
},
{
"epoch": 21.655532237379038,
"grad_norm": 0.40285640954971313,
"learning_rate": 0.0003403359580052493,
"loss": 3.2353,
"step": 74300
},
{
"epoch": 21.670106097703158,
"grad_norm": 0.4001893699169159,
"learning_rate": 0.0003401609798775153,
"loss": 3.2359,
"step": 74350
},
{
"epoch": 21.68467995802728,
"grad_norm": 0.3854397237300873,
"learning_rate": 0.0003399860017497813,
"loss": 3.2281,
"step": 74400
},
{
"epoch": 21.699253818351405,
"grad_norm": 0.4364528954029083,
"learning_rate": 0.0003398110236220472,
"loss": 3.2206,
"step": 74450
},
{
"epoch": 21.71382767867553,
"grad_norm": 0.4174240827560425,
"learning_rate": 0.00033963604549431316,
"loss": 3.2339,
"step": 74500
},
{
"epoch": 21.728401538999652,
"grad_norm": 0.40123292803764343,
"learning_rate": 0.00033946106736657916,
"loss": 3.2358,
"step": 74550
},
{
"epoch": 21.74297539932377,
"grad_norm": 0.36441606283187866,
"learning_rate": 0.0003392860892388451,
"loss": 3.2384,
"step": 74600
},
{
"epoch": 21.757549259647895,
"grad_norm": 0.431755393743515,
"learning_rate": 0.0003391111111111111,
"loss": 3.2295,
"step": 74650
},
{
"epoch": 21.77212311997202,
"grad_norm": 0.4035051167011261,
"learning_rate": 0.00033893613298337705,
"loss": 3.2409,
"step": 74700
},
{
"epoch": 21.786696980296142,
"grad_norm": 0.4678764045238495,
"learning_rate": 0.000338761154855643,
"loss": 3.225,
"step": 74750
},
{
"epoch": 21.801270840620262,
"grad_norm": 0.3831508159637451,
"learning_rate": 0.000338586176727909,
"loss": 3.2392,
"step": 74800
},
{
"epoch": 21.815844700944385,
"grad_norm": 0.4162442684173584,
"learning_rate": 0.00033841119860017493,
"loss": 3.2509,
"step": 74850
},
{
"epoch": 21.83041856126851,
"grad_norm": 0.37772423028945923,
"learning_rate": 0.0003382362204724409,
"loss": 3.2573,
"step": 74900
},
{
"epoch": 21.844992421592632,
"grad_norm": 0.3931127190589905,
"learning_rate": 0.00033806124234470687,
"loss": 3.2319,
"step": 74950
},
{
"epoch": 21.859566281916756,
"grad_norm": 0.3934958279132843,
"learning_rate": 0.00033788626421697287,
"loss": 3.2397,
"step": 75000
},
{
"epoch": 21.859566281916756,
"eval_accuracy": 0.37314040292608736,
"eval_loss": 3.540088653564453,
"eval_runtime": 53.2315,
"eval_samples_per_second": 312.353,
"eval_steps_per_second": 19.537,
"step": 75000
},
{
"epoch": 21.874140142240876,
"grad_norm": 0.41406193375587463,
"learning_rate": 0.0003377112860892388,
"loss": 3.2408,
"step": 75050
},
{
"epoch": 21.888714002565,
"grad_norm": 0.3920379877090454,
"learning_rate": 0.00033753630796150476,
"loss": 3.242,
"step": 75100
},
{
"epoch": 21.903287862889123,
"grad_norm": 0.407136470079422,
"learning_rate": 0.00033736132983377075,
"loss": 3.2502,
"step": 75150
},
{
"epoch": 21.917861723213246,
"grad_norm": 0.375531941652298,
"learning_rate": 0.00033718635170603675,
"loss": 3.239,
"step": 75200
},
{
"epoch": 21.932435583537366,
"grad_norm": 0.3673073649406433,
"learning_rate": 0.00033701137357830264,
"loss": 3.2481,
"step": 75250
},
{
"epoch": 21.94700944386149,
"grad_norm": 0.38741302490234375,
"learning_rate": 0.00033683639545056864,
"loss": 3.242,
"step": 75300
},
{
"epoch": 21.961583304185613,
"grad_norm": 0.3984997272491455,
"learning_rate": 0.00033666141732283464,
"loss": 3.2439,
"step": 75350
},
{
"epoch": 21.976157164509736,
"grad_norm": 0.3742145895957947,
"learning_rate": 0.00033648643919510063,
"loss": 3.2562,
"step": 75400
},
{
"epoch": 21.99073102483386,
"grad_norm": 0.3876781463623047,
"learning_rate": 0.0003363114610673665,
"loss": 3.2519,
"step": 75450
},
{
"epoch": 22.005246589716684,
"grad_norm": 0.37868696451187134,
"learning_rate": 0.0003361364829396325,
"loss": 3.2046,
"step": 75500
},
{
"epoch": 22.019820450040807,
"grad_norm": 0.3847079277038574,
"learning_rate": 0.0003359615048118985,
"loss": 3.1304,
"step": 75550
},
{
"epoch": 22.03439431036493,
"grad_norm": 0.37587589025497437,
"learning_rate": 0.00033578652668416446,
"loss": 3.1392,
"step": 75600
},
{
"epoch": 22.04896817068905,
"grad_norm": 0.38762593269348145,
"learning_rate": 0.0003356115485564304,
"loss": 3.1518,
"step": 75650
},
{
"epoch": 22.063542031013174,
"grad_norm": 0.4253588020801544,
"learning_rate": 0.0003354365704286964,
"loss": 3.1539,
"step": 75700
},
{
"epoch": 22.078115891337298,
"grad_norm": 0.40510454773902893,
"learning_rate": 0.00033526159230096235,
"loss": 3.1768,
"step": 75750
},
{
"epoch": 22.09268975166142,
"grad_norm": 0.41818878054618835,
"learning_rate": 0.0003350866141732283,
"loss": 3.1706,
"step": 75800
},
{
"epoch": 22.10726361198554,
"grad_norm": 0.39923954010009766,
"learning_rate": 0.0003349116360454943,
"loss": 3.1712,
"step": 75850
},
{
"epoch": 22.121837472309664,
"grad_norm": 0.39292123913764954,
"learning_rate": 0.00033473665791776023,
"loss": 3.1647,
"step": 75900
},
{
"epoch": 22.136411332633788,
"grad_norm": 0.42234423756599426,
"learning_rate": 0.00033456167979002623,
"loss": 3.1826,
"step": 75950
},
{
"epoch": 22.15098519295791,
"grad_norm": 0.4201187193393707,
"learning_rate": 0.00033438670166229217,
"loss": 3.1673,
"step": 76000
},
{
"epoch": 22.15098519295791,
"eval_accuracy": 0.3727125745183517,
"eval_loss": 3.553342580795288,
"eval_runtime": 53.1416,
"eval_samples_per_second": 312.881,
"eval_steps_per_second": 19.57,
"step": 76000
},
{
"epoch": 22.165559053282035,
"grad_norm": 0.38024842739105225,
"learning_rate": 0.0003342117235345581,
"loss": 3.1727,
"step": 76050
},
{
"epoch": 22.180132913606155,
"grad_norm": 0.3718349039554596,
"learning_rate": 0.0003340367454068241,
"loss": 3.1828,
"step": 76100
},
{
"epoch": 22.194706773930278,
"grad_norm": 0.39751553535461426,
"learning_rate": 0.0003338617672790901,
"loss": 3.1716,
"step": 76150
},
{
"epoch": 22.2092806342544,
"grad_norm": 0.4270859360694885,
"learning_rate": 0.000333686789151356,
"loss": 3.1776,
"step": 76200
},
{
"epoch": 22.223854494578525,
"grad_norm": 0.3863823413848877,
"learning_rate": 0.000333511811023622,
"loss": 3.1734,
"step": 76250
},
{
"epoch": 22.238428354902645,
"grad_norm": 0.39657875895500183,
"learning_rate": 0.000333336832895888,
"loss": 3.1724,
"step": 76300
},
{
"epoch": 22.25300221522677,
"grad_norm": 0.4099148213863373,
"learning_rate": 0.000333161854768154,
"loss": 3.1777,
"step": 76350
},
{
"epoch": 22.267576075550892,
"grad_norm": 0.37436968088150024,
"learning_rate": 0.0003329868766404199,
"loss": 3.1932,
"step": 76400
},
{
"epoch": 22.282149935875015,
"grad_norm": 0.4043516516685486,
"learning_rate": 0.0003328118985126859,
"loss": 3.1922,
"step": 76450
},
{
"epoch": 22.29672379619914,
"grad_norm": 0.4059845805168152,
"learning_rate": 0.0003326369203849519,
"loss": 3.1931,
"step": 76500
},
{
"epoch": 22.31129765652326,
"grad_norm": 0.44433534145355225,
"learning_rate": 0.0003324619422572179,
"loss": 3.181,
"step": 76550
},
{
"epoch": 22.325871516847382,
"grad_norm": 0.42227113246917725,
"learning_rate": 0.00033228696412948377,
"loss": 3.2026,
"step": 76600
},
{
"epoch": 22.340445377171505,
"grad_norm": 0.4291574954986572,
"learning_rate": 0.00033211198600174976,
"loss": 3.1957,
"step": 76650
},
{
"epoch": 22.35501923749563,
"grad_norm": 0.4012506306171417,
"learning_rate": 0.00033193700787401576,
"loss": 3.2053,
"step": 76700
},
{
"epoch": 22.36959309781975,
"grad_norm": 0.42267000675201416,
"learning_rate": 0.00033176202974628165,
"loss": 3.2001,
"step": 76750
},
{
"epoch": 22.384166958143872,
"grad_norm": 0.39716577529907227,
"learning_rate": 0.00033158705161854765,
"loss": 3.2073,
"step": 76800
},
{
"epoch": 22.398740818467996,
"grad_norm": 0.3808940351009369,
"learning_rate": 0.00033141207349081365,
"loss": 3.2027,
"step": 76850
},
{
"epoch": 22.41331467879212,
"grad_norm": 0.4222249388694763,
"learning_rate": 0.0003312370953630796,
"loss": 3.2069,
"step": 76900
},
{
"epoch": 22.427888539116243,
"grad_norm": 0.3741794526576996,
"learning_rate": 0.00033106211723534553,
"loss": 3.1951,
"step": 76950
},
{
"epoch": 22.442462399440362,
"grad_norm": 0.37816691398620605,
"learning_rate": 0.00033088713910761153,
"loss": 3.1903,
"step": 77000
},
{
"epoch": 22.442462399440362,
"eval_accuracy": 0.3728844120631369,
"eval_loss": 3.5475516319274902,
"eval_runtime": 53.1477,
"eval_samples_per_second": 312.845,
"eval_steps_per_second": 19.568,
"step": 77000
},
{
"epoch": 22.457036259764486,
"grad_norm": 0.3930409550666809,
"learning_rate": 0.0003307121609798775,
"loss": 3.2086,
"step": 77050
},
{
"epoch": 22.47161012008861,
"grad_norm": 0.4193686246871948,
"learning_rate": 0.00033053718285214347,
"loss": 3.2081,
"step": 77100
},
{
"epoch": 22.486183980412733,
"grad_norm": 0.4014568328857422,
"learning_rate": 0.0003303622047244094,
"loss": 3.1979,
"step": 77150
},
{
"epoch": 22.500757840736853,
"grad_norm": 0.403713583946228,
"learning_rate": 0.00033018722659667536,
"loss": 3.2081,
"step": 77200
},
{
"epoch": 22.515331701060976,
"grad_norm": 0.3767714500427246,
"learning_rate": 0.00033001224846894136,
"loss": 3.2019,
"step": 77250
},
{
"epoch": 22.5299055613851,
"grad_norm": 0.40091049671173096,
"learning_rate": 0.00032983727034120735,
"loss": 3.2092,
"step": 77300
},
{
"epoch": 22.544479421709223,
"grad_norm": 0.407742440700531,
"learning_rate": 0.00032966229221347324,
"loss": 3.2054,
"step": 77350
},
{
"epoch": 22.559053282033346,
"grad_norm": 0.4064652621746063,
"learning_rate": 0.00032948731408573924,
"loss": 3.2105,
"step": 77400
},
{
"epoch": 22.573627142357466,
"grad_norm": 0.41998764872550964,
"learning_rate": 0.00032931233595800524,
"loss": 3.2185,
"step": 77450
},
{
"epoch": 22.58820100268159,
"grad_norm": 0.37851375341415405,
"learning_rate": 0.00032913735783027124,
"loss": 3.2182,
"step": 77500
},
{
"epoch": 22.602774863005713,
"grad_norm": 0.3938767611980438,
"learning_rate": 0.0003289623797025371,
"loss": 3.2215,
"step": 77550
},
{
"epoch": 22.617348723329837,
"grad_norm": 0.38179537653923035,
"learning_rate": 0.0003287874015748031,
"loss": 3.22,
"step": 77600
},
{
"epoch": 22.631922583653957,
"grad_norm": 0.37935879826545715,
"learning_rate": 0.0003286124234470691,
"loss": 3.2212,
"step": 77650
},
{
"epoch": 22.64649644397808,
"grad_norm": 0.4034980535507202,
"learning_rate": 0.000328437445319335,
"loss": 3.218,
"step": 77700
},
{
"epoch": 22.661070304302203,
"grad_norm": 0.37066003680229187,
"learning_rate": 0.000328262467191601,
"loss": 3.2231,
"step": 77750
},
{
"epoch": 22.675644164626327,
"grad_norm": 0.4011686146259308,
"learning_rate": 0.000328087489063867,
"loss": 3.2241,
"step": 77800
},
{
"epoch": 22.69021802495045,
"grad_norm": 0.3722177743911743,
"learning_rate": 0.000327912510936133,
"loss": 3.2249,
"step": 77850
},
{
"epoch": 22.70479188527457,
"grad_norm": 0.39132416248321533,
"learning_rate": 0.0003277375328083989,
"loss": 3.2244,
"step": 77900
},
{
"epoch": 22.719365745598694,
"grad_norm": 0.381059467792511,
"learning_rate": 0.0003275625546806649,
"loss": 3.2381,
"step": 77950
},
{
"epoch": 22.733939605922817,
"grad_norm": 0.4152250587940216,
"learning_rate": 0.0003273875765529309,
"loss": 3.2327,
"step": 78000
},
{
"epoch": 22.733939605922817,
"eval_accuracy": 0.37290053654507904,
"eval_loss": 3.5416500568389893,
"eval_runtime": 53.2893,
"eval_samples_per_second": 312.014,
"eval_steps_per_second": 19.516,
"step": 78000
},
{
"epoch": 22.74851346624694,
"grad_norm": 0.37700963020324707,
"learning_rate": 0.00032721259842519683,
"loss": 3.2255,
"step": 78050
},
{
"epoch": 22.76308732657106,
"grad_norm": 0.3865543603897095,
"learning_rate": 0.0003270376202974628,
"loss": 3.2261,
"step": 78100
},
{
"epoch": 22.777661186895184,
"grad_norm": 0.4093526303768158,
"learning_rate": 0.00032686264216972877,
"loss": 3.2279,
"step": 78150
},
{
"epoch": 22.792235047219307,
"grad_norm": 0.39011725783348083,
"learning_rate": 0.0003266876640419947,
"loss": 3.224,
"step": 78200
},
{
"epoch": 22.80680890754343,
"grad_norm": 0.4137217402458191,
"learning_rate": 0.0003265126859142607,
"loss": 3.2247,
"step": 78250
},
{
"epoch": 22.821382767867554,
"grad_norm": 0.38537049293518066,
"learning_rate": 0.00032633770778652666,
"loss": 3.2353,
"step": 78300
},
{
"epoch": 22.835956628191674,
"grad_norm": 0.39159658551216125,
"learning_rate": 0.0003261627296587926,
"loss": 3.2266,
"step": 78350
},
{
"epoch": 22.850530488515798,
"grad_norm": 0.40833067893981934,
"learning_rate": 0.0003259877515310586,
"loss": 3.2335,
"step": 78400
},
{
"epoch": 22.86510434883992,
"grad_norm": 0.3539735972881317,
"learning_rate": 0.00032581277340332454,
"loss": 3.2215,
"step": 78450
},
{
"epoch": 22.879678209164044,
"grad_norm": 0.4081200659275055,
"learning_rate": 0.0003256377952755905,
"loss": 3.2293,
"step": 78500
},
{
"epoch": 22.894252069488164,
"grad_norm": 0.39085066318511963,
"learning_rate": 0.0003254628171478565,
"loss": 3.2384,
"step": 78550
},
{
"epoch": 22.908825929812288,
"grad_norm": 0.383390873670578,
"learning_rate": 0.0003252878390201225,
"loss": 3.232,
"step": 78600
},
{
"epoch": 22.92339979013641,
"grad_norm": 0.4007982611656189,
"learning_rate": 0.00032511286089238837,
"loss": 3.2366,
"step": 78650
},
{
"epoch": 22.937973650460535,
"grad_norm": 0.4163041412830353,
"learning_rate": 0.00032493788276465437,
"loss": 3.2437,
"step": 78700
},
{
"epoch": 22.952547510784658,
"grad_norm": 0.39729535579681396,
"learning_rate": 0.00032476290463692036,
"loss": 3.2472,
"step": 78750
},
{
"epoch": 22.967121371108778,
"grad_norm": 0.38961970806121826,
"learning_rate": 0.00032458792650918636,
"loss": 3.2443,
"step": 78800
},
{
"epoch": 22.9816952314329,
"grad_norm": 0.3606325685977936,
"learning_rate": 0.00032441294838145225,
"loss": 3.2433,
"step": 78850
},
{
"epoch": 22.996269091757025,
"grad_norm": 0.4011639952659607,
"learning_rate": 0.00032423797025371825,
"loss": 3.2327,
"step": 78900
},
{
"epoch": 23.01078465663985,
"grad_norm": 0.38173967599868774,
"learning_rate": 0.00032406299212598425,
"loss": 3.1527,
"step": 78950
},
{
"epoch": 23.025358516963973,
"grad_norm": 0.42250046133995056,
"learning_rate": 0.0003238880139982502,
"loss": 3.1185,
"step": 79000
},
{
"epoch": 23.025358516963973,
"eval_accuracy": 0.3727990817754867,
"eval_loss": 3.5551207065582275,
"eval_runtime": 53.2206,
"eval_samples_per_second": 312.417,
"eval_steps_per_second": 19.541,
"step": 79000
},
{
"epoch": 23.039932377288096,
"grad_norm": 0.39397215843200684,
"learning_rate": 0.00032371303587051613,
"loss": 3.1487,
"step": 79050
},
{
"epoch": 23.05450623761222,
"grad_norm": 0.4081815779209137,
"learning_rate": 0.00032353805774278213,
"loss": 3.1599,
"step": 79100
},
{
"epoch": 23.069080097936343,
"grad_norm": 0.41130968928337097,
"learning_rate": 0.00032336307961504813,
"loss": 3.1422,
"step": 79150
},
{
"epoch": 23.083653958260463,
"grad_norm": 0.4249158501625061,
"learning_rate": 0.0003231881014873141,
"loss": 3.1564,
"step": 79200
},
{
"epoch": 23.098227818584586,
"grad_norm": Infinity,
"learning_rate": 0.00032301312335958,
"loss": 3.1428,
"step": 79250
},
{
"epoch": 23.11280167890871,
"grad_norm": 0.40779754519462585,
"learning_rate": 0.000322838145231846,
"loss": 3.1596,
"step": 79300
},
{
"epoch": 23.127375539232833,
"grad_norm": 0.418041855096817,
"learning_rate": 0.00032266316710411196,
"loss": 3.1649,
"step": 79350
},
{
"epoch": 23.141949399556953,
"grad_norm": 0.3876676559448242,
"learning_rate": 0.0003224881889763779,
"loss": 3.1583,
"step": 79400
},
{
"epoch": 23.156523259881077,
"grad_norm": 0.411615788936615,
"learning_rate": 0.0003223132108486439,
"loss": 3.1741,
"step": 79450
},
{
"epoch": 23.1710971202052,
"grad_norm": 0.4377012848854065,
"learning_rate": 0.00032213823272090984,
"loss": 3.163,
"step": 79500
},
{
"epoch": 23.185670980529324,
"grad_norm": 0.3975127041339874,
"learning_rate": 0.00032196325459317584,
"loss": 3.1631,
"step": 79550
},
{
"epoch": 23.200244840853447,
"grad_norm": 0.49955064058303833,
"learning_rate": 0.0003217882764654418,
"loss": 3.1762,
"step": 79600
},
{
"epoch": 23.214818701177567,
"grad_norm": 0.3728475272655487,
"learning_rate": 0.0003216132983377077,
"loss": 3.18,
"step": 79650
},
{
"epoch": 23.22939256150169,
"grad_norm": 0.4319852888584137,
"learning_rate": 0.0003214383202099737,
"loss": 3.1784,
"step": 79700
},
{
"epoch": 23.243966421825814,
"grad_norm": 0.39504387974739075,
"learning_rate": 0.0003212633420822397,
"loss": 3.1737,
"step": 79750
},
{
"epoch": 23.258540282149937,
"grad_norm": 0.44203776121139526,
"learning_rate": 0.0003210883639545056,
"loss": 3.1716,
"step": 79800
},
{
"epoch": 23.273114142474057,
"grad_norm": 0.3983067274093628,
"learning_rate": 0.0003209133858267716,
"loss": 3.1892,
"step": 79850
},
{
"epoch": 23.28768800279818,
"grad_norm": 0.39879894256591797,
"learning_rate": 0.0003207384076990376,
"loss": 3.1892,
"step": 79900
},
{
"epoch": 23.302261863122304,
"grad_norm": 0.4163643419742584,
"learning_rate": 0.0003205634295713036,
"loss": 3.1799,
"step": 79950
},
{
"epoch": 23.316835723446427,
"grad_norm": 0.4006156325340271,
"learning_rate": 0.0003203884514435695,
"loss": 3.1744,
"step": 80000
},
{
"epoch": 23.316835723446427,
"eval_accuracy": 0.37262594956426826,
"eval_loss": 3.5548195838928223,
"eval_runtime": 53.2607,
"eval_samples_per_second": 312.182,
"eval_steps_per_second": 19.527,
"step": 80000
},
{
"epoch": 23.33140958377055,
"grad_norm": 0.44247502088546753,
"learning_rate": 0.0003202134733158355,
"loss": 3.1736,
"step": 80050
},
{
"epoch": 23.34598344409467,
"grad_norm": 0.41587111353874207,
"learning_rate": 0.0003200384951881015,
"loss": 3.1803,
"step": 80100
},
{
"epoch": 23.360557304418794,
"grad_norm": 0.3835608959197998,
"learning_rate": 0.00031986351706036743,
"loss": 3.1808,
"step": 80150
},
{
"epoch": 23.375131164742918,
"grad_norm": 0.382169634103775,
"learning_rate": 0.0003196885389326334,
"loss": 3.1914,
"step": 80200
},
{
"epoch": 23.38970502506704,
"grad_norm": 0.4157732427120209,
"learning_rate": 0.0003195135608048994,
"loss": 3.1982,
"step": 80250
},
{
"epoch": 23.40427888539116,
"grad_norm": 0.4157496988773346,
"learning_rate": 0.0003193385826771653,
"loss": 3.1861,
"step": 80300
},
{
"epoch": 23.418852745715284,
"grad_norm": 0.4726715087890625,
"learning_rate": 0.00031916360454943126,
"loss": 3.1849,
"step": 80350
},
{
"epoch": 23.433426606039408,
"grad_norm": 0.4136442244052887,
"learning_rate": 0.00031898862642169726,
"loss": 3.1953,
"step": 80400
},
{
"epoch": 23.44800046636353,
"grad_norm": 0.4338325560092926,
"learning_rate": 0.00031881364829396326,
"loss": 3.1965,
"step": 80450
},
{
"epoch": 23.462574326687655,
"grad_norm": 0.4061412811279297,
"learning_rate": 0.0003186386701662292,
"loss": 3.1974,
"step": 80500
},
{
"epoch": 23.477148187011775,
"grad_norm": 0.39773857593536377,
"learning_rate": 0.00031846369203849514,
"loss": 3.1902,
"step": 80550
},
{
"epoch": 23.491722047335898,
"grad_norm": 0.39488843083381653,
"learning_rate": 0.00031828871391076114,
"loss": 3.1955,
"step": 80600
},
{
"epoch": 23.50629590766002,
"grad_norm": 0.39714646339416504,
"learning_rate": 0.0003181137357830271,
"loss": 3.1985,
"step": 80650
},
{
"epoch": 23.520869767984145,
"grad_norm": 0.38533398509025574,
"learning_rate": 0.0003179387576552931,
"loss": 3.2036,
"step": 80700
},
{
"epoch": 23.535443628308265,
"grad_norm": 0.40813782811164856,
"learning_rate": 0.000317763779527559,
"loss": 3.1977,
"step": 80750
},
{
"epoch": 23.55001748863239,
"grad_norm": 0.40301379561424255,
"learning_rate": 0.00031758880139982497,
"loss": 3.2099,
"step": 80800
},
{
"epoch": 23.56459134895651,
"grad_norm": 0.4391363561153412,
"learning_rate": 0.00031741382327209097,
"loss": 3.2059,
"step": 80850
},
{
"epoch": 23.579165209280635,
"grad_norm": 0.4028854966163635,
"learning_rate": 0.00031723884514435696,
"loss": 3.1987,
"step": 80900
},
{
"epoch": 23.59373906960476,
"grad_norm": 0.4159344732761383,
"learning_rate": 0.00031706386701662285,
"loss": 3.2068,
"step": 80950
},
{
"epoch": 23.60831292992888,
"grad_norm": 0.38993582129478455,
"learning_rate": 0.00031688888888888885,
"loss": 3.2041,
"step": 81000
},
{
"epoch": 23.60831292992888,
"eval_accuracy": 0.3734721906238609,
"eval_loss": 3.5403220653533936,
"eval_runtime": 53.2707,
"eval_samples_per_second": 312.123,
"eval_steps_per_second": 19.523,
"step": 81000
},
{
"epoch": 23.622886790253002,
"grad_norm": 0.40576037764549255,
"learning_rate": 0.00031671391076115485,
"loss": 3.2149,
"step": 81050
},
{
"epoch": 23.637460650577125,
"grad_norm": 0.4368989169597626,
"learning_rate": 0.00031653893263342085,
"loss": 3.204,
"step": 81100
},
{
"epoch": 23.65203451090125,
"grad_norm": 0.43579620122909546,
"learning_rate": 0.00031636395450568674,
"loss": 3.2224,
"step": 81150
},
{
"epoch": 23.66660837122537,
"grad_norm": 0.42136499285697937,
"learning_rate": 0.00031618897637795273,
"loss": 3.2131,
"step": 81200
},
{
"epoch": 23.681182231549492,
"grad_norm": 0.4127443730831146,
"learning_rate": 0.00031601399825021873,
"loss": 3.2246,
"step": 81250
},
{
"epoch": 23.695756091873616,
"grad_norm": 0.4025081694126129,
"learning_rate": 0.0003158390201224846,
"loss": 3.2164,
"step": 81300
},
{
"epoch": 23.71032995219774,
"grad_norm": 0.40500345826148987,
"learning_rate": 0.0003156640419947506,
"loss": 3.2144,
"step": 81350
},
{
"epoch": 23.72490381252186,
"grad_norm": 0.42702171206474304,
"learning_rate": 0.0003154890638670166,
"loss": 3.215,
"step": 81400
},
{
"epoch": 23.739477672845982,
"grad_norm": 0.4396904408931732,
"learning_rate": 0.00031531408573928256,
"loss": 3.2153,
"step": 81450
},
{
"epoch": 23.754051533170106,
"grad_norm": 0.38937604427337646,
"learning_rate": 0.0003151391076115485,
"loss": 3.2199,
"step": 81500
},
{
"epoch": 23.76862539349423,
"grad_norm": 0.43401476740837097,
"learning_rate": 0.0003149641294838145,
"loss": 3.2106,
"step": 81550
},
{
"epoch": 23.783199253818353,
"grad_norm": 0.40791040658950806,
"learning_rate": 0.00031478915135608044,
"loss": 3.2293,
"step": 81600
},
{
"epoch": 23.797773114142473,
"grad_norm": 0.39792293310165405,
"learning_rate": 0.00031461417322834644,
"loss": 3.2349,
"step": 81650
},
{
"epoch": 23.812346974466596,
"grad_norm": 0.4132463335990906,
"learning_rate": 0.0003144391951006124,
"loss": 3.2196,
"step": 81700
},
{
"epoch": 23.82692083479072,
"grad_norm": 0.3725283443927765,
"learning_rate": 0.0003142642169728784,
"loss": 3.2128,
"step": 81750
},
{
"epoch": 23.841494695114843,
"grad_norm": 0.41264012455940247,
"learning_rate": 0.0003140892388451443,
"loss": 3.2286,
"step": 81800
},
{
"epoch": 23.856068555438966,
"grad_norm": 0.39688318967819214,
"learning_rate": 0.0003139142607174103,
"loss": 3.2239,
"step": 81850
},
{
"epoch": 23.870642415763086,
"grad_norm": 0.3925932049751282,
"learning_rate": 0.00031373928258967627,
"loss": 3.2283,
"step": 81900
},
{
"epoch": 23.88521627608721,
"grad_norm": 0.4429666996002197,
"learning_rate": 0.0003135643044619422,
"loss": 3.2317,
"step": 81950
},
{
"epoch": 23.899790136411333,
"grad_norm": 0.4149000942707062,
"learning_rate": 0.0003133893263342082,
"loss": 3.2131,
"step": 82000
},
{
"epoch": 23.899790136411333,
"eval_accuracy": 0.3737786734777106,
"eval_loss": 3.535808563232422,
"eval_runtime": 53.3201,
"eval_samples_per_second": 311.833,
"eval_steps_per_second": 19.505,
"step": 82000
},
{
"epoch": 23.914363996735457,
"grad_norm": 0.41211676597595215,
"learning_rate": 0.00031321434820647415,
"loss": 3.2245,
"step": 82050
},
{
"epoch": 23.928937857059577,
"grad_norm": 0.42245087027549744,
"learning_rate": 0.0003130393700787401,
"loss": 3.2254,
"step": 82100
},
{
"epoch": 23.9435117173837,
"grad_norm": 0.40680941939353943,
"learning_rate": 0.0003128643919510061,
"loss": 3.2294,
"step": 82150
},
{
"epoch": 23.958085577707823,
"grad_norm": 0.40813785791397095,
"learning_rate": 0.0003126894138232721,
"loss": 3.2326,
"step": 82200
},
{
"epoch": 23.972659438031947,
"grad_norm": 0.40311017632484436,
"learning_rate": 0.000312514435695538,
"loss": 3.2207,
"step": 82250
},
{
"epoch": 23.987233298356067,
"grad_norm": 0.4265156388282776,
"learning_rate": 0.000312339457567804,
"loss": 3.2262,
"step": 82300
},
{
"epoch": 24.001748863238895,
"grad_norm": 0.40735989809036255,
"learning_rate": 0.00031216447944007,
"loss": 3.2165,
"step": 82350
},
{
"epoch": 24.016322723563018,
"grad_norm": 0.4180150628089905,
"learning_rate": 0.000311989501312336,
"loss": 3.1257,
"step": 82400
},
{
"epoch": 24.03089658388714,
"grad_norm": 0.4029780924320221,
"learning_rate": 0.00031181452318460186,
"loss": 3.1385,
"step": 82450
},
{
"epoch": 24.04547044421126,
"grad_norm": 0.42957931756973267,
"learning_rate": 0.00031163954505686786,
"loss": 3.1267,
"step": 82500
},
{
"epoch": 24.060044304535385,
"grad_norm": 0.4051404893398285,
"learning_rate": 0.00031146456692913386,
"loss": 3.1293,
"step": 82550
},
{
"epoch": 24.07461816485951,
"grad_norm": 0.3836676776409149,
"learning_rate": 0.0003112895888013998,
"loss": 3.1363,
"step": 82600
},
{
"epoch": 24.089192025183632,
"grad_norm": 0.4129314422607422,
"learning_rate": 0.00031111461067366575,
"loss": 3.1348,
"step": 82650
},
{
"epoch": 24.10376588550775,
"grad_norm": 0.4051026999950409,
"learning_rate": 0.00031093963254593174,
"loss": 3.1671,
"step": 82700
},
{
"epoch": 24.118339745831875,
"grad_norm": 0.39646244049072266,
"learning_rate": 0.0003107646544181977,
"loss": 3.1398,
"step": 82750
},
{
"epoch": 24.132913606156,
"grad_norm": 0.3891238272190094,
"learning_rate": 0.0003105896762904637,
"loss": 3.1521,
"step": 82800
},
{
"epoch": 24.147487466480122,
"grad_norm": 0.3894284665584564,
"learning_rate": 0.00031041469816272963,
"loss": 3.146,
"step": 82850
},
{
"epoch": 24.162061326804245,
"grad_norm": 0.4276731312274933,
"learning_rate": 0.00031023972003499557,
"loss": 3.1619,
"step": 82900
},
{
"epoch": 24.176635187128365,
"grad_norm": 0.45556432008743286,
"learning_rate": 0.00031006474190726157,
"loss": 3.1422,
"step": 82950
},
{
"epoch": 24.19120904745249,
"grad_norm": 0.41922351717948914,
"learning_rate": 0.0003098897637795275,
"loss": 3.1545,
"step": 83000
},
{
"epoch": 24.19120904745249,
"eval_accuracy": 0.3726754999795796,
"eval_loss": 3.552673101425171,
"eval_runtime": 53.3097,
"eval_samples_per_second": 311.894,
"eval_steps_per_second": 19.509,
"step": 83000
},
{
"epoch": 24.205782907776612,
"grad_norm": 0.38522177934646606,
"learning_rate": 0.0003097147856517935,
"loss": 3.1641,
"step": 83050
},
{
"epoch": 24.220356768100736,
"grad_norm": 0.41649022698402405,
"learning_rate": 0.00030953980752405945,
"loss": 3.1551,
"step": 83100
},
{
"epoch": 24.234930628424856,
"grad_norm": 0.3905382752418518,
"learning_rate": 0.00030936482939632545,
"loss": 3.1687,
"step": 83150
},
{
"epoch": 24.24950448874898,
"grad_norm": 0.4067355990409851,
"learning_rate": 0.0003091898512685914,
"loss": 3.1718,
"step": 83200
},
{
"epoch": 24.264078349073102,
"grad_norm": 0.414569228887558,
"learning_rate": 0.00030901487314085734,
"loss": 3.1636,
"step": 83250
},
{
"epoch": 24.278652209397226,
"grad_norm": 0.4139001667499542,
"learning_rate": 0.00030883989501312334,
"loss": 3.1865,
"step": 83300
},
{
"epoch": 24.29322606972135,
"grad_norm": 0.40517884492874146,
"learning_rate": 0.00030866491688538933,
"loss": 3.1784,
"step": 83350
},
{
"epoch": 24.30779993004547,
"grad_norm": 0.3915081024169922,
"learning_rate": 0.0003084899387576552,
"loss": 3.1592,
"step": 83400
},
{
"epoch": 24.322373790369593,
"grad_norm": 0.400414377450943,
"learning_rate": 0.0003083149606299212,
"loss": 3.1815,
"step": 83450
},
{
"epoch": 24.336947650693716,
"grad_norm": 0.4337230324745178,
"learning_rate": 0.0003081399825021872,
"loss": 3.1742,
"step": 83500
},
{
"epoch": 24.35152151101784,
"grad_norm": 0.3856227695941925,
"learning_rate": 0.0003079650043744532,
"loss": 3.1731,
"step": 83550
},
{
"epoch": 24.36609537134196,
"grad_norm": 0.40683674812316895,
"learning_rate": 0.0003077900262467191,
"loss": 3.186,
"step": 83600
},
{
"epoch": 24.380669231666083,
"grad_norm": 0.420723557472229,
"learning_rate": 0.0003076150481189851,
"loss": 3.1901,
"step": 83650
},
{
"epoch": 24.395243091990206,
"grad_norm": 0.39648476243019104,
"learning_rate": 0.0003074400699912511,
"loss": 3.1861,
"step": 83700
},
{
"epoch": 24.40981695231433,
"grad_norm": 0.4028700292110443,
"learning_rate": 0.00030726509186351704,
"loss": 3.1852,
"step": 83750
},
{
"epoch": 24.424390812638453,
"grad_norm": 0.4025539457798004,
"learning_rate": 0.000307090113735783,
"loss": 3.1875,
"step": 83800
},
{
"epoch": 24.438964672962573,
"grad_norm": 0.4205414354801178,
"learning_rate": 0.000306915135608049,
"loss": 3.1882,
"step": 83850
},
{
"epoch": 24.453538533286697,
"grad_norm": 0.4485682547092438,
"learning_rate": 0.00030674015748031493,
"loss": 3.1828,
"step": 83900
},
{
"epoch": 24.46811239361082,
"grad_norm": 0.4024691581726074,
"learning_rate": 0.00030656517935258087,
"loss": 3.1875,
"step": 83950
},
{
"epoch": 24.482686253934943,
"grad_norm": 0.41920554637908936,
"learning_rate": 0.00030639020122484687,
"loss": 3.1889,
"step": 84000
},
{
"epoch": 24.482686253934943,
"eval_accuracy": 0.3733111811983362,
"eval_loss": 3.546642303466797,
"eval_runtime": 53.2322,
"eval_samples_per_second": 312.348,
"eval_steps_per_second": 19.537,
"step": 84000
},
{
"epoch": 24.497260114259063,
"grad_norm": 0.41416135430336,
"learning_rate": 0.0003062152230971128,
"loss": 3.199,
"step": 84050
},
{
"epoch": 24.511833974583187,
"grad_norm": 0.3936392366886139,
"learning_rate": 0.0003060402449693788,
"loss": 3.1838,
"step": 84100
},
{
"epoch": 24.52640783490731,
"grad_norm": 0.4029703438282013,
"learning_rate": 0.00030586526684164475,
"loss": 3.1963,
"step": 84150
},
{
"epoch": 24.540981695231434,
"grad_norm": 0.3771865665912628,
"learning_rate": 0.0003056902887139107,
"loss": 3.1984,
"step": 84200
},
{
"epoch": 24.555555555555557,
"grad_norm": 0.3756367266178131,
"learning_rate": 0.0003055153105861767,
"loss": 3.1901,
"step": 84250
},
{
"epoch": 24.570129415879677,
"grad_norm": 0.40955060720443726,
"learning_rate": 0.0003053403324584427,
"loss": 3.2019,
"step": 84300
},
{
"epoch": 24.5847032762038,
"grad_norm": 0.43558764457702637,
"learning_rate": 0.00030516535433070864,
"loss": 3.1851,
"step": 84350
},
{
"epoch": 24.599277136527924,
"grad_norm": 0.46225878596305847,
"learning_rate": 0.0003049903762029746,
"loss": 3.2034,
"step": 84400
},
{
"epoch": 24.613850996852047,
"grad_norm": 0.38759079575538635,
"learning_rate": 0.0003048153980752406,
"loss": 3.1924,
"step": 84450
},
{
"epoch": 24.628424857176167,
"grad_norm": 0.40002742409706116,
"learning_rate": 0.0003046404199475066,
"loss": 3.2004,
"step": 84500
},
{
"epoch": 24.64299871750029,
"grad_norm": 0.4547050893306732,
"learning_rate": 0.00030446544181977247,
"loss": 3.2056,
"step": 84550
},
{
"epoch": 24.657572577824414,
"grad_norm": 0.42415475845336914,
"learning_rate": 0.00030429046369203846,
"loss": 3.2092,
"step": 84600
},
{
"epoch": 24.672146438148538,
"grad_norm": 0.41034477949142456,
"learning_rate": 0.00030411548556430446,
"loss": 3.2004,
"step": 84650
},
{
"epoch": 24.68672029847266,
"grad_norm": 0.411905974149704,
"learning_rate": 0.00030394050743657046,
"loss": 3.1921,
"step": 84700
},
{
"epoch": 24.70129415879678,
"grad_norm": 0.38791701197624207,
"learning_rate": 0.00030376552930883635,
"loss": 3.2034,
"step": 84750
},
{
"epoch": 24.715868019120904,
"grad_norm": 0.41622650623321533,
"learning_rate": 0.00030359055118110235,
"loss": 3.199,
"step": 84800
},
{
"epoch": 24.730441879445028,
"grad_norm": 0.4126710891723633,
"learning_rate": 0.00030341557305336834,
"loss": 3.2097,
"step": 84850
},
{
"epoch": 24.74501573976915,
"grad_norm": 0.3979971408843994,
"learning_rate": 0.00030324059492563423,
"loss": 3.199,
"step": 84900
},
{
"epoch": 24.75958960009327,
"grad_norm": 0.4198929965496063,
"learning_rate": 0.00030306561679790023,
"loss": 3.2083,
"step": 84950
},
{
"epoch": 24.774163460417395,
"grad_norm": 0.432871013879776,
"learning_rate": 0.00030289063867016623,
"loss": 3.2161,
"step": 85000
},
{
"epoch": 24.774163460417395,
"eval_accuracy": 0.3738008005040254,
"eval_loss": 3.5413074493408203,
"eval_runtime": 53.3055,
"eval_samples_per_second": 311.919,
"eval_steps_per_second": 19.51,
"step": 85000
},
{
"epoch": 24.788737320741518,
"grad_norm": 0.41536158323287964,
"learning_rate": 0.00030271566054243217,
"loss": 3.1999,
"step": 85050
},
{
"epoch": 24.80331118106564,
"grad_norm": 0.39557045698165894,
"learning_rate": 0.0003025406824146981,
"loss": 3.2129,
"step": 85100
},
{
"epoch": 24.817885041389765,
"grad_norm": 0.43374064564704895,
"learning_rate": 0.0003023657042869641,
"loss": 3.2197,
"step": 85150
},
{
"epoch": 24.832458901713885,
"grad_norm": 0.3977198898792267,
"learning_rate": 0.00030219072615923006,
"loss": 3.211,
"step": 85200
},
{
"epoch": 24.84703276203801,
"grad_norm": 0.4073224663734436,
"learning_rate": 0.00030201574803149605,
"loss": 3.2171,
"step": 85250
},
{
"epoch": 24.86160662236213,
"grad_norm": 0.4387545585632324,
"learning_rate": 0.000301840769903762,
"loss": 3.2168,
"step": 85300
},
{
"epoch": 24.876180482686255,
"grad_norm": 0.4089738726615906,
"learning_rate": 0.00030166579177602794,
"loss": 3.2198,
"step": 85350
},
{
"epoch": 24.890754343010375,
"grad_norm": 0.4149879217147827,
"learning_rate": 0.00030149081364829394,
"loss": 3.2195,
"step": 85400
},
{
"epoch": 24.9053282033345,
"grad_norm": 0.43615013360977173,
"learning_rate": 0.00030131583552055994,
"loss": 3.2236,
"step": 85450
},
{
"epoch": 24.919902063658622,
"grad_norm": 0.43617773056030273,
"learning_rate": 0.0003011408573928258,
"loss": 3.2136,
"step": 85500
},
{
"epoch": 24.934475923982745,
"grad_norm": 0.41982802748680115,
"learning_rate": 0.0003009658792650918,
"loss": 3.2235,
"step": 85550
},
{
"epoch": 24.94904978430687,
"grad_norm": 0.3977769911289215,
"learning_rate": 0.0003007909011373578,
"loss": 3.2156,
"step": 85600
},
{
"epoch": 24.96362364463099,
"grad_norm": 0.4145846366882324,
"learning_rate": 0.00030061592300962376,
"loss": 3.2121,
"step": 85650
},
{
"epoch": 24.978197504955112,
"grad_norm": 0.5370997190475464,
"learning_rate": 0.0003004409448818897,
"loss": 3.2193,
"step": 85700
},
{
"epoch": 24.992771365279236,
"grad_norm": 0.3931005895137787,
"learning_rate": 0.0003002659667541557,
"loss": 3.2151,
"step": 85750
},
{
"epoch": 25.00728693016206,
"grad_norm": 0.3954836428165436,
"learning_rate": 0.0003000909886264217,
"loss": 3.162,
"step": 85800
},
{
"epoch": 25.021860790486183,
"grad_norm": 0.42345207929611206,
"learning_rate": 0.00029991601049868765,
"loss": 3.1196,
"step": 85850
},
{
"epoch": 25.036434650810307,
"grad_norm": 0.4194379150867462,
"learning_rate": 0.0002997410323709536,
"loss": 3.1185,
"step": 85900
},
{
"epoch": 25.05100851113443,
"grad_norm": 0.4132949113845825,
"learning_rate": 0.0002995660542432196,
"loss": 3.1275,
"step": 85950
},
{
"epoch": 25.065582371458554,
"grad_norm": 0.3960988521575928,
"learning_rate": 0.00029939107611548553,
"loss": 3.1333,
"step": 86000
},
{
"epoch": 25.065582371458554,
"eval_accuracy": 0.3730515417299827,
"eval_loss": 3.5527703762054443,
"eval_runtime": 53.3809,
"eval_samples_per_second": 311.479,
"eval_steps_per_second": 19.483,
"step": 86000
},
{
"epoch": 25.080156231782674,
"grad_norm": 0.39942601323127747,
"learning_rate": 0.00029921609798775153,
"loss": 3.137,
"step": 86050
},
{
"epoch": 25.094730092106797,
"grad_norm": 0.41080349683761597,
"learning_rate": 0.00029904111986001747,
"loss": 3.1285,
"step": 86100
},
{
"epoch": 25.10930395243092,
"grad_norm": 0.41935205459594727,
"learning_rate": 0.00029886614173228347,
"loss": 3.1401,
"step": 86150
},
{
"epoch": 25.123877812755044,
"grad_norm": 0.4359089732170105,
"learning_rate": 0.0002986911636045494,
"loss": 3.1532,
"step": 86200
},
{
"epoch": 25.138451673079164,
"grad_norm": 0.4485057294368744,
"learning_rate": 0.0002985161854768154,
"loss": 3.1389,
"step": 86250
},
{
"epoch": 25.153025533403287,
"grad_norm": 0.3963627815246582,
"learning_rate": 0.00029834120734908135,
"loss": 3.1384,
"step": 86300
},
{
"epoch": 25.16759939372741,
"grad_norm": 0.4120327830314636,
"learning_rate": 0.0002981662292213473,
"loss": 3.1528,
"step": 86350
},
{
"epoch": 25.182173254051534,
"grad_norm": 0.4055855870246887,
"learning_rate": 0.0002979912510936133,
"loss": 3.1445,
"step": 86400
},
{
"epoch": 25.196747114375654,
"grad_norm": 0.40137195587158203,
"learning_rate": 0.00029781627296587924,
"loss": 3.1552,
"step": 86450
},
{
"epoch": 25.211320974699778,
"grad_norm": 0.4002479612827301,
"learning_rate": 0.0002976412948381452,
"loss": 3.1534,
"step": 86500
},
{
"epoch": 25.2258948350239,
"grad_norm": 0.42092686891555786,
"learning_rate": 0.0002974663167104112,
"loss": 3.16,
"step": 86550
},
{
"epoch": 25.240468695348024,
"grad_norm": 0.4151979982852936,
"learning_rate": 0.0002972913385826771,
"loss": 3.1554,
"step": 86600
},
{
"epoch": 25.255042555672148,
"grad_norm": 0.40470385551452637,
"learning_rate": 0.00029711636045494307,
"loss": 3.1603,
"step": 86650
},
{
"epoch": 25.269616415996268,
"grad_norm": 0.4268498718738556,
"learning_rate": 0.00029694138232720906,
"loss": 3.1603,
"step": 86700
},
{
"epoch": 25.28419027632039,
"grad_norm": 0.40396055579185486,
"learning_rate": 0.000296766404199475,
"loss": 3.1637,
"step": 86750
},
{
"epoch": 25.298764136644515,
"grad_norm": 0.4064796268939972,
"learning_rate": 0.000296591426071741,
"loss": 3.1593,
"step": 86800
},
{
"epoch": 25.313337996968638,
"grad_norm": 0.39150217175483704,
"learning_rate": 0.00029641644794400695,
"loss": 3.1734,
"step": 86850
},
{
"epoch": 25.327911857292758,
"grad_norm": 0.43401914834976196,
"learning_rate": 0.00029624146981627295,
"loss": 3.1573,
"step": 86900
},
{
"epoch": 25.34248571761688,
"grad_norm": 0.43742623925209045,
"learning_rate": 0.0002960664916885389,
"loss": 3.1624,
"step": 86950
},
{
"epoch": 25.357059577941005,
"grad_norm": 0.4239753186702728,
"learning_rate": 0.0002958915135608049,
"loss": 3.1656,
"step": 87000
},
{
"epoch": 25.357059577941005,
"eval_accuracy": 0.37326174847997334,
"eval_loss": 3.553415298461914,
"eval_runtime": 53.3829,
"eval_samples_per_second": 311.467,
"eval_steps_per_second": 19.482,
"step": 87000
},
{
"epoch": 25.37163343826513,
"grad_norm": 0.44977104663848877,
"learning_rate": 0.00029571653543307083,
"loss": 3.1688,
"step": 87050
},
{
"epoch": 25.38620729858925,
"grad_norm": 0.4430921971797943,
"learning_rate": 0.00029554155730533683,
"loss": 3.1682,
"step": 87100
},
{
"epoch": 25.40078115891337,
"grad_norm": 0.4145093262195587,
"learning_rate": 0.0002953665791776028,
"loss": 3.1794,
"step": 87150
},
{
"epoch": 25.415355019237495,
"grad_norm": 0.3987514078617096,
"learning_rate": 0.00029519160104986877,
"loss": 3.1737,
"step": 87200
},
{
"epoch": 25.42992887956162,
"grad_norm": 0.3947341740131378,
"learning_rate": 0.0002950166229221347,
"loss": 3.1772,
"step": 87250
},
{
"epoch": 25.444502739885742,
"grad_norm": 0.4111432433128357,
"learning_rate": 0.0002948416447944007,
"loss": 3.1711,
"step": 87300
},
{
"epoch": 25.459076600209862,
"grad_norm": 0.42870181798934937,
"learning_rate": 0.00029466666666666666,
"loss": 3.1795,
"step": 87350
},
{
"epoch": 25.473650460533985,
"grad_norm": 0.43575218319892883,
"learning_rate": 0.0002944916885389326,
"loss": 3.1767,
"step": 87400
},
{
"epoch": 25.48822432085811,
"grad_norm": 0.43777981400489807,
"learning_rate": 0.0002943167104111986,
"loss": 3.1797,
"step": 87450
},
{
"epoch": 25.502798181182232,
"grad_norm": 0.404784232378006,
"learning_rate": 0.00029414173228346454,
"loss": 3.1845,
"step": 87500
},
{
"epoch": 25.517372041506356,
"grad_norm": 0.418813556432724,
"learning_rate": 0.00029396675415573054,
"loss": 3.1882,
"step": 87550
},
{
"epoch": 25.531945901830476,
"grad_norm": 0.4354451894760132,
"learning_rate": 0.0002937917760279965,
"loss": 3.1846,
"step": 87600
},
{
"epoch": 25.5465197621546,
"grad_norm": 0.4002273380756378,
"learning_rate": 0.0002936167979002624,
"loss": 3.1922,
"step": 87650
},
{
"epoch": 25.561093622478722,
"grad_norm": 0.42676427960395813,
"learning_rate": 0.0002934418197725284,
"loss": 3.1858,
"step": 87700
},
{
"epoch": 25.575667482802846,
"grad_norm": 0.39750683307647705,
"learning_rate": 0.00029326684164479437,
"loss": 3.1836,
"step": 87750
},
{
"epoch": 25.59024134312697,
"grad_norm": 0.44498196244239807,
"learning_rate": 0.0002930918635170603,
"loss": 3.1909,
"step": 87800
},
{
"epoch": 25.60481520345109,
"grad_norm": 0.42401590943336487,
"learning_rate": 0.0002929168853893263,
"loss": 3.1926,
"step": 87850
},
{
"epoch": 25.619389063775213,
"grad_norm": 0.41390058398246765,
"learning_rate": 0.00029274190726159225,
"loss": 3.1983,
"step": 87900
},
{
"epoch": 25.633962924099336,
"grad_norm": 0.4257187247276306,
"learning_rate": 0.00029256692913385825,
"loss": 3.2074,
"step": 87950
},
{
"epoch": 25.64853678442346,
"grad_norm": 0.44523632526397705,
"learning_rate": 0.0002923919510061242,
"loss": 3.2035,
"step": 88000
},
{
"epoch": 25.64853678442346,
"eval_accuracy": 0.3735054988602816,
"eval_loss": 3.5426251888275146,
"eval_runtime": 53.1684,
"eval_samples_per_second": 312.723,
"eval_steps_per_second": 19.56,
"step": 88000
},
{
"epoch": 25.66311064474758,
"grad_norm": 0.41291964054107666,
"learning_rate": 0.0002922169728783902,
"loss": 3.1788,
"step": 88050
},
{
"epoch": 25.677684505071703,
"grad_norm": 0.4161721467971802,
"learning_rate": 0.00029204199475065613,
"loss": 3.193,
"step": 88100
},
{
"epoch": 25.692258365395826,
"grad_norm": 0.46007800102233887,
"learning_rate": 0.00029186701662292213,
"loss": 3.1947,
"step": 88150
},
{
"epoch": 25.70683222571995,
"grad_norm": 0.4044644236564636,
"learning_rate": 0.0002916920384951881,
"loss": 3.1954,
"step": 88200
},
{
"epoch": 25.72140608604407,
"grad_norm": 0.4262496829032898,
"learning_rate": 0.00029151706036745407,
"loss": 3.2019,
"step": 88250
},
{
"epoch": 25.735979946368193,
"grad_norm": 0.43586814403533936,
"learning_rate": 0.00029134208223972,
"loss": 3.1869,
"step": 88300
},
{
"epoch": 25.750553806692317,
"grad_norm": 0.41651099920272827,
"learning_rate": 0.00029116710411198596,
"loss": 3.2099,
"step": 88350
},
{
"epoch": 25.76512766701644,
"grad_norm": 0.4524548053741455,
"learning_rate": 0.00029099212598425196,
"loss": 3.2114,
"step": 88400
},
{
"epoch": 25.779701527340563,
"grad_norm": 0.42696666717529297,
"learning_rate": 0.0002908171478565179,
"loss": 3.1968,
"step": 88450
},
{
"epoch": 25.794275387664683,
"grad_norm": 0.4079219102859497,
"learning_rate": 0.0002906421697287839,
"loss": 3.1967,
"step": 88500
},
{
"epoch": 25.808849247988807,
"grad_norm": 0.40614786744117737,
"learning_rate": 0.00029046719160104984,
"loss": 3.2124,
"step": 88550
},
{
"epoch": 25.82342310831293,
"grad_norm": 0.45513415336608887,
"learning_rate": 0.00029029221347331584,
"loss": 3.1972,
"step": 88600
},
{
"epoch": 25.837996968637054,
"grad_norm": 0.4304491877555847,
"learning_rate": 0.0002901172353455818,
"loss": 3.2128,
"step": 88650
},
{
"epoch": 25.852570828961174,
"grad_norm": 0.39539921283721924,
"learning_rate": 0.0002899422572178477,
"loss": 3.1988,
"step": 88700
},
{
"epoch": 25.867144689285297,
"grad_norm": 0.38909876346588135,
"learning_rate": 0.0002897672790901137,
"loss": 3.1997,
"step": 88750
},
{
"epoch": 25.88171854960942,
"grad_norm": 0.40517860651016235,
"learning_rate": 0.00028959230096237967,
"loss": 3.2029,
"step": 88800
},
{
"epoch": 25.896292409933544,
"grad_norm": 0.38918161392211914,
"learning_rate": 0.00028941732283464566,
"loss": 3.2022,
"step": 88850
},
{
"epoch": 25.910866270257667,
"grad_norm": 0.40442124009132385,
"learning_rate": 0.0002892423447069116,
"loss": 3.207,
"step": 88900
},
{
"epoch": 25.925440130581787,
"grad_norm": 0.4054309129714966,
"learning_rate": 0.00028906736657917755,
"loss": 3.2071,
"step": 88950
},
{
"epoch": 25.94001399090591,
"grad_norm": 0.42414960265159607,
"learning_rate": 0.00028889238845144355,
"loss": 3.2018,
"step": 89000
},
{
"epoch": 25.94001399090591,
"eval_accuracy": 0.3743577424642469,
"eval_loss": 3.5337586402893066,
"eval_runtime": 53.3199,
"eval_samples_per_second": 311.835,
"eval_steps_per_second": 19.505,
"step": 89000
},
{
"epoch": 25.954587851230034,
"grad_norm": 0.38889071345329285,
"learning_rate": 0.0002887174103237095,
"loss": 3.2059,
"step": 89050
},
{
"epoch": 25.969161711554158,
"grad_norm": 0.4038815200328827,
"learning_rate": 0.0002885424321959755,
"loss": 3.2054,
"step": 89100
},
{
"epoch": 25.983735571878277,
"grad_norm": 0.4206139147281647,
"learning_rate": 0.00028836745406824143,
"loss": 3.1969,
"step": 89150
},
{
"epoch": 25.9983094322024,
"grad_norm": 0.42321598529815674,
"learning_rate": 0.00028819247594050743,
"loss": 3.2222,
"step": 89200
},
{
"epoch": 26.01282499708523,
"grad_norm": 0.4227030277252197,
"learning_rate": 0.0002880174978127734,
"loss": 3.1136,
"step": 89250
},
{
"epoch": 26.027398857409352,
"grad_norm": 0.4692704677581787,
"learning_rate": 0.0002878425196850393,
"loss": 3.1086,
"step": 89300
},
{
"epoch": 26.041972717733472,
"grad_norm": 0.3992956578731537,
"learning_rate": 0.0002876675415573053,
"loss": 3.1058,
"step": 89350
},
{
"epoch": 26.056546578057596,
"grad_norm": 0.39758923649787903,
"learning_rate": 0.00028749256342957126,
"loss": 3.1025,
"step": 89400
},
{
"epoch": 26.07112043838172,
"grad_norm": 0.42007169127464294,
"learning_rate": 0.00028731758530183726,
"loss": 3.124,
"step": 89450
},
{
"epoch": 26.085694298705842,
"grad_norm": 0.45216524600982666,
"learning_rate": 0.0002871426071741032,
"loss": 3.1132,
"step": 89500
},
{
"epoch": 26.100268159029962,
"grad_norm": 0.42939022183418274,
"learning_rate": 0.0002869676290463692,
"loss": 3.1365,
"step": 89550
},
{
"epoch": 26.114842019354086,
"grad_norm": 0.42322081327438354,
"learning_rate": 0.00028679265091863514,
"loss": 3.1188,
"step": 89600
},
{
"epoch": 26.12941587967821,
"grad_norm": 0.433601438999176,
"learning_rate": 0.00028661767279090114,
"loss": 3.1403,
"step": 89650
},
{
"epoch": 26.143989740002333,
"grad_norm": 0.43289831280708313,
"learning_rate": 0.0002864426946631671,
"loss": 3.1435,
"step": 89700
},
{
"epoch": 26.158563600326456,
"grad_norm": 0.43380987644195557,
"learning_rate": 0.0002862677165354331,
"loss": 3.1476,
"step": 89750
},
{
"epoch": 26.173137460650576,
"grad_norm": 0.4708380699157715,
"learning_rate": 0.000286092738407699,
"loss": 3.1418,
"step": 89800
},
{
"epoch": 26.1877113209747,
"grad_norm": 0.42673295736312866,
"learning_rate": 0.00028591776027996497,
"loss": 3.149,
"step": 89850
},
{
"epoch": 26.202285181298823,
"grad_norm": 0.46042966842651367,
"learning_rate": 0.00028574278215223097,
"loss": 3.152,
"step": 89900
},
{
"epoch": 26.216859041622946,
"grad_norm": 0.43242865800857544,
"learning_rate": 0.0002855678040244969,
"loss": 3.132,
"step": 89950
},
{
"epoch": 26.231432901947066,
"grad_norm": 0.4106438457965851,
"learning_rate": 0.00028539282589676285,
"loss": 3.158,
"step": 90000
},
{
"epoch": 26.231432901947066,
"eval_accuracy": 0.3732618661769218,
"eval_loss": 3.551401138305664,
"eval_runtime": 53.2441,
"eval_samples_per_second": 312.279,
"eval_steps_per_second": 19.533,
"step": 90000
},
{
"epoch": 26.24600676227119,
"grad_norm": 0.442765474319458,
"learning_rate": 0.00028521784776902885,
"loss": 3.1519,
"step": 90050
},
{
"epoch": 26.260580622595313,
"grad_norm": 0.42735040187835693,
"learning_rate": 0.0002850428696412948,
"loss": 3.1545,
"step": 90100
},
{
"epoch": 26.275154482919437,
"grad_norm": 0.4304802417755127,
"learning_rate": 0.00028486789151356074,
"loss": 3.1398,
"step": 90150
},
{
"epoch": 26.28972834324356,
"grad_norm": 0.4221126139163971,
"learning_rate": 0.00028469291338582673,
"loss": 3.1707,
"step": 90200
},
{
"epoch": 26.30430220356768,
"grad_norm": 0.4367322623729706,
"learning_rate": 0.0002845179352580927,
"loss": 3.1604,
"step": 90250
},
{
"epoch": 26.318876063891803,
"grad_norm": 0.45300185680389404,
"learning_rate": 0.0002843429571303587,
"loss": 3.1601,
"step": 90300
},
{
"epoch": 26.333449924215927,
"grad_norm": 0.4125306308269501,
"learning_rate": 0.0002841679790026246,
"loss": 3.1538,
"step": 90350
},
{
"epoch": 26.34802378454005,
"grad_norm": 0.4720957279205322,
"learning_rate": 0.0002839930008748906,
"loss": 3.1676,
"step": 90400
},
{
"epoch": 26.36259764486417,
"grad_norm": 0.41910240054130554,
"learning_rate": 0.00028381802274715656,
"loss": 3.1667,
"step": 90450
},
{
"epoch": 26.377171505188294,
"grad_norm": 0.41693606972694397,
"learning_rate": 0.00028364304461942256,
"loss": 3.164,
"step": 90500
},
{
"epoch": 26.391745365512417,
"grad_norm": 0.44004714488983154,
"learning_rate": 0.0002834680664916885,
"loss": 3.1756,
"step": 90550
},
{
"epoch": 26.40631922583654,
"grad_norm": 0.41405948996543884,
"learning_rate": 0.0002832930883639545,
"loss": 3.1642,
"step": 90600
},
{
"epoch": 26.420893086160664,
"grad_norm": 0.4055291414260864,
"learning_rate": 0.00028311811023622044,
"loss": 3.1812,
"step": 90650
},
{
"epoch": 26.435466946484784,
"grad_norm": 0.4173514246940613,
"learning_rate": 0.00028294313210848644,
"loss": 3.1768,
"step": 90700
},
{
"epoch": 26.450040806808907,
"grad_norm": 0.4086509644985199,
"learning_rate": 0.0002827681539807524,
"loss": 3.1814,
"step": 90750
},
{
"epoch": 26.46461466713303,
"grad_norm": 0.43321284651756287,
"learning_rate": 0.0002825931758530184,
"loss": 3.1673,
"step": 90800
},
{
"epoch": 26.479188527457154,
"grad_norm": 0.40561938285827637,
"learning_rate": 0.0002824181977252843,
"loss": 3.1713,
"step": 90850
},
{
"epoch": 26.493762387781274,
"grad_norm": 0.3966347873210907,
"learning_rate": 0.0002822432195975503,
"loss": 3.1821,
"step": 90900
},
{
"epoch": 26.508336248105397,
"grad_norm": 0.42379072308540344,
"learning_rate": 0.00028206824146981627,
"loss": 3.1734,
"step": 90950
},
{
"epoch": 26.52291010842952,
"grad_norm": 0.446698933839798,
"learning_rate": 0.0002818932633420822,
"loss": 3.1658,
"step": 91000
},
{
"epoch": 26.52291010842952,
"eval_accuracy": 0.37363249386769476,
"eval_loss": 3.5465731620788574,
"eval_runtime": 53.3076,
"eval_samples_per_second": 311.907,
"eval_steps_per_second": 19.509,
"step": 91000
},
{
"epoch": 26.537483968753644,
"grad_norm": 0.4048849940299988,
"learning_rate": 0.0002817182852143482,
"loss": 3.1809,
"step": 91050
},
{
"epoch": 26.552057829077768,
"grad_norm": 0.41570523381233215,
"learning_rate": 0.00028154330708661415,
"loss": 3.1688,
"step": 91100
},
{
"epoch": 26.566631689401888,
"grad_norm": 0.416762113571167,
"learning_rate": 0.0002813683289588801,
"loss": 3.1737,
"step": 91150
},
{
"epoch": 26.58120554972601,
"grad_norm": 0.4728192389011383,
"learning_rate": 0.0002811933508311461,
"loss": 3.1878,
"step": 91200
},
{
"epoch": 26.595779410050135,
"grad_norm": 0.4537615478038788,
"learning_rate": 0.00028101837270341204,
"loss": 3.1861,
"step": 91250
},
{
"epoch": 26.610353270374258,
"grad_norm": 0.43289709091186523,
"learning_rate": 0.000280843394575678,
"loss": 3.1801,
"step": 91300
},
{
"epoch": 26.624927130698378,
"grad_norm": 0.4409210979938507,
"learning_rate": 0.000280668416447944,
"loss": 3.1817,
"step": 91350
},
{
"epoch": 26.6395009910225,
"grad_norm": 0.42330750823020935,
"learning_rate": 0.0002804934383202099,
"loss": 3.1697,
"step": 91400
},
{
"epoch": 26.654074851346625,
"grad_norm": 0.4004875123500824,
"learning_rate": 0.0002803184601924759,
"loss": 3.1854,
"step": 91450
},
{
"epoch": 26.66864871167075,
"grad_norm": 0.4062268137931824,
"learning_rate": 0.00028014348206474186,
"loss": 3.1803,
"step": 91500
},
{
"epoch": 26.68322257199487,
"grad_norm": 0.41472315788269043,
"learning_rate": 0.00027996850393700786,
"loss": 3.1903,
"step": 91550
},
{
"epoch": 26.69779643231899,
"grad_norm": 0.4198923408985138,
"learning_rate": 0.0002797935258092738,
"loss": 3.1957,
"step": 91600
},
{
"epoch": 26.712370292643115,
"grad_norm": 0.4187171459197998,
"learning_rate": 0.0002796185476815398,
"loss": 3.1835,
"step": 91650
},
{
"epoch": 26.72694415296724,
"grad_norm": 0.4212145507335663,
"learning_rate": 0.00027944356955380574,
"loss": 3.1809,
"step": 91700
},
{
"epoch": 26.741518013291362,
"grad_norm": 0.39935165643692017,
"learning_rate": 0.00027926859142607174,
"loss": 3.1862,
"step": 91750
},
{
"epoch": 26.756091873615482,
"grad_norm": 0.42284080386161804,
"learning_rate": 0.0002790936132983377,
"loss": 3.1944,
"step": 91800
},
{
"epoch": 26.770665733939605,
"grad_norm": 0.40127700567245483,
"learning_rate": 0.0002789186351706037,
"loss": 3.196,
"step": 91850
},
{
"epoch": 26.78523959426373,
"grad_norm": 0.4029678702354431,
"learning_rate": 0.0002787436570428696,
"loss": 3.2019,
"step": 91900
},
{
"epoch": 26.799813454587852,
"grad_norm": 0.41483286023139954,
"learning_rate": 0.00027856867891513557,
"loss": 3.195,
"step": 91950
},
{
"epoch": 26.814387314911976,
"grad_norm": 0.42294371128082275,
"learning_rate": 0.00027839370078740157,
"loss": 3.199,
"step": 92000
},
{
"epoch": 26.814387314911976,
"eval_accuracy": 0.3742275696392247,
"eval_loss": 3.536498546600342,
"eval_runtime": 53.2794,
"eval_samples_per_second": 312.072,
"eval_steps_per_second": 19.52,
"step": 92000
},
{
"epoch": 26.814387314911976,
"step": 92000,
"total_flos": 1.922898754142208e+18,
"train_loss": 3.4146219946819802,
"train_runtime": 40914.164,
"train_samples_per_second": 335.396,
"train_steps_per_second": 4.193
}
],
"logging_steps": 50,
"max_steps": 171550,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 18
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.922898754142208e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}