{
"best_global_step": 96000,
"best_metric": 3.5370290279388428,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_0.7_last_to_push_2128/checkpoint-40000",
"epoch": 33.809818097014926,
"eval_steps": 1000,
"global_step": 116000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014575559701492538,
"grad_norm": 0.9652895927429199,
"learning_rate": 0.000294,
"loss": 8.4539,
"step": 50
},
{
"epoch": 0.029151119402985076,
"grad_norm": 0.9208124279975891,
"learning_rate": 0.0005939999999999999,
"loss": 6.749,
"step": 100
},
{
"epoch": 0.04372667910447761,
"grad_norm": 0.525312602519989,
"learning_rate": 0.0005998285214348206,
"loss": 6.365,
"step": 150
},
{
"epoch": 0.05830223880597015,
"grad_norm": 0.47033366560935974,
"learning_rate": 0.0005996535433070866,
"loss": 6.1474,
"step": 200
},
{
"epoch": 0.07287779850746269,
"grad_norm": 0.5568996071815491,
"learning_rate": 0.0005994785651793525,
"loss": 6.0107,
"step": 250
},
{
"epoch": 0.08745335820895522,
"grad_norm": 0.46809056401252747,
"learning_rate": 0.0005993035870516185,
"loss": 5.8928,
"step": 300
},
{
"epoch": 0.10202891791044776,
"grad_norm": 0.4372883141040802,
"learning_rate": 0.0005991286089238845,
"loss": 5.774,
"step": 350
},
{
"epoch": 0.1166044776119403,
"grad_norm": 0.48103225231170654,
"learning_rate": 0.0005989536307961504,
"loss": 5.6236,
"step": 400
},
{
"epoch": 0.13118003731343283,
"grad_norm": 0.5330483913421631,
"learning_rate": 0.0005987786526684164,
"loss": 5.522,
"step": 450
},
{
"epoch": 0.14575559701492538,
"grad_norm": 0.4759756922721863,
"learning_rate": 0.0005986036745406824,
"loss": 5.442,
"step": 500
},
{
"epoch": 0.1603311567164179,
"grad_norm": 0.41947564482688904,
"learning_rate": 0.0005984286964129484,
"loss": 5.3423,
"step": 550
},
{
"epoch": 0.17490671641791045,
"grad_norm": 0.4382849335670471,
"learning_rate": 0.0005982537182852143,
"loss": 5.2748,
"step": 600
},
{
"epoch": 0.189482276119403,
"grad_norm": 0.47097328305244446,
"learning_rate": 0.0005980787401574803,
"loss": 5.1983,
"step": 650
},
{
"epoch": 0.2040578358208955,
"grad_norm": 0.47024649381637573,
"learning_rate": 0.0005979037620297463,
"loss": 5.1564,
"step": 700
},
{
"epoch": 0.21863339552238806,
"grad_norm": 0.4276546537876129,
"learning_rate": 0.0005977287839020123,
"loss": 5.0876,
"step": 750
},
{
"epoch": 0.2332089552238806,
"grad_norm": 0.41784146428108215,
"learning_rate": 0.0005975538057742782,
"loss": 5.0278,
"step": 800
},
{
"epoch": 0.24778451492537312,
"grad_norm": 0.4272647500038147,
"learning_rate": 0.0005973788276465442,
"loss": 4.9741,
"step": 850
},
{
"epoch": 0.26236007462686567,
"grad_norm": 0.496510773897171,
"learning_rate": 0.0005972038495188102,
"loss": 4.9425,
"step": 900
},
{
"epoch": 0.2769356343283582,
"grad_norm": 0.5926377773284912,
"learning_rate": 0.000597028871391076,
"loss": 4.8891,
"step": 950
},
{
"epoch": 0.29151119402985076,
"grad_norm": 0.43382635712623596,
"learning_rate": 0.000596853893263342,
"loss": 4.8405,
"step": 1000
},
{
"epoch": 0.29151119402985076,
"eval_accuracy": 0.2538303000671932,
"eval_loss": 4.754518032073975,
"eval_runtime": 179.7864,
"eval_samples_per_second": 92.482,
"eval_steps_per_second": 5.785,
"step": 1000
},
{
"epoch": 0.3060867537313433,
"grad_norm": 0.4467147886753082,
"learning_rate": 0.000596678915135608,
"loss": 4.7826,
"step": 1050
},
{
"epoch": 0.3206623134328358,
"grad_norm": 0.4645467698574066,
"learning_rate": 0.0005965039370078739,
"loss": 4.7438,
"step": 1100
},
{
"epoch": 0.33523787313432835,
"grad_norm": 0.4239553213119507,
"learning_rate": 0.0005963289588801399,
"loss": 4.705,
"step": 1150
},
{
"epoch": 0.3498134328358209,
"grad_norm": 0.4928569495677948,
"learning_rate": 0.0005961539807524059,
"loss": 4.6655,
"step": 1200
},
{
"epoch": 0.36438899253731344,
"grad_norm": 0.39162495732307434,
"learning_rate": 0.0005959790026246719,
"loss": 4.6408,
"step": 1250
},
{
"epoch": 0.378964552238806,
"grad_norm": 0.4309455454349518,
"learning_rate": 0.0005958040244969378,
"loss": 4.6164,
"step": 1300
},
{
"epoch": 0.39354011194029853,
"grad_norm": 0.41590699553489685,
"learning_rate": 0.0005956290463692038,
"loss": 4.5852,
"step": 1350
},
{
"epoch": 0.408115671641791,
"grad_norm": 0.43023690581321716,
"learning_rate": 0.0005954540682414698,
"loss": 4.5578,
"step": 1400
},
{
"epoch": 0.42269123134328357,
"grad_norm": 0.42544642090797424,
"learning_rate": 0.0005952790901137357,
"loss": 4.5267,
"step": 1450
},
{
"epoch": 0.4372667910447761,
"grad_norm": 0.438936710357666,
"learning_rate": 0.0005951041119860017,
"loss": 4.5129,
"step": 1500
},
{
"epoch": 0.45184235074626866,
"grad_norm": 0.49558717012405396,
"learning_rate": 0.0005949291338582677,
"loss": 4.5006,
"step": 1550
},
{
"epoch": 0.4664179104477612,
"grad_norm": 0.4251459836959839,
"learning_rate": 0.0005947541557305336,
"loss": 4.4832,
"step": 1600
},
{
"epoch": 0.48099347014925375,
"grad_norm": 0.4005463421344757,
"learning_rate": 0.0005945791776027996,
"loss": 4.4631,
"step": 1650
},
{
"epoch": 0.49556902985074625,
"grad_norm": 0.3984415531158447,
"learning_rate": 0.0005944041994750656,
"loss": 4.4357,
"step": 1700
},
{
"epoch": 0.5101445895522388,
"grad_norm": 0.43978074193000793,
"learning_rate": 0.0005942292213473315,
"loss": 4.4312,
"step": 1750
},
{
"epoch": 0.5247201492537313,
"grad_norm": 0.40760400891304016,
"learning_rate": 0.0005940542432195975,
"loss": 4.3969,
"step": 1800
},
{
"epoch": 0.5392957089552238,
"grad_norm": 0.45624998211860657,
"learning_rate": 0.0005938792650918635,
"loss": 4.3909,
"step": 1850
},
{
"epoch": 0.5538712686567164,
"grad_norm": 0.3956749439239502,
"learning_rate": 0.0005937042869641295,
"loss": 4.3708,
"step": 1900
},
{
"epoch": 0.5684468283582089,
"grad_norm": 0.3885388672351837,
"learning_rate": 0.0005935293088363953,
"loss": 4.3676,
"step": 1950
},
{
"epoch": 0.5830223880597015,
"grad_norm": 0.3808089792728424,
"learning_rate": 0.0005933543307086613,
"loss": 4.3421,
"step": 2000
},
{
"epoch": 0.5830223880597015,
"eval_accuracy": 0.2982805535099172,
"eval_loss": 4.293015956878662,
"eval_runtime": 179.2043,
"eval_samples_per_second": 92.782,
"eval_steps_per_second": 5.803,
"step": 2000
},
{
"epoch": 0.597597947761194,
"grad_norm": 0.4052446782588959,
"learning_rate": 0.0005931793525809273,
"loss": 4.3384,
"step": 2050
},
{
"epoch": 0.6121735074626866,
"grad_norm": 0.4344984292984009,
"learning_rate": 0.0005930043744531933,
"loss": 4.3264,
"step": 2100
},
{
"epoch": 0.6267490671641791,
"grad_norm": 0.4268837869167328,
"learning_rate": 0.0005928293963254592,
"loss": 4.3188,
"step": 2150
},
{
"epoch": 0.6413246268656716,
"grad_norm": 0.4293970763683319,
"learning_rate": 0.0005926544181977252,
"loss": 4.3085,
"step": 2200
},
{
"epoch": 0.6559001865671642,
"grad_norm": 0.40907061100006104,
"learning_rate": 0.0005924794400699912,
"loss": 4.2966,
"step": 2250
},
{
"epoch": 0.6704757462686567,
"grad_norm": 0.3840656280517578,
"learning_rate": 0.0005923044619422571,
"loss": 4.2739,
"step": 2300
},
{
"epoch": 0.6850513059701493,
"grad_norm": 0.37641018629074097,
"learning_rate": 0.0005921294838145231,
"loss": 4.2741,
"step": 2350
},
{
"epoch": 0.6996268656716418,
"grad_norm": 0.38551846146583557,
"learning_rate": 0.0005919545056867891,
"loss": 4.2682,
"step": 2400
},
{
"epoch": 0.7142024253731343,
"grad_norm": 0.34833261370658875,
"learning_rate": 0.0005917795275590551,
"loss": 4.243,
"step": 2450
},
{
"epoch": 0.7287779850746269,
"grad_norm": 0.4164075255393982,
"learning_rate": 0.000591604549431321,
"loss": 4.2357,
"step": 2500
},
{
"epoch": 0.7433535447761194,
"grad_norm": 0.3375675678253174,
"learning_rate": 0.000591429571303587,
"loss": 4.2287,
"step": 2550
},
{
"epoch": 0.757929104477612,
"grad_norm": 0.3546270430088043,
"learning_rate": 0.000591254593175853,
"loss": 4.2134,
"step": 2600
},
{
"epoch": 0.7725046641791045,
"grad_norm": 0.3893532156944275,
"learning_rate": 0.000591079615048119,
"loss": 4.2053,
"step": 2650
},
{
"epoch": 0.7870802238805971,
"grad_norm": 0.37014102935791016,
"learning_rate": 0.0005909046369203849,
"loss": 4.2145,
"step": 2700
},
{
"epoch": 0.8016557835820896,
"grad_norm": 0.3541073799133301,
"learning_rate": 0.0005907296587926509,
"loss": 4.1914,
"step": 2750
},
{
"epoch": 0.816231343283582,
"grad_norm": 0.43208348751068115,
"learning_rate": 0.0005905546806649169,
"loss": 4.1925,
"step": 2800
},
{
"epoch": 0.8308069029850746,
"grad_norm": 0.36169344186782837,
"learning_rate": 0.0005903797025371829,
"loss": 4.1852,
"step": 2850
},
{
"epoch": 0.8453824626865671,
"grad_norm": 0.40239009261131287,
"learning_rate": 0.0005902047244094488,
"loss": 4.1766,
"step": 2900
},
{
"epoch": 0.8599580223880597,
"grad_norm": 0.38017842173576355,
"learning_rate": 0.0005900297462817148,
"loss": 4.1628,
"step": 2950
},
{
"epoch": 0.8745335820895522,
"grad_norm": 0.34212303161621094,
"learning_rate": 0.0005898547681539808,
"loss": 4.1511,
"step": 3000
},
{
"epoch": 0.8745335820895522,
"eval_accuracy": 0.31430675850010303,
"eval_loss": 4.104100704193115,
"eval_runtime": 179.1747,
"eval_samples_per_second": 92.798,
"eval_steps_per_second": 5.804,
"step": 3000
},
{
"epoch": 0.8891091417910447,
"grad_norm": 0.35792702436447144,
"learning_rate": 0.0005896797900262466,
"loss": 4.1473,
"step": 3050
},
{
"epoch": 0.9036847014925373,
"grad_norm": 0.36840522289276123,
"learning_rate": 0.0005895048118985126,
"loss": 4.1411,
"step": 3100
},
{
"epoch": 0.9182602611940298,
"grad_norm": 0.35885506868362427,
"learning_rate": 0.0005893298337707786,
"loss": 4.1293,
"step": 3150
},
{
"epoch": 0.9328358208955224,
"grad_norm": 0.34642812609672546,
"learning_rate": 0.0005891548556430446,
"loss": 4.1281,
"step": 3200
},
{
"epoch": 0.9474113805970149,
"grad_norm": 0.3401717245578766,
"learning_rate": 0.0005889798775153105,
"loss": 4.1295,
"step": 3250
},
{
"epoch": 0.9619869402985075,
"grad_norm": 0.3363310396671295,
"learning_rate": 0.0005888048993875765,
"loss": 4.1209,
"step": 3300
},
{
"epoch": 0.9765625,
"grad_norm": 0.3683488965034485,
"learning_rate": 0.0005886299212598425,
"loss": 4.1139,
"step": 3350
},
{
"epoch": 0.9911380597014925,
"grad_norm": 0.34876251220703125,
"learning_rate": 0.0005884549431321084,
"loss": 4.1054,
"step": 3400
},
{
"epoch": 1.0055387126865671,
"grad_norm": 0.32942771911621094,
"learning_rate": 0.0005882799650043744,
"loss": 4.0683,
"step": 3450
},
{
"epoch": 1.0201142723880596,
"grad_norm": 0.36168619990348816,
"learning_rate": 0.0005881049868766404,
"loss": 4.0293,
"step": 3500
},
{
"epoch": 1.0346898320895523,
"grad_norm": 0.35932570695877075,
"learning_rate": 0.0005879300087489063,
"loss": 4.02,
"step": 3550
},
{
"epoch": 1.0492653917910448,
"grad_norm": 0.428357869386673,
"learning_rate": 0.0005877550306211723,
"loss": 4.0357,
"step": 3600
},
{
"epoch": 1.0638409514925373,
"grad_norm": 0.35688987374305725,
"learning_rate": 0.0005875800524934383,
"loss": 4.0291,
"step": 3650
},
{
"epoch": 1.0784165111940298,
"grad_norm": 0.3705274164676666,
"learning_rate": 0.0005874050743657042,
"loss": 4.0165,
"step": 3700
},
{
"epoch": 1.0929920708955223,
"grad_norm": 0.3511093556880951,
"learning_rate": 0.0005872300962379702,
"loss": 4.0205,
"step": 3750
},
{
"epoch": 1.107567630597015,
"grad_norm": 0.32943227887153625,
"learning_rate": 0.0005870551181102362,
"loss": 3.9982,
"step": 3800
},
{
"epoch": 1.1221431902985075,
"grad_norm": 0.36079925298690796,
"learning_rate": 0.0005868801399825022,
"loss": 4.0146,
"step": 3850
},
{
"epoch": 1.13671875,
"grad_norm": 0.36787062883377075,
"learning_rate": 0.0005867051618547681,
"loss": 4.0149,
"step": 3900
},
{
"epoch": 1.1512943097014925,
"grad_norm": 0.34740957617759705,
"learning_rate": 0.0005865301837270341,
"loss": 4.0014,
"step": 3950
},
{
"epoch": 1.165869869402985,
"grad_norm": 0.3389481008052826,
"learning_rate": 0.0005863552055993001,
"loss": 4.0075,
"step": 4000
},
{
"epoch": 1.165869869402985,
"eval_accuracy": 0.3235803364649745,
"eval_loss": 4.001465320587158,
"eval_runtime": 179.5462,
"eval_samples_per_second": 92.606,
"eval_steps_per_second": 5.792,
"step": 4000
},
{
"epoch": 1.1804454291044777,
"grad_norm": 0.33885183930397034,
"learning_rate": 0.0005861802274715659,
"loss": 3.9982,
"step": 4050
},
{
"epoch": 1.1950209888059702,
"grad_norm": 0.3423576056957245,
"learning_rate": 0.0005860052493438319,
"loss": 3.9952,
"step": 4100
},
{
"epoch": 1.2095965485074627,
"grad_norm": 0.3361322283744812,
"learning_rate": 0.0005858302712160979,
"loss": 3.9863,
"step": 4150
},
{
"epoch": 1.2241721082089552,
"grad_norm": 0.35886096954345703,
"learning_rate": 0.0005856552930883638,
"loss": 3.9995,
"step": 4200
},
{
"epoch": 1.2387476679104479,
"grad_norm": 0.3898662328720093,
"learning_rate": 0.0005854803149606298,
"loss": 3.9802,
"step": 4250
},
{
"epoch": 1.2533232276119404,
"grad_norm": 0.35210487246513367,
"learning_rate": 0.0005853053368328958,
"loss": 3.9833,
"step": 4300
},
{
"epoch": 1.2678987873134329,
"grad_norm": 0.3297649919986725,
"learning_rate": 0.0005851303587051618,
"loss": 3.979,
"step": 4350
},
{
"epoch": 1.2824743470149254,
"grad_norm": 0.33918461203575134,
"learning_rate": 0.0005849553805774277,
"loss": 3.9715,
"step": 4400
},
{
"epoch": 1.2970499067164178,
"grad_norm": 0.3349836468696594,
"learning_rate": 0.0005847804024496937,
"loss": 3.9705,
"step": 4450
},
{
"epoch": 1.3116254664179103,
"grad_norm": 0.335245817899704,
"learning_rate": 0.0005846054243219597,
"loss": 3.9666,
"step": 4500
},
{
"epoch": 1.326201026119403,
"grad_norm": 0.3286707401275635,
"learning_rate": 0.0005844304461942257,
"loss": 3.9713,
"step": 4550
},
{
"epoch": 1.3407765858208955,
"grad_norm": 0.3266748785972595,
"learning_rate": 0.0005842554680664916,
"loss": 3.9634,
"step": 4600
},
{
"epoch": 1.355352145522388,
"grad_norm": 0.34682974219322205,
"learning_rate": 0.0005840804899387576,
"loss": 3.9671,
"step": 4650
},
{
"epoch": 1.3699277052238805,
"grad_norm": 0.34403321146965027,
"learning_rate": 0.0005839055118110236,
"loss": 3.9689,
"step": 4700
},
{
"epoch": 1.3845032649253732,
"grad_norm": 0.3424343466758728,
"learning_rate": 0.0005837305336832896,
"loss": 3.9673,
"step": 4750
},
{
"epoch": 1.3990788246268657,
"grad_norm": 0.3347780704498291,
"learning_rate": 0.0005835555555555555,
"loss": 3.9669,
"step": 4800
},
{
"epoch": 1.4136543843283582,
"grad_norm": 0.3549891710281372,
"learning_rate": 0.0005833805774278215,
"loss": 3.944,
"step": 4850
},
{
"epoch": 1.4282299440298507,
"grad_norm": 0.336101233959198,
"learning_rate": 0.0005832055993000875,
"loss": 3.9374,
"step": 4900
},
{
"epoch": 1.4428055037313432,
"grad_norm": 0.33941197395324707,
"learning_rate": 0.0005830306211723534,
"loss": 3.9447,
"step": 4950
},
{
"epoch": 1.4573810634328357,
"grad_norm": 0.32939550280570984,
"learning_rate": 0.0005828556430446194,
"loss": 3.9398,
"step": 5000
},
{
"epoch": 1.4573810634328357,
"eval_accuracy": 0.33042841571550857,
"eval_loss": 3.9233782291412354,
"eval_runtime": 179.5532,
"eval_samples_per_second": 92.602,
"eval_steps_per_second": 5.792,
"step": 5000
},
{
"epoch": 1.4719566231343284,
"grad_norm": 0.3391132950782776,
"learning_rate": 0.0005826806649168854,
"loss": 3.9332,
"step": 5050
},
{
"epoch": 1.486532182835821,
"grad_norm": 0.32835477590560913,
"learning_rate": 0.0005825056867891514,
"loss": 3.949,
"step": 5100
},
{
"epoch": 1.5011077425373134,
"grad_norm": 0.33751094341278076,
"learning_rate": 0.0005823307086614172,
"loss": 3.9421,
"step": 5150
},
{
"epoch": 1.515683302238806,
"grad_norm": 0.3318590223789215,
"learning_rate": 0.0005821557305336832,
"loss": 3.9229,
"step": 5200
},
{
"epoch": 1.5302588619402986,
"grad_norm": 0.31536349654197693,
"learning_rate": 0.0005819807524059492,
"loss": 3.9301,
"step": 5250
},
{
"epoch": 1.544834421641791,
"grad_norm": 0.34051841497421265,
"learning_rate": 0.0005818057742782152,
"loss": 3.913,
"step": 5300
},
{
"epoch": 1.5594099813432836,
"grad_norm": 0.32359224557876587,
"learning_rate": 0.0005816307961504811,
"loss": 3.9288,
"step": 5350
},
{
"epoch": 1.573985541044776,
"grad_norm": 0.3286752998828888,
"learning_rate": 0.0005814558180227471,
"loss": 3.923,
"step": 5400
},
{
"epoch": 1.5885611007462686,
"grad_norm": 0.3351253867149353,
"learning_rate": 0.0005812808398950131,
"loss": 3.9175,
"step": 5450
},
{
"epoch": 1.603136660447761,
"grad_norm": 0.3059863746166229,
"learning_rate": 0.0005811058617672791,
"loss": 3.9218,
"step": 5500
},
{
"epoch": 1.6177122201492538,
"grad_norm": 0.3618924617767334,
"learning_rate": 0.000580930883639545,
"loss": 3.9225,
"step": 5550
},
{
"epoch": 1.6322877798507462,
"grad_norm": 0.33408987522125244,
"learning_rate": 0.000580755905511811,
"loss": 3.9018,
"step": 5600
},
{
"epoch": 1.646863339552239,
"grad_norm": 0.331617534160614,
"learning_rate": 0.000580580927384077,
"loss": 3.9062,
"step": 5650
},
{
"epoch": 1.6614388992537314,
"grad_norm": 0.3273450434207916,
"learning_rate": 0.0005804059492563429,
"loss": 3.9078,
"step": 5700
},
{
"epoch": 1.676014458955224,
"grad_norm": 0.32543638348579407,
"learning_rate": 0.0005802309711286089,
"loss": 3.9034,
"step": 5750
},
{
"epoch": 1.6905900186567164,
"grad_norm": 0.3410203158855438,
"learning_rate": 0.0005800559930008749,
"loss": 3.8986,
"step": 5800
},
{
"epoch": 1.705165578358209,
"grad_norm": 0.32851478457450867,
"learning_rate": 0.0005798810148731408,
"loss": 3.9116,
"step": 5850
},
{
"epoch": 1.7197411380597014,
"grad_norm": 0.32616376876831055,
"learning_rate": 0.0005797060367454068,
"loss": 3.9015,
"step": 5900
},
{
"epoch": 1.734316697761194,
"grad_norm": 0.3180261254310608,
"learning_rate": 0.0005795310586176728,
"loss": 3.881,
"step": 5950
},
{
"epoch": 1.7488922574626866,
"grad_norm": 0.31671977043151855,
"learning_rate": 0.0005793560804899387,
"loss": 3.8836,
"step": 6000
},
{
"epoch": 1.7488922574626866,
"eval_accuracy": 0.3354620788082289,
"eval_loss": 3.8672115802764893,
"eval_runtime": 179.177,
"eval_samples_per_second": 92.797,
"eval_steps_per_second": 5.804,
"step": 6000
},
{
"epoch": 1.763467817164179,
"grad_norm": 0.3349725902080536,
"learning_rate": 0.0005791811023622047,
"loss": 3.8863,
"step": 6050
},
{
"epoch": 1.7780433768656716,
"grad_norm": 0.31868061423301697,
"learning_rate": 0.0005790061242344707,
"loss": 3.8875,
"step": 6100
},
{
"epoch": 1.7926189365671643,
"grad_norm": 0.31870362162590027,
"learning_rate": 0.0005788311461067365,
"loss": 3.8957,
"step": 6150
},
{
"epoch": 1.8071944962686568,
"grad_norm": 0.32470956444740295,
"learning_rate": 0.0005786561679790025,
"loss": 3.88,
"step": 6200
},
{
"epoch": 1.8217700559701493,
"grad_norm": 0.31178992986679077,
"learning_rate": 0.0005784811898512685,
"loss": 3.8796,
"step": 6250
},
{
"epoch": 1.8363456156716418,
"grad_norm": 0.34831923246383667,
"learning_rate": 0.0005783062117235344,
"loss": 3.8669,
"step": 6300
},
{
"epoch": 1.8509211753731343,
"grad_norm": 0.32867011427879333,
"learning_rate": 0.0005781312335958004,
"loss": 3.8775,
"step": 6350
},
{
"epoch": 1.8654967350746268,
"grad_norm": 0.34291231632232666,
"learning_rate": 0.0005779562554680664,
"loss": 3.8788,
"step": 6400
},
{
"epoch": 1.8800722947761193,
"grad_norm": 0.32128509879112244,
"learning_rate": 0.0005777812773403324,
"loss": 3.8689,
"step": 6450
},
{
"epoch": 1.894647854477612,
"grad_norm": 0.3260379135608673,
"learning_rate": 0.0005776062992125983,
"loss": 3.8723,
"step": 6500
},
{
"epoch": 1.9092234141791045,
"grad_norm": 0.3153153657913208,
"learning_rate": 0.0005774313210848643,
"loss": 3.8804,
"step": 6550
},
{
"epoch": 1.9237989738805972,
"grad_norm": 0.30965352058410645,
"learning_rate": 0.0005772563429571303,
"loss": 3.8792,
"step": 6600
},
{
"epoch": 1.9383745335820897,
"grad_norm": 0.3271130919456482,
"learning_rate": 0.0005770813648293962,
"loss": 3.8668,
"step": 6650
},
{
"epoch": 1.9529500932835822,
"grad_norm": 0.30338090658187866,
"learning_rate": 0.0005769063867016622,
"loss": 3.8572,
"step": 6700
},
{
"epoch": 1.9675256529850746,
"grad_norm": 0.3297083377838135,
"learning_rate": 0.0005767314085739282,
"loss": 3.8625,
"step": 6750
},
{
"epoch": 1.9821012126865671,
"grad_norm": 0.3095833957195282,
"learning_rate": 0.0005765564304461942,
"loss": 3.86,
"step": 6800
},
{
"epoch": 1.9966767723880596,
"grad_norm": 0.29658767580986023,
"learning_rate": 0.0005763814523184601,
"loss": 3.8555,
"step": 6850
},
{
"epoch": 2.0110774253731343,
"grad_norm": 0.3722371459007263,
"learning_rate": 0.0005762064741907261,
"loss": 3.7759,
"step": 6900
},
{
"epoch": 2.0256529850746268,
"grad_norm": 0.31017979979515076,
"learning_rate": 0.0005760314960629921,
"loss": 3.7559,
"step": 6950
},
{
"epoch": 2.0402285447761193,
"grad_norm": 0.3352357745170593,
"learning_rate": 0.0005758565179352581,
"loss": 3.7667,
"step": 7000
},
{
"epoch": 2.0402285447761193,
"eval_accuracy": 0.33953509940743115,
"eval_loss": 3.8253839015960693,
"eval_runtime": 179.2981,
"eval_samples_per_second": 92.734,
"eval_steps_per_second": 5.8,
"step": 7000
},
{
"epoch": 2.0548041044776117,
"grad_norm": 0.3400469422340393,
"learning_rate": 0.000575681539807524,
"loss": 3.7712,
"step": 7050
},
{
"epoch": 2.0693796641791047,
"grad_norm": 0.3268464505672455,
"learning_rate": 0.00057550656167979,
"loss": 3.7554,
"step": 7100
},
{
"epoch": 2.083955223880597,
"grad_norm": 0.3046896159648895,
"learning_rate": 0.000575331583552056,
"loss": 3.7594,
"step": 7150
},
{
"epoch": 2.0985307835820897,
"grad_norm": 0.3351455628871918,
"learning_rate": 0.000575156605424322,
"loss": 3.7651,
"step": 7200
},
{
"epoch": 2.113106343283582,
"grad_norm": 0.3188965618610382,
"learning_rate": 0.0005749816272965878,
"loss": 3.7658,
"step": 7250
},
{
"epoch": 2.1276819029850746,
"grad_norm": 0.34054598212242126,
"learning_rate": 0.0005748066491688538,
"loss": 3.7516,
"step": 7300
},
{
"epoch": 2.142257462686567,
"grad_norm": 0.31955304741859436,
"learning_rate": 0.0005746316710411198,
"loss": 3.7661,
"step": 7350
},
{
"epoch": 2.1568330223880596,
"grad_norm": 0.32968559861183167,
"learning_rate": 0.0005744566929133858,
"loss": 3.7657,
"step": 7400
},
{
"epoch": 2.171408582089552,
"grad_norm": 0.3298960030078888,
"learning_rate": 0.0005742817147856517,
"loss": 3.7712,
"step": 7450
},
{
"epoch": 2.1859841417910446,
"grad_norm": 0.332374632358551,
"learning_rate": 0.0005741067366579177,
"loss": 3.7827,
"step": 7500
},
{
"epoch": 2.200559701492537,
"grad_norm": 0.3308345377445221,
"learning_rate": 0.0005739317585301837,
"loss": 3.7748,
"step": 7550
},
{
"epoch": 2.21513526119403,
"grad_norm": 0.3271458148956299,
"learning_rate": 0.0005737567804024496,
"loss": 3.7587,
"step": 7600
},
{
"epoch": 2.2297108208955225,
"grad_norm": 0.3361659348011017,
"learning_rate": 0.0005735818022747156,
"loss": 3.7688,
"step": 7650
},
{
"epoch": 2.244286380597015,
"grad_norm": 0.3123524785041809,
"learning_rate": 0.0005734068241469816,
"loss": 3.7575,
"step": 7700
},
{
"epoch": 2.2588619402985075,
"grad_norm": 0.32984843850135803,
"learning_rate": 0.0005732318460192476,
"loss": 3.7734,
"step": 7750
},
{
"epoch": 2.2734375,
"grad_norm": 0.3111555874347687,
"learning_rate": 0.0005730568678915135,
"loss": 3.7567,
"step": 7800
},
{
"epoch": 2.2880130597014925,
"grad_norm": 0.3495906591415405,
"learning_rate": 0.0005728818897637795,
"loss": 3.7766,
"step": 7850
},
{
"epoch": 2.302588619402985,
"grad_norm": 0.3339783549308777,
"learning_rate": 0.0005727069116360455,
"loss": 3.7592,
"step": 7900
},
{
"epoch": 2.3171641791044775,
"grad_norm": 0.3110646605491638,
"learning_rate": 0.0005725319335083115,
"loss": 3.7721,
"step": 7950
},
{
"epoch": 2.33173973880597,
"grad_norm": 0.3329947292804718,
"learning_rate": 0.0005723569553805774,
"loss": 3.7647,
"step": 8000
},
{
"epoch": 2.33173973880597,
"eval_accuracy": 0.3429995090860279,
"eval_loss": 3.794593334197998,
"eval_runtime": 179.1534,
"eval_samples_per_second": 92.809,
"eval_steps_per_second": 5.805,
"step": 8000
},
{
"epoch": 2.346315298507463,
"grad_norm": 0.31891825795173645,
"learning_rate": 0.0005721819772528434,
"loss": 3.7522,
"step": 8050
},
{
"epoch": 2.3608908582089554,
"grad_norm": 0.3220847547054291,
"learning_rate": 0.0005720069991251094,
"loss": 3.7619,
"step": 8100
},
{
"epoch": 2.375466417910448,
"grad_norm": 0.3521746098995209,
"learning_rate": 0.0005718320209973753,
"loss": 3.762,
"step": 8150
},
{
"epoch": 2.3900419776119404,
"grad_norm": 0.3538142442703247,
"learning_rate": 0.0005716570428696413,
"loss": 3.7666,
"step": 8200
},
{
"epoch": 2.404617537313433,
"grad_norm": 0.31546488404273987,
"learning_rate": 0.0005714820647419073,
"loss": 3.7655,
"step": 8250
},
{
"epoch": 2.4191930970149254,
"grad_norm": 0.32537829875946045,
"learning_rate": 0.0005713070866141731,
"loss": 3.7601,
"step": 8300
},
{
"epoch": 2.433768656716418,
"grad_norm": 0.3402611017227173,
"learning_rate": 0.0005711321084864391,
"loss": 3.7643,
"step": 8350
},
{
"epoch": 2.4483442164179103,
"grad_norm": 0.32665756344795227,
"learning_rate": 0.0005709571303587051,
"loss": 3.7538,
"step": 8400
},
{
"epoch": 2.462919776119403,
"grad_norm": 0.3143838047981262,
"learning_rate": 0.000570782152230971,
"loss": 3.7657,
"step": 8450
},
{
"epoch": 2.4774953358208958,
"grad_norm": 0.3173374533653259,
"learning_rate": 0.000570607174103237,
"loss": 3.7716,
"step": 8500
},
{
"epoch": 2.4920708955223883,
"grad_norm": 0.3324846625328064,
"learning_rate": 0.000570432195975503,
"loss": 3.7515,
"step": 8550
},
{
"epoch": 2.5066464552238807,
"grad_norm": 0.31547409296035767,
"learning_rate": 0.0005702572178477689,
"loss": 3.7565,
"step": 8600
},
{
"epoch": 2.5212220149253732,
"grad_norm": 0.32761460542678833,
"learning_rate": 0.0005700822397200349,
"loss": 3.7614,
"step": 8650
},
{
"epoch": 2.5357975746268657,
"grad_norm": 0.3202444314956665,
"learning_rate": 0.0005699072615923009,
"loss": 3.7589,
"step": 8700
},
{
"epoch": 2.550373134328358,
"grad_norm": 0.3207978308200836,
"learning_rate": 0.0005697322834645668,
"loss": 3.7514,
"step": 8750
},
{
"epoch": 2.5649486940298507,
"grad_norm": 0.31655776500701904,
"learning_rate": 0.0005695573053368328,
"loss": 3.76,
"step": 8800
},
{
"epoch": 2.579524253731343,
"grad_norm": 0.31503432989120483,
"learning_rate": 0.0005693823272090988,
"loss": 3.748,
"step": 8850
},
{
"epoch": 2.5940998134328357,
"grad_norm": 0.32607924938201904,
"learning_rate": 0.0005692073490813648,
"loss": 3.7602,
"step": 8900
},
{
"epoch": 2.6086753731343286,
"grad_norm": 0.30219122767448425,
"learning_rate": 0.0005690323709536307,
"loss": 3.7443,
"step": 8950
},
{
"epoch": 2.6232509328358207,
"grad_norm": 0.3029521107673645,
"learning_rate": 0.0005688573928258967,
"loss": 3.7476,
"step": 9000
},
{
"epoch": 2.6232509328358207,
"eval_accuracy": 0.34559260825500504,
"eval_loss": 3.7627193927764893,
"eval_runtime": 179.6396,
"eval_samples_per_second": 92.558,
"eval_steps_per_second": 5.789,
"step": 9000
},
{
"epoch": 2.6378264925373136,
"grad_norm": 0.3177446722984314,
"learning_rate": 0.0005686824146981627,
"loss": 3.7555,
"step": 9050
},
{
"epoch": 2.652402052238806,
"grad_norm": 0.3198622763156891,
"learning_rate": 0.0005685074365704287,
"loss": 3.7526,
"step": 9100
},
{
"epoch": 2.6669776119402986,
"grad_norm": 0.3512043058872223,
"learning_rate": 0.0005683324584426946,
"loss": 3.735,
"step": 9150
},
{
"epoch": 2.681553171641791,
"grad_norm": 0.32351306080818176,
"learning_rate": 0.0005681574803149606,
"loss": 3.7329,
"step": 9200
},
{
"epoch": 2.6961287313432836,
"grad_norm": 0.32276666164398193,
"learning_rate": 0.0005679825021872266,
"loss": 3.7585,
"step": 9250
},
{
"epoch": 2.710704291044776,
"grad_norm": 0.31967878341674805,
"learning_rate": 0.0005678075240594926,
"loss": 3.7465,
"step": 9300
},
{
"epoch": 2.7252798507462686,
"grad_norm": 0.31496620178222656,
"learning_rate": 0.0005676325459317584,
"loss": 3.7507,
"step": 9350
},
{
"epoch": 2.739855410447761,
"grad_norm": 0.31688955426216125,
"learning_rate": 0.0005674575678040244,
"loss": 3.7486,
"step": 9400
},
{
"epoch": 2.7544309701492535,
"grad_norm": 0.300443559885025,
"learning_rate": 0.0005672825896762904,
"loss": 3.7405,
"step": 9450
},
{
"epoch": 2.7690065298507465,
"grad_norm": 0.3132357597351074,
"learning_rate": 0.0005671076115485563,
"loss": 3.7294,
"step": 9500
},
{
"epoch": 2.783582089552239,
"grad_norm": 0.3266209363937378,
"learning_rate": 0.0005669326334208223,
"loss": 3.746,
"step": 9550
},
{
"epoch": 2.7981576492537314,
"grad_norm": 0.3130579888820648,
"learning_rate": 0.0005667576552930883,
"loss": 3.7399,
"step": 9600
},
{
"epoch": 2.812733208955224,
"grad_norm": 0.32442179322242737,
"learning_rate": 0.0005665826771653543,
"loss": 3.7519,
"step": 9650
},
{
"epoch": 2.8273087686567164,
"grad_norm": 0.3083656430244446,
"learning_rate": 0.0005664076990376202,
"loss": 3.7361,
"step": 9700
},
{
"epoch": 2.841884328358209,
"grad_norm": 0.3123016655445099,
"learning_rate": 0.0005662327209098862,
"loss": 3.7319,
"step": 9750
},
{
"epoch": 2.8564598880597014,
"grad_norm": 0.30334439873695374,
"learning_rate": 0.0005660577427821522,
"loss": 3.7272,
"step": 9800
},
{
"epoch": 2.871035447761194,
"grad_norm": 0.3039874732494354,
"learning_rate": 0.0005658827646544182,
"loss": 3.7239,
"step": 9850
},
{
"epoch": 2.8856110074626864,
"grad_norm": 0.330578088760376,
"learning_rate": 0.0005657077865266841,
"loss": 3.7356,
"step": 9900
},
{
"epoch": 2.9001865671641793,
"grad_norm": 0.34357017278671265,
"learning_rate": 0.0005655328083989501,
"loss": 3.7436,
"step": 9950
},
{
"epoch": 2.9147621268656714,
"grad_norm": 0.3257700800895691,
"learning_rate": 0.0005653578302712161,
"loss": 3.7253,
"step": 10000
},
{
"epoch": 2.9147621268656714,
"eval_accuracy": 0.34803717387499666,
"eval_loss": 3.739212989807129,
"eval_runtime": 179.5993,
"eval_samples_per_second": 92.578,
"eval_steps_per_second": 5.791,
"step": 10000
},
{
"epoch": 2.9293376865671643,
"grad_norm": 0.3030923902988434,
"learning_rate": 0.0005651828521434821,
"loss": 3.7311,
"step": 10050
},
{
"epoch": 2.943913246268657,
"grad_norm": 0.3330658972263336,
"learning_rate": 0.000565007874015748,
"loss": 3.7252,
"step": 10100
},
{
"epoch": 2.9584888059701493,
"grad_norm": 0.30661967396736145,
"learning_rate": 0.000564832895888014,
"loss": 3.7154,
"step": 10150
},
{
"epoch": 2.973064365671642,
"grad_norm": 0.3306799829006195,
"learning_rate": 0.00056465791776028,
"loss": 3.7316,
"step": 10200
},
{
"epoch": 2.9876399253731343,
"grad_norm": 0.3185974359512329,
"learning_rate": 0.0005644829396325459,
"loss": 3.7361,
"step": 10250
},
{
"epoch": 3.002040578358209,
"grad_norm": 0.33761319518089294,
"learning_rate": 0.0005643079615048119,
"loss": 3.71,
"step": 10300
},
{
"epoch": 3.0166161380597014,
"grad_norm": 0.31549760699272156,
"learning_rate": 0.0005641329833770779,
"loss": 3.6183,
"step": 10350
},
{
"epoch": 3.031191697761194,
"grad_norm": 0.32368534803390503,
"learning_rate": 0.0005639580052493437,
"loss": 3.6255,
"step": 10400
},
{
"epoch": 3.0457672574626864,
"grad_norm": 0.30971264839172363,
"learning_rate": 0.0005637830271216097,
"loss": 3.6278,
"step": 10450
},
{
"epoch": 3.0603428171641793,
"grad_norm": 0.33048370480537415,
"learning_rate": 0.0005636080489938757,
"loss": 3.6361,
"step": 10500
},
{
"epoch": 3.074918376865672,
"grad_norm": 0.31469184160232544,
"learning_rate": 0.0005634330708661417,
"loss": 3.6288,
"step": 10550
},
{
"epoch": 3.0894939365671643,
"grad_norm": 0.3115905225276947,
"learning_rate": 0.0005632580927384076,
"loss": 3.6349,
"step": 10600
},
{
"epoch": 3.104069496268657,
"grad_norm": 0.321036159992218,
"learning_rate": 0.0005630831146106736,
"loss": 3.6364,
"step": 10650
},
{
"epoch": 3.1186450559701493,
"grad_norm": 0.30595359206199646,
"learning_rate": 0.0005629081364829396,
"loss": 3.6337,
"step": 10700
},
{
"epoch": 3.133220615671642,
"grad_norm": 0.3319248557090759,
"learning_rate": 0.0005627331583552055,
"loss": 3.6406,
"step": 10750
},
{
"epoch": 3.1477961753731343,
"grad_norm": 0.3592631220817566,
"learning_rate": 0.0005625581802274715,
"loss": 3.6457,
"step": 10800
},
{
"epoch": 3.1623717350746268,
"grad_norm": 0.30832141637802124,
"learning_rate": 0.0005623832020997375,
"loss": 3.6465,
"step": 10850
},
{
"epoch": 3.1769472947761193,
"grad_norm": 0.30971667170524597,
"learning_rate": 0.0005622082239720034,
"loss": 3.6442,
"step": 10900
},
{
"epoch": 3.1915228544776117,
"grad_norm": 0.32361266016960144,
"learning_rate": 0.0005620332458442694,
"loss": 3.6469,
"step": 10950
},
{
"epoch": 3.2060984141791047,
"grad_norm": 0.3234313726425171,
"learning_rate": 0.0005618582677165354,
"loss": 3.6522,
"step": 11000
},
{
"epoch": 3.2060984141791047,
"eval_accuracy": 0.34968304800258276,
"eval_loss": 3.725311756134033,
"eval_runtime": 179.4792,
"eval_samples_per_second": 92.64,
"eval_steps_per_second": 5.795,
"step": 11000
},
{
"epoch": 3.220673973880597,
"grad_norm": 0.32695841789245605,
"learning_rate": 0.0005616832895888013,
"loss": 3.6504,
"step": 11050
},
{
"epoch": 3.2352495335820897,
"grad_norm": 0.31431734561920166,
"learning_rate": 0.0005615083114610673,
"loss": 3.6456,
"step": 11100
},
{
"epoch": 3.249825093283582,
"grad_norm": 0.3156610131263733,
"learning_rate": 0.0005613333333333333,
"loss": 3.6549,
"step": 11150
},
{
"epoch": 3.2644006529850746,
"grad_norm": 0.31469911336898804,
"learning_rate": 0.0005611583552055992,
"loss": 3.6604,
"step": 11200
},
{
"epoch": 3.278976212686567,
"grad_norm": 0.32614484429359436,
"learning_rate": 0.0005609833770778652,
"loss": 3.6538,
"step": 11250
},
{
"epoch": 3.2935517723880596,
"grad_norm": 0.3248981535434723,
"learning_rate": 0.0005608083989501312,
"loss": 3.6495,
"step": 11300
},
{
"epoch": 3.308127332089552,
"grad_norm": 0.31083929538726807,
"learning_rate": 0.0005606334208223972,
"loss": 3.6475,
"step": 11350
},
{
"epoch": 3.3227028917910446,
"grad_norm": 0.2996702492237091,
"learning_rate": 0.000560458442694663,
"loss": 3.6517,
"step": 11400
},
{
"epoch": 3.337278451492537,
"grad_norm": 0.31601446866989136,
"learning_rate": 0.000560283464566929,
"loss": 3.651,
"step": 11450
},
{
"epoch": 3.35185401119403,
"grad_norm": 0.30847907066345215,
"learning_rate": 0.000560108486439195,
"loss": 3.6554,
"step": 11500
},
{
"epoch": 3.3664295708955225,
"grad_norm": 0.3323810398578644,
"learning_rate": 0.000559933508311461,
"loss": 3.6488,
"step": 11550
},
{
"epoch": 3.381005130597015,
"grad_norm": 0.3122812807559967,
"learning_rate": 0.0005597585301837269,
"loss": 3.6552,
"step": 11600
},
{
"epoch": 3.3955806902985075,
"grad_norm": 0.3094134032726288,
"learning_rate": 0.0005595835520559929,
"loss": 3.6454,
"step": 11650
},
{
"epoch": 3.41015625,
"grad_norm": 0.2967991828918457,
"learning_rate": 0.0005594085739282589,
"loss": 3.672,
"step": 11700
},
{
"epoch": 3.4247318097014925,
"grad_norm": 0.31317925453186035,
"learning_rate": 0.0005592335958005249,
"loss": 3.6597,
"step": 11750
},
{
"epoch": 3.439307369402985,
"grad_norm": 0.30682000517845154,
"learning_rate": 0.0005590586176727908,
"loss": 3.6456,
"step": 11800
},
{
"epoch": 3.4538829291044775,
"grad_norm": 0.31732603907585144,
"learning_rate": 0.0005588836395450568,
"loss": 3.6532,
"step": 11850
},
{
"epoch": 3.46845848880597,
"grad_norm": 0.3260912001132965,
"learning_rate": 0.0005587086614173228,
"loss": 3.6662,
"step": 11900
},
{
"epoch": 3.483034048507463,
"grad_norm": 0.3180249333381653,
"learning_rate": 0.0005585336832895888,
"loss": 3.6511,
"step": 11950
},
{
"epoch": 3.4976096082089554,
"grad_norm": 0.31996291875839233,
"learning_rate": 0.0005583587051618547,
"loss": 3.66,
"step": 12000
},
{
"epoch": 3.4976096082089554,
"eval_accuracy": 0.3516450561337941,
"eval_loss": 3.7062184810638428,
"eval_runtime": 180.5583,
"eval_samples_per_second": 92.087,
"eval_steps_per_second": 5.76,
"step": 12000
},
{
"epoch": 3.512185167910448,
"grad_norm": 0.313018262386322,
"learning_rate": 0.0005581837270341207,
"loss": 3.6495,
"step": 12050
},
{
"epoch": 3.5267607276119404,
"grad_norm": 0.31375324726104736,
"learning_rate": 0.0005580087489063867,
"loss": 3.6595,
"step": 12100
},
{
"epoch": 3.541336287313433,
"grad_norm": 0.32115986943244934,
"learning_rate": 0.0005578337707786526,
"loss": 3.6507,
"step": 12150
},
{
"epoch": 3.5559118470149254,
"grad_norm": 0.32580527663230896,
"learning_rate": 0.0005576587926509186,
"loss": 3.6527,
"step": 12200
},
{
"epoch": 3.570487406716418,
"grad_norm": 0.2964605689048767,
"learning_rate": 0.0005574838145231846,
"loss": 3.657,
"step": 12250
},
{
"epoch": 3.5850629664179103,
"grad_norm": 0.31582698225975037,
"learning_rate": 0.0005573088363954506,
"loss": 3.6419,
"step": 12300
},
{
"epoch": 3.599638526119403,
"grad_norm": 0.3034908175468445,
"learning_rate": 0.0005571338582677165,
"loss": 3.6547,
"step": 12350
},
{
"epoch": 3.6142140858208958,
"grad_norm": 0.3223356008529663,
"learning_rate": 0.0005569588801399825,
"loss": 3.6425,
"step": 12400
},
{
"epoch": 3.628789645522388,
"grad_norm": 0.3192656338214874,
"learning_rate": 0.0005567839020122485,
"loss": 3.6651,
"step": 12450
},
{
"epoch": 3.6433652052238807,
"grad_norm": 0.3013507127761841,
"learning_rate": 0.0005566089238845145,
"loss": 3.6463,
"step": 12500
},
{
"epoch": 3.6579407649253732,
"grad_norm": 0.30855220556259155,
"learning_rate": 0.0005564339457567803,
"loss": 3.6542,
"step": 12550
},
{
"epoch": 3.6725163246268657,
"grad_norm": 0.30979233980178833,
"learning_rate": 0.0005562589676290463,
"loss": 3.6514,
"step": 12600
},
{
"epoch": 3.687091884328358,
"grad_norm": 0.30794402956962585,
"learning_rate": 0.0005560839895013123,
"loss": 3.6469,
"step": 12650
},
{
"epoch": 3.7016674440298507,
"grad_norm": 0.30499374866485596,
"learning_rate": 0.0005559090113735782,
"loss": 3.6526,
"step": 12700
},
{
"epoch": 3.716243003731343,
"grad_norm": 0.31229689717292786,
"learning_rate": 0.0005557340332458442,
"loss": 3.6539,
"step": 12750
},
{
"epoch": 3.7308185634328357,
"grad_norm": 0.3160341680049896,
"learning_rate": 0.0005555590551181102,
"loss": 3.6505,
"step": 12800
},
{
"epoch": 3.7453941231343286,
"grad_norm": 0.3231920599937439,
"learning_rate": 0.0005553840769903761,
"loss": 3.6575,
"step": 12850
},
{
"epoch": 3.7599696828358207,
"grad_norm": 0.32196515798568726,
"learning_rate": 0.0005552090988626421,
"loss": 3.6436,
"step": 12900
},
{
"epoch": 3.7745452425373136,
"grad_norm": 0.3200373947620392,
"learning_rate": 0.0005550341207349081,
"loss": 3.6538,
"step": 12950
},
{
"epoch": 3.789120802238806,
"grad_norm": 0.3132128417491913,
"learning_rate": 0.000554859142607174,
"loss": 3.6552,
"step": 13000
},
{
"epoch": 3.789120802238806,
"eval_accuracy": 0.3530933170848773,
"eval_loss": 3.689838409423828,
"eval_runtime": 184.45,
"eval_samples_per_second": 90.144,
"eval_steps_per_second": 5.638,
"step": 13000
},
{
"epoch": 3.8036963619402986,
"grad_norm": 0.31602323055267334,
"learning_rate": 0.00055468416447944,
"loss": 3.6588,
"step": 13050
},
{
"epoch": 3.818271921641791,
"grad_norm": 0.3273375332355499,
"learning_rate": 0.000554509186351706,
"loss": 3.6435,
"step": 13100
},
{
"epoch": 3.8328474813432836,
"grad_norm": 0.3076334297657013,
"learning_rate": 0.000554334208223972,
"loss": 3.6514,
"step": 13150
},
{
"epoch": 3.847423041044776,
"grad_norm": 0.3193327486515045,
"learning_rate": 0.0005541592300962379,
"loss": 3.6496,
"step": 13200
},
{
"epoch": 3.8619986007462686,
"grad_norm": 0.2980865240097046,
"learning_rate": 0.0005539842519685039,
"loss": 3.6406,
"step": 13250
},
{
"epoch": 3.876574160447761,
"grad_norm": 0.30861976742744446,
"learning_rate": 0.0005538092738407699,
"loss": 3.6524,
"step": 13300
},
{
"epoch": 3.8911497201492535,
"grad_norm": 0.31307452917099,
"learning_rate": 0.0005536342957130358,
"loss": 3.6432,
"step": 13350
},
{
"epoch": 3.9057252798507465,
"grad_norm": 0.30774036049842834,
"learning_rate": 0.0005534593175853018,
"loss": 3.6442,
"step": 13400
},
{
"epoch": 3.920300839552239,
"grad_norm": 0.3179526925086975,
"learning_rate": 0.0005532843394575678,
"loss": 3.6473,
"step": 13450
},
{
"epoch": 3.9348763992537314,
"grad_norm": 0.3184812068939209,
"learning_rate": 0.0005531093613298337,
"loss": 3.6276,
"step": 13500
},
{
"epoch": 3.949451958955224,
"grad_norm": 0.33337730169296265,
"learning_rate": 0.0005529343832020997,
"loss": 3.6487,
"step": 13550
},
{
"epoch": 3.9640275186567164,
"grad_norm": 0.30484265089035034,
"learning_rate": 0.0005527594050743656,
"loss": 3.6475,
"step": 13600
},
{
"epoch": 3.978603078358209,
"grad_norm": 0.31063055992126465,
"learning_rate": 0.0005525844269466316,
"loss": 3.6491,
"step": 13650
},
{
"epoch": 3.9931786380597014,
"grad_norm": 0.3012693226337433,
"learning_rate": 0.0005524094488188975,
"loss": 3.6502,
"step": 13700
},
{
"epoch": 4.0075792910447765,
"grad_norm": 0.29927513003349304,
"learning_rate": 0.0005522344706911635,
"loss": 3.5919,
"step": 13750
},
{
"epoch": 4.0221548507462686,
"grad_norm": 0.325898140668869,
"learning_rate": 0.0005520594925634295,
"loss": 3.5521,
"step": 13800
},
{
"epoch": 4.0367304104477615,
"grad_norm": 0.3279491662979126,
"learning_rate": 0.0005518845144356954,
"loss": 3.5461,
"step": 13850
},
{
"epoch": 4.0513059701492535,
"grad_norm": 0.32043692469596863,
"learning_rate": 0.0005517095363079614,
"loss": 3.5494,
"step": 13900
},
{
"epoch": 4.0658815298507465,
"grad_norm": 0.3268143832683563,
"learning_rate": 0.0005515345581802274,
"loss": 3.5446,
"step": 13950
},
{
"epoch": 4.0804570895522385,
"grad_norm": 0.2917918264865875,
"learning_rate": 0.0005513595800524934,
"loss": 3.5443,
"step": 14000
},
{
"epoch": 4.0804570895522385,
"eval_accuracy": 0.35479980514093207,
"eval_loss": 3.680328369140625,
"eval_runtime": 180.7539,
"eval_samples_per_second": 91.987,
"eval_steps_per_second": 5.754,
"step": 14000
},
{
"epoch": 4.0950326492537314,
"grad_norm": 0.3293069303035736,
"learning_rate": 0.0005511846019247593,
"loss": 3.553,
"step": 14050
},
{
"epoch": 4.1096082089552235,
"grad_norm": 0.31176093220710754,
"learning_rate": 0.0005510096237970253,
"loss": 3.5489,
"step": 14100
},
{
"epoch": 4.124183768656716,
"grad_norm": 0.3254378139972687,
"learning_rate": 0.0005508346456692913,
"loss": 3.5478,
"step": 14150
},
{
"epoch": 4.138759328358209,
"grad_norm": 0.32653385400772095,
"learning_rate": 0.0005506596675415573,
"loss": 3.5658,
"step": 14200
},
{
"epoch": 4.153334888059701,
"grad_norm": 0.3056170344352722,
"learning_rate": 0.0005504846894138232,
"loss": 3.5698,
"step": 14250
},
{
"epoch": 4.167910447761194,
"grad_norm": 0.3116225004196167,
"learning_rate": 0.0005503097112860892,
"loss": 3.562,
"step": 14300
},
{
"epoch": 4.182486007462686,
"grad_norm": 0.3115183413028717,
"learning_rate": 0.0005501347331583552,
"loss": 3.5626,
"step": 14350
},
{
"epoch": 4.197061567164179,
"grad_norm": 0.332135409116745,
"learning_rate": 0.0005499597550306212,
"loss": 3.5885,
"step": 14400
},
{
"epoch": 4.211637126865671,
"grad_norm": 0.30620288848876953,
"learning_rate": 0.0005497847769028871,
"loss": 3.5675,
"step": 14450
},
{
"epoch": 4.226212686567164,
"grad_norm": 0.3426160216331482,
"learning_rate": 0.0005496097987751531,
"loss": 3.5712,
"step": 14500
},
{
"epoch": 4.240788246268656,
"grad_norm": 0.3171519339084625,
"learning_rate": 0.0005494348206474191,
"loss": 3.5806,
"step": 14550
},
{
"epoch": 4.255363805970149,
"grad_norm": 0.30158156156539917,
"learning_rate": 0.0005492598425196851,
"loss": 3.5669,
"step": 14600
},
{
"epoch": 4.269939365671641,
"grad_norm": 0.31135430932044983,
"learning_rate": 0.000549084864391951,
"loss": 3.5638,
"step": 14650
},
{
"epoch": 4.284514925373134,
"grad_norm": 0.32262277603149414,
"learning_rate": 0.000548909886264217,
"loss": 3.5744,
"step": 14700
},
{
"epoch": 4.299090485074627,
"grad_norm": 0.33498984575271606,
"learning_rate": 0.000548734908136483,
"loss": 3.5667,
"step": 14750
},
{
"epoch": 4.313666044776119,
"grad_norm": 0.31680381298065186,
"learning_rate": 0.0005485599300087488,
"loss": 3.5725,
"step": 14800
},
{
"epoch": 4.328241604477612,
"grad_norm": 0.330207496881485,
"learning_rate": 0.0005483849518810148,
"loss": 3.5772,
"step": 14850
},
{
"epoch": 4.342817164179104,
"grad_norm": 0.31242871284484863,
"learning_rate": 0.0005482099737532808,
"loss": 3.5764,
"step": 14900
},
{
"epoch": 4.357392723880597,
"grad_norm": 0.30855998396873474,
"learning_rate": 0.0005480349956255468,
"loss": 3.5655,
"step": 14950
},
{
"epoch": 4.371968283582089,
"grad_norm": 0.3287501335144043,
"learning_rate": 0.0005478600174978127,
"loss": 3.5694,
"step": 15000
},
{
"epoch": 4.371968283582089,
"eval_accuracy": 0.3560954131498328,
"eval_loss": 3.6704261302948,
"eval_runtime": 180.8213,
"eval_samples_per_second": 91.953,
"eval_steps_per_second": 5.752,
"step": 15000
},
{
"epoch": 4.386543843283582,
"grad_norm": 0.3167351484298706,
"learning_rate": 0.0005476850393700787,
"loss": 3.5763,
"step": 15050
},
{
"epoch": 4.401119402985074,
"grad_norm": 0.31553417444229126,
"learning_rate": 0.0005475100612423447,
"loss": 3.5838,
"step": 15100
},
{
"epoch": 4.415694962686567,
"grad_norm": 0.3282777667045593,
"learning_rate": 0.0005473350831146106,
"loss": 3.585,
"step": 15150
},
{
"epoch": 4.43027052238806,
"grad_norm": 0.3105928897857666,
"learning_rate": 0.0005471601049868766,
"loss": 3.5828,
"step": 15200
},
{
"epoch": 4.444846082089552,
"grad_norm": 0.2984945476055145,
"learning_rate": 0.0005469851268591426,
"loss": 3.5808,
"step": 15250
},
{
"epoch": 4.459421641791045,
"grad_norm": 0.30299872159957886,
"learning_rate": 0.0005468101487314085,
"loss": 3.582,
"step": 15300
},
{
"epoch": 4.473997201492537,
"grad_norm": 0.3066897392272949,
"learning_rate": 0.0005466351706036745,
"loss": 3.5935,
"step": 15350
},
{
"epoch": 4.48857276119403,
"grad_norm": 0.30703720450401306,
"learning_rate": 0.0005464601924759405,
"loss": 3.5741,
"step": 15400
},
{
"epoch": 4.503148320895522,
"grad_norm": 0.31966084241867065,
"learning_rate": 0.0005462852143482064,
"loss": 3.5844,
"step": 15450
},
{
"epoch": 4.517723880597015,
"grad_norm": 0.3157017230987549,
"learning_rate": 0.0005461102362204724,
"loss": 3.5834,
"step": 15500
},
{
"epoch": 4.532299440298507,
"grad_norm": 0.31914374232292175,
"learning_rate": 0.0005459352580927384,
"loss": 3.5824,
"step": 15550
},
{
"epoch": 4.546875,
"grad_norm": 0.3196133077144623,
"learning_rate": 0.0005457602799650043,
"loss": 3.5838,
"step": 15600
},
{
"epoch": 4.561450559701493,
"grad_norm": 0.3134230971336365,
"learning_rate": 0.0005455853018372703,
"loss": 3.5862,
"step": 15650
},
{
"epoch": 4.576026119402985,
"grad_norm": 0.2948553264141083,
"learning_rate": 0.0005454103237095363,
"loss": 3.5771,
"step": 15700
},
{
"epoch": 4.590601679104478,
"grad_norm": 0.31631046533584595,
"learning_rate": 0.0005452353455818022,
"loss": 3.6015,
"step": 15750
},
{
"epoch": 4.60517723880597,
"grad_norm": 0.3305026590824127,
"learning_rate": 0.0005450603674540681,
"loss": 3.5892,
"step": 15800
},
{
"epoch": 4.619752798507463,
"grad_norm": 0.3080248534679413,
"learning_rate": 0.0005448853893263341,
"loss": 3.5883,
"step": 15850
},
{
"epoch": 4.634328358208955,
"grad_norm": 0.317914754152298,
"learning_rate": 0.0005447104111986001,
"loss": 3.5963,
"step": 15900
},
{
"epoch": 4.648903917910448,
"grad_norm": 0.30993831157684326,
"learning_rate": 0.000544535433070866,
"loss": 3.5872,
"step": 15950
},
{
"epoch": 4.66347947761194,
"grad_norm": 0.32297852635383606,
"learning_rate": 0.000544360454943132,
"loss": 3.5998,
"step": 16000
},
{
"epoch": 4.66347947761194,
"eval_accuracy": 0.35684573119641183,
"eval_loss": 3.659557819366455,
"eval_runtime": 197.5912,
"eval_samples_per_second": 84.148,
"eval_steps_per_second": 5.263,
"step": 16000
},
{
"epoch": 4.678055037313433,
"grad_norm": 0.3044687509536743,
"learning_rate": 0.000544185476815398,
"loss": 3.5989,
"step": 16050
},
{
"epoch": 4.692630597014926,
"grad_norm": 0.3438224792480469,
"learning_rate": 0.000544010498687664,
"loss": 3.5888,
"step": 16100
},
{
"epoch": 4.707206156716418,
"grad_norm": 0.3074466288089752,
"learning_rate": 0.0005438355205599299,
"loss": 3.5969,
"step": 16150
},
{
"epoch": 4.721781716417911,
"grad_norm": 0.3279782235622406,
"learning_rate": 0.0005436605424321959,
"loss": 3.5828,
"step": 16200
},
{
"epoch": 4.736357276119403,
"grad_norm": 0.3192928731441498,
"learning_rate": 0.0005434855643044619,
"loss": 3.586,
"step": 16250
},
{
"epoch": 4.750932835820896,
"grad_norm": 0.3126852810382843,
"learning_rate": 0.0005433105861767279,
"loss": 3.5905,
"step": 16300
},
{
"epoch": 4.765508395522388,
"grad_norm": 0.3209400177001953,
"learning_rate": 0.0005431356080489938,
"loss": 3.5821,
"step": 16350
},
{
"epoch": 4.780083955223881,
"grad_norm": 0.31030383706092834,
"learning_rate": 0.0005429606299212598,
"loss": 3.5863,
"step": 16400
},
{
"epoch": 4.794659514925373,
"grad_norm": 0.29958170652389526,
"learning_rate": 0.0005427856517935258,
"loss": 3.5929,
"step": 16450
},
{
"epoch": 4.809235074626866,
"grad_norm": 0.3295001685619354,
"learning_rate": 0.0005426106736657917,
"loss": 3.5957,
"step": 16500
},
{
"epoch": 4.823810634328359,
"grad_norm": 0.32404083013534546,
"learning_rate": 0.0005424356955380577,
"loss": 3.5826,
"step": 16550
},
{
"epoch": 4.838386194029851,
"grad_norm": 0.33932703733444214,
"learning_rate": 0.0005422607174103237,
"loss": 3.5873,
"step": 16600
},
{
"epoch": 4.852961753731344,
"grad_norm": 0.3206491470336914,
"learning_rate": 0.0005420857392825897,
"loss": 3.5869,
"step": 16650
},
{
"epoch": 4.867537313432836,
"grad_norm": 0.3008648157119751,
"learning_rate": 0.0005419107611548556,
"loss": 3.576,
"step": 16700
},
{
"epoch": 4.882112873134329,
"grad_norm": 0.3127002716064453,
"learning_rate": 0.0005417357830271216,
"loss": 3.5996,
"step": 16750
},
{
"epoch": 4.896688432835821,
"grad_norm": 0.315496563911438,
"learning_rate": 0.0005415608048993876,
"loss": 3.5802,
"step": 16800
},
{
"epoch": 4.911263992537314,
"grad_norm": 0.3381385803222656,
"learning_rate": 0.0005413858267716535,
"loss": 3.5946,
"step": 16850
},
{
"epoch": 4.925839552238806,
"grad_norm": 0.314251571893692,
"learning_rate": 0.0005412108486439194,
"loss": 3.5767,
"step": 16900
},
{
"epoch": 4.940415111940299,
"grad_norm": 0.3200497329235077,
"learning_rate": 0.0005410358705161854,
"loss": 3.589,
"step": 16950
},
{
"epoch": 4.9549906716417915,
"grad_norm": 0.29789999127388,
"learning_rate": 0.0005408608923884514,
"loss": 3.5871,
"step": 17000
},
{
"epoch": 4.9549906716417915,
"eval_accuracy": 0.35848654435521315,
"eval_loss": 3.642368793487549,
"eval_runtime": 181.4525,
"eval_samples_per_second": 91.633,
"eval_steps_per_second": 5.732,
"step": 17000
},
{
"epoch": 4.969566231343284,
"grad_norm": 0.29378968477249146,
"learning_rate": 0.0005406859142607174,
"loss": 3.5946,
"step": 17050
},
{
"epoch": 4.9841417910447765,
"grad_norm": 0.29139405488967896,
"learning_rate": 0.0005405109361329833,
"loss": 3.5816,
"step": 17100
},
{
"epoch": 4.9987173507462686,
"grad_norm": 0.3059278130531311,
"learning_rate": 0.0005403359580052493,
"loss": 3.5769,
"step": 17150
},
{
"epoch": 5.013118003731344,
"grad_norm": 0.3037010133266449,
"learning_rate": 0.0005401609798775153,
"loss": 3.4897,
"step": 17200
},
{
"epoch": 5.027693563432836,
"grad_norm": 0.32655635476112366,
"learning_rate": 0.0005399860017497813,
"loss": 3.479,
"step": 17250
},
{
"epoch": 5.042269123134329,
"grad_norm": 0.3251640796661377,
"learning_rate": 0.0005398110236220472,
"loss": 3.4859,
"step": 17300
},
{
"epoch": 5.056844682835821,
"grad_norm": 0.31693679094314575,
"learning_rate": 0.0005396360454943132,
"loss": 3.4711,
"step": 17350
},
{
"epoch": 5.071420242537314,
"grad_norm": 0.32700759172439575,
"learning_rate": 0.0005394610673665792,
"loss": 3.481,
"step": 17400
},
{
"epoch": 5.085995802238806,
"grad_norm": 0.3072807192802429,
"learning_rate": 0.0005392860892388451,
"loss": 3.5028,
"step": 17450
},
{
"epoch": 5.100571361940299,
"grad_norm": 0.3221571743488312,
"learning_rate": 0.0005391111111111111,
"loss": 3.4889,
"step": 17500
},
{
"epoch": 5.115146921641791,
"grad_norm": 0.3130948841571808,
"learning_rate": 0.0005389361329833771,
"loss": 3.5,
"step": 17550
},
{
"epoch": 5.129722481343284,
"grad_norm": 0.31084439158439636,
"learning_rate": 0.000538761154855643,
"loss": 3.5014,
"step": 17600
},
{
"epoch": 5.1442980410447765,
"grad_norm": 0.3255762457847595,
"learning_rate": 0.000538586176727909,
"loss": 3.5074,
"step": 17650
},
{
"epoch": 5.1588736007462686,
"grad_norm": 0.33458569645881653,
"learning_rate": 0.000538411198600175,
"loss": 3.5101,
"step": 17700
},
{
"epoch": 5.1734491604477615,
"grad_norm": 0.327533096075058,
"learning_rate": 0.0005382362204724409,
"loss": 3.5184,
"step": 17750
},
{
"epoch": 5.1880247201492535,
"grad_norm": 0.31040626764297485,
"learning_rate": 0.0005380612423447069,
"loss": 3.5054,
"step": 17800
},
{
"epoch": 5.2026002798507465,
"grad_norm": 0.3363685607910156,
"learning_rate": 0.0005378862642169729,
"loss": 3.4981,
"step": 17850
},
{
"epoch": 5.2171758395522385,
"grad_norm": 0.3423328995704651,
"learning_rate": 0.0005377112860892387,
"loss": 3.52,
"step": 17900
},
{
"epoch": 5.2317513992537314,
"grad_norm": 0.33025994896888733,
"learning_rate": 0.0005375363079615047,
"loss": 3.5243,
"step": 17950
},
{
"epoch": 5.2463269589552235,
"grad_norm": 0.33021610975265503,
"learning_rate": 0.0005373613298337707,
"loss": 3.4993,
"step": 18000
},
{
"epoch": 5.2463269589552235,
"eval_accuracy": 0.3589595683911663,
"eval_loss": 3.6441898345947266,
"eval_runtime": 180.4815,
"eval_samples_per_second": 92.126,
"eval_steps_per_second": 5.762,
"step": 18000
},
{
"epoch": 5.260902518656716,
"grad_norm": 0.3091402053833008,
"learning_rate": 0.0005371863517060366,
"loss": 3.5146,
"step": 18050
},
{
"epoch": 5.275478078358209,
"grad_norm": 0.32835206389427185,
"learning_rate": 0.0005370113735783026,
"loss": 3.5231,
"step": 18100
},
{
"epoch": 5.290053638059701,
"grad_norm": 0.3306668698787689,
"learning_rate": 0.0005368363954505686,
"loss": 3.5071,
"step": 18150
},
{
"epoch": 5.304629197761194,
"grad_norm": 0.34048157930374146,
"learning_rate": 0.0005366614173228346,
"loss": 3.5238,
"step": 18200
},
{
"epoch": 5.319204757462686,
"grad_norm": 0.31027457118034363,
"learning_rate": 0.0005364864391951005,
"loss": 3.5294,
"step": 18250
},
{
"epoch": 5.333780317164179,
"grad_norm": 0.3100549280643463,
"learning_rate": 0.0005363114610673665,
"loss": 3.518,
"step": 18300
},
{
"epoch": 5.348355876865671,
"grad_norm": 0.3276021182537079,
"learning_rate": 0.0005361364829396325,
"loss": 3.5219,
"step": 18350
},
{
"epoch": 5.362931436567164,
"grad_norm": 0.3327234983444214,
"learning_rate": 0.0005359615048118984,
"loss": 3.5043,
"step": 18400
},
{
"epoch": 5.377506996268656,
"grad_norm": 0.3122633993625641,
"learning_rate": 0.0005357865266841644,
"loss": 3.538,
"step": 18450
},
{
"epoch": 5.392082555970149,
"grad_norm": 0.34483712911605835,
"learning_rate": 0.0005356115485564304,
"loss": 3.5303,
"step": 18500
},
{
"epoch": 5.406658115671641,
"grad_norm": 0.34489697217941284,
"learning_rate": 0.0005354365704286964,
"loss": 3.5276,
"step": 18550
},
{
"epoch": 5.421233675373134,
"grad_norm": 0.31317785382270813,
"learning_rate": 0.0005352615923009623,
"loss": 3.5298,
"step": 18600
},
{
"epoch": 5.435809235074627,
"grad_norm": 0.30463266372680664,
"learning_rate": 0.0005350866141732283,
"loss": 3.5289,
"step": 18650
},
{
"epoch": 5.450384794776119,
"grad_norm": 0.3198157250881195,
"learning_rate": 0.0005349116360454943,
"loss": 3.5255,
"step": 18700
},
{
"epoch": 5.464960354477612,
"grad_norm": 0.3108980357646942,
"learning_rate": 0.0005347366579177603,
"loss": 3.5311,
"step": 18750
},
{
"epoch": 5.479535914179104,
"grad_norm": 0.32333433628082275,
"learning_rate": 0.0005345616797900262,
"loss": 3.5315,
"step": 18800
},
{
"epoch": 5.494111473880597,
"grad_norm": 0.32552671432495117,
"learning_rate": 0.0005343867016622922,
"loss": 3.5385,
"step": 18850
},
{
"epoch": 5.508687033582089,
"grad_norm": 0.321522980928421,
"learning_rate": 0.0005342117235345582,
"loss": 3.5297,
"step": 18900
},
{
"epoch": 5.523262593283582,
"grad_norm": 0.30180302262306213,
"learning_rate": 0.0005340367454068242,
"loss": 3.5229,
"step": 18950
},
{
"epoch": 5.537838152985074,
"grad_norm": 0.3097897469997406,
"learning_rate": 0.00053386176727909,
"loss": 3.5276,
"step": 19000
},
{
"epoch": 5.537838152985074,
"eval_accuracy": 0.35964044523814037,
"eval_loss": 3.637159824371338,
"eval_runtime": 179.7064,
"eval_samples_per_second": 92.523,
"eval_steps_per_second": 5.787,
"step": 19000
},
{
"epoch": 5.552413712686567,
"grad_norm": 0.31561505794525146,
"learning_rate": 0.000533686789151356,
"loss": 3.5307,
"step": 19050
},
{
"epoch": 5.56698927238806,
"grad_norm": 0.3239782452583313,
"learning_rate": 0.000533511811023622,
"loss": 3.5387,
"step": 19100
},
{
"epoch": 5.581564832089552,
"grad_norm": 0.3201083242893219,
"learning_rate": 0.000533336832895888,
"loss": 3.5297,
"step": 19150
},
{
"epoch": 5.596140391791045,
"grad_norm": 0.3185979425907135,
"learning_rate": 0.0005331618547681539,
"loss": 3.5493,
"step": 19200
},
{
"epoch": 5.610715951492537,
"grad_norm": 0.29556897282600403,
"learning_rate": 0.0005329868766404199,
"loss": 3.5341,
"step": 19250
},
{
"epoch": 5.62529151119403,
"grad_norm": 0.30993542075157166,
"learning_rate": 0.0005328118985126859,
"loss": 3.5413,
"step": 19300
},
{
"epoch": 5.639867070895522,
"grad_norm": 0.3120933175086975,
"learning_rate": 0.0005326369203849518,
"loss": 3.5441,
"step": 19350
},
{
"epoch": 5.654442630597015,
"grad_norm": 0.33338284492492676,
"learning_rate": 0.0005324619422572178,
"loss": 3.5467,
"step": 19400
},
{
"epoch": 5.669018190298507,
"grad_norm": 0.31648966670036316,
"learning_rate": 0.0005322869641294838,
"loss": 3.5423,
"step": 19450
},
{
"epoch": 5.68359375,
"grad_norm": 0.31063228845596313,
"learning_rate": 0.0005321119860017498,
"loss": 3.525,
"step": 19500
},
{
"epoch": 5.698169309701493,
"grad_norm": 0.31778037548065186,
"learning_rate": 0.0005319370078740157,
"loss": 3.5486,
"step": 19550
},
{
"epoch": 5.712744869402985,
"grad_norm": 0.3344435393810272,
"learning_rate": 0.0005317620297462817,
"loss": 3.534,
"step": 19600
},
{
"epoch": 5.727320429104478,
"grad_norm": 0.3090979754924774,
"learning_rate": 0.0005315870516185477,
"loss": 3.5415,
"step": 19650
},
{
"epoch": 5.74189598880597,
"grad_norm": 0.3059396743774414,
"learning_rate": 0.0005314120734908137,
"loss": 3.5331,
"step": 19700
},
{
"epoch": 5.756471548507463,
"grad_norm": 0.32924172282218933,
"learning_rate": 0.0005312370953630796,
"loss": 3.5382,
"step": 19750
},
{
"epoch": 5.771047108208955,
"grad_norm": 0.30785858631134033,
"learning_rate": 0.0005310621172353456,
"loss": 3.5335,
"step": 19800
},
{
"epoch": 5.785622667910448,
"grad_norm": 0.29037636518478394,
"learning_rate": 0.0005308871391076116,
"loss": 3.5379,
"step": 19850
},
{
"epoch": 5.80019822761194,
"grad_norm": 0.30217617750167847,
"learning_rate": 0.0005307121609798775,
"loss": 3.5226,
"step": 19900
},
{
"epoch": 5.814773787313433,
"grad_norm": 0.3155075013637543,
"learning_rate": 0.0005305371828521435,
"loss": 3.5429,
"step": 19950
},
{
"epoch": 5.829349347014926,
"grad_norm": 0.3263370096683502,
"learning_rate": 0.0005303622047244095,
"loss": 3.5397,
"step": 20000
},
{
"epoch": 5.829349347014926,
"eval_accuracy": 0.3607415001911987,
"eval_loss": 3.6235318183898926,
"eval_runtime": 180.9952,
"eval_samples_per_second": 91.864,
"eval_steps_per_second": 5.746,
"step": 20000
},
{
"epoch": 5.843924906716418,
"grad_norm": 0.3229924738407135,
"learning_rate": 0.0005301872265966753,
"loss": 3.5435,
"step": 20050
},
{
"epoch": 5.858500466417911,
"grad_norm": 0.31298452615737915,
"learning_rate": 0.0005300122484689413,
"loss": 3.5387,
"step": 20100
},
{
"epoch": 5.873076026119403,
"grad_norm": 0.31652334332466125,
"learning_rate": 0.0005298372703412073,
"loss": 3.5417,
"step": 20150
},
{
"epoch": 5.887651585820896,
"grad_norm": 0.32354360818862915,
"learning_rate": 0.0005296622922134732,
"loss": 3.5316,
"step": 20200
},
{
"epoch": 5.902227145522388,
"grad_norm": 0.2969135344028473,
"learning_rate": 0.0005294873140857392,
"loss": 3.5412,
"step": 20250
},
{
"epoch": 5.916802705223881,
"grad_norm": 0.3211456537246704,
"learning_rate": 0.0005293123359580052,
"loss": 3.5363,
"step": 20300
},
{
"epoch": 5.931378264925373,
"grad_norm": 0.32332107424736023,
"learning_rate": 0.0005291373578302711,
"loss": 3.5407,
"step": 20350
},
{
"epoch": 5.945953824626866,
"grad_norm": 0.32086077332496643,
"learning_rate": 0.0005289623797025371,
"loss": 3.5501,
"step": 20400
},
{
"epoch": 5.960529384328359,
"grad_norm": 0.3089979290962219,
"learning_rate": 0.0005287874015748031,
"loss": 3.5404,
"step": 20450
},
{
"epoch": 5.975104944029851,
"grad_norm": 0.30579036474227905,
"learning_rate": 0.000528612423447069,
"loss": 3.5325,
"step": 20500
},
{
"epoch": 5.989680503731344,
"grad_norm": 0.32269933819770813,
"learning_rate": 0.000528437445319335,
"loss": 3.5472,
"step": 20550
},
{
"epoch": 6.004081156716418,
"grad_norm": 0.30791178345680237,
"learning_rate": 0.000528262467191601,
"loss": 3.5089,
"step": 20600
},
{
"epoch": 6.018656716417911,
"grad_norm": 0.3126281797885895,
"learning_rate": 0.000528087489063867,
"loss": 3.4296,
"step": 20650
},
{
"epoch": 6.033232276119403,
"grad_norm": 0.3279401957988739,
"learning_rate": 0.0005279125109361329,
"loss": 3.436,
"step": 20700
},
{
"epoch": 6.047807835820896,
"grad_norm": 0.30899009108543396,
"learning_rate": 0.0005277375328083989,
"loss": 3.4333,
"step": 20750
},
{
"epoch": 6.062383395522388,
"grad_norm": 0.32034310698509216,
"learning_rate": 0.0005275625546806649,
"loss": 3.439,
"step": 20800
},
{
"epoch": 6.076958955223881,
"grad_norm": 0.3171665370464325,
"learning_rate": 0.0005273875765529309,
"loss": 3.4449,
"step": 20850
},
{
"epoch": 6.091534514925373,
"grad_norm": 0.3259046971797943,
"learning_rate": 0.0005272125984251968,
"loss": 3.4499,
"step": 20900
},
{
"epoch": 6.106110074626866,
"grad_norm": 0.3260829448699951,
"learning_rate": 0.0005270376202974628,
"loss": 3.465,
"step": 20950
},
{
"epoch": 6.120685634328359,
"grad_norm": 0.30281195044517517,
"learning_rate": 0.0005268626421697288,
"loss": 3.4575,
"step": 21000
},
{
"epoch": 6.120685634328359,
"eval_accuracy": 0.3610256206248366,
"eval_loss": 3.628716468811035,
"eval_runtime": 180.6094,
"eval_samples_per_second": 92.061,
"eval_steps_per_second": 5.758,
"step": 21000
},
{
"epoch": 6.135261194029851,
"grad_norm": 0.32482925057411194,
"learning_rate": 0.0005266876640419946,
"loss": 3.46,
"step": 21050
},
{
"epoch": 6.149836753731344,
"grad_norm": 0.33268213272094727,
"learning_rate": 0.0005265126859142606,
"loss": 3.4493,
"step": 21100
},
{
"epoch": 6.164412313432836,
"grad_norm": 0.3173036575317383,
"learning_rate": 0.0005263377077865266,
"loss": 3.4723,
"step": 21150
},
{
"epoch": 6.178987873134329,
"grad_norm": 0.3229723870754242,
"learning_rate": 0.0005261627296587926,
"loss": 3.471,
"step": 21200
},
{
"epoch": 6.193563432835821,
"grad_norm": 0.3389780521392822,
"learning_rate": 0.0005259877515310585,
"loss": 3.4613,
"step": 21250
},
{
"epoch": 6.208138992537314,
"grad_norm": 0.31343021988868713,
"learning_rate": 0.0005258127734033245,
"loss": 3.4633,
"step": 21300
},
{
"epoch": 6.222714552238806,
"grad_norm": 0.31590187549591064,
"learning_rate": 0.0005256377952755905,
"loss": 3.4774,
"step": 21350
},
{
"epoch": 6.237290111940299,
"grad_norm": 0.3065197765827179,
"learning_rate": 0.0005254628171478565,
"loss": 3.4778,
"step": 21400
},
{
"epoch": 6.251865671641791,
"grad_norm": 0.3195898234844208,
"learning_rate": 0.0005252878390201224,
"loss": 3.4793,
"step": 21450
},
{
"epoch": 6.266441231343284,
"grad_norm": 0.30886998772621155,
"learning_rate": 0.0005251128608923884,
"loss": 3.4665,
"step": 21500
},
{
"epoch": 6.2810167910447765,
"grad_norm": 0.3227124512195587,
"learning_rate": 0.0005249378827646544,
"loss": 3.472,
"step": 21550
},
{
"epoch": 6.2955923507462686,
"grad_norm": 0.31510311365127563,
"learning_rate": 0.0005247629046369204,
"loss": 3.4859,
"step": 21600
},
{
"epoch": 6.3101679104477615,
"grad_norm": 0.32103273272514343,
"learning_rate": 0.0005245879265091863,
"loss": 3.4858,
"step": 21650
},
{
"epoch": 6.3247434701492535,
"grad_norm": 0.3214368522167206,
"learning_rate": 0.0005244129483814523,
"loss": 3.4829,
"step": 21700
},
{
"epoch": 6.3393190298507465,
"grad_norm": 0.3349897563457489,
"learning_rate": 0.0005242379702537183,
"loss": 3.4755,
"step": 21750
},
{
"epoch": 6.3538945895522385,
"grad_norm": 0.35362866520881653,
"learning_rate": 0.0005240629921259843,
"loss": 3.4807,
"step": 21800
},
{
"epoch": 6.3684701492537314,
"grad_norm": 0.34084802865982056,
"learning_rate": 0.0005238880139982502,
"loss": 3.479,
"step": 21850
},
{
"epoch": 6.3830457089552235,
"grad_norm": 0.31975167989730835,
"learning_rate": 0.0005237130358705162,
"loss": 3.4599,
"step": 21900
},
{
"epoch": 6.397621268656716,
"grad_norm": 0.3405190408229828,
"learning_rate": 0.0005235380577427822,
"loss": 3.4888,
"step": 21950
},
{
"epoch": 6.412196828358209,
"grad_norm": 0.3264370858669281,
"learning_rate": 0.0005233630796150481,
"loss": 3.4711,
"step": 22000
},
{
"epoch": 6.412196828358209,
"eval_accuracy": 0.3617952409709669,
"eval_loss": 3.6189749240875244,
"eval_runtime": 180.5749,
"eval_samples_per_second": 92.078,
"eval_steps_per_second": 5.759,
"step": 22000
},
{
"epoch": 6.426772388059701,
"grad_norm": 0.31837818026542664,
"learning_rate": 0.0005231881014873141,
"loss": 3.487,
"step": 22050
},
{
"epoch": 6.441347947761194,
"grad_norm": 0.3304750323295593,
"learning_rate": 0.00052301312335958,
"loss": 3.4812,
"step": 22100
},
{
"epoch": 6.455923507462686,
"grad_norm": 0.3154893219470978,
"learning_rate": 0.0005228381452318459,
"loss": 3.4939,
"step": 22150
},
{
"epoch": 6.470499067164179,
"grad_norm": 0.30578863620758057,
"learning_rate": 0.0005226631671041119,
"loss": 3.4839,
"step": 22200
},
{
"epoch": 6.485074626865671,
"grad_norm": 0.3136824667453766,
"learning_rate": 0.0005224881889763779,
"loss": 3.482,
"step": 22250
},
{
"epoch": 6.499650186567164,
"grad_norm": 0.3263259828090668,
"learning_rate": 0.0005223132108486439,
"loss": 3.4907,
"step": 22300
},
{
"epoch": 6.514225746268656,
"grad_norm": 0.3149360716342926,
"learning_rate": 0.0005221382327209098,
"loss": 3.4945,
"step": 22350
},
{
"epoch": 6.528801305970149,
"grad_norm": 0.3265800476074219,
"learning_rate": 0.0005219632545931758,
"loss": 3.4851,
"step": 22400
},
{
"epoch": 6.543376865671641,
"grad_norm": 0.3209088146686554,
"learning_rate": 0.0005217882764654418,
"loss": 3.5038,
"step": 22450
},
{
"epoch": 6.557952425373134,
"grad_norm": 0.318444162607193,
"learning_rate": 0.0005216132983377077,
"loss": 3.4979,
"step": 22500
},
{
"epoch": 6.572527985074627,
"grad_norm": 0.3191940188407898,
"learning_rate": 0.0005214383202099737,
"loss": 3.4907,
"step": 22550
},
{
"epoch": 6.587103544776119,
"grad_norm": 0.32639753818511963,
"learning_rate": 0.0005212633420822397,
"loss": 3.4945,
"step": 22600
},
{
"epoch": 6.601679104477612,
"grad_norm": 0.3260699510574341,
"learning_rate": 0.0005210883639545056,
"loss": 3.4953,
"step": 22650
},
{
"epoch": 6.616254664179104,
"grad_norm": 0.32714834809303284,
"learning_rate": 0.0005209133858267716,
"loss": 3.4973,
"step": 22700
},
{
"epoch": 6.630830223880597,
"grad_norm": 0.3026060163974762,
"learning_rate": 0.0005207384076990376,
"loss": 3.4976,
"step": 22750
},
{
"epoch": 6.645405783582089,
"grad_norm": 0.32032445073127747,
"learning_rate": 0.0005205634295713035,
"loss": 3.5048,
"step": 22800
},
{
"epoch": 6.659981343283582,
"grad_norm": 0.32959914207458496,
"learning_rate": 0.0005203884514435695,
"loss": 3.4923,
"step": 22850
},
{
"epoch": 6.674556902985074,
"grad_norm": 0.33499661087989807,
"learning_rate": 0.0005202134733158355,
"loss": 3.5019,
"step": 22900
},
{
"epoch": 6.689132462686567,
"grad_norm": 0.3219582140445709,
"learning_rate": 0.0005200384951881014,
"loss": 3.4957,
"step": 22950
},
{
"epoch": 6.70370802238806,
"grad_norm": 0.31592950224876404,
"learning_rate": 0.0005198635170603674,
"loss": 3.4998,
"step": 23000
},
{
"epoch": 6.70370802238806,
"eval_accuracy": 0.36258804761594826,
"eval_loss": 3.609095573425293,
"eval_runtime": 179.2047,
"eval_samples_per_second": 92.782,
"eval_steps_per_second": 5.803,
"step": 23000
},
{
"epoch": 6.718283582089552,
"grad_norm": 0.3130515217781067,
"learning_rate": 0.0005196885389326334,
"loss": 3.5013,
"step": 23050
},
{
"epoch": 6.732859141791045,
"grad_norm": 0.3212002217769623,
"learning_rate": 0.0005195135608048994,
"loss": 3.4966,
"step": 23100
},
{
"epoch": 6.747434701492537,
"grad_norm": 0.34351035952568054,
"learning_rate": 0.0005193385826771652,
"loss": 3.4956,
"step": 23150
},
{
"epoch": 6.76201026119403,
"grad_norm": 0.3154422640800476,
"learning_rate": 0.0005191636045494312,
"loss": 3.493,
"step": 23200
},
{
"epoch": 6.776585820895522,
"grad_norm": 0.3303772211074829,
"learning_rate": 0.0005189886264216972,
"loss": 3.4903,
"step": 23250
},
{
"epoch": 6.791161380597015,
"grad_norm": 0.3244941830635071,
"learning_rate": 0.0005188136482939632,
"loss": 3.4991,
"step": 23300
},
{
"epoch": 6.805736940298507,
"grad_norm": 0.3155946135520935,
"learning_rate": 0.0005186386701662291,
"loss": 3.4911,
"step": 23350
},
{
"epoch": 6.8203125,
"grad_norm": 0.31832653284072876,
"learning_rate": 0.0005184636920384951,
"loss": 3.4976,
"step": 23400
},
{
"epoch": 6.834888059701493,
"grad_norm": 0.3354407846927643,
"learning_rate": 0.0005182887139107611,
"loss": 3.4912,
"step": 23450
},
{
"epoch": 6.849463619402985,
"grad_norm": 0.3141067326068878,
"learning_rate": 0.0005181137357830271,
"loss": 3.4983,
"step": 23500
},
{
"epoch": 6.864039179104478,
"grad_norm": 0.31598249077796936,
"learning_rate": 0.000517938757655293,
"loss": 3.501,
"step": 23550
},
{
"epoch": 6.87861473880597,
"grad_norm": 0.3097233772277832,
"learning_rate": 0.000517763779527559,
"loss": 3.5097,
"step": 23600
},
{
"epoch": 6.893190298507463,
"grad_norm": 0.3032672107219696,
"learning_rate": 0.000517588801399825,
"loss": 3.4993,
"step": 23650
},
{
"epoch": 6.907765858208955,
"grad_norm": 0.3242533802986145,
"learning_rate": 0.0005174138232720909,
"loss": 3.5084,
"step": 23700
},
{
"epoch": 6.922341417910448,
"grad_norm": 0.30605459213256836,
"learning_rate": 0.0005172388451443569,
"loss": 3.5068,
"step": 23750
},
{
"epoch": 6.93691697761194,
"grad_norm": 0.31153604388237,
"learning_rate": 0.0005170638670166229,
"loss": 3.5068,
"step": 23800
},
{
"epoch": 6.951492537313433,
"grad_norm": 0.3352176547050476,
"learning_rate": 0.0005168888888888889,
"loss": 3.5053,
"step": 23850
},
{
"epoch": 6.966068097014926,
"grad_norm": 0.347098171710968,
"learning_rate": 0.0005167139107611548,
"loss": 3.4885,
"step": 23900
},
{
"epoch": 6.980643656716418,
"grad_norm": 0.31401047110557556,
"learning_rate": 0.0005165389326334208,
"loss": 3.5066,
"step": 23950
},
{
"epoch": 6.995219216417911,
"grad_norm": 0.327211856842041,
"learning_rate": 0.0005163639545056868,
"loss": 3.5047,
"step": 24000
},
{
"epoch": 6.995219216417911,
"eval_accuracy": 0.36335613790174826,
"eval_loss": 3.6004271507263184,
"eval_runtime": 185.5366,
"eval_samples_per_second": 89.616,
"eval_steps_per_second": 5.605,
"step": 24000
},
{
"epoch": 7.009619869402985,
"grad_norm": 0.30569273233413696,
"learning_rate": 0.0005161889763779528,
"loss": 3.428,
"step": 24050
},
{
"epoch": 7.024195429104478,
"grad_norm": 0.32120874524116516,
"learning_rate": 0.0005160139982502187,
"loss": 3.3881,
"step": 24100
},
{
"epoch": 7.03877098880597,
"grad_norm": 0.33526530861854553,
"learning_rate": 0.0005158390201224847,
"loss": 3.4098,
"step": 24150
},
{
"epoch": 7.053346548507463,
"grad_norm": 0.3269498348236084,
"learning_rate": 0.0005156640419947507,
"loss": 3.3953,
"step": 24200
},
{
"epoch": 7.067922108208955,
"grad_norm": 0.3256247639656067,
"learning_rate": 0.0005154890638670167,
"loss": 3.3965,
"step": 24250
},
{
"epoch": 7.082497667910448,
"grad_norm": 0.3173428177833557,
"learning_rate": 0.0005153140857392825,
"loss": 3.4165,
"step": 24300
},
{
"epoch": 7.09707322761194,
"grad_norm": 0.3311133086681366,
"learning_rate": 0.0005151391076115485,
"loss": 3.4155,
"step": 24350
},
{
"epoch": 7.111648787313433,
"grad_norm": 0.3398746848106384,
"learning_rate": 0.0005149641294838145,
"loss": 3.4204,
"step": 24400
},
{
"epoch": 7.126224347014926,
"grad_norm": 0.32222801446914673,
"learning_rate": 0.0005147891513560804,
"loss": 3.4071,
"step": 24450
},
{
"epoch": 7.140799906716418,
"grad_norm": 0.3478772044181824,
"learning_rate": 0.0005146141732283464,
"loss": 3.4268,
"step": 24500
},
{
"epoch": 7.155375466417911,
"grad_norm": 0.33055752515792847,
"learning_rate": 0.0005144391951006124,
"loss": 3.4221,
"step": 24550
},
{
"epoch": 7.169951026119403,
"grad_norm": 0.32589223980903625,
"learning_rate": 0.0005142642169728783,
"loss": 3.421,
"step": 24600
},
{
"epoch": 7.184526585820896,
"grad_norm": 0.30772513151168823,
"learning_rate": 0.0005140892388451443,
"loss": 3.4371,
"step": 24650
},
{
"epoch": 7.199102145522388,
"grad_norm": 0.32640331983566284,
"learning_rate": 0.0005139142607174103,
"loss": 3.4217,
"step": 24700
},
{
"epoch": 7.213677705223881,
"grad_norm": 0.3319501578807831,
"learning_rate": 0.0005137392825896762,
"loss": 3.4361,
"step": 24750
},
{
"epoch": 7.228253264925373,
"grad_norm": 0.323111891746521,
"learning_rate": 0.0005135643044619422,
"loss": 3.4226,
"step": 24800
},
{
"epoch": 7.242828824626866,
"grad_norm": 0.3399120569229126,
"learning_rate": 0.0005133893263342082,
"loss": 3.4465,
"step": 24850
},
{
"epoch": 7.257404384328359,
"grad_norm": 0.3136339485645294,
"learning_rate": 0.0005132143482064742,
"loss": 3.4355,
"step": 24900
},
{
"epoch": 7.271979944029851,
"grad_norm": 0.3228020668029785,
"learning_rate": 0.0005130393700787401,
"loss": 3.4474,
"step": 24950
},
{
"epoch": 7.286555503731344,
"grad_norm": 0.3219752311706543,
"learning_rate": 0.0005128643919510061,
"loss": 3.4571,
"step": 25000
},
{
"epoch": 7.286555503731344,
"eval_accuracy": 0.3633374240869394,
"eval_loss": 3.608869791030884,
"eval_runtime": 179.1593,
"eval_samples_per_second": 92.806,
"eval_steps_per_second": 5.805,
"step": 25000
},
{
"epoch": 7.301131063432836,
"grad_norm": 0.33547621965408325,
"learning_rate": 0.0005126894138232721,
"loss": 3.4439,
"step": 25050
},
{
"epoch": 7.315706623134329,
"grad_norm": 0.3228926956653595,
"learning_rate": 0.000512514435695538,
"loss": 3.425,
"step": 25100
},
{
"epoch": 7.330282182835821,
"grad_norm": 0.3242122530937195,
"learning_rate": 0.000512339457567804,
"loss": 3.4526,
"step": 25150
},
{
"epoch": 7.344857742537314,
"grad_norm": 0.32178714871406555,
"learning_rate": 0.00051216447944007,
"loss": 3.4547,
"step": 25200
},
{
"epoch": 7.359433302238806,
"grad_norm": 0.3509860932826996,
"learning_rate": 0.0005119895013123358,
"loss": 3.4411,
"step": 25250
},
{
"epoch": 7.374008861940299,
"grad_norm": 0.3500162661075592,
"learning_rate": 0.0005118145231846018,
"loss": 3.4378,
"step": 25300
},
{
"epoch": 7.388584421641791,
"grad_norm": 0.3226238191127777,
"learning_rate": 0.0005116395450568678,
"loss": 3.4573,
"step": 25350
},
{
"epoch": 7.403159981343284,
"grad_norm": 0.32828742265701294,
"learning_rate": 0.0005114645669291338,
"loss": 3.4468,
"step": 25400
},
{
"epoch": 7.4177355410447765,
"grad_norm": 0.3429689109325409,
"learning_rate": 0.0005112895888013997,
"loss": 3.4415,
"step": 25450
},
{
"epoch": 7.4323111007462686,
"grad_norm": 0.323826402425766,
"learning_rate": 0.0005111146106736657,
"loss": 3.4305,
"step": 25500
},
{
"epoch": 7.4468866604477615,
"grad_norm": 0.32293814420700073,
"learning_rate": 0.0005109396325459317,
"loss": 3.4567,
"step": 25550
},
{
"epoch": 7.4614622201492535,
"grad_norm": 0.32880374789237976,
"learning_rate": 0.0005107646544181976,
"loss": 3.4482,
"step": 25600
},
{
"epoch": 7.4760377798507465,
"grad_norm": 0.3458571135997772,
"learning_rate": 0.0005105896762904636,
"loss": 3.4586,
"step": 25650
},
{
"epoch": 7.4906133395522385,
"grad_norm": 0.3313862681388855,
"learning_rate": 0.0005104146981627296,
"loss": 3.4506,
"step": 25700
},
{
"epoch": 7.5051888992537314,
"grad_norm": 0.34324750304222107,
"learning_rate": 0.0005102397200349956,
"loss": 3.4432,
"step": 25750
},
{
"epoch": 7.5197644589552235,
"grad_norm": 0.3350330889225006,
"learning_rate": 0.0005100647419072615,
"loss": 3.4469,
"step": 25800
},
{
"epoch": 7.534340018656716,
"grad_norm": 0.340985506772995,
"learning_rate": 0.0005098897637795275,
"loss": 3.4555,
"step": 25850
},
{
"epoch": 7.5489155783582085,
"grad_norm": 0.3093026876449585,
"learning_rate": 0.0005097147856517935,
"loss": 3.4631,
"step": 25900
},
{
"epoch": 7.563491138059701,
"grad_norm": 0.33767765760421753,
"learning_rate": 0.0005095398075240595,
"loss": 3.4663,
"step": 25950
},
{
"epoch": 7.578066697761194,
"grad_norm": 0.3200538754463196,
"learning_rate": 0.0005093648293963254,
"loss": 3.4564,
"step": 26000
},
{
"epoch": 7.578066697761194,
"eval_accuracy": 0.3640933915870457,
"eval_loss": 3.600985288619995,
"eval_runtime": 179.1567,
"eval_samples_per_second": 92.807,
"eval_steps_per_second": 5.805,
"step": 26000
},
{
"epoch": 7.592642257462686,
"grad_norm": 0.3065427541732788,
"learning_rate": 0.0005091898512685914,
"loss": 3.4663,
"step": 26050
},
{
"epoch": 7.607217817164179,
"grad_norm": 0.34799593687057495,
"learning_rate": 0.0005090148731408574,
"loss": 3.4774,
"step": 26100
},
{
"epoch": 7.621793376865671,
"grad_norm": 0.31384772062301636,
"learning_rate": 0.0005088398950131234,
"loss": 3.4512,
"step": 26150
},
{
"epoch": 7.636368936567164,
"grad_norm": 0.3293142318725586,
"learning_rate": 0.0005086649168853893,
"loss": 3.4681,
"step": 26200
},
{
"epoch": 7.650944496268656,
"grad_norm": 0.3129211366176605,
"learning_rate": 0.0005084899387576553,
"loss": 3.4744,
"step": 26250
},
{
"epoch": 7.665520055970149,
"grad_norm": 0.33139219880104065,
"learning_rate": 0.0005083149606299213,
"loss": 3.4574,
"step": 26300
},
{
"epoch": 7.680095615671641,
"grad_norm": 0.3320215344429016,
"learning_rate": 0.0005081399825021873,
"loss": 3.4782,
"step": 26350
},
{
"epoch": 7.694671175373134,
"grad_norm": 0.32884448766708374,
"learning_rate": 0.0005079650043744531,
"loss": 3.46,
"step": 26400
},
{
"epoch": 7.709246735074627,
"grad_norm": 0.316210001707077,
"learning_rate": 0.0005077900262467191,
"loss": 3.4649,
"step": 26450
},
{
"epoch": 7.723822294776119,
"grad_norm": 0.3186090290546417,
"learning_rate": 0.0005076150481189851,
"loss": 3.4714,
"step": 26500
},
{
"epoch": 7.738397854477612,
"grad_norm": 0.29783856868743896,
"learning_rate": 0.000507440069991251,
"loss": 3.4626,
"step": 26550
},
{
"epoch": 7.752973414179104,
"grad_norm": 0.3331296145915985,
"learning_rate": 0.000507265091863517,
"loss": 3.4725,
"step": 26600
},
{
"epoch": 7.767548973880597,
"grad_norm": 0.3241812288761139,
"learning_rate": 0.000507090113735783,
"loss": 3.4666,
"step": 26650
},
{
"epoch": 7.782124533582089,
"grad_norm": 0.32092708349227905,
"learning_rate": 0.000506915135608049,
"loss": 3.4683,
"step": 26700
},
{
"epoch": 7.796700093283582,
"grad_norm": 0.3200468420982361,
"learning_rate": 0.0005067401574803149,
"loss": 3.4765,
"step": 26750
},
{
"epoch": 7.811275652985074,
"grad_norm": 0.30555614829063416,
"learning_rate": 0.0005065651793525809,
"loss": 3.4658,
"step": 26800
},
{
"epoch": 7.825851212686567,
"grad_norm": 0.3325377404689789,
"learning_rate": 0.0005063902012248469,
"loss": 3.4648,
"step": 26850
},
{
"epoch": 7.84042677238806,
"grad_norm": 0.32855042815208435,
"learning_rate": 0.0005062152230971128,
"loss": 3.4646,
"step": 26900
},
{
"epoch": 7.855002332089552,
"grad_norm": 0.3337857127189636,
"learning_rate": 0.0005060402449693788,
"loss": 3.4775,
"step": 26950
},
{
"epoch": 7.869577891791045,
"grad_norm": 0.33446067571640015,
"learning_rate": 0.0005058652668416448,
"loss": 3.4619,
"step": 27000
},
{
"epoch": 7.869577891791045,
"eval_accuracy": 0.3644832038804213,
"eval_loss": 3.592026472091675,
"eval_runtime": 179.258,
"eval_samples_per_second": 92.755,
"eval_steps_per_second": 5.802,
"step": 27000
},
{
"epoch": 7.884153451492537,
"grad_norm": 0.3174329400062561,
"learning_rate": 0.0005056902887139107,
"loss": 3.4611,
"step": 27050
},
{
"epoch": 7.89872901119403,
"grad_norm": 0.3485661745071411,
"learning_rate": 0.0005055153105861767,
"loss": 3.4636,
"step": 27100
},
{
"epoch": 7.913304570895522,
"grad_norm": 0.3340265154838562,
"learning_rate": 0.0005053403324584427,
"loss": 3.4708,
"step": 27150
},
{
"epoch": 7.927880130597015,
"grad_norm": 0.3334951102733612,
"learning_rate": 0.0005051653543307086,
"loss": 3.4747,
"step": 27200
},
{
"epoch": 7.942455690298507,
"grad_norm": 0.3194654583930969,
"learning_rate": 0.0005049903762029746,
"loss": 3.4683,
"step": 27250
},
{
"epoch": 7.95703125,
"grad_norm": 0.34018611907958984,
"learning_rate": 0.0005048153980752406,
"loss": 3.4781,
"step": 27300
},
{
"epoch": 7.971606809701493,
"grad_norm": 0.32004550099372864,
"learning_rate": 0.0005046404199475064,
"loss": 3.4654,
"step": 27350
},
{
"epoch": 7.986182369402985,
"grad_norm": 0.30326974391937256,
"learning_rate": 0.0005044654418197724,
"loss": 3.4611,
"step": 27400
},
{
"epoch": 8.00058302238806,
"grad_norm": 0.3574485182762146,
"learning_rate": 0.0005042904636920384,
"loss": 3.4766,
"step": 27450
},
{
"epoch": 8.015158582089553,
"grad_norm": 0.3087901771068573,
"learning_rate": 0.0005041154855643044,
"loss": 3.3484,
"step": 27500
},
{
"epoch": 8.029734141791044,
"grad_norm": 0.3568093180656433,
"learning_rate": 0.0005039405074365703,
"loss": 3.3668,
"step": 27550
},
{
"epoch": 8.044309701492537,
"grad_norm": 0.342636376619339,
"learning_rate": 0.0005037655293088363,
"loss": 3.3701,
"step": 27600
},
{
"epoch": 8.05888526119403,
"grad_norm": 0.3330060541629791,
"learning_rate": 0.0005035905511811023,
"loss": 3.3651,
"step": 27650
},
{
"epoch": 8.073460820895523,
"grad_norm": 0.3401563763618469,
"learning_rate": 0.0005034155730533682,
"loss": 3.3622,
"step": 27700
},
{
"epoch": 8.088036380597014,
"grad_norm": 0.32643309235572815,
"learning_rate": 0.0005032405949256342,
"loss": 3.3783,
"step": 27750
},
{
"epoch": 8.102611940298507,
"grad_norm": 0.35396072268486023,
"learning_rate": 0.0005030656167979002,
"loss": 3.3945,
"step": 27800
},
{
"epoch": 8.1171875,
"grad_norm": 0.32265105843544006,
"learning_rate": 0.0005028906386701662,
"loss": 3.3832,
"step": 27850
},
{
"epoch": 8.131763059701493,
"grad_norm": 0.32299792766571045,
"learning_rate": 0.0005027156605424321,
"loss": 3.3836,
"step": 27900
},
{
"epoch": 8.146338619402986,
"grad_norm": 0.3399452567100525,
"learning_rate": 0.0005025406824146981,
"loss": 3.3942,
"step": 27950
},
{
"epoch": 8.160914179104477,
"grad_norm": 0.3137241303920746,
"learning_rate": 0.0005023657042869641,
"loss": 3.3915,
"step": 28000
},
{
"epoch": 8.160914179104477,
"eval_accuracy": 0.3645743013185471,
"eval_loss": 3.599012613296509,
"eval_runtime": 179.2858,
"eval_samples_per_second": 92.74,
"eval_steps_per_second": 5.801,
"step": 28000
},
{
"epoch": 8.17548973880597,
"grad_norm": 0.3267982602119446,
"learning_rate": 0.0005021907261592301,
"loss": 3.3934,
"step": 28050
},
{
"epoch": 8.190065298507463,
"grad_norm": 0.3097693622112274,
"learning_rate": 0.000502015748031496,
"loss": 3.3953,
"step": 28100
},
{
"epoch": 8.204640858208956,
"grad_norm": 0.32897573709487915,
"learning_rate": 0.000501840769903762,
"loss": 3.4011,
"step": 28150
},
{
"epoch": 8.219216417910447,
"grad_norm": 0.3252919912338257,
"learning_rate": 0.000501665791776028,
"loss": 3.3996,
"step": 28200
},
{
"epoch": 8.23379197761194,
"grad_norm": 0.31856435537338257,
"learning_rate": 0.0005014908136482939,
"loss": 3.3998,
"step": 28250
},
{
"epoch": 8.248367537313433,
"grad_norm": 0.3390394449234009,
"learning_rate": 0.0005013158355205599,
"loss": 3.3888,
"step": 28300
},
{
"epoch": 8.262943097014926,
"grad_norm": 0.340518593788147,
"learning_rate": 0.0005011408573928259,
"loss": 3.4207,
"step": 28350
},
{
"epoch": 8.277518656716419,
"grad_norm": 0.323466956615448,
"learning_rate": 0.0005009658792650919,
"loss": 3.4109,
"step": 28400
},
{
"epoch": 8.29209421641791,
"grad_norm": 0.3533046245574951,
"learning_rate": 0.0005007909011373577,
"loss": 3.4048,
"step": 28450
},
{
"epoch": 8.306669776119403,
"grad_norm": 0.33783796429634094,
"learning_rate": 0.0005006159230096237,
"loss": 3.4087,
"step": 28500
},
{
"epoch": 8.321245335820896,
"grad_norm": 0.36026251316070557,
"learning_rate": 0.0005004409448818897,
"loss": 3.4204,
"step": 28550
},
{
"epoch": 8.335820895522389,
"grad_norm": 0.3321758508682251,
"learning_rate": 0.0005002659667541557,
"loss": 3.4211,
"step": 28600
},
{
"epoch": 8.35039645522388,
"grad_norm": 0.33608278632164,
"learning_rate": 0.0005000909886264216,
"loss": 3.4177,
"step": 28650
},
{
"epoch": 8.364972014925373,
"grad_norm": 0.3608180582523346,
"learning_rate": 0.0004999160104986876,
"loss": 3.4184,
"step": 28700
},
{
"epoch": 8.379547574626866,
"grad_norm": 0.3502536714076996,
"learning_rate": 0.0004997410323709536,
"loss": 3.4156,
"step": 28750
},
{
"epoch": 8.394123134328359,
"grad_norm": 0.3335579037666321,
"learning_rate": 0.0004995660542432196,
"loss": 3.4176,
"step": 28800
},
{
"epoch": 8.408698694029852,
"grad_norm": 0.31768515706062317,
"learning_rate": 0.0004993910761154855,
"loss": 3.4304,
"step": 28850
},
{
"epoch": 8.423274253731343,
"grad_norm": 0.3465861976146698,
"learning_rate": 0.0004992160979877515,
"loss": 3.4346,
"step": 28900
},
{
"epoch": 8.437849813432836,
"grad_norm": 0.33403196930885315,
"learning_rate": 0.0004990411198600175,
"loss": 3.4151,
"step": 28950
},
{
"epoch": 8.452425373134329,
"grad_norm": 0.3128868341445923,
"learning_rate": 0.0004988661417322835,
"loss": 3.4238,
"step": 29000
},
{
"epoch": 8.452425373134329,
"eval_accuracy": 0.3648340584838491,
"eval_loss": 3.5954160690307617,
"eval_runtime": 179.3003,
"eval_samples_per_second": 92.733,
"eval_steps_per_second": 5.8,
"step": 29000
},
{
"epoch": 8.467000932835822,
"grad_norm": 0.331660658121109,
"learning_rate": 0.0004986911636045494,
"loss": 3.425,
"step": 29050
},
{
"epoch": 8.481576492537313,
"grad_norm": 0.3326253294944763,
"learning_rate": 0.0004985161854768154,
"loss": 3.4186,
"step": 29100
},
{
"epoch": 8.496152052238806,
"grad_norm": 0.32247307896614075,
"learning_rate": 0.0004983412073490814,
"loss": 3.4258,
"step": 29150
},
{
"epoch": 8.510727611940299,
"grad_norm": 0.3508327901363373,
"learning_rate": 0.0004981662292213473,
"loss": 3.422,
"step": 29200
},
{
"epoch": 8.525303171641792,
"grad_norm": 0.33560535311698914,
"learning_rate": 0.0004979912510936133,
"loss": 3.422,
"step": 29250
},
{
"epoch": 8.539878731343283,
"grad_norm": 0.3057290017604828,
"learning_rate": 0.0004978162729658793,
"loss": 3.4313,
"step": 29300
},
{
"epoch": 8.554454291044776,
"grad_norm": 0.327250212430954,
"learning_rate": 0.0004976412948381452,
"loss": 3.4276,
"step": 29350
},
{
"epoch": 8.569029850746269,
"grad_norm": 0.32968395948410034,
"learning_rate": 0.0004974663167104112,
"loss": 3.4331,
"step": 29400
},
{
"epoch": 8.583605410447761,
"grad_norm": 0.32354989647865295,
"learning_rate": 0.0004972913385826772,
"loss": 3.426,
"step": 29450
},
{
"epoch": 8.598180970149254,
"grad_norm": 0.3291105031967163,
"learning_rate": 0.000497116360454943,
"loss": 3.4402,
"step": 29500
},
{
"epoch": 8.612756529850746,
"grad_norm": 0.329121470451355,
"learning_rate": 0.000496941382327209,
"loss": 3.4417,
"step": 29550
},
{
"epoch": 8.627332089552239,
"grad_norm": 0.3463682532310486,
"learning_rate": 0.000496766404199475,
"loss": 3.4281,
"step": 29600
},
{
"epoch": 8.641907649253731,
"grad_norm": 0.32956844568252563,
"learning_rate": 0.0004965914260717409,
"loss": 3.4311,
"step": 29650
},
{
"epoch": 8.656483208955224,
"grad_norm": 0.305973619222641,
"learning_rate": 0.0004964164479440069,
"loss": 3.4449,
"step": 29700
},
{
"epoch": 8.671058768656717,
"grad_norm": 0.32016459107398987,
"learning_rate": 0.0004962414698162729,
"loss": 3.4306,
"step": 29750
},
{
"epoch": 8.685634328358208,
"grad_norm": 0.3539639711380005,
"learning_rate": 0.0004960664916885388,
"loss": 3.4281,
"step": 29800
},
{
"epoch": 8.700209888059701,
"grad_norm": 0.3156653046607971,
"learning_rate": 0.0004958915135608048,
"loss": 3.4358,
"step": 29850
},
{
"epoch": 8.714785447761194,
"grad_norm": 0.3207005262374878,
"learning_rate": 0.0004957165354330708,
"loss": 3.4357,
"step": 29900
},
{
"epoch": 8.729361007462687,
"grad_norm": 0.3273802101612091,
"learning_rate": 0.0004955415573053368,
"loss": 3.4446,
"step": 29950
},
{
"epoch": 8.743936567164178,
"grad_norm": 0.3333602249622345,
"learning_rate": 0.0004953665791776027,
"loss": 3.433,
"step": 30000
},
{
"epoch": 8.743936567164178,
"eval_accuracy": 0.3657406780780135,
"eval_loss": 3.5863289833068848,
"eval_runtime": 179.2319,
"eval_samples_per_second": 92.768,
"eval_steps_per_second": 5.803,
"step": 30000
},
{
"epoch": 8.758512126865671,
"grad_norm": 0.31307345628738403,
"learning_rate": 0.0004951916010498687,
"loss": 3.4439,
"step": 30050
},
{
"epoch": 8.773087686567164,
"grad_norm": 0.3173314332962036,
"learning_rate": 0.0004950166229221347,
"loss": 3.4468,
"step": 30100
},
{
"epoch": 8.787663246268657,
"grad_norm": 0.3451690673828125,
"learning_rate": 0.0004948416447944006,
"loss": 3.4319,
"step": 30150
},
{
"epoch": 8.802238805970148,
"grad_norm": 0.33415162563323975,
"learning_rate": 0.0004946666666666666,
"loss": 3.4383,
"step": 30200
},
{
"epoch": 8.816814365671641,
"grad_norm": 0.32540515065193176,
"learning_rate": 0.0004944916885389326,
"loss": 3.4377,
"step": 30250
},
{
"epoch": 8.831389925373134,
"grad_norm": 0.3139285147190094,
"learning_rate": 0.0004943167104111986,
"loss": 3.4488,
"step": 30300
},
{
"epoch": 8.845965485074627,
"grad_norm": 0.3355214297771454,
"learning_rate": 0.0004941417322834645,
"loss": 3.4469,
"step": 30350
},
{
"epoch": 8.86054104477612,
"grad_norm": 0.34595736861228943,
"learning_rate": 0.0004939667541557305,
"loss": 3.4522,
"step": 30400
},
{
"epoch": 8.875116604477611,
"grad_norm": 0.3158757984638214,
"learning_rate": 0.0004937917760279965,
"loss": 3.4508,
"step": 30450
},
{
"epoch": 8.889692164179104,
"grad_norm": 0.32975685596466064,
"learning_rate": 0.0004936167979002625,
"loss": 3.4354,
"step": 30500
},
{
"epoch": 8.904267723880597,
"grad_norm": 0.31538817286491394,
"learning_rate": 0.0004934418197725284,
"loss": 3.4436,
"step": 30550
},
{
"epoch": 8.91884328358209,
"grad_norm": 0.33921748399734497,
"learning_rate": 0.0004932668416447943,
"loss": 3.4429,
"step": 30600
},
{
"epoch": 8.933418843283581,
"grad_norm": 0.3398772180080414,
"learning_rate": 0.0004930918635170603,
"loss": 3.4494,
"step": 30650
},
{
"epoch": 8.947994402985074,
"grad_norm": 0.33406862616539,
"learning_rate": 0.0004929168853893263,
"loss": 3.4469,
"step": 30700
},
{
"epoch": 8.962569962686567,
"grad_norm": 0.3257124125957489,
"learning_rate": 0.0004927419072615922,
"loss": 3.4456,
"step": 30750
},
{
"epoch": 8.97714552238806,
"grad_norm": 0.3062405288219452,
"learning_rate": 0.0004925669291338582,
"loss": 3.4359,
"step": 30800
},
{
"epoch": 8.991721082089553,
"grad_norm": 0.34552669525146484,
"learning_rate": 0.0004923919510061242,
"loss": 3.4475,
"step": 30850
},
{
"epoch": 9.006121735074627,
"grad_norm": 0.3198220431804657,
"learning_rate": 0.0004922169728783901,
"loss": 3.3947,
"step": 30900
},
{
"epoch": 9.02069729477612,
"grad_norm": 0.34176215529441833,
"learning_rate": 0.0004920419947506561,
"loss": 3.3396,
"step": 30950
},
{
"epoch": 9.035272854477611,
"grad_norm": 0.3633732497692108,
"learning_rate": 0.0004918670166229221,
"loss": 3.3406,
"step": 31000
},
{
"epoch": 9.035272854477611,
"eval_accuracy": 0.3655304713280229,
"eval_loss": 3.5907046794891357,
"eval_runtime": 179.2394,
"eval_samples_per_second": 92.764,
"eval_steps_per_second": 5.802,
"step": 31000
},
{
"epoch": 9.049848414179104,
"grad_norm": 0.32831311225891113,
"learning_rate": 0.0004916920384951881,
"loss": 3.3294,
"step": 31050
},
{
"epoch": 9.064423973880597,
"grad_norm": 0.33142951130867004,
"learning_rate": 0.000491517060367454,
"loss": 3.348,
"step": 31100
},
{
"epoch": 9.07899953358209,
"grad_norm": 0.34713199734687805,
"learning_rate": 0.00049134208223972,
"loss": 3.351,
"step": 31150
},
{
"epoch": 9.093575093283581,
"grad_norm": 0.3427370488643646,
"learning_rate": 0.000491167104111986,
"loss": 3.3562,
"step": 31200
},
{
"epoch": 9.108150652985074,
"grad_norm": 0.3258208930492401,
"learning_rate": 0.000490992125984252,
"loss": 3.3591,
"step": 31250
},
{
"epoch": 9.122726212686567,
"grad_norm": 0.3553750813007355,
"learning_rate": 0.0004908171478565179,
"loss": 3.3694,
"step": 31300
},
{
"epoch": 9.13730177238806,
"grad_norm": 0.32988446950912476,
"learning_rate": 0.0004906421697287839,
"loss": 3.3654,
"step": 31350
},
{
"epoch": 9.151877332089553,
"grad_norm": 0.33832770586013794,
"learning_rate": 0.0004904671916010499,
"loss": 3.362,
"step": 31400
},
{
"epoch": 9.166452891791044,
"grad_norm": 0.3746139109134674,
"learning_rate": 0.0004902922134733158,
"loss": 3.3804,
"step": 31450
},
{
"epoch": 9.181028451492537,
"grad_norm": 0.32792752981185913,
"learning_rate": 0.0004901172353455818,
"loss": 3.3784,
"step": 31500
},
{
"epoch": 9.19560401119403,
"grad_norm": 0.36606335639953613,
"learning_rate": 0.0004899422572178478,
"loss": 3.3747,
"step": 31550
},
{
"epoch": 9.210179570895523,
"grad_norm": 0.3486144244670868,
"learning_rate": 0.0004897672790901138,
"loss": 3.3741,
"step": 31600
},
{
"epoch": 9.224755130597014,
"grad_norm": 0.3726823925971985,
"learning_rate": 0.0004895923009623796,
"loss": 3.3728,
"step": 31650
},
{
"epoch": 9.239330690298507,
"grad_norm": 0.35447168350219727,
"learning_rate": 0.0004894173228346456,
"loss": 3.3664,
"step": 31700
},
{
"epoch": 9.25390625,
"grad_norm": 0.3537440598011017,
"learning_rate": 0.0004892423447069116,
"loss": 3.3792,
"step": 31750
},
{
"epoch": 9.268481809701493,
"grad_norm": 0.33133864402770996,
"learning_rate": 0.0004890673665791775,
"loss": 3.3743,
"step": 31800
},
{
"epoch": 9.283057369402986,
"grad_norm": 0.34640932083129883,
"learning_rate": 0.0004888923884514435,
"loss": 3.3848,
"step": 31850
},
{
"epoch": 9.297632929104477,
"grad_norm": 0.3547460734844208,
"learning_rate": 0.0004887174103237095,
"loss": 3.3836,
"step": 31900
},
{
"epoch": 9.31220848880597,
"grad_norm": 0.3356497883796692,
"learning_rate": 0.0004885424321959754,
"loss": 3.389,
"step": 31950
},
{
"epoch": 9.326784048507463,
"grad_norm": 0.33560049533843994,
"learning_rate": 0.0004883674540682414,
"loss": 3.3767,
"step": 32000
},
{
"epoch": 9.326784048507463,
"eval_accuracy": 0.36614790951976467,
"eval_loss": 3.587216854095459,
"eval_runtime": 179.274,
"eval_samples_per_second": 92.746,
"eval_steps_per_second": 5.801,
"step": 32000
},
{
"epoch": 9.341359608208956,
"grad_norm": 0.3377925753593445,
"learning_rate": 0.00048819247594050736,
"loss": 3.3874,
"step": 32050
},
{
"epoch": 9.355935167910447,
"grad_norm": 0.3350276052951813,
"learning_rate": 0.00048801749781277336,
"loss": 3.3886,
"step": 32100
},
{
"epoch": 9.37051072761194,
"grad_norm": 0.326797217130661,
"learning_rate": 0.00048784251968503936,
"loss": 3.4037,
"step": 32150
},
{
"epoch": 9.385086287313433,
"grad_norm": 0.33711764216423035,
"learning_rate": 0.0004876675415573053,
"loss": 3.384,
"step": 32200
},
{
"epoch": 9.399661847014926,
"grad_norm": 0.34087830781936646,
"learning_rate": 0.00048749256342957124,
"loss": 3.3944,
"step": 32250
},
{
"epoch": 9.414237406716419,
"grad_norm": 0.31658002734184265,
"learning_rate": 0.00048731758530183724,
"loss": 3.4046,
"step": 32300
},
{
"epoch": 9.42881296641791,
"grad_norm": 0.32896366715431213,
"learning_rate": 0.0004871426071741032,
"loss": 3.3909,
"step": 32350
},
{
"epoch": 9.443388526119403,
"grad_norm": 0.3304135799407959,
"learning_rate": 0.0004869676290463692,
"loss": 3.4053,
"step": 32400
},
{
"epoch": 9.457964085820896,
"grad_norm": 0.3395926356315613,
"learning_rate": 0.0004867926509186351,
"loss": 3.3937,
"step": 32450
},
{
"epoch": 9.472539645522389,
"grad_norm": 0.35604408383369446,
"learning_rate": 0.00048661767279090107,
"loss": 3.3919,
"step": 32500
},
{
"epoch": 9.48711520522388,
"grad_norm": 0.33686935901641846,
"learning_rate": 0.00048644269466316707,
"loss": 3.393,
"step": 32550
},
{
"epoch": 9.501690764925373,
"grad_norm": 0.3407963216304779,
"learning_rate": 0.00048626771653543306,
"loss": 3.4004,
"step": 32600
},
{
"epoch": 9.516266324626866,
"grad_norm": 0.3341701030731201,
"learning_rate": 0.00048609273840769895,
"loss": 3.4022,
"step": 32650
},
{
"epoch": 9.530841884328359,
"grad_norm": 0.3404782712459564,
"learning_rate": 0.00048591776027996495,
"loss": 3.4092,
"step": 32700
},
{
"epoch": 9.545417444029852,
"grad_norm": 0.346642404794693,
"learning_rate": 0.00048574278215223095,
"loss": 3.4079,
"step": 32750
},
{
"epoch": 9.559993003731343,
"grad_norm": 0.3336687386035919,
"learning_rate": 0.0004855678040244969,
"loss": 3.4083,
"step": 32800
},
{
"epoch": 9.574568563432836,
"grad_norm": 0.33750444650650024,
"learning_rate": 0.00048539282589676283,
"loss": 3.4093,
"step": 32850
},
{
"epoch": 9.589144123134329,
"grad_norm": 0.3416513502597809,
"learning_rate": 0.00048521784776902883,
"loss": 3.3882,
"step": 32900
},
{
"epoch": 9.603719682835822,
"grad_norm": 0.3458026051521301,
"learning_rate": 0.00048504286964129483,
"loss": 3.4058,
"step": 32950
},
{
"epoch": 9.618295242537313,
"grad_norm": 0.3294159471988678,
"learning_rate": 0.0004848678915135607,
"loss": 3.4124,
"step": 33000
},
{
"epoch": 9.618295242537313,
"eval_accuracy": 0.3663903652336396,
"eval_loss": 3.578124523162842,
"eval_runtime": 179.3194,
"eval_samples_per_second": 92.723,
"eval_steps_per_second": 5.8,
"step": 33000
},
{
"epoch": 9.632870802238806,
"grad_norm": 0.3293006718158722,
"learning_rate": 0.0004846929133858267,
"loss": 3.4163,
"step": 33050
},
{
"epoch": 9.647446361940299,
"grad_norm": 0.33910810947418213,
"learning_rate": 0.0004845179352580927,
"loss": 3.4063,
"step": 33100
},
{
"epoch": 9.662021921641792,
"grad_norm": 0.32959282398223877,
"learning_rate": 0.0004843429571303587,
"loss": 3.4139,
"step": 33150
},
{
"epoch": 9.676597481343283,
"grad_norm": 0.3160514235496521,
"learning_rate": 0.0004841679790026246,
"loss": 3.4103,
"step": 33200
},
{
"epoch": 9.691173041044776,
"grad_norm": 0.32494455575942993,
"learning_rate": 0.0004839930008748906,
"loss": 3.4003,
"step": 33250
},
{
"epoch": 9.705748600746269,
"grad_norm": 0.33086997270584106,
"learning_rate": 0.0004838180227471566,
"loss": 3.4161,
"step": 33300
},
{
"epoch": 9.720324160447761,
"grad_norm": 0.3385337293148041,
"learning_rate": 0.00048364304461942254,
"loss": 3.4096,
"step": 33350
},
{
"epoch": 9.734899720149254,
"grad_norm": 0.31951361894607544,
"learning_rate": 0.0004834680664916885,
"loss": 3.4144,
"step": 33400
},
{
"epoch": 9.749475279850746,
"grad_norm": 0.32688412070274353,
"learning_rate": 0.0004832930883639545,
"loss": 3.4109,
"step": 33450
},
{
"epoch": 9.764050839552239,
"grad_norm": 0.3271602392196655,
"learning_rate": 0.0004831181102362204,
"loss": 3.4155,
"step": 33500
},
{
"epoch": 9.778626399253731,
"grad_norm": 0.30692335963249207,
"learning_rate": 0.00048294313210848637,
"loss": 3.4077,
"step": 33550
},
{
"epoch": 9.793201958955224,
"grad_norm": 0.35177081823349,
"learning_rate": 0.00048276815398075237,
"loss": 3.419,
"step": 33600
},
{
"epoch": 9.807777518656717,
"grad_norm": 0.3389143645763397,
"learning_rate": 0.0004825931758530183,
"loss": 3.419,
"step": 33650
},
{
"epoch": 9.822353078358208,
"grad_norm": 0.3317539691925049,
"learning_rate": 0.0004824181977252843,
"loss": 3.4218,
"step": 33700
},
{
"epoch": 9.836928638059701,
"grad_norm": 0.3414798378944397,
"learning_rate": 0.00048224321959755025,
"loss": 3.4083,
"step": 33750
},
{
"epoch": 9.851504197761194,
"grad_norm": 0.32761162519454956,
"learning_rate": 0.0004820682414698162,
"loss": 3.4208,
"step": 33800
},
{
"epoch": 9.866079757462687,
"grad_norm": 0.346387654542923,
"learning_rate": 0.0004818932633420822,
"loss": 3.4237,
"step": 33850
},
{
"epoch": 9.880655317164178,
"grad_norm": 0.3281957805156708,
"learning_rate": 0.0004817182852143482,
"loss": 3.4228,
"step": 33900
},
{
"epoch": 9.895230876865671,
"grad_norm": 0.3266942799091339,
"learning_rate": 0.0004815433070866141,
"loss": 3.4279,
"step": 33950
},
{
"epoch": 9.909806436567164,
"grad_norm": 0.314626544713974,
"learning_rate": 0.0004813683289588801,
"loss": 3.4173,
"step": 34000
},
{
"epoch": 9.909806436567164,
"eval_accuracy": 0.367117614678316,
"eval_loss": 3.571929931640625,
"eval_runtime": 179.3176,
"eval_samples_per_second": 92.724,
"eval_steps_per_second": 5.8,
"step": 34000
},
{
"epoch": 9.924381996268657,
"grad_norm": 0.3587126135826111,
"learning_rate": 0.0004811933508311461,
"loss": 3.4121,
"step": 34050
},
{
"epoch": 9.938957555970148,
"grad_norm": 0.32173195481300354,
"learning_rate": 0.00048101837270341207,
"loss": 3.4217,
"step": 34100
},
{
"epoch": 9.953533115671641,
"grad_norm": 0.3450935184955597,
"learning_rate": 0.00048084339457567796,
"loss": 3.425,
"step": 34150
},
{
"epoch": 9.968108675373134,
"grad_norm": 0.35819029808044434,
"learning_rate": 0.00048066841644794396,
"loss": 3.4197,
"step": 34200
},
{
"epoch": 9.982684235074627,
"grad_norm": 0.33679142594337463,
"learning_rate": 0.00048049343832020996,
"loss": 3.4399,
"step": 34250
},
{
"epoch": 9.99725979477612,
"grad_norm": 0.32246530055999756,
"learning_rate": 0.00048031846019247595,
"loss": 3.4234,
"step": 34300
},
{
"epoch": 10.011660447761194,
"grad_norm": 0.35621923208236694,
"learning_rate": 0.00048014348206474184,
"loss": 3.3286,
"step": 34350
},
{
"epoch": 10.026236007462687,
"grad_norm": 0.3410172760486603,
"learning_rate": 0.00047996850393700784,
"loss": 3.317,
"step": 34400
},
{
"epoch": 10.040811567164178,
"grad_norm": 0.3566003441810608,
"learning_rate": 0.00047979352580927384,
"loss": 3.3056,
"step": 34450
},
{
"epoch": 10.055387126865671,
"grad_norm": 0.3618554174900055,
"learning_rate": 0.00047961854768153973,
"loss": 3.324,
"step": 34500
},
{
"epoch": 10.069962686567164,
"grad_norm": 0.33970338106155396,
"learning_rate": 0.0004794435695538057,
"loss": 3.3137,
"step": 34550
},
{
"epoch": 10.084538246268657,
"grad_norm": 0.38800761103630066,
"learning_rate": 0.0004792685914260717,
"loss": 3.3341,
"step": 34600
},
{
"epoch": 10.099113805970148,
"grad_norm": 0.3676946759223938,
"learning_rate": 0.00047909361329833767,
"loss": 3.3303,
"step": 34650
},
{
"epoch": 10.113689365671641,
"grad_norm": 0.31517815589904785,
"learning_rate": 0.0004789186351706036,
"loss": 3.3275,
"step": 34700
},
{
"epoch": 10.128264925373134,
"grad_norm": 0.3755510151386261,
"learning_rate": 0.0004787436570428696,
"loss": 3.3427,
"step": 34750
},
{
"epoch": 10.142840485074627,
"grad_norm": 0.3374476134777069,
"learning_rate": 0.00047856867891513555,
"loss": 3.3345,
"step": 34800
},
{
"epoch": 10.15741604477612,
"grad_norm": 0.3470100462436676,
"learning_rate": 0.00047839370078740155,
"loss": 3.3368,
"step": 34850
},
{
"epoch": 10.171991604477611,
"grad_norm": 0.349649578332901,
"learning_rate": 0.0004782187226596675,
"loss": 3.3515,
"step": 34900
},
{
"epoch": 10.186567164179104,
"grad_norm": 0.35084185004234314,
"learning_rate": 0.00047804374453193344,
"loss": 3.3456,
"step": 34950
},
{
"epoch": 10.201142723880597,
"grad_norm": 0.3345482647418976,
"learning_rate": 0.00047786876640419943,
"loss": 3.353,
"step": 35000
},
{
"epoch": 10.201142723880597,
"eval_accuracy": 0.3668088955824451,
"eval_loss": 3.5851173400878906,
"eval_runtime": 179.1694,
"eval_samples_per_second": 92.8,
"eval_steps_per_second": 5.805,
"step": 35000
},
{
"epoch": 10.21571828358209,
"grad_norm": 0.34957993030548096,
"learning_rate": 0.00047769378827646543,
"loss": 3.3386,
"step": 35050
},
{
"epoch": 10.230293843283581,
"grad_norm": 0.3573317229747772,
"learning_rate": 0.0004775188101487313,
"loss": 3.3499,
"step": 35100
},
{
"epoch": 10.244869402985074,
"grad_norm": 0.3407445251941681,
"learning_rate": 0.0004773438320209973,
"loss": 3.367,
"step": 35150
},
{
"epoch": 10.259444962686567,
"grad_norm": 0.34660759568214417,
"learning_rate": 0.0004771688538932633,
"loss": 3.3599,
"step": 35200
},
{
"epoch": 10.27402052238806,
"grad_norm": 0.3838936388492584,
"learning_rate": 0.0004769938757655293,
"loss": 3.3592,
"step": 35250
},
{
"epoch": 10.288596082089553,
"grad_norm": 0.343936949968338,
"learning_rate": 0.0004768188976377952,
"loss": 3.3729,
"step": 35300
},
{
"epoch": 10.303171641791044,
"grad_norm": 0.35758844017982483,
"learning_rate": 0.0004766439195100612,
"loss": 3.3567,
"step": 35350
},
{
"epoch": 10.317747201492537,
"grad_norm": 0.36581915616989136,
"learning_rate": 0.0004764689413823272,
"loss": 3.3649,
"step": 35400
},
{
"epoch": 10.33232276119403,
"grad_norm": 0.342654287815094,
"learning_rate": 0.0004762939632545931,
"loss": 3.371,
"step": 35450
},
{
"epoch": 10.346898320895523,
"grad_norm": 0.37132593989372253,
"learning_rate": 0.0004761189851268591,
"loss": 3.3667,
"step": 35500
},
{
"epoch": 10.361473880597014,
"grad_norm": 0.32585781812667847,
"learning_rate": 0.0004759440069991251,
"loss": 3.363,
"step": 35550
},
{
"epoch": 10.376049440298507,
"grad_norm": 0.3182028532028198,
"learning_rate": 0.0004757690288713911,
"loss": 3.3668,
"step": 35600
},
{
"epoch": 10.390625,
"grad_norm": 0.3395622968673706,
"learning_rate": 0.00047559405074365697,
"loss": 3.3641,
"step": 35650
},
{
"epoch": 10.405200559701493,
"grad_norm": 0.3391034007072449,
"learning_rate": 0.00047541907261592297,
"loss": 3.3705,
"step": 35700
},
{
"epoch": 10.419776119402986,
"grad_norm": 0.3554554879665375,
"learning_rate": 0.00047524409448818897,
"loss": 3.3717,
"step": 35750
},
{
"epoch": 10.434351679104477,
"grad_norm": 0.3345549404621124,
"learning_rate": 0.0004750691163604549,
"loss": 3.3593,
"step": 35800
},
{
"epoch": 10.44892723880597,
"grad_norm": 0.33761605620384216,
"learning_rate": 0.00047489413823272085,
"loss": 3.3697,
"step": 35850
},
{
"epoch": 10.463502798507463,
"grad_norm": 0.3620910942554474,
"learning_rate": 0.00047471916010498685,
"loss": 3.3873,
"step": 35900
},
{
"epoch": 10.478078358208956,
"grad_norm": 0.3357434570789337,
"learning_rate": 0.0004745441819772528,
"loss": 3.3693,
"step": 35950
},
{
"epoch": 10.492653917910447,
"grad_norm": 0.3724294900894165,
"learning_rate": 0.0004743692038495188,
"loss": 3.386,
"step": 36000
},
{
"epoch": 10.492653917910447,
"eval_accuracy": 0.36732217197477945,
"eval_loss": 3.575730323791504,
"eval_runtime": 179.2041,
"eval_samples_per_second": 92.782,
"eval_steps_per_second": 5.803,
"step": 36000
},
{
"epoch": 10.50722947761194,
"grad_norm": 0.3459193706512451,
"learning_rate": 0.00047419422572178474,
"loss": 3.3686,
"step": 36050
},
{
"epoch": 10.521805037313433,
"grad_norm": 0.34317150712013245,
"learning_rate": 0.0004740192475940507,
"loss": 3.3915,
"step": 36100
},
{
"epoch": 10.536380597014926,
"grad_norm": 0.3495369553565979,
"learning_rate": 0.0004738442694663167,
"loss": 3.3801,
"step": 36150
},
{
"epoch": 10.550956156716419,
"grad_norm": 0.32712510228157043,
"learning_rate": 0.0004736692913385827,
"loss": 3.3806,
"step": 36200
},
{
"epoch": 10.56553171641791,
"grad_norm": 0.3268829584121704,
"learning_rate": 0.00047349431321084856,
"loss": 3.3778,
"step": 36250
},
{
"epoch": 10.580107276119403,
"grad_norm": 0.3383152484893799,
"learning_rate": 0.00047331933508311456,
"loss": 3.3905,
"step": 36300
},
{
"epoch": 10.594682835820896,
"grad_norm": 0.3256557881832123,
"learning_rate": 0.00047314435695538056,
"loss": 3.3769,
"step": 36350
},
{
"epoch": 10.609258395522389,
"grad_norm": 0.3504721224308014,
"learning_rate": 0.00047296937882764645,
"loss": 3.3907,
"step": 36400
},
{
"epoch": 10.62383395522388,
"grad_norm": 0.3189436197280884,
"learning_rate": 0.00047279440069991245,
"loss": 3.3956,
"step": 36450
},
{
"epoch": 10.638409514925373,
"grad_norm": 0.3360075056552887,
"learning_rate": 0.00047261942257217844,
"loss": 3.3905,
"step": 36500
},
{
"epoch": 10.652985074626866,
"grad_norm": 0.35313287377357483,
"learning_rate": 0.00047244444444444444,
"loss": 3.3966,
"step": 36550
},
{
"epoch": 10.667560634328359,
"grad_norm": 0.34554523229599,
"learning_rate": 0.00047226946631671033,
"loss": 3.3831,
"step": 36600
},
{
"epoch": 10.682136194029852,
"grad_norm": 0.3466152548789978,
"learning_rate": 0.00047209448818897633,
"loss": 3.379,
"step": 36650
},
{
"epoch": 10.696711753731343,
"grad_norm": 0.36634284257888794,
"learning_rate": 0.0004719195100612423,
"loss": 3.3928,
"step": 36700
},
{
"epoch": 10.711287313432836,
"grad_norm": 0.3495074510574341,
"learning_rate": 0.00047174453193350827,
"loss": 3.3986,
"step": 36750
},
{
"epoch": 10.725862873134329,
"grad_norm": 0.3547740578651428,
"learning_rate": 0.0004715695538057742,
"loss": 3.3958,
"step": 36800
},
{
"epoch": 10.740438432835822,
"grad_norm": 0.3183799684047699,
"learning_rate": 0.0004713945756780402,
"loss": 3.3926,
"step": 36850
},
{
"epoch": 10.755013992537313,
"grad_norm": 0.3465604782104492,
"learning_rate": 0.0004712195975503062,
"loss": 3.4043,
"step": 36900
},
{
"epoch": 10.769589552238806,
"grad_norm": 0.3227628767490387,
"learning_rate": 0.00047104461942257215,
"loss": 3.4051,
"step": 36950
},
{
"epoch": 10.784165111940299,
"grad_norm": 0.3621023893356323,
"learning_rate": 0.0004708696412948381,
"loss": 3.3967,
"step": 37000
},
{
"epoch": 10.784165111940299,
"eval_accuracy": 0.3680770802023493,
"eval_loss": 3.568211793899536,
"eval_runtime": 179.2961,
"eval_samples_per_second": 92.735,
"eval_steps_per_second": 5.8,
"step": 37000
},
{
"epoch": 10.798740671641792,
"grad_norm": 0.33479082584381104,
"learning_rate": 0.0004706946631671041,
"loss": 3.3971,
"step": 37050
},
{
"epoch": 10.813316231343283,
"grad_norm": 0.36273670196533203,
"learning_rate": 0.00047051968503937004,
"loss": 3.3936,
"step": 37100
},
{
"epoch": 10.827891791044776,
"grad_norm": 0.3388984799385071,
"learning_rate": 0.000470344706911636,
"loss": 3.4087,
"step": 37150
},
{
"epoch": 10.842467350746269,
"grad_norm": 0.33692026138305664,
"learning_rate": 0.000470169728783902,
"loss": 3.4024,
"step": 37200
},
{
"epoch": 10.857042910447761,
"grad_norm": 0.34697890281677246,
"learning_rate": 0.0004699947506561679,
"loss": 3.3852,
"step": 37250
},
{
"epoch": 10.871618470149254,
"grad_norm": 0.33639323711395264,
"learning_rate": 0.0004698197725284339,
"loss": 3.4011,
"step": 37300
},
{
"epoch": 10.886194029850746,
"grad_norm": 0.3242666721343994,
"learning_rate": 0.00046964479440069986,
"loss": 3.3955,
"step": 37350
},
{
"epoch": 10.900769589552239,
"grad_norm": 0.34894293546676636,
"learning_rate": 0.0004694698162729658,
"loss": 3.4065,
"step": 37400
},
{
"epoch": 10.915345149253731,
"grad_norm": 0.3249868154525757,
"learning_rate": 0.0004692948381452318,
"loss": 3.4048,
"step": 37450
},
{
"epoch": 10.929920708955224,
"grad_norm": 0.3601526618003845,
"learning_rate": 0.0004691198600174978,
"loss": 3.4165,
"step": 37500
},
{
"epoch": 10.944496268656717,
"grad_norm": 0.3195374310016632,
"learning_rate": 0.0004689448818897637,
"loss": 3.3931,
"step": 37550
},
{
"epoch": 10.959071828358208,
"grad_norm": 0.35160696506500244,
"learning_rate": 0.0004687699037620297,
"loss": 3.4037,
"step": 37600
},
{
"epoch": 10.973647388059701,
"grad_norm": 0.39584487676620483,
"learning_rate": 0.0004685949256342957,
"loss": 3.3957,
"step": 37650
},
{
"epoch": 10.988222947761194,
"grad_norm": 0.3482179343700409,
"learning_rate": 0.0004684199475065617,
"loss": 3.3878,
"step": 37700
},
{
"epoch": 11.002623600746269,
"grad_norm": 0.3683445453643799,
"learning_rate": 0.0004682449693788276,
"loss": 3.3851,
"step": 37750
},
{
"epoch": 11.017199160447761,
"grad_norm": 0.35708582401275635,
"learning_rate": 0.00046806999125109357,
"loss": 3.2881,
"step": 37800
},
{
"epoch": 11.031774720149254,
"grad_norm": 0.33956989645957947,
"learning_rate": 0.00046789501312335957,
"loss": 3.2884,
"step": 37850
},
{
"epoch": 11.046350279850746,
"grad_norm": 0.35305795073509216,
"learning_rate": 0.0004677200349956255,
"loss": 3.2951,
"step": 37900
},
{
"epoch": 11.060925839552239,
"grad_norm": 0.3429839313030243,
"learning_rate": 0.00046754505686789146,
"loss": 3.3103,
"step": 37950
},
{
"epoch": 11.075501399253731,
"grad_norm": 0.36126893758773804,
"learning_rate": 0.00046737007874015745,
"loss": 3.3008,
"step": 38000
},
{
"epoch": 11.075501399253731,
"eval_accuracy": 0.3674310416521262,
"eval_loss": 3.5771067142486572,
"eval_runtime": 179.37,
"eval_samples_per_second": 92.697,
"eval_steps_per_second": 5.798,
"step": 38000
},
{
"epoch": 11.090076958955224,
"grad_norm": 0.3234972357749939,
"learning_rate": 0.0004671951006124234,
"loss": 3.3033,
"step": 38050
},
{
"epoch": 11.104652518656716,
"grad_norm": 0.3429054021835327,
"learning_rate": 0.00046702012248468934,
"loss": 3.3104,
"step": 38100
},
{
"epoch": 11.119228078358208,
"grad_norm": 0.366301566362381,
"learning_rate": 0.00046684514435695534,
"loss": 3.3158,
"step": 38150
},
{
"epoch": 11.133803638059701,
"grad_norm": 0.35824158787727356,
"learning_rate": 0.00046667016622922134,
"loss": 3.3167,
"step": 38200
},
{
"epoch": 11.148379197761194,
"grad_norm": 0.3471207618713379,
"learning_rate": 0.0004664951881014873,
"loss": 3.3236,
"step": 38250
},
{
"epoch": 11.162954757462687,
"grad_norm": 0.33413344621658325,
"learning_rate": 0.0004663202099737532,
"loss": 3.335,
"step": 38300
},
{
"epoch": 11.177530317164178,
"grad_norm": 0.35084760189056396,
"learning_rate": 0.0004661452318460192,
"loss": 3.3378,
"step": 38350
},
{
"epoch": 11.192105876865671,
"grad_norm": 0.3245983421802521,
"learning_rate": 0.00046597025371828516,
"loss": 3.3289,
"step": 38400
},
{
"epoch": 11.206681436567164,
"grad_norm": 0.3432168960571289,
"learning_rate": 0.00046579527559055116,
"loss": 3.3173,
"step": 38450
},
{
"epoch": 11.221256996268657,
"grad_norm": 0.3392120599746704,
"learning_rate": 0.0004656202974628171,
"loss": 3.3176,
"step": 38500
},
{
"epoch": 11.235832555970148,
"grad_norm": 0.30957579612731934,
"learning_rate": 0.00046544531933508305,
"loss": 3.3328,
"step": 38550
},
{
"epoch": 11.250408115671641,
"grad_norm": 0.3354186415672302,
"learning_rate": 0.00046527034120734905,
"loss": 3.343,
"step": 38600
},
{
"epoch": 11.264983675373134,
"grad_norm": 0.3629128634929657,
"learning_rate": 0.00046509536307961504,
"loss": 3.3423,
"step": 38650
},
{
"epoch": 11.279559235074627,
"grad_norm": 0.3577345311641693,
"learning_rate": 0.00046492038495188093,
"loss": 3.3461,
"step": 38700
},
{
"epoch": 11.29413479477612,
"grad_norm": 0.35234689712524414,
"learning_rate": 0.00046474540682414693,
"loss": 3.3383,
"step": 38750
},
{
"epoch": 11.308710354477611,
"grad_norm": 0.35899555683135986,
"learning_rate": 0.00046457042869641293,
"loss": 3.3405,
"step": 38800
},
{
"epoch": 11.323285914179104,
"grad_norm": 0.3531411290168762,
"learning_rate": 0.0004643954505686789,
"loss": 3.3408,
"step": 38850
},
{
"epoch": 11.337861473880597,
"grad_norm": 0.3381483256816864,
"learning_rate": 0.0004642204724409448,
"loss": 3.3351,
"step": 38900
},
{
"epoch": 11.35243703358209,
"grad_norm": 0.36588019132614136,
"learning_rate": 0.0004640454943132108,
"loss": 3.3432,
"step": 38950
},
{
"epoch": 11.367012593283581,
"grad_norm": 0.36804234981536865,
"learning_rate": 0.0004638705161854768,
"loss": 3.3421,
"step": 39000
},
{
"epoch": 11.367012593283581,
"eval_accuracy": 0.3676348927668987,
"eval_loss": 3.5748021602630615,
"eval_runtime": 179.2112,
"eval_samples_per_second": 92.779,
"eval_steps_per_second": 5.803,
"step": 39000
},
{
"epoch": 11.381588152985074,
"grad_norm": 0.36152568459510803,
"learning_rate": 0.0004636955380577427,
"loss": 3.3403,
"step": 39050
},
{
"epoch": 11.396163712686567,
"grad_norm": 0.327120304107666,
"learning_rate": 0.0004635205599300087,
"loss": 3.3424,
"step": 39100
},
{
"epoch": 11.41073927238806,
"grad_norm": 0.3252420425415039,
"learning_rate": 0.0004633455818022747,
"loss": 3.3589,
"step": 39150
},
{
"epoch": 11.425314832089553,
"grad_norm": 0.3580802083015442,
"learning_rate": 0.00046317060367454064,
"loss": 3.3514,
"step": 39200
},
{
"epoch": 11.439890391791044,
"grad_norm": 0.3448558449745178,
"learning_rate": 0.0004629956255468066,
"loss": 3.3533,
"step": 39250
},
{
"epoch": 11.454465951492537,
"grad_norm": 0.35852617025375366,
"learning_rate": 0.0004628206474190726,
"loss": 3.3519,
"step": 39300
},
{
"epoch": 11.46904151119403,
"grad_norm": 0.3588276505470276,
"learning_rate": 0.0004626456692913385,
"loss": 3.3636,
"step": 39350
},
{
"epoch": 11.483617070895523,
"grad_norm": 0.34626030921936035,
"learning_rate": 0.0004624706911636045,
"loss": 3.3617,
"step": 39400
},
{
"epoch": 11.498192630597014,
"grad_norm": 0.3265782594680786,
"learning_rate": 0.00046229571303587046,
"loss": 3.3584,
"step": 39450
},
{
"epoch": 11.512768190298507,
"grad_norm": 0.36340591311454773,
"learning_rate": 0.00046212073490813646,
"loss": 3.3619,
"step": 39500
},
{
"epoch": 11.52734375,
"grad_norm": 0.3383502960205078,
"learning_rate": 0.0004619457567804024,
"loss": 3.3671,
"step": 39550
},
{
"epoch": 11.541919309701493,
"grad_norm": 0.32055920362472534,
"learning_rate": 0.0004617707786526684,
"loss": 3.3657,
"step": 39600
},
{
"epoch": 11.556494869402986,
"grad_norm": 0.3531268835067749,
"learning_rate": 0.00046159580052493435,
"loss": 3.359,
"step": 39650
},
{
"epoch": 11.571070429104477,
"grad_norm": 0.38840246200561523,
"learning_rate": 0.0004614208223972003,
"loss": 3.3683,
"step": 39700
},
{
"epoch": 11.58564598880597,
"grad_norm": 0.350590318441391,
"learning_rate": 0.0004612458442694663,
"loss": 3.3655,
"step": 39750
},
{
"epoch": 11.600221548507463,
"grad_norm": 0.35112035274505615,
"learning_rate": 0.0004610708661417323,
"loss": 3.3695,
"step": 39800
},
{
"epoch": 11.614797108208956,
"grad_norm": 0.37844619154930115,
"learning_rate": 0.0004608958880139982,
"loss": 3.3694,
"step": 39850
},
{
"epoch": 11.629372667910447,
"grad_norm": 0.32265302538871765,
"learning_rate": 0.00046072090988626417,
"loss": 3.3716,
"step": 39900
},
{
"epoch": 11.64394822761194,
"grad_norm": 0.3645426332950592,
"learning_rate": 0.00046054593175853017,
"loss": 3.3769,
"step": 39950
},
{
"epoch": 11.658523787313433,
"grad_norm": 0.35819122195243835,
"learning_rate": 0.00046037095363079606,
"loss": 3.3865,
"step": 40000
},
{
"epoch": 11.658523787313433,
"eval_accuracy": 0.3685178552744181,
"eval_loss": 3.5665090084075928,
"eval_runtime": 179.4054,
"eval_samples_per_second": 92.678,
"eval_steps_per_second": 5.797,
"step": 40000
},
{
"epoch": 11.673099347014926,
"grad_norm": 0.34585466980934143,
"learning_rate": 0.00046019597550306206,
"loss": 3.3688,
"step": 40050
},
{
"epoch": 11.687674906716419,
"grad_norm": 0.3452220559120178,
"learning_rate": 0.00046002099737532806,
"loss": 3.3689,
"step": 40100
},
{
"epoch": 11.70225046641791,
"grad_norm": 0.3693149983882904,
"learning_rate": 0.00045984601924759405,
"loss": 3.3746,
"step": 40150
},
{
"epoch": 11.716826026119403,
"grad_norm": 0.32303112745285034,
"learning_rate": 0.00045967104111985994,
"loss": 3.3624,
"step": 40200
},
{
"epoch": 11.731401585820896,
"grad_norm": 0.33094048500061035,
"learning_rate": 0.00045949606299212594,
"loss": 3.3638,
"step": 40250
},
{
"epoch": 11.745977145522389,
"grad_norm": 0.3415735960006714,
"learning_rate": 0.00045932108486439194,
"loss": 3.3793,
"step": 40300
},
{
"epoch": 11.76055270522388,
"grad_norm": 0.35656100511550903,
"learning_rate": 0.0004591461067366579,
"loss": 3.3694,
"step": 40350
},
{
"epoch": 11.775128264925373,
"grad_norm": 0.34617358446121216,
"learning_rate": 0.0004589711286089238,
"loss": 3.3741,
"step": 40400
},
{
"epoch": 11.789703824626866,
"grad_norm": 0.36538684368133545,
"learning_rate": 0.0004587961504811898,
"loss": 3.379,
"step": 40450
},
{
"epoch": 11.804279384328359,
"grad_norm": 0.36401626467704773,
"learning_rate": 0.00045862117235345577,
"loss": 3.3815,
"step": 40500
},
{
"epoch": 11.818854944029852,
"grad_norm": 0.33819693326950073,
"learning_rate": 0.00045844619422572176,
"loss": 3.3767,
"step": 40550
},
{
"epoch": 11.833430503731343,
"grad_norm": 0.3567153811454773,
"learning_rate": 0.0004582712160979877,
"loss": 3.3868,
"step": 40600
},
{
"epoch": 11.848006063432836,
"grad_norm": 0.3180365562438965,
"learning_rate": 0.00045809623797025365,
"loss": 3.3854,
"step": 40650
},
{
"epoch": 11.862581623134329,
"grad_norm": 0.3368315100669861,
"learning_rate": 0.00045792125984251965,
"loss": 3.3879,
"step": 40700
},
{
"epoch": 11.877157182835822,
"grad_norm": 0.3494505286216736,
"learning_rate": 0.0004577462817147856,
"loss": 3.3775,
"step": 40750
},
{
"epoch": 11.891732742537313,
"grad_norm": 0.3420652747154236,
"learning_rate": 0.0004575713035870516,
"loss": 3.3773,
"step": 40800
},
{
"epoch": 11.906308302238806,
"grad_norm": 0.343851774930954,
"learning_rate": 0.00045739632545931753,
"loss": 3.3788,
"step": 40850
},
{
"epoch": 11.920883861940299,
"grad_norm": 0.3384932279586792,
"learning_rate": 0.00045722134733158353,
"loss": 3.3777,
"step": 40900
},
{
"epoch": 11.935459421641792,
"grad_norm": 0.35529130697250366,
"learning_rate": 0.0004570463692038495,
"loss": 3.3728,
"step": 40950
},
{
"epoch": 11.950034981343283,
"grad_norm": 0.3340221643447876,
"learning_rate": 0.0004568713910761154,
"loss": 3.3826,
"step": 41000
},
{
"epoch": 11.950034981343283,
"eval_accuracy": 0.3690850368691576,
"eval_loss": 3.5591351985931396,
"eval_runtime": 179.5419,
"eval_samples_per_second": 92.608,
"eval_steps_per_second": 5.793,
"step": 41000
},
{
"epoch": 11.964610541044776,
"grad_norm": 0.33717867732048035,
"learning_rate": 0.0004566964129483814,
"loss": 3.3718,
"step": 41050
},
{
"epoch": 11.979186100746269,
"grad_norm": 0.34679439663887024,
"learning_rate": 0.0004565214348206474,
"loss": 3.3889,
"step": 41100
},
{
"epoch": 11.993761660447761,
"grad_norm": 0.3585461378097534,
"learning_rate": 0.0004563464566929133,
"loss": 3.3777,
"step": 41150
},
{
"epoch": 12.008162313432836,
"grad_norm": 0.33981046080589294,
"learning_rate": 0.0004561714785651793,
"loss": 3.3212,
"step": 41200
},
{
"epoch": 12.022737873134329,
"grad_norm": 0.3614978790283203,
"learning_rate": 0.0004559965004374453,
"loss": 3.2726,
"step": 41250
},
{
"epoch": 12.037313432835822,
"grad_norm": 0.33403632044792175,
"learning_rate": 0.0004558215223097113,
"loss": 3.2689,
"step": 41300
},
{
"epoch": 12.051888992537313,
"grad_norm": 0.34065091609954834,
"learning_rate": 0.0004556465441819772,
"loss": 3.2765,
"step": 41350
},
{
"epoch": 12.066464552238806,
"grad_norm": 0.34842050075531006,
"learning_rate": 0.0004554715660542432,
"loss": 3.2862,
"step": 41400
},
{
"epoch": 12.081040111940299,
"grad_norm": 0.3776961863040924,
"learning_rate": 0.0004552965879265092,
"loss": 3.297,
"step": 41450
},
{
"epoch": 12.095615671641792,
"grad_norm": 0.3554358184337616,
"learning_rate": 0.0004551216097987751,
"loss": 3.2924,
"step": 41500
},
{
"epoch": 12.110191231343284,
"grad_norm": 0.33396828174591064,
"learning_rate": 0.00045494663167104107,
"loss": 3.29,
"step": 41550
},
{
"epoch": 12.124766791044776,
"grad_norm": 0.3485698699951172,
"learning_rate": 0.00045477165354330706,
"loss": 3.3019,
"step": 41600
},
{
"epoch": 12.139342350746269,
"grad_norm": 0.32045650482177734,
"learning_rate": 0.000454596675415573,
"loss": 3.3071,
"step": 41650
},
{
"epoch": 12.153917910447761,
"grad_norm": 0.35876530408859253,
"learning_rate": 0.00045442169728783895,
"loss": 3.3071,
"step": 41700
},
{
"epoch": 12.168493470149254,
"grad_norm": 0.3410411477088928,
"learning_rate": 0.00045424671916010495,
"loss": 3.3152,
"step": 41750
},
{
"epoch": 12.183069029850746,
"grad_norm": 0.36790353059768677,
"learning_rate": 0.0004540717410323709,
"loss": 3.3032,
"step": 41800
},
{
"epoch": 12.197644589552239,
"grad_norm": 0.33544808626174927,
"learning_rate": 0.0004538967629046369,
"loss": 3.3031,
"step": 41850
},
{
"epoch": 12.212220149253731,
"grad_norm": 0.35613003373146057,
"learning_rate": 0.00045372178477690283,
"loss": 3.3092,
"step": 41900
},
{
"epoch": 12.226795708955224,
"grad_norm": 0.3690536320209503,
"learning_rate": 0.0004535468066491688,
"loss": 3.3118,
"step": 41950
},
{
"epoch": 12.241371268656717,
"grad_norm": 0.35943329334259033,
"learning_rate": 0.0004533718285214348,
"loss": 3.3125,
"step": 42000
},
{
"epoch": 12.241371268656717,
"eval_accuracy": 0.36872500190374813,
"eval_loss": 3.57114839553833,
"eval_runtime": 179.3203,
"eval_samples_per_second": 92.722,
"eval_steps_per_second": 5.8,
"step": 42000
},
{
"epoch": 12.255946828358208,
"grad_norm": 0.3558652102947235,
"learning_rate": 0.00045319685039370077,
"loss": 3.3197,
"step": 42050
},
{
"epoch": 12.270522388059701,
"grad_norm": 0.36180514097213745,
"learning_rate": 0.0004530218722659667,
"loss": 3.3216,
"step": 42100
},
{
"epoch": 12.285097947761194,
"grad_norm": 0.34139835834503174,
"learning_rate": 0.00045284689413823266,
"loss": 3.3106,
"step": 42150
},
{
"epoch": 12.299673507462687,
"grad_norm": 0.3567177951335907,
"learning_rate": 0.00045267191601049866,
"loss": 3.3276,
"step": 42200
},
{
"epoch": 12.314249067164178,
"grad_norm": 0.3402513861656189,
"learning_rate": 0.00045249693788276465,
"loss": 3.3162,
"step": 42250
},
{
"epoch": 12.328824626865671,
"grad_norm": 0.3365866243839264,
"learning_rate": 0.00045232195975503054,
"loss": 3.3296,
"step": 42300
},
{
"epoch": 12.343400186567164,
"grad_norm": 0.39586734771728516,
"learning_rate": 0.00045214698162729654,
"loss": 3.3355,
"step": 42350
},
{
"epoch": 12.357975746268657,
"grad_norm": 0.3556031882762909,
"learning_rate": 0.00045197200349956254,
"loss": 3.3272,
"step": 42400
},
{
"epoch": 12.372551305970148,
"grad_norm": 0.342041939496994,
"learning_rate": 0.00045179702537182854,
"loss": 3.3228,
"step": 42450
},
{
"epoch": 12.387126865671641,
"grad_norm": 0.35764795541763306,
"learning_rate": 0.0004516220472440944,
"loss": 3.3353,
"step": 42500
},
{
"epoch": 12.401702425373134,
"grad_norm": 0.37403562664985657,
"learning_rate": 0.0004514470691163604,
"loss": 3.3313,
"step": 42550
},
{
"epoch": 12.416277985074627,
"grad_norm": 0.33595719933509827,
"learning_rate": 0.0004512720909886264,
"loss": 3.3267,
"step": 42600
},
{
"epoch": 12.43085354477612,
"grad_norm": 0.3399653732776642,
"learning_rate": 0.0004510971128608923,
"loss": 3.3374,
"step": 42650
},
{
"epoch": 12.445429104477611,
"grad_norm": 0.37351250648498535,
"learning_rate": 0.0004509221347331583,
"loss": 3.3454,
"step": 42700
},
{
"epoch": 12.460004664179104,
"grad_norm": 0.37687429785728455,
"learning_rate": 0.0004507471566054243,
"loss": 3.3437,
"step": 42750
},
{
"epoch": 12.474580223880597,
"grad_norm": 0.3576935827732086,
"learning_rate": 0.00045057217847769025,
"loss": 3.3292,
"step": 42800
},
{
"epoch": 12.48915578358209,
"grad_norm": 0.3471689522266388,
"learning_rate": 0.0004503972003499562,
"loss": 3.345,
"step": 42850
},
{
"epoch": 12.503731343283581,
"grad_norm": 0.350300133228302,
"learning_rate": 0.0004502222222222222,
"loss": 3.3493,
"step": 42900
},
{
"epoch": 12.518306902985074,
"grad_norm": 0.3431214988231659,
"learning_rate": 0.00045004724409448813,
"loss": 3.3476,
"step": 42950
},
{
"epoch": 12.532882462686567,
"grad_norm": 0.32879090309143066,
"learning_rate": 0.00044987226596675413,
"loss": 3.3482,
"step": 43000
},
{
"epoch": 12.532882462686567,
"eval_accuracy": 0.36874771741480533,
"eval_loss": 3.5677340030670166,
"eval_runtime": 179.625,
"eval_samples_per_second": 92.565,
"eval_steps_per_second": 5.79,
"step": 43000
},
{
"epoch": 12.54745802238806,
"grad_norm": 0.3496828079223633,
"learning_rate": 0.0004496972878390201,
"loss": 3.3493,
"step": 43050
},
{
"epoch": 12.562033582089553,
"grad_norm": 0.33506613969802856,
"learning_rate": 0.000449522309711286,
"loss": 3.34,
"step": 43100
},
{
"epoch": 12.576609141791044,
"grad_norm": 0.3366851210594177,
"learning_rate": 0.000449347331583552,
"loss": 3.3462,
"step": 43150
},
{
"epoch": 12.591184701492537,
"grad_norm": 0.3410353660583496,
"learning_rate": 0.000449172353455818,
"loss": 3.3391,
"step": 43200
},
{
"epoch": 12.60576026119403,
"grad_norm": 0.3703427314758301,
"learning_rate": 0.0004489973753280839,
"loss": 3.3415,
"step": 43250
},
{
"epoch": 12.620335820895523,
"grad_norm": 0.3844320476055145,
"learning_rate": 0.0004488223972003499,
"loss": 3.345,
"step": 43300
},
{
"epoch": 12.634911380597014,
"grad_norm": 0.3453065752983093,
"learning_rate": 0.0004486474190726159,
"loss": 3.3518,
"step": 43350
},
{
"epoch": 12.649486940298507,
"grad_norm": 0.3349737524986267,
"learning_rate": 0.0004484724409448819,
"loss": 3.3642,
"step": 43400
},
{
"epoch": 12.6640625,
"grad_norm": 0.36272352933883667,
"learning_rate": 0.0004482974628171478,
"loss": 3.3583,
"step": 43450
},
{
"epoch": 12.678638059701493,
"grad_norm": 0.3413692116737366,
"learning_rate": 0.0004481224846894138,
"loss": 3.3481,
"step": 43500
},
{
"epoch": 12.693213619402986,
"grad_norm": 0.33000117540359497,
"learning_rate": 0.0004479475065616798,
"loss": 3.3618,
"step": 43550
},
{
"epoch": 12.707789179104477,
"grad_norm": 0.35906121134757996,
"learning_rate": 0.00044777252843394567,
"loss": 3.3522,
"step": 43600
},
{
"epoch": 12.72236473880597,
"grad_norm": 0.3547782003879547,
"learning_rate": 0.00044759755030621167,
"loss": 3.3482,
"step": 43650
},
{
"epoch": 12.736940298507463,
"grad_norm": 0.346333384513855,
"learning_rate": 0.00044742257217847767,
"loss": 3.3648,
"step": 43700
},
{
"epoch": 12.751515858208956,
"grad_norm": 0.313652366399765,
"learning_rate": 0.00044724759405074366,
"loss": 3.3496,
"step": 43750
},
{
"epoch": 12.766091417910447,
"grad_norm": 0.3224954605102539,
"learning_rate": 0.00044707261592300955,
"loss": 3.3401,
"step": 43800
},
{
"epoch": 12.78066697761194,
"grad_norm": 0.3389032781124115,
"learning_rate": 0.00044689763779527555,
"loss": 3.3629,
"step": 43850
},
{
"epoch": 12.795242537313433,
"grad_norm": 0.3401406705379486,
"learning_rate": 0.00044672265966754155,
"loss": 3.3652,
"step": 43900
},
{
"epoch": 12.809818097014926,
"grad_norm": 0.337515264749527,
"learning_rate": 0.0004465476815398075,
"loss": 3.3559,
"step": 43950
},
{
"epoch": 12.824393656716419,
"grad_norm": 0.368762344121933,
"learning_rate": 0.00044637270341207344,
"loss": 3.3548,
"step": 44000
},
{
"epoch": 12.824393656716419,
"eval_accuracy": 0.3694424825017004,
"eval_loss": 3.5560858249664307,
"eval_runtime": 179.4895,
"eval_samples_per_second": 92.635,
"eval_steps_per_second": 5.794,
"step": 44000
},
{
"epoch": 12.83896921641791,
"grad_norm": 0.3303767740726471,
"learning_rate": 0.00044619772528433943,
"loss": 3.3674,
"step": 44050
},
{
"epoch": 12.853544776119403,
"grad_norm": 0.3471069931983948,
"learning_rate": 0.0004460227471566054,
"loss": 3.3569,
"step": 44100
},
{
"epoch": 12.868120335820896,
"grad_norm": 0.3295342028141022,
"learning_rate": 0.0004458477690288714,
"loss": 3.3561,
"step": 44150
},
{
"epoch": 12.882695895522389,
"grad_norm": 0.352363646030426,
"learning_rate": 0.0004456727909011373,
"loss": 3.3687,
"step": 44200
},
{
"epoch": 12.89727145522388,
"grad_norm": 0.3398921489715576,
"learning_rate": 0.00044549781277340326,
"loss": 3.3718,
"step": 44250
},
{
"epoch": 12.911847014925373,
"grad_norm": 0.354624480009079,
"learning_rate": 0.00044532283464566926,
"loss": 3.3674,
"step": 44300
},
{
"epoch": 12.926422574626866,
"grad_norm": 0.3335941433906555,
"learning_rate": 0.0004451478565179352,
"loss": 3.3619,
"step": 44350
},
{
"epoch": 12.940998134328359,
"grad_norm": 0.3419336974620819,
"learning_rate": 0.00044497287839020115,
"loss": 3.3593,
"step": 44400
},
{
"epoch": 12.955573694029852,
"grad_norm": 0.3105829358100891,
"learning_rate": 0.00044479790026246714,
"loss": 3.3697,
"step": 44450
},
{
"epoch": 12.970149253731343,
"grad_norm": 0.32766225934028625,
"learning_rate": 0.00044462292213473314,
"loss": 3.3784,
"step": 44500
},
{
"epoch": 12.984724813432836,
"grad_norm": 0.3454512059688568,
"learning_rate": 0.00044444794400699903,
"loss": 3.3585,
"step": 44550
},
{
"epoch": 12.999300373134329,
"grad_norm": 0.3578594923019409,
"learning_rate": 0.00044427296587926503,
"loss": 3.3594,
"step": 44600
},
{
"epoch": 13.013701026119403,
"grad_norm": 0.34886470437049866,
"learning_rate": 0.000444097987751531,
"loss": 3.2495,
"step": 44650
},
{
"epoch": 13.028276585820896,
"grad_norm": 0.3374779522418976,
"learning_rate": 0.000443923009623797,
"loss": 3.2504,
"step": 44700
},
{
"epoch": 13.042852145522389,
"grad_norm": 0.34586066007614136,
"learning_rate": 0.0004437480314960629,
"loss": 3.2695,
"step": 44750
},
{
"epoch": 13.05742770522388,
"grad_norm": 0.3878658413887024,
"learning_rate": 0.0004435730533683289,
"loss": 3.2661,
"step": 44800
},
{
"epoch": 13.072003264925373,
"grad_norm": 0.3439328372478485,
"learning_rate": 0.0004433980752405949,
"loss": 3.2634,
"step": 44850
},
{
"epoch": 13.086578824626866,
"grad_norm": 0.3657079339027405,
"learning_rate": 0.0004432230971128609,
"loss": 3.2741,
"step": 44900
},
{
"epoch": 13.101154384328359,
"grad_norm": 0.36978641152381897,
"learning_rate": 0.0004430481189851268,
"loss": 3.29,
"step": 44950
},
{
"epoch": 13.115729944029852,
"grad_norm": 0.3556627333164215,
"learning_rate": 0.0004428731408573928,
"loss": 3.2723,
"step": 45000
},
{
"epoch": 13.115729944029852,
"eval_accuracy": 0.3688253974008041,
"eval_loss": 3.573399066925049,
"eval_runtime": 179.4112,
"eval_samples_per_second": 92.675,
"eval_steps_per_second": 5.797,
"step": 45000
},
{
"epoch": 13.130305503731343,
"grad_norm": 0.3487858474254608,
"learning_rate": 0.0004426981627296588,
"loss": 3.2886,
"step": 45050
},
{
"epoch": 13.144881063432836,
"grad_norm": 0.3508056104183197,
"learning_rate": 0.00044252318460192473,
"loss": 3.2857,
"step": 45100
},
{
"epoch": 13.159456623134329,
"grad_norm": 0.3573300242424011,
"learning_rate": 0.0004423482064741907,
"loss": 3.2804,
"step": 45150
},
{
"epoch": 13.174032182835822,
"grad_norm": 0.364335298538208,
"learning_rate": 0.0004421732283464567,
"loss": 3.3004,
"step": 45200
},
{
"epoch": 13.188607742537313,
"grad_norm": 0.35763829946517944,
"learning_rate": 0.0004419982502187226,
"loss": 3.2839,
"step": 45250
},
{
"epoch": 13.203183302238806,
"grad_norm": 0.39192068576812744,
"learning_rate": 0.00044182327209098856,
"loss": 3.2913,
"step": 45300
},
{
"epoch": 13.217758861940299,
"grad_norm": 0.3512227535247803,
"learning_rate": 0.00044164829396325456,
"loss": 3.3005,
"step": 45350
},
{
"epoch": 13.232334421641792,
"grad_norm": 0.36455291509628296,
"learning_rate": 0.0004414733158355205,
"loss": 3.3018,
"step": 45400
},
{
"epoch": 13.246909981343283,
"grad_norm": 0.3636734187602997,
"learning_rate": 0.0004412983377077865,
"loss": 3.3025,
"step": 45450
},
{
"epoch": 13.261485541044776,
"grad_norm": 0.36011579632759094,
"learning_rate": 0.00044112335958005244,
"loss": 3.3056,
"step": 45500
},
{
"epoch": 13.276061100746269,
"grad_norm": 0.36715275049209595,
"learning_rate": 0.0004409483814523184,
"loss": 3.3089,
"step": 45550
},
{
"epoch": 13.290636660447761,
"grad_norm": 0.3400384187698364,
"learning_rate": 0.0004407734033245844,
"loss": 3.2957,
"step": 45600
},
{
"epoch": 13.305212220149254,
"grad_norm": 0.3540400266647339,
"learning_rate": 0.0004405984251968504,
"loss": 3.3109,
"step": 45650
},
{
"epoch": 13.319787779850746,
"grad_norm": 0.37732866406440735,
"learning_rate": 0.0004404234470691163,
"loss": 3.3153,
"step": 45700
},
{
"epoch": 13.334363339552239,
"grad_norm": 0.37615445256233215,
"learning_rate": 0.00044024846894138227,
"loss": 3.3183,
"step": 45750
},
{
"epoch": 13.348938899253731,
"grad_norm": 0.33871808648109436,
"learning_rate": 0.00044007349081364827,
"loss": 3.3111,
"step": 45800
},
{
"epoch": 13.363514458955224,
"grad_norm": 0.3805575966835022,
"learning_rate": 0.00043989851268591427,
"loss": 3.3073,
"step": 45850
},
{
"epoch": 13.378090018656717,
"grad_norm": 0.3696669340133667,
"learning_rate": 0.00043972353455818016,
"loss": 3.3173,
"step": 45900
},
{
"epoch": 13.392665578358208,
"grad_norm": 0.36460769176483154,
"learning_rate": 0.00043954855643044615,
"loss": 3.318,
"step": 45950
},
{
"epoch": 13.407241138059701,
"grad_norm": 0.36172017455101013,
"learning_rate": 0.00043937357830271215,
"loss": 3.3128,
"step": 46000
},
{
"epoch": 13.407241138059701,
"eval_accuracy": 0.36958960368730415,
"eval_loss": 3.5665695667266846,
"eval_runtime": 179.2795,
"eval_samples_per_second": 92.743,
"eval_steps_per_second": 5.801,
"step": 46000
},
{
"epoch": 13.421816697761194,
"grad_norm": 0.339266836643219,
"learning_rate": 0.0004391986001749781,
"loss": 3.3222,
"step": 46050
},
{
"epoch": 13.436392257462687,
"grad_norm": 0.33610814809799194,
"learning_rate": 0.00043902362204724404,
"loss": 3.3171,
"step": 46100
},
{
"epoch": 13.450967817164178,
"grad_norm": 0.34440693259239197,
"learning_rate": 0.00043884864391951004,
"loss": 3.3146,
"step": 46150
},
{
"epoch": 13.465543376865671,
"grad_norm": 0.35377469658851624,
"learning_rate": 0.00043867366579177603,
"loss": 3.3186,
"step": 46200
},
{
"epoch": 13.480118936567164,
"grad_norm": 0.3478979766368866,
"learning_rate": 0.0004384986876640419,
"loss": 3.3134,
"step": 46250
},
{
"epoch": 13.494694496268657,
"grad_norm": 0.3551417887210846,
"learning_rate": 0.0004383237095363079,
"loss": 3.321,
"step": 46300
},
{
"epoch": 13.509270055970148,
"grad_norm": 0.3469100594520569,
"learning_rate": 0.0004381487314085739,
"loss": 3.3224,
"step": 46350
},
{
"epoch": 13.523845615671641,
"grad_norm": 0.3445916771888733,
"learning_rate": 0.00043797375328083986,
"loss": 3.3258,
"step": 46400
},
{
"epoch": 13.538421175373134,
"grad_norm": 0.33777162432670593,
"learning_rate": 0.0004377987751531058,
"loss": 3.3198,
"step": 46450
},
{
"epoch": 13.552996735074627,
"grad_norm": 0.34429651498794556,
"learning_rate": 0.0004376237970253718,
"loss": 3.3506,
"step": 46500
},
{
"epoch": 13.56757229477612,
"grad_norm": 0.35852015018463135,
"learning_rate": 0.00043744881889763775,
"loss": 3.3238,
"step": 46550
},
{
"epoch": 13.582147854477611,
"grad_norm": 0.34117794036865234,
"learning_rate": 0.00043727384076990374,
"loss": 3.3247,
"step": 46600
},
{
"epoch": 13.596723414179104,
"grad_norm": 0.39330777525901794,
"learning_rate": 0.0004370988626421697,
"loss": 3.3289,
"step": 46650
},
{
"epoch": 13.611298973880597,
"grad_norm": 0.34990638494491577,
"learning_rate": 0.00043692388451443563,
"loss": 3.341,
"step": 46700
},
{
"epoch": 13.62587453358209,
"grad_norm": 0.330219566822052,
"learning_rate": 0.00043674890638670163,
"loss": 3.3356,
"step": 46750
},
{
"epoch": 13.640450093283581,
"grad_norm": 0.38839754462242126,
"learning_rate": 0.0004365739282589676,
"loss": 3.3439,
"step": 46800
},
{
"epoch": 13.655025652985074,
"grad_norm": 0.33825257420539856,
"learning_rate": 0.0004363989501312335,
"loss": 3.3384,
"step": 46850
},
{
"epoch": 13.669601212686567,
"grad_norm": 0.3526383638381958,
"learning_rate": 0.0004362239720034995,
"loss": 3.3325,
"step": 46900
},
{
"epoch": 13.68417677238806,
"grad_norm": 0.38543063402175903,
"learning_rate": 0.0004360489938757655,
"loss": 3.3314,
"step": 46950
},
{
"epoch": 13.698752332089553,
"grad_norm": 0.3445545434951782,
"learning_rate": 0.0004358740157480315,
"loss": 3.3554,
"step": 47000
},
{
"epoch": 13.698752332089553,
"eval_accuracy": 0.37001095876287327,
"eval_loss": 3.55552339553833,
"eval_runtime": 179.3748,
"eval_samples_per_second": 92.694,
"eval_steps_per_second": 5.798,
"step": 47000
},
{
"epoch": 13.713327891791044,
"grad_norm": 0.34910300374031067,
"learning_rate": 0.0004356990376202974,
"loss": 3.3439,
"step": 47050
},
{
"epoch": 13.727903451492537,
"grad_norm": 0.3361404240131378,
"learning_rate": 0.0004355240594925634,
"loss": 3.3431,
"step": 47100
},
{
"epoch": 13.74247901119403,
"grad_norm": 0.37516549229621887,
"learning_rate": 0.0004353490813648294,
"loss": 3.3323,
"step": 47150
},
{
"epoch": 13.757054570895523,
"grad_norm": 0.3573398292064667,
"learning_rate": 0.0004351741032370953,
"loss": 3.3324,
"step": 47200
},
{
"epoch": 13.771630130597014,
"grad_norm": 0.3558635115623474,
"learning_rate": 0.0004349991251093613,
"loss": 3.3542,
"step": 47250
},
{
"epoch": 13.786205690298507,
"grad_norm": 0.36393576860427856,
"learning_rate": 0.0004348241469816273,
"loss": 3.3417,
"step": 47300
},
{
"epoch": 13.80078125,
"grad_norm": 0.34120824933052063,
"learning_rate": 0.0004346491688538932,
"loss": 3.3343,
"step": 47350
},
{
"epoch": 13.815356809701493,
"grad_norm": 0.36425623297691345,
"learning_rate": 0.00043447419072615916,
"loss": 3.3608,
"step": 47400
},
{
"epoch": 13.829932369402986,
"grad_norm": 0.34796497225761414,
"learning_rate": 0.00043429921259842516,
"loss": 3.3407,
"step": 47450
},
{
"epoch": 13.844507929104477,
"grad_norm": 0.33243417739868164,
"learning_rate": 0.00043412423447069116,
"loss": 3.3386,
"step": 47500
},
{
"epoch": 13.85908348880597,
"grad_norm": 0.3396977186203003,
"learning_rate": 0.0004339492563429571,
"loss": 3.3475,
"step": 47550
},
{
"epoch": 13.873659048507463,
"grad_norm": 0.33732032775878906,
"learning_rate": 0.00043377427821522305,
"loss": 3.34,
"step": 47600
},
{
"epoch": 13.888234608208956,
"grad_norm": 0.3958549499511719,
"learning_rate": 0.00043359930008748904,
"loss": 3.3435,
"step": 47650
},
{
"epoch": 13.902810167910447,
"grad_norm": 0.35972535610198975,
"learning_rate": 0.000433424321959755,
"loss": 3.3638,
"step": 47700
},
{
"epoch": 13.91738572761194,
"grad_norm": 0.37601491808891296,
"learning_rate": 0.000433249343832021,
"loss": 3.3502,
"step": 47750
},
{
"epoch": 13.931961287313433,
"grad_norm": 0.3627243936061859,
"learning_rate": 0.00043307436570428693,
"loss": 3.3341,
"step": 47800
},
{
"epoch": 13.946536847014926,
"grad_norm": 0.34104883670806885,
"learning_rate": 0.00043289938757655287,
"loss": 3.3529,
"step": 47850
},
{
"epoch": 13.961112406716419,
"grad_norm": 0.35523277521133423,
"learning_rate": 0.00043272440944881887,
"loss": 3.3577,
"step": 47900
},
{
"epoch": 13.97568796641791,
"grad_norm": 0.3545657694339752,
"learning_rate": 0.0004325494313210848,
"loss": 3.348,
"step": 47950
},
{
"epoch": 13.990263526119403,
"grad_norm": 0.3553575277328491,
"learning_rate": 0.00043237445319335076,
"loss": 3.3418,
"step": 48000
},
{
"epoch": 13.990263526119403,
"eval_accuracy": 0.3703100267089685,
"eval_loss": 3.550926923751831,
"eval_runtime": 179.4655,
"eval_samples_per_second": 92.647,
"eval_steps_per_second": 5.795,
"step": 48000
},
{
"epoch": 14.004664179104477,
"grad_norm": 0.3258715271949768,
"learning_rate": 0.00043219947506561676,
"loss": 3.3135,
"step": 48050
},
{
"epoch": 14.01923973880597,
"grad_norm": 0.36811158061027527,
"learning_rate": 0.00043202449693788275,
"loss": 3.2429,
"step": 48100
},
{
"epoch": 14.033815298507463,
"grad_norm": 0.3549216687679291,
"learning_rate": 0.00043184951881014864,
"loss": 3.2336,
"step": 48150
},
{
"epoch": 14.048390858208956,
"grad_norm": 0.37687474489212036,
"learning_rate": 0.00043167454068241464,
"loss": 3.2607,
"step": 48200
},
{
"epoch": 14.062966417910447,
"grad_norm": 0.37216871976852417,
"learning_rate": 0.00043149956255468064,
"loss": 3.2459,
"step": 48250
},
{
"epoch": 14.07754197761194,
"grad_norm": 0.36139625310897827,
"learning_rate": 0.00043132458442694664,
"loss": 3.2678,
"step": 48300
},
{
"epoch": 14.092117537313433,
"grad_norm": 0.33921825885772705,
"learning_rate": 0.0004311496062992125,
"loss": 3.2559,
"step": 48350
},
{
"epoch": 14.106693097014926,
"grad_norm": 0.3851219415664673,
"learning_rate": 0.0004309746281714785,
"loss": 3.271,
"step": 48400
},
{
"epoch": 14.121268656716419,
"grad_norm": 0.38296273350715637,
"learning_rate": 0.0004307996500437445,
"loss": 3.2576,
"step": 48450
},
{
"epoch": 14.13584421641791,
"grad_norm": 0.3686645030975342,
"learning_rate": 0.00043062467191601046,
"loss": 3.2749,
"step": 48500
},
{
"epoch": 14.150419776119403,
"grad_norm": 0.3671486973762512,
"learning_rate": 0.0004304496937882764,
"loss": 3.2747,
"step": 48550
},
{
"epoch": 14.164995335820896,
"grad_norm": 0.3503707945346832,
"learning_rate": 0.0004302747156605424,
"loss": 3.2658,
"step": 48600
},
{
"epoch": 14.179570895522389,
"grad_norm": 0.3670405447483063,
"learning_rate": 0.00043009973753280835,
"loss": 3.2812,
"step": 48650
},
{
"epoch": 14.19414645522388,
"grad_norm": 0.37047263979911804,
"learning_rate": 0.00042992475940507435,
"loss": 3.2761,
"step": 48700
},
{
"epoch": 14.208722014925373,
"grad_norm": 0.34551745653152466,
"learning_rate": 0.0004297497812773403,
"loss": 3.2868,
"step": 48750
},
{
"epoch": 14.223297574626866,
"grad_norm": 0.3535717725753784,
"learning_rate": 0.0004295748031496063,
"loss": 3.2819,
"step": 48800
},
{
"epoch": 14.237873134328359,
"grad_norm": 0.3632153868675232,
"learning_rate": 0.00042939982502187223,
"loss": 3.2709,
"step": 48850
},
{
"epoch": 14.252448694029852,
"grad_norm": 0.3754744231700897,
"learning_rate": 0.0004292248468941382,
"loss": 3.2988,
"step": 48900
},
{
"epoch": 14.267024253731343,
"grad_norm": 0.35462021827697754,
"learning_rate": 0.00042904986876640417,
"loss": 3.2895,
"step": 48950
},
{
"epoch": 14.281599813432836,
"grad_norm": 0.37853652238845825,
"learning_rate": 0.0004288748906386701,
"loss": 3.2873,
"step": 49000
},
{
"epoch": 14.281599813432836,
"eval_accuracy": 0.3699119756291991,
"eval_loss": 3.564150333404541,
"eval_runtime": 179.6002,
"eval_samples_per_second": 92.578,
"eval_steps_per_second": 5.791,
"step": 49000
},
{
"epoch": 14.296175373134329,
"grad_norm": 0.38061970472335815,
"learning_rate": 0.0004286999125109361,
"loss": 3.2949,
"step": 49050
},
{
"epoch": 14.310750932835822,
"grad_norm": 0.35941916704177856,
"learning_rate": 0.00042852493438320206,
"loss": 3.2906,
"step": 49100
},
{
"epoch": 14.325326492537313,
"grad_norm": 0.37919649481773376,
"learning_rate": 0.000428349956255468,
"loss": 3.2912,
"step": 49150
},
{
"epoch": 14.339902052238806,
"grad_norm": 0.3739522099494934,
"learning_rate": 0.000428174978127734,
"loss": 3.2909,
"step": 49200
},
{
"epoch": 14.354477611940299,
"grad_norm": 0.40394482016563416,
"learning_rate": 0.000428,
"loss": 3.299,
"step": 49250
},
{
"epoch": 14.369053171641792,
"grad_norm": 0.37621989846229553,
"learning_rate": 0.0004278250218722659,
"loss": 3.3031,
"step": 49300
},
{
"epoch": 14.383628731343283,
"grad_norm": 0.3909219205379486,
"learning_rate": 0.0004276500437445319,
"loss": 3.2998,
"step": 49350
},
{
"epoch": 14.398204291044776,
"grad_norm": 0.3651144802570343,
"learning_rate": 0.0004274750656167979,
"loss": 3.2946,
"step": 49400
},
{
"epoch": 14.412779850746269,
"grad_norm": 0.35360461473464966,
"learning_rate": 0.0004273000874890639,
"loss": 3.3135,
"step": 49450
},
{
"epoch": 14.427355410447761,
"grad_norm": 0.4259098172187805,
"learning_rate": 0.00042712510936132977,
"loss": 3.2984,
"step": 49500
},
{
"epoch": 14.441930970149254,
"grad_norm": 0.36055222153663635,
"learning_rate": 0.00042695013123359576,
"loss": 3.3084,
"step": 49550
},
{
"epoch": 14.456506529850746,
"grad_norm": 0.3428303897380829,
"learning_rate": 0.00042677515310586176,
"loss": 3.3099,
"step": 49600
},
{
"epoch": 14.471082089552239,
"grad_norm": 0.38759660720825195,
"learning_rate": 0.0004266001749781277,
"loss": 3.31,
"step": 49650
},
{
"epoch": 14.485657649253731,
"grad_norm": 0.36404454708099365,
"learning_rate": 0.00042642519685039365,
"loss": 3.3048,
"step": 49700
},
{
"epoch": 14.500233208955224,
"grad_norm": 0.37980198860168457,
"learning_rate": 0.00042625021872265965,
"loss": 3.2926,
"step": 49750
},
{
"epoch": 14.514808768656717,
"grad_norm": 0.34508880972862244,
"learning_rate": 0.0004260752405949256,
"loss": 3.301,
"step": 49800
},
{
"epoch": 14.529384328358208,
"grad_norm": 0.3469710946083069,
"learning_rate": 0.00042590026246719153,
"loss": 3.3027,
"step": 49850
},
{
"epoch": 14.543959888059701,
"grad_norm": 0.3309904634952545,
"learning_rate": 0.00042572528433945753,
"loss": 3.3137,
"step": 49900
},
{
"epoch": 14.558535447761194,
"grad_norm": 0.3597103953361511,
"learning_rate": 0.0004255503062117235,
"loss": 3.3229,
"step": 49950
},
{
"epoch": 14.573111007462687,
"grad_norm": 0.3572462499141693,
"learning_rate": 0.00042537532808398947,
"loss": 3.3214,
"step": 50000
},
{
"epoch": 14.573111007462687,
"eval_accuracy": 0.36993763356396836,
"eval_loss": 3.559805393218994,
"eval_runtime": 179.3822,
"eval_samples_per_second": 92.69,
"eval_steps_per_second": 5.798,
"step": 50000
},
{
"epoch": 14.587686567164178,
"grad_norm": 0.3618316054344177,
"learning_rate": 0.0004252003499562554,
"loss": 3.319,
"step": 50050
},
{
"epoch": 14.602262126865671,
"grad_norm": 0.36564192175865173,
"learning_rate": 0.0004250253718285214,
"loss": 3.315,
"step": 50100
},
{
"epoch": 14.616837686567164,
"grad_norm": 0.35237547755241394,
"learning_rate": 0.00042485039370078736,
"loss": 3.3143,
"step": 50150
},
{
"epoch": 14.631413246268657,
"grad_norm": 0.3858044445514679,
"learning_rate": 0.00042467541557305335,
"loss": 3.3314,
"step": 50200
},
{
"epoch": 14.645988805970148,
"grad_norm": 0.34664201736450195,
"learning_rate": 0.0004245004374453193,
"loss": 3.3147,
"step": 50250
},
{
"epoch": 14.660564365671641,
"grad_norm": 0.3514772653579712,
"learning_rate": 0.00042432545931758524,
"loss": 3.3161,
"step": 50300
},
{
"epoch": 14.675139925373134,
"grad_norm": 0.3883207142353058,
"learning_rate": 0.00042415048118985124,
"loss": 3.3136,
"step": 50350
},
{
"epoch": 14.689715485074627,
"grad_norm": 0.3385624885559082,
"learning_rate": 0.00042397550306211724,
"loss": 3.3091,
"step": 50400
},
{
"epoch": 14.70429104477612,
"grad_norm": 0.35351499915122986,
"learning_rate": 0.0004238005249343831,
"loss": 3.3274,
"step": 50450
},
{
"epoch": 14.718866604477611,
"grad_norm": 0.3634480834007263,
"learning_rate": 0.0004236255468066491,
"loss": 3.3179,
"step": 50500
},
{
"epoch": 14.733442164179104,
"grad_norm": 0.366725891828537,
"learning_rate": 0.0004234505686789151,
"loss": 3.3205,
"step": 50550
},
{
"epoch": 14.748017723880597,
"grad_norm": 0.34701037406921387,
"learning_rate": 0.0004232755905511811,
"loss": 3.33,
"step": 50600
},
{
"epoch": 14.76259328358209,
"grad_norm": 0.35697057843208313,
"learning_rate": 0.000423100612423447,
"loss": 3.3354,
"step": 50650
},
{
"epoch": 14.777168843283581,
"grad_norm": 0.3638821542263031,
"learning_rate": 0.000422925634295713,
"loss": 3.3334,
"step": 50700
},
{
"epoch": 14.791744402985074,
"grad_norm": 0.36784827709198,
"learning_rate": 0.000422750656167979,
"loss": 3.3183,
"step": 50750
},
{
"epoch": 14.806319962686567,
"grad_norm": 0.335401713848114,
"learning_rate": 0.0004225756780402449,
"loss": 3.3289,
"step": 50800
},
{
"epoch": 14.82089552238806,
"grad_norm": 0.3848132789134979,
"learning_rate": 0.0004224006999125109,
"loss": 3.3264,
"step": 50850
},
{
"epoch": 14.835471082089553,
"grad_norm": 0.36380621790885925,
"learning_rate": 0.0004222257217847769,
"loss": 3.3303,
"step": 50900
},
{
"epoch": 14.850046641791044,
"grad_norm": 0.3595868945121765,
"learning_rate": 0.00042205074365704283,
"loss": 3.3333,
"step": 50950
},
{
"epoch": 14.864622201492537,
"grad_norm": 0.3353583514690399,
"learning_rate": 0.0004218757655293088,
"loss": 3.3372,
"step": 51000
},
{
"epoch": 14.864622201492537,
"eval_accuracy": 0.3704376102011241,
"eval_loss": 3.549607753753662,
"eval_runtime": 179.4313,
"eval_samples_per_second": 92.665,
"eval_steps_per_second": 5.796,
"step": 51000
},
{
"epoch": 14.87919776119403,
"grad_norm": 0.3326234817504883,
"learning_rate": 0.0004217007874015748,
"loss": 3.3247,
"step": 51050
},
{
"epoch": 14.893773320895523,
"grad_norm": 0.35429471731185913,
"learning_rate": 0.0004215258092738407,
"loss": 3.3338,
"step": 51100
},
{
"epoch": 14.908348880597014,
"grad_norm": 0.348736971616745,
"learning_rate": 0.0004213508311461067,
"loss": 3.3281,
"step": 51150
},
{
"epoch": 14.922924440298507,
"grad_norm": 0.3448042571544647,
"learning_rate": 0.00042117585301837266,
"loss": 3.325,
"step": 51200
},
{
"epoch": 14.9375,
"grad_norm": 0.37077659368515015,
"learning_rate": 0.0004210008748906386,
"loss": 3.3462,
"step": 51250
},
{
"epoch": 14.952075559701493,
"grad_norm": 0.34058451652526855,
"learning_rate": 0.0004208258967629046,
"loss": 3.3353,
"step": 51300
},
{
"epoch": 14.966651119402986,
"grad_norm": 0.3427557647228241,
"learning_rate": 0.0004206509186351706,
"loss": 3.3382,
"step": 51350
},
{
"epoch": 14.981226679104477,
"grad_norm": 0.3424375653266907,
"learning_rate": 0.00042047594050743654,
"loss": 3.3204,
"step": 51400
},
{
"epoch": 14.99580223880597,
"grad_norm": 0.3815460503101349,
"learning_rate": 0.0004203009623797025,
"loss": 3.3426,
"step": 51450
},
{
"epoch": 15.010202891791044,
"grad_norm": 0.34294381737709045,
"learning_rate": 0.0004201259842519685,
"loss": 3.2526,
"step": 51500
},
{
"epoch": 15.024778451492537,
"grad_norm": 0.35977619886398315,
"learning_rate": 0.0004199510061242344,
"loss": 3.2222,
"step": 51550
},
{
"epoch": 15.03935401119403,
"grad_norm": 0.35569271445274353,
"learning_rate": 0.00041977602799650037,
"loss": 3.2289,
"step": 51600
},
{
"epoch": 15.053929570895523,
"grad_norm": 0.37896642088890076,
"learning_rate": 0.00041960104986876637,
"loss": 3.2352,
"step": 51650
},
{
"epoch": 15.068505130597014,
"grad_norm": 0.34995055198669434,
"learning_rate": 0.00041942607174103236,
"loss": 3.2267,
"step": 51700
},
{
"epoch": 15.083080690298507,
"grad_norm": 0.3554458022117615,
"learning_rate": 0.00041925109361329825,
"loss": 3.2503,
"step": 51750
},
{
"epoch": 15.09765625,
"grad_norm": 0.36070436239242554,
"learning_rate": 0.00041907611548556425,
"loss": 3.2567,
"step": 51800
},
{
"epoch": 15.112231809701493,
"grad_norm": 0.3662464916706085,
"learning_rate": 0.00041890113735783025,
"loss": 3.2466,
"step": 51850
},
{
"epoch": 15.126807369402986,
"grad_norm": 0.34961169958114624,
"learning_rate": 0.00041872615923009625,
"loss": 3.2488,
"step": 51900
},
{
"epoch": 15.141382929104477,
"grad_norm": 0.3582487106323242,
"learning_rate": 0.00041855118110236214,
"loss": 3.2564,
"step": 51950
},
{
"epoch": 15.15595848880597,
"grad_norm": 0.3753519058227539,
"learning_rate": 0.00041837620297462813,
"loss": 3.2634,
"step": 52000
},
{
"epoch": 15.15595848880597,
"eval_accuracy": 0.370012841914049,
"eval_loss": 3.565525531768799,
"eval_runtime": 179.4874,
"eval_samples_per_second": 92.636,
"eval_steps_per_second": 5.794,
"step": 52000
},
{
"epoch": 15.170534048507463,
"grad_norm": 0.3757287859916687,
"learning_rate": 0.00041820122484689413,
"loss": 3.274,
"step": 52050
},
{
"epoch": 15.185109608208956,
"grad_norm": 0.3474057912826538,
"learning_rate": 0.0004180262467191601,
"loss": 3.2527,
"step": 52100
},
{
"epoch": 15.199685167910447,
"grad_norm": 0.36459994316101074,
"learning_rate": 0.000417851268591426,
"loss": 3.2632,
"step": 52150
},
{
"epoch": 15.21426072761194,
"grad_norm": 0.3778489828109741,
"learning_rate": 0.000417676290463692,
"loss": 3.2657,
"step": 52200
},
{
"epoch": 15.228836287313433,
"grad_norm": 0.3785831332206726,
"learning_rate": 0.00041750131233595796,
"loss": 3.2661,
"step": 52250
},
{
"epoch": 15.243411847014926,
"grad_norm": 0.35805240273475647,
"learning_rate": 0.00041732633420822396,
"loss": 3.2598,
"step": 52300
},
{
"epoch": 15.257987406716419,
"grad_norm": 0.3630155026912689,
"learning_rate": 0.0004171513560804899,
"loss": 3.2674,
"step": 52350
},
{
"epoch": 15.27256296641791,
"grad_norm": 0.33566099405288696,
"learning_rate": 0.00041697637795275584,
"loss": 3.277,
"step": 52400
},
{
"epoch": 15.287138526119403,
"grad_norm": 0.3608446419239044,
"learning_rate": 0.00041680139982502184,
"loss": 3.2707,
"step": 52450
},
{
"epoch": 15.301714085820896,
"grad_norm": 0.34851962327957153,
"learning_rate": 0.0004166264216972878,
"loss": 3.2775,
"step": 52500
},
{
"epoch": 15.316289645522389,
"grad_norm": 0.3692467212677002,
"learning_rate": 0.00041645144356955373,
"loss": 3.2781,
"step": 52550
},
{
"epoch": 15.33086520522388,
"grad_norm": 0.38889679312705994,
"learning_rate": 0.0004162764654418197,
"loss": 3.2882,
"step": 52600
},
{
"epoch": 15.345440764925373,
"grad_norm": 0.35008713603019714,
"learning_rate": 0.0004161014873140857,
"loss": 3.2761,
"step": 52650
},
{
"epoch": 15.360016324626866,
"grad_norm": 0.3493787944316864,
"learning_rate": 0.00041592650918635167,
"loss": 3.2752,
"step": 52700
},
{
"epoch": 15.374591884328359,
"grad_norm": 0.37880319356918335,
"learning_rate": 0.0004157515310586176,
"loss": 3.2865,
"step": 52750
},
{
"epoch": 15.389167444029852,
"grad_norm": 0.3520691692829132,
"learning_rate": 0.0004155765529308836,
"loss": 3.2837,
"step": 52800
},
{
"epoch": 15.403743003731343,
"grad_norm": 0.38314563035964966,
"learning_rate": 0.0004154015748031496,
"loss": 3.2907,
"step": 52850
},
{
"epoch": 15.418318563432836,
"grad_norm": 0.3624465763568878,
"learning_rate": 0.0004152265966754155,
"loss": 3.3018,
"step": 52900
},
{
"epoch": 15.432894123134329,
"grad_norm": 0.35036274790763855,
"learning_rate": 0.0004150516185476815,
"loss": 3.2947,
"step": 52950
},
{
"epoch": 15.447469682835822,
"grad_norm": 0.35341790318489075,
"learning_rate": 0.0004148766404199475,
"loss": 3.3005,
"step": 53000
},
{
"epoch": 15.447469682835822,
"eval_accuracy": 0.3703226202824562,
"eval_loss": 3.559825897216797,
"eval_runtime": 179.392,
"eval_samples_per_second": 92.685,
"eval_steps_per_second": 5.797,
"step": 53000
},
{
"epoch": 15.462045242537313,
"grad_norm": 0.35088276863098145,
"learning_rate": 0.0004147016622922135,
"loss": 3.3018,
"step": 53050
},
{
"epoch": 15.476620802238806,
"grad_norm": 0.34915563464164734,
"learning_rate": 0.0004145266841644794,
"loss": 3.2953,
"step": 53100
},
{
"epoch": 15.491196361940299,
"grad_norm": 0.35356277227401733,
"learning_rate": 0.0004143517060367454,
"loss": 3.2968,
"step": 53150
},
{
"epoch": 15.505771921641792,
"grad_norm": 0.37322476506233215,
"learning_rate": 0.0004141767279090114,
"loss": 3.2879,
"step": 53200
},
{
"epoch": 15.520347481343283,
"grad_norm": 0.3452324867248535,
"learning_rate": 0.0004140017497812773,
"loss": 3.308,
"step": 53250
},
{
"epoch": 15.534923041044776,
"grad_norm": 0.3493434190750122,
"learning_rate": 0.00041382677165354326,
"loss": 3.3076,
"step": 53300
},
{
"epoch": 15.549498600746269,
"grad_norm": 0.34777626395225525,
"learning_rate": 0.00041365179352580926,
"loss": 3.3032,
"step": 53350
},
{
"epoch": 15.564074160447761,
"grad_norm": 0.3296518623828888,
"learning_rate": 0.0004134768153980752,
"loss": 3.3017,
"step": 53400
},
{
"epoch": 15.578649720149254,
"grad_norm": 0.3763023614883423,
"learning_rate": 0.00041330183727034114,
"loss": 3.2968,
"step": 53450
},
{
"epoch": 15.593225279850746,
"grad_norm": 0.36298853158950806,
"learning_rate": 0.00041312685914260714,
"loss": 3.3098,
"step": 53500
},
{
"epoch": 15.607800839552239,
"grad_norm": 0.3488393723964691,
"learning_rate": 0.0004129518810148731,
"loss": 3.3003,
"step": 53550
},
{
"epoch": 15.622376399253731,
"grad_norm": 0.3801731765270233,
"learning_rate": 0.0004127769028871391,
"loss": 3.3079,
"step": 53600
},
{
"epoch": 15.636951958955224,
"grad_norm": 0.36680352687835693,
"learning_rate": 0.00041260192475940503,
"loss": 3.3055,
"step": 53650
},
{
"epoch": 15.651527518656717,
"grad_norm": 0.3656584918498993,
"learning_rate": 0.00041242694663167097,
"loss": 3.3073,
"step": 53700
},
{
"epoch": 15.666103078358208,
"grad_norm": 0.34342220425605774,
"learning_rate": 0.00041225196850393697,
"loss": 3.3053,
"step": 53750
},
{
"epoch": 15.680678638059701,
"grad_norm": 0.352065771818161,
"learning_rate": 0.00041207699037620297,
"loss": 3.3134,
"step": 53800
},
{
"epoch": 15.695254197761194,
"grad_norm": 0.3709762692451477,
"learning_rate": 0.00041190201224846886,
"loss": 3.3165,
"step": 53850
},
{
"epoch": 15.709829757462687,
"grad_norm": 0.3582940995693207,
"learning_rate": 0.00041172703412073485,
"loss": 3.3011,
"step": 53900
},
{
"epoch": 15.724405317164178,
"grad_norm": 0.3479735255241394,
"learning_rate": 0.00041155205599300085,
"loss": 3.3088,
"step": 53950
},
{
"epoch": 15.738980876865671,
"grad_norm": 0.3477837145328522,
"learning_rate": 0.00041137707786526685,
"loss": 3.3022,
"step": 54000
},
{
"epoch": 15.738980876865671,
"eval_accuracy": 0.37073397111740425,
"eval_loss": 3.554640054702759,
"eval_runtime": 179.3272,
"eval_samples_per_second": 92.719,
"eval_steps_per_second": 5.799,
"step": 54000
},
{
"epoch": 15.753556436567164,
"grad_norm": 0.34925585985183716,
"learning_rate": 0.00041120209973753274,
"loss": 3.3033,
"step": 54050
},
{
"epoch": 15.768131996268657,
"grad_norm": 0.373879998922348,
"learning_rate": 0.00041102712160979874,
"loss": 3.3086,
"step": 54100
},
{
"epoch": 15.782707555970148,
"grad_norm": 0.405393123626709,
"learning_rate": 0.00041085214348206473,
"loss": 3.3134,
"step": 54150
},
{
"epoch": 15.797283115671641,
"grad_norm": 0.3593122661113739,
"learning_rate": 0.0004106771653543306,
"loss": 3.3102,
"step": 54200
},
{
"epoch": 15.811858675373134,
"grad_norm": 0.38658249378204346,
"learning_rate": 0.0004105021872265966,
"loss": 3.3161,
"step": 54250
},
{
"epoch": 15.826434235074627,
"grad_norm": 0.3754787743091583,
"learning_rate": 0.0004103272090988626,
"loss": 3.3118,
"step": 54300
},
{
"epoch": 15.84100979477612,
"grad_norm": 0.3744620382785797,
"learning_rate": 0.0004101522309711286,
"loss": 3.3125,
"step": 54350
},
{
"epoch": 15.855585354477611,
"grad_norm": 0.3404674530029297,
"learning_rate": 0.0004099772528433945,
"loss": 3.3051,
"step": 54400
},
{
"epoch": 15.870160914179104,
"grad_norm": 0.3725360631942749,
"learning_rate": 0.0004098022747156605,
"loss": 3.3122,
"step": 54450
},
{
"epoch": 15.884736473880597,
"grad_norm": 0.3606867790222168,
"learning_rate": 0.0004096272965879265,
"loss": 3.3167,
"step": 54500
},
{
"epoch": 15.89931203358209,
"grad_norm": 0.37651240825653076,
"learning_rate": 0.00040945231846019244,
"loss": 3.3221,
"step": 54550
},
{
"epoch": 15.913887593283581,
"grad_norm": 0.3563765585422516,
"learning_rate": 0.0004092773403324584,
"loss": 3.315,
"step": 54600
},
{
"epoch": 15.928463152985074,
"grad_norm": 0.3644365668296814,
"learning_rate": 0.0004091023622047244,
"loss": 3.3119,
"step": 54650
},
{
"epoch": 15.943038712686567,
"grad_norm": 0.3580692410469055,
"learning_rate": 0.00040892738407699033,
"loss": 3.3243,
"step": 54700
},
{
"epoch": 15.95761427238806,
"grad_norm": 0.3598160147666931,
"learning_rate": 0.0004087524059492563,
"loss": 3.3212,
"step": 54750
},
{
"epoch": 15.972189832089553,
"grad_norm": 0.35726791620254517,
"learning_rate": 0.00040857742782152227,
"loss": 3.3214,
"step": 54800
},
{
"epoch": 15.986765391791044,
"grad_norm": 0.367314875125885,
"learning_rate": 0.0004084024496937882,
"loss": 3.3345,
"step": 54850
},
{
"epoch": 16.00116604477612,
"grad_norm": 0.42804643511772156,
"learning_rate": 0.0004082274715660542,
"loss": 3.3104,
"step": 54900
},
{
"epoch": 16.01574160447761,
"grad_norm": 0.3623397648334503,
"learning_rate": 0.0004080524934383202,
"loss": 3.214,
"step": 54950
},
{
"epoch": 16.030317164179106,
"grad_norm": 0.37068092823028564,
"learning_rate": 0.0004078775153105861,
"loss": 3.2172,
"step": 55000
},
{
"epoch": 16.030317164179106,
"eval_accuracy": 0.37059956120223664,
"eval_loss": 3.559025526046753,
"eval_runtime": 179.3964,
"eval_samples_per_second": 92.683,
"eval_steps_per_second": 5.797,
"step": 55000
},
{
"epoch": 16.044892723880597,
"grad_norm": 0.35877496004104614,
"learning_rate": 0.0004077025371828521,
"loss": 3.2115,
"step": 55050
},
{
"epoch": 16.05946828358209,
"grad_norm": 0.38903719186782837,
"learning_rate": 0.0004075275590551181,
"loss": 3.2158,
"step": 55100
},
{
"epoch": 16.074043843283583,
"grad_norm": 0.3581124544143677,
"learning_rate": 0.000407352580927384,
"loss": 3.2269,
"step": 55150
},
{
"epoch": 16.088619402985074,
"grad_norm": 0.3499051630496979,
"learning_rate": 0.00040717760279965,
"loss": 3.225,
"step": 55200
},
{
"epoch": 16.10319496268657,
"grad_norm": 0.3736487030982971,
"learning_rate": 0.000407002624671916,
"loss": 3.2432,
"step": 55250
},
{
"epoch": 16.11777052238806,
"grad_norm": 0.38322770595550537,
"learning_rate": 0.000406827646544182,
"loss": 3.2391,
"step": 55300
},
{
"epoch": 16.13234608208955,
"grad_norm": 0.3570607304573059,
"learning_rate": 0.00040665266841644786,
"loss": 3.2254,
"step": 55350
},
{
"epoch": 16.146921641791046,
"grad_norm": 0.3845062553882599,
"learning_rate": 0.00040647769028871386,
"loss": 3.2414,
"step": 55400
},
{
"epoch": 16.161497201492537,
"grad_norm": 0.37037938833236694,
"learning_rate": 0.00040630271216097986,
"loss": 3.2536,
"step": 55450
},
{
"epoch": 16.17607276119403,
"grad_norm": 0.3952064514160156,
"learning_rate": 0.00040612773403324586,
"loss": 3.2415,
"step": 55500
},
{
"epoch": 16.190648320895523,
"grad_norm": 0.3778436779975891,
"learning_rate": 0.00040595275590551175,
"loss": 3.2437,
"step": 55550
},
{
"epoch": 16.205223880597014,
"grad_norm": 0.3779846131801605,
"learning_rate": 0.00040577777777777774,
"loss": 3.2486,
"step": 55600
},
{
"epoch": 16.21979944029851,
"grad_norm": 0.35539406538009644,
"learning_rate": 0.00040560279965004374,
"loss": 3.2471,
"step": 55650
},
{
"epoch": 16.234375,
"grad_norm": 0.3961225152015686,
"learning_rate": 0.0004054278215223097,
"loss": 3.2613,
"step": 55700
},
{
"epoch": 16.24895055970149,
"grad_norm": 0.3713584244251251,
"learning_rate": 0.00040525284339457563,
"loss": 3.264,
"step": 55750
},
{
"epoch": 16.263526119402986,
"grad_norm": 0.39257341623306274,
"learning_rate": 0.0004050778652668416,
"loss": 3.2574,
"step": 55800
},
{
"epoch": 16.278101679104477,
"grad_norm": 0.3771331310272217,
"learning_rate": 0.00040490288713910757,
"loss": 3.2638,
"step": 55850
},
{
"epoch": 16.29267723880597,
"grad_norm": 0.3533707559108734,
"learning_rate": 0.00040472790901137357,
"loss": 3.2598,
"step": 55900
},
{
"epoch": 16.307252798507463,
"grad_norm": 0.3563488721847534,
"learning_rate": 0.0004045529308836395,
"loss": 3.2619,
"step": 55950
},
{
"epoch": 16.321828358208954,
"grad_norm": 0.37279897928237915,
"learning_rate": 0.00040437795275590546,
"loss": 3.2679,
"step": 56000
},
{
"epoch": 16.321828358208954,
"eval_accuracy": 0.3706797128241536,
"eval_loss": 3.5606961250305176,
"eval_runtime": 179.3046,
"eval_samples_per_second": 92.73,
"eval_steps_per_second": 5.8,
"step": 56000
},
{
"epoch": 16.33640391791045,
"grad_norm": 0.35383620858192444,
"learning_rate": 0.00040420297462817145,
"loss": 3.2789,
"step": 56050
},
{
"epoch": 16.35097947761194,
"grad_norm": 0.36741262674331665,
"learning_rate": 0.0004040279965004374,
"loss": 3.2644,
"step": 56100
},
{
"epoch": 16.365555037313435,
"grad_norm": 0.34609347581863403,
"learning_rate": 0.00040385301837270334,
"loss": 3.2825,
"step": 56150
},
{
"epoch": 16.380130597014926,
"grad_norm": 0.3590814769268036,
"learning_rate": 0.00040367804024496934,
"loss": 3.257,
"step": 56200
},
{
"epoch": 16.394706156716417,
"grad_norm": 0.3929997682571411,
"learning_rate": 0.00040350306211723534,
"loss": 3.2757,
"step": 56250
},
{
"epoch": 16.40928171641791,
"grad_norm": 0.3860103487968445,
"learning_rate": 0.0004033280839895012,
"loss": 3.2667,
"step": 56300
},
{
"epoch": 16.423857276119403,
"grad_norm": 0.3654019832611084,
"learning_rate": 0.0004031531058617672,
"loss": 3.2709,
"step": 56350
},
{
"epoch": 16.438432835820894,
"grad_norm": 0.3494860529899597,
"learning_rate": 0.0004029781277340332,
"loss": 3.2721,
"step": 56400
},
{
"epoch": 16.45300839552239,
"grad_norm": 0.38559070229530334,
"learning_rate": 0.0004028031496062992,
"loss": 3.2807,
"step": 56450
},
{
"epoch": 16.46758395522388,
"grad_norm": 0.37908729910850525,
"learning_rate": 0.0004026281714785651,
"loss": 3.2763,
"step": 56500
},
{
"epoch": 16.482159514925375,
"grad_norm": 0.38921189308166504,
"learning_rate": 0.0004024531933508311,
"loss": 3.2863,
"step": 56550
},
{
"epoch": 16.496735074626866,
"grad_norm": 0.3636137843132019,
"learning_rate": 0.0004022782152230971,
"loss": 3.2822,
"step": 56600
},
{
"epoch": 16.511310634328357,
"grad_norm": 0.3770190179347992,
"learning_rate": 0.0004021032370953631,
"loss": 3.2813,
"step": 56650
},
{
"epoch": 16.52588619402985,
"grad_norm": 0.3771943151950836,
"learning_rate": 0.000401928258967629,
"loss": 3.2899,
"step": 56700
},
{
"epoch": 16.540461753731343,
"grad_norm": 0.359430193901062,
"learning_rate": 0.000401753280839895,
"loss": 3.2897,
"step": 56750
},
{
"epoch": 16.555037313432837,
"grad_norm": 0.3965089023113251,
"learning_rate": 0.000401578302712161,
"loss": 3.2841,
"step": 56800
},
{
"epoch": 16.56961287313433,
"grad_norm": 0.35867807269096375,
"learning_rate": 0.00040140332458442693,
"loss": 3.2832,
"step": 56850
},
{
"epoch": 16.58418843283582,
"grad_norm": 0.3748142719268799,
"learning_rate": 0.00040122834645669287,
"loss": 3.2781,
"step": 56900
},
{
"epoch": 16.598763992537314,
"grad_norm": 0.35872289538383484,
"learning_rate": 0.00040105336832895887,
"loss": 3.2906,
"step": 56950
},
{
"epoch": 16.613339552238806,
"grad_norm": 0.36718112230300903,
"learning_rate": 0.0004008783902012248,
"loss": 3.2927,
"step": 57000
},
{
"epoch": 16.613339552238806,
"eval_accuracy": 0.37125619247782327,
"eval_loss": 3.5517942905426025,
"eval_runtime": 179.3724,
"eval_samples_per_second": 92.695,
"eval_steps_per_second": 5.798,
"step": 57000
},
{
"epoch": 16.627915111940297,
"grad_norm": 0.36552923917770386,
"learning_rate": 0.00040070341207349076,
"loss": 3.2982,
"step": 57050
},
{
"epoch": 16.64249067164179,
"grad_norm": 0.39370307326316833,
"learning_rate": 0.00040052843394575675,
"loss": 3.2764,
"step": 57100
},
{
"epoch": 16.657066231343283,
"grad_norm": 0.3543391227722168,
"learning_rate": 0.0004003534558180227,
"loss": 3.287,
"step": 57150
},
{
"epoch": 16.671641791044777,
"grad_norm": 0.35714074969291687,
"learning_rate": 0.0004001784776902887,
"loss": 3.2802,
"step": 57200
},
{
"epoch": 16.68621735074627,
"grad_norm": 0.3787291646003723,
"learning_rate": 0.00040000349956255464,
"loss": 3.2908,
"step": 57250
},
{
"epoch": 16.70079291044776,
"grad_norm": 0.36080485582351685,
"learning_rate": 0.0003998285214348206,
"loss": 3.304,
"step": 57300
},
{
"epoch": 16.715368470149254,
"grad_norm": 0.35946592688560486,
"learning_rate": 0.0003996535433070866,
"loss": 3.3054,
"step": 57350
},
{
"epoch": 16.729944029850746,
"grad_norm": 0.41039395332336426,
"learning_rate": 0.0003994785651793526,
"loss": 3.3145,
"step": 57400
},
{
"epoch": 16.74451958955224,
"grad_norm": 0.36457979679107666,
"learning_rate": 0.00039930358705161847,
"loss": 3.3108,
"step": 57450
},
{
"epoch": 16.75909514925373,
"grad_norm": 0.35925135016441345,
"learning_rate": 0.00039912860892388446,
"loss": 3.3023,
"step": 57500
},
{
"epoch": 16.773670708955223,
"grad_norm": 0.3933243453502655,
"learning_rate": 0.00039895363079615046,
"loss": 3.305,
"step": 57550
},
{
"epoch": 16.788246268656717,
"grad_norm": 0.3717454969882965,
"learning_rate": 0.00039877865266841646,
"loss": 3.3088,
"step": 57600
},
{
"epoch": 16.80282182835821,
"grad_norm": 0.3560434579849243,
"learning_rate": 0.00039860367454068235,
"loss": 3.3019,
"step": 57650
},
{
"epoch": 16.817397388059703,
"grad_norm": 0.3487250804901123,
"learning_rate": 0.00039842869641294835,
"loss": 3.2972,
"step": 57700
},
{
"epoch": 16.831972947761194,
"grad_norm": 0.366042822599411,
"learning_rate": 0.00039825371828521434,
"loss": 3.3067,
"step": 57750
},
{
"epoch": 16.846548507462686,
"grad_norm": 0.3558788597583771,
"learning_rate": 0.00039807874015748023,
"loss": 3.3003,
"step": 57800
},
{
"epoch": 16.86112406716418,
"grad_norm": 0.35664811730384827,
"learning_rate": 0.00039790376202974623,
"loss": 3.3094,
"step": 57850
},
{
"epoch": 16.87569962686567,
"grad_norm": 0.3727872967720032,
"learning_rate": 0.00039772878390201223,
"loss": 3.3029,
"step": 57900
},
{
"epoch": 16.890275186567163,
"grad_norm": 0.3959354758262634,
"learning_rate": 0.0003975538057742782,
"loss": 3.2972,
"step": 57950
},
{
"epoch": 16.904850746268657,
"grad_norm": 0.3455987572669983,
"learning_rate": 0.0003973788276465441,
"loss": 3.3154,
"step": 58000
},
{
"epoch": 16.904850746268657,
"eval_accuracy": 0.371655538224026,
"eval_loss": 3.5446152687072754,
"eval_runtime": 179.4456,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.796,
"step": 58000
},
{
"epoch": 16.91942630597015,
"grad_norm": 0.36653903126716614,
"learning_rate": 0.0003972038495188101,
"loss": 3.312,
"step": 58050
},
{
"epoch": 16.934001865671643,
"grad_norm": 0.3434462249279022,
"learning_rate": 0.0003970288713910761,
"loss": 3.3182,
"step": 58100
},
{
"epoch": 16.948577425373134,
"grad_norm": 0.3869427740573883,
"learning_rate": 0.00039685389326334205,
"loss": 3.3116,
"step": 58150
},
{
"epoch": 16.963152985074625,
"grad_norm": 0.3616868853569031,
"learning_rate": 0.000396678915135608,
"loss": 3.3154,
"step": 58200
},
{
"epoch": 16.97772854477612,
"grad_norm": 0.36797189712524414,
"learning_rate": 0.000396503937007874,
"loss": 3.2952,
"step": 58250
},
{
"epoch": 16.99230410447761,
"grad_norm": 0.3591371774673462,
"learning_rate": 0.00039632895888013994,
"loss": 3.3028,
"step": 58300
},
{
"epoch": 17.006704757462686,
"grad_norm": 0.38673022389411926,
"learning_rate": 0.00039615398075240594,
"loss": 3.2497,
"step": 58350
},
{
"epoch": 17.02128031716418,
"grad_norm": 0.3777545392513275,
"learning_rate": 0.0003959790026246719,
"loss": 3.2017,
"step": 58400
},
{
"epoch": 17.03585587686567,
"grad_norm": 0.41391587257385254,
"learning_rate": 0.0003958040244969378,
"loss": 3.2017,
"step": 58450
},
{
"epoch": 17.050431436567163,
"grad_norm": 0.38846704363822937,
"learning_rate": 0.0003956290463692038,
"loss": 3.211,
"step": 58500
},
{
"epoch": 17.065006996268657,
"grad_norm": 0.3742265999317169,
"learning_rate": 0.0003954540682414698,
"loss": 3.2093,
"step": 58550
},
{
"epoch": 17.07958255597015,
"grad_norm": 0.3785288333892822,
"learning_rate": 0.0003952790901137357,
"loss": 3.2154,
"step": 58600
},
{
"epoch": 17.094158115671643,
"grad_norm": 0.38964998722076416,
"learning_rate": 0.0003951041119860017,
"loss": 3.2223,
"step": 58650
},
{
"epoch": 17.108733675373134,
"grad_norm": 0.370022714138031,
"learning_rate": 0.0003949291338582677,
"loss": 3.2128,
"step": 58700
},
{
"epoch": 17.123309235074625,
"grad_norm": 0.3854983448982239,
"learning_rate": 0.0003947541557305336,
"loss": 3.2228,
"step": 58750
},
{
"epoch": 17.13788479477612,
"grad_norm": 0.3856055438518524,
"learning_rate": 0.0003945791776027996,
"loss": 3.2329,
"step": 58800
},
{
"epoch": 17.15246035447761,
"grad_norm": 0.38632187247276306,
"learning_rate": 0.0003944041994750656,
"loss": 3.2304,
"step": 58850
},
{
"epoch": 17.167035914179106,
"grad_norm": 0.3841564655303955,
"learning_rate": 0.0003942292213473316,
"loss": 3.2327,
"step": 58900
},
{
"epoch": 17.181611473880597,
"grad_norm": 0.3514139950275421,
"learning_rate": 0.0003940542432195975,
"loss": 3.221,
"step": 58950
},
{
"epoch": 17.19618703358209,
"grad_norm": 0.3656717538833618,
"learning_rate": 0.0003938792650918635,
"loss": 3.2442,
"step": 59000
},
{
"epoch": 17.19618703358209,
"eval_accuracy": 0.37105704924099003,
"eval_loss": 3.560926914215088,
"eval_runtime": 179.492,
"eval_samples_per_second": 92.634,
"eval_steps_per_second": 5.794,
"step": 59000
},
{
"epoch": 17.210762593283583,
"grad_norm": 0.3702145516872406,
"learning_rate": 0.00039370428696412947,
"loss": 3.2338,
"step": 59050
},
{
"epoch": 17.225338152985074,
"grad_norm": 0.38071298599243164,
"learning_rate": 0.0003935293088363954,
"loss": 3.2492,
"step": 59100
},
{
"epoch": 17.23991371268657,
"grad_norm": 0.3649649918079376,
"learning_rate": 0.00039335433070866136,
"loss": 3.2408,
"step": 59150
},
{
"epoch": 17.25448927238806,
"grad_norm": 0.3958336412906647,
"learning_rate": 0.00039317935258092736,
"loss": 3.2552,
"step": 59200
},
{
"epoch": 17.26906483208955,
"grad_norm": 0.3653128147125244,
"learning_rate": 0.00039300437445319335,
"loss": 3.2593,
"step": 59250
},
{
"epoch": 17.283640391791046,
"grad_norm": 0.3596879243850708,
"learning_rate": 0.0003928293963254593,
"loss": 3.2644,
"step": 59300
},
{
"epoch": 17.298215951492537,
"grad_norm": 0.35950595140457153,
"learning_rate": 0.00039265441819772524,
"loss": 3.2618,
"step": 59350
},
{
"epoch": 17.31279151119403,
"grad_norm": 0.3889482319355011,
"learning_rate": 0.00039247944006999124,
"loss": 3.2517,
"step": 59400
},
{
"epoch": 17.327367070895523,
"grad_norm": 0.35707592964172363,
"learning_rate": 0.0003923044619422572,
"loss": 3.2599,
"step": 59450
},
{
"epoch": 17.341942630597014,
"grad_norm": 0.35822805762290955,
"learning_rate": 0.0003921294838145232,
"loss": 3.2736,
"step": 59500
},
{
"epoch": 17.35651819029851,
"grad_norm": 0.355954110622406,
"learning_rate": 0.0003919545056867891,
"loss": 3.2614,
"step": 59550
},
{
"epoch": 17.37109375,
"grad_norm": 0.38183078169822693,
"learning_rate": 0.00039177952755905507,
"loss": 3.256,
"step": 59600
},
{
"epoch": 17.38566930970149,
"grad_norm": 0.35589709877967834,
"learning_rate": 0.00039160454943132106,
"loss": 3.26,
"step": 59650
},
{
"epoch": 17.400244869402986,
"grad_norm": 0.38540026545524597,
"learning_rate": 0.000391429571303587,
"loss": 3.2505,
"step": 59700
},
{
"epoch": 17.414820429104477,
"grad_norm": 0.3964499533176422,
"learning_rate": 0.00039125459317585295,
"loss": 3.2584,
"step": 59750
},
{
"epoch": 17.42939598880597,
"grad_norm": 0.3757520616054535,
"learning_rate": 0.00039107961504811895,
"loss": 3.2646,
"step": 59800
},
{
"epoch": 17.443971548507463,
"grad_norm": 0.3785048723220825,
"learning_rate": 0.00039090463692038495,
"loss": 3.2672,
"step": 59850
},
{
"epoch": 17.458547108208954,
"grad_norm": 0.3456498980522156,
"learning_rate": 0.00039072965879265084,
"loss": 3.2603,
"step": 59900
},
{
"epoch": 17.47312266791045,
"grad_norm": 0.38184770941734314,
"learning_rate": 0.00039055468066491683,
"loss": 3.2643,
"step": 59950
},
{
"epoch": 17.48769822761194,
"grad_norm": 0.3528321385383606,
"learning_rate": 0.00039037970253718283,
"loss": 3.2761,
"step": 60000
},
{
"epoch": 17.48769822761194,
"eval_accuracy": 0.37141108166202685,
"eval_loss": 3.552334785461426,
"eval_runtime": 179.4922,
"eval_samples_per_second": 92.634,
"eval_steps_per_second": 5.794,
"step": 60000
},
{
"epoch": 17.502273787313435,
"grad_norm": 0.3572315573692322,
"learning_rate": 0.00039020472440944883,
"loss": 3.2563,
"step": 60050
},
{
"epoch": 17.516849347014926,
"grad_norm": 0.3947022557258606,
"learning_rate": 0.0003900297462817147,
"loss": 3.2788,
"step": 60100
},
{
"epoch": 17.531424906716417,
"grad_norm": 0.3718763291835785,
"learning_rate": 0.0003898547681539807,
"loss": 3.2717,
"step": 60150
},
{
"epoch": 17.54600046641791,
"grad_norm": 0.33914080262184143,
"learning_rate": 0.0003896797900262467,
"loss": 3.2747,
"step": 60200
},
{
"epoch": 17.560576026119403,
"grad_norm": 0.38070231676101685,
"learning_rate": 0.00038950481189851266,
"loss": 3.2827,
"step": 60250
},
{
"epoch": 17.575151585820894,
"grad_norm": 0.3630342483520508,
"learning_rate": 0.0003893298337707786,
"loss": 3.2707,
"step": 60300
},
{
"epoch": 17.58972714552239,
"grad_norm": 0.38142305612564087,
"learning_rate": 0.0003891548556430446,
"loss": 3.2791,
"step": 60350
},
{
"epoch": 17.60430270522388,
"grad_norm": 0.36522653698921204,
"learning_rate": 0.00038897987751531054,
"loss": 3.2805,
"step": 60400
},
{
"epoch": 17.618878264925375,
"grad_norm": 0.38595816493034363,
"learning_rate": 0.00038880489938757654,
"loss": 3.2822,
"step": 60450
},
{
"epoch": 17.633453824626866,
"grad_norm": 0.3888775706291199,
"learning_rate": 0.0003886299212598425,
"loss": 3.284,
"step": 60500
},
{
"epoch": 17.648029384328357,
"grad_norm": 0.38401997089385986,
"learning_rate": 0.0003884549431321085,
"loss": 3.274,
"step": 60550
},
{
"epoch": 17.66260494402985,
"grad_norm": 0.36174026131629944,
"learning_rate": 0.0003882799650043744,
"loss": 3.2895,
"step": 60600
},
{
"epoch": 17.677180503731343,
"grad_norm": 0.36874690651893616,
"learning_rate": 0.00038810498687664037,
"loss": 3.2842,
"step": 60650
},
{
"epoch": 17.691756063432837,
"grad_norm": 0.38380053639411926,
"learning_rate": 0.00038793000874890637,
"loss": 3.2882,
"step": 60700
},
{
"epoch": 17.70633162313433,
"grad_norm": 0.36726170778274536,
"learning_rate": 0.0003877550306211723,
"loss": 3.274,
"step": 60750
},
{
"epoch": 17.72090718283582,
"grad_norm": 0.35661062598228455,
"learning_rate": 0.0003875800524934383,
"loss": 3.2791,
"step": 60800
},
{
"epoch": 17.735482742537314,
"grad_norm": 0.3487260639667511,
"learning_rate": 0.00038740507436570425,
"loss": 3.2784,
"step": 60850
},
{
"epoch": 17.750058302238806,
"grad_norm": 0.3786361813545227,
"learning_rate": 0.0003872300962379702,
"loss": 3.2846,
"step": 60900
},
{
"epoch": 17.764633861940297,
"grad_norm": 0.37210702896118164,
"learning_rate": 0.0003870551181102362,
"loss": 3.291,
"step": 60950
},
{
"epoch": 17.77920942164179,
"grad_norm": 0.3575860559940338,
"learning_rate": 0.0003868801399825022,
"loss": 3.291,
"step": 61000
},
{
"epoch": 17.77920942164179,
"eval_accuracy": 0.37191305914730677,
"eval_loss": 3.5460619926452637,
"eval_runtime": 179.5326,
"eval_samples_per_second": 92.613,
"eval_steps_per_second": 5.793,
"step": 61000
},
{
"epoch": 17.793784981343283,
"grad_norm": 0.3741329610347748,
"learning_rate": 0.0003867051618547681,
"loss": 3.2749,
"step": 61050
},
{
"epoch": 17.808360541044777,
"grad_norm": 0.3822123408317566,
"learning_rate": 0.0003865301837270341,
"loss": 3.2878,
"step": 61100
},
{
"epoch": 17.82293610074627,
"grad_norm": 0.3695724904537201,
"learning_rate": 0.0003863552055993001,
"loss": 3.2917,
"step": 61150
},
{
"epoch": 17.83751166044776,
"grad_norm": 0.37312400341033936,
"learning_rate": 0.00038618022747156607,
"loss": 3.289,
"step": 61200
},
{
"epoch": 17.852087220149254,
"grad_norm": 0.38065680861473083,
"learning_rate": 0.00038600524934383196,
"loss": 3.2949,
"step": 61250
},
{
"epoch": 17.866662779850746,
"grad_norm": 0.3558042347431183,
"learning_rate": 0.00038583027121609796,
"loss": 3.2938,
"step": 61300
},
{
"epoch": 17.88123833955224,
"grad_norm": 0.40853044390678406,
"learning_rate": 0.00038565529308836396,
"loss": 3.2928,
"step": 61350
},
{
"epoch": 17.89581389925373,
"grad_norm": 0.36038708686828613,
"learning_rate": 0.00038548031496062984,
"loss": 3.2871,
"step": 61400
},
{
"epoch": 17.910389458955223,
"grad_norm": 0.3773646056652069,
"learning_rate": 0.00038530533683289584,
"loss": 3.2862,
"step": 61450
},
{
"epoch": 17.924965018656717,
"grad_norm": 0.3764312267303467,
"learning_rate": 0.00038513035870516184,
"loss": 3.2862,
"step": 61500
},
{
"epoch": 17.93954057835821,
"grad_norm": 0.37428849935531616,
"learning_rate": 0.0003849553805774278,
"loss": 3.2922,
"step": 61550
},
{
"epoch": 17.954116138059703,
"grad_norm": 0.36383602023124695,
"learning_rate": 0.00038478040244969373,
"loss": 3.2902,
"step": 61600
},
{
"epoch": 17.968691697761194,
"grad_norm": 0.3712141215801239,
"learning_rate": 0.0003846054243219597,
"loss": 3.2926,
"step": 61650
},
{
"epoch": 17.983267257462686,
"grad_norm": 0.37419530749320984,
"learning_rate": 0.00038443044619422567,
"loss": 3.2895,
"step": 61700
},
{
"epoch": 17.99784281716418,
"grad_norm": 0.36220309138298035,
"learning_rate": 0.00038425546806649167,
"loss": 3.309,
"step": 61750
},
{
"epoch": 18.012243470149254,
"grad_norm": 0.38369911909103394,
"learning_rate": 0.0003840804899387576,
"loss": 3.2132,
"step": 61800
},
{
"epoch": 18.026819029850746,
"grad_norm": 0.38627973198890686,
"learning_rate": 0.0003839055118110236,
"loss": 3.1929,
"step": 61850
},
{
"epoch": 18.04139458955224,
"grad_norm": 0.38564252853393555,
"learning_rate": 0.00038373053368328955,
"loss": 3.1907,
"step": 61900
},
{
"epoch": 18.05597014925373,
"grad_norm": 0.37583598494529724,
"learning_rate": 0.00038355555555555555,
"loss": 3.2055,
"step": 61950
},
{
"epoch": 18.070545708955223,
"grad_norm": 0.38370102643966675,
"learning_rate": 0.0003833805774278215,
"loss": 3.1997,
"step": 62000
},
{
"epoch": 18.070545708955223,
"eval_accuracy": 0.3712240612108874,
"eval_loss": 3.563174247741699,
"eval_runtime": 179.5785,
"eval_samples_per_second": 92.589,
"eval_steps_per_second": 5.791,
"step": 62000
},
{
"epoch": 18.085121268656717,
"grad_norm": 0.3597595691680908,
"learning_rate": 0.00038320559930008744,
"loss": 3.2005,
"step": 62050
},
{
"epoch": 18.09969682835821,
"grad_norm": 0.40490153431892395,
"learning_rate": 0.00038303062117235343,
"loss": 3.2116,
"step": 62100
},
{
"epoch": 18.114272388059703,
"grad_norm": 0.3927886188030243,
"learning_rate": 0.00038285564304461943,
"loss": 3.2149,
"step": 62150
},
{
"epoch": 18.128847947761194,
"grad_norm": 0.40592485666275024,
"learning_rate": 0.0003826806649168853,
"loss": 3.224,
"step": 62200
},
{
"epoch": 18.143423507462686,
"grad_norm": 0.41048359870910645,
"learning_rate": 0.0003825056867891513,
"loss": 3.2146,
"step": 62250
},
{
"epoch": 18.15799906716418,
"grad_norm": 0.3761942386627197,
"learning_rate": 0.0003823307086614173,
"loss": 3.2287,
"step": 62300
},
{
"epoch": 18.17257462686567,
"grad_norm": 0.38858354091644287,
"learning_rate": 0.0003821557305336832,
"loss": 3.2138,
"step": 62350
},
{
"epoch": 18.187150186567163,
"grad_norm": 0.3984103202819824,
"learning_rate": 0.0003819807524059492,
"loss": 3.2247,
"step": 62400
},
{
"epoch": 18.201725746268657,
"grad_norm": 0.378798246383667,
"learning_rate": 0.0003818057742782152,
"loss": 3.2323,
"step": 62450
},
{
"epoch": 18.21630130597015,
"grad_norm": 0.4035341143608093,
"learning_rate": 0.0003816307961504812,
"loss": 3.2271,
"step": 62500
},
{
"epoch": 18.230876865671643,
"grad_norm": 0.38941821455955505,
"learning_rate": 0.0003814558180227471,
"loss": 3.2255,
"step": 62550
},
{
"epoch": 18.245452425373134,
"grad_norm": 0.40656763315200806,
"learning_rate": 0.0003812808398950131,
"loss": 3.2411,
"step": 62600
},
{
"epoch": 18.260027985074625,
"grad_norm": 0.3786965310573578,
"learning_rate": 0.0003811058617672791,
"loss": 3.2374,
"step": 62650
},
{
"epoch": 18.27460354477612,
"grad_norm": 0.3779425323009491,
"learning_rate": 0.000380930883639545,
"loss": 3.232,
"step": 62700
},
{
"epoch": 18.28917910447761,
"grad_norm": 0.3786343038082123,
"learning_rate": 0.00038075590551181097,
"loss": 3.2342,
"step": 62750
},
{
"epoch": 18.303754664179106,
"grad_norm": 0.3741774260997772,
"learning_rate": 0.00038058092738407697,
"loss": 3.2366,
"step": 62800
},
{
"epoch": 18.318330223880597,
"grad_norm": 0.4231685400009155,
"learning_rate": 0.0003804059492563429,
"loss": 3.2425,
"step": 62850
},
{
"epoch": 18.33290578358209,
"grad_norm": 0.3765699863433838,
"learning_rate": 0.0003802309711286089,
"loss": 3.2423,
"step": 62900
},
{
"epoch": 18.347481343283583,
"grad_norm": 0.37712177634239197,
"learning_rate": 0.00038005599300087485,
"loss": 3.2477,
"step": 62950
},
{
"epoch": 18.362056902985074,
"grad_norm": 0.37962380051612854,
"learning_rate": 0.0003798810148731408,
"loss": 3.2501,
"step": 63000
},
{
"epoch": 18.362056902985074,
"eval_accuracy": 0.3715983375070633,
"eval_loss": 3.5540771484375,
"eval_runtime": 179.3741,
"eval_samples_per_second": 92.695,
"eval_steps_per_second": 5.798,
"step": 63000
},
{
"epoch": 18.376632462686565,
"grad_norm": 0.3669949173927307,
"learning_rate": 0.0003797060367454068,
"loss": 3.2477,
"step": 63050
},
{
"epoch": 18.39120802238806,
"grad_norm": 0.39625945687294006,
"learning_rate": 0.0003795310586176728,
"loss": 3.2461,
"step": 63100
},
{
"epoch": 18.40578358208955,
"grad_norm": 0.37177714705467224,
"learning_rate": 0.00037935608048993873,
"loss": 3.2521,
"step": 63150
},
{
"epoch": 18.420359141791046,
"grad_norm": 0.39759817719459534,
"learning_rate": 0.0003791811023622047,
"loss": 3.2596,
"step": 63200
},
{
"epoch": 18.434934701492537,
"grad_norm": 0.36642131209373474,
"learning_rate": 0.0003790061242344707,
"loss": 3.2562,
"step": 63250
},
{
"epoch": 18.44951026119403,
"grad_norm": 0.37006133794784546,
"learning_rate": 0.0003788311461067366,
"loss": 3.2456,
"step": 63300
},
{
"epoch": 18.464085820895523,
"grad_norm": 0.4009372591972351,
"learning_rate": 0.00037865616797900256,
"loss": 3.2532,
"step": 63350
},
{
"epoch": 18.478661380597014,
"grad_norm": 0.3532605469226837,
"learning_rate": 0.00037848118985126856,
"loss": 3.2627,
"step": 63400
},
{
"epoch": 18.49323694029851,
"grad_norm": 0.42864686250686646,
"learning_rate": 0.00037830621172353456,
"loss": 3.256,
"step": 63450
},
{
"epoch": 18.5078125,
"grad_norm": 0.35426709055900574,
"learning_rate": 0.00037813123359580045,
"loss": 3.26,
"step": 63500
},
{
"epoch": 18.52238805970149,
"grad_norm": 0.3834630250930786,
"learning_rate": 0.00037795625546806644,
"loss": 3.2534,
"step": 63550
},
{
"epoch": 18.536963619402986,
"grad_norm": 0.39207953214645386,
"learning_rate": 0.00037778127734033244,
"loss": 3.2513,
"step": 63600
},
{
"epoch": 18.551539179104477,
"grad_norm": 0.36887794733047485,
"learning_rate": 0.00037760629921259844,
"loss": 3.2679,
"step": 63650
},
{
"epoch": 18.56611473880597,
"grad_norm": 0.41247034072875977,
"learning_rate": 0.00037743132108486433,
"loss": 3.2684,
"step": 63700
},
{
"epoch": 18.580690298507463,
"grad_norm": 0.4239455759525299,
"learning_rate": 0.0003772563429571303,
"loss": 3.2613,
"step": 63750
},
{
"epoch": 18.595265858208954,
"grad_norm": 0.3865257203578949,
"learning_rate": 0.0003770813648293963,
"loss": 3.2729,
"step": 63800
},
{
"epoch": 18.60984141791045,
"grad_norm": 0.4032337963581085,
"learning_rate": 0.00037690638670166227,
"loss": 3.2639,
"step": 63850
},
{
"epoch": 18.62441697761194,
"grad_norm": 0.3696430027484894,
"learning_rate": 0.0003767314085739282,
"loss": 3.2581,
"step": 63900
},
{
"epoch": 18.638992537313435,
"grad_norm": 0.3973971903324127,
"learning_rate": 0.0003765564304461942,
"loss": 3.2707,
"step": 63950
},
{
"epoch": 18.653568097014926,
"grad_norm": 0.3754875361919403,
"learning_rate": 0.00037638145231846015,
"loss": 3.2619,
"step": 64000
},
{
"epoch": 18.653568097014926,
"eval_accuracy": 0.3718132521349932,
"eval_loss": 3.5468034744262695,
"eval_runtime": 180.8218,
"eval_samples_per_second": 91.952,
"eval_steps_per_second": 5.752,
"step": 64000
},
{
"epoch": 18.668143656716417,
"grad_norm": 0.3698972463607788,
"learning_rate": 0.00037620647419072615,
"loss": 3.2717,
"step": 64050
},
{
"epoch": 18.68271921641791,
"grad_norm": 0.37259402871131897,
"learning_rate": 0.0003760314960629921,
"loss": 3.2705,
"step": 64100
},
{
"epoch": 18.697294776119403,
"grad_norm": 0.3753381371498108,
"learning_rate": 0.00037585651793525804,
"loss": 3.2766,
"step": 64150
},
{
"epoch": 18.711870335820894,
"grad_norm": 0.3642279803752899,
"learning_rate": 0.00037568153980752404,
"loss": 3.2636,
"step": 64200
},
{
"epoch": 18.72644589552239,
"grad_norm": 0.3768105208873749,
"learning_rate": 0.00037550656167979,
"loss": 3.2737,
"step": 64250
},
{
"epoch": 18.74102145522388,
"grad_norm": 0.36915749311447144,
"learning_rate": 0.0003753315835520559,
"loss": 3.2668,
"step": 64300
},
{
"epoch": 18.755597014925375,
"grad_norm": 0.35579395294189453,
"learning_rate": 0.0003751566054243219,
"loss": 3.2714,
"step": 64350
},
{
"epoch": 18.770172574626866,
"grad_norm": 0.3788515031337738,
"learning_rate": 0.0003749816272965879,
"loss": 3.2906,
"step": 64400
},
{
"epoch": 18.784748134328357,
"grad_norm": 0.38902297616004944,
"learning_rate": 0.00037480664916885386,
"loss": 3.2725,
"step": 64450
},
{
"epoch": 18.79932369402985,
"grad_norm": 0.420396089553833,
"learning_rate": 0.0003746316710411198,
"loss": 3.2659,
"step": 64500
},
{
"epoch": 18.813899253731343,
"grad_norm": 0.37137776613235474,
"learning_rate": 0.0003744566929133858,
"loss": 3.2694,
"step": 64550
},
{
"epoch": 18.828474813432837,
"grad_norm": 0.3613051176071167,
"learning_rate": 0.0003742817147856518,
"loss": 3.2652,
"step": 64600
},
{
"epoch": 18.84305037313433,
"grad_norm": 0.3822399973869324,
"learning_rate": 0.0003741067366579177,
"loss": 3.2897,
"step": 64650
},
{
"epoch": 18.85762593283582,
"grad_norm": 0.38065946102142334,
"learning_rate": 0.0003739317585301837,
"loss": 3.2776,
"step": 64700
},
{
"epoch": 18.872201492537314,
"grad_norm": 0.36325129866600037,
"learning_rate": 0.0003737567804024497,
"loss": 3.2708,
"step": 64750
},
{
"epoch": 18.886777052238806,
"grad_norm": 0.3650025427341461,
"learning_rate": 0.0003735818022747157,
"loss": 3.2868,
"step": 64800
},
{
"epoch": 18.901352611940297,
"grad_norm": 0.3612610101699829,
"learning_rate": 0.00037340682414698157,
"loss": 3.2961,
"step": 64850
},
{
"epoch": 18.91592817164179,
"grad_norm": 0.354300856590271,
"learning_rate": 0.00037323184601924757,
"loss": 3.2865,
"step": 64900
},
{
"epoch": 18.930503731343283,
"grad_norm": 0.3676280081272125,
"learning_rate": 0.00037305686789151357,
"loss": 3.2864,
"step": 64950
},
{
"epoch": 18.945079291044777,
"grad_norm": 0.3943864107131958,
"learning_rate": 0.00037288188976377946,
"loss": 3.2919,
"step": 65000
},
{
"epoch": 18.945079291044777,
"eval_accuracy": 0.37216375364757553,
"eval_loss": 3.5422606468200684,
"eval_runtime": 180.2835,
"eval_samples_per_second": 92.227,
"eval_steps_per_second": 5.769,
"step": 65000
},
{
"epoch": 18.95965485074627,
"grad_norm": 0.37015199661254883,
"learning_rate": 0.00037270691163604545,
"loss": 3.2765,
"step": 65050
},
{
"epoch": 18.97423041044776,
"grad_norm": 0.416165828704834,
"learning_rate": 0.00037253193350831145,
"loss": 3.2774,
"step": 65100
},
{
"epoch": 18.988805970149254,
"grad_norm": 0.37499311566352844,
"learning_rate": 0.0003723569553805774,
"loss": 3.2861,
"step": 65150
},
{
"epoch": 19.00320662313433,
"grad_norm": 0.3819681406021118,
"learning_rate": 0.00037218197725284334,
"loss": 3.2615,
"step": 65200
},
{
"epoch": 19.01778218283582,
"grad_norm": 0.3987461030483246,
"learning_rate": 0.00037200699912510934,
"loss": 3.1719,
"step": 65250
},
{
"epoch": 19.032357742537314,
"grad_norm": 0.4024551510810852,
"learning_rate": 0.0003718320209973753,
"loss": 3.1772,
"step": 65300
},
{
"epoch": 19.046933302238806,
"grad_norm": 0.36730852723121643,
"learning_rate": 0.0003716570428696413,
"loss": 3.1886,
"step": 65350
},
{
"epoch": 19.061508861940297,
"grad_norm": 0.3820883631706238,
"learning_rate": 0.0003714820647419072,
"loss": 3.1853,
"step": 65400
},
{
"epoch": 19.07608442164179,
"grad_norm": 0.3786020278930664,
"learning_rate": 0.00037130708661417316,
"loss": 3.194,
"step": 65450
},
{
"epoch": 19.090659981343283,
"grad_norm": 0.37983399629592896,
"learning_rate": 0.00037113210848643916,
"loss": 3.197,
"step": 65500
},
{
"epoch": 19.105235541044777,
"grad_norm": 0.3765254318714142,
"learning_rate": 0.00037095713035870516,
"loss": 3.197,
"step": 65550
},
{
"epoch": 19.11981110074627,
"grad_norm": 0.4143414795398712,
"learning_rate": 0.00037078215223097105,
"loss": 3.2023,
"step": 65600
},
{
"epoch": 19.13438666044776,
"grad_norm": 0.3956504166126251,
"learning_rate": 0.00037060717410323705,
"loss": 3.2054,
"step": 65650
},
{
"epoch": 19.148962220149254,
"grad_norm": 0.3792484998703003,
"learning_rate": 0.00037043219597550304,
"loss": 3.2109,
"step": 65700
},
{
"epoch": 19.163537779850746,
"grad_norm": 0.3993787467479706,
"learning_rate": 0.00037025721784776904,
"loss": 3.2066,
"step": 65750
},
{
"epoch": 19.17811333955224,
"grad_norm": 0.38693755865097046,
"learning_rate": 0.00037008223972003493,
"loss": 3.2079,
"step": 65800
},
{
"epoch": 19.19268889925373,
"grad_norm": 0.3611437678337097,
"learning_rate": 0.00036990726159230093,
"loss": 3.2081,
"step": 65850
},
{
"epoch": 19.207264458955223,
"grad_norm": 0.39916324615478516,
"learning_rate": 0.0003697322834645669,
"loss": 3.2189,
"step": 65900
},
{
"epoch": 19.221840018656717,
"grad_norm": 0.3750282824039459,
"learning_rate": 0.0003695573053368328,
"loss": 3.2194,
"step": 65950
},
{
"epoch": 19.23641557835821,
"grad_norm": 0.3601391315460205,
"learning_rate": 0.0003693823272090988,
"loss": 3.2264,
"step": 66000
},
{
"epoch": 19.23641557835821,
"eval_accuracy": 0.37148440686093176,
"eval_loss": 3.5561089515686035,
"eval_runtime": 179.3982,
"eval_samples_per_second": 92.682,
"eval_steps_per_second": 5.797,
"step": 66000
},
{
"epoch": 19.250991138059703,
"grad_norm": 0.3798132538795471,
"learning_rate": 0.0003692073490813648,
"loss": 3.2224,
"step": 66050
},
{
"epoch": 19.265566697761194,
"grad_norm": 0.3911631405353546,
"learning_rate": 0.0003690323709536308,
"loss": 3.2158,
"step": 66100
},
{
"epoch": 19.280142257462686,
"grad_norm": 0.3523401916027069,
"learning_rate": 0.0003688573928258967,
"loss": 3.2245,
"step": 66150
},
{
"epoch": 19.29471781716418,
"grad_norm": 0.3657352328300476,
"learning_rate": 0.0003686824146981627,
"loss": 3.2299,
"step": 66200
},
{
"epoch": 19.30929337686567,
"grad_norm": 0.4035662114620209,
"learning_rate": 0.0003685074365704287,
"loss": 3.2245,
"step": 66250
},
{
"epoch": 19.323868936567163,
"grad_norm": 0.41193512082099915,
"learning_rate": 0.00036833245844269464,
"loss": 3.2311,
"step": 66300
},
{
"epoch": 19.338444496268657,
"grad_norm": 0.368966668844223,
"learning_rate": 0.0003681574803149606,
"loss": 3.2366,
"step": 66350
},
{
"epoch": 19.35302005597015,
"grad_norm": 0.37729912996292114,
"learning_rate": 0.0003679825021872266,
"loss": 3.2232,
"step": 66400
},
{
"epoch": 19.367595615671643,
"grad_norm": 0.39348021149635315,
"learning_rate": 0.0003678075240594925,
"loss": 3.2442,
"step": 66450
},
{
"epoch": 19.382171175373134,
"grad_norm": 0.4021179676055908,
"learning_rate": 0.0003676325459317585,
"loss": 3.2287,
"step": 66500
},
{
"epoch": 19.396746735074625,
"grad_norm": 0.39004001021385193,
"learning_rate": 0.00036745756780402446,
"loss": 3.2393,
"step": 66550
},
{
"epoch": 19.41132229477612,
"grad_norm": 0.4309486746788025,
"learning_rate": 0.0003672825896762904,
"loss": 3.2398,
"step": 66600
},
{
"epoch": 19.42589785447761,
"grad_norm": 0.3987930119037628,
"learning_rate": 0.0003671076115485564,
"loss": 3.2451,
"step": 66650
},
{
"epoch": 19.440473414179106,
"grad_norm": 0.3985956907272339,
"learning_rate": 0.0003669326334208224,
"loss": 3.2425,
"step": 66700
},
{
"epoch": 19.455048973880597,
"grad_norm": 0.38769444823265076,
"learning_rate": 0.0003667576552930883,
"loss": 3.2463,
"step": 66750
},
{
"epoch": 19.46962453358209,
"grad_norm": 0.36316731572151184,
"learning_rate": 0.0003665826771653543,
"loss": 3.2502,
"step": 66800
},
{
"epoch": 19.484200093283583,
"grad_norm": 0.374905526638031,
"learning_rate": 0.0003664076990376203,
"loss": 3.2422,
"step": 66850
},
{
"epoch": 19.498775652985074,
"grad_norm": 0.38985154032707214,
"learning_rate": 0.0003662327209098862,
"loss": 3.2589,
"step": 66900
},
{
"epoch": 19.513351212686565,
"grad_norm": 0.37939122319221497,
"learning_rate": 0.0003660577427821522,
"loss": 3.2459,
"step": 66950
},
{
"epoch": 19.52792677238806,
"grad_norm": 0.3842551112174988,
"learning_rate": 0.00036588276465441817,
"loss": 3.2394,
"step": 67000
},
{
"epoch": 19.52792677238806,
"eval_accuracy": 0.37200721670609316,
"eval_loss": 3.5495340824127197,
"eval_runtime": 179.2821,
"eval_samples_per_second": 92.742,
"eval_steps_per_second": 5.801,
"step": 67000
},
{
"epoch": 19.54250233208955,
"grad_norm": 0.37161803245544434,
"learning_rate": 0.00036570778652668417,
"loss": 3.2461,
"step": 67050
},
{
"epoch": 19.557077891791046,
"grad_norm": 0.39652761816978455,
"learning_rate": 0.00036553280839895006,
"loss": 3.2559,
"step": 67100
},
{
"epoch": 19.571653451492537,
"grad_norm": 0.403962641954422,
"learning_rate": 0.00036535783027121606,
"loss": 3.2747,
"step": 67150
},
{
"epoch": 19.58622901119403,
"grad_norm": 0.3734639585018158,
"learning_rate": 0.00036518285214348205,
"loss": 3.2566,
"step": 67200
},
{
"epoch": 19.600804570895523,
"grad_norm": 0.3627129793167114,
"learning_rate": 0.00036500787401574805,
"loss": 3.2476,
"step": 67250
},
{
"epoch": 19.615380130597014,
"grad_norm": 0.3672148585319519,
"learning_rate": 0.00036483289588801394,
"loss": 3.2564,
"step": 67300
},
{
"epoch": 19.62995569029851,
"grad_norm": 0.38739728927612305,
"learning_rate": 0.00036465791776027994,
"loss": 3.2598,
"step": 67350
},
{
"epoch": 19.64453125,
"grad_norm": 0.352742463350296,
"learning_rate": 0.00036448293963254594,
"loss": 3.2527,
"step": 67400
},
{
"epoch": 19.65910680970149,
"grad_norm": 0.3656388521194458,
"learning_rate": 0.0003643079615048119,
"loss": 3.2553,
"step": 67450
},
{
"epoch": 19.673682369402986,
"grad_norm": 0.3881314694881439,
"learning_rate": 0.0003641329833770778,
"loss": 3.2561,
"step": 67500
},
{
"epoch": 19.688257929104477,
"grad_norm": 0.3891017436981201,
"learning_rate": 0.0003639580052493438,
"loss": 3.2463,
"step": 67550
},
{
"epoch": 19.70283348880597,
"grad_norm": 0.39045625925064087,
"learning_rate": 0.00036378302712160976,
"loss": 3.2573,
"step": 67600
},
{
"epoch": 19.717409048507463,
"grad_norm": 0.3692318797111511,
"learning_rate": 0.00036360804899387576,
"loss": 3.2656,
"step": 67650
},
{
"epoch": 19.731984608208954,
"grad_norm": 0.3797418177127838,
"learning_rate": 0.0003634330708661417,
"loss": 3.2651,
"step": 67700
},
{
"epoch": 19.74656016791045,
"grad_norm": 0.40278735756874084,
"learning_rate": 0.00036325809273840765,
"loss": 3.2616,
"step": 67750
},
{
"epoch": 19.76113572761194,
"grad_norm": 0.36150145530700684,
"learning_rate": 0.00036308311461067365,
"loss": 3.2544,
"step": 67800
},
{
"epoch": 19.775711287313435,
"grad_norm": 0.3682866394519806,
"learning_rate": 0.0003629081364829396,
"loss": 3.266,
"step": 67850
},
{
"epoch": 19.790286847014926,
"grad_norm": 0.3680150508880615,
"learning_rate": 0.00036273315835520553,
"loss": 3.259,
"step": 67900
},
{
"epoch": 19.804862406716417,
"grad_norm": 0.4011524021625519,
"learning_rate": 0.00036255818022747153,
"loss": 3.2745,
"step": 67950
},
{
"epoch": 19.81943796641791,
"grad_norm": 0.361175000667572,
"learning_rate": 0.00036238320209973753,
"loss": 3.2665,
"step": 68000
},
{
"epoch": 19.81943796641791,
"eval_accuracy": 0.3724805938328917,
"eval_loss": 3.5455946922302246,
"eval_runtime": 179.659,
"eval_samples_per_second": 92.548,
"eval_steps_per_second": 5.789,
"step": 68000
},
{
"epoch": 19.834013526119403,
"grad_norm": 0.3918949365615845,
"learning_rate": 0.0003622082239720034,
"loss": 3.2757,
"step": 68050
},
{
"epoch": 19.848589085820894,
"grad_norm": 0.4035460352897644,
"learning_rate": 0.0003620332458442694,
"loss": 3.2647,
"step": 68100
},
{
"epoch": 19.86316464552239,
"grad_norm": 0.39110511541366577,
"learning_rate": 0.0003618582677165354,
"loss": 3.2762,
"step": 68150
},
{
"epoch": 19.87774020522388,
"grad_norm": 0.3898833692073822,
"learning_rate": 0.0003616832895888014,
"loss": 3.2705,
"step": 68200
},
{
"epoch": 19.892315764925375,
"grad_norm": 0.3959086835384369,
"learning_rate": 0.0003615083114610673,
"loss": 3.2747,
"step": 68250
},
{
"epoch": 19.906891324626866,
"grad_norm": 0.40734153985977173,
"learning_rate": 0.0003613333333333333,
"loss": 3.2645,
"step": 68300
},
{
"epoch": 19.921466884328357,
"grad_norm": 0.38103732466697693,
"learning_rate": 0.0003611583552055993,
"loss": 3.2809,
"step": 68350
},
{
"epoch": 19.93604244402985,
"grad_norm": 0.3835790753364563,
"learning_rate": 0.00036098337707786524,
"loss": 3.2718,
"step": 68400
},
{
"epoch": 19.950618003731343,
"grad_norm": 0.3652678430080414,
"learning_rate": 0.0003608083989501312,
"loss": 3.2677,
"step": 68450
},
{
"epoch": 19.965193563432837,
"grad_norm": 0.36673831939697266,
"learning_rate": 0.0003606334208223972,
"loss": 3.2655,
"step": 68500
},
{
"epoch": 19.97976912313433,
"grad_norm": 0.39470627903938293,
"learning_rate": 0.0003604584426946632,
"loss": 3.2774,
"step": 68550
},
{
"epoch": 19.99434468283582,
"grad_norm": 0.3817540407180786,
"learning_rate": 0.00036028346456692907,
"loss": 3.2817,
"step": 68600
},
{
"epoch": 20.008745335820894,
"grad_norm": 0.39281952381134033,
"learning_rate": 0.00036010848643919507,
"loss": 3.2112,
"step": 68650
},
{
"epoch": 20.02332089552239,
"grad_norm": 0.4034322500228882,
"learning_rate": 0.00035993350831146106,
"loss": 3.1723,
"step": 68700
},
{
"epoch": 20.03789645522388,
"grad_norm": 0.370498389005661,
"learning_rate": 0.000359758530183727,
"loss": 3.1685,
"step": 68750
},
{
"epoch": 20.052472014925375,
"grad_norm": 0.39685019850730896,
"learning_rate": 0.00035958355205599295,
"loss": 3.184,
"step": 68800
},
{
"epoch": 20.067047574626866,
"grad_norm": 0.37462735176086426,
"learning_rate": 0.00035940857392825895,
"loss": 3.1755,
"step": 68850
},
{
"epoch": 20.081623134328357,
"grad_norm": 0.4028313159942627,
"learning_rate": 0.0003592335958005249,
"loss": 3.1769,
"step": 68900
},
{
"epoch": 20.09619869402985,
"grad_norm": 0.3951454162597656,
"learning_rate": 0.0003590586176727909,
"loss": 3.1853,
"step": 68950
},
{
"epoch": 20.110774253731343,
"grad_norm": 0.41274914145469666,
"learning_rate": 0.00035888363954505683,
"loss": 3.1939,
"step": 69000
},
{
"epoch": 20.110774253731343,
"eval_accuracy": 0.3716299979862052,
"eval_loss": 3.5588364601135254,
"eval_runtime": 179.4501,
"eval_samples_per_second": 92.655,
"eval_steps_per_second": 5.795,
"step": 69000
},
{
"epoch": 20.125349813432837,
"grad_norm": 0.3881000280380249,
"learning_rate": 0.0003587086614173228,
"loss": 3.2051,
"step": 69050
},
{
"epoch": 20.13992537313433,
"grad_norm": 0.38626691699028015,
"learning_rate": 0.0003585336832895888,
"loss": 3.1925,
"step": 69100
},
{
"epoch": 20.15450093283582,
"grad_norm": 0.37013718485832214,
"learning_rate": 0.00035835870516185477,
"loss": 3.1998,
"step": 69150
},
{
"epoch": 20.169076492537314,
"grad_norm": 0.4230809509754181,
"learning_rate": 0.00035818372703412066,
"loss": 3.2044,
"step": 69200
},
{
"epoch": 20.183652052238806,
"grad_norm": 0.39065060019493103,
"learning_rate": 0.00035800874890638666,
"loss": 3.2022,
"step": 69250
},
{
"epoch": 20.198227611940297,
"grad_norm": 0.38382846117019653,
"learning_rate": 0.00035783377077865266,
"loss": 3.2117,
"step": 69300
},
{
"epoch": 20.21280317164179,
"grad_norm": 0.3768085837364197,
"learning_rate": 0.00035765879265091865,
"loss": 3.2136,
"step": 69350
},
{
"epoch": 20.227378731343283,
"grad_norm": 0.42641666531562805,
"learning_rate": 0.00035748381452318454,
"loss": 3.1956,
"step": 69400
},
{
"epoch": 20.241954291044777,
"grad_norm": 0.3793436288833618,
"learning_rate": 0.00035730883639545054,
"loss": 3.2154,
"step": 69450
},
{
"epoch": 20.25652985074627,
"grad_norm": 0.38998252153396606,
"learning_rate": 0.00035713385826771654,
"loss": 3.2057,
"step": 69500
},
{
"epoch": 20.27110541044776,
"grad_norm": 0.4177810847759247,
"learning_rate": 0.00035695888013998243,
"loss": 3.2119,
"step": 69550
},
{
"epoch": 20.285680970149254,
"grad_norm": 0.37320512533187866,
"learning_rate": 0.0003567839020122484,
"loss": 3.2149,
"step": 69600
},
{
"epoch": 20.300256529850746,
"grad_norm": 0.3940044939517975,
"learning_rate": 0.0003566089238845144,
"loss": 3.2202,
"step": 69650
},
{
"epoch": 20.31483208955224,
"grad_norm": 0.38435447216033936,
"learning_rate": 0.00035643394575678037,
"loss": 3.2195,
"step": 69700
},
{
"epoch": 20.32940764925373,
"grad_norm": 0.38087761402130127,
"learning_rate": 0.0003562589676290463,
"loss": 3.2175,
"step": 69750
},
{
"epoch": 20.343983208955223,
"grad_norm": 0.4099285900592804,
"learning_rate": 0.0003560839895013123,
"loss": 3.2241,
"step": 69800
},
{
"epoch": 20.358558768656717,
"grad_norm": 0.4072279632091522,
"learning_rate": 0.0003559090113735783,
"loss": 3.2172,
"step": 69850
},
{
"epoch": 20.37313432835821,
"grad_norm": 0.3808026909828186,
"learning_rate": 0.00035573403324584425,
"loss": 3.2092,
"step": 69900
},
{
"epoch": 20.387709888059703,
"grad_norm": 0.3662373721599579,
"learning_rate": 0.0003555590551181102,
"loss": 3.2168,
"step": 69950
},
{
"epoch": 20.402285447761194,
"grad_norm": 0.3988702893257141,
"learning_rate": 0.0003553840769903762,
"loss": 3.2149,
"step": 70000
},
{
"epoch": 20.402285447761194,
"eval_accuracy": 0.37192118023675214,
"eval_loss": 3.555593490600586,
"eval_runtime": 179.2627,
"eval_samples_per_second": 92.752,
"eval_steps_per_second": 5.802,
"step": 70000
},
{
"epoch": 20.416861007462686,
"grad_norm": 0.37926796078681946,
"learning_rate": 0.00035520909886264213,
"loss": 3.2228,
"step": 70050
},
{
"epoch": 20.43143656716418,
"grad_norm": 0.38676461577415466,
"learning_rate": 0.00035503412073490813,
"loss": 3.2301,
"step": 70100
},
{
"epoch": 20.44601212686567,
"grad_norm": 0.3928930163383484,
"learning_rate": 0.0003548591426071741,
"loss": 3.2322,
"step": 70150
},
{
"epoch": 20.460587686567163,
"grad_norm": 0.3859976530075073,
"learning_rate": 0.00035468416447944,
"loss": 3.2353,
"step": 70200
},
{
"epoch": 20.475163246268657,
"grad_norm": 0.42515480518341064,
"learning_rate": 0.000354509186351706,
"loss": 3.2248,
"step": 70250
},
{
"epoch": 20.48973880597015,
"grad_norm": 0.4182749092578888,
"learning_rate": 0.000354334208223972,
"loss": 3.2393,
"step": 70300
},
{
"epoch": 20.504314365671643,
"grad_norm": 0.39793065190315247,
"learning_rate": 0.0003541592300962379,
"loss": 3.2468,
"step": 70350
},
{
"epoch": 20.518889925373134,
"grad_norm": 0.3741270899772644,
"learning_rate": 0.0003539842519685039,
"loss": 3.2475,
"step": 70400
},
{
"epoch": 20.533465485074625,
"grad_norm": 0.38552579283714294,
"learning_rate": 0.0003538092738407699,
"loss": 3.2452,
"step": 70450
},
{
"epoch": 20.54804104477612,
"grad_norm": 0.3872421383857727,
"learning_rate": 0.0003536342957130358,
"loss": 3.2376,
"step": 70500
},
{
"epoch": 20.56261660447761,
"grad_norm": 0.40599504113197327,
"learning_rate": 0.0003534593175853018,
"loss": 3.2374,
"step": 70550
},
{
"epoch": 20.577192164179106,
"grad_norm": 0.38468611240386963,
"learning_rate": 0.0003532843394575678,
"loss": 3.2457,
"step": 70600
},
{
"epoch": 20.591767723880597,
"grad_norm": 0.40714865922927856,
"learning_rate": 0.0003531093613298338,
"loss": 3.2432,
"step": 70650
},
{
"epoch": 20.60634328358209,
"grad_norm": 0.38764360547065735,
"learning_rate": 0.00035293438320209967,
"loss": 3.2479,
"step": 70700
},
{
"epoch": 20.620918843283583,
"grad_norm": 0.36900195479393005,
"learning_rate": 0.00035275940507436567,
"loss": 3.2318,
"step": 70750
},
{
"epoch": 20.635494402985074,
"grad_norm": 0.4236606955528259,
"learning_rate": 0.00035258442694663166,
"loss": 3.249,
"step": 70800
},
{
"epoch": 20.650069962686565,
"grad_norm": 0.4084267020225525,
"learning_rate": 0.0003524094488188976,
"loss": 3.2455,
"step": 70850
},
{
"epoch": 20.66464552238806,
"grad_norm": 0.39864709973335266,
"learning_rate": 0.00035223447069116355,
"loss": 3.2535,
"step": 70900
},
{
"epoch": 20.67922108208955,
"grad_norm": 0.4109085202217102,
"learning_rate": 0.00035205949256342955,
"loss": 3.2512,
"step": 70950
},
{
"epoch": 20.693796641791046,
"grad_norm": 0.4137507975101471,
"learning_rate": 0.0003518845144356955,
"loss": 3.2447,
"step": 71000
},
{
"epoch": 20.693796641791046,
"eval_accuracy": 0.3722022405497295,
"eval_loss": 3.5451698303222656,
"eval_runtime": 179.3907,
"eval_samples_per_second": 92.686,
"eval_steps_per_second": 5.797,
"step": 71000
},
{
"epoch": 20.708372201492537,
"grad_norm": 0.36888283491134644,
"learning_rate": 0.0003517095363079615,
"loss": 3.2481,
"step": 71050
},
{
"epoch": 20.72294776119403,
"grad_norm": 0.4099491238594055,
"learning_rate": 0.00035153455818022743,
"loss": 3.2501,
"step": 71100
},
{
"epoch": 20.737523320895523,
"grad_norm": 0.37443771958351135,
"learning_rate": 0.00035135958005249343,
"loss": 3.2593,
"step": 71150
},
{
"epoch": 20.752098880597014,
"grad_norm": 0.36958447098731995,
"learning_rate": 0.0003511846019247594,
"loss": 3.249,
"step": 71200
},
{
"epoch": 20.76667444029851,
"grad_norm": 0.38606441020965576,
"learning_rate": 0.0003510096237970253,
"loss": 3.2524,
"step": 71250
},
{
"epoch": 20.78125,
"grad_norm": 0.39055198431015015,
"learning_rate": 0.0003508346456692913,
"loss": 3.2566,
"step": 71300
},
{
"epoch": 20.79582555970149,
"grad_norm": 0.37249884009361267,
"learning_rate": 0.00035065966754155726,
"loss": 3.2574,
"step": 71350
},
{
"epoch": 20.810401119402986,
"grad_norm": 0.3753904402256012,
"learning_rate": 0.00035048468941382326,
"loss": 3.2471,
"step": 71400
},
{
"epoch": 20.824976679104477,
"grad_norm": 0.37924399971961975,
"learning_rate": 0.0003503097112860892,
"loss": 3.26,
"step": 71450
},
{
"epoch": 20.83955223880597,
"grad_norm": 0.3716459274291992,
"learning_rate": 0.00035013473315835514,
"loss": 3.2668,
"step": 71500
},
{
"epoch": 20.854127798507463,
"grad_norm": 0.37003329396247864,
"learning_rate": 0.00034995975503062114,
"loss": 3.2646,
"step": 71550
},
{
"epoch": 20.868703358208954,
"grad_norm": 0.40597912669181824,
"learning_rate": 0.00034978477690288714,
"loss": 3.259,
"step": 71600
},
{
"epoch": 20.88327891791045,
"grad_norm": 0.4224961996078491,
"learning_rate": 0.00034960979877515303,
"loss": 3.277,
"step": 71650
},
{
"epoch": 20.89785447761194,
"grad_norm": 0.3840181529521942,
"learning_rate": 0.000349434820647419,
"loss": 3.2468,
"step": 71700
},
{
"epoch": 20.912430037313435,
"grad_norm": 0.39738985896110535,
"learning_rate": 0.000349259842519685,
"loss": 3.2666,
"step": 71750
},
{
"epoch": 20.927005597014926,
"grad_norm": 0.3849341571331024,
"learning_rate": 0.000349084864391951,
"loss": 3.2541,
"step": 71800
},
{
"epoch": 20.941581156716417,
"grad_norm": 0.38834768533706665,
"learning_rate": 0.0003489098862642169,
"loss": 3.2643,
"step": 71850
},
{
"epoch": 20.95615671641791,
"grad_norm": 0.36527711153030396,
"learning_rate": 0.0003487349081364829,
"loss": 3.2632,
"step": 71900
},
{
"epoch": 20.970732276119403,
"grad_norm": 0.40803590416908264,
"learning_rate": 0.0003485599300087489,
"loss": 3.263,
"step": 71950
},
{
"epoch": 20.985307835820894,
"grad_norm": 0.3897600769996643,
"learning_rate": 0.00034838495188101485,
"loss": 3.2733,
"step": 72000
},
{
"epoch": 20.985307835820894,
"eval_accuracy": 0.3729873968930595,
"eval_loss": 3.538719415664673,
"eval_runtime": 179.3409,
"eval_samples_per_second": 92.712,
"eval_steps_per_second": 5.799,
"step": 72000
},
{
"epoch": 20.99988339552239,
"grad_norm": 0.359904021024704,
"learning_rate": 0.0003482099737532808,
"loss": 3.2644,
"step": 72050
},
{
"epoch": 21.014284048507463,
"grad_norm": 0.38795262575149536,
"learning_rate": 0.0003480349956255468,
"loss": 3.144,
"step": 72100
},
{
"epoch": 21.028859608208954,
"grad_norm": 0.41623035073280334,
"learning_rate": 0.00034786001749781274,
"loss": 3.1663,
"step": 72150
},
{
"epoch": 21.04343516791045,
"grad_norm": 0.40783727169036865,
"learning_rate": 0.0003476850393700787,
"loss": 3.1662,
"step": 72200
},
{
"epoch": 21.05801072761194,
"grad_norm": 0.4014035165309906,
"learning_rate": 0.0003475100612423447,
"loss": 3.1651,
"step": 72250
},
{
"epoch": 21.07258628731343,
"grad_norm": 0.3629119098186493,
"learning_rate": 0.0003473350831146106,
"loss": 3.1723,
"step": 72300
},
{
"epoch": 21.087161847014926,
"grad_norm": 0.40349218249320984,
"learning_rate": 0.0003471601049868766,
"loss": 3.1713,
"step": 72350
},
{
"epoch": 21.101737406716417,
"grad_norm": 0.41936585307121277,
"learning_rate": 0.00034698512685914256,
"loss": 3.1753,
"step": 72400
},
{
"epoch": 21.11631296641791,
"grad_norm": 0.38126733899116516,
"learning_rate": 0.00034681014873140856,
"loss": 3.1804,
"step": 72450
},
{
"epoch": 21.130888526119403,
"grad_norm": 0.4074459969997406,
"learning_rate": 0.0003466351706036745,
"loss": 3.197,
"step": 72500
},
{
"epoch": 21.145464085820894,
"grad_norm": 0.40520617365837097,
"learning_rate": 0.0003464601924759405,
"loss": 3.1886,
"step": 72550
},
{
"epoch": 21.16003964552239,
"grad_norm": 0.3764589726924896,
"learning_rate": 0.00034628521434820644,
"loss": 3.1782,
"step": 72600
},
{
"epoch": 21.17461520522388,
"grad_norm": 0.389152467250824,
"learning_rate": 0.0003461102362204724,
"loss": 3.1987,
"step": 72650
},
{
"epoch": 21.189190764925375,
"grad_norm": 0.4187963604927063,
"learning_rate": 0.0003459352580927384,
"loss": 3.1922,
"step": 72700
},
{
"epoch": 21.203766324626866,
"grad_norm": 0.38693854212760925,
"learning_rate": 0.0003457602799650044,
"loss": 3.194,
"step": 72750
},
{
"epoch": 21.218341884328357,
"grad_norm": 0.37709715962409973,
"learning_rate": 0.00034558530183727027,
"loss": 3.1903,
"step": 72800
},
{
"epoch": 21.23291744402985,
"grad_norm": 0.3794679641723633,
"learning_rate": 0.00034541032370953627,
"loss": 3.201,
"step": 72850
},
{
"epoch": 21.247493003731343,
"grad_norm": 0.3991202414035797,
"learning_rate": 0.00034523534558180227,
"loss": 3.1958,
"step": 72900
},
{
"epoch": 21.262068563432837,
"grad_norm": 0.41374242305755615,
"learning_rate": 0.00034506036745406826,
"loss": 3.2002,
"step": 72950
},
{
"epoch": 21.27664412313433,
"grad_norm": 0.37337031960487366,
"learning_rate": 0.00034488538932633415,
"loss": 3.2131,
"step": 73000
},
{
"epoch": 21.27664412313433,
"eval_accuracy": 0.37209054614561915,
"eval_loss": 3.558110475540161,
"eval_runtime": 179.4815,
"eval_samples_per_second": 92.639,
"eval_steps_per_second": 5.794,
"step": 73000
},
{
"epoch": 21.29121968283582,
"grad_norm": 0.4104592502117157,
"learning_rate": 0.00034471041119860015,
"loss": 3.2005,
"step": 73050
},
{
"epoch": 21.305795242537314,
"grad_norm": 0.40162068605422974,
"learning_rate": 0.00034453543307086615,
"loss": 3.204,
"step": 73100
},
{
"epoch": 21.320370802238806,
"grad_norm": 0.3967791497707367,
"learning_rate": 0.00034436045494313204,
"loss": 3.2127,
"step": 73150
},
{
"epoch": 21.334946361940297,
"grad_norm": 0.41675812005996704,
"learning_rate": 0.00034418547681539804,
"loss": 3.209,
"step": 73200
},
{
"epoch": 21.34952192164179,
"grad_norm": 0.37078526616096497,
"learning_rate": 0.00034401049868766403,
"loss": 3.2243,
"step": 73250
},
{
"epoch": 21.364097481343283,
"grad_norm": 0.3804914355278015,
"learning_rate": 0.00034383552055993,
"loss": 3.1965,
"step": 73300
},
{
"epoch": 21.378673041044777,
"grad_norm": 0.4345126450061798,
"learning_rate": 0.0003436605424321959,
"loss": 3.2157,
"step": 73350
},
{
"epoch": 21.39324860074627,
"grad_norm": 0.3758656978607178,
"learning_rate": 0.0003434855643044619,
"loss": 3.2316,
"step": 73400
},
{
"epoch": 21.40782416044776,
"grad_norm": 0.40299421548843384,
"learning_rate": 0.00034331058617672786,
"loss": 3.2177,
"step": 73450
},
{
"epoch": 21.422399720149254,
"grad_norm": 0.3812105655670166,
"learning_rate": 0.00034313560804899386,
"loss": 3.213,
"step": 73500
},
{
"epoch": 21.436975279850746,
"grad_norm": 0.37582188844680786,
"learning_rate": 0.0003429606299212598,
"loss": 3.2124,
"step": 73550
},
{
"epoch": 21.45155083955224,
"grad_norm": 0.4020504951477051,
"learning_rate": 0.00034278565179352575,
"loss": 3.2213,
"step": 73600
},
{
"epoch": 21.46612639925373,
"grad_norm": 0.4116267263889313,
"learning_rate": 0.00034261067366579174,
"loss": 3.2287,
"step": 73650
},
{
"epoch": 21.480701958955223,
"grad_norm": 0.36419060826301575,
"learning_rate": 0.00034243569553805774,
"loss": 3.2206,
"step": 73700
},
{
"epoch": 21.495277518656717,
"grad_norm": 0.3853178322315216,
"learning_rate": 0.0003422607174103237,
"loss": 3.2365,
"step": 73750
},
{
"epoch": 21.50985307835821,
"grad_norm": 0.3858824670314789,
"learning_rate": 0.00034208573928258963,
"loss": 3.2259,
"step": 73800
},
{
"epoch": 21.524428638059703,
"grad_norm": 0.3743896484375,
"learning_rate": 0.0003419107611548556,
"loss": 3.2361,
"step": 73850
},
{
"epoch": 21.539004197761194,
"grad_norm": 0.38600412011146545,
"learning_rate": 0.0003417357830271216,
"loss": 3.2351,
"step": 73900
},
{
"epoch": 21.553579757462686,
"grad_norm": 0.41251125931739807,
"learning_rate": 0.0003415608048993875,
"loss": 3.2241,
"step": 73950
},
{
"epoch": 21.56815531716418,
"grad_norm": 0.4111970067024231,
"learning_rate": 0.0003413858267716535,
"loss": 3.2312,
"step": 74000
},
{
"epoch": 21.56815531716418,
"eval_accuracy": 0.3726626710121949,
"eval_loss": 3.5494892597198486,
"eval_runtime": 179.3043,
"eval_samples_per_second": 92.731,
"eval_steps_per_second": 5.8,
"step": 74000
},
{
"epoch": 21.58273087686567,
"grad_norm": 0.4084521532058716,
"learning_rate": 0.0003412108486439195,
"loss": 3.2352,
"step": 74050
},
{
"epoch": 21.597306436567163,
"grad_norm": 0.37888526916503906,
"learning_rate": 0.0003410358705161854,
"loss": 3.2443,
"step": 74100
},
{
"epoch": 21.611881996268657,
"grad_norm": 0.3779131770133972,
"learning_rate": 0.0003408608923884514,
"loss": 3.2356,
"step": 74150
},
{
"epoch": 21.62645755597015,
"grad_norm": 0.39611154794692993,
"learning_rate": 0.0003406859142607174,
"loss": 3.223,
"step": 74200
},
{
"epoch": 21.641033115671643,
"grad_norm": 0.3650185763835907,
"learning_rate": 0.0003405109361329834,
"loss": 3.2348,
"step": 74250
},
{
"epoch": 21.655608675373134,
"grad_norm": 0.40089526772499084,
"learning_rate": 0.0003403359580052493,
"loss": 3.23,
"step": 74300
},
{
"epoch": 21.670184235074625,
"grad_norm": 0.3926517069339752,
"learning_rate": 0.0003401609798775153,
"loss": 3.2305,
"step": 74350
},
{
"epoch": 21.68475979477612,
"grad_norm": 0.415294349193573,
"learning_rate": 0.0003399860017497813,
"loss": 3.2289,
"step": 74400
},
{
"epoch": 21.69933535447761,
"grad_norm": 0.3892729580402374,
"learning_rate": 0.0003398110236220472,
"loss": 3.2243,
"step": 74450
},
{
"epoch": 21.713910914179106,
"grad_norm": 0.394654780626297,
"learning_rate": 0.00033963604549431316,
"loss": 3.2401,
"step": 74500
},
{
"epoch": 21.728486473880597,
"grad_norm": 0.4018441438674927,
"learning_rate": 0.00033946106736657916,
"loss": 3.2441,
"step": 74550
},
{
"epoch": 21.74306203358209,
"grad_norm": 0.36875492334365845,
"learning_rate": 0.0003392860892388451,
"loss": 3.2437,
"step": 74600
},
{
"epoch": 21.757637593283583,
"grad_norm": 0.37119260430336,
"learning_rate": 0.0003391111111111111,
"loss": 3.2423,
"step": 74650
},
{
"epoch": 21.772213152985074,
"grad_norm": 0.40055206418037415,
"learning_rate": 0.00033893613298337705,
"loss": 3.2489,
"step": 74700
},
{
"epoch": 21.786788712686565,
"grad_norm": 0.40984848141670227,
"learning_rate": 0.000338761154855643,
"loss": 3.2415,
"step": 74750
},
{
"epoch": 21.80136427238806,
"grad_norm": 0.3929216265678406,
"learning_rate": 0.000338586176727909,
"loss": 3.2455,
"step": 74800
},
{
"epoch": 21.81593983208955,
"grad_norm": 0.3937998414039612,
"learning_rate": 0.00033841119860017493,
"loss": 3.2448,
"step": 74850
},
{
"epoch": 21.830515391791046,
"grad_norm": 0.38548168540000916,
"learning_rate": 0.0003382362204724409,
"loss": 3.2507,
"step": 74900
},
{
"epoch": 21.845090951492537,
"grad_norm": 0.47423017024993896,
"learning_rate": 0.00033806124234470687,
"loss": 3.2418,
"step": 74950
},
{
"epoch": 21.85966651119403,
"grad_norm": 0.4023411273956299,
"learning_rate": 0.00033788626421697287,
"loss": 3.2569,
"step": 75000
},
{
"epoch": 21.85966651119403,
"eval_accuracy": 0.3730542487597978,
"eval_loss": 3.5411148071289062,
"eval_runtime": 179.3158,
"eval_samples_per_second": 92.725,
"eval_steps_per_second": 5.8,
"step": 75000
},
{
"epoch": 21.874242070895523,
"grad_norm": 0.3734757602214813,
"learning_rate": 0.0003377112860892388,
"loss": 3.2426,
"step": 75050
},
{
"epoch": 21.888817630597014,
"grad_norm": 0.36795511841773987,
"learning_rate": 0.00033753630796150476,
"loss": 3.2502,
"step": 75100
},
{
"epoch": 21.90339319029851,
"grad_norm": 0.40975630283355713,
"learning_rate": 0.00033736132983377075,
"loss": 3.2659,
"step": 75150
},
{
"epoch": 21.91796875,
"grad_norm": 0.44392845034599304,
"learning_rate": 0.00033718635170603675,
"loss": 3.2527,
"step": 75200
},
{
"epoch": 21.93254430970149,
"grad_norm": 0.38623732328414917,
"learning_rate": 0.00033701137357830264,
"loss": 3.2656,
"step": 75250
},
{
"epoch": 21.947119869402986,
"grad_norm": 0.39194947481155396,
"learning_rate": 0.00033683639545056864,
"loss": 3.2549,
"step": 75300
},
{
"epoch": 21.961695429104477,
"grad_norm": 0.41067978739738464,
"learning_rate": 0.00033666141732283464,
"loss": 3.2378,
"step": 75350
},
{
"epoch": 21.97627098880597,
"grad_norm": 0.3773854672908783,
"learning_rate": 0.00033648643919510063,
"loss": 3.2617,
"step": 75400
},
{
"epoch": 21.990846548507463,
"grad_norm": 0.4150872528553009,
"learning_rate": 0.0003363114610673665,
"loss": 3.254,
"step": 75450
},
{
"epoch": 22.005247201492537,
"grad_norm": 0.41412967443466187,
"learning_rate": 0.0003361364829396325,
"loss": 3.2217,
"step": 75500
},
{
"epoch": 22.01982276119403,
"grad_norm": 0.3943207859992981,
"learning_rate": 0.0003359615048118985,
"loss": 3.1477,
"step": 75550
},
{
"epoch": 22.034398320895523,
"grad_norm": 0.41345933079719543,
"learning_rate": 0.00033578652668416446,
"loss": 3.1496,
"step": 75600
},
{
"epoch": 22.048973880597014,
"grad_norm": 0.41332539916038513,
"learning_rate": 0.0003356115485564304,
"loss": 3.1548,
"step": 75650
},
{
"epoch": 22.06354944029851,
"grad_norm": 0.4008597731590271,
"learning_rate": 0.0003354365704286964,
"loss": 3.1602,
"step": 75700
},
{
"epoch": 22.078125,
"grad_norm": 0.4184229373931885,
"learning_rate": 0.00033526159230096235,
"loss": 3.158,
"step": 75750
},
{
"epoch": 22.09270055970149,
"grad_norm": 0.41302576661109924,
"learning_rate": 0.0003350866141732283,
"loss": 3.1706,
"step": 75800
},
{
"epoch": 22.107276119402986,
"grad_norm": 0.4041043519973755,
"learning_rate": 0.0003349116360454943,
"loss": 3.1706,
"step": 75850
},
{
"epoch": 22.121851679104477,
"grad_norm": 0.37200263142585754,
"learning_rate": 0.00033473665791776023,
"loss": 3.1662,
"step": 75900
},
{
"epoch": 22.13642723880597,
"grad_norm": 0.42748311161994934,
"learning_rate": 0.00033456167979002623,
"loss": 3.1711,
"step": 75950
},
{
"epoch": 22.151002798507463,
"grad_norm": 0.4101450741291046,
"learning_rate": 0.00033438670166229217,
"loss": 3.1789,
"step": 76000
},
{
"epoch": 22.151002798507463,
"eval_accuracy": 0.37194919211049104,
"eval_loss": 3.5588977336883545,
"eval_runtime": 179.9243,
"eval_samples_per_second": 92.411,
"eval_steps_per_second": 5.78,
"step": 76000
},
{
"epoch": 22.165578358208954,
"grad_norm": 0.4208793640136719,
"learning_rate": 0.0003342117235345581,
"loss": 3.1745,
"step": 76050
},
{
"epoch": 22.18015391791045,
"grad_norm": 0.4094926118850708,
"learning_rate": 0.0003340367454068241,
"loss": 3.1825,
"step": 76100
},
{
"epoch": 22.19472947761194,
"grad_norm": 0.40947505831718445,
"learning_rate": 0.0003338617672790901,
"loss": 3.1789,
"step": 76150
},
{
"epoch": 22.20930503731343,
"grad_norm": 0.4133168160915375,
"learning_rate": 0.000333686789151356,
"loss": 3.1793,
"step": 76200
},
{
"epoch": 22.223880597014926,
"grad_norm": 0.4098125696182251,
"learning_rate": 0.000333511811023622,
"loss": 3.1828,
"step": 76250
},
{
"epoch": 22.238456156716417,
"grad_norm": 0.40815654397010803,
"learning_rate": 0.000333336832895888,
"loss": 3.1904,
"step": 76300
},
{
"epoch": 22.25303171641791,
"grad_norm": 0.39737969636917114,
"learning_rate": 0.000333161854768154,
"loss": 3.1966,
"step": 76350
},
{
"epoch": 22.267607276119403,
"grad_norm": 0.43529120087623596,
"learning_rate": 0.0003329868766404199,
"loss": 3.1996,
"step": 76400
},
{
"epoch": 22.282182835820894,
"grad_norm": 0.4474215507507324,
"learning_rate": 0.0003328118985126859,
"loss": 3.185,
"step": 76450
},
{
"epoch": 22.29675839552239,
"grad_norm": 0.4055737853050232,
"learning_rate": 0.0003326369203849519,
"loss": 3.1962,
"step": 76500
},
{
"epoch": 22.31133395522388,
"grad_norm": 0.39701855182647705,
"learning_rate": 0.0003324619422572179,
"loss": 3.2104,
"step": 76550
},
{
"epoch": 22.325909514925375,
"grad_norm": 0.3831678628921509,
"learning_rate": 0.00033228696412948377,
"loss": 3.1951,
"step": 76600
},
{
"epoch": 22.340485074626866,
"grad_norm": 0.3953789472579956,
"learning_rate": 0.00033211198600174976,
"loss": 3.2057,
"step": 76650
},
{
"epoch": 22.355060634328357,
"grad_norm": 0.3927507996559143,
"learning_rate": 0.00033193700787401576,
"loss": 3.204,
"step": 76700
},
{
"epoch": 22.36963619402985,
"grad_norm": 0.43750348687171936,
"learning_rate": 0.00033176202974628165,
"loss": 3.2142,
"step": 76750
},
{
"epoch": 22.384211753731343,
"grad_norm": 0.4124338626861572,
"learning_rate": 0.00033158705161854765,
"loss": 3.2031,
"step": 76800
},
{
"epoch": 22.398787313432837,
"grad_norm": 0.4088267385959625,
"learning_rate": 0.00033141207349081365,
"loss": 3.2098,
"step": 76850
},
{
"epoch": 22.41336287313433,
"grad_norm": 0.39260512590408325,
"learning_rate": 0.0003312370953630796,
"loss": 3.2081,
"step": 76900
},
{
"epoch": 22.42793843283582,
"grad_norm": 0.409219890832901,
"learning_rate": 0.00033106211723534553,
"loss": 3.2058,
"step": 76950
},
{
"epoch": 22.442513992537314,
"grad_norm": 0.40991613268852234,
"learning_rate": 0.00033088713910761153,
"loss": 3.202,
"step": 77000
},
{
"epoch": 22.442513992537314,
"eval_accuracy": 0.37240091299876876,
"eval_loss": 3.5495948791503906,
"eval_runtime": 179.3301,
"eval_samples_per_second": 92.717,
"eval_steps_per_second": 5.799,
"step": 77000
},
{
"epoch": 22.457089552238806,
"grad_norm": 0.41137585043907166,
"learning_rate": 0.0003307121609798775,
"loss": 3.2186,
"step": 77050
},
{
"epoch": 22.471665111940297,
"grad_norm": 0.4068697392940521,
"learning_rate": 0.00033053718285214347,
"loss": 3.2289,
"step": 77100
},
{
"epoch": 22.48624067164179,
"grad_norm": 0.40242230892181396,
"learning_rate": 0.0003303622047244094,
"loss": 3.2167,
"step": 77150
},
{
"epoch": 22.500816231343283,
"grad_norm": 0.4122253656387329,
"learning_rate": 0.00033018722659667536,
"loss": 3.2109,
"step": 77200
},
{
"epoch": 22.515391791044777,
"grad_norm": 0.39489126205444336,
"learning_rate": 0.00033001224846894136,
"loss": 3.2146,
"step": 77250
},
{
"epoch": 22.52996735074627,
"grad_norm": 0.38545411825180054,
"learning_rate": 0.00032983727034120735,
"loss": 3.2177,
"step": 77300
},
{
"epoch": 22.54454291044776,
"grad_norm": 0.39428555965423584,
"learning_rate": 0.00032966229221347324,
"loss": 3.2321,
"step": 77350
},
{
"epoch": 22.559118470149254,
"grad_norm": 0.3956550657749176,
"learning_rate": 0.00032948731408573924,
"loss": 3.2223,
"step": 77400
},
{
"epoch": 22.573694029850746,
"grad_norm": 0.4145047962665558,
"learning_rate": 0.00032931233595800524,
"loss": 3.2189,
"step": 77450
},
{
"epoch": 22.58826958955224,
"grad_norm": 0.4277302920818329,
"learning_rate": 0.00032913735783027124,
"loss": 3.2234,
"step": 77500
},
{
"epoch": 22.60284514925373,
"grad_norm": 0.37812212109565735,
"learning_rate": 0.0003289623797025371,
"loss": 3.2148,
"step": 77550
},
{
"epoch": 22.617420708955223,
"grad_norm": 0.40783196687698364,
"learning_rate": 0.0003287874015748031,
"loss": 3.2286,
"step": 77600
},
{
"epoch": 22.631996268656717,
"grad_norm": 0.3943743407726288,
"learning_rate": 0.0003286124234470691,
"loss": 3.2179,
"step": 77650
},
{
"epoch": 22.64657182835821,
"grad_norm": 0.42744752764701843,
"learning_rate": 0.000328437445319335,
"loss": 3.2319,
"step": 77700
},
{
"epoch": 22.661147388059703,
"grad_norm": 0.4282462000846863,
"learning_rate": 0.000328262467191601,
"loss": 3.2319,
"step": 77750
},
{
"epoch": 22.675722947761194,
"grad_norm": 0.3943832218647003,
"learning_rate": 0.000328087489063867,
"loss": 3.2355,
"step": 77800
},
{
"epoch": 22.690298507462686,
"grad_norm": 0.41673794388771057,
"learning_rate": 0.000327912510936133,
"loss": 3.2374,
"step": 77850
},
{
"epoch": 22.70487406716418,
"grad_norm": 0.396979957818985,
"learning_rate": 0.0003277375328083989,
"loss": 3.2315,
"step": 77900
},
{
"epoch": 22.71944962686567,
"grad_norm": 0.3709559440612793,
"learning_rate": 0.0003275625546806649,
"loss": 3.2202,
"step": 77950
},
{
"epoch": 22.734025186567163,
"grad_norm": 0.3927803039550781,
"learning_rate": 0.0003273875765529309,
"loss": 3.2272,
"step": 78000
},
{
"epoch": 22.734025186567163,
"eval_accuracy": 0.37302141131117106,
"eval_loss": 3.542844772338867,
"eval_runtime": 181.1051,
"eval_samples_per_second": 91.809,
"eval_steps_per_second": 5.743,
"step": 78000
},
{
"epoch": 22.748600746268657,
"grad_norm": 0.4058307707309723,
"learning_rate": 0.00032721259842519683,
"loss": 3.2307,
"step": 78050
},
{
"epoch": 22.76317630597015,
"grad_norm": 0.4223111569881439,
"learning_rate": 0.0003270376202974628,
"loss": 3.2307,
"step": 78100
},
{
"epoch": 22.777751865671643,
"grad_norm": 0.4018338620662689,
"learning_rate": 0.00032686264216972877,
"loss": 3.2383,
"step": 78150
},
{
"epoch": 22.792327425373134,
"grad_norm": 0.3855460584163666,
"learning_rate": 0.0003266876640419947,
"loss": 3.2252,
"step": 78200
},
{
"epoch": 22.806902985074625,
"grad_norm": 0.3729098439216614,
"learning_rate": 0.0003265126859142607,
"loss": 3.2308,
"step": 78250
},
{
"epoch": 22.82147854477612,
"grad_norm": 0.4244031012058258,
"learning_rate": 0.00032633770778652666,
"loss": 3.2405,
"step": 78300
},
{
"epoch": 22.83605410447761,
"grad_norm": 0.38624075055122375,
"learning_rate": 0.0003261627296587926,
"loss": 3.2393,
"step": 78350
},
{
"epoch": 22.850629664179106,
"grad_norm": 0.38735702633857727,
"learning_rate": 0.0003259877515310586,
"loss": 3.2464,
"step": 78400
},
{
"epoch": 22.865205223880597,
"grad_norm": 0.423250675201416,
"learning_rate": 0.00032581277340332454,
"loss": 3.2473,
"step": 78450
},
{
"epoch": 22.87978078358209,
"grad_norm": 0.3985595703125,
"learning_rate": 0.0003256377952755905,
"loss": 3.235,
"step": 78500
},
{
"epoch": 22.894356343283583,
"grad_norm": 0.4058718979358673,
"learning_rate": 0.0003254628171478565,
"loss": 3.2463,
"step": 78550
},
{
"epoch": 22.908931902985074,
"grad_norm": 0.4019133150577545,
"learning_rate": 0.0003252878390201225,
"loss": 3.2384,
"step": 78600
},
{
"epoch": 22.923507462686565,
"grad_norm": 0.37895283102989197,
"learning_rate": 0.00032511286089238837,
"loss": 3.248,
"step": 78650
},
{
"epoch": 22.93808302238806,
"grad_norm": 0.42432135343551636,
"learning_rate": 0.00032493788276465437,
"loss": 3.2476,
"step": 78700
},
{
"epoch": 22.95265858208955,
"grad_norm": 0.3959765136241913,
"learning_rate": 0.00032476290463692036,
"loss": 3.2421,
"step": 78750
},
{
"epoch": 22.967234141791046,
"grad_norm": 0.40325257182121277,
"learning_rate": 0.00032458792650918636,
"loss": 3.2377,
"step": 78800
},
{
"epoch": 22.981809701492537,
"grad_norm": 0.38290831446647644,
"learning_rate": 0.00032441294838145225,
"loss": 3.2383,
"step": 78850
},
{
"epoch": 22.99638526119403,
"grad_norm": 0.45766332745552063,
"learning_rate": 0.00032423797025371825,
"loss": 3.2512,
"step": 78900
},
{
"epoch": 23.010785914179106,
"grad_norm": 0.38711297512054443,
"learning_rate": 0.00032406299212598425,
"loss": 3.1644,
"step": 78950
},
{
"epoch": 23.025361473880597,
"grad_norm": 0.4188516139984131,
"learning_rate": 0.0003238880139982502,
"loss": 3.1514,
"step": 79000
},
{
"epoch": 23.025361473880597,
"eval_accuracy": 0.3726237133222471,
"eval_loss": 3.5549256801605225,
"eval_runtime": 179.4078,
"eval_samples_per_second": 92.677,
"eval_steps_per_second": 5.797,
"step": 79000
},
{
"epoch": 23.03993703358209,
"grad_norm": 0.42309534549713135,
"learning_rate": 0.00032371303587051613,
"loss": 3.1538,
"step": 79050
},
{
"epoch": 23.054512593283583,
"grad_norm": 0.402117520570755,
"learning_rate": 0.00032353805774278213,
"loss": 3.1334,
"step": 79100
},
{
"epoch": 23.069088152985074,
"grad_norm": 0.41363510489463806,
"learning_rate": 0.00032336307961504813,
"loss": 3.1541,
"step": 79150
},
{
"epoch": 23.08366371268657,
"grad_norm": 0.40389856696128845,
"learning_rate": 0.0003231881014873141,
"loss": 3.1565,
"step": 79200
},
{
"epoch": 23.09823927238806,
"grad_norm": 0.37992772459983826,
"learning_rate": 0.00032301312335958,
"loss": 3.1577,
"step": 79250
},
{
"epoch": 23.11281483208955,
"grad_norm": 0.4105381667613983,
"learning_rate": 0.000322838145231846,
"loss": 3.1503,
"step": 79300
},
{
"epoch": 23.127390391791046,
"grad_norm": 0.3969649374485016,
"learning_rate": 0.00032266316710411196,
"loss": 3.1627,
"step": 79350
},
{
"epoch": 23.141965951492537,
"grad_norm": 0.4166598320007324,
"learning_rate": 0.0003224881889763779,
"loss": 3.1695,
"step": 79400
},
{
"epoch": 23.15654151119403,
"grad_norm": 0.4184854328632355,
"learning_rate": 0.0003223132108486439,
"loss": 3.1737,
"step": 79450
},
{
"epoch": 23.171117070895523,
"grad_norm": 0.392345666885376,
"learning_rate": 0.00032213823272090984,
"loss": 3.1738,
"step": 79500
},
{
"epoch": 23.185692630597014,
"grad_norm": 0.39596453309059143,
"learning_rate": 0.00032196325459317584,
"loss": 3.187,
"step": 79550
},
{
"epoch": 23.20026819029851,
"grad_norm": 0.3976131081581116,
"learning_rate": 0.0003217882764654418,
"loss": 3.174,
"step": 79600
},
{
"epoch": 23.21484375,
"grad_norm": 0.409440279006958,
"learning_rate": 0.0003216132983377077,
"loss": 3.1926,
"step": 79650
},
{
"epoch": 23.22941930970149,
"grad_norm": 0.43629515171051025,
"learning_rate": 0.0003214383202099737,
"loss": 3.173,
"step": 79700
},
{
"epoch": 23.243994869402986,
"grad_norm": 0.40185999870300293,
"learning_rate": 0.0003212633420822397,
"loss": 3.1817,
"step": 79750
},
{
"epoch": 23.258570429104477,
"grad_norm": 0.39592161774635315,
"learning_rate": 0.0003210883639545056,
"loss": 3.1945,
"step": 79800
},
{
"epoch": 23.27314598880597,
"grad_norm": 0.4166530668735504,
"learning_rate": 0.0003209133858267716,
"loss": 3.1855,
"step": 79850
},
{
"epoch": 23.287721548507463,
"grad_norm": 0.4228636920452118,
"learning_rate": 0.0003207384076990376,
"loss": 3.1971,
"step": 79900
},
{
"epoch": 23.302297108208954,
"grad_norm": 0.41409003734588623,
"learning_rate": 0.0003205634295713036,
"loss": 3.1935,
"step": 79950
},
{
"epoch": 23.31687266791045,
"grad_norm": 0.41710057854652405,
"learning_rate": 0.0003203884514435695,
"loss": 3.1818,
"step": 80000
},
{
"epoch": 23.31687266791045,
"eval_accuracy": 0.3723583067034179,
"eval_loss": 3.553128957748413,
"eval_runtime": 179.3491,
"eval_samples_per_second": 92.707,
"eval_steps_per_second": 5.799,
"step": 80000
},
{
"epoch": 23.33144822761194,
"grad_norm": 0.42076575756073,
"learning_rate": 0.0003202134733158355,
"loss": 3.1417,
"step": 80050
},
{
"epoch": 23.346023787313435,
"grad_norm": 0.3984379470348358,
"learning_rate": 0.0003200384951881015,
"loss": 3.1554,
"step": 80100
},
{
"epoch": 23.360599347014926,
"grad_norm": 0.42404845356941223,
"learning_rate": 0.00031986351706036743,
"loss": 3.1558,
"step": 80150
},
{
"epoch": 23.375174906716417,
"grad_norm": 0.4230729937553406,
"learning_rate": 0.0003196885389326334,
"loss": 3.1552,
"step": 80200
},
{
"epoch": 23.38975046641791,
"grad_norm": 0.4097963273525238,
"learning_rate": 0.0003195135608048994,
"loss": 3.1626,
"step": 80250
},
{
"epoch": 23.404326026119403,
"grad_norm": 0.4160501956939697,
"learning_rate": 0.0003193385826771653,
"loss": 3.1572,
"step": 80300
},
{
"epoch": 23.418901585820894,
"grad_norm": 0.39919206500053406,
"learning_rate": 0.00031916360454943126,
"loss": 3.1587,
"step": 80350
},
{
"epoch": 23.43347714552239,
"grad_norm": 0.3985087275505066,
"learning_rate": 0.00031898862642169726,
"loss": 3.1623,
"step": 80400
},
{
"epoch": 23.44805270522388,
"grad_norm": 0.40297672152519226,
"learning_rate": 0.00031881364829396326,
"loss": 3.176,
"step": 80450
},
{
"epoch": 23.462628264925375,
"grad_norm": 0.42020031809806824,
"learning_rate": 0.0003186386701662292,
"loss": 3.1747,
"step": 80500
},
{
"epoch": 23.477203824626866,
"grad_norm": 0.45471012592315674,
"learning_rate": 0.00031846369203849514,
"loss": 3.1711,
"step": 80550
},
{
"epoch": 23.491779384328357,
"grad_norm": 0.4380848705768585,
"learning_rate": 0.00031828871391076114,
"loss": 3.18,
"step": 80600
},
{
"epoch": 23.50635494402985,
"grad_norm": 0.4077417254447937,
"learning_rate": 0.0003181137357830271,
"loss": 3.1749,
"step": 80650
},
{
"epoch": 23.520930503731343,
"grad_norm": 0.41786298155784607,
"learning_rate": 0.0003179387576552931,
"loss": 3.1713,
"step": 80700
},
{
"epoch": 23.535506063432837,
"grad_norm": 0.4201817512512207,
"learning_rate": 0.000317763779527559,
"loss": 3.1766,
"step": 80750
},
{
"epoch": 23.55008162313433,
"grad_norm": 0.3985823690891266,
"learning_rate": 0.00031758880139982497,
"loss": 3.1784,
"step": 80800
},
{
"epoch": 23.56465718283582,
"grad_norm": 0.4040225148200989,
"learning_rate": 0.00031741382327209097,
"loss": 3.1852,
"step": 80850
},
{
"epoch": 23.579232742537314,
"grad_norm": 0.4432525634765625,
"learning_rate": 0.00031723884514435696,
"loss": 3.1815,
"step": 80900
},
{
"epoch": 23.593808302238806,
"grad_norm": 0.40925121307373047,
"learning_rate": 0.00031706386701662285,
"loss": 3.1782,
"step": 80950
},
{
"epoch": 23.608383861940297,
"grad_norm": 0.40709611773490906,
"learning_rate": 0.00031688888888888885,
"loss": 3.1943,
"step": 81000
},
{
"epoch": 23.608383861940297,
"eval_accuracy": 0.372661964830504,
"eval_loss": 3.5582470893859863,
"eval_runtime": 82.6312,
"eval_samples_per_second": 201.219,
"eval_steps_per_second": 12.586,
"step": 81000
},
{
"epoch": 23.62295942164179,
"grad_norm": 0.3898576498031616,
"learning_rate": 0.00031671391076115485,
"loss": 3.186,
"step": 81050
},
{
"epoch": 23.637534981343283,
"grad_norm": 0.4021594524383545,
"learning_rate": 0.00031653893263342085,
"loss": 3.2017,
"step": 81100
},
{
"epoch": 23.652110541044777,
"grad_norm": 0.42967769503593445,
"learning_rate": 0.00031636395450568674,
"loss": 3.2032,
"step": 81150
},
{
"epoch": 23.66668610074627,
"grad_norm": 0.4002310037612915,
"learning_rate": 0.00031618897637795273,
"loss": 3.197,
"step": 81200
},
{
"epoch": 23.68126166044776,
"grad_norm": 0.4102286398410797,
"learning_rate": 0.00031601399825021873,
"loss": 3.194,
"step": 81250
},
{
"epoch": 23.695837220149254,
"grad_norm": 0.3905355632305145,
"learning_rate": 0.0003158390201224846,
"loss": 3.2018,
"step": 81300
},
{
"epoch": 23.710412779850746,
"grad_norm": 0.4037608802318573,
"learning_rate": 0.0003156640419947506,
"loss": 3.1943,
"step": 81350
},
{
"epoch": 23.72498833955224,
"grad_norm": 0.44810715317726135,
"learning_rate": 0.0003154890638670166,
"loss": 3.1964,
"step": 81400
},
{
"epoch": 23.73956389925373,
"grad_norm": 0.4065055549144745,
"learning_rate": 0.00031531408573928256,
"loss": 3.2001,
"step": 81450
},
{
"epoch": 23.754139458955223,
"grad_norm": 0.39197850227355957,
"learning_rate": 0.0003151391076115485,
"loss": 3.1975,
"step": 81500
},
{
"epoch": 23.768715018656717,
"grad_norm": 0.4424740970134735,
"learning_rate": 0.0003149641294838145,
"loss": 3.1983,
"step": 81550
},
{
"epoch": 23.78329057835821,
"grad_norm": 0.42625612020492554,
"learning_rate": 0.00031478915135608044,
"loss": 3.2098,
"step": 81600
},
{
"epoch": 23.797866138059703,
"grad_norm": 0.39148834347724915,
"learning_rate": 0.00031461417322834644,
"loss": 3.204,
"step": 81650
},
{
"epoch": 23.812441697761194,
"grad_norm": 0.42868733406066895,
"learning_rate": 0.0003144391951006124,
"loss": 3.219,
"step": 81700
},
{
"epoch": 23.827017257462686,
"grad_norm": 0.43013501167297363,
"learning_rate": 0.0003142642169728784,
"loss": 3.2065,
"step": 81750
},
{
"epoch": 23.84159281716418,
"grad_norm": 0.40283554792404175,
"learning_rate": 0.0003140892388451443,
"loss": 3.2081,
"step": 81800
},
{
"epoch": 23.85616837686567,
"grad_norm": 0.421036034822464,
"learning_rate": 0.0003139142607174103,
"loss": 3.2085,
"step": 81850
},
{
"epoch": 23.870743936567163,
"grad_norm": 0.40139567852020264,
"learning_rate": 0.00031373928258967627,
"loss": 3.2001,
"step": 81900
},
{
"epoch": 23.885319496268657,
"grad_norm": 0.4077606797218323,
"learning_rate": 0.0003135643044619422,
"loss": 3.2076,
"step": 81950
},
{
"epoch": 23.89989505597015,
"grad_norm": 0.40206170082092285,
"learning_rate": 0.0003133893263342082,
"loss": 3.1961,
"step": 82000
},
{
"epoch": 23.89989505597015,
"eval_accuracy": 0.3726993924601216,
"eval_loss": 3.550748109817505,
"eval_runtime": 81.036,
"eval_samples_per_second": 205.18,
"eval_steps_per_second": 12.834,
"step": 82000
},
{
"epoch": 23.914470615671643,
"grad_norm": 0.43046948313713074,
"learning_rate": 0.00031321434820647415,
"loss": 3.2226,
"step": 82050
},
{
"epoch": 23.929046175373134,
"grad_norm": 0.39953580498695374,
"learning_rate": 0.0003130393700787401,
"loss": 3.2032,
"step": 82100
},
{
"epoch": 23.943621735074625,
"grad_norm": 0.3996121883392334,
"learning_rate": 0.0003128643919510061,
"loss": 3.2241,
"step": 82150
},
{
"epoch": 23.95819729477612,
"grad_norm": 0.38362568616867065,
"learning_rate": 0.0003126894138232721,
"loss": 3.211,
"step": 82200
},
{
"epoch": 23.97277285447761,
"grad_norm": 0.3902480900287628,
"learning_rate": 0.000312514435695538,
"loss": 3.2343,
"step": 82250
},
{
"epoch": 23.987348414179106,
"grad_norm": 0.37919190526008606,
"learning_rate": 0.000312339457567804,
"loss": 3.2164,
"step": 82300
},
{
"epoch": 24.00204057835821,
"grad_norm": 0.4392635226249695,
"learning_rate": 0.00031216447944007,
"loss": 3.2676,
"step": 82350
},
{
"epoch": 24.016616138059703,
"grad_norm": 0.43110543489456177,
"learning_rate": 0.000311989501312336,
"loss": 3.1351,
"step": 82400
},
{
"epoch": 24.031191697761194,
"grad_norm": 0.4461026191711426,
"learning_rate": 0.00031181452318460186,
"loss": 3.1418,
"step": 82450
},
{
"epoch": 24.045767257462686,
"grad_norm": 0.4295453429222107,
"learning_rate": 0.00031163954505686786,
"loss": 3.1488,
"step": 82500
},
{
"epoch": 24.06034281716418,
"grad_norm": 0.4268800616264343,
"learning_rate": 0.00031146456692913386,
"loss": 3.1561,
"step": 82550
},
{
"epoch": 24.07491837686567,
"grad_norm": 0.4661655128002167,
"learning_rate": 0.0003112895888013998,
"loss": 3.155,
"step": 82600
},
{
"epoch": 24.089493936567163,
"grad_norm": 0.4239647388458252,
"learning_rate": 0.00031111461067366575,
"loss": 3.1525,
"step": 82650
},
{
"epoch": 24.104069496268657,
"grad_norm": 0.4007156789302826,
"learning_rate": 0.00031093963254593174,
"loss": 3.1583,
"step": 82700
},
{
"epoch": 24.11864505597015,
"grad_norm": 0.4243454039096832,
"learning_rate": 0.0003107646544181977,
"loss": 3.148,
"step": 82750
},
{
"epoch": 24.133220615671643,
"grad_norm": 0.43287578225135803,
"learning_rate": 0.0003105896762904637,
"loss": 3.1769,
"step": 82800
},
{
"epoch": 24.147796175373134,
"grad_norm": 0.4111781418323517,
"learning_rate": 0.00031041469816272963,
"loss": 3.1748,
"step": 82850
},
{
"epoch": 24.162371735074625,
"grad_norm": 0.3968662917613983,
"learning_rate": 0.00031023972003499557,
"loss": 3.1692,
"step": 82900
},
{
"epoch": 24.17694729477612,
"grad_norm": 0.40891578793525696,
"learning_rate": 0.00031006474190726157,
"loss": 3.1799,
"step": 82950
},
{
"epoch": 24.19152285447761,
"grad_norm": 0.4242601990699768,
"learning_rate": 0.0003098897637795275,
"loss": 3.1603,
"step": 83000
},
{
"epoch": 24.19152285447761,
"eval_accuracy": 0.3723005175017128,
"eval_loss": 3.563079357147217,
"eval_runtime": 80.9798,
"eval_samples_per_second": 205.323,
"eval_steps_per_second": 12.843,
"step": 83000
},
{
"epoch": 24.206098414179106,
"grad_norm": 0.44086217880249023,
"learning_rate": 0.0003097147856517935,
"loss": 3.1747,
"step": 83050
},
{
"epoch": 24.220673973880597,
"grad_norm": 0.4039420485496521,
"learning_rate": 0.00030953980752405945,
"loss": 3.1845,
"step": 83100
},
{
"epoch": 24.23524953358209,
"grad_norm": 0.4216541051864624,
"learning_rate": 0.00030936482939632545,
"loss": 3.176,
"step": 83150
},
{
"epoch": 24.249825093283583,
"grad_norm": 0.41587021946907043,
"learning_rate": 0.0003091898512685914,
"loss": 3.1717,
"step": 83200
},
{
"epoch": 24.264400652985074,
"grad_norm": 0.404705286026001,
"learning_rate": 0.00030901487314085734,
"loss": 3.1731,
"step": 83250
},
{
"epoch": 24.278976212686565,
"grad_norm": 0.42235511541366577,
"learning_rate": 0.00030883989501312334,
"loss": 3.1763,
"step": 83300
},
{
"epoch": 24.29355177238806,
"grad_norm": 0.47101321816444397,
"learning_rate": 0.00030866491688538933,
"loss": 3.1887,
"step": 83350
},
{
"epoch": 24.30812733208955,
"grad_norm": 0.3986760675907135,
"learning_rate": 0.0003084899387576552,
"loss": 3.1907,
"step": 83400
},
{
"epoch": 24.322702891791046,
"grad_norm": 0.41290879249572754,
"learning_rate": 0.0003083149606299212,
"loss": 3.1921,
"step": 83450
},
{
"epoch": 24.337278451492537,
"grad_norm": 0.4050918519496918,
"learning_rate": 0.0003081399825021872,
"loss": 3.1869,
"step": 83500
},
{
"epoch": 24.35185401119403,
"grad_norm": 0.41474130749702454,
"learning_rate": 0.0003079650043744532,
"loss": 3.1767,
"step": 83550
},
{
"epoch": 24.366429570895523,
"grad_norm": 0.3922690451145172,
"learning_rate": 0.0003077900262467191,
"loss": 3.1937,
"step": 83600
},
{
"epoch": 24.381005130597014,
"grad_norm": 0.3993853032588959,
"learning_rate": 0.0003076150481189851,
"loss": 3.199,
"step": 83650
},
{
"epoch": 24.39558069029851,
"grad_norm": 0.41509875655174255,
"learning_rate": 0.0003074400699912511,
"loss": 3.1946,
"step": 83700
},
{
"epoch": 24.41015625,
"grad_norm": 0.40810710191726685,
"learning_rate": 0.00030726509186351704,
"loss": 3.19,
"step": 83750
},
{
"epoch": 24.42473180970149,
"grad_norm": 0.43359851837158203,
"learning_rate": 0.000307090113735783,
"loss": 3.1959,
"step": 83800
},
{
"epoch": 24.439307369402986,
"grad_norm": 0.4185155928134918,
"learning_rate": 0.000306915135608049,
"loss": 3.1878,
"step": 83850
},
{
"epoch": 24.453882929104477,
"grad_norm": 0.41894349455833435,
"learning_rate": 0.00030674015748031493,
"loss": 3.1987,
"step": 83900
},
{
"epoch": 24.46845848880597,
"grad_norm": 0.4216829538345337,
"learning_rate": 0.00030656517935258087,
"loss": 3.2036,
"step": 83950
},
{
"epoch": 24.483034048507463,
"grad_norm": 0.4279070496559143,
"learning_rate": 0.00030639020122484687,
"loss": 3.1921,
"step": 84000
},
{
"epoch": 24.483034048507463,
"eval_accuracy": 0.3727687159627781,
"eval_loss": 3.5538084506988525,
"eval_runtime": 82.2654,
"eval_samples_per_second": 202.114,
"eval_steps_per_second": 12.642,
"step": 84000
},
{
"epoch": 24.497609608208954,
"grad_norm": 0.3983027935028076,
"learning_rate": 0.0003062152230971128,
"loss": 3.208,
"step": 84050
},
{
"epoch": 24.51218516791045,
"grad_norm": 0.44559016823768616,
"learning_rate": 0.0003060402449693788,
"loss": 3.2071,
"step": 84100
},
{
"epoch": 24.52676072761194,
"grad_norm": 0.40929609537124634,
"learning_rate": 0.00030586526684164475,
"loss": 3.2088,
"step": 84150
},
{
"epoch": 24.541336287313435,
"grad_norm": 0.42126762866973877,
"learning_rate": 0.0003056902887139107,
"loss": 3.2002,
"step": 84200
},
{
"epoch": 24.555911847014926,
"grad_norm": 0.4303717017173767,
"learning_rate": 0.0003055153105861767,
"loss": 3.2064,
"step": 84250
},
{
"epoch": 24.570487406716417,
"grad_norm": 0.39315250515937805,
"learning_rate": 0.0003053403324584427,
"loss": 3.2003,
"step": 84300
},
{
"epoch": 24.58506296641791,
"grad_norm": 0.4045599699020386,
"learning_rate": 0.00030516535433070864,
"loss": 3.2225,
"step": 84350
},
{
"epoch": 24.599638526119403,
"grad_norm": 0.44561147689819336,
"learning_rate": 0.0003049903762029746,
"loss": 3.2121,
"step": 84400
},
{
"epoch": 24.614214085820894,
"grad_norm": 0.38626208901405334,
"learning_rate": 0.0003048153980752406,
"loss": 3.2035,
"step": 84450
},
{
"epoch": 24.62878964552239,
"grad_norm": 0.40461352467536926,
"learning_rate": 0.0003046404199475066,
"loss": 3.2219,
"step": 84500
},
{
"epoch": 24.64336520522388,
"grad_norm": 0.4504241943359375,
"learning_rate": 0.00030446544181977247,
"loss": 3.2145,
"step": 84550
},
{
"epoch": 24.657940764925375,
"grad_norm": 0.40991654992103577,
"learning_rate": 0.00030429046369203846,
"loss": 3.2162,
"step": 84600
},
{
"epoch": 24.672516324626866,
"grad_norm": 0.45852404832839966,
"learning_rate": 0.00030411548556430446,
"loss": 3.2053,
"step": 84650
},
{
"epoch": 24.687091884328357,
"grad_norm": 0.39378029108047485,
"learning_rate": 0.00030394050743657046,
"loss": 3.2044,
"step": 84700
},
{
"epoch": 24.70166744402985,
"grad_norm": 0.4019394814968109,
"learning_rate": 0.00030376552930883635,
"loss": 3.2135,
"step": 84750
},
{
"epoch": 24.716243003731343,
"grad_norm": 0.40645793080329895,
"learning_rate": 0.00030359055118110235,
"loss": 3.2192,
"step": 84800
},
{
"epoch": 24.730818563432837,
"grad_norm": 0.43606048822402954,
"learning_rate": 0.00030341557305336834,
"loss": 3.2137,
"step": 84850
},
{
"epoch": 24.74539412313433,
"grad_norm": 0.4315530061721802,
"learning_rate": 0.00030324059492563423,
"loss": 3.2111,
"step": 84900
},
{
"epoch": 24.75996968283582,
"grad_norm": 0.4235876500606537,
"learning_rate": 0.00030306561679790023,
"loss": 3.2289,
"step": 84950
},
{
"epoch": 24.774545242537314,
"grad_norm": 0.40020737051963806,
"learning_rate": 0.00030289063867016623,
"loss": 3.2218,
"step": 85000
},
{
"epoch": 24.774545242537314,
"eval_accuracy": 0.37339298057753184,
"eval_loss": 3.542487621307373,
"eval_runtime": 82.3315,
"eval_samples_per_second": 201.952,
"eval_steps_per_second": 12.632,
"step": 85000
},
{
"epoch": 24.789120802238806,
"grad_norm": 0.44500306248664856,
"learning_rate": 0.00030271566054243217,
"loss": 3.2194,
"step": 85050
},
{
"epoch": 24.803696361940297,
"grad_norm": 0.3997686207294464,
"learning_rate": 0.0003025406824146981,
"loss": 3.2125,
"step": 85100
},
{
"epoch": 24.81827192164179,
"grad_norm": 0.4181106686592102,
"learning_rate": 0.0003023657042869641,
"loss": 3.2152,
"step": 85150
},
{
"epoch": 24.832847481343283,
"grad_norm": 0.40245386958122253,
"learning_rate": 0.00030219072615923006,
"loss": 3.2156,
"step": 85200
},
{
"epoch": 24.847423041044777,
"grad_norm": 0.4032931625843048,
"learning_rate": 0.00030201574803149605,
"loss": 3.2211,
"step": 85250
},
{
"epoch": 24.86199860074627,
"grad_norm": 0.45445653796195984,
"learning_rate": 0.000301840769903762,
"loss": 3.2241,
"step": 85300
},
{
"epoch": 24.87657416044776,
"grad_norm": 0.41785797476768494,
"learning_rate": 0.00030166579177602794,
"loss": 3.2274,
"step": 85350
},
{
"epoch": 24.891149720149254,
"grad_norm": 0.42762765288352966,
"learning_rate": 0.00030149081364829394,
"loss": 3.2243,
"step": 85400
},
{
"epoch": 24.905725279850746,
"grad_norm": 0.4247051179409027,
"learning_rate": 0.00030131583552055994,
"loss": 3.2184,
"step": 85450
},
{
"epoch": 24.92030083955224,
"grad_norm": 0.4020553231239319,
"learning_rate": 0.0003011408573928258,
"loss": 3.219,
"step": 85500
},
{
"epoch": 24.93487639925373,
"grad_norm": 0.41240084171295166,
"learning_rate": 0.0003009658792650918,
"loss": 3.225,
"step": 85550
},
{
"epoch": 24.949451958955223,
"grad_norm": 0.4297904968261719,
"learning_rate": 0.0003007909011373578,
"loss": 3.2173,
"step": 85600
},
{
"epoch": 24.964027518656717,
"grad_norm": 0.41779932379722595,
"learning_rate": 0.00030061592300962376,
"loss": 3.2068,
"step": 85650
},
{
"epoch": 24.97860307835821,
"grad_norm": 0.40487775206565857,
"learning_rate": 0.0003004409448818897,
"loss": 3.2201,
"step": 85700
},
{
"epoch": 24.993178638059703,
"grad_norm": 0.3969780206680298,
"learning_rate": 0.0003002659667541557,
"loss": 3.2145,
"step": 85750
},
{
"epoch": 25.007579291044777,
"grad_norm": 0.44225025177001953,
"learning_rate": 0.0003000909886264217,
"loss": 3.1819,
"step": 85800
},
{
"epoch": 25.02215485074627,
"grad_norm": 0.42038342356681824,
"learning_rate": 0.00029991601049868765,
"loss": 3.1194,
"step": 85850
},
{
"epoch": 25.03673041044776,
"grad_norm": 0.42238423228263855,
"learning_rate": 0.0002997410323709536,
"loss": 3.1306,
"step": 85900
},
{
"epoch": 25.051305970149254,
"grad_norm": 0.4110691249370575,
"learning_rate": 0.0002995660542432196,
"loss": 3.1498,
"step": 85950
},
{
"epoch": 25.065881529850746,
"grad_norm": 0.38763201236724854,
"learning_rate": 0.00029939107611548553,
"loss": 3.1492,
"step": 86000
},
{
"epoch": 25.065881529850746,
"eval_accuracy": 0.3726318344116924,
"eval_loss": 3.5576071739196777,
"eval_runtime": 82.1603,
"eval_samples_per_second": 202.373,
"eval_steps_per_second": 12.658,
"step": 86000
},
{
"epoch": 25.08045708955224,
"grad_norm": 0.4149741232395172,
"learning_rate": 0.00029921609798775153,
"loss": 3.1375,
"step": 86050
},
{
"epoch": 25.09503264925373,
"grad_norm": 0.4215436279773712,
"learning_rate": 0.00029904111986001747,
"loss": 3.136,
"step": 86100
},
{
"epoch": 25.109608208955223,
"grad_norm": 0.41278237104415894,
"learning_rate": 0.00029886614173228347,
"loss": 3.1427,
"step": 86150
},
{
"epoch": 25.124183768656717,
"grad_norm": 0.40497392416000366,
"learning_rate": 0.0002986911636045494,
"loss": 3.1452,
"step": 86200
},
{
"epoch": 25.13875932835821,
"grad_norm": 0.42836251854896545,
"learning_rate": 0.0002985161854768154,
"loss": 3.1472,
"step": 86250
},
{
"epoch": 25.153334888059703,
"grad_norm": 0.43723875284194946,
"learning_rate": 0.00029834120734908135,
"loss": 3.1573,
"step": 86300
},
{
"epoch": 25.167910447761194,
"grad_norm": 0.4214979410171509,
"learning_rate": 0.0002981662292213473,
"loss": 3.1555,
"step": 86350
},
{
"epoch": 25.182486007462686,
"grad_norm": 0.4392107129096985,
"learning_rate": 0.0002979912510936133,
"loss": 3.1658,
"step": 86400
},
{
"epoch": 25.19706156716418,
"grad_norm": 0.481117308139801,
"learning_rate": 0.00029781627296587924,
"loss": 3.166,
"step": 86450
},
{
"epoch": 25.21163712686567,
"grad_norm": 0.43255847692489624,
"learning_rate": 0.0002976412948381452,
"loss": 3.1608,
"step": 86500
},
{
"epoch": 25.226212686567163,
"grad_norm": 0.4154725968837738,
"learning_rate": 0.0002974663167104112,
"loss": 3.166,
"step": 86550
},
{
"epoch": 25.240788246268657,
"grad_norm": 0.4099873900413513,
"learning_rate": 0.0002972913385826771,
"loss": 3.1486,
"step": 86600
},
{
"epoch": 25.25536380597015,
"grad_norm": 0.404623806476593,
"learning_rate": 0.00029711636045494307,
"loss": 3.1691,
"step": 86650
},
{
"epoch": 25.269939365671643,
"grad_norm": 0.4402453303337097,
"learning_rate": 0.00029694138232720906,
"loss": 3.1663,
"step": 86700
},
{
"epoch": 25.284514925373134,
"grad_norm": 0.4077349305152893,
"learning_rate": 0.000296766404199475,
"loss": 3.1617,
"step": 86750
},
{
"epoch": 25.299090485074625,
"grad_norm": 0.42283713817596436,
"learning_rate": 0.000296591426071741,
"loss": 3.1748,
"step": 86800
},
{
"epoch": 25.31366604477612,
"grad_norm": 0.4206119477748871,
"learning_rate": 0.00029641644794400695,
"loss": 3.1683,
"step": 86850
},
{
"epoch": 25.32824160447761,
"grad_norm": 0.4248672127723694,
"learning_rate": 0.00029624146981627295,
"loss": 3.1759,
"step": 86900
},
{
"epoch": 25.342817164179106,
"grad_norm": 0.4154689311981201,
"learning_rate": 0.0002960664916885389,
"loss": 3.1812,
"step": 86950
},
{
"epoch": 25.357392723880597,
"grad_norm": 0.4337415099143982,
"learning_rate": 0.0002958915135608049,
"loss": 3.1824,
"step": 87000
},
{
"epoch": 25.357392723880597,
"eval_accuracy": 0.37313086947326024,
"eval_loss": 3.550499200820923,
"eval_runtime": 82.2769,
"eval_samples_per_second": 202.086,
"eval_steps_per_second": 12.64,
"step": 87000
},
{
"epoch": 25.37196828358209,
"grad_norm": 0.39931657910346985,
"learning_rate": 0.00029571653543307083,
"loss": 3.1751,
"step": 87050
},
{
"epoch": 25.386543843283583,
"grad_norm": 0.429696649312973,
"learning_rate": 0.00029554155730533683,
"loss": 3.1885,
"step": 87100
},
{
"epoch": 25.401119402985074,
"grad_norm": 0.41605037450790405,
"learning_rate": 0.0002953665791776028,
"loss": 3.1804,
"step": 87150
},
{
"epoch": 25.415694962686565,
"grad_norm": 0.4128526449203491,
"learning_rate": 0.00029519160104986877,
"loss": 3.1737,
"step": 87200
},
{
"epoch": 25.43027052238806,
"grad_norm": 0.4425789713859558,
"learning_rate": 0.0002950166229221347,
"loss": 3.1946,
"step": 87250
},
{
"epoch": 25.44484608208955,
"grad_norm": 0.4459783732891083,
"learning_rate": 0.0002948416447944007,
"loss": 3.1741,
"step": 87300
},
{
"epoch": 25.459421641791046,
"grad_norm": 0.44101038575172424,
"learning_rate": 0.00029466666666666666,
"loss": 3.1864,
"step": 87350
},
{
"epoch": 25.473997201492537,
"grad_norm": 0.4281041920185089,
"learning_rate": 0.0002944916885389326,
"loss": 3.1938,
"step": 87400
},
{
"epoch": 25.48857276119403,
"grad_norm": 0.4300467073917389,
"learning_rate": 0.0002943167104111986,
"loss": 3.1894,
"step": 87450
},
{
"epoch": 25.503148320895523,
"grad_norm": 0.4109787940979004,
"learning_rate": 0.00029414173228346454,
"loss": 3.206,
"step": 87500
},
{
"epoch": 25.517723880597014,
"grad_norm": 0.4145699143409729,
"learning_rate": 0.00029396675415573054,
"loss": 3.1949,
"step": 87550
},
{
"epoch": 25.53229944029851,
"grad_norm": 0.4070979654788971,
"learning_rate": 0.0002937917760279965,
"loss": 3.189,
"step": 87600
},
{
"epoch": 25.546875,
"grad_norm": 0.43455639481544495,
"learning_rate": 0.0002936167979002624,
"loss": 3.1924,
"step": 87650
},
{
"epoch": 25.56145055970149,
"grad_norm": 0.4693121910095215,
"learning_rate": 0.0002934418197725284,
"loss": 3.1972,
"step": 87700
},
{
"epoch": 25.576026119402986,
"grad_norm": 0.40905654430389404,
"learning_rate": 0.00029326684164479437,
"loss": 3.1879,
"step": 87750
},
{
"epoch": 25.590601679104477,
"grad_norm": 0.399454265832901,
"learning_rate": 0.0002930918635170603,
"loss": 3.2004,
"step": 87800
},
{
"epoch": 25.60517723880597,
"grad_norm": 0.4239746034145355,
"learning_rate": 0.0002929168853893263,
"loss": 3.2113,
"step": 87850
},
{
"epoch": 25.619752798507463,
"grad_norm": 0.4122619330883026,
"learning_rate": 0.00029274190726159225,
"loss": 3.2005,
"step": 87900
},
{
"epoch": 25.634328358208954,
"grad_norm": 0.42600637674331665,
"learning_rate": 0.00029256692913385825,
"loss": 3.1965,
"step": 87950
},
{
"epoch": 25.64890391791045,
"grad_norm": 0.43111228942871094,
"learning_rate": 0.0002923919510061242,
"loss": 3.201,
"step": 88000
},
{
"epoch": 25.64890391791045,
"eval_accuracy": 0.37345865547478535,
"eval_loss": 3.546783924102783,
"eval_runtime": 82.2087,
"eval_samples_per_second": 202.254,
"eval_steps_per_second": 12.651,
"step": 88000
},
{
"epoch": 25.66347947761194,
"grad_norm": 0.4582037329673767,
"learning_rate": 0.0002922169728783902,
"loss": 3.2048,
"step": 88050
},
{
"epoch": 25.678055037313435,
"grad_norm": 0.4343770742416382,
"learning_rate": 0.00029204199475065613,
"loss": 3.1926,
"step": 88100
},
{
"epoch": 25.692630597014926,
"grad_norm": 0.4335322678089142,
"learning_rate": 0.00029186701662292213,
"loss": 3.1958,
"step": 88150
},
{
"epoch": 25.707206156716417,
"grad_norm": 0.41262751817703247,
"learning_rate": 0.0002916920384951881,
"loss": 3.2129,
"step": 88200
},
{
"epoch": 25.72178171641791,
"grad_norm": 0.42446255683898926,
"learning_rate": 0.00029151706036745407,
"loss": 3.2049,
"step": 88250
},
{
"epoch": 25.736357276119403,
"grad_norm": 0.4038170278072357,
"learning_rate": 0.00029134208223972,
"loss": 3.2032,
"step": 88300
},
{
"epoch": 25.750932835820894,
"grad_norm": 0.4331628382205963,
"learning_rate": 0.00029116710411198596,
"loss": 3.2015,
"step": 88350
},
{
"epoch": 25.76550839552239,
"grad_norm": 0.42481672763824463,
"learning_rate": 0.00029099212598425196,
"loss": 3.211,
"step": 88400
},
{
"epoch": 25.78008395522388,
"grad_norm": 0.4009307622909546,
"learning_rate": 0.0002908171478565179,
"loss": 3.2169,
"step": 88450
},
{
"epoch": 25.794659514925375,
"grad_norm": 0.4292212724685669,
"learning_rate": 0.0002906421697287839,
"loss": 3.2051,
"step": 88500
},
{
"epoch": 25.809235074626866,
"grad_norm": 0.4238194525241852,
"learning_rate": 0.00029046719160104984,
"loss": 3.2081,
"step": 88550
},
{
"epoch": 25.823810634328357,
"grad_norm": 0.45732471346855164,
"learning_rate": 0.00029029221347331584,
"loss": 3.2074,
"step": 88600
},
{
"epoch": 25.83838619402985,
"grad_norm": 0.41441765427589417,
"learning_rate": 0.0002901172353455818,
"loss": 3.2065,
"step": 88650
},
{
"epoch": 25.852961753731343,
"grad_norm": 0.40707236528396606,
"learning_rate": 0.0002899422572178477,
"loss": 3.2126,
"step": 88700
},
{
"epoch": 25.867537313432837,
"grad_norm": 0.4202680289745331,
"learning_rate": 0.0002897672790901137,
"loss": 3.2131,
"step": 88750
},
{
"epoch": 25.88211287313433,
"grad_norm": 0.41881847381591797,
"learning_rate": 0.00028959230096237967,
"loss": 3.2088,
"step": 88800
},
{
"epoch": 25.89668843283582,
"grad_norm": 0.39432790875434875,
"learning_rate": 0.00028941732283464566,
"loss": 3.2116,
"step": 88850
},
{
"epoch": 25.911263992537314,
"grad_norm": 0.41786453127861023,
"learning_rate": 0.0002892423447069116,
"loss": 3.2152,
"step": 88900
},
{
"epoch": 25.925839552238806,
"grad_norm": 0.4230845272541046,
"learning_rate": 0.00028906736657917755,
"loss": 3.2125,
"step": 88950
},
{
"epoch": 25.940415111940297,
"grad_norm": 0.42243847250938416,
"learning_rate": 0.00028889238845144355,
"loss": 3.2114,
"step": 89000
},
{
"epoch": 25.940415111940297,
"eval_accuracy": 0.3738018597765618,
"eval_loss": 3.5390872955322266,
"eval_runtime": 82.2336,
"eval_samples_per_second": 202.192,
"eval_steps_per_second": 12.647,
"step": 89000
},
{
"epoch": 25.95499067164179,
"grad_norm": 0.42381802201271057,
"learning_rate": 0.0002887174103237095,
"loss": 3.2058,
"step": 89050
},
{
"epoch": 25.969566231343283,
"grad_norm": 0.45048123598098755,
"learning_rate": 0.0002885424321959755,
"loss": 3.2144,
"step": 89100
},
{
"epoch": 25.984141791044777,
"grad_norm": 0.40106144547462463,
"learning_rate": 0.00028836745406824143,
"loss": 3.2167,
"step": 89150
},
{
"epoch": 25.99871735074627,
"grad_norm": 0.44207993149757385,
"learning_rate": 0.00028819247594050743,
"loss": 3.2194,
"step": 89200
},
{
"epoch": 26.013118003731343,
"grad_norm": 0.440739244222641,
"learning_rate": 0.0002880174978127734,
"loss": 3.1396,
"step": 89250
},
{
"epoch": 26.027693563432837,
"grad_norm": 0.49055179953575134,
"learning_rate": 0.0002878425196850393,
"loss": 3.1256,
"step": 89300
},
{
"epoch": 26.04226912313433,
"grad_norm": 0.41953858733177185,
"learning_rate": 0.0002876675415573053,
"loss": 3.1163,
"step": 89350
},
{
"epoch": 26.05684468283582,
"grad_norm": 0.4146621525287628,
"learning_rate": 0.00028749256342957126,
"loss": 3.1292,
"step": 89400
},
{
"epoch": 26.071420242537314,
"grad_norm": 0.42104142904281616,
"learning_rate": 0.00028731758530183726,
"loss": 3.1225,
"step": 89450
},
{
"epoch": 26.085995802238806,
"grad_norm": 0.4284878373146057,
"learning_rate": 0.0002871426071741032,
"loss": 3.1216,
"step": 89500
},
{
"epoch": 26.100571361940297,
"grad_norm": 0.4166851341724396,
"learning_rate": 0.0002869676290463692,
"loss": 3.1399,
"step": 89550
},
{
"epoch": 26.11514692164179,
"grad_norm": 0.4328397214412689,
"learning_rate": 0.00028679265091863514,
"loss": 3.1384,
"step": 89600
},
{
"epoch": 26.129722481343283,
"grad_norm": 0.4359130859375,
"learning_rate": 0.00028661767279090114,
"loss": 3.1342,
"step": 89650
},
{
"epoch": 26.144298041044777,
"grad_norm": 0.4311699867248535,
"learning_rate": 0.0002864426946631671,
"loss": 3.14,
"step": 89700
},
{
"epoch": 26.15887360074627,
"grad_norm": 0.41819536685943604,
"learning_rate": 0.0002862677165354331,
"loss": 3.1558,
"step": 89750
},
{
"epoch": 26.17344916044776,
"grad_norm": 0.44823819398880005,
"learning_rate": 0.000286092738407699,
"loss": 3.1426,
"step": 89800
},
{
"epoch": 26.188024720149254,
"grad_norm": 0.4144200384616852,
"learning_rate": 0.00028591776027996497,
"loss": 3.1473,
"step": 89850
},
{
"epoch": 26.202600279850746,
"grad_norm": 0.4270482659339905,
"learning_rate": 0.00028574278215223097,
"loss": 3.1601,
"step": 89900
},
{
"epoch": 26.21717583955224,
"grad_norm": 0.4241775870323181,
"learning_rate": 0.0002855678040244969,
"loss": 3.1421,
"step": 89950
},
{
"epoch": 26.23175139925373,
"grad_norm": 0.4102860987186432,
"learning_rate": 0.00028539282589676285,
"loss": 3.1724,
"step": 90000
},
{
"epoch": 26.23175139925373,
"eval_accuracy": 0.37293608102352094,
"eval_loss": 3.5586137771606445,
"eval_runtime": 82.1555,
"eval_samples_per_second": 202.384,
"eval_steps_per_second": 12.659,
"step": 90000
},
{
"epoch": 26.246326958955223,
"grad_norm": 0.4413115084171295,
"learning_rate": 0.00028521784776902885,
"loss": 3.1418,
"step": 90050
},
{
"epoch": 26.260902518656717,
"grad_norm": 0.4471030533313751,
"learning_rate": 0.0002850428696412948,
"loss": 3.1503,
"step": 90100
},
{
"epoch": 26.27547807835821,
"grad_norm": 0.41441798210144043,
"learning_rate": 0.00028486789151356074,
"loss": 3.1568,
"step": 90150
},
{
"epoch": 26.290053638059703,
"grad_norm": 0.40715059638023376,
"learning_rate": 0.00028469291338582673,
"loss": 3.1696,
"step": 90200
},
{
"epoch": 26.304629197761194,
"grad_norm": 0.4507633149623871,
"learning_rate": 0.0002845179352580927,
"loss": 3.166,
"step": 90250
},
{
"epoch": 26.319204757462686,
"grad_norm": 0.4515264332294464,
"learning_rate": 0.0002843429571303587,
"loss": 3.1736,
"step": 90300
},
{
"epoch": 26.33378031716418,
"grad_norm": 0.45111826062202454,
"learning_rate": 0.0002841679790026246,
"loss": 3.1645,
"step": 90350
},
{
"epoch": 26.34835587686567,
"grad_norm": 0.4295303225517273,
"learning_rate": 0.0002839930008748906,
"loss": 3.1699,
"step": 90400
},
{
"epoch": 26.362931436567163,
"grad_norm": 0.4211674928665161,
"learning_rate": 0.00028381802274715656,
"loss": 3.175,
"step": 90450
},
{
"epoch": 26.377506996268657,
"grad_norm": 0.4354601800441742,
"learning_rate": 0.00028364304461942256,
"loss": 3.1635,
"step": 90500
},
{
"epoch": 26.39208255597015,
"grad_norm": 0.44711926579475403,
"learning_rate": 0.0002834680664916885,
"loss": 3.1796,
"step": 90550
},
{
"epoch": 26.406658115671643,
"grad_norm": 0.43272531032562256,
"learning_rate": 0.0002832930883639545,
"loss": 3.161,
"step": 90600
},
{
"epoch": 26.421233675373134,
"grad_norm": 0.4213297963142395,
"learning_rate": 0.00028311811023622044,
"loss": 3.1676,
"step": 90650
},
{
"epoch": 26.435809235074625,
"grad_norm": 0.4430896043777466,
"learning_rate": 0.00028294313210848644,
"loss": 3.185,
"step": 90700
},
{
"epoch": 26.45038479477612,
"grad_norm": 0.4318443834781647,
"learning_rate": 0.0002827681539807524,
"loss": 3.1677,
"step": 90750
},
{
"epoch": 26.46496035447761,
"grad_norm": 0.42491278052330017,
"learning_rate": 0.0002825931758530184,
"loss": 3.1867,
"step": 90800
},
{
"epoch": 26.479535914179106,
"grad_norm": 0.42508071660995483,
"learning_rate": 0.0002824181977252843,
"loss": 3.1799,
"step": 90850
},
{
"epoch": 26.494111473880597,
"grad_norm": 0.46238982677459717,
"learning_rate": 0.0002822432195975503,
"loss": 3.1668,
"step": 90900
},
{
"epoch": 26.50868703358209,
"grad_norm": 0.43022239208221436,
"learning_rate": 0.00028206824146981627,
"loss": 3.1945,
"step": 90950
},
{
"epoch": 26.523262593283583,
"grad_norm": 0.44237610697746277,
"learning_rate": 0.0002818932633420822,
"loss": 3.1831,
"step": 91000
},
{
"epoch": 26.523262593283583,
"eval_accuracy": 0.3733205969542148,
"eval_loss": 3.5528080463409424,
"eval_runtime": 82.3087,
"eval_samples_per_second": 202.008,
"eval_steps_per_second": 12.635,
"step": 91000
},
{
"epoch": 26.537838152985074,
"grad_norm": 0.4143082797527313,
"learning_rate": 0.0002817182852143482,
"loss": 3.1941,
"step": 91050
},
{
"epoch": 26.552413712686565,
"grad_norm": 0.4099169373512268,
"learning_rate": 0.00028154330708661415,
"loss": 3.1927,
"step": 91100
},
{
"epoch": 26.56698927238806,
"grad_norm": 0.4147176742553711,
"learning_rate": 0.0002813683289588801,
"loss": 3.1883,
"step": 91150
},
{
"epoch": 26.58156483208955,
"grad_norm": 0.4103856682777405,
"learning_rate": 0.0002811933508311461,
"loss": 3.196,
"step": 91200
},
{
"epoch": 26.596140391791046,
"grad_norm": 0.4222527742385864,
"learning_rate": 0.00028101837270341204,
"loss": 3.1797,
"step": 91250
},
{
"epoch": 26.610715951492537,
"grad_norm": 0.41271039843559265,
"learning_rate": 0.000280843394575678,
"loss": 3.196,
"step": 91300
},
{
"epoch": 26.62529151119403,
"grad_norm": 0.42893654108047485,
"learning_rate": 0.000280668416447944,
"loss": 3.1862,
"step": 91350
},
{
"epoch": 26.639867070895523,
"grad_norm": 0.4422917068004608,
"learning_rate": 0.0002804934383202099,
"loss": 3.1866,
"step": 91400
},
{
"epoch": 26.654442630597014,
"grad_norm": 0.4190319776535034,
"learning_rate": 0.0002803184601924759,
"loss": 3.1852,
"step": 91450
},
{
"epoch": 26.66901819029851,
"grad_norm": 0.4198403060436249,
"learning_rate": 0.00028014348206474186,
"loss": 3.198,
"step": 91500
},
{
"epoch": 26.68359375,
"grad_norm": 0.4143039286136627,
"learning_rate": 0.00027996850393700786,
"loss": 3.1877,
"step": 91550
},
{
"epoch": 26.69816930970149,
"grad_norm": 0.4511352479457855,
"learning_rate": 0.0002797935258092738,
"loss": 3.1815,
"step": 91600
},
{
"epoch": 26.712744869402986,
"grad_norm": 0.427504301071167,
"learning_rate": 0.0002796185476815398,
"loss": 3.1959,
"step": 91650
},
{
"epoch": 26.727320429104477,
"grad_norm": 0.45200663805007935,
"learning_rate": 0.00027944356955380574,
"loss": 3.1907,
"step": 91700
},
{
"epoch": 26.74189598880597,
"grad_norm": 0.4165610671043396,
"learning_rate": 0.00027926859142607174,
"loss": 3.1929,
"step": 91750
},
{
"epoch": 26.756471548507463,
"grad_norm": 0.4394925832748413,
"learning_rate": 0.0002790936132983377,
"loss": 3.1997,
"step": 91800
},
{
"epoch": 26.771047108208954,
"grad_norm": 0.39829355478286743,
"learning_rate": 0.0002789186351706037,
"loss": 3.1901,
"step": 91850
},
{
"epoch": 26.78562266791045,
"grad_norm": 0.4587573707103729,
"learning_rate": 0.0002787436570428696,
"loss": 3.2027,
"step": 91900
},
{
"epoch": 26.80019822761194,
"grad_norm": 0.44898635149002075,
"learning_rate": 0.00027856867891513557,
"loss": 3.1916,
"step": 91950
},
{
"epoch": 26.814773787313435,
"grad_norm": 0.4305431544780731,
"learning_rate": 0.00027839370078740157,
"loss": 3.2083,
"step": 92000
},
{
"epoch": 26.814773787313435,
"eval_accuracy": 0.37361601629490715,
"eval_loss": 3.544487953186035,
"eval_runtime": 82.1477,
"eval_samples_per_second": 202.404,
"eval_steps_per_second": 12.66,
"step": 92000
},
{
"epoch": 26.829349347014926,
"grad_norm": 0.44197413325309753,
"learning_rate": 0.0002782187226596675,
"loss": 3.2056,
"step": 92050
},
{
"epoch": 26.843924906716417,
"grad_norm": 0.40853598713874817,
"learning_rate": 0.0002780437445319335,
"loss": 3.2028,
"step": 92100
},
{
"epoch": 26.85850046641791,
"grad_norm": 0.42029932141304016,
"learning_rate": 0.00027786876640419945,
"loss": 3.1965,
"step": 92150
},
{
"epoch": 26.873076026119403,
"grad_norm": 0.44294652342796326,
"learning_rate": 0.00027769378827646545,
"loss": 3.2025,
"step": 92200
},
{
"epoch": 26.887651585820894,
"grad_norm": 0.44561460614204407,
"learning_rate": 0.0002775188101487314,
"loss": 3.2205,
"step": 92250
},
{
"epoch": 26.90222714552239,
"grad_norm": 0.3962169289588928,
"learning_rate": 0.00027734383202099734,
"loss": 3.1954,
"step": 92300
},
{
"epoch": 26.91680270522388,
"grad_norm": 0.41185733675956726,
"learning_rate": 0.00027716885389326333,
"loss": 3.2069,
"step": 92350
},
{
"epoch": 26.931378264925375,
"grad_norm": 0.4333236515522003,
"learning_rate": 0.0002769938757655293,
"loss": 3.1982,
"step": 92400
},
{
"epoch": 26.945953824626866,
"grad_norm": 0.4122789204120636,
"learning_rate": 0.0002768188976377952,
"loss": 3.2089,
"step": 92450
},
{
"epoch": 26.960529384328357,
"grad_norm": 0.4543864130973816,
"learning_rate": 0.0002766439195100612,
"loss": 3.2081,
"step": 92500
},
{
"epoch": 26.97510494402985,
"grad_norm": 0.39395999908447266,
"learning_rate": 0.00027646894138232716,
"loss": 3.21,
"step": 92550
},
{
"epoch": 26.989680503731343,
"grad_norm": 0.42809590697288513,
"learning_rate": 0.00027629396325459316,
"loss": 3.2076,
"step": 92600
},
{
"epoch": 27.004081156716417,
"grad_norm": 0.420581579208374,
"learning_rate": 0.0002761189851268591,
"loss": 3.1817,
"step": 92650
},
{
"epoch": 27.01865671641791,
"grad_norm": 0.4725584387779236,
"learning_rate": 0.0002759440069991251,
"loss": 3.0986,
"step": 92700
},
{
"epoch": 27.033232276119403,
"grad_norm": 0.43019962310791016,
"learning_rate": 0.00027576902887139105,
"loss": 3.1164,
"step": 92750
},
{
"epoch": 27.047807835820894,
"grad_norm": 0.44071322679519653,
"learning_rate": 0.000275594050743657,
"loss": 3.1244,
"step": 92800
},
{
"epoch": 27.06238339552239,
"grad_norm": 0.4143284857273102,
"learning_rate": 0.000275419072615923,
"loss": 3.1148,
"step": 92850
},
{
"epoch": 27.07695895522388,
"grad_norm": 0.43484529852867126,
"learning_rate": 0.00027524409448818893,
"loss": 3.1153,
"step": 92900
},
{
"epoch": 27.091534514925375,
"grad_norm": 0.4603308439254761,
"learning_rate": 0.00027506911636045493,
"loss": 3.1139,
"step": 92950
},
{
"epoch": 27.106110074626866,
"grad_norm": 0.4302753210067749,
"learning_rate": 0.00027489413823272087,
"loss": 3.1286,
"step": 93000
},
{
"epoch": 27.106110074626866,
"eval_accuracy": 0.37318289152448975,
"eval_loss": 3.5575826168060303,
"eval_runtime": 82.136,
"eval_samples_per_second": 202.433,
"eval_steps_per_second": 12.662,
"step": 93000
},
{
"epoch": 27.120685634328357,
"grad_norm": 0.4867883622646332,
"learning_rate": 0.00027471916010498687,
"loss": 3.1335,
"step": 93050
},
{
"epoch": 27.13526119402985,
"grad_norm": 0.4239311218261719,
"learning_rate": 0.0002745441819772528,
"loss": 3.1383,
"step": 93100
},
{
"epoch": 27.149836753731343,
"grad_norm": 0.44329917430877686,
"learning_rate": 0.0002743692038495188,
"loss": 3.1343,
"step": 93150
},
{
"epoch": 27.164412313432837,
"grad_norm": 0.47699853777885437,
"learning_rate": 0.00027419422572178475,
"loss": 3.1357,
"step": 93200
},
{
"epoch": 27.17898787313433,
"grad_norm": 0.4764823317527771,
"learning_rate": 0.00027401924759405075,
"loss": 3.1396,
"step": 93250
},
{
"epoch": 27.19356343283582,
"grad_norm": 0.4707601070404053,
"learning_rate": 0.0002738442694663167,
"loss": 3.1536,
"step": 93300
},
{
"epoch": 27.208138992537314,
"grad_norm": 0.42642828822135925,
"learning_rate": 0.00027366929133858264,
"loss": 3.1399,
"step": 93350
},
{
"epoch": 27.222714552238806,
"grad_norm": 0.4552763104438782,
"learning_rate": 0.00027349431321084864,
"loss": 3.1456,
"step": 93400
},
{
"epoch": 27.237290111940297,
"grad_norm": 0.4463469684123993,
"learning_rate": 0.0002733193350831146,
"loss": 3.1482,
"step": 93450
},
{
"epoch": 27.25186567164179,
"grad_norm": 0.47506290674209595,
"learning_rate": 0.0002731443569553806,
"loss": 3.1453,
"step": 93500
},
{
"epoch": 27.266441231343283,
"grad_norm": 0.4418078660964966,
"learning_rate": 0.0002729693788276465,
"loss": 3.1503,
"step": 93550
},
{
"epoch": 27.281016791044777,
"grad_norm": 0.4288392663002014,
"learning_rate": 0.00027279440069991246,
"loss": 3.1563,
"step": 93600
},
{
"epoch": 27.29559235074627,
"grad_norm": 0.4385755956172943,
"learning_rate": 0.00027261942257217846,
"loss": 3.1549,
"step": 93650
},
{
"epoch": 27.31016791044776,
"grad_norm": 0.4316536486148834,
"learning_rate": 0.0002724444444444444,
"loss": 3.163,
"step": 93700
},
{
"epoch": 27.324743470149254,
"grad_norm": 0.4343269467353821,
"learning_rate": 0.00027226946631671035,
"loss": 3.1476,
"step": 93750
},
{
"epoch": 27.339319029850746,
"grad_norm": 0.45259353518486023,
"learning_rate": 0.00027209448818897635,
"loss": 3.1591,
"step": 93800
},
{
"epoch": 27.35389458955224,
"grad_norm": 0.44386130571365356,
"learning_rate": 0.0002719195100612423,
"loss": 3.1696,
"step": 93850
},
{
"epoch": 27.36847014925373,
"grad_norm": 0.43997707962989807,
"learning_rate": 0.0002717445319335083,
"loss": 3.1696,
"step": 93900
},
{
"epoch": 27.383045708955223,
"grad_norm": 0.4574432671070099,
"learning_rate": 0.00027156955380577423,
"loss": 3.1684,
"step": 93950
},
{
"epoch": 27.397621268656717,
"grad_norm": 0.41324666142463684,
"learning_rate": 0.00027139457567804023,
"loss": 3.1577,
"step": 94000
},
{
"epoch": 27.397621268656717,
"eval_accuracy": 0.37325468666306433,
"eval_loss": 3.5531723499298096,
"eval_runtime": 81.4446,
"eval_samples_per_second": 204.151,
"eval_steps_per_second": 12.769,
"step": 94000
},
{
"epoch": 27.41219682835821,
"grad_norm": 0.4798290431499481,
"learning_rate": 0.00027121959755030617,
"loss": 3.1687,
"step": 94050
},
{
"epoch": 27.426772388059703,
"grad_norm": 0.4669507145881653,
"learning_rate": 0.00027104461942257217,
"loss": 3.1611,
"step": 94100
},
{
"epoch": 27.441347947761194,
"grad_norm": 0.6186783909797668,
"learning_rate": 0.0002708696412948381,
"loss": 3.1633,
"step": 94150
},
{
"epoch": 27.455923507462686,
"grad_norm": 0.44731244444847107,
"learning_rate": 0.0002706946631671041,
"loss": 3.1642,
"step": 94200
},
{
"epoch": 27.47049906716418,
"grad_norm": 0.4521823227405548,
"learning_rate": 0.00027051968503937005,
"loss": 3.1794,
"step": 94250
},
{
"epoch": 27.48507462686567,
"grad_norm": 0.4513274133205414,
"learning_rate": 0.00027034470691163605,
"loss": 3.1605,
"step": 94300
},
{
"epoch": 27.499650186567163,
"grad_norm": 0.44645029306411743,
"learning_rate": 0.000270169728783902,
"loss": 3.1658,
"step": 94350
},
{
"epoch": 27.514225746268657,
"grad_norm": 0.42641523480415344,
"learning_rate": 0.000269994750656168,
"loss": 3.1763,
"step": 94400
},
{
"epoch": 27.52880130597015,
"grad_norm": 0.48925453424453735,
"learning_rate": 0.00026981977252843394,
"loss": 3.1741,
"step": 94450
},
{
"epoch": 27.543376865671643,
"grad_norm": 0.40931984782218933,
"learning_rate": 0.0002696447944006999,
"loss": 3.1692,
"step": 94500
},
{
"epoch": 27.557952425373134,
"grad_norm": 0.4484209716320038,
"learning_rate": 0.0002694698162729659,
"loss": 3.1979,
"step": 94550
},
{
"epoch": 27.572527985074625,
"grad_norm": 0.4323810636997223,
"learning_rate": 0.0002692948381452318,
"loss": 3.1729,
"step": 94600
},
{
"epoch": 27.58710354477612,
"grad_norm": 0.43009236454963684,
"learning_rate": 0.00026911986001749776,
"loss": 3.1841,
"step": 94650
},
{
"epoch": 27.60167910447761,
"grad_norm": 0.4245539605617523,
"learning_rate": 0.00026894488188976376,
"loss": 3.1874,
"step": 94700
},
{
"epoch": 27.616254664179106,
"grad_norm": 0.42876526713371277,
"learning_rate": 0.0002687699037620297,
"loss": 3.184,
"step": 94750
},
{
"epoch": 27.630830223880597,
"grad_norm": 0.4382181167602539,
"learning_rate": 0.0002685949256342957,
"loss": 3.182,
"step": 94800
},
{
"epoch": 27.64540578358209,
"grad_norm": 0.4394516050815582,
"learning_rate": 0.00026841994750656165,
"loss": 3.1818,
"step": 94850
},
{
"epoch": 27.659981343283583,
"grad_norm": 0.4382858872413635,
"learning_rate": 0.0002682449693788276,
"loss": 3.1715,
"step": 94900
},
{
"epoch": 27.674556902985074,
"grad_norm": 0.4355790317058563,
"learning_rate": 0.0002680699912510936,
"loss": 3.188,
"step": 94950
},
{
"epoch": 27.689132462686565,
"grad_norm": 0.44806766510009766,
"learning_rate": 0.00026789501312335953,
"loss": 3.1856,
"step": 95000
},
{
"epoch": 27.689132462686565,
"eval_accuracy": 0.37393956520628685,
"eval_loss": 3.54496693611145,
"eval_runtime": 81.364,
"eval_samples_per_second": 204.353,
"eval_steps_per_second": 12.782,
"step": 95000
},
{
"epoch": 27.70370802238806,
"grad_norm": 0.4786415696144104,
"learning_rate": 0.00026772003499562553,
"loss": 3.1954,
"step": 95050
},
{
"epoch": 27.71828358208955,
"grad_norm": 0.43282392621040344,
"learning_rate": 0.0002675450568678915,
"loss": 3.1868,
"step": 95100
},
{
"epoch": 27.732859141791046,
"grad_norm": 0.4446009695529938,
"learning_rate": 0.00026737007874015747,
"loss": 3.1852,
"step": 95150
},
{
"epoch": 27.747434701492537,
"grad_norm": 0.4616422951221466,
"learning_rate": 0.0002671951006124234,
"loss": 3.1974,
"step": 95200
},
{
"epoch": 27.76201026119403,
"grad_norm": 0.40891557931900024,
"learning_rate": 0.0002670201224846894,
"loss": 3.1842,
"step": 95250
},
{
"epoch": 27.776585820895523,
"grad_norm": 0.42072343826293945,
"learning_rate": 0.00026684514435695536,
"loss": 3.1819,
"step": 95300
},
{
"epoch": 27.791161380597014,
"grad_norm": 0.4344806671142578,
"learning_rate": 0.00026667016622922135,
"loss": 3.1801,
"step": 95350
},
{
"epoch": 27.80573694029851,
"grad_norm": 0.44849181175231934,
"learning_rate": 0.0002664951881014873,
"loss": 3.1914,
"step": 95400
},
{
"epoch": 27.8203125,
"grad_norm": 0.4324977695941925,
"learning_rate": 0.0002663202099737533,
"loss": 3.1815,
"step": 95450
},
{
"epoch": 27.83488805970149,
"grad_norm": 0.4600885510444641,
"learning_rate": 0.00026614523184601924,
"loss": 3.198,
"step": 95500
},
{
"epoch": 27.849463619402986,
"grad_norm": 0.43553200364112854,
"learning_rate": 0.0002659702537182852,
"loss": 3.1871,
"step": 95550
},
{
"epoch": 27.864039179104477,
"grad_norm": 0.4483141601085663,
"learning_rate": 0.0002657952755905512,
"loss": 3.2024,
"step": 95600
},
{
"epoch": 27.87861473880597,
"grad_norm": 0.4489576518535614,
"learning_rate": 0.0002656202974628171,
"loss": 3.1831,
"step": 95650
},
{
"epoch": 27.893190298507463,
"grad_norm": 0.4157513380050659,
"learning_rate": 0.0002654453193350831,
"loss": 3.1921,
"step": 95700
},
{
"epoch": 27.907765858208954,
"grad_norm": 0.41209346055984497,
"learning_rate": 0.00026527034120734906,
"loss": 3.1953,
"step": 95750
},
{
"epoch": 27.92234141791045,
"grad_norm": 0.4335402250289917,
"learning_rate": 0.000265095363079615,
"loss": 3.1873,
"step": 95800
},
{
"epoch": 27.93691697761194,
"grad_norm": 0.4793190658092499,
"learning_rate": 0.000264920384951881,
"loss": 3.1879,
"step": 95850
},
{
"epoch": 27.951492537313435,
"grad_norm": 0.43012735247612,
"learning_rate": 0.00026474540682414695,
"loss": 3.1829,
"step": 95900
},
{
"epoch": 27.966068097014926,
"grad_norm": 0.47469058632850647,
"learning_rate": 0.0002645704286964129,
"loss": 3.2055,
"step": 95950
},
{
"epoch": 27.980643656716417,
"grad_norm": 0.4229576289653778,
"learning_rate": 0.0002643954505686789,
"loss": 3.211,
"step": 96000
},
{
"epoch": 27.980643656716417,
"eval_accuracy": 0.37441765021102474,
"eval_loss": 3.5370290279388428,
"eval_runtime": 81.4563,
"eval_samples_per_second": 204.122,
"eval_steps_per_second": 12.768,
"step": 96000
},
{
"epoch": 27.99521921641791,
"grad_norm": 0.42650726437568665,
"learning_rate": 0.00026422047244094483,
"loss": 3.2048,
"step": 96050
},
{
"epoch": 28.009619869402986,
"grad_norm": 0.4413430094718933,
"learning_rate": 0.00026404549431321083,
"loss": 3.13,
"step": 96100
},
{
"epoch": 28.024195429104477,
"grad_norm": 0.4401211738586426,
"learning_rate": 0.0002638705161854768,
"loss": 3.1019,
"step": 96150
},
{
"epoch": 28.03877098880597,
"grad_norm": 0.4243675470352173,
"learning_rate": 0.00026369553805774277,
"loss": 3.109,
"step": 96200
},
{
"epoch": 28.053346548507463,
"grad_norm": 0.44328224658966064,
"learning_rate": 0.0002635205599300087,
"loss": 3.1023,
"step": 96250
},
{
"epoch": 28.067922108208954,
"grad_norm": 0.4328586459159851,
"learning_rate": 0.0002633455818022747,
"loss": 3.1122,
"step": 96300
},
{
"epoch": 28.08249766791045,
"grad_norm": 0.4224425256252289,
"learning_rate": 0.00026317060367454066,
"loss": 3.1039,
"step": 96350
},
{
"epoch": 28.09707322761194,
"grad_norm": 0.43421924114227295,
"learning_rate": 0.0002629956255468066,
"loss": 3.1199,
"step": 96400
},
{
"epoch": 28.11164878731343,
"grad_norm": 0.43485110998153687,
"learning_rate": 0.0002628206474190726,
"loss": 3.1217,
"step": 96450
},
{
"epoch": 28.126224347014926,
"grad_norm": 0.4528997838497162,
"learning_rate": 0.00026264566929133854,
"loss": 3.1155,
"step": 96500
},
{
"epoch": 28.140799906716417,
"grad_norm": 0.4375012516975403,
"learning_rate": 0.00026247069116360454,
"loss": 3.1368,
"step": 96550
},
{
"epoch": 28.15537546641791,
"grad_norm": 0.4785500764846802,
"learning_rate": 0.0002622957130358705,
"loss": 3.1282,
"step": 96600
},
{
"epoch": 28.169951026119403,
"grad_norm": 0.4440264403820038,
"learning_rate": 0.0002621207349081365,
"loss": 3.1369,
"step": 96650
},
{
"epoch": 28.184526585820894,
"grad_norm": 0.4505453109741211,
"learning_rate": 0.0002619457567804024,
"loss": 3.1321,
"step": 96700
},
{
"epoch": 28.19910214552239,
"grad_norm": 0.42353829741477966,
"learning_rate": 0.0002617707786526684,
"loss": 3.1444,
"step": 96750
},
{
"epoch": 28.21367770522388,
"grad_norm": 0.4531719386577606,
"learning_rate": 0.00026159580052493436,
"loss": 3.1437,
"step": 96800
},
{
"epoch": 28.228253264925375,
"grad_norm": 0.4176923632621765,
"learning_rate": 0.00026142082239720036,
"loss": 3.1305,
"step": 96850
},
{
"epoch": 28.242828824626866,
"grad_norm": 0.4476938247680664,
"learning_rate": 0.0002612458442694663,
"loss": 3.1518,
"step": 96900
},
{
"epoch": 28.257404384328357,
"grad_norm": 0.4284321665763855,
"learning_rate": 0.00026107086614173225,
"loss": 3.1459,
"step": 96950
},
{
"epoch": 28.27197994402985,
"grad_norm": 0.4615371823310852,
"learning_rate": 0.00026089588801399825,
"loss": 3.1377,
"step": 97000
},
{
"epoch": 28.27197994402985,
"eval_accuracy": 0.37340392639374076,
"eval_loss": 3.5557925701141357,
"eval_runtime": 81.4099,
"eval_samples_per_second": 204.238,
"eval_steps_per_second": 12.775,
"step": 97000
},
{
"epoch": 28.286555503731343,
"grad_norm": 0.4477929472923279,
"learning_rate": 0.0002607209098862642,
"loss": 3.1388,
"step": 97050
},
{
"epoch": 28.301131063432837,
"grad_norm": 0.46377018094062805,
"learning_rate": 0.00026054593175853013,
"loss": 3.1423,
"step": 97100
},
{
"epoch": 28.31570662313433,
"grad_norm": 0.4375309944152832,
"learning_rate": 0.00026037095363079613,
"loss": 3.1502,
"step": 97150
},
{
"epoch": 28.33028218283582,
"grad_norm": 0.42167800664901733,
"learning_rate": 0.0002601959755030621,
"loss": 3.1636,
"step": 97200
},
{
"epoch": 28.344857742537314,
"grad_norm": 0.4603038430213928,
"learning_rate": 0.00026002099737532807,
"loss": 3.1435,
"step": 97250
},
{
"epoch": 28.359433302238806,
"grad_norm": 0.46064409613609314,
"learning_rate": 0.000259846019247594,
"loss": 3.1646,
"step": 97300
},
{
"epoch": 28.374008861940297,
"grad_norm": 0.4541272819042206,
"learning_rate": 0.00025967104111985996,
"loss": 3.1524,
"step": 97350
},
{
"epoch": 28.38858442164179,
"grad_norm": 0.45432737469673157,
"learning_rate": 0.00025949606299212596,
"loss": 3.1483,
"step": 97400
},
{
"epoch": 28.403159981343283,
"grad_norm": 0.4208422899246216,
"learning_rate": 0.0002593210848643919,
"loss": 3.148,
"step": 97450
},
{
"epoch": 28.417735541044777,
"grad_norm": 0.44372475147247314,
"learning_rate": 0.0002591461067366579,
"loss": 3.1474,
"step": 97500
},
{
"epoch": 28.43231110074627,
"grad_norm": 0.4374051094055176,
"learning_rate": 0.00025897112860892384,
"loss": 3.163,
"step": 97550
},
{
"epoch": 28.44688666044776,
"grad_norm": 0.44096410274505615,
"learning_rate": 0.00025879615048118984,
"loss": 3.1587,
"step": 97600
},
{
"epoch": 28.461462220149254,
"grad_norm": 0.47338542342185974,
"learning_rate": 0.0002586211723534558,
"loss": 3.1599,
"step": 97650
},
{
"epoch": 28.476037779850746,
"grad_norm": 0.4479065239429474,
"learning_rate": 0.0002584461942257218,
"loss": 3.1591,
"step": 97700
},
{
"epoch": 28.49061333955224,
"grad_norm": 0.46635714173316956,
"learning_rate": 0.0002582712160979877,
"loss": 3.1686,
"step": 97750
},
{
"epoch": 28.50518889925373,
"grad_norm": 0.4383736550807953,
"learning_rate": 0.0002580962379702537,
"loss": 3.1631,
"step": 97800
},
{
"epoch": 28.519764458955223,
"grad_norm": 0.46040958166122437,
"learning_rate": 0.00025792125984251967,
"loss": 3.1537,
"step": 97850
},
{
"epoch": 28.534340018656717,
"grad_norm": 0.43736448884010315,
"learning_rate": 0.00025774628171478566,
"loss": 3.1609,
"step": 97900
},
{
"epoch": 28.54891557835821,
"grad_norm": 0.4490288496017456,
"learning_rate": 0.0002575713035870516,
"loss": 3.1711,
"step": 97950
},
{
"epoch": 28.563491138059703,
"grad_norm": 0.450336754322052,
"learning_rate": 0.00025739632545931755,
"loss": 3.1703,
"step": 98000
},
{
"epoch": 28.563491138059703,
"eval_accuracy": 0.37376537372253205,
"eval_loss": 3.5499653816223145,
"eval_runtime": 81.4058,
"eval_samples_per_second": 204.248,
"eval_steps_per_second": 12.776,
"step": 98000
},
{
"epoch": 28.578066697761194,
"grad_norm": 0.47243165969848633,
"learning_rate": 0.00025722134733158355,
"loss": 3.1704,
"step": 98050
},
{
"epoch": 28.592642257462686,
"grad_norm": 0.5251398682594299,
"learning_rate": 0.0002570463692038495,
"loss": 3.1747,
"step": 98100
},
{
"epoch": 28.60721781716418,
"grad_norm": 0.46638402342796326,
"learning_rate": 0.0002568713910761155,
"loss": 3.1742,
"step": 98150
},
{
"epoch": 28.62179337686567,
"grad_norm": 0.4190913140773773,
"learning_rate": 0.00025669641294838143,
"loss": 3.1682,
"step": 98200
},
{
"epoch": 28.636368936567163,
"grad_norm": 0.47515663504600525,
"learning_rate": 0.0002565214348206474,
"loss": 3.1723,
"step": 98250
},
{
"epoch": 28.650944496268657,
"grad_norm": 0.4323152005672455,
"learning_rate": 0.0002563464566929134,
"loss": 3.1577,
"step": 98300
},
{
"epoch": 28.66552005597015,
"grad_norm": 0.4200900197029114,
"learning_rate": 0.0002561714785651793,
"loss": 3.1871,
"step": 98350
},
{
"epoch": 28.680095615671643,
"grad_norm": 0.45298445224761963,
"learning_rate": 0.00025599650043744526,
"loss": 3.1659,
"step": 98400
},
{
"epoch": 28.694671175373134,
"grad_norm": 0.4306495785713196,
"learning_rate": 0.00025582152230971126,
"loss": 3.1648,
"step": 98450
},
{
"epoch": 28.709246735074625,
"grad_norm": 0.4497992992401123,
"learning_rate": 0.0002556465441819772,
"loss": 3.1659,
"step": 98500
},
{
"epoch": 28.72382229477612,
"grad_norm": 0.4279501438140869,
"learning_rate": 0.0002554715660542432,
"loss": 3.1657,
"step": 98550
},
{
"epoch": 28.73839785447761,
"grad_norm": 0.438489705324173,
"learning_rate": 0.00025529658792650914,
"loss": 3.1907,
"step": 98600
},
{
"epoch": 28.752973414179106,
"grad_norm": 0.42497625946998596,
"learning_rate": 0.00025512160979877514,
"loss": 3.1752,
"step": 98650
},
{
"epoch": 28.767548973880597,
"grad_norm": 0.46575412154197693,
"learning_rate": 0.0002549466316710411,
"loss": 3.177,
"step": 98700
},
{
"epoch": 28.78212453358209,
"grad_norm": 0.4322008788585663,
"learning_rate": 0.0002547716535433071,
"loss": 3.1806,
"step": 98750
},
{
"epoch": 28.796700093283583,
"grad_norm": 0.45955249667167664,
"learning_rate": 0.000254596675415573,
"loss": 3.1863,
"step": 98800
},
{
"epoch": 28.811275652985074,
"grad_norm": 0.44759657979011536,
"learning_rate": 0.000254421697287839,
"loss": 3.1943,
"step": 98850
},
{
"epoch": 28.825851212686565,
"grad_norm": 0.4237118065357208,
"learning_rate": 0.00025424671916010497,
"loss": 3.1861,
"step": 98900
},
{
"epoch": 28.84042677238806,
"grad_norm": 0.4191685914993286,
"learning_rate": 0.00025407174103237096,
"loss": 3.1886,
"step": 98950
},
{
"epoch": 28.85500233208955,
"grad_norm": 0.45205187797546387,
"learning_rate": 0.0002538967629046369,
"loss": 3.1761,
"step": 99000
},
{
"epoch": 28.85500233208955,
"eval_accuracy": 0.37389130945740884,
"eval_loss": 3.5445897579193115,
"eval_runtime": 81.4173,
"eval_samples_per_second": 204.22,
"eval_steps_per_second": 12.774,
"step": 99000
},
{
"epoch": 28.869577891791046,
"grad_norm": 0.4248482584953308,
"learning_rate": 0.0002537217847769029,
"loss": 3.1808,
"step": 99050
},
{
"epoch": 28.884153451492537,
"grad_norm": 0.44186240434646606,
"learning_rate": 0.00025354680664916885,
"loss": 3.1812,
"step": 99100
},
{
"epoch": 28.89872901119403,
"grad_norm": 0.46802234649658203,
"learning_rate": 0.0002533718285214348,
"loss": 3.1865,
"step": 99150
},
{
"epoch": 28.913304570895523,
"grad_norm": 0.4645196795463562,
"learning_rate": 0.0002531968503937008,
"loss": 3.1839,
"step": 99200
},
{
"epoch": 28.927880130597014,
"grad_norm": 0.4409431219100952,
"learning_rate": 0.00025302187226596673,
"loss": 3.1905,
"step": 99250
},
{
"epoch": 28.94245569029851,
"grad_norm": 0.45034196972846985,
"learning_rate": 0.0002528468941382327,
"loss": 3.188,
"step": 99300
},
{
"epoch": 28.95703125,
"grad_norm": 0.4407288730144501,
"learning_rate": 0.0002526719160104987,
"loss": 3.1961,
"step": 99350
},
{
"epoch": 28.97160680970149,
"grad_norm": 0.42619070410728455,
"learning_rate": 0.0002524969378827646,
"loss": 3.1995,
"step": 99400
},
{
"epoch": 28.986182369402986,
"grad_norm": 0.4555329382419586,
"learning_rate": 0.0002523219597550306,
"loss": 3.1866,
"step": 99450
},
{
"epoch": 29.00058302238806,
"grad_norm": 0.46135783195495605,
"learning_rate": 0.00025214698162729656,
"loss": 3.1917,
"step": 99500
},
{
"epoch": 29.01515858208955,
"grad_norm": 0.444700688123703,
"learning_rate": 0.0002519720034995625,
"loss": 3.0955,
"step": 99550
},
{
"epoch": 29.029734141791046,
"grad_norm": 0.44991371035575867,
"learning_rate": 0.0002517970253718285,
"loss": 3.0921,
"step": 99600
},
{
"epoch": 29.044309701492537,
"grad_norm": 0.4495794475078583,
"learning_rate": 0.00025162204724409444,
"loss": 3.105,
"step": 99650
},
{
"epoch": 29.05888526119403,
"grad_norm": 0.4699917435646057,
"learning_rate": 0.00025144706911636044,
"loss": 3.0964,
"step": 99700
},
{
"epoch": 29.073460820895523,
"grad_norm": 0.4349089562892914,
"learning_rate": 0.0002512720909886264,
"loss": 3.1254,
"step": 99750
},
{
"epoch": 29.088036380597014,
"grad_norm": 0.4257027208805084,
"learning_rate": 0.0002510971128608924,
"loss": 3.1008,
"step": 99800
},
{
"epoch": 29.10261194029851,
"grad_norm": 0.43259721994400024,
"learning_rate": 0.0002509221347331583,
"loss": 3.1116,
"step": 99850
},
{
"epoch": 29.1171875,
"grad_norm": 0.4446471631526947,
"learning_rate": 0.0002507471566054243,
"loss": 3.1186,
"step": 99900
},
{
"epoch": 29.13176305970149,
"grad_norm": 0.4739801585674286,
"learning_rate": 0.00025057217847769027,
"loss": 3.1133,
"step": 99950
},
{
"epoch": 29.146338619402986,
"grad_norm": 0.4532054662704468,
"learning_rate": 0.0002503972003499562,
"loss": 3.1259,
"step": 100000
},
{
"epoch": 29.146338619402986,
"eval_accuracy": 0.37309544269176687,
"eval_loss": 3.5599021911621094,
"eval_runtime": 81.5499,
"eval_samples_per_second": 203.887,
"eval_steps_per_second": 12.753,
"step": 100000
},
{
"epoch": 29.160914179104477,
"grad_norm": 0.44886332750320435,
"learning_rate": 0.0002502222222222222,
"loss": 3.1373,
"step": 100050
},
{
"epoch": 29.17548973880597,
"grad_norm": 0.4415283203125,
"learning_rate": 0.00025004724409448815,
"loss": 3.1204,
"step": 100100
},
{
"epoch": 29.190065298507463,
"grad_norm": 0.41481301188468933,
"learning_rate": 0.00024987226596675415,
"loss": 3.1203,
"step": 100150
},
{
"epoch": 29.204640858208954,
"grad_norm": 0.469487726688385,
"learning_rate": 0.0002496972878390201,
"loss": 3.1224,
"step": 100200
},
{
"epoch": 29.21921641791045,
"grad_norm": 0.46850454807281494,
"learning_rate": 0.0002495223097112861,
"loss": 3.1309,
"step": 100250
},
{
"epoch": 29.23379197761194,
"grad_norm": 0.4412026107311249,
"learning_rate": 0.00024934733158355203,
"loss": 3.1222,
"step": 100300
},
{
"epoch": 29.24836753731343,
"grad_norm": 0.43051907420158386,
"learning_rate": 0.00024917235345581803,
"loss": 3.1292,
"step": 100350
},
{
"epoch": 29.262943097014926,
"grad_norm": 0.4601272642612457,
"learning_rate": 0.000248997375328084,
"loss": 3.1311,
"step": 100400
},
{
"epoch": 29.277518656716417,
"grad_norm": 0.4913773834705353,
"learning_rate": 0.0002488223972003499,
"loss": 3.1256,
"step": 100450
},
{
"epoch": 29.29209421641791,
"grad_norm": 0.4545249938964844,
"learning_rate": 0.0002486474190726159,
"loss": 3.1333,
"step": 100500
},
{
"epoch": 29.306669776119403,
"grad_norm": 0.439251571893692,
"learning_rate": 0.00024847244094488186,
"loss": 3.1338,
"step": 100550
},
{
"epoch": 29.321245335820894,
"grad_norm": 0.4394617974758148,
"learning_rate": 0.0002482974628171478,
"loss": 3.1343,
"step": 100600
},
{
"epoch": 29.33582089552239,
"grad_norm": 0.4542093873023987,
"learning_rate": 0.0002481224846894138,
"loss": 3.151,
"step": 100650
},
{
"epoch": 29.35039645522388,
"grad_norm": 0.45754221081733704,
"learning_rate": 0.00024794750656167975,
"loss": 3.1493,
"step": 100700
},
{
"epoch": 29.364972014925375,
"grad_norm": 0.4488230049610138,
"learning_rate": 0.00024777252843394574,
"loss": 3.1521,
"step": 100750
},
{
"epoch": 29.379547574626866,
"grad_norm": 0.45819759368896484,
"learning_rate": 0.0002475975503062117,
"loss": 3.1485,
"step": 100800
},
{
"epoch": 29.394123134328357,
"grad_norm": 0.4417685270309448,
"learning_rate": 0.0002474225721784777,
"loss": 3.147,
"step": 100850
},
{
"epoch": 29.40869869402985,
"grad_norm": 0.4662354588508606,
"learning_rate": 0.00024724759405074363,
"loss": 3.1439,
"step": 100900
},
{
"epoch": 29.423274253731343,
"grad_norm": 0.46339571475982666,
"learning_rate": 0.00024707261592300957,
"loss": 3.1516,
"step": 100950
},
{
"epoch": 29.437849813432837,
"grad_norm": 0.45555153489112854,
"learning_rate": 0.00024689763779527557,
"loss": 3.1412,
"step": 101000
},
{
"epoch": 29.437849813432837,
"eval_accuracy": 0.3736002449038104,
"eval_loss": 3.554413080215454,
"eval_runtime": 81.1357,
"eval_samples_per_second": 204.928,
"eval_steps_per_second": 12.818,
"step": 101000
},
{
"epoch": 29.45242537313433,
"grad_norm": 0.4727424681186676,
"learning_rate": 0.0002467226596675415,
"loss": 3.1543,
"step": 101050
},
{
"epoch": 29.46700093283582,
"grad_norm": 0.43927642703056335,
"learning_rate": 0.0002465476815398075,
"loss": 3.1567,
"step": 101100
},
{
"epoch": 29.481576492537314,
"grad_norm": 0.4496012330055237,
"learning_rate": 0.00024637270341207345,
"loss": 3.1537,
"step": 101150
},
{
"epoch": 29.496152052238806,
"grad_norm": 0.49772119522094727,
"learning_rate": 0.00024619772528433945,
"loss": 3.1574,
"step": 101200
},
{
"epoch": 29.510727611940297,
"grad_norm": 0.44245609641075134,
"learning_rate": 0.0002460227471566054,
"loss": 3.1533,
"step": 101250
},
{
"epoch": 29.52530317164179,
"grad_norm": 0.4604763090610504,
"learning_rate": 0.0002458477690288714,
"loss": 3.1584,
"step": 101300
},
{
"epoch": 29.539878731343283,
"grad_norm": 0.4395926296710968,
"learning_rate": 0.00024567279090113734,
"loss": 3.1496,
"step": 101350
},
{
"epoch": 29.554454291044777,
"grad_norm": 0.4405112862586975,
"learning_rate": 0.00024549781277340333,
"loss": 3.1496,
"step": 101400
},
{
"epoch": 29.56902985074627,
"grad_norm": 0.4369037449359894,
"learning_rate": 0.0002453228346456693,
"loss": 3.1631,
"step": 101450
},
{
"epoch": 29.58360541044776,
"grad_norm": 0.4483332931995392,
"learning_rate": 0.0002451478565179353,
"loss": 3.1673,
"step": 101500
},
{
"epoch": 29.598180970149254,
"grad_norm": 0.4601835310459137,
"learning_rate": 0.0002449728783902012,
"loss": 3.1583,
"step": 101550
},
{
"epoch": 29.612756529850746,
"grad_norm": 0.4867148697376251,
"learning_rate": 0.00024479790026246716,
"loss": 3.1646,
"step": 101600
},
{
"epoch": 29.62733208955224,
"grad_norm": 0.42792022228240967,
"learning_rate": 0.00024462292213473316,
"loss": 3.1623,
"step": 101650
},
{
"epoch": 29.64190764925373,
"grad_norm": 0.4424441158771515,
"learning_rate": 0.0002444479440069991,
"loss": 3.1556,
"step": 101700
},
{
"epoch": 29.656483208955223,
"grad_norm": 0.46660882234573364,
"learning_rate": 0.00024427296587926505,
"loss": 3.1703,
"step": 101750
},
{
"epoch": 29.671058768656717,
"grad_norm": 0.4472818374633789,
"learning_rate": 0.00024409798775153102,
"loss": 3.1639,
"step": 101800
},
{
"epoch": 29.68563432835821,
"grad_norm": 0.4481259286403656,
"learning_rate": 0.00024392300962379701,
"loss": 3.1763,
"step": 101850
},
{
"epoch": 29.700209888059703,
"grad_norm": 0.4559357762336731,
"learning_rate": 0.00024374803149606296,
"loss": 3.1714,
"step": 101900
},
{
"epoch": 29.714785447761194,
"grad_norm": 0.4513476490974426,
"learning_rate": 0.00024357305336832893,
"loss": 3.1579,
"step": 101950
},
{
"epoch": 29.729361007462686,
"grad_norm": 0.43404948711395264,
"learning_rate": 0.0002433980752405949,
"loss": 3.1537,
"step": 102000
},
{
"epoch": 29.729361007462686,
"eval_accuracy": 0.37413376517128377,
"eval_loss": 3.5457191467285156,
"eval_runtime": 113.0692,
"eval_samples_per_second": 147.052,
"eval_steps_per_second": 9.198,
"step": 102000
},
{
"epoch": 29.74393656716418,
"grad_norm": 0.44890788197517395,
"learning_rate": 0.00024322309711286087,
"loss": 3.1716,
"step": 102050
},
{
"epoch": 29.75851212686567,
"grad_norm": 0.44202056527137756,
"learning_rate": 0.00024304811898512684,
"loss": 3.1689,
"step": 102100
},
{
"epoch": 29.773087686567163,
"grad_norm": 0.46225395798683167,
"learning_rate": 0.0002428731408573928,
"loss": 3.1651,
"step": 102150
},
{
"epoch": 29.787663246268657,
"grad_norm": 0.42368146777153015,
"learning_rate": 0.00024269816272965875,
"loss": 3.1662,
"step": 102200
},
{
"epoch": 29.80223880597015,
"grad_norm": 0.44670891761779785,
"learning_rate": 0.00024252318460192475,
"loss": 3.1722,
"step": 102250
},
{
"epoch": 29.816814365671643,
"grad_norm": 0.44104140996932983,
"learning_rate": 0.0002423482064741907,
"loss": 3.1713,
"step": 102300
},
{
"epoch": 29.831389925373134,
"grad_norm": 0.43549516797065735,
"learning_rate": 0.0002421732283464567,
"loss": 3.182,
"step": 102350
},
{
"epoch": 29.845965485074625,
"grad_norm": 0.4488188624382019,
"learning_rate": 0.00024199825021872264,
"loss": 3.1851,
"step": 102400
},
{
"epoch": 29.86054104477612,
"grad_norm": 0.46105653047561646,
"learning_rate": 0.0002418232720909886,
"loss": 3.1763,
"step": 102450
},
{
"epoch": 29.87511660447761,
"grad_norm": 0.44759291410446167,
"learning_rate": 0.00024164829396325458,
"loss": 3.1744,
"step": 102500
},
{
"epoch": 29.889692164179106,
"grad_norm": 0.45466265082359314,
"learning_rate": 0.00024147331583552055,
"loss": 3.1666,
"step": 102550
},
{
"epoch": 29.904267723880597,
"grad_norm": 0.48013535141944885,
"learning_rate": 0.0002412983377077865,
"loss": 3.1687,
"step": 102600
},
{
"epoch": 29.91884328358209,
"grad_norm": 0.4551558196544647,
"learning_rate": 0.0002411233595800525,
"loss": 3.1802,
"step": 102650
},
{
"epoch": 29.933418843283583,
"grad_norm": 0.46371686458587646,
"learning_rate": 0.00024094838145231843,
"loss": 3.1774,
"step": 102700
},
{
"epoch": 29.947994402985074,
"grad_norm": 0.4950931668281555,
"learning_rate": 0.0002407734033245844,
"loss": 3.1714,
"step": 102750
},
{
"epoch": 29.962569962686565,
"grad_norm": 0.45876121520996094,
"learning_rate": 0.00024059842519685037,
"loss": 3.1855,
"step": 102800
},
{
"epoch": 29.97714552238806,
"grad_norm": 0.4418525695800781,
"learning_rate": 0.00024042344706911632,
"loss": 3.1874,
"step": 102850
},
{
"epoch": 29.99172108208955,
"grad_norm": 0.4665625989437103,
"learning_rate": 0.00024024846894138232,
"loss": 3.1899,
"step": 102900
},
{
"epoch": 30.006121735074625,
"grad_norm": 0.4641040861606598,
"learning_rate": 0.00024007349081364826,
"loss": 3.1525,
"step": 102950
},
{
"epoch": 30.02069729477612,
"grad_norm": 0.4659494459629059,
"learning_rate": 0.00023989851268591426,
"loss": 3.0842,
"step": 103000
},
{
"epoch": 30.02069729477612,
"eval_accuracy": 0.3736993457344331,
"eval_loss": 3.555461883544922,
"eval_runtime": 80.9964,
"eval_samples_per_second": 205.281,
"eval_steps_per_second": 12.84,
"step": 103000
},
{
"epoch": 30.03527285447761,
"grad_norm": 0.46675172448158264,
"learning_rate": 0.0002397235345581802,
"loss": 3.0937,
"step": 103050
},
{
"epoch": 30.049848414179106,
"grad_norm": 0.4507794678211212,
"learning_rate": 0.00023954855643044617,
"loss": 3.0836,
"step": 103100
},
{
"epoch": 30.064423973880597,
"grad_norm": 0.45438939332962036,
"learning_rate": 0.00023937357830271214,
"loss": 3.0987,
"step": 103150
},
{
"epoch": 30.07899953358209,
"grad_norm": 0.44462865591049194,
"learning_rate": 0.0002391986001749781,
"loss": 3.0901,
"step": 103200
},
{
"epoch": 30.093575093283583,
"grad_norm": 0.46132996678352356,
"learning_rate": 0.00023902362204724406,
"loss": 3.1067,
"step": 103250
},
{
"epoch": 30.108150652985074,
"grad_norm": 0.45434698462486267,
"learning_rate": 0.00023884864391951005,
"loss": 3.1135,
"step": 103300
},
{
"epoch": 30.12272621268657,
"grad_norm": 0.46210694313049316,
"learning_rate": 0.000238673665791776,
"loss": 3.1125,
"step": 103350
},
{
"epoch": 30.13730177238806,
"grad_norm": 0.4344474673271179,
"learning_rate": 0.000238498687664042,
"loss": 3.105,
"step": 103400
},
{
"epoch": 30.15187733208955,
"grad_norm": 0.4553607404232025,
"learning_rate": 0.00023832370953630794,
"loss": 3.1198,
"step": 103450
},
{
"epoch": 30.166452891791046,
"grad_norm": 0.4717164933681488,
"learning_rate": 0.00023814873140857394,
"loss": 3.1242,
"step": 103500
},
{
"epoch": 30.181028451492537,
"grad_norm": 0.45845305919647217,
"learning_rate": 0.00023797375328083988,
"loss": 3.117,
"step": 103550
},
{
"epoch": 30.19560401119403,
"grad_norm": 0.45949000120162964,
"learning_rate": 0.00023779877515310582,
"loss": 3.1081,
"step": 103600
},
{
"epoch": 30.210179570895523,
"grad_norm": 0.4732789695262909,
"learning_rate": 0.00023762379702537182,
"loss": 3.1121,
"step": 103650
},
{
"epoch": 30.224755130597014,
"grad_norm": 0.4642253518104553,
"learning_rate": 0.00023744881889763776,
"loss": 3.1267,
"step": 103700
},
{
"epoch": 30.23933069029851,
"grad_norm": 0.47492390871047974,
"learning_rate": 0.00023727384076990373,
"loss": 3.1203,
"step": 103750
},
{
"epoch": 30.25390625,
"grad_norm": 0.47091248631477356,
"learning_rate": 0.0002370988626421697,
"loss": 3.1275,
"step": 103800
},
{
"epoch": 30.26848180970149,
"grad_norm": 0.4485725164413452,
"learning_rate": 0.00023692388451443568,
"loss": 3.1242,
"step": 103850
},
{
"epoch": 30.283057369402986,
"grad_norm": 0.43855518102645874,
"learning_rate": 0.00023674890638670162,
"loss": 3.1274,
"step": 103900
},
{
"epoch": 30.297632929104477,
"grad_norm": 0.46146199107170105,
"learning_rate": 0.00023657392825896762,
"loss": 3.1225,
"step": 103950
},
{
"epoch": 30.31220848880597,
"grad_norm": 0.45113933086395264,
"learning_rate": 0.00023639895013123356,
"loss": 3.1355,
"step": 104000
},
{
"epoch": 30.31220848880597,
"eval_accuracy": 0.373506322738921,
"eval_loss": 3.556199550628662,
"eval_runtime": 80.9658,
"eval_samples_per_second": 205.358,
"eval_steps_per_second": 12.845,
"step": 104000
},
{
"epoch": 30.326784048507463,
"grad_norm": 0.46137019991874695,
"learning_rate": 0.00023622397200349956,
"loss": 3.1415,
"step": 104050
},
{
"epoch": 30.341359608208954,
"grad_norm": 0.4407903254032135,
"learning_rate": 0.0002360489938757655,
"loss": 3.1313,
"step": 104100
},
{
"epoch": 30.35593516791045,
"grad_norm": 0.47344106435775757,
"learning_rate": 0.0002358740157480315,
"loss": 3.1358,
"step": 104150
},
{
"epoch": 30.37051072761194,
"grad_norm": 0.49034038186073303,
"learning_rate": 0.00023569903762029744,
"loss": 3.1396,
"step": 104200
},
{
"epoch": 30.385086287313435,
"grad_norm": 0.4473479092121124,
"learning_rate": 0.0002355240594925634,
"loss": 3.1308,
"step": 104250
},
{
"epoch": 30.399661847014926,
"grad_norm": 0.4641791582107544,
"learning_rate": 0.00023534908136482938,
"loss": 3.1368,
"step": 104300
},
{
"epoch": 30.414237406716417,
"grad_norm": 0.4826582670211792,
"learning_rate": 0.00023517410323709535,
"loss": 3.135,
"step": 104350
},
{
"epoch": 30.42881296641791,
"grad_norm": 0.4261675775051117,
"learning_rate": 0.0002349991251093613,
"loss": 3.1401,
"step": 104400
},
{
"epoch": 30.443388526119403,
"grad_norm": 0.4646788239479065,
"learning_rate": 0.0002348241469816273,
"loss": 3.1331,
"step": 104450
},
{
"epoch": 30.457964085820894,
"grad_norm": 0.4778652787208557,
"learning_rate": 0.00023464916885389324,
"loss": 3.14,
"step": 104500
},
{
"epoch": 30.47253964552239,
"grad_norm": 0.4688096046447754,
"learning_rate": 0.00023447419072615918,
"loss": 3.1459,
"step": 104550
},
{
"epoch": 30.48711520522388,
"grad_norm": 0.4571461081504822,
"learning_rate": 0.00023429921259842518,
"loss": 3.1536,
"step": 104600
},
{
"epoch": 30.501690764925375,
"grad_norm": 0.46317562460899353,
"learning_rate": 0.00023412423447069112,
"loss": 3.1514,
"step": 104650
},
{
"epoch": 30.516266324626866,
"grad_norm": 0.440745085477829,
"learning_rate": 0.00023394925634295712,
"loss": 3.151,
"step": 104700
},
{
"epoch": 30.530841884328357,
"grad_norm": 0.4459381103515625,
"learning_rate": 0.00023377427821522306,
"loss": 3.1491,
"step": 104750
},
{
"epoch": 30.54541744402985,
"grad_norm": 0.43897759914398193,
"learning_rate": 0.00023359930008748906,
"loss": 3.1535,
"step": 104800
},
{
"epoch": 30.559993003731343,
"grad_norm": 0.45239999890327454,
"learning_rate": 0.000233424321959755,
"loss": 3.1449,
"step": 104850
},
{
"epoch": 30.574568563432837,
"grad_norm": 0.46996092796325684,
"learning_rate": 0.00023324934383202098,
"loss": 3.1482,
"step": 104900
},
{
"epoch": 30.58914412313433,
"grad_norm": 0.4657279849052429,
"learning_rate": 0.00023307436570428695,
"loss": 3.1442,
"step": 104950
},
{
"epoch": 30.60371968283582,
"grad_norm": 0.4449421763420105,
"learning_rate": 0.00023289938757655292,
"loss": 3.1478,
"step": 105000
},
{
"epoch": 30.60371968283582,
"eval_accuracy": 0.37399841368052833,
"eval_loss": 3.5493507385253906,
"eval_runtime": 81.065,
"eval_samples_per_second": 205.107,
"eval_steps_per_second": 12.829,
"step": 105000
},
{
"epoch": 30.618295242537314,
"grad_norm": 0.4685562252998352,
"learning_rate": 0.00023272440944881886,
"loss": 3.1475,
"step": 105050
},
{
"epoch": 30.632870802238806,
"grad_norm": 0.46623244881629944,
"learning_rate": 0.00023254943132108486,
"loss": 3.1542,
"step": 105100
},
{
"epoch": 30.647446361940297,
"grad_norm": 0.4504511058330536,
"learning_rate": 0.0002323744531933508,
"loss": 3.1613,
"step": 105150
},
{
"epoch": 30.66202192164179,
"grad_norm": 0.4688693881034851,
"learning_rate": 0.0002321994750656168,
"loss": 3.166,
"step": 105200
},
{
"epoch": 30.676597481343283,
"grad_norm": 0.4453798830509186,
"learning_rate": 0.00023202449693788274,
"loss": 3.162,
"step": 105250
},
{
"epoch": 30.691173041044777,
"grad_norm": 0.4550257921218872,
"learning_rate": 0.00023184951881014871,
"loss": 3.1528,
"step": 105300
},
{
"epoch": 30.70574860074627,
"grad_norm": 0.4807048439979553,
"learning_rate": 0.00023167454068241468,
"loss": 3.1511,
"step": 105350
},
{
"epoch": 30.72032416044776,
"grad_norm": 0.44397810101509094,
"learning_rate": 0.00023149956255468063,
"loss": 3.1635,
"step": 105400
},
{
"epoch": 30.734899720149254,
"grad_norm": 0.4527873992919922,
"learning_rate": 0.00023132458442694663,
"loss": 3.1681,
"step": 105450
},
{
"epoch": 30.749475279850746,
"grad_norm": 0.44800546765327454,
"learning_rate": 0.00023114960629921257,
"loss": 3.1506,
"step": 105500
},
{
"epoch": 30.76405083955224,
"grad_norm": 0.4325464367866516,
"learning_rate": 0.00023097462817147854,
"loss": 3.1631,
"step": 105550
},
{
"epoch": 30.77862639925373,
"grad_norm": 0.4772892892360687,
"learning_rate": 0.0002307996500437445,
"loss": 3.1606,
"step": 105600
},
{
"epoch": 30.793201958955223,
"grad_norm": 0.46441707015037537,
"learning_rate": 0.00023062467191601048,
"loss": 3.1652,
"step": 105650
},
{
"epoch": 30.807777518656717,
"grad_norm": 0.5042896270751953,
"learning_rate": 0.00023044969378827642,
"loss": 3.1667,
"step": 105700
},
{
"epoch": 30.82235307835821,
"grad_norm": 0.47036123275756836,
"learning_rate": 0.00023027471566054242,
"loss": 3.164,
"step": 105750
},
{
"epoch": 30.836928638059703,
"grad_norm": 0.45908039808273315,
"learning_rate": 0.00023009973753280837,
"loss": 3.174,
"step": 105800
},
{
"epoch": 30.851504197761194,
"grad_norm": 0.46668148040771484,
"learning_rate": 0.00022992475940507436,
"loss": 3.1733,
"step": 105850
},
{
"epoch": 30.866079757462686,
"grad_norm": 0.4751679301261902,
"learning_rate": 0.0002297497812773403,
"loss": 3.1681,
"step": 105900
},
{
"epoch": 30.88065531716418,
"grad_norm": 0.46648067235946655,
"learning_rate": 0.00022957480314960628,
"loss": 3.167,
"step": 105950
},
{
"epoch": 30.89523087686567,
"grad_norm": 0.4647907018661499,
"learning_rate": 0.00022939982502187225,
"loss": 3.166,
"step": 106000
},
{
"epoch": 30.89523087686567,
"eval_accuracy": 0.37426323181461507,
"eval_loss": 3.54299259185791,
"eval_runtime": 80.981,
"eval_samples_per_second": 205.32,
"eval_steps_per_second": 12.843,
"step": 106000
},
{
"epoch": 30.909806436567163,
"grad_norm": 0.5013905763626099,
"learning_rate": 0.00022922484689413822,
"loss": 3.1729,
"step": 106050
},
{
"epoch": 30.924381996268657,
"grad_norm": 0.43408867716789246,
"learning_rate": 0.0002290498687664042,
"loss": 3.1664,
"step": 106100
},
{
"epoch": 30.93895755597015,
"grad_norm": 0.476228266954422,
"learning_rate": 0.00022887489063867016,
"loss": 3.1778,
"step": 106150
},
{
"epoch": 30.953533115671643,
"grad_norm": 0.4531143009662628,
"learning_rate": 0.0002286999125109361,
"loss": 3.1769,
"step": 106200
},
{
"epoch": 30.968108675373134,
"grad_norm": 0.4749349057674408,
"learning_rate": 0.0002285249343832021,
"loss": 3.1645,
"step": 106250
},
{
"epoch": 30.982684235074625,
"grad_norm": 0.454899400472641,
"learning_rate": 0.00022834995625546804,
"loss": 3.1611,
"step": 106300
},
{
"epoch": 30.99725979477612,
"grad_norm": 0.4563228487968445,
"learning_rate": 0.000228174978127734,
"loss": 3.1717,
"step": 106350
},
{
"epoch": 31.011660447761194,
"grad_norm": 0.4625491499900818,
"learning_rate": 0.00022799999999999999,
"loss": 3.102,
"step": 106400
},
{
"epoch": 31.026236007462686,
"grad_norm": 0.4476287364959717,
"learning_rate": 0.00022782502187226593,
"loss": 3.0799,
"step": 106450
},
{
"epoch": 31.04081156716418,
"grad_norm": 0.4789726436138153,
"learning_rate": 0.00022765004374453193,
"loss": 3.0748,
"step": 106500
},
{
"epoch": 31.05538712686567,
"grad_norm": 0.45199573040008545,
"learning_rate": 0.00022747506561679787,
"loss": 3.0959,
"step": 106550
},
{
"epoch": 31.069962686567163,
"grad_norm": 0.4647940993309021,
"learning_rate": 0.00022730008748906384,
"loss": 3.1029,
"step": 106600
},
{
"epoch": 31.084538246268657,
"grad_norm": 0.48996156454086304,
"learning_rate": 0.0002271251093613298,
"loss": 3.0849,
"step": 106650
},
{
"epoch": 31.09911380597015,
"grad_norm": 0.45512181520462036,
"learning_rate": 0.00022695013123359578,
"loss": 3.0943,
"step": 106700
},
{
"epoch": 31.113689365671643,
"grad_norm": 0.4582662582397461,
"learning_rate": 0.00022677515310586175,
"loss": 3.0874,
"step": 106750
},
{
"epoch": 31.128264925373134,
"grad_norm": 0.45606929063796997,
"learning_rate": 0.00022660017497812772,
"loss": 3.1053,
"step": 106800
},
{
"epoch": 31.142840485074625,
"grad_norm": 0.4730152189731598,
"learning_rate": 0.00022642519685039367,
"loss": 3.0969,
"step": 106850
},
{
"epoch": 31.15741604477612,
"grad_norm": 0.5055202841758728,
"learning_rate": 0.00022625021872265966,
"loss": 3.1064,
"step": 106900
},
{
"epoch": 31.17199160447761,
"grad_norm": 0.4526134729385376,
"learning_rate": 0.0002260752405949256,
"loss": 3.1017,
"step": 106950
},
{
"epoch": 31.186567164179106,
"grad_norm": 0.47562670707702637,
"learning_rate": 0.0002259002624671916,
"loss": 3.113,
"step": 107000
},
{
"epoch": 31.186567164179106,
"eval_accuracy": 0.3738705947944758,
"eval_loss": 3.557934284210205,
"eval_runtime": 80.9833,
"eval_samples_per_second": 205.314,
"eval_steps_per_second": 12.842,
"step": 107000
},
{
"epoch": 31.201142723880597,
"grad_norm": 0.4674347937107086,
"learning_rate": 0.00022572528433945755,
"loss": 3.1066,
"step": 107050
},
{
"epoch": 31.21571828358209,
"grad_norm": 0.44288942217826843,
"learning_rate": 0.00022555030621172352,
"loss": 3.109,
"step": 107100
},
{
"epoch": 31.230293843283583,
"grad_norm": 0.48940151929855347,
"learning_rate": 0.0002253753280839895,
"loss": 3.1168,
"step": 107150
},
{
"epoch": 31.244869402985074,
"grad_norm": 0.45776811242103577,
"learning_rate": 0.00022520034995625543,
"loss": 3.1157,
"step": 107200
},
{
"epoch": 31.259444962686565,
"grad_norm": 0.4868299067020416,
"learning_rate": 0.0002250253718285214,
"loss": 3.113,
"step": 107250
},
{
"epoch": 31.27402052238806,
"grad_norm": 0.4617083966732025,
"learning_rate": 0.00022485039370078737,
"loss": 3.1225,
"step": 107300
},
{
"epoch": 31.28859608208955,
"grad_norm": 0.4782308340072632,
"learning_rate": 0.00022467541557305335,
"loss": 3.1277,
"step": 107350
},
{
"epoch": 31.303171641791046,
"grad_norm": 0.4509190320968628,
"learning_rate": 0.00022450043744531932,
"loss": 3.126,
"step": 107400
},
{
"epoch": 31.317747201492537,
"grad_norm": 0.45488241314888,
"learning_rate": 0.0002243254593175853,
"loss": 3.1286,
"step": 107450
},
{
"epoch": 31.33232276119403,
"grad_norm": 0.47506406903266907,
"learning_rate": 0.00022415048118985123,
"loss": 3.1146,
"step": 107500
},
{
"epoch": 31.346898320895523,
"grad_norm": 0.4671951234340668,
"learning_rate": 0.00022397550306211723,
"loss": 3.1226,
"step": 107550
},
{
"epoch": 31.361473880597014,
"grad_norm": 0.4664361774921417,
"learning_rate": 0.00022380052493438317,
"loss": 3.1306,
"step": 107600
},
{
"epoch": 31.37604944029851,
"grad_norm": 0.49000778794288635,
"learning_rate": 0.00022362554680664917,
"loss": 3.1322,
"step": 107650
},
{
"epoch": 31.390625,
"grad_norm": 0.49472030997276306,
"learning_rate": 0.0002234505686789151,
"loss": 3.1277,
"step": 107700
},
{
"epoch": 31.40520055970149,
"grad_norm": 0.4811898171901703,
"learning_rate": 0.00022327559055118108,
"loss": 3.1323,
"step": 107750
},
{
"epoch": 31.419776119402986,
"grad_norm": 0.4730585515499115,
"learning_rate": 0.00022310061242344705,
"loss": 3.1362,
"step": 107800
},
{
"epoch": 31.434351679104477,
"grad_norm": 0.4924510419368744,
"learning_rate": 0.00022292563429571302,
"loss": 3.1341,
"step": 107850
},
{
"epoch": 31.44892723880597,
"grad_norm": 0.45863619446754456,
"learning_rate": 0.00022275065616797897,
"loss": 3.1376,
"step": 107900
},
{
"epoch": 31.463502798507463,
"grad_norm": 0.4936143457889557,
"learning_rate": 0.00022257567804024497,
"loss": 3.1326,
"step": 107950
},
{
"epoch": 31.478078358208954,
"grad_norm": 0.4574803411960602,
"learning_rate": 0.0002224006999125109,
"loss": 3.1249,
"step": 108000
},
{
"epoch": 31.478078358208954,
"eval_accuracy": 0.3737095853689511,
"eval_loss": 3.5542073249816895,
"eval_runtime": 80.9813,
"eval_samples_per_second": 205.319,
"eval_steps_per_second": 12.842,
"step": 108000
},
{
"epoch": 31.49265391791045,
"grad_norm": 0.4639244079589844,
"learning_rate": 0.0002222257217847769,
"loss": 3.1312,
"step": 108050
},
{
"epoch": 31.50722947761194,
"grad_norm": 0.48924142122268677,
"learning_rate": 0.00022205074365704285,
"loss": 3.1482,
"step": 108100
},
{
"epoch": 31.521805037313435,
"grad_norm": 0.4623509347438812,
"learning_rate": 0.0002218757655293088,
"loss": 3.146,
"step": 108150
},
{
"epoch": 31.536380597014926,
"grad_norm": 0.46016931533813477,
"learning_rate": 0.0002217007874015748,
"loss": 3.147,
"step": 108200
},
{
"epoch": 31.550956156716417,
"grad_norm": 0.4692668318748474,
"learning_rate": 0.00022152580927384073,
"loss": 3.1318,
"step": 108250
},
{
"epoch": 31.56553171641791,
"grad_norm": 0.4955618977546692,
"learning_rate": 0.00022135083114610673,
"loss": 3.1512,
"step": 108300
},
{
"epoch": 31.580107276119403,
"grad_norm": 0.45453202724456787,
"learning_rate": 0.00022117585301837268,
"loss": 3.1565,
"step": 108350
},
{
"epoch": 31.594682835820894,
"grad_norm": 0.4675719141960144,
"learning_rate": 0.00022100087489063865,
"loss": 3.1458,
"step": 108400
},
{
"epoch": 31.60925839552239,
"grad_norm": 0.4647286832332611,
"learning_rate": 0.00022082589676290462,
"loss": 3.1483,
"step": 108450
},
{
"epoch": 31.62383395522388,
"grad_norm": 0.46763309836387634,
"learning_rate": 0.0002206509186351706,
"loss": 3.1558,
"step": 108500
},
{
"epoch": 31.638409514925375,
"grad_norm": 0.4608073830604553,
"learning_rate": 0.00022047594050743653,
"loss": 3.1411,
"step": 108550
},
{
"epoch": 31.652985074626866,
"grad_norm": 0.4496559202671051,
"learning_rate": 0.00022030096237970253,
"loss": 3.149,
"step": 108600
},
{
"epoch": 31.667560634328357,
"grad_norm": 0.4729684293270111,
"learning_rate": 0.00022012598425196847,
"loss": 3.1499,
"step": 108650
},
{
"epoch": 31.68213619402985,
"grad_norm": 0.48932141065597534,
"learning_rate": 0.00021995100612423447,
"loss": 3.1466,
"step": 108700
},
{
"epoch": 31.696711753731343,
"grad_norm": 0.46674734354019165,
"learning_rate": 0.0002197760279965004,
"loss": 3.1553,
"step": 108750
},
{
"epoch": 31.711287313432837,
"grad_norm": 0.45925435423851013,
"learning_rate": 0.0002196010498687664,
"loss": 3.1536,
"step": 108800
},
{
"epoch": 31.72586287313433,
"grad_norm": 0.4495401382446289,
"learning_rate": 0.00021942607174103235,
"loss": 3.15,
"step": 108850
},
{
"epoch": 31.74043843283582,
"grad_norm": 0.4811021387577057,
"learning_rate": 0.00021925109361329833,
"loss": 3.1417,
"step": 108900
},
{
"epoch": 31.755013992537314,
"grad_norm": 0.469148725271225,
"learning_rate": 0.0002190761154855643,
"loss": 3.1587,
"step": 108950
},
{
"epoch": 31.769589552238806,
"grad_norm": 0.5063664317131042,
"learning_rate": 0.00021890113735783024,
"loss": 3.1627,
"step": 109000
},
{
"epoch": 31.769589552238806,
"eval_accuracy": 0.37450910073999605,
"eval_loss": 3.544800043106079,
"eval_runtime": 81.0093,
"eval_samples_per_second": 205.248,
"eval_steps_per_second": 12.838,
"step": 109000
},
{
"epoch": 31.784165111940297,
"grad_norm": 0.4680740237236023,
"learning_rate": 0.0002187261592300962,
"loss": 3.1437,
"step": 109050
},
{
"epoch": 31.79874067164179,
"grad_norm": 0.465648353099823,
"learning_rate": 0.00021855118110236218,
"loss": 3.1471,
"step": 109100
},
{
"epoch": 31.813316231343283,
"grad_norm": 0.46208086609840393,
"learning_rate": 0.00021837620297462815,
"loss": 3.1582,
"step": 109150
},
{
"epoch": 31.827891791044777,
"grad_norm": 0.4603714942932129,
"learning_rate": 0.0002182012248468941,
"loss": 3.1493,
"step": 109200
},
{
"epoch": 31.84246735074627,
"grad_norm": 0.4754786491394043,
"learning_rate": 0.0002180262467191601,
"loss": 3.1541,
"step": 109250
},
{
"epoch": 31.85704291044776,
"grad_norm": 0.47437232732772827,
"learning_rate": 0.00021785126859142604,
"loss": 3.1483,
"step": 109300
},
{
"epoch": 31.871618470149254,
"grad_norm": 0.4557444453239441,
"learning_rate": 0.00021767629046369203,
"loss": 3.1499,
"step": 109350
},
{
"epoch": 31.886194029850746,
"grad_norm": 0.4662182033061981,
"learning_rate": 0.00021750131233595798,
"loss": 3.1529,
"step": 109400
},
{
"epoch": 31.90076958955224,
"grad_norm": 0.5064869523048401,
"learning_rate": 0.00021732633420822397,
"loss": 3.151,
"step": 109450
},
{
"epoch": 31.91534514925373,
"grad_norm": 0.47745054960250854,
"learning_rate": 0.00021715135608048992,
"loss": 3.1649,
"step": 109500
},
{
"epoch": 31.929920708955223,
"grad_norm": 0.45016202330589294,
"learning_rate": 0.0002169763779527559,
"loss": 3.1732,
"step": 109550
},
{
"epoch": 31.944496268656717,
"grad_norm": 0.46144726872444153,
"learning_rate": 0.00021680139982502186,
"loss": 3.1816,
"step": 109600
},
{
"epoch": 31.95907182835821,
"grad_norm": 0.4760236144065857,
"learning_rate": 0.00021662642169728783,
"loss": 3.1569,
"step": 109650
},
{
"epoch": 31.973647388059703,
"grad_norm": 0.4745504558086395,
"learning_rate": 0.00021645144356955377,
"loss": 3.16,
"step": 109700
},
{
"epoch": 31.988222947761194,
"grad_norm": 0.43345993757247925,
"learning_rate": 0.00021627646544181977,
"loss": 3.1708,
"step": 109750
},
{
"epoch": 32.00262360074627,
"grad_norm": 0.48590293526649475,
"learning_rate": 0.00021610148731408571,
"loss": 3.1383,
"step": 109800
},
{
"epoch": 32.01719916044776,
"grad_norm": 0.4533041715621948,
"learning_rate": 0.00021592650918635166,
"loss": 3.0848,
"step": 109850
},
{
"epoch": 32.03177472014925,
"grad_norm": 0.4639461636543274,
"learning_rate": 0.00021575153105861766,
"loss": 3.079,
"step": 109900
},
{
"epoch": 32.04635027985075,
"grad_norm": 0.5041998624801636,
"learning_rate": 0.0002155765529308836,
"loss": 3.0844,
"step": 109950
},
{
"epoch": 32.06092583955224,
"grad_norm": 0.45482882857322693,
"learning_rate": 0.0002154015748031496,
"loss": 3.0785,
"step": 110000
},
{
"epoch": 32.06092583955224,
"eval_accuracy": 0.3737993881406436,
"eval_loss": 3.5565266609191895,
"eval_runtime": 80.9883,
"eval_samples_per_second": 205.301,
"eval_steps_per_second": 12.841,
"step": 110000
},
{
"epoch": 32.07550139925373,
"grad_norm": 0.4785085618495941,
"learning_rate": 0.00021522659667541554,
"loss": 3.0837,
"step": 110050
},
{
"epoch": 32.09007695895522,
"grad_norm": 0.46124953031539917,
"learning_rate": 0.00021505161854768154,
"loss": 3.073,
"step": 110100
},
{
"epoch": 32.104652518656714,
"grad_norm": 0.45453038811683655,
"learning_rate": 0.00021487664041994748,
"loss": 3.0919,
"step": 110150
},
{
"epoch": 32.11922807835821,
"grad_norm": 0.47057774662971497,
"learning_rate": 0.00021470166229221345,
"loss": 3.1097,
"step": 110200
},
{
"epoch": 32.1338036380597,
"grad_norm": 0.4835250675678253,
"learning_rate": 0.00021452668416447942,
"loss": 3.0993,
"step": 110250
},
{
"epoch": 32.148379197761194,
"grad_norm": 0.4598732888698578,
"learning_rate": 0.0002143517060367454,
"loss": 3.0939,
"step": 110300
},
{
"epoch": 32.162954757462686,
"grad_norm": 0.47464901208877563,
"learning_rate": 0.00021417672790901134,
"loss": 3.0838,
"step": 110350
},
{
"epoch": 32.17753031716418,
"grad_norm": 0.4422462582588196,
"learning_rate": 0.00021400174978127733,
"loss": 3.1083,
"step": 110400
},
{
"epoch": 32.192105876865675,
"grad_norm": 0.4594365656375885,
"learning_rate": 0.00021382677165354328,
"loss": 3.1046,
"step": 110450
},
{
"epoch": 32.206681436567166,
"grad_norm": 0.49147263169288635,
"learning_rate": 0.00021365179352580928,
"loss": 3.1058,
"step": 110500
},
{
"epoch": 32.22125699626866,
"grad_norm": 0.46081140637397766,
"learning_rate": 0.00021347681539807522,
"loss": 3.1097,
"step": 110550
},
{
"epoch": 32.23583255597015,
"grad_norm": 0.4752926528453827,
"learning_rate": 0.0002133018372703412,
"loss": 3.1038,
"step": 110600
},
{
"epoch": 32.25040811567164,
"grad_norm": 0.4841436743736267,
"learning_rate": 0.00021312685914260716,
"loss": 3.1024,
"step": 110650
},
{
"epoch": 32.26498367537314,
"grad_norm": 0.49166467785835266,
"learning_rate": 0.00021295188101487313,
"loss": 3.1066,
"step": 110700
},
{
"epoch": 32.27955923507463,
"grad_norm": 0.48016849160194397,
"learning_rate": 0.0002127769028871391,
"loss": 3.1129,
"step": 110750
},
{
"epoch": 32.29413479477612,
"grad_norm": 0.47091442346572876,
"learning_rate": 0.00021260192475940504,
"loss": 3.1195,
"step": 110800
},
{
"epoch": 32.30871035447761,
"grad_norm": 0.49015673995018005,
"learning_rate": 0.00021242694663167102,
"loss": 3.116,
"step": 110850
},
{
"epoch": 32.3232859141791,
"grad_norm": 0.4439619481563568,
"learning_rate": 0.00021225196850393699,
"loss": 3.1255,
"step": 110900
},
{
"epoch": 32.337861473880594,
"grad_norm": 0.4515083134174347,
"learning_rate": 0.00021207699037620296,
"loss": 3.1216,
"step": 110950
},
{
"epoch": 32.35243703358209,
"grad_norm": 0.47670596837997437,
"learning_rate": 0.0002119020122484689,
"loss": 3.1107,
"step": 111000
},
{
"epoch": 32.35243703358209,
"eval_accuracy": 0.37408480324071486,
"eval_loss": 3.5553367137908936,
"eval_runtime": 81.0462,
"eval_samples_per_second": 205.155,
"eval_steps_per_second": 12.832,
"step": 111000
},
{
"epoch": 32.36701259328358,
"grad_norm": 0.5058426260948181,
"learning_rate": 0.0002117270341207349,
"loss": 3.1131,
"step": 111050
},
{
"epoch": 32.381588152985074,
"grad_norm": 0.47986745834350586,
"learning_rate": 0.00021155205599300084,
"loss": 3.1077,
"step": 111100
},
{
"epoch": 32.396163712686565,
"grad_norm": 0.46111610531806946,
"learning_rate": 0.00021137707786526684,
"loss": 3.1129,
"step": 111150
},
{
"epoch": 32.41073927238806,
"grad_norm": 0.439547061920166,
"learning_rate": 0.00021120209973753278,
"loss": 3.1247,
"step": 111200
},
{
"epoch": 32.425314832089555,
"grad_norm": 0.466490238904953,
"learning_rate": 0.00021102712160979875,
"loss": 3.1274,
"step": 111250
},
{
"epoch": 32.439890391791046,
"grad_norm": 0.48718830943107605,
"learning_rate": 0.00021085214348206472,
"loss": 3.1277,
"step": 111300
},
{
"epoch": 32.45446595149254,
"grad_norm": 0.47072693705558777,
"learning_rate": 0.0002106771653543307,
"loss": 3.1258,
"step": 111350
},
{
"epoch": 32.46904151119403,
"grad_norm": 0.47657322883605957,
"learning_rate": 0.00021050218722659666,
"loss": 3.1353,
"step": 111400
},
{
"epoch": 32.48361707089552,
"grad_norm": 0.49400582909584045,
"learning_rate": 0.00021032720909886264,
"loss": 3.1181,
"step": 111450
},
{
"epoch": 32.49819263059702,
"grad_norm": 0.47606533765792847,
"learning_rate": 0.00021015223097112858,
"loss": 3.1306,
"step": 111500
},
{
"epoch": 32.51276819029851,
"grad_norm": 0.4946952164173126,
"learning_rate": 0.00020997725284339458,
"loss": 3.1323,
"step": 111550
},
{
"epoch": 32.52734375,
"grad_norm": 0.45804789662361145,
"learning_rate": 0.00020980227471566052,
"loss": 3.121,
"step": 111600
},
{
"epoch": 32.54191930970149,
"grad_norm": 0.4503360986709595,
"learning_rate": 0.00020962729658792646,
"loss": 3.1391,
"step": 111650
},
{
"epoch": 32.55649486940298,
"grad_norm": 0.49889054894447327,
"learning_rate": 0.00020945231846019246,
"loss": 3.1246,
"step": 111700
},
{
"epoch": 32.57107042910448,
"grad_norm": 0.46974316239356995,
"learning_rate": 0.0002092773403324584,
"loss": 3.1442,
"step": 111750
},
{
"epoch": 32.58564598880597,
"grad_norm": 0.46693581342697144,
"learning_rate": 0.0002091023622047244,
"loss": 3.139,
"step": 111800
},
{
"epoch": 32.60022154850746,
"grad_norm": 0.46260032057762146,
"learning_rate": 0.00020892738407699035,
"loss": 3.1357,
"step": 111850
},
{
"epoch": 32.614797108208954,
"grad_norm": 0.47752782702445984,
"learning_rate": 0.00020875240594925632,
"loss": 3.1336,
"step": 111900
},
{
"epoch": 32.629372667910445,
"grad_norm": 0.5107750296592712,
"learning_rate": 0.0002085774278215223,
"loss": 3.1347,
"step": 111950
},
{
"epoch": 32.64394822761194,
"grad_norm": 0.4821491539478302,
"learning_rate": 0.00020840244969378826,
"loss": 3.1327,
"step": 112000
},
{
"epoch": 32.64394822761194,
"eval_accuracy": 0.3742285112148126,
"eval_loss": 3.548354387283325,
"eval_runtime": 80.9257,
"eval_samples_per_second": 205.46,
"eval_steps_per_second": 12.851,
"step": 112000
},
{
"epoch": 32.658523787313435,
"grad_norm": 0.4635300934314728,
"learning_rate": 0.00020822747156605423,
"loss": 3.1473,
"step": 112050
},
{
"epoch": 32.673099347014926,
"grad_norm": 0.4636472761631012,
"learning_rate": 0.0002080524934383202,
"loss": 3.1355,
"step": 112100
},
{
"epoch": 32.68767490671642,
"grad_norm": 0.49074453115463257,
"learning_rate": 0.00020787751531058614,
"loss": 3.1478,
"step": 112150
},
{
"epoch": 32.70225046641791,
"grad_norm": 0.4680987000465393,
"learning_rate": 0.00020770253718285214,
"loss": 3.1419,
"step": 112200
},
{
"epoch": 32.716826026119406,
"grad_norm": 0.46409615874290466,
"learning_rate": 0.00020752755905511808,
"loss": 3.1408,
"step": 112250
},
{
"epoch": 32.7314015858209,
"grad_norm": 0.528548538684845,
"learning_rate": 0.00020735258092738408,
"loss": 3.1428,
"step": 112300
},
{
"epoch": 32.74597714552239,
"grad_norm": 0.4838976263999939,
"learning_rate": 0.00020717760279965002,
"loss": 3.1356,
"step": 112350
},
{
"epoch": 32.76055270522388,
"grad_norm": 0.4824446737766266,
"learning_rate": 0.000207002624671916,
"loss": 3.138,
"step": 112400
},
{
"epoch": 32.77512826492537,
"grad_norm": 0.48765483498573303,
"learning_rate": 0.00020682764654418197,
"loss": 3.15,
"step": 112450
},
{
"epoch": 32.78970382462687,
"grad_norm": 0.47936880588531494,
"learning_rate": 0.00020665266841644794,
"loss": 3.1503,
"step": 112500
},
{
"epoch": 32.80427938432836,
"grad_norm": 0.48564252257347107,
"learning_rate": 0.00020647769028871388,
"loss": 3.1397,
"step": 112550
},
{
"epoch": 32.81885494402985,
"grad_norm": 0.4732897877693176,
"learning_rate": 0.00020630271216097985,
"loss": 3.1484,
"step": 112600
},
{
"epoch": 32.83343050373134,
"grad_norm": 0.474088579416275,
"learning_rate": 0.00020612773403324582,
"loss": 3.1595,
"step": 112650
},
{
"epoch": 32.848006063432834,
"grad_norm": 0.4689086079597473,
"learning_rate": 0.0002059527559055118,
"loss": 3.1495,
"step": 112700
},
{
"epoch": 32.862581623134325,
"grad_norm": 0.463306725025177,
"learning_rate": 0.00020577777777777776,
"loss": 3.1528,
"step": 112750
},
{
"epoch": 32.87715718283582,
"grad_norm": 0.4703214466571808,
"learning_rate": 0.0002056027996500437,
"loss": 3.1478,
"step": 112800
},
{
"epoch": 32.891732742537314,
"grad_norm": 0.4781394302845001,
"learning_rate": 0.0002054278215223097,
"loss": 3.1519,
"step": 112850
},
{
"epoch": 32.906308302238806,
"grad_norm": 0.4915687143802643,
"learning_rate": 0.00020525284339457565,
"loss": 3.1538,
"step": 112900
},
{
"epoch": 32.9208838619403,
"grad_norm": 0.45098525285720825,
"learning_rate": 0.00020507786526684164,
"loss": 3.1422,
"step": 112950
},
{
"epoch": 32.93545942164179,
"grad_norm": 0.492906779050827,
"learning_rate": 0.0002049028871391076,
"loss": 3.1514,
"step": 113000
},
{
"epoch": 32.93545942164179,
"eval_accuracy": 0.37444248426715465,
"eval_loss": 3.546422004699707,
"eval_runtime": 80.9923,
"eval_samples_per_second": 205.291,
"eval_steps_per_second": 12.841,
"step": 113000
},
{
"epoch": 32.950034981343286,
"grad_norm": 0.46875256299972534,
"learning_rate": 0.00020472790901137356,
"loss": 3.1516,
"step": 113050
},
{
"epoch": 32.96461054104478,
"grad_norm": 0.45780235528945923,
"learning_rate": 0.00020455293088363953,
"loss": 3.1617,
"step": 113100
},
{
"epoch": 32.97918610074627,
"grad_norm": 0.466714471578598,
"learning_rate": 0.0002043779527559055,
"loss": 3.1532,
"step": 113150
},
{
"epoch": 32.99376166044776,
"grad_norm": 0.47221630811691284,
"learning_rate": 0.00020420297462817144,
"loss": 3.1665,
"step": 113200
},
{
"epoch": 33.008162313432834,
"grad_norm": 0.46910685300827026,
"learning_rate": 0.00020402799650043744,
"loss": 3.0957,
"step": 113250
},
{
"epoch": 33.022737873134325,
"grad_norm": 0.47130286693573,
"learning_rate": 0.00020385301837270338,
"loss": 3.0662,
"step": 113300
},
{
"epoch": 33.03731343283582,
"grad_norm": 0.48261669278144836,
"learning_rate": 0.00020367804024496938,
"loss": 3.0748,
"step": 113350
},
{
"epoch": 33.051888992537314,
"grad_norm": 0.46619370579719543,
"learning_rate": 0.00020350306211723533,
"loss": 3.0783,
"step": 113400
},
{
"epoch": 33.066464552238806,
"grad_norm": 0.4570696949958801,
"learning_rate": 0.00020332808398950127,
"loss": 3.0846,
"step": 113450
},
{
"epoch": 33.0810401119403,
"grad_norm": 0.4632129967212677,
"learning_rate": 0.00020315310586176727,
"loss": 3.0783,
"step": 113500
},
{
"epoch": 33.09561567164179,
"grad_norm": 0.46962153911590576,
"learning_rate": 0.0002029781277340332,
"loss": 3.0921,
"step": 113550
},
{
"epoch": 33.110191231343286,
"grad_norm": 0.4779681861400604,
"learning_rate": 0.0002028031496062992,
"loss": 3.0828,
"step": 113600
},
{
"epoch": 33.12476679104478,
"grad_norm": 0.48863279819488525,
"learning_rate": 0.00020262817147856515,
"loss": 3.0816,
"step": 113650
},
{
"epoch": 33.13934235074627,
"grad_norm": 0.4736406207084656,
"learning_rate": 0.00020245319335083112,
"loss": 3.0919,
"step": 113700
},
{
"epoch": 33.15391791044776,
"grad_norm": 0.49636760354042053,
"learning_rate": 0.0002022782152230971,
"loss": 3.0978,
"step": 113750
},
{
"epoch": 33.16849347014925,
"grad_norm": 0.48014184832572937,
"learning_rate": 0.00020210323709536306,
"loss": 3.0944,
"step": 113800
},
{
"epoch": 33.18306902985075,
"grad_norm": 0.4676609933376312,
"learning_rate": 0.000201928258967629,
"loss": 3.0811,
"step": 113850
},
{
"epoch": 33.19764458955224,
"grad_norm": 0.46274760365486145,
"learning_rate": 0.000201753280839895,
"loss": 3.1013,
"step": 113900
},
{
"epoch": 33.21222014925373,
"grad_norm": 0.47959813475608826,
"learning_rate": 0.00020157830271216095,
"loss": 3.1003,
"step": 113950
},
{
"epoch": 33.22679570895522,
"grad_norm": 0.49425938725471497,
"learning_rate": 0.00020140332458442695,
"loss": 3.1022,
"step": 114000
},
{
"epoch": 33.22679570895522,
"eval_accuracy": 0.3739919403483618,
"eval_loss": 3.5567288398742676,
"eval_runtime": 81.0289,
"eval_samples_per_second": 205.198,
"eval_steps_per_second": 12.835,
"step": 114000
},
{
"epoch": 33.241371268656714,
"grad_norm": 0.49249497056007385,
"learning_rate": 0.0002012283464566929,
"loss": 3.0883,
"step": 114050
},
{
"epoch": 33.25594682835821,
"grad_norm": 0.5082933902740479,
"learning_rate": 0.0002010533683289589,
"loss": 3.0939,
"step": 114100
},
{
"epoch": 33.2705223880597,
"grad_norm": 0.46832650899887085,
"learning_rate": 0.00020087839020122483,
"loss": 3.1014,
"step": 114150
},
{
"epoch": 33.285097947761194,
"grad_norm": 0.4847492575645447,
"learning_rate": 0.0002007034120734908,
"loss": 3.1044,
"step": 114200
},
{
"epoch": 33.299673507462686,
"grad_norm": 0.46897026896476746,
"learning_rate": 0.00020052843394575677,
"loss": 3.099,
"step": 114250
},
{
"epoch": 33.31424906716418,
"grad_norm": 0.48055335879325867,
"learning_rate": 0.00020035345581802274,
"loss": 3.106,
"step": 114300
},
{
"epoch": 33.328824626865675,
"grad_norm": 0.5022363066673279,
"learning_rate": 0.00020017847769028869,
"loss": 3.1086,
"step": 114350
},
{
"epoch": 33.343400186567166,
"grad_norm": 0.4915386140346527,
"learning_rate": 0.00020000349956255466,
"loss": 3.1036,
"step": 114400
},
{
"epoch": 33.35797574626866,
"grad_norm": 0.4777482748031616,
"learning_rate": 0.00019982852143482063,
"loss": 3.1066,
"step": 114450
},
{
"epoch": 33.37255130597015,
"grad_norm": 0.46391403675079346,
"learning_rate": 0.00019965354330708657,
"loss": 3.1072,
"step": 114500
},
{
"epoch": 33.38712686567164,
"grad_norm": 0.48125559091567993,
"learning_rate": 0.00019947856517935257,
"loss": 3.1137,
"step": 114550
},
{
"epoch": 33.40170242537314,
"grad_norm": 0.4890342354774475,
"learning_rate": 0.0001993035870516185,
"loss": 3.1129,
"step": 114600
},
{
"epoch": 33.41627798507463,
"grad_norm": 0.4634007513523102,
"learning_rate": 0.0001991286089238845,
"loss": 3.1203,
"step": 114650
},
{
"epoch": 33.43085354477612,
"grad_norm": 0.5099166035652161,
"learning_rate": 0.00019895363079615045,
"loss": 3.1126,
"step": 114700
},
{
"epoch": 33.44542910447761,
"grad_norm": 0.48662975430488586,
"learning_rate": 0.00019877865266841645,
"loss": 3.1255,
"step": 114750
},
{
"epoch": 33.4600046641791,
"grad_norm": 0.46207714080810547,
"learning_rate": 0.0001986036745406824,
"loss": 3.1308,
"step": 114800
},
{
"epoch": 33.474580223880594,
"grad_norm": 0.4985235035419464,
"learning_rate": 0.00019842869641294836,
"loss": 3.125,
"step": 114850
},
{
"epoch": 33.48915578358209,
"grad_norm": 0.511412501335144,
"learning_rate": 0.00019825371828521433,
"loss": 3.116,
"step": 114900
},
{
"epoch": 33.50373134328358,
"grad_norm": 0.47149890661239624,
"learning_rate": 0.0001980787401574803,
"loss": 3.1232,
"step": 114950
},
{
"epoch": 33.518306902985074,
"grad_norm": 0.48918309807777405,
"learning_rate": 0.00019790376202974625,
"loss": 3.1056,
"step": 115000
},
{
"epoch": 33.518306902985074,
"eval_accuracy": 0.37393838823680203,
"eval_loss": 3.5551340579986572,
"eval_runtime": 81.0751,
"eval_samples_per_second": 205.082,
"eval_steps_per_second": 12.828,
"step": 115000
},
{
"epoch": 33.532882462686565,
"grad_norm": 0.48534640669822693,
"learning_rate": 0.00019772878390201225,
"loss": 3.1205,
"step": 115050
},
{
"epoch": 33.54745802238806,
"grad_norm": 0.46837061643600464,
"learning_rate": 0.0001975538057742782,
"loss": 3.1311,
"step": 115100
},
{
"epoch": 33.562033582089555,
"grad_norm": 0.4587228000164032,
"learning_rate": 0.0001973788276465442,
"loss": 3.1203,
"step": 115150
},
{
"epoch": 33.576609141791046,
"grad_norm": 0.4965645670890808,
"learning_rate": 0.00019720384951881013,
"loss": 3.1255,
"step": 115200
},
{
"epoch": 33.59118470149254,
"grad_norm": 0.48364168405532837,
"learning_rate": 0.00019702887139107607,
"loss": 3.134,
"step": 115250
},
{
"epoch": 33.60576026119403,
"grad_norm": 0.46811044216156006,
"learning_rate": 0.00019685389326334207,
"loss": 3.1181,
"step": 115300
},
{
"epoch": 33.62033582089552,
"grad_norm": 0.48604077100753784,
"learning_rate": 0.00019667891513560802,
"loss": 3.1307,
"step": 115350
},
{
"epoch": 33.63491138059702,
"grad_norm": 0.4842829406261444,
"learning_rate": 0.00019650393700787401,
"loss": 3.1237,
"step": 115400
},
{
"epoch": 33.64948694029851,
"grad_norm": 0.45538443326950073,
"learning_rate": 0.00019632895888013996,
"loss": 3.1286,
"step": 115450
},
{
"epoch": 33.6640625,
"grad_norm": 0.49470046162605286,
"learning_rate": 0.00019615398075240593,
"loss": 3.1251,
"step": 115500
},
{
"epoch": 33.67863805970149,
"grad_norm": 0.4743928015232086,
"learning_rate": 0.0001959790026246719,
"loss": 3.1342,
"step": 115550
},
{
"epoch": 33.69321361940298,
"grad_norm": 0.4796803295612335,
"learning_rate": 0.00019580402449693787,
"loss": 3.1392,
"step": 115600
},
{
"epoch": 33.70778917910448,
"grad_norm": 0.5133796334266663,
"learning_rate": 0.0001956290463692038,
"loss": 3.1432,
"step": 115650
},
{
"epoch": 33.72236473880597,
"grad_norm": 0.5081549882888794,
"learning_rate": 0.0001954540682414698,
"loss": 3.1458,
"step": 115700
},
{
"epoch": 33.73694029850746,
"grad_norm": 0.4586086571216583,
"learning_rate": 0.00019527909011373575,
"loss": 3.1322,
"step": 115750
},
{
"epoch": 33.751515858208954,
"grad_norm": 0.48370683193206787,
"learning_rate": 0.00019510411198600175,
"loss": 3.1351,
"step": 115800
},
{
"epoch": 33.766091417910445,
"grad_norm": 0.4725615382194519,
"learning_rate": 0.0001949291338582677,
"loss": 3.1196,
"step": 115850
},
{
"epoch": 33.78066697761194,
"grad_norm": 0.4887787103652954,
"learning_rate": 0.00019475415573053367,
"loss": 3.1356,
"step": 115900
},
{
"epoch": 33.795242537313435,
"grad_norm": 0.4757724702358246,
"learning_rate": 0.00019457917760279964,
"loss": 3.1503,
"step": 115950
},
{
"epoch": 33.809818097014926,
"grad_norm": 0.45303794741630554,
"learning_rate": 0.0001944041994750656,
"loss": 3.1411,
"step": 116000
},
{
"epoch": 33.809818097014926,
"eval_accuracy": 0.37475544045317094,
"eval_loss": 3.541987657546997,
"eval_runtime": 81.014,
"eval_samples_per_second": 205.236,
"eval_steps_per_second": 12.837,
"step": 116000
},
{
"epoch": 33.809818097014926,
"step": 116000,
"total_flos": 2.424310847373312e+18,
"train_loss": 0.9799438093777361,
"train_runtime": 26721.6103,
"train_samples_per_second": 513.483,
"train_steps_per_second": 6.42
}
],
"logging_steps": 50,
"max_steps": 171550,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 14
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.424310847373312e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}