craa's picture
End of training
0f04fb5 verified
{
"best_global_step": 79000,
"best_metric": 3.5285708904266357,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_carry_3591/checkpoint-40000",
"epoch": 29.103924080111785,
"eval_steps": 1000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014555193292966931,
"grad_norm": 0.7359105348587036,
"learning_rate": 0.000294,
"loss": 8.4385,
"step": 50
},
{
"epoch": 0.029110386585933862,
"grad_norm": 0.7309923768043518,
"learning_rate": 0.0005939999999999999,
"loss": 6.728,
"step": 100
},
{
"epoch": 0.04366557987890079,
"grad_norm": 0.5441356897354126,
"learning_rate": 0.0005998287711124053,
"loss": 6.3624,
"step": 150
},
{
"epoch": 0.058220773171867725,
"grad_norm": 0.5081177949905396,
"learning_rate": 0.000599654047757717,
"loss": 6.1462,
"step": 200
},
{
"epoch": 0.07277596646483465,
"grad_norm": 0.4880349338054657,
"learning_rate": 0.0005994793244030285,
"loss": 5.9999,
"step": 250
},
{
"epoch": 0.08733115975780158,
"grad_norm": 0.44926661252975464,
"learning_rate": 0.00059930460104834,
"loss": 5.8789,
"step": 300
},
{
"epoch": 0.10188635305076851,
"grad_norm": 0.42646145820617676,
"learning_rate": 0.0005991298776936517,
"loss": 5.7411,
"step": 350
},
{
"epoch": 0.11644154634373545,
"grad_norm": 0.5302311778068542,
"learning_rate": 0.0005989551543389632,
"loss": 5.6293,
"step": 400
},
{
"epoch": 0.1309967396367024,
"grad_norm": 0.4889351427555084,
"learning_rate": 0.0005987804309842748,
"loss": 5.515,
"step": 450
},
{
"epoch": 0.1455519329296693,
"grad_norm": 0.42970505356788635,
"learning_rate": 0.0005986057076295864,
"loss": 5.4076,
"step": 500
},
{
"epoch": 0.16010712622263623,
"grad_norm": 0.4154544770717621,
"learning_rate": 0.0005984309842748981,
"loss": 5.3341,
"step": 550
},
{
"epoch": 0.17466231951560315,
"grad_norm": 0.4824340343475342,
"learning_rate": 0.0005982562609202096,
"loss": 5.2568,
"step": 600
},
{
"epoch": 0.1892175128085701,
"grad_norm": 0.42126211524009705,
"learning_rate": 0.0005980815375655212,
"loss": 5.1928,
"step": 650
},
{
"epoch": 0.20377270610153703,
"grad_norm": 0.4139862358570099,
"learning_rate": 0.0005979068142108328,
"loss": 5.1269,
"step": 700
},
{
"epoch": 0.21832789939450395,
"grad_norm": 0.48572856187820435,
"learning_rate": 0.0005977320908561445,
"loss": 5.0723,
"step": 750
},
{
"epoch": 0.2328830926874709,
"grad_norm": 0.4250296950340271,
"learning_rate": 0.000597557367501456,
"loss": 5.0148,
"step": 800
},
{
"epoch": 0.24743828598043782,
"grad_norm": 0.3736257255077362,
"learning_rate": 0.0005973826441467675,
"loss": 4.9691,
"step": 850
},
{
"epoch": 0.2619934792734048,
"grad_norm": 0.3763846457004547,
"learning_rate": 0.0005972079207920792,
"loss": 4.9133,
"step": 900
},
{
"epoch": 0.27654867256637167,
"grad_norm": 0.43584969639778137,
"learning_rate": 0.0005970331974373907,
"loss": 4.8652,
"step": 950
},
{
"epoch": 0.2911038658593386,
"grad_norm": 0.4402410686016083,
"learning_rate": 0.0005968584740827023,
"loss": 4.8353,
"step": 1000
},
{
"epoch": 0.2911038658593386,
"eval_accuracy": 0.2531285474882593,
"eval_loss": 4.762944221496582,
"eval_runtime": 179.4777,
"eval_samples_per_second": 92.775,
"eval_steps_per_second": 5.8,
"step": 1000
},
{
"epoch": 0.30565905915230557,
"grad_norm": 0.4653228223323822,
"learning_rate": 0.0005966837507280139,
"loss": 4.7825,
"step": 1050
},
{
"epoch": 0.32021425244527246,
"grad_norm": 0.44444817304611206,
"learning_rate": 0.0005965090273733256,
"loss": 4.7472,
"step": 1100
},
{
"epoch": 0.3347694457382394,
"grad_norm": 0.4688259959220886,
"learning_rate": 0.0005963343040186371,
"loss": 4.7077,
"step": 1150
},
{
"epoch": 0.3493246390312063,
"grad_norm": 0.4208512306213379,
"learning_rate": 0.0005961595806639486,
"loss": 4.6714,
"step": 1200
},
{
"epoch": 0.36387983232417326,
"grad_norm": 0.4392108619213104,
"learning_rate": 0.0005959848573092603,
"loss": 4.6226,
"step": 1250
},
{
"epoch": 0.3784350256171402,
"grad_norm": 0.5245473384857178,
"learning_rate": 0.0005958101339545718,
"loss": 4.6081,
"step": 1300
},
{
"epoch": 0.3929902189101071,
"grad_norm": 0.42627111077308655,
"learning_rate": 0.0005956354105998835,
"loss": 4.5746,
"step": 1350
},
{
"epoch": 0.40754541220307405,
"grad_norm": 0.45299196243286133,
"learning_rate": 0.000595460687245195,
"loss": 4.5448,
"step": 1400
},
{
"epoch": 0.422100605496041,
"grad_norm": 0.40141919255256653,
"learning_rate": 0.0005952859638905067,
"loss": 4.5306,
"step": 1450
},
{
"epoch": 0.4366557987890079,
"grad_norm": 0.4180101156234741,
"learning_rate": 0.0005951112405358182,
"loss": 4.5104,
"step": 1500
},
{
"epoch": 0.45121099208197485,
"grad_norm": 0.4382331073284149,
"learning_rate": 0.0005949365171811299,
"loss": 4.4792,
"step": 1550
},
{
"epoch": 0.4657661853749418,
"grad_norm": 0.45820483565330505,
"learning_rate": 0.0005947617938264414,
"loss": 4.4604,
"step": 1600
},
{
"epoch": 0.4803213786679087,
"grad_norm": 0.41873323917388916,
"learning_rate": 0.000594587070471753,
"loss": 4.4525,
"step": 1650
},
{
"epoch": 0.49487657196087564,
"grad_norm": 0.4274512231349945,
"learning_rate": 0.0005944123471170646,
"loss": 4.4232,
"step": 1700
},
{
"epoch": 0.5094317652538426,
"grad_norm": 0.4086366891860962,
"learning_rate": 0.0005942376237623762,
"loss": 4.4096,
"step": 1750
},
{
"epoch": 0.5239869585468095,
"grad_norm": 0.38669702410697937,
"learning_rate": 0.0005940629004076878,
"loss": 4.3981,
"step": 1800
},
{
"epoch": 0.5385421518397764,
"grad_norm": 0.4255729913711548,
"learning_rate": 0.0005938881770529993,
"loss": 4.3785,
"step": 1850
},
{
"epoch": 0.5530973451327433,
"grad_norm": 0.40904727578163147,
"learning_rate": 0.000593713453698311,
"loss": 4.3684,
"step": 1900
},
{
"epoch": 0.5676525384257103,
"grad_norm": 0.384636789560318,
"learning_rate": 0.0005935387303436226,
"loss": 4.3485,
"step": 1950
},
{
"epoch": 0.5822077317186772,
"grad_norm": 0.4042646884918213,
"learning_rate": 0.0005933640069889342,
"loss": 4.3374,
"step": 2000
},
{
"epoch": 0.5822077317186772,
"eval_accuracy": 0.2996315166393396,
"eval_loss": 4.279468059539795,
"eval_runtime": 179.5389,
"eval_samples_per_second": 92.743,
"eval_steps_per_second": 5.798,
"step": 2000
},
{
"epoch": 0.5967629250116442,
"grad_norm": 0.41661450266838074,
"learning_rate": 0.0005931892836342457,
"loss": 4.3235,
"step": 2050
},
{
"epoch": 0.6113181183046111,
"grad_norm": 0.42830735445022583,
"learning_rate": 0.0005930145602795573,
"loss": 4.3108,
"step": 2100
},
{
"epoch": 0.625873311597578,
"grad_norm": 0.37084299325942993,
"learning_rate": 0.000592839836924869,
"loss": 4.3092,
"step": 2150
},
{
"epoch": 0.6404285048905449,
"grad_norm": 0.4014033377170563,
"learning_rate": 0.0005926651135701805,
"loss": 4.3037,
"step": 2200
},
{
"epoch": 0.6549836981835119,
"grad_norm": 0.3967902660369873,
"learning_rate": 0.0005924903902154921,
"loss": 4.2826,
"step": 2250
},
{
"epoch": 0.6695388914764788,
"grad_norm": 0.4075942635536194,
"learning_rate": 0.0005923156668608037,
"loss": 4.2677,
"step": 2300
},
{
"epoch": 0.6840940847694458,
"grad_norm": 0.4896312355995178,
"learning_rate": 0.0005921409435061153,
"loss": 4.2633,
"step": 2350
},
{
"epoch": 0.6986492780624126,
"grad_norm": 0.4202996492385864,
"learning_rate": 0.0005919662201514268,
"loss": 4.2499,
"step": 2400
},
{
"epoch": 0.7132044713553796,
"grad_norm": 0.4072986841201782,
"learning_rate": 0.0005917914967967384,
"loss": 4.2487,
"step": 2450
},
{
"epoch": 0.7277596646483465,
"grad_norm": 0.3957359790802002,
"learning_rate": 0.0005916167734420501,
"loss": 4.2499,
"step": 2500
},
{
"epoch": 0.7423148579413135,
"grad_norm": 0.3910154700279236,
"learning_rate": 0.0005914420500873616,
"loss": 4.2258,
"step": 2550
},
{
"epoch": 0.7568700512342804,
"grad_norm": 0.3771495223045349,
"learning_rate": 0.0005912673267326732,
"loss": 4.2105,
"step": 2600
},
{
"epoch": 0.7714252445272474,
"grad_norm": 0.37781083583831787,
"learning_rate": 0.0005910926033779848,
"loss": 4.2069,
"step": 2650
},
{
"epoch": 0.7859804378202142,
"grad_norm": 0.3904173672199249,
"learning_rate": 0.0005909178800232964,
"loss": 4.1878,
"step": 2700
},
{
"epoch": 0.8005356311131812,
"grad_norm": 0.3640347421169281,
"learning_rate": 0.000590743156668608,
"loss": 4.1915,
"step": 2750
},
{
"epoch": 0.8150908244061481,
"grad_norm": 0.3591432273387909,
"learning_rate": 0.0005905684333139196,
"loss": 4.1809,
"step": 2800
},
{
"epoch": 0.8296460176991151,
"grad_norm": 0.40700843930244446,
"learning_rate": 0.0005903937099592312,
"loss": 4.1689,
"step": 2850
},
{
"epoch": 0.844201210992082,
"grad_norm": 0.3366829454898834,
"learning_rate": 0.0005902189866045427,
"loss": 4.1601,
"step": 2900
},
{
"epoch": 0.858756404285049,
"grad_norm": 0.35641607642173767,
"learning_rate": 0.0005900442632498543,
"loss": 4.1486,
"step": 2950
},
{
"epoch": 0.8733115975780158,
"grad_norm": 0.35798075795173645,
"learning_rate": 0.0005898695398951659,
"loss": 4.1489,
"step": 3000
},
{
"epoch": 0.8733115975780158,
"eval_accuracy": 0.315511923674007,
"eval_loss": 4.095459938049316,
"eval_runtime": 179.6196,
"eval_samples_per_second": 92.701,
"eval_steps_per_second": 5.796,
"step": 3000
},
{
"epoch": 0.8878667908709827,
"grad_norm": 0.3598261773586273,
"learning_rate": 0.0005896948165404776,
"loss": 4.1458,
"step": 3050
},
{
"epoch": 0.9024219841639497,
"grad_norm": 0.3457593321800232,
"learning_rate": 0.0005895200931857891,
"loss": 4.1294,
"step": 3100
},
{
"epoch": 0.9169771774569166,
"grad_norm": 0.3695979416370392,
"learning_rate": 0.0005893453698311007,
"loss": 4.1345,
"step": 3150
},
{
"epoch": 0.9315323707498836,
"grad_norm": 0.3489490747451782,
"learning_rate": 0.0005891706464764123,
"loss": 4.1108,
"step": 3200
},
{
"epoch": 0.9460875640428504,
"grad_norm": 0.34958118200302124,
"learning_rate": 0.0005889959231217238,
"loss": 4.1165,
"step": 3250
},
{
"epoch": 0.9606427573358174,
"grad_norm": 0.3432953357696533,
"learning_rate": 0.0005888211997670355,
"loss": 4.1136,
"step": 3300
},
{
"epoch": 0.9751979506287843,
"grad_norm": 0.38065826892852783,
"learning_rate": 0.000588646476412347,
"loss": 4.1089,
"step": 3350
},
{
"epoch": 0.9897531439217513,
"grad_norm": 0.34400883316993713,
"learning_rate": 0.0005884717530576587,
"loss": 4.11,
"step": 3400
},
{
"epoch": 1.0040754541220307,
"grad_norm": 0.33822906017303467,
"learning_rate": 0.0005882970297029702,
"loss": 4.0757,
"step": 3450
},
{
"epoch": 1.0186306474149978,
"grad_norm": 0.35206469893455505,
"learning_rate": 0.0005881223063482818,
"loss": 4.0187,
"step": 3500
},
{
"epoch": 1.0331858407079646,
"grad_norm": 0.3783675730228424,
"learning_rate": 0.0005879475829935934,
"loss": 4.0276,
"step": 3550
},
{
"epoch": 1.0477410340009314,
"grad_norm": 0.35457679629325867,
"learning_rate": 0.0005877728596389051,
"loss": 4.0168,
"step": 3600
},
{
"epoch": 1.0622962272938985,
"grad_norm": 0.37590548396110535,
"learning_rate": 0.0005875981362842166,
"loss": 4.0154,
"step": 3650
},
{
"epoch": 1.0768514205868653,
"grad_norm": 0.35166940093040466,
"learning_rate": 0.0005874234129295281,
"loss": 4.0346,
"step": 3700
},
{
"epoch": 1.0914066138798324,
"grad_norm": 0.3491305708885193,
"learning_rate": 0.0005872486895748398,
"loss": 4.0116,
"step": 3750
},
{
"epoch": 1.1059618071727992,
"grad_norm": 0.36400511860847473,
"learning_rate": 0.0005870739662201513,
"loss": 4.0084,
"step": 3800
},
{
"epoch": 1.120517000465766,
"grad_norm": 0.35419315099716187,
"learning_rate": 0.000586899242865463,
"loss": 4.0014,
"step": 3850
},
{
"epoch": 1.1350721937587331,
"grad_norm": 0.3940429985523224,
"learning_rate": 0.0005867245195107746,
"loss": 4.006,
"step": 3900
},
{
"epoch": 1.1496273870517,
"grad_norm": 0.3528949022293091,
"learning_rate": 0.0005865497961560862,
"loss": 4.0051,
"step": 3950
},
{
"epoch": 1.164182580344667,
"grad_norm": 0.35174238681793213,
"learning_rate": 0.0005863750728013977,
"loss": 3.996,
"step": 4000
},
{
"epoch": 1.164182580344667,
"eval_accuracy": 0.32519629116731763,
"eval_loss": 3.99649977684021,
"eval_runtime": 179.7575,
"eval_samples_per_second": 92.63,
"eval_steps_per_second": 5.791,
"step": 4000
},
{
"epoch": 1.1787377736376339,
"grad_norm": 0.3291102945804596,
"learning_rate": 0.0005862003494467094,
"loss": 3.9821,
"step": 4050
},
{
"epoch": 1.193292966930601,
"grad_norm": 0.35790738463401794,
"learning_rate": 0.0005860256260920209,
"loss": 4.0037,
"step": 4100
},
{
"epoch": 1.2078481602235678,
"grad_norm": 0.37376832962036133,
"learning_rate": 0.0005858509027373325,
"loss": 3.9955,
"step": 4150
},
{
"epoch": 1.2224033535165346,
"grad_norm": 0.34248632192611694,
"learning_rate": 0.0005856761793826441,
"loss": 3.9708,
"step": 4200
},
{
"epoch": 1.2369585468095017,
"grad_norm": 0.3399963676929474,
"learning_rate": 0.0005855014560279557,
"loss": 3.9767,
"step": 4250
},
{
"epoch": 1.2515137401024685,
"grad_norm": 0.3411734104156494,
"learning_rate": 0.0005853267326732673,
"loss": 3.9673,
"step": 4300
},
{
"epoch": 1.2660689333954354,
"grad_norm": 0.3493206202983856,
"learning_rate": 0.0005851520093185788,
"loss": 3.9599,
"step": 4350
},
{
"epoch": 1.2806241266884024,
"grad_norm": 0.34816136956214905,
"learning_rate": 0.0005849772859638905,
"loss": 3.9579,
"step": 4400
},
{
"epoch": 1.2951793199813695,
"grad_norm": 0.3242569863796234,
"learning_rate": 0.0005848025626092021,
"loss": 3.9739,
"step": 4450
},
{
"epoch": 1.3097345132743363,
"grad_norm": 0.34569236636161804,
"learning_rate": 0.0005846278392545136,
"loss": 3.9623,
"step": 4500
},
{
"epoch": 1.3242897065673032,
"grad_norm": 0.34033289551734924,
"learning_rate": 0.0005844531158998252,
"loss": 3.9518,
"step": 4550
},
{
"epoch": 1.3388448998602702,
"grad_norm": 0.3365550637245178,
"learning_rate": 0.0005842783925451368,
"loss": 3.9437,
"step": 4600
},
{
"epoch": 1.353400093153237,
"grad_norm": 0.3506678342819214,
"learning_rate": 0.0005841036691904484,
"loss": 3.9446,
"step": 4650
},
{
"epoch": 1.367955286446204,
"grad_norm": 0.3359658718109131,
"learning_rate": 0.00058392894583576,
"loss": 3.9492,
"step": 4700
},
{
"epoch": 1.382510479739171,
"grad_norm": 0.3435382843017578,
"learning_rate": 0.0005837542224810716,
"loss": 3.9445,
"step": 4750
},
{
"epoch": 1.3970656730321378,
"grad_norm": 0.35683852434158325,
"learning_rate": 0.0005835794991263832,
"loss": 3.9433,
"step": 4800
},
{
"epoch": 1.4116208663251049,
"grad_norm": 0.3335430324077606,
"learning_rate": 0.0005834047757716948,
"loss": 3.936,
"step": 4850
},
{
"epoch": 1.4261760596180717,
"grad_norm": 0.3474925756454468,
"learning_rate": 0.0005832300524170063,
"loss": 3.9431,
"step": 4900
},
{
"epoch": 1.4407312529110388,
"grad_norm": 0.35509905219078064,
"learning_rate": 0.0005830553290623179,
"loss": 3.9349,
"step": 4950
},
{
"epoch": 1.4552864462040056,
"grad_norm": 0.33583182096481323,
"learning_rate": 0.0005828806057076296,
"loss": 3.9347,
"step": 5000
},
{
"epoch": 1.4552864462040056,
"eval_accuracy": 0.33179591947546155,
"eval_loss": 3.9166266918182373,
"eval_runtime": 179.7795,
"eval_samples_per_second": 92.619,
"eval_steps_per_second": 5.79,
"step": 5000
},
{
"epoch": 1.4698416394969724,
"grad_norm": 0.3611011505126953,
"learning_rate": 0.0005827058823529411,
"loss": 3.9249,
"step": 5050
},
{
"epoch": 1.4843968327899395,
"grad_norm": 0.35978731513023376,
"learning_rate": 0.0005825311589982527,
"loss": 3.9307,
"step": 5100
},
{
"epoch": 1.4989520260829063,
"grad_norm": 0.3362000286579132,
"learning_rate": 0.0005823564356435643,
"loss": 3.9162,
"step": 5150
},
{
"epoch": 1.5135072193758732,
"grad_norm": 0.3377557098865509,
"learning_rate": 0.0005821817122888759,
"loss": 3.9344,
"step": 5200
},
{
"epoch": 1.5280624126688402,
"grad_norm": 0.33132773637771606,
"learning_rate": 0.0005820069889341875,
"loss": 3.93,
"step": 5250
},
{
"epoch": 1.5426176059618073,
"grad_norm": 0.3384888470172882,
"learning_rate": 0.000581832265579499,
"loss": 3.9109,
"step": 5300
},
{
"epoch": 1.5571727992547741,
"grad_norm": 0.3428712785243988,
"learning_rate": 0.0005816575422248107,
"loss": 3.9228,
"step": 5350
},
{
"epoch": 1.571727992547741,
"grad_norm": 0.3271981179714203,
"learning_rate": 0.0005814828188701222,
"loss": 3.9087,
"step": 5400
},
{
"epoch": 1.586283185840708,
"grad_norm": 0.3368676006793976,
"learning_rate": 0.0005813080955154338,
"loss": 3.9175,
"step": 5450
},
{
"epoch": 1.6008383791336749,
"grad_norm": 0.32926130294799805,
"learning_rate": 0.0005811333721607454,
"loss": 3.8967,
"step": 5500
},
{
"epoch": 1.6153935724266417,
"grad_norm": 0.32293272018432617,
"learning_rate": 0.0005809586488060571,
"loss": 3.8934,
"step": 5550
},
{
"epoch": 1.6299487657196088,
"grad_norm": 0.35573187470436096,
"learning_rate": 0.0005807839254513686,
"loss": 3.8959,
"step": 5600
},
{
"epoch": 1.6445039590125758,
"grad_norm": 0.3371495306491852,
"learning_rate": 0.0005806092020966802,
"loss": 3.8953,
"step": 5650
},
{
"epoch": 1.6590591523055425,
"grad_norm": 0.3150193989276886,
"learning_rate": 0.0005804344787419918,
"loss": 3.8906,
"step": 5700
},
{
"epoch": 1.6736143455985095,
"grad_norm": 0.34463661909103394,
"learning_rate": 0.0005802597553873033,
"loss": 3.8892,
"step": 5750
},
{
"epoch": 1.6881695388914766,
"grad_norm": 0.3225216865539551,
"learning_rate": 0.000580085032032615,
"loss": 3.8853,
"step": 5800
},
{
"epoch": 1.7027247321844434,
"grad_norm": 0.33168166875839233,
"learning_rate": 0.0005799103086779265,
"loss": 3.8872,
"step": 5850
},
{
"epoch": 1.7172799254774103,
"grad_norm": 0.3117685914039612,
"learning_rate": 0.0005797355853232382,
"loss": 3.8712,
"step": 5900
},
{
"epoch": 1.7318351187703773,
"grad_norm": 0.3171573579311371,
"learning_rate": 0.0005795608619685497,
"loss": 3.8869,
"step": 5950
},
{
"epoch": 1.7463903120633442,
"grad_norm": 0.3368074893951416,
"learning_rate": 0.0005793861386138614,
"loss": 3.8665,
"step": 6000
},
{
"epoch": 1.7463903120633442,
"eval_accuracy": 0.3372165138557054,
"eval_loss": 3.8597424030303955,
"eval_runtime": 179.7463,
"eval_samples_per_second": 92.636,
"eval_steps_per_second": 5.791,
"step": 6000
},
{
"epoch": 1.760945505356311,
"grad_norm": 0.33153268694877625,
"learning_rate": 0.0005792114152591729,
"loss": 3.8785,
"step": 6050
},
{
"epoch": 1.775500698649278,
"grad_norm": 0.3446453809738159,
"learning_rate": 0.0005790366919044846,
"loss": 3.8873,
"step": 6100
},
{
"epoch": 1.7900558919422451,
"grad_norm": 0.34094732999801636,
"learning_rate": 0.0005788619685497961,
"loss": 3.8715,
"step": 6150
},
{
"epoch": 1.804611085235212,
"grad_norm": 0.332516610622406,
"learning_rate": 0.0005786872451951077,
"loss": 3.8612,
"step": 6200
},
{
"epoch": 1.8191662785281788,
"grad_norm": 0.33042222261428833,
"learning_rate": 0.0005785125218404193,
"loss": 3.8693,
"step": 6250
},
{
"epoch": 1.8337214718211459,
"grad_norm": 0.31963613629341125,
"learning_rate": 0.0005783377984857308,
"loss": 3.8765,
"step": 6300
},
{
"epoch": 1.8482766651141127,
"grad_norm": 0.3356238305568695,
"learning_rate": 0.0005781630751310425,
"loss": 3.8714,
"step": 6350
},
{
"epoch": 1.8628318584070795,
"grad_norm": 0.32635051012039185,
"learning_rate": 0.0005779883517763541,
"loss": 3.8626,
"step": 6400
},
{
"epoch": 1.8773870517000466,
"grad_norm": 0.3183547556400299,
"learning_rate": 0.0005778136284216657,
"loss": 3.8541,
"step": 6450
},
{
"epoch": 1.8919422449930137,
"grad_norm": 0.332824170589447,
"learning_rate": 0.0005776389050669772,
"loss": 3.8672,
"step": 6500
},
{
"epoch": 1.9064974382859803,
"grad_norm": 0.3161756694316864,
"learning_rate": 0.0005774641817122889,
"loss": 3.8409,
"step": 6550
},
{
"epoch": 1.9210526315789473,
"grad_norm": 0.33893078565597534,
"learning_rate": 0.0005772894583576004,
"loss": 3.8608,
"step": 6600
},
{
"epoch": 1.9356078248719144,
"grad_norm": 0.32037511467933655,
"learning_rate": 0.000577114735002912,
"loss": 3.8494,
"step": 6650
},
{
"epoch": 1.9501630181648812,
"grad_norm": 0.30871209502220154,
"learning_rate": 0.0005769400116482236,
"loss": 3.8421,
"step": 6700
},
{
"epoch": 1.964718211457848,
"grad_norm": 0.3202383518218994,
"learning_rate": 0.0005767652882935352,
"loss": 3.8567,
"step": 6750
},
{
"epoch": 1.9792734047508151,
"grad_norm": 0.3361697494983673,
"learning_rate": 0.0005765905649388468,
"loss": 3.8522,
"step": 6800
},
{
"epoch": 1.993828598043782,
"grad_norm": 0.32010895013809204,
"learning_rate": 0.0005764158415841583,
"loss": 3.8407,
"step": 6850
},
{
"epoch": 2.0081509082440614,
"grad_norm": 0.33929452300071716,
"learning_rate": 0.00057624111822947,
"loss": 3.7821,
"step": 6900
},
{
"epoch": 2.0227061015370285,
"grad_norm": 0.32835519313812256,
"learning_rate": 0.0005760663948747816,
"loss": 3.7356,
"step": 6950
},
{
"epoch": 2.0372612948299955,
"grad_norm": 0.31738170981407166,
"learning_rate": 0.0005758916715200931,
"loss": 3.7343,
"step": 7000
},
{
"epoch": 2.0372612948299955,
"eval_accuracy": 0.3415684324478317,
"eval_loss": 3.8141865730285645,
"eval_runtime": 179.7242,
"eval_samples_per_second": 92.648,
"eval_steps_per_second": 5.792,
"step": 7000
},
{
"epoch": 2.051816488122962,
"grad_norm": 0.3155139088630676,
"learning_rate": 0.0005757169481654047,
"loss": 3.7447,
"step": 7050
},
{
"epoch": 2.066371681415929,
"grad_norm": 0.3158959448337555,
"learning_rate": 0.0005755422248107163,
"loss": 3.7481,
"step": 7100
},
{
"epoch": 2.0809268747088963,
"grad_norm": 0.3319007158279419,
"learning_rate": 0.0005753675014560279,
"loss": 3.7539,
"step": 7150
},
{
"epoch": 2.095482068001863,
"grad_norm": 0.31912368535995483,
"learning_rate": 0.0005751927781013395,
"loss": 3.7488,
"step": 7200
},
{
"epoch": 2.11003726129483,
"grad_norm": 0.31239715218544006,
"learning_rate": 0.0005750180547466511,
"loss": 3.7673,
"step": 7250
},
{
"epoch": 2.124592454587797,
"grad_norm": 0.3297717273235321,
"learning_rate": 0.0005748433313919627,
"loss": 3.7545,
"step": 7300
},
{
"epoch": 2.139147647880764,
"grad_norm": 0.34258368611335754,
"learning_rate": 0.0005746686080372743,
"loss": 3.7494,
"step": 7350
},
{
"epoch": 2.1537028411737307,
"grad_norm": 0.323652058839798,
"learning_rate": 0.0005744938846825858,
"loss": 3.7428,
"step": 7400
},
{
"epoch": 2.1682580344666977,
"grad_norm": 0.3314710557460785,
"learning_rate": 0.0005743191613278974,
"loss": 3.7627,
"step": 7450
},
{
"epoch": 2.182813227759665,
"grad_norm": 0.3517455458641052,
"learning_rate": 0.0005741444379732091,
"loss": 3.7598,
"step": 7500
},
{
"epoch": 2.1973684210526314,
"grad_norm": 0.34565964341163635,
"learning_rate": 0.0005739697146185206,
"loss": 3.7568,
"step": 7550
},
{
"epoch": 2.2119236143455985,
"grad_norm": 0.31987765431404114,
"learning_rate": 0.0005737949912638322,
"loss": 3.7692,
"step": 7600
},
{
"epoch": 2.2264788076385655,
"grad_norm": 0.3262273371219635,
"learning_rate": 0.0005736202679091438,
"loss": 3.7536,
"step": 7650
},
{
"epoch": 2.241034000931532,
"grad_norm": 0.32346490025520325,
"learning_rate": 0.0005734455445544554,
"loss": 3.7582,
"step": 7700
},
{
"epoch": 2.255589194224499,
"grad_norm": 0.3117275834083557,
"learning_rate": 0.000573270821199767,
"loss": 3.741,
"step": 7750
},
{
"epoch": 2.2701443875174663,
"grad_norm": 0.3295489251613617,
"learning_rate": 0.0005730960978450785,
"loss": 3.7585,
"step": 7800
},
{
"epoch": 2.2846995808104333,
"grad_norm": 0.30941474437713623,
"learning_rate": 0.0005729213744903902,
"loss": 3.7513,
"step": 7850
},
{
"epoch": 2.2992547741034,
"grad_norm": 0.3114382028579712,
"learning_rate": 0.0005727466511357017,
"loss": 3.7662,
"step": 7900
},
{
"epoch": 2.313809967396367,
"grad_norm": 0.3253454267978668,
"learning_rate": 0.0005725719277810134,
"loss": 3.757,
"step": 7950
},
{
"epoch": 2.328365160689334,
"grad_norm": 0.32134369015693665,
"learning_rate": 0.0005723972044263249,
"loss": 3.7434,
"step": 8000
},
{
"epoch": 2.328365160689334,
"eval_accuracy": 0.34472039725169445,
"eval_loss": 3.7803711891174316,
"eval_runtime": 179.7407,
"eval_samples_per_second": 92.639,
"eval_steps_per_second": 5.792,
"step": 8000
},
{
"epoch": 2.3429203539823007,
"grad_norm": 0.31129831075668335,
"learning_rate": 0.0005722224810716366,
"loss": 3.7715,
"step": 8050
},
{
"epoch": 2.3574755472752678,
"grad_norm": 0.31825774908065796,
"learning_rate": 0.0005720477577169481,
"loss": 3.7519,
"step": 8100
},
{
"epoch": 2.372030740568235,
"grad_norm": 0.3173275291919708,
"learning_rate": 0.0005718730343622598,
"loss": 3.7538,
"step": 8150
},
{
"epoch": 2.386585933861202,
"grad_norm": 0.3088977038860321,
"learning_rate": 0.0005716983110075713,
"loss": 3.7434,
"step": 8200
},
{
"epoch": 2.4011411271541685,
"grad_norm": 0.31404852867126465,
"learning_rate": 0.0005715235876528828,
"loss": 3.7509,
"step": 8250
},
{
"epoch": 2.4156963204471356,
"grad_norm": 0.34154555201530457,
"learning_rate": 0.0005713488642981945,
"loss": 3.7509,
"step": 8300
},
{
"epoch": 2.4302515137401026,
"grad_norm": 0.3258163332939148,
"learning_rate": 0.0005711741409435061,
"loss": 3.7483,
"step": 8350
},
{
"epoch": 2.4448067070330692,
"grad_norm": 0.30587008595466614,
"learning_rate": 0.0005709994175888177,
"loss": 3.7547,
"step": 8400
},
{
"epoch": 2.4593619003260363,
"grad_norm": 0.304930180311203,
"learning_rate": 0.0005708246942341292,
"loss": 3.7466,
"step": 8450
},
{
"epoch": 2.4739170936190034,
"grad_norm": 0.31150105595588684,
"learning_rate": 0.0005706499708794409,
"loss": 3.7452,
"step": 8500
},
{
"epoch": 2.4884722869119704,
"grad_norm": 0.31906256079673767,
"learning_rate": 0.0005704752475247524,
"loss": 3.7465,
"step": 8550
},
{
"epoch": 2.503027480204937,
"grad_norm": 0.30793827772140503,
"learning_rate": 0.0005703005241700641,
"loss": 3.757,
"step": 8600
},
{
"epoch": 2.517582673497904,
"grad_norm": 0.312449187040329,
"learning_rate": 0.0005701258008153756,
"loss": 3.744,
"step": 8650
},
{
"epoch": 2.5321378667908707,
"grad_norm": 0.3282853364944458,
"learning_rate": 0.0005699510774606872,
"loss": 3.7454,
"step": 8700
},
{
"epoch": 2.546693060083838,
"grad_norm": 0.3083302080631256,
"learning_rate": 0.0005697763541059988,
"loss": 3.7455,
"step": 8750
},
{
"epoch": 2.561248253376805,
"grad_norm": 0.314094215631485,
"learning_rate": 0.0005696016307513103,
"loss": 3.7441,
"step": 8800
},
{
"epoch": 2.575803446669772,
"grad_norm": 0.3158092796802521,
"learning_rate": 0.000569426907396622,
"loss": 3.7521,
"step": 8850
},
{
"epoch": 2.590358639962739,
"grad_norm": 0.3241555094718933,
"learning_rate": 0.0005692521840419336,
"loss": 3.7385,
"step": 8900
},
{
"epoch": 2.6049138332557056,
"grad_norm": 0.34334230422973633,
"learning_rate": 0.0005690774606872452,
"loss": 3.7346,
"step": 8950
},
{
"epoch": 2.6194690265486726,
"grad_norm": 0.3200402855873108,
"learning_rate": 0.0005689027373325567,
"loss": 3.7318,
"step": 9000
},
{
"epoch": 2.6194690265486726,
"eval_accuracy": 0.34772286732307234,
"eval_loss": 3.7532577514648438,
"eval_runtime": 179.9031,
"eval_samples_per_second": 92.555,
"eval_steps_per_second": 5.786,
"step": 9000
},
{
"epoch": 2.6340242198416393,
"grad_norm": 0.3395897150039673,
"learning_rate": 0.0005687280139778683,
"loss": 3.7331,
"step": 9050
},
{
"epoch": 2.6485794131346063,
"grad_norm": 0.3154236972332001,
"learning_rate": 0.0005685532906231799,
"loss": 3.7301,
"step": 9100
},
{
"epoch": 2.6631346064275734,
"grad_norm": 0.31031179428100586,
"learning_rate": 0.0005683785672684915,
"loss": 3.7267,
"step": 9150
},
{
"epoch": 2.6776897997205404,
"grad_norm": 0.3203895688056946,
"learning_rate": 0.0005682038439138031,
"loss": 3.7326,
"step": 9200
},
{
"epoch": 2.692244993013507,
"grad_norm": 0.3058587312698364,
"learning_rate": 0.0005680291205591147,
"loss": 3.7222,
"step": 9250
},
{
"epoch": 2.706800186306474,
"grad_norm": 0.31644535064697266,
"learning_rate": 0.0005678543972044263,
"loss": 3.7268,
"step": 9300
},
{
"epoch": 2.721355379599441,
"grad_norm": 0.30357447266578674,
"learning_rate": 0.0005676796738497378,
"loss": 3.7311,
"step": 9350
},
{
"epoch": 2.735910572892408,
"grad_norm": 0.3125144839286804,
"learning_rate": 0.0005675049504950495,
"loss": 3.7375,
"step": 9400
},
{
"epoch": 2.750465766185375,
"grad_norm": 0.3147082030773163,
"learning_rate": 0.0005673302271403611,
"loss": 3.7399,
"step": 9450
},
{
"epoch": 2.765020959478342,
"grad_norm": 0.304993599653244,
"learning_rate": 0.0005671555037856726,
"loss": 3.7334,
"step": 9500
},
{
"epoch": 2.779576152771309,
"grad_norm": 0.32299643754959106,
"learning_rate": 0.0005669807804309842,
"loss": 3.7218,
"step": 9550
},
{
"epoch": 2.7941313460642756,
"grad_norm": 0.32130691409111023,
"learning_rate": 0.0005668060570762958,
"loss": 3.7313,
"step": 9600
},
{
"epoch": 2.8086865393572427,
"grad_norm": 0.3162747621536255,
"learning_rate": 0.0005666313337216074,
"loss": 3.7299,
"step": 9650
},
{
"epoch": 2.8232417326502097,
"grad_norm": 0.3274405002593994,
"learning_rate": 0.000566456610366919,
"loss": 3.7252,
"step": 9700
},
{
"epoch": 2.8377969259431763,
"grad_norm": 0.31984663009643555,
"learning_rate": 0.0005662818870122306,
"loss": 3.735,
"step": 9750
},
{
"epoch": 2.8523521192361434,
"grad_norm": 0.2940428555011749,
"learning_rate": 0.0005661071636575422,
"loss": 3.7267,
"step": 9800
},
{
"epoch": 2.8669073125291105,
"grad_norm": 0.306113064289093,
"learning_rate": 0.0005659324403028537,
"loss": 3.7327,
"step": 9850
},
{
"epoch": 2.8814625058220775,
"grad_norm": 0.3145885467529297,
"learning_rate": 0.0005657577169481653,
"loss": 3.7142,
"step": 9900
},
{
"epoch": 2.896017699115044,
"grad_norm": 0.29591280221939087,
"learning_rate": 0.0005655829935934769,
"loss": 3.7039,
"step": 9950
},
{
"epoch": 2.910572892408011,
"grad_norm": 0.3004099726676941,
"learning_rate": 0.0005654082702387886,
"loss": 3.7166,
"step": 10000
},
{
"epoch": 2.910572892408011,
"eval_accuracy": 0.34956734085421903,
"eval_loss": 3.726909637451172,
"eval_runtime": 179.5861,
"eval_samples_per_second": 92.719,
"eval_steps_per_second": 5.797,
"step": 10000
},
{
"epoch": 2.9251280857009783,
"grad_norm": 0.3109896183013916,
"learning_rate": 0.0005652335468841001,
"loss": 3.7304,
"step": 10050
},
{
"epoch": 2.939683278993945,
"grad_norm": 0.3140173554420471,
"learning_rate": 0.0005650588235294117,
"loss": 3.7058,
"step": 10100
},
{
"epoch": 2.954238472286912,
"grad_norm": 0.31375789642333984,
"learning_rate": 0.0005648841001747233,
"loss": 3.7248,
"step": 10150
},
{
"epoch": 2.968793665579879,
"grad_norm": 0.309282511472702,
"learning_rate": 0.0005647093768200349,
"loss": 3.7292,
"step": 10200
},
{
"epoch": 2.983348858872846,
"grad_norm": 0.3091147243976593,
"learning_rate": 0.0005645346534653465,
"loss": 3.7245,
"step": 10250
},
{
"epoch": 2.9979040521658127,
"grad_norm": 0.31467899680137634,
"learning_rate": 0.0005643599301106582,
"loss": 3.717,
"step": 10300
},
{
"epoch": 3.0122263623660923,
"grad_norm": 0.31328141689300537,
"learning_rate": 0.0005641852067559697,
"loss": 3.6388,
"step": 10350
},
{
"epoch": 3.026781555659059,
"grad_norm": 0.30440211296081543,
"learning_rate": 0.0005640104834012812,
"loss": 3.6052,
"step": 10400
},
{
"epoch": 3.041336748952026,
"grad_norm": 0.31965646147727966,
"learning_rate": 0.0005638357600465929,
"loss": 3.6184,
"step": 10450
},
{
"epoch": 3.055891942244993,
"grad_norm": 0.3041605055332184,
"learning_rate": 0.0005636610366919044,
"loss": 3.6315,
"step": 10500
},
{
"epoch": 3.07044713553796,
"grad_norm": 0.3063664734363556,
"learning_rate": 0.0005634863133372161,
"loss": 3.6164,
"step": 10550
},
{
"epoch": 3.0850023288309267,
"grad_norm": 0.3086493909358978,
"learning_rate": 0.0005633115899825276,
"loss": 3.6269,
"step": 10600
},
{
"epoch": 3.099557522123894,
"grad_norm": 0.3314305245876312,
"learning_rate": 0.0005631368666278393,
"loss": 3.626,
"step": 10650
},
{
"epoch": 3.114112715416861,
"grad_norm": 0.3184899687767029,
"learning_rate": 0.0005629621432731508,
"loss": 3.6244,
"step": 10700
},
{
"epoch": 3.1286679087098275,
"grad_norm": 0.30493825674057007,
"learning_rate": 0.0005627874199184623,
"loss": 3.6374,
"step": 10750
},
{
"epoch": 3.1432231020027945,
"grad_norm": 0.32075250148773193,
"learning_rate": 0.000562612696563774,
"loss": 3.6312,
"step": 10800
},
{
"epoch": 3.1577782952957616,
"grad_norm": 0.31725215911865234,
"learning_rate": 0.0005624379732090856,
"loss": 3.6337,
"step": 10850
},
{
"epoch": 3.1723334885887287,
"grad_norm": 0.3105776607990265,
"learning_rate": 0.0005622632498543972,
"loss": 3.6398,
"step": 10900
},
{
"epoch": 3.1868886818816953,
"grad_norm": 0.31388428807258606,
"learning_rate": 0.0005620885264997087,
"loss": 3.6342,
"step": 10950
},
{
"epoch": 3.2014438751746623,
"grad_norm": 0.338467538356781,
"learning_rate": 0.0005619138031450204,
"loss": 3.6328,
"step": 11000
},
{
"epoch": 3.2014438751746623,
"eval_accuracy": 0.3517692149211257,
"eval_loss": 3.71359920501709,
"eval_runtime": 179.6628,
"eval_samples_per_second": 92.679,
"eval_steps_per_second": 5.794,
"step": 11000
},
{
"epoch": 3.2159990684676294,
"grad_norm": 0.33954986929893494,
"learning_rate": 0.0005617390797903319,
"loss": 3.634,
"step": 11050
},
{
"epoch": 3.230554261760596,
"grad_norm": 0.3152860999107361,
"learning_rate": 0.0005615643564356436,
"loss": 3.6452,
"step": 11100
},
{
"epoch": 3.245109455053563,
"grad_norm": 0.31836217641830444,
"learning_rate": 0.0005613896330809551,
"loss": 3.6352,
"step": 11150
},
{
"epoch": 3.25966464834653,
"grad_norm": 0.3215279281139374,
"learning_rate": 0.0005612149097262667,
"loss": 3.6452,
"step": 11200
},
{
"epoch": 3.274219841639497,
"grad_norm": 0.31674569845199585,
"learning_rate": 0.0005610401863715783,
"loss": 3.6434,
"step": 11250
},
{
"epoch": 3.288775034932464,
"grad_norm": 0.2988327741622925,
"learning_rate": 0.0005608654630168898,
"loss": 3.6226,
"step": 11300
},
{
"epoch": 3.303330228225431,
"grad_norm": 0.322599858045578,
"learning_rate": 0.0005606907396622015,
"loss": 3.6409,
"step": 11350
},
{
"epoch": 3.317885421518398,
"grad_norm": 0.3030698001384735,
"learning_rate": 0.0005605160163075131,
"loss": 3.6371,
"step": 11400
},
{
"epoch": 3.3324406148113646,
"grad_norm": 0.3100179433822632,
"learning_rate": 0.0005603412929528247,
"loss": 3.6327,
"step": 11450
},
{
"epoch": 3.3469958081043316,
"grad_norm": 0.30968624353408813,
"learning_rate": 0.0005601665695981362,
"loss": 3.6423,
"step": 11500
},
{
"epoch": 3.3615510013972987,
"grad_norm": 0.3197573125362396,
"learning_rate": 0.0005599918462434478,
"loss": 3.6325,
"step": 11550
},
{
"epoch": 3.3761061946902653,
"grad_norm": 0.3116932809352875,
"learning_rate": 0.0005598171228887594,
"loss": 3.6313,
"step": 11600
},
{
"epoch": 3.3906613879832324,
"grad_norm": 0.31749212741851807,
"learning_rate": 0.0005596423995340709,
"loss": 3.6446,
"step": 11650
},
{
"epoch": 3.4052165812761994,
"grad_norm": 0.31616830825805664,
"learning_rate": 0.0005594676761793826,
"loss": 3.6578,
"step": 11700
},
{
"epoch": 3.419771774569166,
"grad_norm": 0.30816033482551575,
"learning_rate": 0.0005592929528246942,
"loss": 3.6325,
"step": 11750
},
{
"epoch": 3.434326967862133,
"grad_norm": 0.3149195909500122,
"learning_rate": 0.0005591182294700058,
"loss": 3.6345,
"step": 11800
},
{
"epoch": 3.4488821611551,
"grad_norm": 0.318624883890152,
"learning_rate": 0.0005589435061153173,
"loss": 3.6419,
"step": 11850
},
{
"epoch": 3.463437354448067,
"grad_norm": 0.3080650269985199,
"learning_rate": 0.000558768782760629,
"loss": 3.6258,
"step": 11900
},
{
"epoch": 3.477992547741034,
"grad_norm": 0.3114171326160431,
"learning_rate": 0.0005585940594059406,
"loss": 3.6486,
"step": 11950
},
{
"epoch": 3.492547741034001,
"grad_norm": 0.30843472480773926,
"learning_rate": 0.0005584193360512521,
"loss": 3.6293,
"step": 12000
},
{
"epoch": 3.492547741034001,
"eval_accuracy": 0.35355715781836883,
"eval_loss": 3.6956379413604736,
"eval_runtime": 179.6151,
"eval_samples_per_second": 92.704,
"eval_steps_per_second": 5.796,
"step": 12000
},
{
"epoch": 3.507102934326968,
"grad_norm": 0.3114839196205139,
"learning_rate": 0.0005582446126965637,
"loss": 3.6463,
"step": 12050
},
{
"epoch": 3.5216581276199346,
"grad_norm": 0.31944605708122253,
"learning_rate": 0.0005580698893418753,
"loss": 3.63,
"step": 12100
},
{
"epoch": 3.5362133209129016,
"grad_norm": 0.3223167955875397,
"learning_rate": 0.0005578951659871869,
"loss": 3.6368,
"step": 12150
},
{
"epoch": 3.5507685142058687,
"grad_norm": 0.31075170636177063,
"learning_rate": 0.0005577204426324985,
"loss": 3.6487,
"step": 12200
},
{
"epoch": 3.5653237074988358,
"grad_norm": 0.30599287152290344,
"learning_rate": 0.0005575457192778101,
"loss": 3.6363,
"step": 12250
},
{
"epoch": 3.5798789007918024,
"grad_norm": 0.3183053433895111,
"learning_rate": 0.0005573709959231217,
"loss": 3.6563,
"step": 12300
},
{
"epoch": 3.5944340940847694,
"grad_norm": 0.3029063642024994,
"learning_rate": 0.0005571962725684332,
"loss": 3.6467,
"step": 12350
},
{
"epoch": 3.6089892873777365,
"grad_norm": 0.3174987733364105,
"learning_rate": 0.0005570215492137449,
"loss": 3.6467,
"step": 12400
},
{
"epoch": 3.623544480670703,
"grad_norm": 0.32621219754219055,
"learning_rate": 0.0005568468258590564,
"loss": 3.6284,
"step": 12450
},
{
"epoch": 3.63809967396367,
"grad_norm": 0.31085944175720215,
"learning_rate": 0.0005566721025043681,
"loss": 3.6346,
"step": 12500
},
{
"epoch": 3.6526548672566372,
"grad_norm": 0.30973267555236816,
"learning_rate": 0.0005564973791496796,
"loss": 3.6303,
"step": 12550
},
{
"epoch": 3.6672100605496043,
"grad_norm": 0.3051224946975708,
"learning_rate": 0.0005563226557949913,
"loss": 3.6498,
"step": 12600
},
{
"epoch": 3.681765253842571,
"grad_norm": 0.30966806411743164,
"learning_rate": 0.0005561479324403028,
"loss": 3.6451,
"step": 12650
},
{
"epoch": 3.696320447135538,
"grad_norm": 0.31790247559547424,
"learning_rate": 0.0005559732090856144,
"loss": 3.6464,
"step": 12700
},
{
"epoch": 3.710875640428505,
"grad_norm": 0.3122842311859131,
"learning_rate": 0.000555798485730926,
"loss": 3.6368,
"step": 12750
},
{
"epoch": 3.7254308337214717,
"grad_norm": 0.3073221445083618,
"learning_rate": 0.0005556237623762376,
"loss": 3.6385,
"step": 12800
},
{
"epoch": 3.7399860270144387,
"grad_norm": 0.3260260224342346,
"learning_rate": 0.0005554490390215492,
"loss": 3.6424,
"step": 12850
},
{
"epoch": 3.754541220307406,
"grad_norm": 0.29862716794013977,
"learning_rate": 0.0005552743156668607,
"loss": 3.6521,
"step": 12900
},
{
"epoch": 3.769096413600373,
"grad_norm": 0.3145371675491333,
"learning_rate": 0.0005550995923121724,
"loss": 3.6345,
"step": 12950
},
{
"epoch": 3.7836516068933395,
"grad_norm": 0.31774255633354187,
"learning_rate": 0.0005549248689574839,
"loss": 3.6383,
"step": 13000
},
{
"epoch": 3.7836516068933395,
"eval_accuracy": 0.3552514314531981,
"eval_loss": 3.6770644187927246,
"eval_runtime": 179.7074,
"eval_samples_per_second": 92.656,
"eval_steps_per_second": 5.793,
"step": 13000
},
{
"epoch": 3.7982068001863065,
"grad_norm": 0.31905579566955566,
"learning_rate": 0.0005547501456027955,
"loss": 3.6483,
"step": 13050
},
{
"epoch": 3.812761993479273,
"grad_norm": 0.29292047023773193,
"learning_rate": 0.0005545754222481071,
"loss": 3.6467,
"step": 13100
},
{
"epoch": 3.82731718677224,
"grad_norm": 0.3134244978427887,
"learning_rate": 0.0005544006988934188,
"loss": 3.6342,
"step": 13150
},
{
"epoch": 3.8418723800652073,
"grad_norm": 0.3155539035797119,
"learning_rate": 0.0005542259755387303,
"loss": 3.6396,
"step": 13200
},
{
"epoch": 3.8564275733581743,
"grad_norm": 0.30693483352661133,
"learning_rate": 0.0005540512521840418,
"loss": 3.6287,
"step": 13250
},
{
"epoch": 3.8709827666511414,
"grad_norm": 0.3106670677661896,
"learning_rate": 0.0005538765288293535,
"loss": 3.6293,
"step": 13300
},
{
"epoch": 3.885537959944108,
"grad_norm": 0.3124571442604065,
"learning_rate": 0.0005537018054746651,
"loss": 3.6383,
"step": 13350
},
{
"epoch": 3.900093153237075,
"grad_norm": 0.2964450418949127,
"learning_rate": 0.0005535270821199767,
"loss": 3.6412,
"step": 13400
},
{
"epoch": 3.9146483465300417,
"grad_norm": 0.2931179702281952,
"learning_rate": 0.0005533523587652882,
"loss": 3.6425,
"step": 13450
},
{
"epoch": 3.9292035398230087,
"grad_norm": 0.30559033155441284,
"learning_rate": 0.0005531776354105999,
"loss": 3.626,
"step": 13500
},
{
"epoch": 3.943758733115976,
"grad_norm": 0.30951324105262756,
"learning_rate": 0.0005530029120559114,
"loss": 3.6453,
"step": 13550
},
{
"epoch": 3.958313926408943,
"grad_norm": 0.3029481768608093,
"learning_rate": 0.0005528281887012229,
"loss": 3.6301,
"step": 13600
},
{
"epoch": 3.9728691197019095,
"grad_norm": 0.2964765727519989,
"learning_rate": 0.0005526534653465346,
"loss": 3.643,
"step": 13650
},
{
"epoch": 3.9874243129948765,
"grad_norm": 0.2986086905002594,
"learning_rate": 0.0005524787419918462,
"loss": 3.6376,
"step": 13700
},
{
"epoch": 4.001746623195156,
"grad_norm": 0.30963781476020813,
"learning_rate": 0.0005523040186371578,
"loss": 3.6197,
"step": 13750
},
{
"epoch": 4.016301816488123,
"grad_norm": 0.3227597177028656,
"learning_rate": 0.0005521292952824693,
"loss": 3.5223,
"step": 13800
},
{
"epoch": 4.03085700978109,
"grad_norm": 0.32853272557258606,
"learning_rate": 0.000551954571927781,
"loss": 3.5391,
"step": 13850
},
{
"epoch": 4.045412203074057,
"grad_norm": 0.32738175988197327,
"learning_rate": 0.0005517798485730926,
"loss": 3.5281,
"step": 13900
},
{
"epoch": 4.059967396367024,
"grad_norm": 0.31711599230766296,
"learning_rate": 0.0005516051252184042,
"loss": 3.5304,
"step": 13950
},
{
"epoch": 4.074522589659991,
"grad_norm": 0.2948492169380188,
"learning_rate": 0.0005514304018637157,
"loss": 3.5359,
"step": 14000
},
{
"epoch": 4.074522589659991,
"eval_accuracy": 0.356798090792429,
"eval_loss": 3.6693131923675537,
"eval_runtime": 179.7092,
"eval_samples_per_second": 92.655,
"eval_steps_per_second": 5.793,
"step": 14000
},
{
"epoch": 4.089077782952957,
"grad_norm": 0.32520484924316406,
"learning_rate": 0.0005512556785090273,
"loss": 3.5517,
"step": 14050
},
{
"epoch": 4.103632976245924,
"grad_norm": 0.3147394061088562,
"learning_rate": 0.0005510809551543389,
"loss": 3.5448,
"step": 14100
},
{
"epoch": 4.118188169538891,
"grad_norm": 0.303621768951416,
"learning_rate": 0.0005509062317996504,
"loss": 3.5423,
"step": 14150
},
{
"epoch": 4.132743362831858,
"grad_norm": 0.3130205571651459,
"learning_rate": 0.0005507315084449621,
"loss": 3.5559,
"step": 14200
},
{
"epoch": 4.1472985561248255,
"grad_norm": 0.342622846364975,
"learning_rate": 0.0005505567850902737,
"loss": 3.5519,
"step": 14250
},
{
"epoch": 4.1618537494177925,
"grad_norm": 0.3157576322555542,
"learning_rate": 0.0005503820617355853,
"loss": 3.5574,
"step": 14300
},
{
"epoch": 4.17640894271076,
"grad_norm": 0.3237878084182739,
"learning_rate": 0.0005502073383808969,
"loss": 3.5596,
"step": 14350
},
{
"epoch": 4.190964136003726,
"grad_norm": 0.3167899250984192,
"learning_rate": 0.0005500326150262085,
"loss": 3.5716,
"step": 14400
},
{
"epoch": 4.205519329296693,
"grad_norm": 0.3111872971057892,
"learning_rate": 0.00054985789167152,
"loss": 3.5554,
"step": 14450
},
{
"epoch": 4.22007452258966,
"grad_norm": 0.3192780315876007,
"learning_rate": 0.0005496831683168316,
"loss": 3.5633,
"step": 14500
},
{
"epoch": 4.234629715882627,
"grad_norm": 0.3170258104801178,
"learning_rate": 0.0005495084449621433,
"loss": 3.5539,
"step": 14550
},
{
"epoch": 4.249184909175594,
"grad_norm": 0.31680041551589966,
"learning_rate": 0.0005493337216074548,
"loss": 3.5602,
"step": 14600
},
{
"epoch": 4.263740102468561,
"grad_norm": 0.3066939413547516,
"learning_rate": 0.0005491589982527664,
"loss": 3.5584,
"step": 14650
},
{
"epoch": 4.278295295761528,
"grad_norm": 0.35077103972435,
"learning_rate": 0.000548984274898078,
"loss": 3.5494,
"step": 14700
},
{
"epoch": 4.292850489054494,
"grad_norm": 0.321871280670166,
"learning_rate": 0.0005488095515433897,
"loss": 3.5569,
"step": 14750
},
{
"epoch": 4.307405682347461,
"grad_norm": 0.34034264087677,
"learning_rate": 0.0005486348281887012,
"loss": 3.5536,
"step": 14800
},
{
"epoch": 4.321960875640428,
"grad_norm": 0.30980339646339417,
"learning_rate": 0.0005484601048340127,
"loss": 3.5674,
"step": 14850
},
{
"epoch": 4.3365160689333955,
"grad_norm": 0.3066848814487457,
"learning_rate": 0.0005482853814793244,
"loss": 3.5601,
"step": 14900
},
{
"epoch": 4.3510712622263625,
"grad_norm": 0.3109862804412842,
"learning_rate": 0.0005481106581246359,
"loss": 3.5693,
"step": 14950
},
{
"epoch": 4.36562645551933,
"grad_norm": 0.321115106344223,
"learning_rate": 0.0005479359347699475,
"loss": 3.5563,
"step": 15000
},
{
"epoch": 4.36562645551933,
"eval_accuracy": 0.35777850357418167,
"eval_loss": 3.657255172729492,
"eval_runtime": 179.8875,
"eval_samples_per_second": 92.563,
"eval_steps_per_second": 5.787,
"step": 15000
},
{
"epoch": 4.380181648812297,
"grad_norm": 0.3224615454673767,
"learning_rate": 0.0005477612114152591,
"loss": 3.5622,
"step": 15050
},
{
"epoch": 4.394736842105263,
"grad_norm": 0.3030092120170593,
"learning_rate": 0.0005475864880605708,
"loss": 3.5642,
"step": 15100
},
{
"epoch": 4.40929203539823,
"grad_norm": 0.33209550380706787,
"learning_rate": 0.0005474117647058823,
"loss": 3.5749,
"step": 15150
},
{
"epoch": 4.423847228691197,
"grad_norm": 0.3086824119091034,
"learning_rate": 0.0005472370413511939,
"loss": 3.5624,
"step": 15200
},
{
"epoch": 4.438402421984164,
"grad_norm": 0.31303492188453674,
"learning_rate": 0.0005470623179965055,
"loss": 3.5803,
"step": 15250
},
{
"epoch": 4.452957615277131,
"grad_norm": 0.30704519152641296,
"learning_rate": 0.0005468875946418171,
"loss": 3.5768,
"step": 15300
},
{
"epoch": 4.467512808570098,
"grad_norm": 0.31732097268104553,
"learning_rate": 0.0005467128712871287,
"loss": 3.5743,
"step": 15350
},
{
"epoch": 4.482068001863064,
"grad_norm": 0.32602792978286743,
"learning_rate": 0.0005465381479324402,
"loss": 3.5603,
"step": 15400
},
{
"epoch": 4.496623195156031,
"grad_norm": 0.3007522523403168,
"learning_rate": 0.0005463634245777519,
"loss": 3.5684,
"step": 15450
},
{
"epoch": 4.511178388448998,
"grad_norm": 0.3011171519756317,
"learning_rate": 0.0005461887012230634,
"loss": 3.5605,
"step": 15500
},
{
"epoch": 4.5257335817419655,
"grad_norm": 0.31969600915908813,
"learning_rate": 0.000546013977868375,
"loss": 3.5675,
"step": 15550
},
{
"epoch": 4.5402887750349326,
"grad_norm": 0.3020431399345398,
"learning_rate": 0.0005458392545136866,
"loss": 3.596,
"step": 15600
},
{
"epoch": 4.5548439683279,
"grad_norm": 0.3164878189563751,
"learning_rate": 0.0005456645311589983,
"loss": 3.5748,
"step": 15650
},
{
"epoch": 4.569399161620867,
"grad_norm": 0.3169105350971222,
"learning_rate": 0.0005454898078043098,
"loss": 3.5658,
"step": 15700
},
{
"epoch": 4.583954354913834,
"grad_norm": 0.3334912359714508,
"learning_rate": 0.0005453150844496213,
"loss": 3.5709,
"step": 15750
},
{
"epoch": 4.5985095482068,
"grad_norm": 0.3061586022377014,
"learning_rate": 0.000545140361094933,
"loss": 3.5786,
"step": 15800
},
{
"epoch": 4.613064741499767,
"grad_norm": 0.3311856687068939,
"learning_rate": 0.0005449656377402445,
"loss": 3.5711,
"step": 15850
},
{
"epoch": 4.627619934792734,
"grad_norm": 0.3058924674987793,
"learning_rate": 0.0005447909143855562,
"loss": 3.5846,
"step": 15900
},
{
"epoch": 4.642175128085701,
"grad_norm": 0.3133726119995117,
"learning_rate": 0.0005446161910308677,
"loss": 3.5762,
"step": 15950
},
{
"epoch": 4.656730321378668,
"grad_norm": 0.2965134084224701,
"learning_rate": 0.0005444414676761794,
"loss": 3.569,
"step": 16000
},
{
"epoch": 4.656730321378668,
"eval_accuracy": 0.3587089672511339,
"eval_loss": 3.643244981765747,
"eval_runtime": 179.7262,
"eval_samples_per_second": 92.646,
"eval_steps_per_second": 5.792,
"step": 16000
},
{
"epoch": 4.671285514671635,
"grad_norm": 0.30532756447792053,
"learning_rate": 0.0005442667443214909,
"loss": 3.5707,
"step": 16050
},
{
"epoch": 4.685840707964601,
"grad_norm": 0.327919065952301,
"learning_rate": 0.0005440920209668024,
"loss": 3.5688,
"step": 16100
},
{
"epoch": 4.7003959012575685,
"grad_norm": 0.30351272225379944,
"learning_rate": 0.0005439172976121141,
"loss": 3.5807,
"step": 16150
},
{
"epoch": 4.7149510945505355,
"grad_norm": 0.3110066056251526,
"learning_rate": 0.0005437425742574257,
"loss": 3.5803,
"step": 16200
},
{
"epoch": 4.729506287843503,
"grad_norm": 0.31730541586875916,
"learning_rate": 0.0005435678509027373,
"loss": 3.5656,
"step": 16250
},
{
"epoch": 4.74406148113647,
"grad_norm": 0.30847522616386414,
"learning_rate": 0.0005433931275480488,
"loss": 3.5674,
"step": 16300
},
{
"epoch": 4.758616674429437,
"grad_norm": 0.29101788997650146,
"learning_rate": 0.0005432184041933605,
"loss": 3.5671,
"step": 16350
},
{
"epoch": 4.773171867722404,
"grad_norm": 0.2971099615097046,
"learning_rate": 0.000543043680838672,
"loss": 3.5663,
"step": 16400
},
{
"epoch": 4.78772706101537,
"grad_norm": 0.292220801115036,
"learning_rate": 0.0005428689574839837,
"loss": 3.5817,
"step": 16450
},
{
"epoch": 4.802282254308337,
"grad_norm": 0.3278649151325226,
"learning_rate": 0.0005426942341292952,
"loss": 3.5734,
"step": 16500
},
{
"epoch": 4.816837447601304,
"grad_norm": 0.3087938725948334,
"learning_rate": 0.0005425195107746068,
"loss": 3.5699,
"step": 16550
},
{
"epoch": 4.831392640894271,
"grad_norm": 0.3141016662120819,
"learning_rate": 0.0005423447874199184,
"loss": 3.571,
"step": 16600
},
{
"epoch": 4.845947834187238,
"grad_norm": 0.32644930481910706,
"learning_rate": 0.00054217006406523,
"loss": 3.5744,
"step": 16650
},
{
"epoch": 4.860503027480205,
"grad_norm": 0.33965256810188293,
"learning_rate": 0.0005419953407105417,
"loss": 3.577,
"step": 16700
},
{
"epoch": 4.875058220773171,
"grad_norm": 0.2934809923171997,
"learning_rate": 0.0005418206173558532,
"loss": 3.5731,
"step": 16750
},
{
"epoch": 4.8896134140661385,
"grad_norm": 0.32220134139060974,
"learning_rate": 0.0005416458940011648,
"loss": 3.5702,
"step": 16800
},
{
"epoch": 4.9041686073591055,
"grad_norm": 0.294540137052536,
"learning_rate": 0.0005414711706464764,
"loss": 3.5755,
"step": 16850
},
{
"epoch": 4.918723800652073,
"grad_norm": 0.33389946818351746,
"learning_rate": 0.000541296447291788,
"loss": 3.5798,
"step": 16900
},
{
"epoch": 4.93327899394504,
"grad_norm": 0.3083215057849884,
"learning_rate": 0.0005411217239370995,
"loss": 3.5754,
"step": 16950
},
{
"epoch": 4.947834187238007,
"grad_norm": 0.3171711564064026,
"learning_rate": 0.0005409470005824111,
"loss": 3.5858,
"step": 17000
},
{
"epoch": 4.947834187238007,
"eval_accuracy": 0.36012622902710545,
"eval_loss": 3.632199287414551,
"eval_runtime": 179.7045,
"eval_samples_per_second": 92.658,
"eval_steps_per_second": 5.793,
"step": 17000
},
{
"epoch": 4.962389380530974,
"grad_norm": 0.29613155126571655,
"learning_rate": 0.0005407722772277228,
"loss": 3.5864,
"step": 17050
},
{
"epoch": 4.976944573823941,
"grad_norm": 0.305269330739975,
"learning_rate": 0.0005405975538730343,
"loss": 3.5816,
"step": 17100
},
{
"epoch": 4.991499767116907,
"grad_norm": 0.293756902217865,
"learning_rate": 0.0005404228305183459,
"loss": 3.5746,
"step": 17150
},
{
"epoch": 5.005822077317187,
"grad_norm": 0.33331188559532166,
"learning_rate": 0.0005402481071636575,
"loss": 3.5379,
"step": 17200
},
{
"epoch": 5.020377270610154,
"grad_norm": 0.30909281969070435,
"learning_rate": 0.0005400733838089692,
"loss": 3.4594,
"step": 17250
},
{
"epoch": 5.034932463903121,
"grad_norm": 0.3031615912914276,
"learning_rate": 0.0005398986604542807,
"loss": 3.4567,
"step": 17300
},
{
"epoch": 5.049487657196088,
"grad_norm": 0.3327254354953766,
"learning_rate": 0.0005397239370995922,
"loss": 3.4685,
"step": 17350
},
{
"epoch": 5.064042850489055,
"grad_norm": 0.32414567470550537,
"learning_rate": 0.0005395492137449039,
"loss": 3.483,
"step": 17400
},
{
"epoch": 5.078598043782021,
"grad_norm": 0.2953879237174988,
"learning_rate": 0.0005393744903902154,
"loss": 3.484,
"step": 17450
},
{
"epoch": 5.093153237074988,
"grad_norm": 0.32500144839286804,
"learning_rate": 0.000539199767035527,
"loss": 3.4798,
"step": 17500
},
{
"epoch": 5.107708430367955,
"grad_norm": 0.33330488204956055,
"learning_rate": 0.0005390250436808386,
"loss": 3.4824,
"step": 17550
},
{
"epoch": 5.122263623660922,
"grad_norm": 0.3294694423675537,
"learning_rate": 0.0005388503203261503,
"loss": 3.4841,
"step": 17600
},
{
"epoch": 5.136818816953889,
"grad_norm": 0.328597754240036,
"learning_rate": 0.0005386755969714618,
"loss": 3.4937,
"step": 17650
},
{
"epoch": 5.151374010246856,
"grad_norm": 0.34227892756462097,
"learning_rate": 0.0005385008736167733,
"loss": 3.4842,
"step": 17700
},
{
"epoch": 5.165929203539823,
"grad_norm": 0.30092424154281616,
"learning_rate": 0.000538326150262085,
"loss": 3.4961,
"step": 17750
},
{
"epoch": 5.18048439683279,
"grad_norm": 0.31993362307548523,
"learning_rate": 0.0005381514269073965,
"loss": 3.5018,
"step": 17800
},
{
"epoch": 5.195039590125757,
"grad_norm": 0.31290343403816223,
"learning_rate": 0.0005379767035527082,
"loss": 3.4916,
"step": 17850
},
{
"epoch": 5.209594783418724,
"grad_norm": 0.31315720081329346,
"learning_rate": 0.0005378019801980197,
"loss": 3.4899,
"step": 17900
},
{
"epoch": 5.224149976711691,
"grad_norm": 0.33438387513160706,
"learning_rate": 0.0005376272568433314,
"loss": 3.5003,
"step": 17950
},
{
"epoch": 5.238705170004658,
"grad_norm": 0.3011881709098816,
"learning_rate": 0.0005374525334886429,
"loss": 3.4991,
"step": 18000
},
{
"epoch": 5.238705170004658,
"eval_accuracy": 0.3604810439621463,
"eval_loss": 3.6344430446624756,
"eval_runtime": 179.6987,
"eval_samples_per_second": 92.661,
"eval_steps_per_second": 5.793,
"step": 18000
},
{
"epoch": 5.253260363297625,
"grad_norm": 0.30630162358283997,
"learning_rate": 0.0005372778101339545,
"loss": 3.501,
"step": 18050
},
{
"epoch": 5.267815556590591,
"grad_norm": 0.31994864344596863,
"learning_rate": 0.0005371030867792661,
"loss": 3.5159,
"step": 18100
},
{
"epoch": 5.282370749883558,
"grad_norm": 0.30697357654571533,
"learning_rate": 0.0005369283634245778,
"loss": 3.5036,
"step": 18150
},
{
"epoch": 5.296925943176525,
"grad_norm": 0.30527111887931824,
"learning_rate": 0.0005367536400698893,
"loss": 3.5074,
"step": 18200
},
{
"epoch": 5.311481136469492,
"grad_norm": 0.3237697184085846,
"learning_rate": 0.0005365789167152008,
"loss": 3.5075,
"step": 18250
},
{
"epoch": 5.326036329762459,
"grad_norm": 0.3105727434158325,
"learning_rate": 0.0005364041933605125,
"loss": 3.5095,
"step": 18300
},
{
"epoch": 5.340591523055426,
"grad_norm": 0.3035230040550232,
"learning_rate": 0.000536229470005824,
"loss": 3.5185,
"step": 18350
},
{
"epoch": 5.3551467163483935,
"grad_norm": 0.3358999788761139,
"learning_rate": 0.0005360547466511357,
"loss": 3.5145,
"step": 18400
},
{
"epoch": 5.36970190964136,
"grad_norm": 0.3387795388698578,
"learning_rate": 0.0005358800232964472,
"loss": 3.5195,
"step": 18450
},
{
"epoch": 5.384257102934327,
"grad_norm": 0.30367156863212585,
"learning_rate": 0.0005357052999417589,
"loss": 3.5161,
"step": 18500
},
{
"epoch": 5.398812296227294,
"grad_norm": 0.32554861903190613,
"learning_rate": 0.0005355305765870704,
"loss": 3.5156,
"step": 18550
},
{
"epoch": 5.413367489520261,
"grad_norm": 0.32227516174316406,
"learning_rate": 0.000535355853232382,
"loss": 3.521,
"step": 18600
},
{
"epoch": 5.427922682813228,
"grad_norm": 0.30939218401908875,
"learning_rate": 0.0005351811298776936,
"loss": 3.5177,
"step": 18650
},
{
"epoch": 5.442477876106195,
"grad_norm": 0.3210090696811676,
"learning_rate": 0.0005350064065230052,
"loss": 3.5075,
"step": 18700
},
{
"epoch": 5.457033069399162,
"grad_norm": 0.30049794912338257,
"learning_rate": 0.0005348316831683168,
"loss": 3.5269,
"step": 18750
},
{
"epoch": 5.471588262692128,
"grad_norm": 0.31089526414871216,
"learning_rate": 0.0005346569598136284,
"loss": 3.5171,
"step": 18800
},
{
"epoch": 5.486143455985095,
"grad_norm": 0.3039928674697876,
"learning_rate": 0.00053448223645894,
"loss": 3.5192,
"step": 18850
},
{
"epoch": 5.500698649278062,
"grad_norm": 0.3058149814605713,
"learning_rate": 0.0005343075131042515,
"loss": 3.514,
"step": 18900
},
{
"epoch": 5.515253842571029,
"grad_norm": 0.30949902534484863,
"learning_rate": 0.0005341327897495632,
"loss": 3.513,
"step": 18950
},
{
"epoch": 5.529809035863996,
"grad_norm": 0.2995462417602539,
"learning_rate": 0.0005339580663948748,
"loss": 3.5221,
"step": 19000
},
{
"epoch": 5.529809035863996,
"eval_accuracy": 0.3616167103143491,
"eval_loss": 3.6234872341156006,
"eval_runtime": 179.8697,
"eval_samples_per_second": 92.573,
"eval_steps_per_second": 5.788,
"step": 19000
},
{
"epoch": 5.5443642291569635,
"grad_norm": 0.3011551797389984,
"learning_rate": 0.0005337833430401863,
"loss": 3.5138,
"step": 19050
},
{
"epoch": 5.5589194224499305,
"grad_norm": 0.33986860513687134,
"learning_rate": 0.0005336086196854979,
"loss": 3.5246,
"step": 19100
},
{
"epoch": 5.573474615742897,
"grad_norm": 0.32027342915534973,
"learning_rate": 0.0005334338963308095,
"loss": 3.5271,
"step": 19150
},
{
"epoch": 5.588029809035864,
"grad_norm": 0.31938230991363525,
"learning_rate": 0.0005332591729761211,
"loss": 3.5304,
"step": 19200
},
{
"epoch": 5.602585002328831,
"grad_norm": 0.3016068935394287,
"learning_rate": 0.0005330844496214327,
"loss": 3.5239,
"step": 19250
},
{
"epoch": 5.617140195621798,
"grad_norm": 0.33804088830947876,
"learning_rate": 0.0005329097262667443,
"loss": 3.5175,
"step": 19300
},
{
"epoch": 5.631695388914765,
"grad_norm": 0.31818464398384094,
"learning_rate": 0.0005327350029120559,
"loss": 3.5263,
"step": 19350
},
{
"epoch": 5.646250582207732,
"grad_norm": 0.2964155673980713,
"learning_rate": 0.0005325602795573674,
"loss": 3.5271,
"step": 19400
},
{
"epoch": 5.660805775500698,
"grad_norm": 0.31857946515083313,
"learning_rate": 0.000532385556202679,
"loss": 3.5196,
"step": 19450
},
{
"epoch": 5.675360968793665,
"grad_norm": 0.3290652334690094,
"learning_rate": 0.0005322108328479906,
"loss": 3.5299,
"step": 19500
},
{
"epoch": 5.689916162086632,
"grad_norm": 0.337568461894989,
"learning_rate": 0.0005320361094933023,
"loss": 3.5341,
"step": 19550
},
{
"epoch": 5.704471355379599,
"grad_norm": 0.3184109032154083,
"learning_rate": 0.0005318613861386138,
"loss": 3.5251,
"step": 19600
},
{
"epoch": 5.719026548672566,
"grad_norm": 0.30031880736351013,
"learning_rate": 0.0005316866627839254,
"loss": 3.5197,
"step": 19650
},
{
"epoch": 5.7335817419655335,
"grad_norm": 0.3102143406867981,
"learning_rate": 0.000531511939429237,
"loss": 3.5203,
"step": 19700
},
{
"epoch": 5.748136935258501,
"grad_norm": 0.31440630555152893,
"learning_rate": 0.0005313372160745486,
"loss": 3.5217,
"step": 19750
},
{
"epoch": 5.762692128551468,
"grad_norm": 0.3147021532058716,
"learning_rate": 0.0005311624927198602,
"loss": 3.505,
"step": 19800
},
{
"epoch": 5.777247321844434,
"grad_norm": 0.312235027551651,
"learning_rate": 0.0005309877693651717,
"loss": 3.5302,
"step": 19850
},
{
"epoch": 5.791802515137401,
"grad_norm": 0.30304163694381714,
"learning_rate": 0.0005308130460104834,
"loss": 3.5261,
"step": 19900
},
{
"epoch": 5.806357708430368,
"grad_norm": 0.32415395975112915,
"learning_rate": 0.0005306383226557949,
"loss": 3.5251,
"step": 19950
},
{
"epoch": 5.820912901723335,
"grad_norm": 0.29884156584739685,
"learning_rate": 0.0005304635993011065,
"loss": 3.5329,
"step": 20000
},
{
"epoch": 5.820912901723335,
"eval_accuracy": 0.3623818130725857,
"eval_loss": 3.611879587173462,
"eval_runtime": 179.674,
"eval_samples_per_second": 92.673,
"eval_steps_per_second": 5.794,
"step": 20000
},
{
"epoch": 5.835468095016302,
"grad_norm": 0.30554407835006714,
"learning_rate": 0.0005302888759464181,
"loss": 3.5344,
"step": 20050
},
{
"epoch": 5.850023288309269,
"grad_norm": 0.3254547715187073,
"learning_rate": 0.0005301141525917298,
"loss": 3.5282,
"step": 20100
},
{
"epoch": 5.864578481602235,
"grad_norm": 0.3139106035232544,
"learning_rate": 0.0005299394292370413,
"loss": 3.5229,
"step": 20150
},
{
"epoch": 5.879133674895202,
"grad_norm": 0.32929766178131104,
"learning_rate": 0.0005297647058823528,
"loss": 3.5214,
"step": 20200
},
{
"epoch": 5.893688868188169,
"grad_norm": 0.31918251514434814,
"learning_rate": 0.0005295899825276645,
"loss": 3.5211,
"step": 20250
},
{
"epoch": 5.9082440614811365,
"grad_norm": 0.3251992166042328,
"learning_rate": 0.000529415259172976,
"loss": 3.5237,
"step": 20300
},
{
"epoch": 5.9227992547741035,
"grad_norm": 0.30213046073913574,
"learning_rate": 0.0005292405358182877,
"loss": 3.5333,
"step": 20350
},
{
"epoch": 5.937354448067071,
"grad_norm": 0.3040863275527954,
"learning_rate": 0.0005290658124635992,
"loss": 3.5296,
"step": 20400
},
{
"epoch": 5.951909641360038,
"grad_norm": 0.3090716600418091,
"learning_rate": 0.0005288910891089109,
"loss": 3.5339,
"step": 20450
},
{
"epoch": 5.966464834653004,
"grad_norm": 0.31416988372802734,
"learning_rate": 0.0005287163657542224,
"loss": 3.5275,
"step": 20500
},
{
"epoch": 5.981020027945971,
"grad_norm": 0.32365185022354126,
"learning_rate": 0.000528541642399534,
"loss": 3.5326,
"step": 20550
},
{
"epoch": 5.995575221238938,
"grad_norm": 0.31189417839050293,
"learning_rate": 0.0005283669190448456,
"loss": 3.5301,
"step": 20600
},
{
"epoch": 6.009897531439218,
"grad_norm": 0.30607596039772034,
"learning_rate": 0.0005281921956901572,
"loss": 3.4514,
"step": 20650
},
{
"epoch": 6.024452724732185,
"grad_norm": 0.2989142835140228,
"learning_rate": 0.0005280174723354688,
"loss": 3.4082,
"step": 20700
},
{
"epoch": 6.039007918025152,
"grad_norm": 0.30611732602119446,
"learning_rate": 0.0005278427489807804,
"loss": 3.4312,
"step": 20750
},
{
"epoch": 6.053563111318118,
"grad_norm": 0.33311450481414795,
"learning_rate": 0.000527668025626092,
"loss": 3.4145,
"step": 20800
},
{
"epoch": 6.068118304611085,
"grad_norm": 0.3017202913761139,
"learning_rate": 0.0005274933022714035,
"loss": 3.4378,
"step": 20850
},
{
"epoch": 6.082673497904052,
"grad_norm": 0.331897109746933,
"learning_rate": 0.0005273185789167152,
"loss": 3.4208,
"step": 20900
},
{
"epoch": 6.097228691197019,
"grad_norm": 0.32341882586479187,
"learning_rate": 0.0005271438555620268,
"loss": 3.4489,
"step": 20950
},
{
"epoch": 6.111783884489986,
"grad_norm": 0.3440253734588623,
"learning_rate": 0.0005269691322073384,
"loss": 3.4393,
"step": 21000
},
{
"epoch": 6.111783884489986,
"eval_accuracy": 0.3628803639021463,
"eval_loss": 3.613924503326416,
"eval_runtime": 179.8223,
"eval_samples_per_second": 92.597,
"eval_steps_per_second": 5.789,
"step": 21000
},
{
"epoch": 6.126339077782953,
"grad_norm": 0.3371540904045105,
"learning_rate": 0.0005267944088526499,
"loss": 3.446,
"step": 21050
},
{
"epoch": 6.14089427107592,
"grad_norm": 0.328134685754776,
"learning_rate": 0.0005266196854979615,
"loss": 3.4384,
"step": 21100
},
{
"epoch": 6.155449464368886,
"grad_norm": 0.33417844772338867,
"learning_rate": 0.0005264449621432731,
"loss": 3.4554,
"step": 21150
},
{
"epoch": 6.1700046576618535,
"grad_norm": 0.32764503359794617,
"learning_rate": 0.0005262702387885847,
"loss": 3.4483,
"step": 21200
},
{
"epoch": 6.1845598509548205,
"grad_norm": 0.3138699233531952,
"learning_rate": 0.0005260955154338963,
"loss": 3.4495,
"step": 21250
},
{
"epoch": 6.199115044247788,
"grad_norm": 0.33888301253318787,
"learning_rate": 0.0005259207920792079,
"loss": 3.4616,
"step": 21300
},
{
"epoch": 6.213670237540755,
"grad_norm": 0.3592548668384552,
"learning_rate": 0.0005257460687245195,
"loss": 3.4615,
"step": 21350
},
{
"epoch": 6.228225430833722,
"grad_norm": 0.30623868107795715,
"learning_rate": 0.000525571345369831,
"loss": 3.4654,
"step": 21400
},
{
"epoch": 6.242780624126689,
"grad_norm": 0.3069242835044861,
"learning_rate": 0.0005253966220151426,
"loss": 3.4524,
"step": 21450
},
{
"epoch": 6.257335817419655,
"grad_norm": 0.3082742989063263,
"learning_rate": 0.0005252218986604543,
"loss": 3.4639,
"step": 21500
},
{
"epoch": 6.271891010712622,
"grad_norm": 0.31266793608665466,
"learning_rate": 0.0005250471753057658,
"loss": 3.4535,
"step": 21550
},
{
"epoch": 6.286446204005589,
"grad_norm": 0.3180970847606659,
"learning_rate": 0.0005248724519510774,
"loss": 3.4587,
"step": 21600
},
{
"epoch": 6.301001397298556,
"grad_norm": 0.31825992465019226,
"learning_rate": 0.000524697728596389,
"loss": 3.4618,
"step": 21650
},
{
"epoch": 6.315556590591523,
"grad_norm": 0.31838515400886536,
"learning_rate": 0.0005245230052417006,
"loss": 3.4509,
"step": 21700
},
{
"epoch": 6.33011178388449,
"grad_norm": 0.3172382116317749,
"learning_rate": 0.0005243482818870122,
"loss": 3.4688,
"step": 21750
},
{
"epoch": 6.344666977177457,
"grad_norm": 0.34158939123153687,
"learning_rate": 0.0005241735585323238,
"loss": 3.4639,
"step": 21800
},
{
"epoch": 6.3592221704704235,
"grad_norm": 0.31548604369163513,
"learning_rate": 0.0005239988351776354,
"loss": 3.4599,
"step": 21850
},
{
"epoch": 6.3737773637633905,
"grad_norm": 0.3228817880153656,
"learning_rate": 0.0005238241118229469,
"loss": 3.4779,
"step": 21900
},
{
"epoch": 6.388332557056358,
"grad_norm": 0.32255199551582336,
"learning_rate": 0.0005236493884682585,
"loss": 3.4593,
"step": 21950
},
{
"epoch": 6.402887750349325,
"grad_norm": 0.3081405460834503,
"learning_rate": 0.0005234746651135701,
"loss": 3.4739,
"step": 22000
},
{
"epoch": 6.402887750349325,
"eval_accuracy": 0.3633811477505097,
"eval_loss": 3.608250856399536,
"eval_runtime": 179.8112,
"eval_samples_per_second": 92.603,
"eval_steps_per_second": 5.789,
"step": 22000
},
{
"epoch": 6.417442943642292,
"grad_norm": 0.3040979504585266,
"learning_rate": 0.0005232999417588818,
"loss": 3.4686,
"step": 22050
},
{
"epoch": 6.431998136935259,
"grad_norm": 0.32507723569869995,
"learning_rate": 0.0005231252184041933,
"loss": 3.4753,
"step": 22100
},
{
"epoch": 6.446553330228225,
"grad_norm": 0.35055863857269287,
"learning_rate": 0.0005229504950495049,
"loss": 3.459,
"step": 22150
},
{
"epoch": 6.461108523521192,
"grad_norm": 0.34218981862068176,
"learning_rate": 0.0005227757716948165,
"loss": 3.4631,
"step": 22200
},
{
"epoch": 6.475663716814159,
"grad_norm": 0.3296588063240051,
"learning_rate": 0.000522601048340128,
"loss": 3.4817,
"step": 22250
},
{
"epoch": 6.490218910107126,
"grad_norm": 0.33755651116371155,
"learning_rate": 0.0005224263249854397,
"loss": 3.4749,
"step": 22300
},
{
"epoch": 6.504774103400093,
"grad_norm": 0.36849409341812134,
"learning_rate": 0.0005222516016307512,
"loss": 3.4844,
"step": 22350
},
{
"epoch": 6.51932929669306,
"grad_norm": 0.3198098838329315,
"learning_rate": 0.0005220768782760629,
"loss": 3.4798,
"step": 22400
},
{
"epoch": 6.533884489986027,
"grad_norm": 0.36485329270362854,
"learning_rate": 0.0005219021549213744,
"loss": 3.4739,
"step": 22450
},
{
"epoch": 6.548439683278994,
"grad_norm": 0.36852753162384033,
"learning_rate": 0.000521727431566686,
"loss": 3.4807,
"step": 22500
},
{
"epoch": 6.562994876571961,
"grad_norm": 0.31377214193344116,
"learning_rate": 0.0005215527082119976,
"loss": 3.4769,
"step": 22550
},
{
"epoch": 6.577550069864928,
"grad_norm": 0.3116380274295807,
"learning_rate": 0.0005213779848573093,
"loss": 3.4823,
"step": 22600
},
{
"epoch": 6.592105263157895,
"grad_norm": 0.31712695956230164,
"learning_rate": 0.0005212032615026208,
"loss": 3.4833,
"step": 22650
},
{
"epoch": 6.606660456450862,
"grad_norm": 0.31625989079475403,
"learning_rate": 0.0005210285381479323,
"loss": 3.4778,
"step": 22700
},
{
"epoch": 6.621215649743829,
"grad_norm": 0.32391512393951416,
"learning_rate": 0.000520853814793244,
"loss": 3.4826,
"step": 22750
},
{
"epoch": 6.635770843036796,
"grad_norm": 0.3269652724266052,
"learning_rate": 0.0005206790914385555,
"loss": 3.4782,
"step": 22800
},
{
"epoch": 6.650326036329762,
"grad_norm": 0.32664456963539124,
"learning_rate": 0.0005205043680838672,
"loss": 3.4819,
"step": 22850
},
{
"epoch": 6.664881229622729,
"grad_norm": 0.32027164101600647,
"learning_rate": 0.0005203296447291787,
"loss": 3.4821,
"step": 22900
},
{
"epoch": 6.679436422915696,
"grad_norm": 0.3294582664966583,
"learning_rate": 0.0005201549213744904,
"loss": 3.4927,
"step": 22950
},
{
"epoch": 6.693991616208663,
"grad_norm": 0.32960760593414307,
"learning_rate": 0.0005199801980198019,
"loss": 3.4852,
"step": 23000
},
{
"epoch": 6.693991616208663,
"eval_accuracy": 0.3639370519051117,
"eval_loss": 3.6000916957855225,
"eval_runtime": 179.769,
"eval_samples_per_second": 92.624,
"eval_steps_per_second": 5.791,
"step": 23000
},
{
"epoch": 6.70854680950163,
"grad_norm": 0.31536969542503357,
"learning_rate": 0.0005198054746651136,
"loss": 3.4909,
"step": 23050
},
{
"epoch": 6.723102002794597,
"grad_norm": 0.3355516195297241,
"learning_rate": 0.0005196307513104251,
"loss": 3.482,
"step": 23100
},
{
"epoch": 6.737657196087564,
"grad_norm": 0.31154128909111023,
"learning_rate": 0.0005194560279557367,
"loss": 3.4761,
"step": 23150
},
{
"epoch": 6.752212389380531,
"grad_norm": 0.33221757411956787,
"learning_rate": 0.0005192813046010483,
"loss": 3.4905,
"step": 23200
},
{
"epoch": 6.766767582673498,
"grad_norm": 0.32357120513916016,
"learning_rate": 0.0005191065812463599,
"loss": 3.483,
"step": 23250
},
{
"epoch": 6.781322775966465,
"grad_norm": 0.30440056324005127,
"learning_rate": 0.0005189318578916715,
"loss": 3.4882,
"step": 23300
},
{
"epoch": 6.795877969259432,
"grad_norm": 0.33167630434036255,
"learning_rate": 0.000518757134536983,
"loss": 3.4971,
"step": 23350
},
{
"epoch": 6.810433162552399,
"grad_norm": 0.33919447660446167,
"learning_rate": 0.0005185824111822947,
"loss": 3.4903,
"step": 23400
},
{
"epoch": 6.824988355845366,
"grad_norm": 0.3298689126968384,
"learning_rate": 0.0005184076878276063,
"loss": 3.4918,
"step": 23450
},
{
"epoch": 6.839543549138332,
"grad_norm": 0.33077070116996765,
"learning_rate": 0.0005182329644729179,
"loss": 3.4843,
"step": 23500
},
{
"epoch": 6.854098742431299,
"grad_norm": 0.32967621088027954,
"learning_rate": 0.0005180582411182294,
"loss": 3.4893,
"step": 23550
},
{
"epoch": 6.868653935724266,
"grad_norm": 0.3005642294883728,
"learning_rate": 0.000517883517763541,
"loss": 3.496,
"step": 23600
},
{
"epoch": 6.883209129017233,
"grad_norm": 0.33780768513679504,
"learning_rate": 0.0005177087944088526,
"loss": 3.4948,
"step": 23650
},
{
"epoch": 6.8977643223102,
"grad_norm": 0.30605146288871765,
"learning_rate": 0.0005175340710541642,
"loss": 3.4951,
"step": 23700
},
{
"epoch": 6.912319515603167,
"grad_norm": 0.30990949273109436,
"learning_rate": 0.0005173593476994758,
"loss": 3.4877,
"step": 23750
},
{
"epoch": 6.926874708896134,
"grad_norm": 0.30463626980781555,
"learning_rate": 0.0005171846243447874,
"loss": 3.5054,
"step": 23800
},
{
"epoch": 6.9414299021891015,
"grad_norm": 0.3081115484237671,
"learning_rate": 0.000517009900990099,
"loss": 3.4877,
"step": 23850
},
{
"epoch": 6.955985095482068,
"grad_norm": 0.3156517446041107,
"learning_rate": 0.0005168351776354105,
"loss": 3.4985,
"step": 23900
},
{
"epoch": 6.970540288775035,
"grad_norm": 0.3113146722316742,
"learning_rate": 0.0005166604542807221,
"loss": 3.5097,
"step": 23950
},
{
"epoch": 6.985095482068002,
"grad_norm": 0.30410292744636536,
"learning_rate": 0.0005164857309260338,
"loss": 3.4901,
"step": 24000
},
{
"epoch": 6.985095482068002,
"eval_accuracy": 0.3646075451824911,
"eval_loss": 3.5917577743530273,
"eval_runtime": 179.7023,
"eval_samples_per_second": 92.659,
"eval_steps_per_second": 5.793,
"step": 24000
},
{
"epoch": 6.999650675360969,
"grad_norm": 0.3223191797733307,
"learning_rate": 0.0005163110075713453,
"loss": 3.4891,
"step": 24050
},
{
"epoch": 7.0139729855612485,
"grad_norm": 0.32697129249572754,
"learning_rate": 0.0005161362842166569,
"loss": 3.3921,
"step": 24100
},
{
"epoch": 7.0285281788542155,
"grad_norm": 0.31622612476348877,
"learning_rate": 0.0005159615608619685,
"loss": 3.3832,
"step": 24150
},
{
"epoch": 7.043083372147182,
"grad_norm": 0.31813427805900574,
"learning_rate": 0.0005157868375072801,
"loss": 3.382,
"step": 24200
},
{
"epoch": 7.057638565440149,
"grad_norm": 0.3346388638019562,
"learning_rate": 0.0005156121141525917,
"loss": 3.3943,
"step": 24250
},
{
"epoch": 7.072193758733116,
"grad_norm": 0.32561907172203064,
"learning_rate": 0.0005154373907979033,
"loss": 3.3955,
"step": 24300
},
{
"epoch": 7.086748952026083,
"grad_norm": 0.34515753388404846,
"learning_rate": 0.0005152626674432149,
"loss": 3.4014,
"step": 24350
},
{
"epoch": 7.10130414531905,
"grad_norm": 0.3135473430156708,
"learning_rate": 0.0005150879440885264,
"loss": 3.3965,
"step": 24400
},
{
"epoch": 7.115859338612017,
"grad_norm": 0.31428104639053345,
"learning_rate": 0.000514913220733838,
"loss": 3.3987,
"step": 24450
},
{
"epoch": 7.130414531904984,
"grad_norm": 0.3100883960723877,
"learning_rate": 0.0005147384973791496,
"loss": 3.4165,
"step": 24500
},
{
"epoch": 7.14496972519795,
"grad_norm": 0.32985416054725647,
"learning_rate": 0.0005145637740244613,
"loss": 3.4222,
"step": 24550
},
{
"epoch": 7.159524918490917,
"grad_norm": 0.3290567100048065,
"learning_rate": 0.0005143890506697728,
"loss": 3.4001,
"step": 24600
},
{
"epoch": 7.174080111783884,
"grad_norm": 0.33694422245025635,
"learning_rate": 0.0005142143273150844,
"loss": 3.4296,
"step": 24650
},
{
"epoch": 7.1886353050768514,
"grad_norm": 0.3293610215187073,
"learning_rate": 0.000514039603960396,
"loss": 3.405,
"step": 24700
},
{
"epoch": 7.2031904983698185,
"grad_norm": 0.3535860776901245,
"learning_rate": 0.0005138648806057075,
"loss": 3.4236,
"step": 24750
},
{
"epoch": 7.217745691662786,
"grad_norm": 0.329603374004364,
"learning_rate": 0.0005136901572510192,
"loss": 3.4145,
"step": 24800
},
{
"epoch": 7.232300884955753,
"grad_norm": 0.3592855632305145,
"learning_rate": 0.0005135154338963307,
"loss": 3.4108,
"step": 24850
},
{
"epoch": 7.246856078248719,
"grad_norm": 0.31975114345550537,
"learning_rate": 0.0005133407105416424,
"loss": 3.4276,
"step": 24900
},
{
"epoch": 7.261411271541686,
"grad_norm": 0.30115067958831787,
"learning_rate": 0.0005131659871869539,
"loss": 3.4279,
"step": 24950
},
{
"epoch": 7.275966464834653,
"grad_norm": 0.3238639831542969,
"learning_rate": 0.0005129912638322656,
"loss": 3.4147,
"step": 25000
},
{
"epoch": 7.275966464834653,
"eval_accuracy": 0.36494050003872525,
"eval_loss": 3.5967977046966553,
"eval_runtime": 179.8383,
"eval_samples_per_second": 92.589,
"eval_steps_per_second": 5.789,
"step": 25000
},
{
"epoch": 7.29052165812762,
"grad_norm": 0.3399578332901001,
"learning_rate": 0.0005128165404775771,
"loss": 3.4347,
"step": 25050
},
{
"epoch": 7.305076851420587,
"grad_norm": 0.3150589168071747,
"learning_rate": 0.0005126418171228888,
"loss": 3.4344,
"step": 25100
},
{
"epoch": 7.319632044713554,
"grad_norm": 0.3032225966453552,
"learning_rate": 0.0005124670937682003,
"loss": 3.4145,
"step": 25150
},
{
"epoch": 7.334187238006521,
"grad_norm": 0.3271799683570862,
"learning_rate": 0.000512292370413512,
"loss": 3.4279,
"step": 25200
},
{
"epoch": 7.348742431299487,
"grad_norm": 0.339102566242218,
"learning_rate": 0.0005121176470588235,
"loss": 3.4321,
"step": 25250
},
{
"epoch": 7.363297624592454,
"grad_norm": 0.3249066174030304,
"learning_rate": 0.000511942923704135,
"loss": 3.4173,
"step": 25300
},
{
"epoch": 7.3778528178854215,
"grad_norm": 0.3434494137763977,
"learning_rate": 0.0005117682003494467,
"loss": 3.44,
"step": 25350
},
{
"epoch": 7.3924080111783885,
"grad_norm": 0.3373579680919647,
"learning_rate": 0.0005115934769947583,
"loss": 3.445,
"step": 25400
},
{
"epoch": 7.406963204471356,
"grad_norm": 0.3416643440723419,
"learning_rate": 0.0005114187536400699,
"loss": 3.4308,
"step": 25450
},
{
"epoch": 7.421518397764323,
"grad_norm": 0.3268771171569824,
"learning_rate": 0.0005112440302853814,
"loss": 3.4366,
"step": 25500
},
{
"epoch": 7.436073591057289,
"grad_norm": 0.3285571038722992,
"learning_rate": 0.0005110693069306931,
"loss": 3.4416,
"step": 25550
},
{
"epoch": 7.450628784350256,
"grad_norm": 0.33393388986587524,
"learning_rate": 0.0005108945835760046,
"loss": 3.4464,
"step": 25600
},
{
"epoch": 7.465183977643223,
"grad_norm": 0.33005261421203613,
"learning_rate": 0.0005107198602213162,
"loss": 3.4413,
"step": 25650
},
{
"epoch": 7.47973917093619,
"grad_norm": 0.31219834089279175,
"learning_rate": 0.0005105451368666278,
"loss": 3.4401,
"step": 25700
},
{
"epoch": 7.494294364229157,
"grad_norm": 0.315737247467041,
"learning_rate": 0.0005103704135119394,
"loss": 3.4436,
"step": 25750
},
{
"epoch": 7.508849557522124,
"grad_norm": 0.349681556224823,
"learning_rate": 0.000510195690157251,
"loss": 3.4555,
"step": 25800
},
{
"epoch": 7.523404750815091,
"grad_norm": 0.31493327021598816,
"learning_rate": 0.0005100209668025625,
"loss": 3.4331,
"step": 25850
},
{
"epoch": 7.537959944108058,
"grad_norm": 0.3189566433429718,
"learning_rate": 0.0005098462434478742,
"loss": 3.4371,
"step": 25900
},
{
"epoch": 7.552515137401024,
"grad_norm": 0.3341345191001892,
"learning_rate": 0.0005096715200931858,
"loss": 3.456,
"step": 25950
},
{
"epoch": 7.5670703306939915,
"grad_norm": 0.30145561695098877,
"learning_rate": 0.0005094967967384974,
"loss": 3.4439,
"step": 26000
},
{
"epoch": 7.5670703306939915,
"eval_accuracy": 0.36580832166189253,
"eval_loss": 3.5877833366394043,
"eval_runtime": 179.756,
"eval_samples_per_second": 92.631,
"eval_steps_per_second": 5.791,
"step": 26000
},
{
"epoch": 7.5816255239869585,
"grad_norm": 0.33717626333236694,
"learning_rate": 0.0005093220733838089,
"loss": 3.4481,
"step": 26050
},
{
"epoch": 7.596180717279926,
"grad_norm": 0.3323234021663666,
"learning_rate": 0.0005091473500291205,
"loss": 3.4396,
"step": 26100
},
{
"epoch": 7.610735910572893,
"grad_norm": 0.33281031250953674,
"learning_rate": 0.0005089726266744321,
"loss": 3.4597,
"step": 26150
},
{
"epoch": 7.625291103865859,
"grad_norm": 0.30492302775382996,
"learning_rate": 0.0005087979033197437,
"loss": 3.4504,
"step": 26200
},
{
"epoch": 7.639846297158826,
"grad_norm": 0.3305889666080475,
"learning_rate": 0.0005086231799650553,
"loss": 3.4563,
"step": 26250
},
{
"epoch": 7.654401490451793,
"grad_norm": 0.3255027234554291,
"learning_rate": 0.0005084484566103669,
"loss": 3.4421,
"step": 26300
},
{
"epoch": 7.66895668374476,
"grad_norm": 0.3166560232639313,
"learning_rate": 0.0005082737332556785,
"loss": 3.4486,
"step": 26350
},
{
"epoch": 7.683511877037727,
"grad_norm": 0.3044646382331848,
"learning_rate": 0.00050809900990099,
"loss": 3.4568,
"step": 26400
},
{
"epoch": 7.698067070330694,
"grad_norm": 0.33230146765708923,
"learning_rate": 0.0005079242865463016,
"loss": 3.4496,
"step": 26450
},
{
"epoch": 7.712622263623661,
"grad_norm": 0.3186247646808624,
"learning_rate": 0.0005077495631916133,
"loss": 3.4491,
"step": 26500
},
{
"epoch": 7.727177456916628,
"grad_norm": 0.31578993797302246,
"learning_rate": 0.0005075748398369248,
"loss": 3.4557,
"step": 26550
},
{
"epoch": 7.7417326502095944,
"grad_norm": 0.32597461342811584,
"learning_rate": 0.0005074001164822364,
"loss": 3.462,
"step": 26600
},
{
"epoch": 7.7562878435025615,
"grad_norm": 0.31171032786369324,
"learning_rate": 0.000507225393127548,
"loss": 3.4615,
"step": 26650
},
{
"epoch": 7.770843036795529,
"grad_norm": 0.3156268894672394,
"learning_rate": 0.0005070506697728596,
"loss": 3.4549,
"step": 26700
},
{
"epoch": 7.785398230088496,
"grad_norm": 0.31520920991897583,
"learning_rate": 0.0005068759464181711,
"loss": 3.4521,
"step": 26750
},
{
"epoch": 7.799953423381463,
"grad_norm": 0.3372666835784912,
"learning_rate": 0.0005067012230634828,
"loss": 3.4647,
"step": 26800
},
{
"epoch": 7.81450861667443,
"grad_norm": 0.30733582377433777,
"learning_rate": 0.0005065264997087944,
"loss": 3.4506,
"step": 26850
},
{
"epoch": 7.829063809967396,
"grad_norm": 0.35157862305641174,
"learning_rate": 0.0005063517763541059,
"loss": 3.4553,
"step": 26900
},
{
"epoch": 7.843619003260363,
"grad_norm": 0.3399302661418915,
"learning_rate": 0.0005061770529994175,
"loss": 3.4566,
"step": 26950
},
{
"epoch": 7.85817419655333,
"grad_norm": 0.33593276143074036,
"learning_rate": 0.0005060023296447291,
"loss": 3.4694,
"step": 27000
},
{
"epoch": 7.85817419655333,
"eval_accuracy": 0.3663316707528952,
"eval_loss": 3.5779364109039307,
"eval_runtime": 179.7006,
"eval_samples_per_second": 92.66,
"eval_steps_per_second": 5.793,
"step": 27000
},
{
"epoch": 7.872729389846297,
"grad_norm": 0.32834339141845703,
"learning_rate": 0.0005058276062900408,
"loss": 3.4545,
"step": 27050
},
{
"epoch": 7.887284583139264,
"grad_norm": 0.3266600966453552,
"learning_rate": 0.0005056528829353523,
"loss": 3.4561,
"step": 27100
},
{
"epoch": 7.901839776432231,
"grad_norm": 0.34114813804626465,
"learning_rate": 0.000505478159580664,
"loss": 3.4538,
"step": 27150
},
{
"epoch": 7.916394969725198,
"grad_norm": 0.31827306747436523,
"learning_rate": 0.0005053034362259755,
"loss": 3.4641,
"step": 27200
},
{
"epoch": 7.930950163018165,
"grad_norm": 0.36237582564353943,
"learning_rate": 0.000505128712871287,
"loss": 3.453,
"step": 27250
},
{
"epoch": 7.9455053563111315,
"grad_norm": 0.3262782096862793,
"learning_rate": 0.0005049539895165987,
"loss": 3.4534,
"step": 27300
},
{
"epoch": 7.960060549604099,
"grad_norm": 0.31556040048599243,
"learning_rate": 0.0005047792661619103,
"loss": 3.4556,
"step": 27350
},
{
"epoch": 7.974615742897066,
"grad_norm": 0.3208247423171997,
"learning_rate": 0.0005046045428072219,
"loss": 3.4579,
"step": 27400
},
{
"epoch": 7.989170936190033,
"grad_norm": 0.3132985234260559,
"learning_rate": 0.0005044298194525334,
"loss": 3.4575,
"step": 27450
},
{
"epoch": 8.003493246390311,
"grad_norm": 0.39999905228614807,
"learning_rate": 0.0005042550960978451,
"loss": 3.4277,
"step": 27500
},
{
"epoch": 8.018048439683279,
"grad_norm": 0.3299867808818817,
"learning_rate": 0.0005040803727431566,
"loss": 3.3549,
"step": 27550
},
{
"epoch": 8.032603632976246,
"grad_norm": 0.33842816948890686,
"learning_rate": 0.0005039056493884683,
"loss": 3.3549,
"step": 27600
},
{
"epoch": 8.047158826269213,
"grad_norm": 0.33577972650527954,
"learning_rate": 0.0005037309260337798,
"loss": 3.3558,
"step": 27650
},
{
"epoch": 8.06171401956218,
"grad_norm": 0.329557329416275,
"learning_rate": 0.0005035562026790914,
"loss": 3.3599,
"step": 27700
},
{
"epoch": 8.076269212855147,
"grad_norm": 0.33456891775131226,
"learning_rate": 0.000503381479324403,
"loss": 3.3565,
"step": 27750
},
{
"epoch": 8.090824406148114,
"grad_norm": 0.34147799015045166,
"learning_rate": 0.0005032067559697145,
"loss": 3.3836,
"step": 27800
},
{
"epoch": 8.10537959944108,
"grad_norm": 0.3332184851169586,
"learning_rate": 0.0005030320326150262,
"loss": 3.3786,
"step": 27850
},
{
"epoch": 8.119934792734048,
"grad_norm": 0.3092395067214966,
"learning_rate": 0.0005028573092603378,
"loss": 3.3869,
"step": 27900
},
{
"epoch": 8.134489986027015,
"grad_norm": 0.3140532374382019,
"learning_rate": 0.0005026825859056494,
"loss": 3.3799,
"step": 27950
},
{
"epoch": 8.149045179319982,
"grad_norm": 0.3436073064804077,
"learning_rate": 0.0005025078625509609,
"loss": 3.3723,
"step": 28000
},
{
"epoch": 8.149045179319982,
"eval_accuracy": 0.36610496058075415,
"eval_loss": 3.591127395629883,
"eval_runtime": 179.6817,
"eval_samples_per_second": 92.669,
"eval_steps_per_second": 5.794,
"step": 28000
},
{
"epoch": 8.16360037261295,
"grad_norm": 0.3282712697982788,
"learning_rate": 0.0005023331391962726,
"loss": 3.382,
"step": 28050
},
{
"epoch": 8.178155565905914,
"grad_norm": 0.3222522735595703,
"learning_rate": 0.0005021584158415841,
"loss": 3.3743,
"step": 28100
},
{
"epoch": 8.192710759198881,
"grad_norm": 0.3165663182735443,
"learning_rate": 0.0005019836924868956,
"loss": 3.3873,
"step": 28150
},
{
"epoch": 8.207265952491849,
"grad_norm": 0.3453287184238434,
"learning_rate": 0.0005018089691322073,
"loss": 3.3833,
"step": 28200
},
{
"epoch": 8.221821145784816,
"grad_norm": 0.34769949316978455,
"learning_rate": 0.0005016342457775189,
"loss": 3.3829,
"step": 28250
},
{
"epoch": 8.236376339077783,
"grad_norm": 0.33653950691223145,
"learning_rate": 0.0005014595224228305,
"loss": 3.381,
"step": 28300
},
{
"epoch": 8.25093153237075,
"grad_norm": 0.3488893210887909,
"learning_rate": 0.000501284799068142,
"loss": 3.397,
"step": 28350
},
{
"epoch": 8.265486725663717,
"grad_norm": 0.32981276512145996,
"learning_rate": 0.0005011100757134537,
"loss": 3.393,
"step": 28400
},
{
"epoch": 8.280041918956684,
"grad_norm": 0.3737078607082367,
"learning_rate": 0.0005009353523587653,
"loss": 3.401,
"step": 28450
},
{
"epoch": 8.294597112249651,
"grad_norm": 0.36423400044441223,
"learning_rate": 0.0005007606290040768,
"loss": 3.401,
"step": 28500
},
{
"epoch": 8.309152305542618,
"grad_norm": 0.31796398758888245,
"learning_rate": 0.0005005859056493884,
"loss": 3.4029,
"step": 28550
},
{
"epoch": 8.323707498835585,
"grad_norm": 0.31824707984924316,
"learning_rate": 0.0005004111822947,
"loss": 3.3987,
"step": 28600
},
{
"epoch": 8.338262692128552,
"grad_norm": 0.3500267565250397,
"learning_rate": 0.0005002364589400116,
"loss": 3.4124,
"step": 28650
},
{
"epoch": 8.35281788542152,
"grad_norm": 0.3271293342113495,
"learning_rate": 0.0005000617355853231,
"loss": 3.4052,
"step": 28700
},
{
"epoch": 8.367373078714486,
"grad_norm": 0.346746027469635,
"learning_rate": 0.0004998870122306348,
"loss": 3.4036,
"step": 28750
},
{
"epoch": 8.381928272007451,
"grad_norm": 0.32007408142089844,
"learning_rate": 0.0004997122888759464,
"loss": 3.3994,
"step": 28800
},
{
"epoch": 8.396483465300419,
"grad_norm": 0.3441210687160492,
"learning_rate": 0.000499537565521258,
"loss": 3.4007,
"step": 28850
},
{
"epoch": 8.411038658593386,
"grad_norm": 0.33618438243865967,
"learning_rate": 0.0004993628421665695,
"loss": 3.4075,
"step": 28900
},
{
"epoch": 8.425593851886353,
"grad_norm": 0.31930792331695557,
"learning_rate": 0.0004991881188118811,
"loss": 3.4101,
"step": 28950
},
{
"epoch": 8.44014904517932,
"grad_norm": 0.32625195384025574,
"learning_rate": 0.0004990133954571928,
"loss": 3.405,
"step": 29000
},
{
"epoch": 8.44014904517932,
"eval_accuracy": 0.36678896949825596,
"eval_loss": 3.5803894996643066,
"eval_runtime": 179.7312,
"eval_samples_per_second": 92.644,
"eval_steps_per_second": 5.792,
"step": 29000
},
{
"epoch": 8.454704238472287,
"grad_norm": 0.3442314863204956,
"learning_rate": 0.0004988386721025043,
"loss": 3.4119,
"step": 29050
},
{
"epoch": 8.469259431765254,
"grad_norm": 0.3489319682121277,
"learning_rate": 0.0004986639487478159,
"loss": 3.4213,
"step": 29100
},
{
"epoch": 8.483814625058221,
"grad_norm": 0.32982802391052246,
"learning_rate": 0.0004984892253931275,
"loss": 3.418,
"step": 29150
},
{
"epoch": 8.498369818351188,
"grad_norm": 0.32021379470825195,
"learning_rate": 0.0004983145020384391,
"loss": 3.4164,
"step": 29200
},
{
"epoch": 8.512925011644155,
"grad_norm": 0.3246486783027649,
"learning_rate": 0.0004981397786837507,
"loss": 3.3948,
"step": 29250
},
{
"epoch": 8.527480204937122,
"grad_norm": 0.33790332078933716,
"learning_rate": 0.0004979650553290622,
"loss": 3.4134,
"step": 29300
},
{
"epoch": 8.54203539823009,
"grad_norm": 0.3419322073459625,
"learning_rate": 0.0004977903319743739,
"loss": 3.414,
"step": 29350
},
{
"epoch": 8.556590591523056,
"grad_norm": 0.333080530166626,
"learning_rate": 0.0004976156086196854,
"loss": 3.4146,
"step": 29400
},
{
"epoch": 8.571145784816022,
"grad_norm": 0.34474682807922363,
"learning_rate": 0.0004974408852649971,
"loss": 3.4239,
"step": 29450
},
{
"epoch": 8.585700978108989,
"grad_norm": 0.33647671341896057,
"learning_rate": 0.0004972661619103086,
"loss": 3.4339,
"step": 29500
},
{
"epoch": 8.600256171401956,
"grad_norm": 0.3294098973274231,
"learning_rate": 0.0004970914385556202,
"loss": 3.4124,
"step": 29550
},
{
"epoch": 8.614811364694923,
"grad_norm": 0.3614839017391205,
"learning_rate": 0.0004969167152009318,
"loss": 3.4181,
"step": 29600
},
{
"epoch": 8.62936655798789,
"grad_norm": 0.33004382252693176,
"learning_rate": 0.0004967419918462435,
"loss": 3.4296,
"step": 29650
},
{
"epoch": 8.643921751280857,
"grad_norm": 0.32359495759010315,
"learning_rate": 0.000496567268491555,
"loss": 3.4182,
"step": 29700
},
{
"epoch": 8.658476944573824,
"grad_norm": 0.3327469825744629,
"learning_rate": 0.0004963925451368665,
"loss": 3.4334,
"step": 29750
},
{
"epoch": 8.673032137866791,
"grad_norm": 0.3434790074825287,
"learning_rate": 0.0004962178217821782,
"loss": 3.4206,
"step": 29800
},
{
"epoch": 8.687587331159758,
"grad_norm": 0.32667237520217896,
"learning_rate": 0.0004960430984274898,
"loss": 3.4193,
"step": 29850
},
{
"epoch": 8.702142524452725,
"grad_norm": 0.36787742376327515,
"learning_rate": 0.0004958683750728014,
"loss": 3.4198,
"step": 29900
},
{
"epoch": 8.716697717745692,
"grad_norm": 0.30659085512161255,
"learning_rate": 0.0004956936517181129,
"loss": 3.4216,
"step": 29950
},
{
"epoch": 8.73125291103866,
"grad_norm": 0.322503924369812,
"learning_rate": 0.0004955189283634246,
"loss": 3.4347,
"step": 30000
},
{
"epoch": 8.73125291103866,
"eval_accuracy": 0.3674749763799498,
"eval_loss": 3.5715341567993164,
"eval_runtime": 179.5919,
"eval_samples_per_second": 92.716,
"eval_steps_per_second": 5.796,
"step": 30000
},
{
"epoch": 8.745808104331626,
"grad_norm": 0.33062100410461426,
"learning_rate": 0.0004953442050087361,
"loss": 3.4324,
"step": 30050
},
{
"epoch": 8.760363297624593,
"grad_norm": 0.32108139991760254,
"learning_rate": 0.0004951694816540476,
"loss": 3.4105,
"step": 30100
},
{
"epoch": 8.774918490917559,
"grad_norm": 0.3835145831108093,
"learning_rate": 0.0004949947582993593,
"loss": 3.424,
"step": 30150
},
{
"epoch": 8.789473684210526,
"grad_norm": 0.3131749629974365,
"learning_rate": 0.0004948200349446709,
"loss": 3.4238,
"step": 30200
},
{
"epoch": 8.804028877503493,
"grad_norm": 0.3232799470424652,
"learning_rate": 0.0004946453115899825,
"loss": 3.4309,
"step": 30250
},
{
"epoch": 8.81858407079646,
"grad_norm": 0.31317904591560364,
"learning_rate": 0.000494470588235294,
"loss": 3.4317,
"step": 30300
},
{
"epoch": 8.833139264089427,
"grad_norm": 0.35502809286117554,
"learning_rate": 0.0004942958648806057,
"loss": 3.4321,
"step": 30350
},
{
"epoch": 8.847694457382394,
"grad_norm": 0.3296915292739868,
"learning_rate": 0.0004941211415259173,
"loss": 3.4246,
"step": 30400
},
{
"epoch": 8.862249650675361,
"grad_norm": 0.332200288772583,
"learning_rate": 0.0004939464181712289,
"loss": 3.4364,
"step": 30450
},
{
"epoch": 8.876804843968328,
"grad_norm": 0.31839796900749207,
"learning_rate": 0.0004937716948165404,
"loss": 3.4393,
"step": 30500
},
{
"epoch": 8.891360037261295,
"grad_norm": 0.3107447028160095,
"learning_rate": 0.000493596971461852,
"loss": 3.4403,
"step": 30550
},
{
"epoch": 8.905915230554262,
"grad_norm": 0.32366111874580383,
"learning_rate": 0.0004934222481071636,
"loss": 3.4284,
"step": 30600
},
{
"epoch": 8.92047042384723,
"grad_norm": 0.3176148235797882,
"learning_rate": 0.0004932475247524751,
"loss": 3.4152,
"step": 30650
},
{
"epoch": 8.935025617140196,
"grad_norm": 0.30004048347473145,
"learning_rate": 0.0004930728013977868,
"loss": 3.4174,
"step": 30700
},
{
"epoch": 8.949580810433163,
"grad_norm": 0.3007476031780243,
"learning_rate": 0.0004928980780430984,
"loss": 3.4378,
"step": 30750
},
{
"epoch": 8.964136003726129,
"grad_norm": 0.32137224078178406,
"learning_rate": 0.00049272335468841,
"loss": 3.4267,
"step": 30800
},
{
"epoch": 8.978691197019096,
"grad_norm": 0.31897372007369995,
"learning_rate": 0.0004925486313337215,
"loss": 3.4309,
"step": 30850
},
{
"epoch": 8.993246390312063,
"grad_norm": 0.3516756296157837,
"learning_rate": 0.0004923739079790332,
"loss": 3.426,
"step": 30900
},
{
"epoch": 9.007568700512342,
"grad_norm": 0.341561883687973,
"learning_rate": 0.0004921991846243447,
"loss": 3.3706,
"step": 30950
},
{
"epoch": 9.02212389380531,
"grad_norm": 0.3258677124977112,
"learning_rate": 0.0004920244612696563,
"loss": 3.3197,
"step": 31000
},
{
"epoch": 9.02212389380531,
"eval_accuracy": 0.3673204279733321,
"eval_loss": 3.5789639949798584,
"eval_runtime": 179.612,
"eval_samples_per_second": 92.705,
"eval_steps_per_second": 5.796,
"step": 31000
},
{
"epoch": 9.036679087098276,
"grad_norm": 0.35316187143325806,
"learning_rate": 0.0004918497379149679,
"loss": 3.3323,
"step": 31050
},
{
"epoch": 9.051234280391244,
"grad_norm": 0.35009464621543884,
"learning_rate": 0.0004916750145602795,
"loss": 3.3375,
"step": 31100
},
{
"epoch": 9.06578947368421,
"grad_norm": 0.3244270086288452,
"learning_rate": 0.0004915002912055911,
"loss": 3.3319,
"step": 31150
},
{
"epoch": 9.080344666977178,
"grad_norm": 0.33380645513534546,
"learning_rate": 0.0004913255678509026,
"loss": 3.3444,
"step": 31200
},
{
"epoch": 9.094899860270145,
"grad_norm": 0.3167516887187958,
"learning_rate": 0.0004911508444962143,
"loss": 3.3275,
"step": 31250
},
{
"epoch": 9.109455053563112,
"grad_norm": 0.3265177309513092,
"learning_rate": 0.0004909761211415259,
"loss": 3.3431,
"step": 31300
},
{
"epoch": 9.124010246856079,
"grad_norm": 0.350564181804657,
"learning_rate": 0.0004908013977868375,
"loss": 3.3494,
"step": 31350
},
{
"epoch": 9.138565440149046,
"grad_norm": 0.3303062915802002,
"learning_rate": 0.0004906266744321491,
"loss": 3.3463,
"step": 31400
},
{
"epoch": 9.153120633442011,
"grad_norm": 0.3373015820980072,
"learning_rate": 0.0004904519510774606,
"loss": 3.3478,
"step": 31450
},
{
"epoch": 9.167675826734978,
"grad_norm": 0.33251577615737915,
"learning_rate": 0.0004902772277227722,
"loss": 3.3607,
"step": 31500
},
{
"epoch": 9.182231020027945,
"grad_norm": 0.3346186578273773,
"learning_rate": 0.0004901025043680838,
"loss": 3.3683,
"step": 31550
},
{
"epoch": 9.196786213320912,
"grad_norm": 0.32972466945648193,
"learning_rate": 0.0004899277810133955,
"loss": 3.3544,
"step": 31600
},
{
"epoch": 9.21134140661388,
"grad_norm": 0.3555352985858917,
"learning_rate": 0.000489753057658707,
"loss": 3.3542,
"step": 31650
},
{
"epoch": 9.225896599906847,
"grad_norm": 0.34347039461135864,
"learning_rate": 0.0004895783343040186,
"loss": 3.3548,
"step": 31700
},
{
"epoch": 9.240451793199814,
"grad_norm": 0.3755747377872467,
"learning_rate": 0.0004894036109493302,
"loss": 3.3721,
"step": 31750
},
{
"epoch": 9.25500698649278,
"grad_norm": 0.34164270758628845,
"learning_rate": 0.0004892288875946419,
"loss": 3.3667,
"step": 31800
},
{
"epoch": 9.269562179785748,
"grad_norm": 0.33080416917800903,
"learning_rate": 0.0004890541642399534,
"loss": 3.3655,
"step": 31850
},
{
"epoch": 9.284117373078715,
"grad_norm": 0.32492369413375854,
"learning_rate": 0.0004888794408852649,
"loss": 3.3689,
"step": 31900
},
{
"epoch": 9.298672566371682,
"grad_norm": 0.3133104145526886,
"learning_rate": 0.0004887047175305766,
"loss": 3.37,
"step": 31950
},
{
"epoch": 9.313227759664649,
"grad_norm": 0.3155703842639923,
"learning_rate": 0.0004885299941758881,
"loss": 3.3719,
"step": 32000
},
{
"epoch": 9.313227759664649,
"eval_accuracy": 0.3677861886846826,
"eval_loss": 3.5757789611816406,
"eval_runtime": 179.647,
"eval_samples_per_second": 92.687,
"eval_steps_per_second": 5.795,
"step": 32000
},
{
"epoch": 9.327782952957616,
"grad_norm": 0.32102909684181213,
"learning_rate": 0.0004883552708211997,
"loss": 3.364,
"step": 32050
},
{
"epoch": 9.342338146250583,
"grad_norm": 0.31682634353637695,
"learning_rate": 0.00048818054746651137,
"loss": 3.3759,
"step": 32100
},
{
"epoch": 9.356893339543548,
"grad_norm": 0.33725234866142273,
"learning_rate": 0.0004880058241118229,
"loss": 3.376,
"step": 32150
},
{
"epoch": 9.371448532836515,
"grad_norm": 0.33588212728500366,
"learning_rate": 0.0004878311007571345,
"loss": 3.3753,
"step": 32200
},
{
"epoch": 9.386003726129482,
"grad_norm": 0.341235488653183,
"learning_rate": 0.0004876563774024461,
"loss": 3.3824,
"step": 32250
},
{
"epoch": 9.40055891942245,
"grad_norm": 0.39756667613983154,
"learning_rate": 0.00048748165404775763,
"loss": 3.3874,
"step": 32300
},
{
"epoch": 9.415114112715417,
"grad_norm": 0.3262716233730316,
"learning_rate": 0.0004873069306930693,
"loss": 3.3839,
"step": 32350
},
{
"epoch": 9.429669306008384,
"grad_norm": 0.3105382025241852,
"learning_rate": 0.0004871322073383809,
"loss": 3.3724,
"step": 32400
},
{
"epoch": 9.44422449930135,
"grad_norm": 0.3270449936389923,
"learning_rate": 0.00048695748398369247,
"loss": 3.379,
"step": 32450
},
{
"epoch": 9.458779692594318,
"grad_norm": 0.34322798252105713,
"learning_rate": 0.000486782760629004,
"loss": 3.3976,
"step": 32500
},
{
"epoch": 9.473334885887285,
"grad_norm": 0.32397979497909546,
"learning_rate": 0.0004866080372743156,
"loss": 3.3871,
"step": 32550
},
{
"epoch": 9.487890079180252,
"grad_norm": 0.33422529697418213,
"learning_rate": 0.0004864333139196272,
"loss": 3.3824,
"step": 32600
},
{
"epoch": 9.502445272473219,
"grad_norm": 0.3257046341896057,
"learning_rate": 0.00048625859056493885,
"loss": 3.3782,
"step": 32650
},
{
"epoch": 9.517000465766186,
"grad_norm": 0.337839275598526,
"learning_rate": 0.0004860838672102504,
"loss": 3.3972,
"step": 32700
},
{
"epoch": 9.531555659059153,
"grad_norm": 0.3106997013092041,
"learning_rate": 0.000485909143855562,
"loss": 3.4099,
"step": 32750
},
{
"epoch": 9.546110852352118,
"grad_norm": 0.3634362518787384,
"learning_rate": 0.0004857344205008736,
"loss": 3.3984,
"step": 32800
},
{
"epoch": 9.560666045645085,
"grad_norm": 0.36889997124671936,
"learning_rate": 0.00048555969714618517,
"loss": 3.3959,
"step": 32850
},
{
"epoch": 9.575221238938052,
"grad_norm": 0.3163613975048065,
"learning_rate": 0.0004853849737914967,
"loss": 3.3775,
"step": 32900
},
{
"epoch": 9.58977643223102,
"grad_norm": 0.33509278297424316,
"learning_rate": 0.00048521025043680836,
"loss": 3.3918,
"step": 32950
},
{
"epoch": 9.604331625523987,
"grad_norm": 0.3135107755661011,
"learning_rate": 0.00048503552708211995,
"loss": 3.394,
"step": 33000
},
{
"epoch": 9.604331625523987,
"eval_accuracy": 0.3684542138886483,
"eval_loss": 3.568233013153076,
"eval_runtime": 179.6101,
"eval_samples_per_second": 92.706,
"eval_steps_per_second": 5.796,
"step": 33000
},
{
"epoch": 9.618886818816954,
"grad_norm": 0.3235209584236145,
"learning_rate": 0.00048486080372743155,
"loss": 3.3925,
"step": 33050
},
{
"epoch": 9.63344201210992,
"grad_norm": 0.34784653782844543,
"learning_rate": 0.0004846860803727431,
"loss": 3.4051,
"step": 33100
},
{
"epoch": 9.647997205402888,
"grad_norm": 0.3406641483306885,
"learning_rate": 0.0004845113570180547,
"loss": 3.3897,
"step": 33150
},
{
"epoch": 9.662552398695855,
"grad_norm": 0.3359287977218628,
"learning_rate": 0.00048433663366336633,
"loss": 3.394,
"step": 33200
},
{
"epoch": 9.677107591988822,
"grad_norm": 0.3356068432331085,
"learning_rate": 0.0004841619103086779,
"loss": 3.408,
"step": 33250
},
{
"epoch": 9.691662785281789,
"grad_norm": 0.36484211683273315,
"learning_rate": 0.00048398718695398947,
"loss": 3.4018,
"step": 33300
},
{
"epoch": 9.706217978574756,
"grad_norm": 0.3308541774749756,
"learning_rate": 0.00048381246359930106,
"loss": 3.4026,
"step": 33350
},
{
"epoch": 9.720773171867723,
"grad_norm": 0.33220258355140686,
"learning_rate": 0.00048363774024461265,
"loss": 3.3921,
"step": 33400
},
{
"epoch": 9.73532836516069,
"grad_norm": 0.34359830617904663,
"learning_rate": 0.0004834630168899242,
"loss": 3.401,
"step": 33450
},
{
"epoch": 9.749883558453657,
"grad_norm": 0.3357202708721161,
"learning_rate": 0.00048328829353523584,
"loss": 3.3974,
"step": 33500
},
{
"epoch": 9.764438751746622,
"grad_norm": 0.3662799894809723,
"learning_rate": 0.00048311357018054744,
"loss": 3.4082,
"step": 33550
},
{
"epoch": 9.77899394503959,
"grad_norm": 0.33433783054351807,
"learning_rate": 0.00048293884682585903,
"loss": 3.3953,
"step": 33600
},
{
"epoch": 9.793549138332557,
"grad_norm": 0.3398445248603821,
"learning_rate": 0.00048276412347117057,
"loss": 3.4051,
"step": 33650
},
{
"epoch": 9.808104331625524,
"grad_norm": 0.32314440608024597,
"learning_rate": 0.00048258940011648217,
"loss": 3.4146,
"step": 33700
},
{
"epoch": 9.82265952491849,
"grad_norm": 0.31336554884910583,
"learning_rate": 0.0004824146767617938,
"loss": 3.4036,
"step": 33750
},
{
"epoch": 9.837214718211458,
"grad_norm": 0.366742342710495,
"learning_rate": 0.0004822399534071054,
"loss": 3.399,
"step": 33800
},
{
"epoch": 9.851769911504425,
"grad_norm": 0.33634525537490845,
"learning_rate": 0.00048206523005241695,
"loss": 3.4209,
"step": 33850
},
{
"epoch": 9.866325104797392,
"grad_norm": 0.3509387671947479,
"learning_rate": 0.00048189050669772854,
"loss": 3.4114,
"step": 33900
},
{
"epoch": 9.880880298090359,
"grad_norm": 0.33496803045272827,
"learning_rate": 0.00048171578334304014,
"loss": 3.4134,
"step": 33950
},
{
"epoch": 9.895435491383326,
"grad_norm": 0.3415274918079376,
"learning_rate": 0.00048154105998835173,
"loss": 3.4089,
"step": 34000
},
{
"epoch": 9.895435491383326,
"eval_accuracy": 0.36880409267686187,
"eval_loss": 3.559906482696533,
"eval_runtime": 179.6841,
"eval_samples_per_second": 92.668,
"eval_steps_per_second": 5.794,
"step": 34000
},
{
"epoch": 9.909990684676293,
"grad_norm": 0.3290272653102875,
"learning_rate": 0.0004813663366336633,
"loss": 3.4046,
"step": 34050
},
{
"epoch": 9.92454587796926,
"grad_norm": 0.3289201259613037,
"learning_rate": 0.0004811916132789749,
"loss": 3.4071,
"step": 34100
},
{
"epoch": 9.939101071262227,
"grad_norm": 0.35677555203437805,
"learning_rate": 0.0004810168899242865,
"loss": 3.3999,
"step": 34150
},
{
"epoch": 9.953656264555192,
"grad_norm": 0.3309026062488556,
"learning_rate": 0.0004808421665695981,
"loss": 3.4169,
"step": 34200
},
{
"epoch": 9.96821145784816,
"grad_norm": 0.32284677028656006,
"learning_rate": 0.00048066744321490965,
"loss": 3.4157,
"step": 34250
},
{
"epoch": 9.982766651141127,
"grad_norm": 0.32936713099479675,
"learning_rate": 0.00048049271986022124,
"loss": 3.395,
"step": 34300
},
{
"epoch": 9.997321844434094,
"grad_norm": 0.33557114005088806,
"learning_rate": 0.0004803179965055329,
"loss": 3.4141,
"step": 34350
},
{
"epoch": 10.011644154634373,
"grad_norm": 0.32910844683647156,
"learning_rate": 0.0004801432731508445,
"loss": 3.3246,
"step": 34400
},
{
"epoch": 10.02619934792734,
"grad_norm": 0.3272988498210907,
"learning_rate": 0.000479968549796156,
"loss": 3.3019,
"step": 34450
},
{
"epoch": 10.040754541220307,
"grad_norm": 0.34475770592689514,
"learning_rate": 0.0004797938264414676,
"loss": 3.3026,
"step": 34500
},
{
"epoch": 10.055309734513274,
"grad_norm": 0.3495998680591583,
"learning_rate": 0.0004796191030867792,
"loss": 3.3068,
"step": 34550
},
{
"epoch": 10.069864927806242,
"grad_norm": 0.3315109312534332,
"learning_rate": 0.00047944437973209086,
"loss": 3.3101,
"step": 34600
},
{
"epoch": 10.084420121099209,
"grad_norm": 0.3565497100353241,
"learning_rate": 0.0004792696563774024,
"loss": 3.3121,
"step": 34650
},
{
"epoch": 10.098975314392176,
"grad_norm": 0.33841779828071594,
"learning_rate": 0.000479094933022714,
"loss": 3.3233,
"step": 34700
},
{
"epoch": 10.113530507685143,
"grad_norm": 0.31984496116638184,
"learning_rate": 0.0004789202096680256,
"loss": 3.3232,
"step": 34750
},
{
"epoch": 10.12808570097811,
"grad_norm": 0.33919304609298706,
"learning_rate": 0.00047874548631333713,
"loss": 3.3161,
"step": 34800
},
{
"epoch": 10.142640894271075,
"grad_norm": 0.3212825357913971,
"learning_rate": 0.0004785707629586487,
"loss": 3.3306,
"step": 34850
},
{
"epoch": 10.157196087564042,
"grad_norm": 0.34842801094055176,
"learning_rate": 0.0004783960396039604,
"loss": 3.3233,
"step": 34900
},
{
"epoch": 10.17175128085701,
"grad_norm": 0.33531925082206726,
"learning_rate": 0.00047822131624927197,
"loss": 3.3363,
"step": 34950
},
{
"epoch": 10.186306474149976,
"grad_norm": 0.3403702974319458,
"learning_rate": 0.0004780465928945835,
"loss": 3.3351,
"step": 35000
},
{
"epoch": 10.186306474149976,
"eval_accuracy": 0.3685656297741795,
"eval_loss": 3.571756362915039,
"eval_runtime": 179.8081,
"eval_samples_per_second": 92.604,
"eval_steps_per_second": 5.79,
"step": 35000
},
{
"epoch": 10.200861667442943,
"grad_norm": 0.34296759963035583,
"learning_rate": 0.0004778718695398951,
"loss": 3.34,
"step": 35050
},
{
"epoch": 10.21541686073591,
"grad_norm": 0.3556338846683502,
"learning_rate": 0.0004776971461852067,
"loss": 3.3419,
"step": 35100
},
{
"epoch": 10.229972054028877,
"grad_norm": 0.3632368743419647,
"learning_rate": 0.00047752242283051835,
"loss": 3.3468,
"step": 35150
},
{
"epoch": 10.244527247321844,
"grad_norm": 0.370301216840744,
"learning_rate": 0.00047734769947582994,
"loss": 3.3431,
"step": 35200
},
{
"epoch": 10.259082440614812,
"grad_norm": 0.3300345242023468,
"learning_rate": 0.0004771729761211415,
"loss": 3.3484,
"step": 35250
},
{
"epoch": 10.273637633907779,
"grad_norm": 0.35405808687210083,
"learning_rate": 0.0004769982527664531,
"loss": 3.3431,
"step": 35300
},
{
"epoch": 10.288192827200746,
"grad_norm": 0.33882415294647217,
"learning_rate": 0.00047682352941176467,
"loss": 3.358,
"step": 35350
},
{
"epoch": 10.302748020493713,
"grad_norm": 0.32428914308547974,
"learning_rate": 0.0004766488060570762,
"loss": 3.3416,
"step": 35400
},
{
"epoch": 10.31730321378668,
"grad_norm": 0.3725070655345917,
"learning_rate": 0.00047647408270238786,
"loss": 3.3579,
"step": 35450
},
{
"epoch": 10.331858407079647,
"grad_norm": 0.32831016182899475,
"learning_rate": 0.00047629935934769945,
"loss": 3.3651,
"step": 35500
},
{
"epoch": 10.346413600372612,
"grad_norm": 0.32686808705329895,
"learning_rate": 0.00047612463599301105,
"loss": 3.3474,
"step": 35550
},
{
"epoch": 10.36096879366558,
"grad_norm": 0.3326481580734253,
"learning_rate": 0.0004759499126383226,
"loss": 3.343,
"step": 35600
},
{
"epoch": 10.375523986958546,
"grad_norm": 0.34560704231262207,
"learning_rate": 0.0004757751892836342,
"loss": 3.3622,
"step": 35650
},
{
"epoch": 10.390079180251513,
"grad_norm": 0.35292690992355347,
"learning_rate": 0.0004756004659289458,
"loss": 3.3523,
"step": 35700
},
{
"epoch": 10.40463437354448,
"grad_norm": 0.33068376779556274,
"learning_rate": 0.0004754257425742574,
"loss": 3.3536,
"step": 35750
},
{
"epoch": 10.419189566837447,
"grad_norm": 0.3444773852825165,
"learning_rate": 0.00047525101921956896,
"loss": 3.3609,
"step": 35800
},
{
"epoch": 10.433744760130415,
"grad_norm": 0.35411354899406433,
"learning_rate": 0.00047507629586488056,
"loss": 3.3738,
"step": 35850
},
{
"epoch": 10.448299953423382,
"grad_norm": 0.3531155586242676,
"learning_rate": 0.00047490157251019215,
"loss": 3.3521,
"step": 35900
},
{
"epoch": 10.462855146716349,
"grad_norm": 0.34749075770378113,
"learning_rate": 0.0004747268491555037,
"loss": 3.3709,
"step": 35950
},
{
"epoch": 10.477410340009316,
"grad_norm": 0.3302483558654785,
"learning_rate": 0.00047455212580081534,
"loss": 3.3608,
"step": 36000
},
{
"epoch": 10.477410340009316,
"eval_accuracy": 0.3686402596131166,
"eval_loss": 3.5644166469573975,
"eval_runtime": 179.674,
"eval_samples_per_second": 92.673,
"eval_steps_per_second": 5.794,
"step": 36000
},
{
"epoch": 10.491965533302283,
"grad_norm": 0.3223860263824463,
"learning_rate": 0.00047437740244612694,
"loss": 3.3601,
"step": 36050
},
{
"epoch": 10.50652072659525,
"grad_norm": 0.3294314742088318,
"learning_rate": 0.00047420267909143853,
"loss": 3.3779,
"step": 36100
},
{
"epoch": 10.521075919888217,
"grad_norm": 0.3386666178703308,
"learning_rate": 0.0004740279557367501,
"loss": 3.3699,
"step": 36150
},
{
"epoch": 10.535631113181182,
"grad_norm": 0.3316313624382019,
"learning_rate": 0.00047385323238206166,
"loss": 3.3679,
"step": 36200
},
{
"epoch": 10.55018630647415,
"grad_norm": 0.3450184464454651,
"learning_rate": 0.00047367850902737326,
"loss": 3.3768,
"step": 36250
},
{
"epoch": 10.564741499767116,
"grad_norm": 0.3605281114578247,
"learning_rate": 0.0004735037856726849,
"loss": 3.362,
"step": 36300
},
{
"epoch": 10.579296693060083,
"grad_norm": 0.3162323534488678,
"learning_rate": 0.0004733290623179965,
"loss": 3.3737,
"step": 36350
},
{
"epoch": 10.59385188635305,
"grad_norm": 0.34284910559654236,
"learning_rate": 0.00047315433896330804,
"loss": 3.3803,
"step": 36400
},
{
"epoch": 10.608407079646017,
"grad_norm": 0.3211812376976013,
"learning_rate": 0.00047297961560861964,
"loss": 3.3822,
"step": 36450
},
{
"epoch": 10.622962272938985,
"grad_norm": 0.3258812725543976,
"learning_rate": 0.00047280489225393123,
"loss": 3.3763,
"step": 36500
},
{
"epoch": 10.637517466231952,
"grad_norm": 0.3411952257156372,
"learning_rate": 0.0004726301688992429,
"loss": 3.379,
"step": 36550
},
{
"epoch": 10.652072659524919,
"grad_norm": 0.3464188873767853,
"learning_rate": 0.0004724554455445544,
"loss": 3.3756,
"step": 36600
},
{
"epoch": 10.666627852817886,
"grad_norm": 0.3489798307418823,
"learning_rate": 0.000472280722189866,
"loss": 3.3666,
"step": 36650
},
{
"epoch": 10.681183046110853,
"grad_norm": 0.3330119550228119,
"learning_rate": 0.0004721059988351776,
"loss": 3.3725,
"step": 36700
},
{
"epoch": 10.69573823940382,
"grad_norm": 0.37871816754341125,
"learning_rate": 0.00047193127548048915,
"loss": 3.3732,
"step": 36750
},
{
"epoch": 10.710293432696787,
"grad_norm": 0.30662286281585693,
"learning_rate": 0.00047175655212580074,
"loss": 3.3792,
"step": 36800
},
{
"epoch": 10.724848625989754,
"grad_norm": 0.34182503819465637,
"learning_rate": 0.0004715818287711124,
"loss": 3.3996,
"step": 36850
},
{
"epoch": 10.73940381928272,
"grad_norm": 0.3572937250137329,
"learning_rate": 0.000471407105416424,
"loss": 3.3756,
"step": 36900
},
{
"epoch": 10.753959012575686,
"grad_norm": 0.33789294958114624,
"learning_rate": 0.0004712323820617355,
"loss": 3.3804,
"step": 36950
},
{
"epoch": 10.768514205868653,
"grad_norm": 0.34808850288391113,
"learning_rate": 0.0004710576587070471,
"loss": 3.3839,
"step": 37000
},
{
"epoch": 10.768514205868653,
"eval_accuracy": 0.36958905754971316,
"eval_loss": 3.5576395988464355,
"eval_runtime": 179.691,
"eval_samples_per_second": 92.665,
"eval_steps_per_second": 5.793,
"step": 37000
},
{
"epoch": 10.78306939916162,
"grad_norm": 0.3317504823207855,
"learning_rate": 0.0004708829353523587,
"loss": 3.3828,
"step": 37050
},
{
"epoch": 10.797624592454587,
"grad_norm": 0.3486345112323761,
"learning_rate": 0.0004707082119976703,
"loss": 3.378,
"step": 37100
},
{
"epoch": 10.812179785747555,
"grad_norm": 0.3379111886024475,
"learning_rate": 0.0004705334886429819,
"loss": 3.3915,
"step": 37150
},
{
"epoch": 10.826734979040522,
"grad_norm": 0.3323996365070343,
"learning_rate": 0.0004703587652882935,
"loss": 3.3739,
"step": 37200
},
{
"epoch": 10.841290172333489,
"grad_norm": 0.341294527053833,
"learning_rate": 0.0004701840419336051,
"loss": 3.3687,
"step": 37250
},
{
"epoch": 10.855845365626456,
"grad_norm": 0.3258977234363556,
"learning_rate": 0.0004700093185789167,
"loss": 3.3836,
"step": 37300
},
{
"epoch": 10.870400558919423,
"grad_norm": 0.327525794506073,
"learning_rate": 0.0004698345952242282,
"loss": 3.3697,
"step": 37350
},
{
"epoch": 10.88495575221239,
"grad_norm": 0.3258748948574066,
"learning_rate": 0.00046965987186953987,
"loss": 3.3864,
"step": 37400
},
{
"epoch": 10.899510945505357,
"grad_norm": 0.3566642701625824,
"learning_rate": 0.00046948514851485147,
"loss": 3.3892,
"step": 37450
},
{
"epoch": 10.914066138798324,
"grad_norm": 0.32489481568336487,
"learning_rate": 0.00046931042516016306,
"loss": 3.3924,
"step": 37500
},
{
"epoch": 10.92862133209129,
"grad_norm": 0.3091423809528351,
"learning_rate": 0.0004691357018054746,
"loss": 3.3915,
"step": 37550
},
{
"epoch": 10.943176525384256,
"grad_norm": 0.3518022894859314,
"learning_rate": 0.0004689609784507862,
"loss": 3.3855,
"step": 37600
},
{
"epoch": 10.957731718677223,
"grad_norm": 0.3540588915348053,
"learning_rate": 0.0004687862550960978,
"loss": 3.3879,
"step": 37650
},
{
"epoch": 10.97228691197019,
"grad_norm": 0.339358389377594,
"learning_rate": 0.00046861153174140944,
"loss": 3.3936,
"step": 37700
},
{
"epoch": 10.986842105263158,
"grad_norm": 0.3372827172279358,
"learning_rate": 0.000468436808386721,
"loss": 3.3831,
"step": 37750
},
{
"epoch": 11.001164415463437,
"grad_norm": 0.37063729763031006,
"learning_rate": 0.0004682620850320326,
"loss": 3.3715,
"step": 37800
},
{
"epoch": 11.015719608756404,
"grad_norm": 0.3537955582141876,
"learning_rate": 0.00046808736167734417,
"loss": 3.266,
"step": 37850
},
{
"epoch": 11.030274802049371,
"grad_norm": 0.34943118691444397,
"learning_rate": 0.0004679126383226557,
"loss": 3.2636,
"step": 37900
},
{
"epoch": 11.044829995342338,
"grad_norm": 0.323003351688385,
"learning_rate": 0.00046773791496796736,
"loss": 3.292,
"step": 37950
},
{
"epoch": 11.059385188635305,
"grad_norm": 0.351985901594162,
"learning_rate": 0.00046756319161327895,
"loss": 3.2872,
"step": 38000
},
{
"epoch": 11.059385188635305,
"eval_accuracy": 0.3697098756196774,
"eval_loss": 3.5635266304016113,
"eval_runtime": 179.5649,
"eval_samples_per_second": 92.73,
"eval_steps_per_second": 5.797,
"step": 38000
},
{
"epoch": 11.073940381928272,
"grad_norm": 0.3405875265598297,
"learning_rate": 0.00046738846825859054,
"loss": 3.2988,
"step": 38050
},
{
"epoch": 11.08849557522124,
"grad_norm": 0.3895106613636017,
"learning_rate": 0.0004672137449039021,
"loss": 3.2898,
"step": 38100
},
{
"epoch": 11.103050768514207,
"grad_norm": 0.34889012575149536,
"learning_rate": 0.0004670390215492137,
"loss": 3.2914,
"step": 38150
},
{
"epoch": 11.117605961807174,
"grad_norm": 0.3436652719974518,
"learning_rate": 0.0004668642981945253,
"loss": 3.2922,
"step": 38200
},
{
"epoch": 11.132161155100139,
"grad_norm": 0.3298552334308624,
"learning_rate": 0.0004666895748398369,
"loss": 3.3036,
"step": 38250
},
{
"epoch": 11.146716348393106,
"grad_norm": 0.374026358127594,
"learning_rate": 0.00046651485148514846,
"loss": 3.3078,
"step": 38300
},
{
"epoch": 11.161271541686073,
"grad_norm": 0.3402121067047119,
"learning_rate": 0.00046634012813046006,
"loss": 3.3049,
"step": 38350
},
{
"epoch": 11.17582673497904,
"grad_norm": 0.347309947013855,
"learning_rate": 0.00046616540477577165,
"loss": 3.3056,
"step": 38400
},
{
"epoch": 11.190381928272007,
"grad_norm": 0.38802456855773926,
"learning_rate": 0.00046599068142108324,
"loss": 3.3165,
"step": 38450
},
{
"epoch": 11.204937121564974,
"grad_norm": 0.3473553955554962,
"learning_rate": 0.0004658159580663948,
"loss": 3.314,
"step": 38500
},
{
"epoch": 11.219492314857941,
"grad_norm": 0.35995396971702576,
"learning_rate": 0.00046564123471170643,
"loss": 3.3263,
"step": 38550
},
{
"epoch": 11.234047508150908,
"grad_norm": 0.34611260890960693,
"learning_rate": 0.00046546651135701803,
"loss": 3.3238,
"step": 38600
},
{
"epoch": 11.248602701443875,
"grad_norm": 0.34514832496643066,
"learning_rate": 0.0004652917880023296,
"loss": 3.3183,
"step": 38650
},
{
"epoch": 11.263157894736842,
"grad_norm": 0.3476332724094391,
"learning_rate": 0.00046511706464764116,
"loss": 3.3175,
"step": 38700
},
{
"epoch": 11.27771308802981,
"grad_norm": 0.3759239912033081,
"learning_rate": 0.00046494234129295276,
"loss": 3.3355,
"step": 38750
},
{
"epoch": 11.292268281322777,
"grad_norm": 0.3533199727535248,
"learning_rate": 0.0004647676179382644,
"loss": 3.3161,
"step": 38800
},
{
"epoch": 11.306823474615744,
"grad_norm": 0.37267446517944336,
"learning_rate": 0.000464592894583576,
"loss": 3.331,
"step": 38850
},
{
"epoch": 11.32137866790871,
"grad_norm": 0.34054192900657654,
"learning_rate": 0.00046441817122888754,
"loss": 3.3426,
"step": 38900
},
{
"epoch": 11.335933861201676,
"grad_norm": 0.34166160225868225,
"learning_rate": 0.00046424344787419913,
"loss": 3.3492,
"step": 38950
},
{
"epoch": 11.350489054494643,
"grad_norm": 0.34584760665893555,
"learning_rate": 0.00046406872451951073,
"loss": 3.3239,
"step": 39000
},
{
"epoch": 11.350489054494643,
"eval_accuracy": 0.36916642935944916,
"eval_loss": 3.564857244491577,
"eval_runtime": 179.5454,
"eval_samples_per_second": 92.74,
"eval_steps_per_second": 5.798,
"step": 39000
},
{
"epoch": 11.36504424778761,
"grad_norm": 0.34652185440063477,
"learning_rate": 0.00046389400116482227,
"loss": 3.3311,
"step": 39050
},
{
"epoch": 11.379599441080577,
"grad_norm": 0.35235944390296936,
"learning_rate": 0.0004637192778101339,
"loss": 3.3451,
"step": 39100
},
{
"epoch": 11.394154634373544,
"grad_norm": 0.3453175723552704,
"learning_rate": 0.0004635445544554455,
"loss": 3.3386,
"step": 39150
},
{
"epoch": 11.408709827666511,
"grad_norm": 0.3537910282611847,
"learning_rate": 0.0004633698311007571,
"loss": 3.3399,
"step": 39200
},
{
"epoch": 11.423265020959478,
"grad_norm": 0.354648232460022,
"learning_rate": 0.0004631951077460687,
"loss": 3.3416,
"step": 39250
},
{
"epoch": 11.437820214252445,
"grad_norm": 0.3352389931678772,
"learning_rate": 0.00046302038439138024,
"loss": 3.3472,
"step": 39300
},
{
"epoch": 11.452375407545413,
"grad_norm": 0.33255258202552795,
"learning_rate": 0.0004628456610366919,
"loss": 3.3551,
"step": 39350
},
{
"epoch": 11.46693060083838,
"grad_norm": 0.36602121591567993,
"learning_rate": 0.0004626709376820035,
"loss": 3.3424,
"step": 39400
},
{
"epoch": 11.481485794131347,
"grad_norm": 0.3826133608818054,
"learning_rate": 0.0004624962143273151,
"loss": 3.3527,
"step": 39450
},
{
"epoch": 11.496040987424314,
"grad_norm": 0.3627864122390747,
"learning_rate": 0.0004623214909726266,
"loss": 3.344,
"step": 39500
},
{
"epoch": 11.51059618071728,
"grad_norm": 0.33537665009498596,
"learning_rate": 0.0004621467676179382,
"loss": 3.3587,
"step": 39550
},
{
"epoch": 11.525151374010246,
"grad_norm": 0.33809006214141846,
"learning_rate": 0.0004619720442632498,
"loss": 3.3333,
"step": 39600
},
{
"epoch": 11.539706567303213,
"grad_norm": 0.35963302850723267,
"learning_rate": 0.00046179732090856145,
"loss": 3.3527,
"step": 39650
},
{
"epoch": 11.55426176059618,
"grad_norm": 0.3328627347946167,
"learning_rate": 0.000461622597553873,
"loss": 3.3601,
"step": 39700
},
{
"epoch": 11.568816953889147,
"grad_norm": 0.34587082266807556,
"learning_rate": 0.0004614478741991846,
"loss": 3.3477,
"step": 39750
},
{
"epoch": 11.583372147182114,
"grad_norm": 0.347168505191803,
"learning_rate": 0.0004612731508444962,
"loss": 3.35,
"step": 39800
},
{
"epoch": 11.597927340475081,
"grad_norm": 0.3339124917984009,
"learning_rate": 0.0004610984274898077,
"loss": 3.3537,
"step": 39850
},
{
"epoch": 11.612482533768048,
"grad_norm": 0.3587067127227783,
"learning_rate": 0.00046092370413511937,
"loss": 3.3494,
"step": 39900
},
{
"epoch": 11.627037727061015,
"grad_norm": 0.34786462783813477,
"learning_rate": 0.00046074898078043096,
"loss": 3.3506,
"step": 39950
},
{
"epoch": 11.641592920353983,
"grad_norm": 0.3410889804363251,
"learning_rate": 0.00046057425742574256,
"loss": 3.3604,
"step": 40000
},
{
"epoch": 11.641592920353983,
"eval_accuracy": 0.3702274658727149,
"eval_loss": 3.556337594985962,
"eval_runtime": 179.6688,
"eval_samples_per_second": 92.676,
"eval_steps_per_second": 5.794,
"step": 40000
},
{
"epoch": 11.65614811364695,
"grad_norm": 0.32578304409980774,
"learning_rate": 0.0004603995340710541,
"loss": 3.3519,
"step": 40050
},
{
"epoch": 11.670703306939917,
"grad_norm": 0.33926668763160706,
"learning_rate": 0.0004602248107163657,
"loss": 3.3591,
"step": 40100
},
{
"epoch": 11.685258500232884,
"grad_norm": 0.3640845715999603,
"learning_rate": 0.0004600500873616773,
"loss": 3.3618,
"step": 40150
},
{
"epoch": 11.69981369352585,
"grad_norm": 0.34970220923423767,
"learning_rate": 0.00045987536400698894,
"loss": 3.3563,
"step": 40200
},
{
"epoch": 11.714368886818818,
"grad_norm": 0.3340449631214142,
"learning_rate": 0.0004597006406523005,
"loss": 3.3529,
"step": 40250
},
{
"epoch": 11.728924080111783,
"grad_norm": 0.3313457667827606,
"learning_rate": 0.00045952591729761207,
"loss": 3.3554,
"step": 40300
},
{
"epoch": 11.74347927340475,
"grad_norm": 0.3290119171142578,
"learning_rate": 0.00045935119394292367,
"loss": 3.3634,
"step": 40350
},
{
"epoch": 11.758034466697717,
"grad_norm": 0.34706369042396545,
"learning_rate": 0.00045917647058823526,
"loss": 3.3626,
"step": 40400
},
{
"epoch": 11.772589659990684,
"grad_norm": 0.3630402982234955,
"learning_rate": 0.0004590017472335468,
"loss": 3.3507,
"step": 40450
},
{
"epoch": 11.787144853283651,
"grad_norm": 0.3313291668891907,
"learning_rate": 0.00045882702387885845,
"loss": 3.3646,
"step": 40500
},
{
"epoch": 11.801700046576618,
"grad_norm": 0.3491012454032898,
"learning_rate": 0.00045865230052417004,
"loss": 3.3703,
"step": 40550
},
{
"epoch": 11.816255239869585,
"grad_norm": 0.3649109899997711,
"learning_rate": 0.00045847757716948164,
"loss": 3.3689,
"step": 40600
},
{
"epoch": 11.830810433162553,
"grad_norm": 0.3404376804828644,
"learning_rate": 0.0004583028538147932,
"loss": 3.3697,
"step": 40650
},
{
"epoch": 11.84536562645552,
"grad_norm": 0.35776814818382263,
"learning_rate": 0.00045812813046010477,
"loss": 3.3625,
"step": 40700
},
{
"epoch": 11.859920819748487,
"grad_norm": 0.3335564434528351,
"learning_rate": 0.0004579534071054164,
"loss": 3.3575,
"step": 40750
},
{
"epoch": 11.874476013041454,
"grad_norm": 0.3584432899951935,
"learning_rate": 0.000457778683750728,
"loss": 3.3683,
"step": 40800
},
{
"epoch": 11.88903120633442,
"grad_norm": 0.372200071811676,
"learning_rate": 0.00045760396039603955,
"loss": 3.3574,
"step": 40850
},
{
"epoch": 11.903586399627388,
"grad_norm": 0.3619687855243683,
"learning_rate": 0.00045742923704135115,
"loss": 3.3737,
"step": 40900
},
{
"epoch": 11.918141592920353,
"grad_norm": 0.36458566784858704,
"learning_rate": 0.00045725451368666274,
"loss": 3.3734,
"step": 40950
},
{
"epoch": 11.93269678621332,
"grad_norm": 0.36355504393577576,
"learning_rate": 0.0004570797903319743,
"loss": 3.3633,
"step": 41000
},
{
"epoch": 11.93269678621332,
"eval_accuracy": 0.37053221417564997,
"eval_loss": 3.548603057861328,
"eval_runtime": 179.6555,
"eval_samples_per_second": 92.683,
"eval_steps_per_second": 5.794,
"step": 41000
},
{
"epoch": 11.947251979506287,
"grad_norm": 0.3612082302570343,
"learning_rate": 0.00045690506697728593,
"loss": 3.3748,
"step": 41050
},
{
"epoch": 11.961807172799254,
"grad_norm": 0.31688255071640015,
"learning_rate": 0.0004567303436225975,
"loss": 3.3693,
"step": 41100
},
{
"epoch": 11.976362366092221,
"grad_norm": 0.33474868535995483,
"learning_rate": 0.0004565556202679091,
"loss": 3.3685,
"step": 41150
},
{
"epoch": 11.990917559385188,
"grad_norm": 0.33027711510658264,
"learning_rate": 0.00045638089691322066,
"loss": 3.381,
"step": 41200
},
{
"epoch": 12.005239869585468,
"grad_norm": 0.3528062105178833,
"learning_rate": 0.00045620617355853225,
"loss": 3.3416,
"step": 41250
},
{
"epoch": 12.019795062878435,
"grad_norm": 0.32908952236175537,
"learning_rate": 0.0004560314502038439,
"loss": 3.2629,
"step": 41300
},
{
"epoch": 12.034350256171402,
"grad_norm": 0.37203720211982727,
"learning_rate": 0.0004558567268491555,
"loss": 3.2617,
"step": 41350
},
{
"epoch": 12.04890544946437,
"grad_norm": 0.34509584307670593,
"learning_rate": 0.00045568200349446704,
"loss": 3.2684,
"step": 41400
},
{
"epoch": 12.063460642757336,
"grad_norm": 0.3308143615722656,
"learning_rate": 0.00045550728013977863,
"loss": 3.2681,
"step": 41450
},
{
"epoch": 12.078015836050303,
"grad_norm": 0.3766533136367798,
"learning_rate": 0.0004553325567850902,
"loss": 3.2629,
"step": 41500
},
{
"epoch": 12.09257102934327,
"grad_norm": 0.3604935109615326,
"learning_rate": 0.0004551578334304018,
"loss": 3.2869,
"step": 41550
},
{
"epoch": 12.107126222636236,
"grad_norm": 0.34294411540031433,
"learning_rate": 0.00045498311007571347,
"loss": 3.2633,
"step": 41600
},
{
"epoch": 12.121681415929203,
"grad_norm": 0.34739822149276733,
"learning_rate": 0.000454808386721025,
"loss": 3.2834,
"step": 41650
},
{
"epoch": 12.13623660922217,
"grad_norm": 0.3293208181858063,
"learning_rate": 0.0004546336633663366,
"loss": 3.286,
"step": 41700
},
{
"epoch": 12.150791802515137,
"grad_norm": 0.346708208322525,
"learning_rate": 0.0004544589400116482,
"loss": 3.287,
"step": 41750
},
{
"epoch": 12.165346995808104,
"grad_norm": 0.3570074439048767,
"learning_rate": 0.00045428421665695974,
"loss": 3.2899,
"step": 41800
},
{
"epoch": 12.179902189101071,
"grad_norm": 0.34867045283317566,
"learning_rate": 0.00045410949330227133,
"loss": 3.2963,
"step": 41850
},
{
"epoch": 12.194457382394038,
"grad_norm": 0.3396155834197998,
"learning_rate": 0.000453934769947583,
"loss": 3.2904,
"step": 41900
},
{
"epoch": 12.209012575687005,
"grad_norm": 0.3608848452568054,
"learning_rate": 0.0004537600465928946,
"loss": 3.3067,
"step": 41950
},
{
"epoch": 12.223567768979972,
"grad_norm": 0.3715206980705261,
"learning_rate": 0.0004535853232382061,
"loss": 3.3018,
"step": 42000
},
{
"epoch": 12.223567768979972,
"eval_accuracy": 0.369946693140084,
"eval_loss": 3.5617122650146484,
"eval_runtime": 179.8742,
"eval_samples_per_second": 92.57,
"eval_steps_per_second": 5.787,
"step": 42000
},
{
"epoch": 12.23812296227294,
"grad_norm": 0.3436983823776245,
"learning_rate": 0.0004534105998835177,
"loss": 3.3002,
"step": 42050
},
{
"epoch": 12.252678155565906,
"grad_norm": 0.3525453805923462,
"learning_rate": 0.0004532358765288293,
"loss": 3.3219,
"step": 42100
},
{
"epoch": 12.267233348858873,
"grad_norm": 0.3803021013736725,
"learning_rate": 0.00045306115317414095,
"loss": 3.2979,
"step": 42150
},
{
"epoch": 12.28178854215184,
"grad_norm": 0.36688610911369324,
"learning_rate": 0.0004528864298194525,
"loss": 3.3118,
"step": 42200
},
{
"epoch": 12.296343735444808,
"grad_norm": 0.3614189326763153,
"learning_rate": 0.0004527117064647641,
"loss": 3.3043,
"step": 42250
},
{
"epoch": 12.310898928737773,
"grad_norm": 0.41557541489601135,
"learning_rate": 0.0004525369831100757,
"loss": 3.3259,
"step": 42300
},
{
"epoch": 12.32545412203074,
"grad_norm": 0.34607186913490295,
"learning_rate": 0.0004523622597553872,
"loss": 3.3075,
"step": 42350
},
{
"epoch": 12.340009315323707,
"grad_norm": 0.36357954144477844,
"learning_rate": 0.0004521875364006988,
"loss": 3.3127,
"step": 42400
},
{
"epoch": 12.354564508616674,
"grad_norm": 0.35901084542274475,
"learning_rate": 0.00045201281304601046,
"loss": 3.3222,
"step": 42450
},
{
"epoch": 12.369119701909641,
"grad_norm": 0.34429091215133667,
"learning_rate": 0.00045183808969132206,
"loss": 3.3268,
"step": 42500
},
{
"epoch": 12.383674895202608,
"grad_norm": 0.3401288688182831,
"learning_rate": 0.00045166336633663365,
"loss": 3.3147,
"step": 42550
},
{
"epoch": 12.398230088495575,
"grad_norm": 0.3882862329483032,
"learning_rate": 0.0004514886429819452,
"loss": 3.3241,
"step": 42600
},
{
"epoch": 12.412785281788542,
"grad_norm": 0.33733412623405457,
"learning_rate": 0.0004513139196272568,
"loss": 3.3212,
"step": 42650
},
{
"epoch": 12.42734047508151,
"grad_norm": 0.35020992159843445,
"learning_rate": 0.00045113919627256843,
"loss": 3.3179,
"step": 42700
},
{
"epoch": 12.441895668374476,
"grad_norm": 0.36065271496772766,
"learning_rate": 0.00045096447291788003,
"loss": 3.3266,
"step": 42750
},
{
"epoch": 12.456450861667443,
"grad_norm": 0.34550905227661133,
"learning_rate": 0.00045078974956319157,
"loss": 3.3288,
"step": 42800
},
{
"epoch": 12.47100605496041,
"grad_norm": 0.3330172896385193,
"learning_rate": 0.00045061502620850316,
"loss": 3.328,
"step": 42850
},
{
"epoch": 12.485561248253378,
"grad_norm": 0.3609847128391266,
"learning_rate": 0.00045044030285381476,
"loss": 3.3256,
"step": 42900
},
{
"epoch": 12.500116441546343,
"grad_norm": 0.35363641381263733,
"learning_rate": 0.0004502655794991263,
"loss": 3.3383,
"step": 42950
},
{
"epoch": 12.51467163483931,
"grad_norm": 0.3697940707206726,
"learning_rate": 0.00045009085614443795,
"loss": 3.3237,
"step": 43000
},
{
"epoch": 12.51467163483931,
"eval_accuracy": 0.3703875380626869,
"eval_loss": 3.5561211109161377,
"eval_runtime": 179.7227,
"eval_samples_per_second": 92.648,
"eval_steps_per_second": 5.792,
"step": 43000
},
{
"epoch": 12.529226828132277,
"grad_norm": 0.37186741828918457,
"learning_rate": 0.00044991613278974954,
"loss": 3.3344,
"step": 43050
},
{
"epoch": 12.543782021425244,
"grad_norm": 0.3866070508956909,
"learning_rate": 0.00044974140943506113,
"loss": 3.3314,
"step": 43100
},
{
"epoch": 12.558337214718211,
"grad_norm": 0.363854318857193,
"learning_rate": 0.0004495666860803727,
"loss": 3.3361,
"step": 43150
},
{
"epoch": 12.572892408011178,
"grad_norm": 0.3626774251461029,
"learning_rate": 0.00044939196272568427,
"loss": 3.3318,
"step": 43200
},
{
"epoch": 12.587447601304145,
"grad_norm": 0.38811835646629333,
"learning_rate": 0.00044921723937099586,
"loss": 3.3316,
"step": 43250
},
{
"epoch": 12.602002794597112,
"grad_norm": 0.35809630155563354,
"learning_rate": 0.0004490425160163075,
"loss": 3.3345,
"step": 43300
},
{
"epoch": 12.61655798789008,
"grad_norm": 0.3418888747692108,
"learning_rate": 0.00044886779266161905,
"loss": 3.3445,
"step": 43350
},
{
"epoch": 12.631113181183046,
"grad_norm": 0.3369224965572357,
"learning_rate": 0.00044869306930693065,
"loss": 3.3289,
"step": 43400
},
{
"epoch": 12.645668374476013,
"grad_norm": 0.3346802592277527,
"learning_rate": 0.00044851834595224224,
"loss": 3.3417,
"step": 43450
},
{
"epoch": 12.66022356776898,
"grad_norm": 0.3884475827217102,
"learning_rate": 0.00044834362259755383,
"loss": 3.3461,
"step": 43500
},
{
"epoch": 12.674778761061948,
"grad_norm": 0.35574793815612793,
"learning_rate": 0.00044816889924286543,
"loss": 3.3445,
"step": 43550
},
{
"epoch": 12.689333954354915,
"grad_norm": 0.3697713315486908,
"learning_rate": 0.000447994175888177,
"loss": 3.3392,
"step": 43600
},
{
"epoch": 12.703889147647882,
"grad_norm": 0.35330986976623535,
"learning_rate": 0.0004478194525334886,
"loss": 3.3326,
"step": 43650
},
{
"epoch": 12.718444340940847,
"grad_norm": 0.3685454726219177,
"learning_rate": 0.0004476447291788002,
"loss": 3.3491,
"step": 43700
},
{
"epoch": 12.732999534233814,
"grad_norm": 0.3632424473762512,
"learning_rate": 0.00044747000582411175,
"loss": 3.3416,
"step": 43750
},
{
"epoch": 12.747554727526781,
"grad_norm": 0.35729971528053284,
"learning_rate": 0.00044729528246942335,
"loss": 3.3286,
"step": 43800
},
{
"epoch": 12.762109920819748,
"grad_norm": 0.36447879672050476,
"learning_rate": 0.000447120559114735,
"loss": 3.331,
"step": 43850
},
{
"epoch": 12.776665114112715,
"grad_norm": 0.34526267647743225,
"learning_rate": 0.0004469458357600466,
"loss": 3.3558,
"step": 43900
},
{
"epoch": 12.791220307405682,
"grad_norm": 0.36058372259140015,
"learning_rate": 0.00044677111240535813,
"loss": 3.3435,
"step": 43950
},
{
"epoch": 12.80577550069865,
"grad_norm": 0.37143009901046753,
"learning_rate": 0.0004465963890506697,
"loss": 3.3553,
"step": 44000
},
{
"epoch": 12.80577550069865,
"eval_accuracy": 0.37099597692280845,
"eval_loss": 3.545367479324341,
"eval_runtime": 179.758,
"eval_samples_per_second": 92.63,
"eval_steps_per_second": 5.791,
"step": 44000
},
{
"epoch": 12.820330693991616,
"grad_norm": 0.35331812500953674,
"learning_rate": 0.0004464216656959813,
"loss": 3.3553,
"step": 44050
},
{
"epoch": 12.834885887284583,
"grad_norm": 0.3394143283367157,
"learning_rate": 0.00044624694234129297,
"loss": 3.3457,
"step": 44100
},
{
"epoch": 12.84944108057755,
"grad_norm": 0.36839503049850464,
"learning_rate": 0.0004460722189866045,
"loss": 3.3476,
"step": 44150
},
{
"epoch": 12.863996273870518,
"grad_norm": 0.3979646563529968,
"learning_rate": 0.0004458974956319161,
"loss": 3.3549,
"step": 44200
},
{
"epoch": 12.878551467163485,
"grad_norm": 0.3441554605960846,
"learning_rate": 0.0004457227722772277,
"loss": 3.3421,
"step": 44250
},
{
"epoch": 12.89310666045645,
"grad_norm": 0.3389278054237366,
"learning_rate": 0.00044554804892253923,
"loss": 3.3441,
"step": 44300
},
{
"epoch": 12.907661853749417,
"grad_norm": 0.3513207733631134,
"learning_rate": 0.00044537332556785083,
"loss": 3.3579,
"step": 44350
},
{
"epoch": 12.922217047042384,
"grad_norm": 0.3634328544139862,
"learning_rate": 0.0004451986022131625,
"loss": 3.3447,
"step": 44400
},
{
"epoch": 12.936772240335351,
"grad_norm": 0.34999966621398926,
"learning_rate": 0.00044502387885847407,
"loss": 3.3522,
"step": 44450
},
{
"epoch": 12.951327433628318,
"grad_norm": 0.34144464135169983,
"learning_rate": 0.0004448491555037856,
"loss": 3.3623,
"step": 44500
},
{
"epoch": 12.965882626921285,
"grad_norm": 0.35570549964904785,
"learning_rate": 0.0004446744321490972,
"loss": 3.3436,
"step": 44550
},
{
"epoch": 12.980437820214252,
"grad_norm": 0.351752907037735,
"learning_rate": 0.0004444997087944088,
"loss": 3.3526,
"step": 44600
},
{
"epoch": 12.99499301350722,
"grad_norm": 0.3449532389640808,
"learning_rate": 0.0004443249854397204,
"loss": 3.3437,
"step": 44650
},
{
"epoch": 13.009315323707499,
"grad_norm": 0.33416709303855896,
"learning_rate": 0.000444150262085032,
"loss": 3.2859,
"step": 44700
},
{
"epoch": 13.023870517000466,
"grad_norm": 0.34715038537979126,
"learning_rate": 0.0004439755387303436,
"loss": 3.2449,
"step": 44750
},
{
"epoch": 13.038425710293433,
"grad_norm": 0.3305808901786804,
"learning_rate": 0.0004438008153756552,
"loss": 3.2539,
"step": 44800
},
{
"epoch": 13.0529809035864,
"grad_norm": 0.38518592715263367,
"learning_rate": 0.00044362609202096677,
"loss": 3.2619,
"step": 44850
},
{
"epoch": 13.067536096879367,
"grad_norm": 0.35659927129745483,
"learning_rate": 0.0004434513686662783,
"loss": 3.2589,
"step": 44900
},
{
"epoch": 13.082091290172334,
"grad_norm": 0.3617592453956604,
"learning_rate": 0.00044327664531158996,
"loss": 3.2477,
"step": 44950
},
{
"epoch": 13.0966464834653,
"grad_norm": 0.36155426502227783,
"learning_rate": 0.00044310192195690155,
"loss": 3.266,
"step": 45000
},
{
"epoch": 13.0966464834653,
"eval_accuracy": 0.370269658175358,
"eval_loss": 3.562215566635132,
"eval_runtime": 179.6666,
"eval_samples_per_second": 92.677,
"eval_steps_per_second": 5.794,
"step": 45000
},
{
"epoch": 13.111201676758267,
"grad_norm": 0.35711902379989624,
"learning_rate": 0.00044292719860221315,
"loss": 3.2665,
"step": 45050
},
{
"epoch": 13.125756870051234,
"grad_norm": 0.3325355648994446,
"learning_rate": 0.0004427524752475247,
"loss": 3.2799,
"step": 45100
},
{
"epoch": 13.1403120633442,
"grad_norm": 0.3590795695781708,
"learning_rate": 0.0004425777518928363,
"loss": 3.2741,
"step": 45150
},
{
"epoch": 13.154867256637168,
"grad_norm": 0.36923661828041077,
"learning_rate": 0.0004424030285381479,
"loss": 3.2727,
"step": 45200
},
{
"epoch": 13.169422449930135,
"grad_norm": 0.3838081359863281,
"learning_rate": 0.0004422283051834595,
"loss": 3.2678,
"step": 45250
},
{
"epoch": 13.183977643223102,
"grad_norm": 0.36855098605155945,
"learning_rate": 0.00044205358182877107,
"loss": 3.2836,
"step": 45300
},
{
"epoch": 13.198532836516069,
"grad_norm": 0.3482199013233185,
"learning_rate": 0.00044187885847408266,
"loss": 3.2763,
"step": 45350
},
{
"epoch": 13.213088029809036,
"grad_norm": 0.3591024875640869,
"learning_rate": 0.00044170413511939425,
"loss": 3.2913,
"step": 45400
},
{
"epoch": 13.227643223102003,
"grad_norm": 0.3705367147922516,
"learning_rate": 0.0004415294117647058,
"loss": 3.2844,
"step": 45450
},
{
"epoch": 13.24219841639497,
"grad_norm": 0.35116633772850037,
"learning_rate": 0.00044135468841001744,
"loss": 3.2834,
"step": 45500
},
{
"epoch": 13.256753609687937,
"grad_norm": 0.3566817343235016,
"learning_rate": 0.00044117996505532904,
"loss": 3.2826,
"step": 45550
},
{
"epoch": 13.271308802980904,
"grad_norm": 0.35147467255592346,
"learning_rate": 0.00044100524170064063,
"loss": 3.2919,
"step": 45600
},
{
"epoch": 13.285863996273871,
"grad_norm": 0.3430477976799011,
"learning_rate": 0.0004408305183459522,
"loss": 3.2787,
"step": 45650
},
{
"epoch": 13.300419189566837,
"grad_norm": 0.37397006154060364,
"learning_rate": 0.00044065579499126377,
"loss": 3.2915,
"step": 45700
},
{
"epoch": 13.314974382859804,
"grad_norm": 0.38754358887672424,
"learning_rate": 0.00044048107163657536,
"loss": 3.3088,
"step": 45750
},
{
"epoch": 13.32952957615277,
"grad_norm": 0.3674768805503845,
"learning_rate": 0.000440306348281887,
"loss": 3.2898,
"step": 45800
},
{
"epoch": 13.344084769445738,
"grad_norm": 0.3887562155723572,
"learning_rate": 0.0004401316249271986,
"loss": 3.2986,
"step": 45850
},
{
"epoch": 13.358639962738705,
"grad_norm": 0.36924323439598083,
"learning_rate": 0.00043995690157251014,
"loss": 3.2946,
"step": 45900
},
{
"epoch": 13.373195156031672,
"grad_norm": 0.3964468538761139,
"learning_rate": 0.00043978217821782174,
"loss": 3.2932,
"step": 45950
},
{
"epoch": 13.387750349324639,
"grad_norm": 0.3599477708339691,
"learning_rate": 0.00043960745486313333,
"loss": 3.298,
"step": 46000
},
{
"epoch": 13.387750349324639,
"eval_accuracy": 0.37062588343806385,
"eval_loss": 3.556173086166382,
"eval_runtime": 179.6561,
"eval_samples_per_second": 92.683,
"eval_steps_per_second": 5.794,
"step": 46000
},
{
"epoch": 13.402305542617606,
"grad_norm": 0.3589121103286743,
"learning_rate": 0.00043943273150844487,
"loss": 3.3072,
"step": 46050
},
{
"epoch": 13.416860735910573,
"grad_norm": 0.3620617687702179,
"learning_rate": 0.0004392580081537565,
"loss": 3.3076,
"step": 46100
},
{
"epoch": 13.43141592920354,
"grad_norm": 0.3338106870651245,
"learning_rate": 0.0004390832847990681,
"loss": 3.306,
"step": 46150
},
{
"epoch": 13.445971122496507,
"grad_norm": 0.34184029698371887,
"learning_rate": 0.0004389085614443797,
"loss": 3.3005,
"step": 46200
},
{
"epoch": 13.460526315789474,
"grad_norm": 0.3470214009284973,
"learning_rate": 0.00043873383808969125,
"loss": 3.3065,
"step": 46250
},
{
"epoch": 13.475081509082441,
"grad_norm": 0.39131850004196167,
"learning_rate": 0.00043855911473500284,
"loss": 3.3043,
"step": 46300
},
{
"epoch": 13.489636702375407,
"grad_norm": 0.37218213081359863,
"learning_rate": 0.0004383843913803145,
"loss": 3.329,
"step": 46350
},
{
"epoch": 13.504191895668374,
"grad_norm": 0.3680576980113983,
"learning_rate": 0.0004382096680256261,
"loss": 3.3217,
"step": 46400
},
{
"epoch": 13.51874708896134,
"grad_norm": 0.4002123177051544,
"learning_rate": 0.0004380349446709376,
"loss": 3.3121,
"step": 46450
},
{
"epoch": 13.533302282254308,
"grad_norm": 0.36680230498313904,
"learning_rate": 0.0004378602213162492,
"loss": 3.3098,
"step": 46500
},
{
"epoch": 13.547857475547275,
"grad_norm": 0.32874155044555664,
"learning_rate": 0.0004376854979615608,
"loss": 3.304,
"step": 46550
},
{
"epoch": 13.562412668840242,
"grad_norm": 0.3655678629875183,
"learning_rate": 0.0004375107746068724,
"loss": 3.3152,
"step": 46600
},
{
"epoch": 13.576967862133209,
"grad_norm": 0.3674863874912262,
"learning_rate": 0.000437336051252184,
"loss": 3.3161,
"step": 46650
},
{
"epoch": 13.591523055426176,
"grad_norm": 0.34729015827178955,
"learning_rate": 0.0004371613278974956,
"loss": 3.3201,
"step": 46700
},
{
"epoch": 13.606078248719143,
"grad_norm": 0.3326950967311859,
"learning_rate": 0.0004369866045428072,
"loss": 3.305,
"step": 46750
},
{
"epoch": 13.62063344201211,
"grad_norm": 0.38385745882987976,
"learning_rate": 0.0004368118811881188,
"loss": 3.3202,
"step": 46800
},
{
"epoch": 13.635188635305077,
"grad_norm": 0.35021325945854187,
"learning_rate": 0.0004366371578334303,
"loss": 3.3206,
"step": 46850
},
{
"epoch": 13.649743828598044,
"grad_norm": 0.36092448234558105,
"learning_rate": 0.000436462434478742,
"loss": 3.3223,
"step": 46900
},
{
"epoch": 13.664299021891011,
"grad_norm": 0.35010001063346863,
"learning_rate": 0.00043628771112405357,
"loss": 3.3283,
"step": 46950
},
{
"epoch": 13.678854215183978,
"grad_norm": 0.33859771490097046,
"learning_rate": 0.00043611298776936516,
"loss": 3.3193,
"step": 47000
},
{
"epoch": 13.678854215183978,
"eval_accuracy": 0.37116862453445965,
"eval_loss": 3.5511391162872314,
"eval_runtime": 179.5909,
"eval_samples_per_second": 92.716,
"eval_steps_per_second": 5.797,
"step": 47000
},
{
"epoch": 13.693409408476944,
"grad_norm": 0.35317522287368774,
"learning_rate": 0.0004359382644146767,
"loss": 3.3187,
"step": 47050
},
{
"epoch": 13.70796460176991,
"grad_norm": 0.4268363118171692,
"learning_rate": 0.0004357635410599883,
"loss": 3.3372,
"step": 47100
},
{
"epoch": 13.722519795062878,
"grad_norm": 0.3827083706855774,
"learning_rate": 0.0004355888177052999,
"loss": 3.3369,
"step": 47150
},
{
"epoch": 13.737074988355845,
"grad_norm": 0.3427564203739166,
"learning_rate": 0.00043541409435061154,
"loss": 3.3282,
"step": 47200
},
{
"epoch": 13.751630181648812,
"grad_norm": 0.36137208342552185,
"learning_rate": 0.0004352393709959231,
"loss": 3.3189,
"step": 47250
},
{
"epoch": 13.766185374941779,
"grad_norm": 0.33945193886756897,
"learning_rate": 0.0004350646476412347,
"loss": 3.3331,
"step": 47300
},
{
"epoch": 13.780740568234746,
"grad_norm": 0.40009352564811707,
"learning_rate": 0.00043488992428654627,
"loss": 3.3407,
"step": 47350
},
{
"epoch": 13.795295761527713,
"grad_norm": 0.3917628228664398,
"learning_rate": 0.0004347152009318578,
"loss": 3.33,
"step": 47400
},
{
"epoch": 13.80985095482068,
"grad_norm": 0.3363507091999054,
"learning_rate": 0.00043454047757716946,
"loss": 3.3353,
"step": 47450
},
{
"epoch": 13.824406148113647,
"grad_norm": 0.33481940627098083,
"learning_rate": 0.00043436575422248105,
"loss": 3.3309,
"step": 47500
},
{
"epoch": 13.838961341406614,
"grad_norm": 0.3893332779407501,
"learning_rate": 0.00043419103086779265,
"loss": 3.3376,
"step": 47550
},
{
"epoch": 13.853516534699581,
"grad_norm": 0.3612382411956787,
"learning_rate": 0.0004340163075131042,
"loss": 3.3238,
"step": 47600
},
{
"epoch": 13.868071727992549,
"grad_norm": 0.3898193836212158,
"learning_rate": 0.0004338415841584158,
"loss": 3.3458,
"step": 47650
},
{
"epoch": 13.882626921285514,
"grad_norm": 0.38638564944267273,
"learning_rate": 0.0004336668608037274,
"loss": 3.3269,
"step": 47700
},
{
"epoch": 13.89718211457848,
"grad_norm": 0.3925230801105499,
"learning_rate": 0.000433492137449039,
"loss": 3.3353,
"step": 47750
},
{
"epoch": 13.911737307871448,
"grad_norm": 0.3488530218601227,
"learning_rate": 0.00043331741409435056,
"loss": 3.3336,
"step": 47800
},
{
"epoch": 13.926292501164415,
"grad_norm": 0.35001108050346375,
"learning_rate": 0.00043314269073966216,
"loss": 3.3381,
"step": 47850
},
{
"epoch": 13.940847694457382,
"grad_norm": 0.3923191428184509,
"learning_rate": 0.00043296796738497375,
"loss": 3.3239,
"step": 47900
},
{
"epoch": 13.955402887750349,
"grad_norm": 0.37925252318382263,
"learning_rate": 0.00043279324403028535,
"loss": 3.352,
"step": 47950
},
{
"epoch": 13.969958081043316,
"grad_norm": 0.3492947220802307,
"learning_rate": 0.0004326185206755969,
"loss": 3.3377,
"step": 48000
},
{
"epoch": 13.969958081043316,
"eval_accuracy": 0.3718900071350827,
"eval_loss": 3.5379514694213867,
"eval_runtime": 179.6614,
"eval_samples_per_second": 92.68,
"eval_steps_per_second": 5.794,
"step": 48000
},
{
"epoch": 13.984513274336283,
"grad_norm": 0.3619728982448578,
"learning_rate": 0.00043244379732090854,
"loss": 3.3463,
"step": 48050
},
{
"epoch": 13.99906846762925,
"grad_norm": 0.34894421696662903,
"learning_rate": 0.00043226907396622013,
"loss": 3.3319,
"step": 48100
},
{
"epoch": 14.01339077782953,
"grad_norm": 0.3701537549495697,
"learning_rate": 0.0004320943506115317,
"loss": 3.2336,
"step": 48150
},
{
"epoch": 14.027945971122497,
"grad_norm": 0.33407357335090637,
"learning_rate": 0.00043191962725684326,
"loss": 3.2156,
"step": 48200
},
{
"epoch": 14.042501164415464,
"grad_norm": 0.3887421190738678,
"learning_rate": 0.00043174490390215486,
"loss": 3.2451,
"step": 48250
},
{
"epoch": 14.057056357708431,
"grad_norm": 0.34901162981987,
"learning_rate": 0.0004315701805474665,
"loss": 3.2342,
"step": 48300
},
{
"epoch": 14.071611551001398,
"grad_norm": 0.36356404423713684,
"learning_rate": 0.0004313954571927781,
"loss": 3.2384,
"step": 48350
},
{
"epoch": 14.086166744294363,
"grad_norm": 0.3423268496990204,
"learning_rate": 0.00043122073383808964,
"loss": 3.2502,
"step": 48400
},
{
"epoch": 14.10072193758733,
"grad_norm": 0.3743983507156372,
"learning_rate": 0.00043104601048340124,
"loss": 3.2497,
"step": 48450
},
{
"epoch": 14.115277130880298,
"grad_norm": 0.40020278096199036,
"learning_rate": 0.00043087128712871283,
"loss": 3.2558,
"step": 48500
},
{
"epoch": 14.129832324173265,
"grad_norm": 0.33450552821159363,
"learning_rate": 0.00043069656377402437,
"loss": 3.2594,
"step": 48550
},
{
"epoch": 14.144387517466232,
"grad_norm": 0.3431876599788666,
"learning_rate": 0.000430521840419336,
"loss": 3.2497,
"step": 48600
},
{
"epoch": 14.158942710759199,
"grad_norm": 0.357308954000473,
"learning_rate": 0.0004303471170646476,
"loss": 3.2554,
"step": 48650
},
{
"epoch": 14.173497904052166,
"grad_norm": 0.355058491230011,
"learning_rate": 0.0004301723937099592,
"loss": 3.2586,
"step": 48700
},
{
"epoch": 14.188053097345133,
"grad_norm": 0.39194077253341675,
"learning_rate": 0.00042999767035527075,
"loss": 3.2703,
"step": 48750
},
{
"epoch": 14.2026082906381,
"grad_norm": 0.3674411177635193,
"learning_rate": 0.00042982294700058234,
"loss": 3.2657,
"step": 48800
},
{
"epoch": 14.217163483931067,
"grad_norm": 0.3595729470252991,
"learning_rate": 0.000429648223645894,
"loss": 3.271,
"step": 48850
},
{
"epoch": 14.231718677224034,
"grad_norm": 0.36931276321411133,
"learning_rate": 0.0004294735002912056,
"loss": 3.2785,
"step": 48900
},
{
"epoch": 14.246273870517001,
"grad_norm": 0.3702554702758789,
"learning_rate": 0.0004292987769365172,
"loss": 3.2676,
"step": 48950
},
{
"epoch": 14.260829063809968,
"grad_norm": 0.3658660352230072,
"learning_rate": 0.0004291240535818287,
"loss": 3.2707,
"step": 49000
},
{
"epoch": 14.260829063809968,
"eval_accuracy": 0.3710951699685767,
"eval_loss": 3.5558922290802,
"eval_runtime": 179.6472,
"eval_samples_per_second": 92.687,
"eval_steps_per_second": 5.795,
"step": 49000
},
{
"epoch": 14.275384257102935,
"grad_norm": 0.35185012221336365,
"learning_rate": 0.0004289493302271403,
"loss": 3.2838,
"step": 49050
},
{
"epoch": 14.2899394503959,
"grad_norm": 0.3410630524158478,
"learning_rate": 0.0004287746068724519,
"loss": 3.288,
"step": 49100
},
{
"epoch": 14.304494643688868,
"grad_norm": 0.3545389771461487,
"learning_rate": 0.00042859988351776356,
"loss": 3.2729,
"step": 49150
},
{
"epoch": 14.319049836981835,
"grad_norm": 0.3831106424331665,
"learning_rate": 0.0004284251601630751,
"loss": 3.2881,
"step": 49200
},
{
"epoch": 14.333605030274802,
"grad_norm": 0.36799466609954834,
"learning_rate": 0.0004282504368083867,
"loss": 3.2728,
"step": 49250
},
{
"epoch": 14.348160223567769,
"grad_norm": 0.35214364528656006,
"learning_rate": 0.0004280757134536983,
"loss": 3.2897,
"step": 49300
},
{
"epoch": 14.362715416860736,
"grad_norm": 0.366825133562088,
"learning_rate": 0.0004279009900990098,
"loss": 3.2857,
"step": 49350
},
{
"epoch": 14.377270610153703,
"grad_norm": 0.3538905382156372,
"learning_rate": 0.0004277262667443214,
"loss": 3.2864,
"step": 49400
},
{
"epoch": 14.39182580344667,
"grad_norm": 0.3406408727169037,
"learning_rate": 0.00042755154338963307,
"loss": 3.2917,
"step": 49450
},
{
"epoch": 14.406380996739637,
"grad_norm": 0.3695243299007416,
"learning_rate": 0.00042737682003494466,
"loss": 3.2993,
"step": 49500
},
{
"epoch": 14.420936190032604,
"grad_norm": 0.38720056414604187,
"learning_rate": 0.0004272020966802562,
"loss": 3.3039,
"step": 49550
},
{
"epoch": 14.435491383325571,
"grad_norm": 0.35300391912460327,
"learning_rate": 0.0004270273733255678,
"loss": 3.2896,
"step": 49600
},
{
"epoch": 14.450046576618538,
"grad_norm": 0.37260785698890686,
"learning_rate": 0.0004268526499708794,
"loss": 3.2924,
"step": 49650
},
{
"epoch": 14.464601769911505,
"grad_norm": 0.3798984885215759,
"learning_rate": 0.00042667792661619104,
"loss": 3.2954,
"step": 49700
},
{
"epoch": 14.47915696320447,
"grad_norm": 0.37590479850769043,
"learning_rate": 0.0004265032032615026,
"loss": 3.2964,
"step": 49750
},
{
"epoch": 14.493712156497438,
"grad_norm": 0.3550393879413605,
"learning_rate": 0.0004263284799068142,
"loss": 3.2941,
"step": 49800
},
{
"epoch": 14.508267349790405,
"grad_norm": 0.3456578552722931,
"learning_rate": 0.00042615375655212577,
"loss": 3.3006,
"step": 49850
},
{
"epoch": 14.522822543083372,
"grad_norm": 0.3781253397464752,
"learning_rate": 0.00042597903319743736,
"loss": 3.287,
"step": 49900
},
{
"epoch": 14.537377736376339,
"grad_norm": 0.361044704914093,
"learning_rate": 0.0004258043098427489,
"loss": 3.293,
"step": 49950
},
{
"epoch": 14.551932929669306,
"grad_norm": 0.37364527583122253,
"learning_rate": 0.00042562958648806055,
"loss": 3.2986,
"step": 50000
},
{
"epoch": 14.551932929669306,
"eval_accuracy": 0.3713522021855143,
"eval_loss": 3.547536849975586,
"eval_runtime": 179.586,
"eval_samples_per_second": 92.719,
"eval_steps_per_second": 5.797,
"step": 50000
},
{
"epoch": 14.566488122962273,
"grad_norm": 0.38099926710128784,
"learning_rate": 0.00042545486313337214,
"loss": 3.2904,
"step": 50050
},
{
"epoch": 14.58104331625524,
"grad_norm": 0.37132102251052856,
"learning_rate": 0.00042528013977868374,
"loss": 3.2997,
"step": 50100
},
{
"epoch": 14.595598509548207,
"grad_norm": 0.35718464851379395,
"learning_rate": 0.0004251054164239953,
"loss": 3.2906,
"step": 50150
},
{
"epoch": 14.610153702841174,
"grad_norm": 0.4218481481075287,
"learning_rate": 0.0004249306930693069,
"loss": 3.3184,
"step": 50200
},
{
"epoch": 14.624708896134141,
"grad_norm": 0.37840861082077026,
"learning_rate": 0.0004247559697146185,
"loss": 3.3032,
"step": 50250
},
{
"epoch": 14.639264089427108,
"grad_norm": 0.35636797547340393,
"learning_rate": 0.0004245812463599301,
"loss": 3.3033,
"step": 50300
},
{
"epoch": 14.653819282720075,
"grad_norm": 0.35813337564468384,
"learning_rate": 0.00042440652300524166,
"loss": 3.2904,
"step": 50350
},
{
"epoch": 14.668374476013042,
"grad_norm": 0.36577391624450684,
"learning_rate": 0.00042423179965055325,
"loss": 3.305,
"step": 50400
},
{
"epoch": 14.682929669306008,
"grad_norm": 0.37621748447418213,
"learning_rate": 0.00042405707629586484,
"loss": 3.3105,
"step": 50450
},
{
"epoch": 14.697484862598975,
"grad_norm": 0.3534005284309387,
"learning_rate": 0.0004238823529411764,
"loss": 3.3143,
"step": 50500
},
{
"epoch": 14.712040055891942,
"grad_norm": 0.33455362915992737,
"learning_rate": 0.00042370762958648803,
"loss": 3.3092,
"step": 50550
},
{
"epoch": 14.726595249184909,
"grad_norm": 0.36176812648773193,
"learning_rate": 0.00042353290623179963,
"loss": 3.3064,
"step": 50600
},
{
"epoch": 14.741150442477876,
"grad_norm": 0.3369339108467102,
"learning_rate": 0.0004233581828771112,
"loss": 3.3078,
"step": 50650
},
{
"epoch": 14.755705635770843,
"grad_norm": 0.37076041102409363,
"learning_rate": 0.00042318345952242276,
"loss": 3.3203,
"step": 50700
},
{
"epoch": 14.77026082906381,
"grad_norm": 0.3623945415019989,
"learning_rate": 0.00042300873616773436,
"loss": 3.3136,
"step": 50750
},
{
"epoch": 14.784816022356777,
"grad_norm": 0.3696160316467285,
"learning_rate": 0.00042283401281304595,
"loss": 3.3155,
"step": 50800
},
{
"epoch": 14.799371215649744,
"grad_norm": 0.3646029829978943,
"learning_rate": 0.0004226592894583576,
"loss": 3.3136,
"step": 50850
},
{
"epoch": 14.813926408942711,
"grad_norm": 0.3717617094516754,
"learning_rate": 0.00042248456610366914,
"loss": 3.3075,
"step": 50900
},
{
"epoch": 14.828481602235678,
"grad_norm": 0.3565121591091156,
"learning_rate": 0.00042230984274898073,
"loss": 3.3208,
"step": 50950
},
{
"epoch": 14.843036795528645,
"grad_norm": 0.36482352018356323,
"learning_rate": 0.00042213511939429233,
"loss": 3.3352,
"step": 51000
},
{
"epoch": 14.843036795528645,
"eval_accuracy": 0.3720694713304479,
"eval_loss": 3.5395095348358154,
"eval_runtime": 179.5403,
"eval_samples_per_second": 92.742,
"eval_steps_per_second": 5.798,
"step": 51000
},
{
"epoch": 14.857591988821612,
"grad_norm": 0.34361740946769714,
"learning_rate": 0.0004219603960396039,
"loss": 3.3154,
"step": 51050
},
{
"epoch": 14.872147182114578,
"grad_norm": 0.3463952839374542,
"learning_rate": 0.0004217856726849155,
"loss": 3.3222,
"step": 51100
},
{
"epoch": 14.886702375407545,
"grad_norm": 0.35919591784477234,
"learning_rate": 0.0004216109493302271,
"loss": 3.3276,
"step": 51150
},
{
"epoch": 14.901257568700512,
"grad_norm": 0.3705897331237793,
"learning_rate": 0.0004214362259755387,
"loss": 3.317,
"step": 51200
},
{
"epoch": 14.915812761993479,
"grad_norm": 0.3571067750453949,
"learning_rate": 0.0004212615026208503,
"loss": 3.324,
"step": 51250
},
{
"epoch": 14.930367955286446,
"grad_norm": 0.34250903129577637,
"learning_rate": 0.00042108677926616184,
"loss": 3.3237,
"step": 51300
},
{
"epoch": 14.944923148579413,
"grad_norm": 0.37873947620391846,
"learning_rate": 0.00042091205591147343,
"loss": 3.3237,
"step": 51350
},
{
"epoch": 14.95947834187238,
"grad_norm": 0.409123957157135,
"learning_rate": 0.0004207373325567851,
"loss": 3.3299,
"step": 51400
},
{
"epoch": 14.974033535165347,
"grad_norm": 0.3576182425022125,
"learning_rate": 0.0004205626092020967,
"loss": 3.3202,
"step": 51450
},
{
"epoch": 14.988588728458314,
"grad_norm": 0.36448413133621216,
"learning_rate": 0.0004203878858474082,
"loss": 3.3059,
"step": 51500
},
{
"epoch": 15.002911038658594,
"grad_norm": 0.3610822856426239,
"learning_rate": 0.0004202131624927198,
"loss": 3.3108,
"step": 51550
},
{
"epoch": 15.01746623195156,
"grad_norm": 0.38162854313850403,
"learning_rate": 0.0004200384391380314,
"loss": 3.2257,
"step": 51600
},
{
"epoch": 15.032021425244528,
"grad_norm": 0.40614140033721924,
"learning_rate": 0.00041986371578334305,
"loss": 3.2096,
"step": 51650
},
{
"epoch": 15.046576618537495,
"grad_norm": 0.3821380138397217,
"learning_rate": 0.0004196889924286546,
"loss": 3.2207,
"step": 51700
},
{
"epoch": 15.06113181183046,
"grad_norm": 0.3656452000141144,
"learning_rate": 0.0004195142690739662,
"loss": 3.2344,
"step": 51750
},
{
"epoch": 15.075687005123427,
"grad_norm": 0.37217843532562256,
"learning_rate": 0.0004193395457192778,
"loss": 3.2278,
"step": 51800
},
{
"epoch": 15.090242198416394,
"grad_norm": 0.3583957254886627,
"learning_rate": 0.0004191648223645893,
"loss": 3.2347,
"step": 51850
},
{
"epoch": 15.104797391709361,
"grad_norm": 0.3432309329509735,
"learning_rate": 0.0004189900990099009,
"loss": 3.236,
"step": 51900
},
{
"epoch": 15.119352585002328,
"grad_norm": 0.409397691488266,
"learning_rate": 0.00041881537565521256,
"loss": 3.2372,
"step": 51950
},
{
"epoch": 15.133907778295296,
"grad_norm": 0.3543730676174164,
"learning_rate": 0.00041864065230052416,
"loss": 3.2328,
"step": 52000
},
{
"epoch": 15.133907778295296,
"eval_accuracy": 0.3717191224330127,
"eval_loss": 3.5530853271484375,
"eval_runtime": 179.6972,
"eval_samples_per_second": 92.661,
"eval_steps_per_second": 5.793,
"step": 52000
},
{
"epoch": 15.148462971588263,
"grad_norm": 0.3441791832447052,
"learning_rate": 0.00041846592894583575,
"loss": 3.2443,
"step": 52050
},
{
"epoch": 15.16301816488123,
"grad_norm": 0.3853805661201477,
"learning_rate": 0.0004182912055911473,
"loss": 3.244,
"step": 52100
},
{
"epoch": 15.177573358174197,
"grad_norm": 0.3840022683143616,
"learning_rate": 0.0004181164822364589,
"loss": 3.2354,
"step": 52150
},
{
"epoch": 15.192128551467164,
"grad_norm": 0.3698272705078125,
"learning_rate": 0.0004179417588817705,
"loss": 3.2504,
"step": 52200
},
{
"epoch": 15.20668374476013,
"grad_norm": 0.36755281686782837,
"learning_rate": 0.00041776703552708213,
"loss": 3.2553,
"step": 52250
},
{
"epoch": 15.221238938053098,
"grad_norm": 0.3694024085998535,
"learning_rate": 0.00041759231217239367,
"loss": 3.2514,
"step": 52300
},
{
"epoch": 15.235794131346065,
"grad_norm": 0.3874948024749756,
"learning_rate": 0.00041741758881770527,
"loss": 3.2526,
"step": 52350
},
{
"epoch": 15.250349324639032,
"grad_norm": 0.36800023913383484,
"learning_rate": 0.00041724286546301686,
"loss": 3.2682,
"step": 52400
},
{
"epoch": 15.264904517931997,
"grad_norm": 0.3868958652019501,
"learning_rate": 0.0004170681421083284,
"loss": 3.2656,
"step": 52450
},
{
"epoch": 15.279459711224964,
"grad_norm": 0.3841627836227417,
"learning_rate": 0.00041689341875364005,
"loss": 3.2586,
"step": 52500
},
{
"epoch": 15.294014904517931,
"grad_norm": 0.36817166209220886,
"learning_rate": 0.00041671869539895164,
"loss": 3.2491,
"step": 52550
},
{
"epoch": 15.308570097810899,
"grad_norm": 0.3713952898979187,
"learning_rate": 0.00041654397204426324,
"loss": 3.2714,
"step": 52600
},
{
"epoch": 15.323125291103866,
"grad_norm": 0.36914798617362976,
"learning_rate": 0.0004163692486895748,
"loss": 3.2747,
"step": 52650
},
{
"epoch": 15.337680484396833,
"grad_norm": 0.39278385043144226,
"learning_rate": 0.00041619452533488637,
"loss": 3.2763,
"step": 52700
},
{
"epoch": 15.3522356776898,
"grad_norm": 0.3813706636428833,
"learning_rate": 0.00041601980198019797,
"loss": 3.2517,
"step": 52750
},
{
"epoch": 15.366790870982767,
"grad_norm": 0.3740118145942688,
"learning_rate": 0.0004158450786255096,
"loss": 3.2634,
"step": 52800
},
{
"epoch": 15.381346064275734,
"grad_norm": 0.35879942774772644,
"learning_rate": 0.00041567035527082115,
"loss": 3.2687,
"step": 52850
},
{
"epoch": 15.3959012575687,
"grad_norm": 0.3741402328014374,
"learning_rate": 0.00041549563191613275,
"loss": 3.2697,
"step": 52900
},
{
"epoch": 15.410456450861668,
"grad_norm": 0.3618059456348419,
"learning_rate": 0.00041532090856144434,
"loss": 3.2675,
"step": 52950
},
{
"epoch": 15.425011644154635,
"grad_norm": 0.3848046660423279,
"learning_rate": 0.00041514618520675594,
"loss": 3.278,
"step": 53000
},
{
"epoch": 15.425011644154635,
"eval_accuracy": 0.37196287406443856,
"eval_loss": 3.5493414402008057,
"eval_runtime": 179.6539,
"eval_samples_per_second": 92.684,
"eval_steps_per_second": 5.794,
"step": 53000
},
{
"epoch": 15.439566837447602,
"grad_norm": 0.37163642048835754,
"learning_rate": 0.00041497146185206753,
"loss": 3.2886,
"step": 53050
},
{
"epoch": 15.454122030740567,
"grad_norm": 0.3669773042201996,
"learning_rate": 0.0004147967384973791,
"loss": 3.2606,
"step": 53100
},
{
"epoch": 15.468677224033534,
"grad_norm": 0.3907168209552765,
"learning_rate": 0.0004146220151426907,
"loss": 3.2783,
"step": 53150
},
{
"epoch": 15.483232417326501,
"grad_norm": 0.3907336890697479,
"learning_rate": 0.0004144472917880023,
"loss": 3.2893,
"step": 53200
},
{
"epoch": 15.497787610619469,
"grad_norm": 0.3639543652534485,
"learning_rate": 0.00041427256843331385,
"loss": 3.272,
"step": 53250
},
{
"epoch": 15.512342803912436,
"grad_norm": 0.41702601313591003,
"learning_rate": 0.00041409784507862545,
"loss": 3.2886,
"step": 53300
},
{
"epoch": 15.526897997205403,
"grad_norm": 0.4025951027870178,
"learning_rate": 0.0004139231217239371,
"loss": 3.2761,
"step": 53350
},
{
"epoch": 15.54145319049837,
"grad_norm": 0.3820459842681885,
"learning_rate": 0.0004137483983692487,
"loss": 3.2812,
"step": 53400
},
{
"epoch": 15.556008383791337,
"grad_norm": 0.36580777168273926,
"learning_rate": 0.00041357367501456023,
"loss": 3.2967,
"step": 53450
},
{
"epoch": 15.570563577084304,
"grad_norm": 0.36303627490997314,
"learning_rate": 0.0004133989516598718,
"loss": 3.2896,
"step": 53500
},
{
"epoch": 15.585118770377271,
"grad_norm": 0.35470396280288696,
"learning_rate": 0.0004132242283051834,
"loss": 3.2799,
"step": 53550
},
{
"epoch": 15.599673963670238,
"grad_norm": 0.3685864508152008,
"learning_rate": 0.00041304950495049496,
"loss": 3.29,
"step": 53600
},
{
"epoch": 15.614229156963205,
"grad_norm": 0.3831583857536316,
"learning_rate": 0.0004128747815958066,
"loss": 3.2894,
"step": 53650
},
{
"epoch": 15.628784350256172,
"grad_norm": 0.3941470682621002,
"learning_rate": 0.0004127000582411182,
"loss": 3.2939,
"step": 53700
},
{
"epoch": 15.64333954354914,
"grad_norm": 0.3580646514892578,
"learning_rate": 0.0004125253348864298,
"loss": 3.2979,
"step": 53750
},
{
"epoch": 15.657894736842106,
"grad_norm": 0.3721872866153717,
"learning_rate": 0.00041235061153174134,
"loss": 3.2968,
"step": 53800
},
{
"epoch": 15.672449930135071,
"grad_norm": 0.3598230481147766,
"learning_rate": 0.00041217588817705293,
"loss": 3.2864,
"step": 53850
},
{
"epoch": 15.687005123428039,
"grad_norm": 0.3656480312347412,
"learning_rate": 0.0004120011648223646,
"loss": 3.2915,
"step": 53900
},
{
"epoch": 15.701560316721006,
"grad_norm": 0.35835638642311096,
"learning_rate": 0.0004118264414676762,
"loss": 3.3011,
"step": 53950
},
{
"epoch": 15.716115510013973,
"grad_norm": 0.38179880380630493,
"learning_rate": 0.0004116517181129877,
"loss": 3.2919,
"step": 54000
},
{
"epoch": 15.716115510013973,
"eval_accuracy": 0.3724329832860893,
"eval_loss": 3.5413262844085693,
"eval_runtime": 179.6208,
"eval_samples_per_second": 92.701,
"eval_steps_per_second": 5.796,
"step": 54000
},
{
"epoch": 15.73067070330694,
"grad_norm": 0.3575129210948944,
"learning_rate": 0.0004114769947582993,
"loss": 3.3032,
"step": 54050
},
{
"epoch": 15.745225896599907,
"grad_norm": 0.4014807343482971,
"learning_rate": 0.0004113022714036109,
"loss": 3.3143,
"step": 54100
},
{
"epoch": 15.759781089892874,
"grad_norm": 0.39006999135017395,
"learning_rate": 0.0004111275480489225,
"loss": 3.2954,
"step": 54150
},
{
"epoch": 15.774336283185841,
"grad_norm": 0.35772159695625305,
"learning_rate": 0.0004109528246942341,
"loss": 3.3076,
"step": 54200
},
{
"epoch": 15.788891476478808,
"grad_norm": 0.39875340461730957,
"learning_rate": 0.0004107781013395457,
"loss": 3.2932,
"step": 54250
},
{
"epoch": 15.803446669771775,
"grad_norm": 0.3441701829433441,
"learning_rate": 0.0004106033779848573,
"loss": 3.3028,
"step": 54300
},
{
"epoch": 15.818001863064742,
"grad_norm": 0.35036149621009827,
"learning_rate": 0.0004104286546301689,
"loss": 3.3049,
"step": 54350
},
{
"epoch": 15.83255705635771,
"grad_norm": 0.3675341010093689,
"learning_rate": 0.0004102539312754804,
"loss": 3.3186,
"step": 54400
},
{
"epoch": 15.847112249650674,
"grad_norm": 0.36607494950294495,
"learning_rate": 0.00041007920792079206,
"loss": 3.3086,
"step": 54450
},
{
"epoch": 15.861667442943642,
"grad_norm": 0.36645710468292236,
"learning_rate": 0.00040990448456610366,
"loss": 3.3141,
"step": 54500
},
{
"epoch": 15.876222636236609,
"grad_norm": 0.36055368185043335,
"learning_rate": 0.00040972976121141525,
"loss": 3.3134,
"step": 54550
},
{
"epoch": 15.890777829529576,
"grad_norm": 0.36668238043785095,
"learning_rate": 0.0004095550378567268,
"loss": 3.3035,
"step": 54600
},
{
"epoch": 15.905333022822543,
"grad_norm": 0.3563118278980255,
"learning_rate": 0.0004093803145020384,
"loss": 3.3093,
"step": 54650
},
{
"epoch": 15.91988821611551,
"grad_norm": 0.3816479742527008,
"learning_rate": 0.00040920559114735,
"loss": 3.3128,
"step": 54700
},
{
"epoch": 15.934443409408477,
"grad_norm": 0.3647186756134033,
"learning_rate": 0.00040903086779266163,
"loss": 3.3119,
"step": 54750
},
{
"epoch": 15.948998602701444,
"grad_norm": 0.35991621017456055,
"learning_rate": 0.00040885614443797317,
"loss": 3.3154,
"step": 54800
},
{
"epoch": 15.963553795994411,
"grad_norm": 0.3633652329444885,
"learning_rate": 0.00040868142108328476,
"loss": 3.3102,
"step": 54850
},
{
"epoch": 15.978108989287378,
"grad_norm": 0.38223573565483093,
"learning_rate": 0.00040850669772859636,
"loss": 3.3094,
"step": 54900
},
{
"epoch": 15.992664182580345,
"grad_norm": 0.3739972710609436,
"learning_rate": 0.0004083319743739079,
"loss": 3.3059,
"step": 54950
},
{
"epoch": 16.006986492780623,
"grad_norm": 0.37632840871810913,
"learning_rate": 0.0004081572510192195,
"loss": 3.2682,
"step": 55000
},
{
"epoch": 16.006986492780623,
"eval_accuracy": 0.3719969569830083,
"eval_loss": 3.5475385189056396,
"eval_runtime": 179.7295,
"eval_samples_per_second": 92.645,
"eval_steps_per_second": 5.792,
"step": 55000
},
{
"epoch": 16.02154168607359,
"grad_norm": 0.3803386688232422,
"learning_rate": 0.00040798252766453114,
"loss": 3.2038,
"step": 55050
},
{
"epoch": 16.036096879366557,
"grad_norm": 0.35486406087875366,
"learning_rate": 0.00040780780430984273,
"loss": 3.207,
"step": 55100
},
{
"epoch": 16.050652072659524,
"grad_norm": 0.36758115887641907,
"learning_rate": 0.0004076330809551543,
"loss": 3.2083,
"step": 55150
},
{
"epoch": 16.06520726595249,
"grad_norm": 0.3653200566768646,
"learning_rate": 0.00040745835760046587,
"loss": 3.2015,
"step": 55200
},
{
"epoch": 16.079762459245458,
"grad_norm": 0.4169802963733673,
"learning_rate": 0.00040728363424577746,
"loss": 3.2054,
"step": 55250
},
{
"epoch": 16.094317652538425,
"grad_norm": 0.3609280586242676,
"learning_rate": 0.0004071089108910891,
"loss": 3.2186,
"step": 55300
},
{
"epoch": 16.108872845831392,
"grad_norm": 0.38168197870254517,
"learning_rate": 0.0004069341875364007,
"loss": 3.2337,
"step": 55350
},
{
"epoch": 16.12342803912436,
"grad_norm": 0.4037674367427826,
"learning_rate": 0.00040675946418171225,
"loss": 3.225,
"step": 55400
},
{
"epoch": 16.137983232417326,
"grad_norm": 0.3925740718841553,
"learning_rate": 0.00040658474082702384,
"loss": 3.2271,
"step": 55450
},
{
"epoch": 16.152538425710294,
"grad_norm": 0.37324100732803345,
"learning_rate": 0.00040641001747233543,
"loss": 3.2314,
"step": 55500
},
{
"epoch": 16.16709361900326,
"grad_norm": 0.38064682483673096,
"learning_rate": 0.000406235294117647,
"loss": 3.2391,
"step": 55550
},
{
"epoch": 16.181648812296228,
"grad_norm": 0.3883419334888458,
"learning_rate": 0.0004060605707629586,
"loss": 3.2359,
"step": 55600
},
{
"epoch": 16.196204005589195,
"grad_norm": 0.3888667821884155,
"learning_rate": 0.0004058858474082702,
"loss": 3.2457,
"step": 55650
},
{
"epoch": 16.21075919888216,
"grad_norm": 0.3523971736431122,
"learning_rate": 0.0004057111240535818,
"loss": 3.2474,
"step": 55700
},
{
"epoch": 16.22531439217513,
"grad_norm": 0.3806838393211365,
"learning_rate": 0.00040553640069889335,
"loss": 3.241,
"step": 55750
},
{
"epoch": 16.239869585468096,
"grad_norm": 0.37035635113716125,
"learning_rate": 0.00040536167734420495,
"loss": 3.2556,
"step": 55800
},
{
"epoch": 16.254424778761063,
"grad_norm": 0.36657461524009705,
"learning_rate": 0.0004051869539895166,
"loss": 3.2501,
"step": 55850
},
{
"epoch": 16.26897997205403,
"grad_norm": 0.40011465549468994,
"learning_rate": 0.0004050122306348282,
"loss": 3.2552,
"step": 55900
},
{
"epoch": 16.283535165346997,
"grad_norm": 0.3992847800254822,
"learning_rate": 0.00040483750728013973,
"loss": 3.2518,
"step": 55950
},
{
"epoch": 16.298090358639964,
"grad_norm": 0.3844752609729767,
"learning_rate": 0.0004046627839254513,
"loss": 3.2366,
"step": 56000
},
{
"epoch": 16.298090358639964,
"eval_accuracy": 0.3725179555279027,
"eval_loss": 3.549312114715576,
"eval_runtime": 179.6265,
"eval_samples_per_second": 92.698,
"eval_steps_per_second": 5.795,
"step": 56000
},
{
"epoch": 16.31264555193293,
"grad_norm": 0.39455828070640564,
"learning_rate": 0.0004044880605707629,
"loss": 3.256,
"step": 56050
},
{
"epoch": 16.3272007452259,
"grad_norm": 0.3753792941570282,
"learning_rate": 0.00040431333721607446,
"loss": 3.2485,
"step": 56100
},
{
"epoch": 16.341755938518865,
"grad_norm": 0.3579533100128174,
"learning_rate": 0.0004041386138613861,
"loss": 3.2474,
"step": 56150
},
{
"epoch": 16.35631113181183,
"grad_norm": 0.3628307580947876,
"learning_rate": 0.0004039638905066977,
"loss": 3.2552,
"step": 56200
},
{
"epoch": 16.370866325104796,
"grad_norm": 0.3599070608615875,
"learning_rate": 0.0004037891671520093,
"loss": 3.2571,
"step": 56250
},
{
"epoch": 16.385421518397763,
"grad_norm": 0.37618762254714966,
"learning_rate": 0.0004036144437973209,
"loss": 3.2502,
"step": 56300
},
{
"epoch": 16.39997671169073,
"grad_norm": 0.37751829624176025,
"learning_rate": 0.00040343972044263243,
"loss": 3.265,
"step": 56350
},
{
"epoch": 16.414531904983697,
"grad_norm": 0.37696489691734314,
"learning_rate": 0.0004032649970879441,
"loss": 3.2667,
"step": 56400
},
{
"epoch": 16.429087098276664,
"grad_norm": 0.36506879329681396,
"learning_rate": 0.00040309027373325567,
"loss": 3.2734,
"step": 56450
},
{
"epoch": 16.44364229156963,
"grad_norm": 0.37977832555770874,
"learning_rate": 0.00040291555037856727,
"loss": 3.2638,
"step": 56500
},
{
"epoch": 16.4581974848626,
"grad_norm": 0.38764917850494385,
"learning_rate": 0.0004027408270238788,
"loss": 3.2674,
"step": 56550
},
{
"epoch": 16.472752678155565,
"grad_norm": 0.3823941648006439,
"learning_rate": 0.0004025661036691904,
"loss": 3.2692,
"step": 56600
},
{
"epoch": 16.487307871448532,
"grad_norm": 0.3607601225376129,
"learning_rate": 0.000402391380314502,
"loss": 3.2756,
"step": 56650
},
{
"epoch": 16.5018630647415,
"grad_norm": 0.40973618626594543,
"learning_rate": 0.00040221665695981364,
"loss": 3.2684,
"step": 56700
},
{
"epoch": 16.516418258034467,
"grad_norm": 0.3708013594150543,
"learning_rate": 0.0004020419336051252,
"loss": 3.2718,
"step": 56750
},
{
"epoch": 16.530973451327434,
"grad_norm": 0.371579110622406,
"learning_rate": 0.0004018672102504368,
"loss": 3.2746,
"step": 56800
},
{
"epoch": 16.5455286446204,
"grad_norm": 0.373159259557724,
"learning_rate": 0.00040169248689574837,
"loss": 3.2793,
"step": 56850
},
{
"epoch": 16.560083837913368,
"grad_norm": 0.39505279064178467,
"learning_rate": 0.0004015177635410599,
"loss": 3.2851,
"step": 56900
},
{
"epoch": 16.574639031206335,
"grad_norm": 0.4192643165588379,
"learning_rate": 0.0004013430401863715,
"loss": 3.2629,
"step": 56950
},
{
"epoch": 16.589194224499302,
"grad_norm": 0.3601832687854767,
"learning_rate": 0.00040116831683168315,
"loss": 3.2881,
"step": 57000
},
{
"epoch": 16.589194224499302,
"eval_accuracy": 0.37245072990920663,
"eval_loss": 3.5421199798583984,
"eval_runtime": 179.6507,
"eval_samples_per_second": 92.685,
"eval_steps_per_second": 5.795,
"step": 57000
},
{
"epoch": 16.60374941779227,
"grad_norm": 0.3881416320800781,
"learning_rate": 0.00040099359347699475,
"loss": 3.2822,
"step": 57050
},
{
"epoch": 16.618304611085236,
"grad_norm": 0.36859115958213806,
"learning_rate": 0.0004008188701223063,
"loss": 3.2766,
"step": 57100
},
{
"epoch": 16.632859804378203,
"grad_norm": 0.37605735659599304,
"learning_rate": 0.0004006441467676179,
"loss": 3.2849,
"step": 57150
},
{
"epoch": 16.64741499767117,
"grad_norm": 0.38148075342178345,
"learning_rate": 0.0004004694234129295,
"loss": 3.2891,
"step": 57200
},
{
"epoch": 16.661970190964137,
"grad_norm": 0.4013352692127228,
"learning_rate": 0.0004002947000582411,
"loss": 3.2706,
"step": 57250
},
{
"epoch": 16.676525384257104,
"grad_norm": 0.33865901827812195,
"learning_rate": 0.00040011997670355267,
"loss": 3.2819,
"step": 57300
},
{
"epoch": 16.69108057755007,
"grad_norm": 0.361101895570755,
"learning_rate": 0.00039994525334886426,
"loss": 3.2909,
"step": 57350
},
{
"epoch": 16.70563577084304,
"grad_norm": 0.3946160674095154,
"learning_rate": 0.00039977052999417585,
"loss": 3.2848,
"step": 57400
},
{
"epoch": 16.720190964136005,
"grad_norm": 0.3926425278186798,
"learning_rate": 0.00039959580663948745,
"loss": 3.2931,
"step": 57450
},
{
"epoch": 16.734746157428972,
"grad_norm": 0.35905447602272034,
"learning_rate": 0.000399421083284799,
"loss": 3.2666,
"step": 57500
},
{
"epoch": 16.749301350721936,
"grad_norm": 0.3647949993610382,
"learning_rate": 0.00039924635993011064,
"loss": 3.2931,
"step": 57550
},
{
"epoch": 16.763856544014903,
"grad_norm": 0.4049411416053772,
"learning_rate": 0.00039907163657542223,
"loss": 3.3049,
"step": 57600
},
{
"epoch": 16.77841173730787,
"grad_norm": 0.38806965947151184,
"learning_rate": 0.0003988969132207338,
"loss": 3.2901,
"step": 57650
},
{
"epoch": 16.792966930600837,
"grad_norm": 0.39097949862480164,
"learning_rate": 0.00039872218986604537,
"loss": 3.2737,
"step": 57700
},
{
"epoch": 16.807522123893804,
"grad_norm": 0.36895087361335754,
"learning_rate": 0.00039854746651135696,
"loss": 3.2756,
"step": 57750
},
{
"epoch": 16.82207731718677,
"grad_norm": 0.35981959104537964,
"learning_rate": 0.0003983727431566686,
"loss": 3.2876,
"step": 57800
},
{
"epoch": 16.83663251047974,
"grad_norm": 0.36862266063690186,
"learning_rate": 0.0003981980198019802,
"loss": 3.2796,
"step": 57850
},
{
"epoch": 16.851187703772705,
"grad_norm": 0.3888986110687256,
"learning_rate": 0.00039802329644729174,
"loss": 3.2916,
"step": 57900
},
{
"epoch": 16.865742897065672,
"grad_norm": 0.3622966706752777,
"learning_rate": 0.00039784857309260334,
"loss": 3.2801,
"step": 57950
},
{
"epoch": 16.88029809035864,
"grad_norm": 0.3616342544555664,
"learning_rate": 0.00039767384973791493,
"loss": 3.2976,
"step": 58000
},
{
"epoch": 16.88029809035864,
"eval_accuracy": 0.3728305781603004,
"eval_loss": 3.5370852947235107,
"eval_runtime": 179.6611,
"eval_samples_per_second": 92.68,
"eval_steps_per_second": 5.794,
"step": 58000
},
{
"epoch": 16.894853283651607,
"grad_norm": 0.37034207582473755,
"learning_rate": 0.00039749912638322647,
"loss": 3.2898,
"step": 58050
},
{
"epoch": 16.909408476944574,
"grad_norm": 0.36878299713134766,
"learning_rate": 0.0003973244030285381,
"loss": 3.2922,
"step": 58100
},
{
"epoch": 16.92396367023754,
"grad_norm": 0.4003487527370453,
"learning_rate": 0.0003971496796738497,
"loss": 3.2961,
"step": 58150
},
{
"epoch": 16.938518863530508,
"grad_norm": 0.37082305550575256,
"learning_rate": 0.0003969749563191613,
"loss": 3.2972,
"step": 58200
},
{
"epoch": 16.953074056823475,
"grad_norm": 0.3500148355960846,
"learning_rate": 0.00039680023296447285,
"loss": 3.2897,
"step": 58250
},
{
"epoch": 16.967629250116442,
"grad_norm": 0.3859942853450775,
"learning_rate": 0.00039662550960978444,
"loss": 3.2784,
"step": 58300
},
{
"epoch": 16.98218444340941,
"grad_norm": 0.3568435609340668,
"learning_rate": 0.00039645078625509604,
"loss": 3.2951,
"step": 58350
},
{
"epoch": 16.996739636702376,
"grad_norm": 0.3656318485736847,
"learning_rate": 0.0003962760629004077,
"loss": 3.297,
"step": 58400
},
{
"epoch": 17.011061946902654,
"grad_norm": 0.3973788917064667,
"learning_rate": 0.0003961013395457193,
"loss": 3.1995,
"step": 58450
},
{
"epoch": 17.02561714019562,
"grad_norm": 0.3912462592124939,
"learning_rate": 0.0003959266161910308,
"loss": 3.1886,
"step": 58500
},
{
"epoch": 17.040172333488588,
"grad_norm": 0.41491520404815674,
"learning_rate": 0.0003957518928363424,
"loss": 3.1952,
"step": 58550
},
{
"epoch": 17.054727526781555,
"grad_norm": 0.3734089136123657,
"learning_rate": 0.000395577169481654,
"loss": 3.2013,
"step": 58600
},
{
"epoch": 17.069282720074522,
"grad_norm": 0.3561474084854126,
"learning_rate": 0.00039540244612696566,
"loss": 3.2074,
"step": 58650
},
{
"epoch": 17.08383791336749,
"grad_norm": 0.35811877250671387,
"learning_rate": 0.0003952277227722772,
"loss": 3.2098,
"step": 58700
},
{
"epoch": 17.098393106660456,
"grad_norm": 0.4044589698314667,
"learning_rate": 0.0003950529994175888,
"loss": 3.1951,
"step": 58750
},
{
"epoch": 17.112948299953423,
"grad_norm": 0.40245088934898376,
"learning_rate": 0.0003948782760629004,
"loss": 3.2112,
"step": 58800
},
{
"epoch": 17.12750349324639,
"grad_norm": 0.40546953678131104,
"learning_rate": 0.0003947035527082119,
"loss": 3.2241,
"step": 58850
},
{
"epoch": 17.142058686539357,
"grad_norm": 0.3817267417907715,
"learning_rate": 0.0003945288293535235,
"loss": 3.2153,
"step": 58900
},
{
"epoch": 17.156613879832324,
"grad_norm": 0.3912336230278015,
"learning_rate": 0.00039435410599883517,
"loss": 3.2246,
"step": 58950
},
{
"epoch": 17.17116907312529,
"grad_norm": 0.36735278367996216,
"learning_rate": 0.00039417938264414676,
"loss": 3.2211,
"step": 59000
},
{
"epoch": 17.17116907312529,
"eval_accuracy": 0.3723176889994795,
"eval_loss": 3.5510292053222656,
"eval_runtime": 179.6357,
"eval_samples_per_second": 92.693,
"eval_steps_per_second": 5.795,
"step": 59000
},
{
"epoch": 17.18572426641826,
"grad_norm": 0.37078139185905457,
"learning_rate": 0.0003940046592894583,
"loss": 3.2257,
"step": 59050
},
{
"epoch": 17.200279459711226,
"grad_norm": 0.3937833607196808,
"learning_rate": 0.0003938299359347699,
"loss": 3.2258,
"step": 59100
},
{
"epoch": 17.214834653004193,
"grad_norm": 0.3787386417388916,
"learning_rate": 0.0003936552125800815,
"loss": 3.2316,
"step": 59150
},
{
"epoch": 17.22938984629716,
"grad_norm": 0.4125954806804657,
"learning_rate": 0.00039348048922539314,
"loss": 3.2169,
"step": 59200
},
{
"epoch": 17.243945039590127,
"grad_norm": 0.3592980206012726,
"learning_rate": 0.0003933057658707047,
"loss": 3.2361,
"step": 59250
},
{
"epoch": 17.258500232883094,
"grad_norm": 0.36557480692863464,
"learning_rate": 0.0003931310425160163,
"loss": 3.2276,
"step": 59300
},
{
"epoch": 17.27305542617606,
"grad_norm": 0.3812400698661804,
"learning_rate": 0.00039295631916132787,
"loss": 3.2349,
"step": 59350
},
{
"epoch": 17.287610619469028,
"grad_norm": 0.37743037939071655,
"learning_rate": 0.00039278159580663946,
"loss": 3.2271,
"step": 59400
},
{
"epoch": 17.302165812761995,
"grad_norm": 0.3797909915447235,
"learning_rate": 0.000392606872451951,
"loss": 3.2436,
"step": 59450
},
{
"epoch": 17.316721006054962,
"grad_norm": 0.3824479281902313,
"learning_rate": 0.00039243214909726265,
"loss": 3.2359,
"step": 59500
},
{
"epoch": 17.331276199347926,
"grad_norm": 0.36629578471183777,
"learning_rate": 0.00039225742574257425,
"loss": 3.2425,
"step": 59550
},
{
"epoch": 17.345831392640893,
"grad_norm": 0.3833477199077606,
"learning_rate": 0.00039208270238788584,
"loss": 3.246,
"step": 59600
},
{
"epoch": 17.36038658593386,
"grad_norm": 0.393760621547699,
"learning_rate": 0.0003919079790331974,
"loss": 3.2417,
"step": 59650
},
{
"epoch": 17.374941779226827,
"grad_norm": 0.40368106961250305,
"learning_rate": 0.000391733255678509,
"loss": 3.2376,
"step": 59700
},
{
"epoch": 17.389496972519794,
"grad_norm": 0.3892704248428345,
"learning_rate": 0.00039155853232382057,
"loss": 3.2509,
"step": 59750
},
{
"epoch": 17.40405216581276,
"grad_norm": 0.3821728229522705,
"learning_rate": 0.0003913838089691322,
"loss": 3.2461,
"step": 59800
},
{
"epoch": 17.418607359105728,
"grad_norm": 0.3901355564594269,
"learning_rate": 0.00039120908561444376,
"loss": 3.2564,
"step": 59850
},
{
"epoch": 17.433162552398695,
"grad_norm": 0.3782642185688019,
"learning_rate": 0.00039103436225975535,
"loss": 3.2574,
"step": 59900
},
{
"epoch": 17.447717745691662,
"grad_norm": 0.37162768840789795,
"learning_rate": 0.00039085963890506695,
"loss": 3.2565,
"step": 59950
},
{
"epoch": 17.46227293898463,
"grad_norm": 0.38288211822509766,
"learning_rate": 0.0003906849155503785,
"loss": 3.265,
"step": 60000
},
{
"epoch": 17.46227293898463,
"eval_accuracy": 0.37270200328817893,
"eval_loss": 3.546358823776245,
"eval_runtime": 179.5231,
"eval_samples_per_second": 92.751,
"eval_steps_per_second": 5.799,
"step": 60000
},
{
"epoch": 17.476828132277596,
"grad_norm": 0.38734108209609985,
"learning_rate": 0.00039051019219569014,
"loss": 3.2502,
"step": 60050
},
{
"epoch": 17.491383325570563,
"grad_norm": 0.38259828090667725,
"learning_rate": 0.00039033546884100173,
"loss": 3.2445,
"step": 60100
},
{
"epoch": 17.50593851886353,
"grad_norm": 0.38961061835289,
"learning_rate": 0.0003901607454863133,
"loss": 3.2673,
"step": 60150
},
{
"epoch": 17.520493712156497,
"grad_norm": 0.34726908802986145,
"learning_rate": 0.00038998602213162486,
"loss": 3.2568,
"step": 60200
},
{
"epoch": 17.535048905449464,
"grad_norm": 0.42472442984580994,
"learning_rate": 0.00038981129877693646,
"loss": 3.2546,
"step": 60250
},
{
"epoch": 17.54960409874243,
"grad_norm": 0.38472869992256165,
"learning_rate": 0.00038963657542224805,
"loss": 3.2622,
"step": 60300
},
{
"epoch": 17.5641592920354,
"grad_norm": 0.4024789333343506,
"learning_rate": 0.0003894618520675597,
"loss": 3.2635,
"step": 60350
},
{
"epoch": 17.578714485328366,
"grad_norm": 0.3788979649543762,
"learning_rate": 0.00038928712871287124,
"loss": 3.252,
"step": 60400
},
{
"epoch": 17.593269678621333,
"grad_norm": 0.39442792534828186,
"learning_rate": 0.00038911240535818284,
"loss": 3.2777,
"step": 60450
},
{
"epoch": 17.6078248719143,
"grad_norm": 0.39352068305015564,
"learning_rate": 0.00038893768200349443,
"loss": 3.2663,
"step": 60500
},
{
"epoch": 17.622380065207267,
"grad_norm": 0.3962717354297638,
"learning_rate": 0.000388762958648806,
"loss": 3.2626,
"step": 60550
},
{
"epoch": 17.636935258500234,
"grad_norm": 0.38364487886428833,
"learning_rate": 0.0003885882352941176,
"loss": 3.2788,
"step": 60600
},
{
"epoch": 17.6514904517932,
"grad_norm": 0.3541100025177002,
"learning_rate": 0.0003884135119394292,
"loss": 3.2587,
"step": 60650
},
{
"epoch": 17.666045645086168,
"grad_norm": 0.3765043318271637,
"learning_rate": 0.0003882387885847408,
"loss": 3.2667,
"step": 60700
},
{
"epoch": 17.680600838379135,
"grad_norm": 0.37509429454803467,
"learning_rate": 0.0003880640652300524,
"loss": 3.2663,
"step": 60750
},
{
"epoch": 17.695156031672102,
"grad_norm": 0.35692986845970154,
"learning_rate": 0.00038788934187536394,
"loss": 3.2728,
"step": 60800
},
{
"epoch": 17.70971122496507,
"grad_norm": 0.34927189350128174,
"learning_rate": 0.00038771461852067554,
"loss": 3.2791,
"step": 60850
},
{
"epoch": 17.724266418258033,
"grad_norm": 0.359375923871994,
"learning_rate": 0.0003875398951659872,
"loss": 3.2762,
"step": 60900
},
{
"epoch": 17.738821611551,
"grad_norm": 0.3698371648788452,
"learning_rate": 0.0003873651718112988,
"loss": 3.2751,
"step": 60950
},
{
"epoch": 17.753376804843967,
"grad_norm": 0.39195096492767334,
"learning_rate": 0.0003871904484566103,
"loss": 3.2771,
"step": 61000
},
{
"epoch": 17.753376804843967,
"eval_accuracy": 0.37315612879629356,
"eval_loss": 3.535740375518799,
"eval_runtime": 179.6914,
"eval_samples_per_second": 92.664,
"eval_steps_per_second": 5.793,
"step": 61000
},
{
"epoch": 17.767931998136934,
"grad_norm": 0.34754666686058044,
"learning_rate": 0.0003870157251019219,
"loss": 3.2756,
"step": 61050
},
{
"epoch": 17.7824871914299,
"grad_norm": 0.40687552094459534,
"learning_rate": 0.0003868410017472335,
"loss": 3.2797,
"step": 61100
},
{
"epoch": 17.797042384722868,
"grad_norm": 0.3617575764656067,
"learning_rate": 0.00038666627839254505,
"loss": 3.2713,
"step": 61150
},
{
"epoch": 17.811597578015835,
"grad_norm": 0.36929965019226074,
"learning_rate": 0.0003864915550378567,
"loss": 3.2807,
"step": 61200
},
{
"epoch": 17.826152771308802,
"grad_norm": 0.36952269077301025,
"learning_rate": 0.0003863168316831683,
"loss": 3.285,
"step": 61250
},
{
"epoch": 17.84070796460177,
"grad_norm": 0.35455524921417236,
"learning_rate": 0.0003861421083284799,
"loss": 3.2878,
"step": 61300
},
{
"epoch": 17.855263157894736,
"grad_norm": 0.4048665463924408,
"learning_rate": 0.0003859673849737914,
"loss": 3.2765,
"step": 61350
},
{
"epoch": 17.869818351187703,
"grad_norm": 0.3863668441772461,
"learning_rate": 0.000385792661619103,
"loss": 3.2885,
"step": 61400
},
{
"epoch": 17.88437354448067,
"grad_norm": 0.38117608428001404,
"learning_rate": 0.00038561793826441467,
"loss": 3.2785,
"step": 61450
},
{
"epoch": 17.898928737773637,
"grad_norm": 0.3930187225341797,
"learning_rate": 0.00038544321490972626,
"loss": 3.291,
"step": 61500
},
{
"epoch": 17.913483931066605,
"grad_norm": 0.361481636762619,
"learning_rate": 0.0003852684915550378,
"loss": 3.2738,
"step": 61550
},
{
"epoch": 17.92803912435957,
"grad_norm": 0.36943328380584717,
"learning_rate": 0.0003850937682003494,
"loss": 3.2778,
"step": 61600
},
{
"epoch": 17.94259431765254,
"grad_norm": 0.3720901608467102,
"learning_rate": 0.000384919044845661,
"loss": 3.2895,
"step": 61650
},
{
"epoch": 17.957149510945506,
"grad_norm": 0.37087762355804443,
"learning_rate": 0.0003847443214909726,
"loss": 3.2686,
"step": 61700
},
{
"epoch": 17.971704704238473,
"grad_norm": 0.4020977020263672,
"learning_rate": 0.00038456959813628423,
"loss": 3.2813,
"step": 61750
},
{
"epoch": 17.98625989753144,
"grad_norm": 0.38199225068092346,
"learning_rate": 0.0003843948747815958,
"loss": 3.2921,
"step": 61800
},
{
"epoch": 18.000582207731718,
"grad_norm": 0.4147983491420746,
"learning_rate": 0.00038422015142690737,
"loss": 3.287,
"step": 61850
},
{
"epoch": 18.015137401024685,
"grad_norm": 0.35553550720214844,
"learning_rate": 0.00038404542807221896,
"loss": 3.1681,
"step": 61900
},
{
"epoch": 18.029692594317652,
"grad_norm": 0.38814523816108704,
"learning_rate": 0.0003838707047175305,
"loss": 3.1671,
"step": 61950
},
{
"epoch": 18.04424778761062,
"grad_norm": 0.36407405138015747,
"learning_rate": 0.00038369598136284215,
"loss": 3.1751,
"step": 62000
},
{
"epoch": 18.04424778761062,
"eval_accuracy": 0.37241570677219366,
"eval_loss": 3.549715518951416,
"eval_runtime": 179.8427,
"eval_samples_per_second": 92.586,
"eval_steps_per_second": 5.788,
"step": 62000
},
{
"epoch": 18.058802980903586,
"grad_norm": 0.35381069779396057,
"learning_rate": 0.00038352125800815374,
"loss": 3.1806,
"step": 62050
},
{
"epoch": 18.073358174196553,
"grad_norm": 0.38974687457084656,
"learning_rate": 0.00038334653465346534,
"loss": 3.188,
"step": 62100
},
{
"epoch": 18.08791336748952,
"grad_norm": 0.39390093088150024,
"learning_rate": 0.0003831718112987769,
"loss": 3.1949,
"step": 62150
},
{
"epoch": 18.102468560782487,
"grad_norm": 0.38337382674217224,
"learning_rate": 0.0003829970879440885,
"loss": 3.1987,
"step": 62200
},
{
"epoch": 18.117023754075454,
"grad_norm": 0.39741766452789307,
"learning_rate": 0.00038282236458940007,
"loss": 3.214,
"step": 62250
},
{
"epoch": 18.13157894736842,
"grad_norm": 0.36907315254211426,
"learning_rate": 0.0003826476412347117,
"loss": 3.1865,
"step": 62300
},
{
"epoch": 18.14613414066139,
"grad_norm": 0.41438016295433044,
"learning_rate": 0.00038247291788002326,
"loss": 3.2109,
"step": 62350
},
{
"epoch": 18.160689333954355,
"grad_norm": 0.3838878273963928,
"learning_rate": 0.00038229819452533485,
"loss": 3.2182,
"step": 62400
},
{
"epoch": 18.175244527247322,
"grad_norm": 0.3881027400493622,
"learning_rate": 0.00038212347117064644,
"loss": 3.2179,
"step": 62450
},
{
"epoch": 18.18979972054029,
"grad_norm": 0.3932623267173767,
"learning_rate": 0.000381948747815958,
"loss": 3.2089,
"step": 62500
},
{
"epoch": 18.204354913833257,
"grad_norm": 0.3784829378128052,
"learning_rate": 0.0003817740244612696,
"loss": 3.2142,
"step": 62550
},
{
"epoch": 18.218910107126224,
"grad_norm": 0.3763687014579773,
"learning_rate": 0.00038159930110658123,
"loss": 3.2179,
"step": 62600
},
{
"epoch": 18.23346530041919,
"grad_norm": 0.39583879709243774,
"learning_rate": 0.0003814245777518928,
"loss": 3.2143,
"step": 62650
},
{
"epoch": 18.248020493712158,
"grad_norm": 0.38479939103126526,
"learning_rate": 0.0003812498543972044,
"loss": 3.2218,
"step": 62700
},
{
"epoch": 18.262575687005125,
"grad_norm": 0.363050639629364,
"learning_rate": 0.00038107513104251596,
"loss": 3.2286,
"step": 62750
},
{
"epoch": 18.277130880298092,
"grad_norm": 0.3779996931552887,
"learning_rate": 0.00038090040768782755,
"loss": 3.2342,
"step": 62800
},
{
"epoch": 18.29168607359106,
"grad_norm": 0.39983800053596497,
"learning_rate": 0.0003807256843331392,
"loss": 3.2353,
"step": 62850
},
{
"epoch": 18.306241266884022,
"grad_norm": 0.37936192750930786,
"learning_rate": 0.0003805509609784508,
"loss": 3.2282,
"step": 62900
},
{
"epoch": 18.32079646017699,
"grad_norm": 0.38810208439826965,
"learning_rate": 0.00038037623762376233,
"loss": 3.2334,
"step": 62950
},
{
"epoch": 18.335351653469957,
"grad_norm": 0.40988579392433167,
"learning_rate": 0.00038020151426907393,
"loss": 3.2348,
"step": 63000
},
{
"epoch": 18.335351653469957,
"eval_accuracy": 0.3728442113277283,
"eval_loss": 3.550168752670288,
"eval_runtime": 179.6745,
"eval_samples_per_second": 92.673,
"eval_steps_per_second": 5.794,
"step": 63000
},
{
"epoch": 18.349906846762924,
"grad_norm": 0.3667028844356537,
"learning_rate": 0.0003800267909143855,
"loss": 3.23,
"step": 63050
},
{
"epoch": 18.36446204005589,
"grad_norm": 0.4316708743572235,
"learning_rate": 0.00037985206755969706,
"loss": 3.2315,
"step": 63100
},
{
"epoch": 18.379017233348858,
"grad_norm": 0.3851536214351654,
"learning_rate": 0.0003796773442050087,
"loss": 3.2306,
"step": 63150
},
{
"epoch": 18.393572426641825,
"grad_norm": 0.3705301284790039,
"learning_rate": 0.0003795026208503203,
"loss": 3.2534,
"step": 63200
},
{
"epoch": 18.408127619934792,
"grad_norm": 0.4015274941921234,
"learning_rate": 0.0003793278974956319,
"loss": 3.2605,
"step": 63250
},
{
"epoch": 18.42268281322776,
"grad_norm": 0.40232664346694946,
"learning_rate": 0.00037915317414094344,
"loss": 3.2504,
"step": 63300
},
{
"epoch": 18.437238006520726,
"grad_norm": 0.4108823835849762,
"learning_rate": 0.00037897845078625503,
"loss": 3.2411,
"step": 63350
},
{
"epoch": 18.451793199813693,
"grad_norm": 0.3592471778392792,
"learning_rate": 0.0003788037274315667,
"loss": 3.2475,
"step": 63400
},
{
"epoch": 18.46634839310666,
"grad_norm": 0.385998010635376,
"learning_rate": 0.0003786290040768783,
"loss": 3.2484,
"step": 63450
},
{
"epoch": 18.480903586399627,
"grad_norm": 0.39755505323410034,
"learning_rate": 0.0003784542807221898,
"loss": 3.2334,
"step": 63500
},
{
"epoch": 18.495458779692594,
"grad_norm": 0.38933873176574707,
"learning_rate": 0.0003782795573675014,
"loss": 3.2383,
"step": 63550
},
{
"epoch": 18.51001397298556,
"grad_norm": 0.36488500237464905,
"learning_rate": 0.000378104834012813,
"loss": 3.2655,
"step": 63600
},
{
"epoch": 18.52456916627853,
"grad_norm": 0.3819720447063446,
"learning_rate": 0.0003779301106581246,
"loss": 3.2466,
"step": 63650
},
{
"epoch": 18.539124359571495,
"grad_norm": 0.38391003012657166,
"learning_rate": 0.0003777553873034362,
"loss": 3.2557,
"step": 63700
},
{
"epoch": 18.553679552864462,
"grad_norm": 0.3939170837402344,
"learning_rate": 0.0003775806639487478,
"loss": 3.2513,
"step": 63750
},
{
"epoch": 18.56823474615743,
"grad_norm": 0.39664459228515625,
"learning_rate": 0.0003774059405940594,
"loss": 3.2397,
"step": 63800
},
{
"epoch": 18.582789939450397,
"grad_norm": 0.3669049143791199,
"learning_rate": 0.000377231217239371,
"loss": 3.243,
"step": 63850
},
{
"epoch": 18.597345132743364,
"grad_norm": 0.4121682047843933,
"learning_rate": 0.0003770564938846825,
"loss": 3.2451,
"step": 63900
},
{
"epoch": 18.61190032603633,
"grad_norm": 0.40818139910697937,
"learning_rate": 0.00037688177052999416,
"loss": 3.2538,
"step": 63950
},
{
"epoch": 18.626455519329298,
"grad_norm": 0.40750160813331604,
"learning_rate": 0.00037670704717530576,
"loss": 3.2587,
"step": 64000
},
{
"epoch": 18.626455519329298,
"eval_accuracy": 0.3730311972706399,
"eval_loss": 3.5414130687713623,
"eval_runtime": 179.6839,
"eval_samples_per_second": 92.668,
"eval_steps_per_second": 5.794,
"step": 64000
},
{
"epoch": 18.641010712622265,
"grad_norm": 0.39473462104797363,
"learning_rate": 0.00037653232382061735,
"loss": 3.2561,
"step": 64050
},
{
"epoch": 18.655565905915232,
"grad_norm": 0.36568084359169006,
"learning_rate": 0.0003763576004659289,
"loss": 3.2541,
"step": 64100
},
{
"epoch": 18.6701210992082,
"grad_norm": 0.39155519008636475,
"learning_rate": 0.0003761828771112405,
"loss": 3.2566,
"step": 64150
},
{
"epoch": 18.684676292501166,
"grad_norm": 0.39551350474357605,
"learning_rate": 0.0003760081537565521,
"loss": 3.2546,
"step": 64200
},
{
"epoch": 18.69923148579413,
"grad_norm": 0.4288475811481476,
"learning_rate": 0.00037583343040186373,
"loss": 3.249,
"step": 64250
},
{
"epoch": 18.713786679087097,
"grad_norm": 0.386665403842926,
"learning_rate": 0.00037565870704717527,
"loss": 3.2676,
"step": 64300
},
{
"epoch": 18.728341872380064,
"grad_norm": 0.37789034843444824,
"learning_rate": 0.00037548398369248687,
"loss": 3.2622,
"step": 64350
},
{
"epoch": 18.74289706567303,
"grad_norm": 0.3938154876232147,
"learning_rate": 0.00037530926033779846,
"loss": 3.2729,
"step": 64400
},
{
"epoch": 18.757452258965998,
"grad_norm": 0.36355528235435486,
"learning_rate": 0.00037513453698311,
"loss": 3.268,
"step": 64450
},
{
"epoch": 18.772007452258965,
"grad_norm": 0.416415810585022,
"learning_rate": 0.0003749598136284216,
"loss": 3.2616,
"step": 64500
},
{
"epoch": 18.786562645551932,
"grad_norm": 0.4128302335739136,
"learning_rate": 0.00037478509027373324,
"loss": 3.2657,
"step": 64550
},
{
"epoch": 18.8011178388449,
"grad_norm": 0.37560078501701355,
"learning_rate": 0.00037461036691904484,
"loss": 3.2485,
"step": 64600
},
{
"epoch": 18.815673032137866,
"grad_norm": 0.40694931149482727,
"learning_rate": 0.0003744356435643564,
"loss": 3.2506,
"step": 64650
},
{
"epoch": 18.830228225430833,
"grad_norm": 0.3949992060661316,
"learning_rate": 0.00037426092020966797,
"loss": 3.2647,
"step": 64700
},
{
"epoch": 18.8447834187238,
"grad_norm": 0.38129428029060364,
"learning_rate": 0.00037408619685497957,
"loss": 3.2756,
"step": 64750
},
{
"epoch": 18.859338612016767,
"grad_norm": 0.3728366494178772,
"learning_rate": 0.0003739114735002912,
"loss": 3.2647,
"step": 64800
},
{
"epoch": 18.873893805309734,
"grad_norm": 0.40122315287590027,
"learning_rate": 0.0003737367501456028,
"loss": 3.2654,
"step": 64850
},
{
"epoch": 18.8884489986027,
"grad_norm": 0.37444445490837097,
"learning_rate": 0.00037356202679091435,
"loss": 3.277,
"step": 64900
},
{
"epoch": 18.90300419189567,
"grad_norm": 0.3745873272418976,
"learning_rate": 0.00037338730343622594,
"loss": 3.2704,
"step": 64950
},
{
"epoch": 18.917559385188635,
"grad_norm": 0.38452088832855225,
"learning_rate": 0.00037321258008153754,
"loss": 3.2776,
"step": 65000
},
{
"epoch": 18.917559385188635,
"eval_accuracy": 0.37377173682204523,
"eval_loss": 3.5317463874816895,
"eval_runtime": 179.6892,
"eval_samples_per_second": 92.666,
"eval_steps_per_second": 5.793,
"step": 65000
},
{
"epoch": 18.932114578481603,
"grad_norm": 0.40762460231781006,
"learning_rate": 0.0003730378567268491,
"loss": 3.2639,
"step": 65050
},
{
"epoch": 18.94666977177457,
"grad_norm": 0.42345935106277466,
"learning_rate": 0.0003728631333721607,
"loss": 3.2639,
"step": 65100
},
{
"epoch": 18.961224965067537,
"grad_norm": 0.3937210738658905,
"learning_rate": 0.0003726884100174723,
"loss": 3.2795,
"step": 65150
},
{
"epoch": 18.975780158360504,
"grad_norm": 0.38212597370147705,
"learning_rate": 0.0003725136866627839,
"loss": 3.277,
"step": 65200
},
{
"epoch": 18.99033535165347,
"grad_norm": 0.4089849889278412,
"learning_rate": 0.00037233896330809545,
"loss": 3.2783,
"step": 65250
},
{
"epoch": 19.00465766185375,
"grad_norm": 0.3767828643321991,
"learning_rate": 0.00037216423995340705,
"loss": 3.2387,
"step": 65300
},
{
"epoch": 19.019212855146716,
"grad_norm": 0.39858773350715637,
"learning_rate": 0.0003719895165987187,
"loss": 3.1656,
"step": 65350
},
{
"epoch": 19.033768048439683,
"grad_norm": 0.40623700618743896,
"learning_rate": 0.0003718147932440303,
"loss": 3.1762,
"step": 65400
},
{
"epoch": 19.04832324173265,
"grad_norm": 0.40791580080986023,
"learning_rate": 0.00037164006988934183,
"loss": 3.1803,
"step": 65450
},
{
"epoch": 19.062878435025617,
"grad_norm": 0.35862547159194946,
"learning_rate": 0.0003714653465346534,
"loss": 3.174,
"step": 65500
},
{
"epoch": 19.077433628318584,
"grad_norm": 0.38739070296287537,
"learning_rate": 0.000371290623179965,
"loss": 3.1929,
"step": 65550
},
{
"epoch": 19.09198882161155,
"grad_norm": 0.4318332076072693,
"learning_rate": 0.00037111589982527656,
"loss": 3.183,
"step": 65600
},
{
"epoch": 19.106544014904518,
"grad_norm": 0.4079286754131317,
"learning_rate": 0.0003709411764705882,
"loss": 3.1921,
"step": 65650
},
{
"epoch": 19.121099208197485,
"grad_norm": 0.3845118582248688,
"learning_rate": 0.0003707664531158998,
"loss": 3.191,
"step": 65700
},
{
"epoch": 19.135654401490452,
"grad_norm": 0.38291704654693604,
"learning_rate": 0.0003705917297612114,
"loss": 3.1975,
"step": 65750
},
{
"epoch": 19.15020959478342,
"grad_norm": 0.3601689338684082,
"learning_rate": 0.000370417006406523,
"loss": 3.201,
"step": 65800
},
{
"epoch": 19.164764788076386,
"grad_norm": 0.3908986747264862,
"learning_rate": 0.00037024228305183453,
"loss": 3.1925,
"step": 65850
},
{
"epoch": 19.179319981369353,
"grad_norm": 0.36919277906417847,
"learning_rate": 0.0003700675596971461,
"loss": 3.1987,
"step": 65900
},
{
"epoch": 19.19387517466232,
"grad_norm": 0.4059826135635376,
"learning_rate": 0.0003698928363424578,
"loss": 3.1942,
"step": 65950
},
{
"epoch": 19.208430367955287,
"grad_norm": 0.43888577818870544,
"learning_rate": 0.00036971811298776937,
"loss": 3.1939,
"step": 66000
},
{
"epoch": 19.208430367955287,
"eval_accuracy": 0.3727541854117822,
"eval_loss": 3.551252841949463,
"eval_runtime": 179.6276,
"eval_samples_per_second": 92.697,
"eval_steps_per_second": 5.795,
"step": 66000
},
{
"epoch": 19.222985561248255,
"grad_norm": 0.4140142500400543,
"learning_rate": 0.0003695433896330809,
"loss": 3.2038,
"step": 66050
},
{
"epoch": 19.23754075454122,
"grad_norm": 0.40382882952690125,
"learning_rate": 0.0003693686662783925,
"loss": 3.2215,
"step": 66100
},
{
"epoch": 19.25209594783419,
"grad_norm": 0.39210304617881775,
"learning_rate": 0.0003691939429237041,
"loss": 3.2032,
"step": 66150
},
{
"epoch": 19.266651141127156,
"grad_norm": 0.4092456102371216,
"learning_rate": 0.00036901921956901575,
"loss": 3.2133,
"step": 66200
},
{
"epoch": 19.281206334420123,
"grad_norm": 0.38151684403419495,
"learning_rate": 0.0003688444962143273,
"loss": 3.2171,
"step": 66250
},
{
"epoch": 19.29576152771309,
"grad_norm": 0.36478516459465027,
"learning_rate": 0.0003686697728596389,
"loss": 3.2124,
"step": 66300
},
{
"epoch": 19.310316721006053,
"grad_norm": 0.41321274638175964,
"learning_rate": 0.0003684950495049505,
"loss": 3.2126,
"step": 66350
},
{
"epoch": 19.32487191429902,
"grad_norm": 0.384111225605011,
"learning_rate": 0.000368320326150262,
"loss": 3.2266,
"step": 66400
},
{
"epoch": 19.339427107591987,
"grad_norm": 0.37913626432418823,
"learning_rate": 0.0003681456027955736,
"loss": 3.2197,
"step": 66450
},
{
"epoch": 19.353982300884955,
"grad_norm": 0.4188009798526764,
"learning_rate": 0.00036797087944088526,
"loss": 3.2158,
"step": 66500
},
{
"epoch": 19.36853749417792,
"grad_norm": 0.39769816398620605,
"learning_rate": 0.00036779615608619685,
"loss": 3.2273,
"step": 66550
},
{
"epoch": 19.38309268747089,
"grad_norm": 0.3958772122859955,
"learning_rate": 0.0003676214327315084,
"loss": 3.2329,
"step": 66600
},
{
"epoch": 19.397647880763856,
"grad_norm": 0.3825916051864624,
"learning_rate": 0.00036744670937682,
"loss": 3.2261,
"step": 66650
},
{
"epoch": 19.412203074056823,
"grad_norm": 0.3826241195201874,
"learning_rate": 0.0003672719860221316,
"loss": 3.2166,
"step": 66700
},
{
"epoch": 19.42675826734979,
"grad_norm": 0.38634222745895386,
"learning_rate": 0.00036709726266744323,
"loss": 3.2484,
"step": 66750
},
{
"epoch": 19.441313460642757,
"grad_norm": 0.3904113471508026,
"learning_rate": 0.00036692253931275477,
"loss": 3.2139,
"step": 66800
},
{
"epoch": 19.455868653935724,
"grad_norm": 0.3914354741573334,
"learning_rate": 0.00036674781595806636,
"loss": 3.2258,
"step": 66850
},
{
"epoch": 19.47042384722869,
"grad_norm": 0.38262632489204407,
"learning_rate": 0.00036657309260337796,
"loss": 3.2338,
"step": 66900
},
{
"epoch": 19.484979040521658,
"grad_norm": 0.40577489137649536,
"learning_rate": 0.00036639836924868955,
"loss": 3.2281,
"step": 66950
},
{
"epoch": 19.499534233814625,
"grad_norm": 0.39221397042274475,
"learning_rate": 0.0003662236458940011,
"loss": 3.2337,
"step": 67000
},
{
"epoch": 19.499534233814625,
"eval_accuracy": 0.37309760019819804,
"eval_loss": 3.542379140853882,
"eval_runtime": 179.58,
"eval_samples_per_second": 92.722,
"eval_steps_per_second": 5.797,
"step": 67000
},
{
"epoch": 19.514089427107592,
"grad_norm": 0.3849264681339264,
"learning_rate": 0.00036604892253931274,
"loss": 3.2324,
"step": 67050
},
{
"epoch": 19.52864462040056,
"grad_norm": 0.41369107365608215,
"learning_rate": 0.00036587419918462433,
"loss": 3.2369,
"step": 67100
},
{
"epoch": 19.543199813693526,
"grad_norm": 0.4056357443332672,
"learning_rate": 0.00036569947582993593,
"loss": 3.2423,
"step": 67150
},
{
"epoch": 19.557755006986493,
"grad_norm": 0.3950081467628479,
"learning_rate": 0.00036552475247524747,
"loss": 3.2423,
"step": 67200
},
{
"epoch": 19.57231020027946,
"grad_norm": 0.3796003460884094,
"learning_rate": 0.00036535002912055906,
"loss": 3.2345,
"step": 67250
},
{
"epoch": 19.586865393572428,
"grad_norm": 0.37646934390068054,
"learning_rate": 0.00036517530576587066,
"loss": 3.2447,
"step": 67300
},
{
"epoch": 19.601420586865395,
"grad_norm": 0.4380941689014435,
"learning_rate": 0.0003650005824111823,
"loss": 3.2474,
"step": 67350
},
{
"epoch": 19.61597578015836,
"grad_norm": 0.37116000056266785,
"learning_rate": 0.00036482585905649385,
"loss": 3.2474,
"step": 67400
},
{
"epoch": 19.63053097345133,
"grad_norm": 0.37322819232940674,
"learning_rate": 0.00036465113570180544,
"loss": 3.2529,
"step": 67450
},
{
"epoch": 19.645086166744296,
"grad_norm": 0.3834007680416107,
"learning_rate": 0.00036447641234711703,
"loss": 3.2464,
"step": 67500
},
{
"epoch": 19.659641360037263,
"grad_norm": 0.3989749550819397,
"learning_rate": 0.0003643016889924286,
"loss": 3.2585,
"step": 67550
},
{
"epoch": 19.67419655333023,
"grad_norm": 0.3850140869617462,
"learning_rate": 0.0003641269656377402,
"loss": 3.2513,
"step": 67600
},
{
"epoch": 19.688751746623197,
"grad_norm": 0.41245153546333313,
"learning_rate": 0.0003639522422830518,
"loss": 3.2449,
"step": 67650
},
{
"epoch": 19.70330693991616,
"grad_norm": 0.41811299324035645,
"learning_rate": 0.0003637775189283634,
"loss": 3.2421,
"step": 67700
},
{
"epoch": 19.717862133209128,
"grad_norm": 0.40594467520713806,
"learning_rate": 0.00036360279557367495,
"loss": 3.2448,
"step": 67750
},
{
"epoch": 19.732417326502095,
"grad_norm": 0.38768884539604187,
"learning_rate": 0.00036342807221898655,
"loss": 3.2512,
"step": 67800
},
{
"epoch": 19.74697251979506,
"grad_norm": 0.40855497121810913,
"learning_rate": 0.00036325334886429814,
"loss": 3.2382,
"step": 67850
},
{
"epoch": 19.76152771308803,
"grad_norm": 0.39450258016586304,
"learning_rate": 0.0003630786255096098,
"loss": 3.2425,
"step": 67900
},
{
"epoch": 19.776082906380996,
"grad_norm": 0.39796581864356995,
"learning_rate": 0.00036290390215492133,
"loss": 3.2672,
"step": 67950
},
{
"epoch": 19.790638099673963,
"grad_norm": 0.43509823083877563,
"learning_rate": 0.0003627291788002329,
"loss": 3.2667,
"step": 68000
},
{
"epoch": 19.790638099673963,
"eval_accuracy": 0.37376562540216374,
"eval_loss": 3.536363363265991,
"eval_runtime": 179.6113,
"eval_samples_per_second": 92.706,
"eval_steps_per_second": 5.796,
"step": 68000
},
{
"epoch": 19.80519329296693,
"grad_norm": 0.4296112060546875,
"learning_rate": 0.0003625544554455445,
"loss": 3.2532,
"step": 68050
},
{
"epoch": 19.819748486259897,
"grad_norm": 0.39242368936538696,
"learning_rate": 0.0003623797320908561,
"loss": 3.2546,
"step": 68100
},
{
"epoch": 19.834303679552864,
"grad_norm": 0.37693583965301514,
"learning_rate": 0.00036220500873616776,
"loss": 3.2589,
"step": 68150
},
{
"epoch": 19.84885887284583,
"grad_norm": 0.36907529830932617,
"learning_rate": 0.0003620302853814793,
"loss": 3.2491,
"step": 68200
},
{
"epoch": 19.863414066138798,
"grad_norm": 0.394565224647522,
"learning_rate": 0.0003618555620267909,
"loss": 3.2555,
"step": 68250
},
{
"epoch": 19.877969259431765,
"grad_norm": 0.3643400967121124,
"learning_rate": 0.0003616808386721025,
"loss": 3.2494,
"step": 68300
},
{
"epoch": 19.892524452724732,
"grad_norm": 0.3661266267299652,
"learning_rate": 0.00036150611531741403,
"loss": 3.2576,
"step": 68350
},
{
"epoch": 19.9070796460177,
"grad_norm": 0.4081973433494568,
"learning_rate": 0.0003613313919627256,
"loss": 3.2615,
"step": 68400
},
{
"epoch": 19.921634839310666,
"grad_norm": 0.363898366689682,
"learning_rate": 0.00036115666860803727,
"loss": 3.258,
"step": 68450
},
{
"epoch": 19.936190032603633,
"grad_norm": 0.3852180540561676,
"learning_rate": 0.00036098194525334887,
"loss": 3.2626,
"step": 68500
},
{
"epoch": 19.9507452258966,
"grad_norm": 0.383724182844162,
"learning_rate": 0.0003608072218986604,
"loss": 3.2609,
"step": 68550
},
{
"epoch": 19.965300419189568,
"grad_norm": 0.41124436259269714,
"learning_rate": 0.000360632498543972,
"loss": 3.2613,
"step": 68600
},
{
"epoch": 19.979855612482535,
"grad_norm": 0.3976960778236389,
"learning_rate": 0.0003604577751892836,
"loss": 3.2653,
"step": 68650
},
{
"epoch": 19.9944108057755,
"grad_norm": 0.38881340622901917,
"learning_rate": 0.00036028305183459513,
"loss": 3.2707,
"step": 68700
},
{
"epoch": 20.00873311597578,
"grad_norm": 0.40865153074264526,
"learning_rate": 0.0003601083284799068,
"loss": 3.201,
"step": 68750
},
{
"epoch": 20.023288309268747,
"grad_norm": 0.40403062105178833,
"learning_rate": 0.0003599336051252184,
"loss": 3.1603,
"step": 68800
},
{
"epoch": 20.037843502561714,
"grad_norm": 0.4033607840538025,
"learning_rate": 0.00035975888177052997,
"loss": 3.1707,
"step": 68850
},
{
"epoch": 20.05239869585468,
"grad_norm": 0.3669825792312622,
"learning_rate": 0.0003595841584158415,
"loss": 3.1716,
"step": 68900
},
{
"epoch": 20.066953889147648,
"grad_norm": 0.39947494864463806,
"learning_rate": 0.0003594094350611531,
"loss": 3.1686,
"step": 68950
},
{
"epoch": 20.081509082440615,
"grad_norm": 0.3739069998264313,
"learning_rate": 0.00035923471170646475,
"loss": 3.1624,
"step": 69000
},
{
"epoch": 20.081509082440615,
"eval_accuracy": 0.37318891891450373,
"eval_loss": 3.548868179321289,
"eval_runtime": 179.9292,
"eval_samples_per_second": 92.542,
"eval_steps_per_second": 5.786,
"step": 69000
},
{
"epoch": 20.096064275733582,
"grad_norm": 0.4172305762767792,
"learning_rate": 0.00035905998835177635,
"loss": 3.1698,
"step": 69050
},
{
"epoch": 20.11061946902655,
"grad_norm": 0.42960259318351746,
"learning_rate": 0.00035888526499708794,
"loss": 3.1756,
"step": 69100
},
{
"epoch": 20.125174662319516,
"grad_norm": 0.3955884575843811,
"learning_rate": 0.0003587105416423995,
"loss": 3.1886,
"step": 69150
},
{
"epoch": 20.139729855612483,
"grad_norm": 0.40036845207214355,
"learning_rate": 0.0003585358182877111,
"loss": 3.1768,
"step": 69200
},
{
"epoch": 20.15428504890545,
"grad_norm": 0.41901588439941406,
"learning_rate": 0.00035836109493302267,
"loss": 3.1899,
"step": 69250
},
{
"epoch": 20.168840242198417,
"grad_norm": 0.3858005106449127,
"learning_rate": 0.0003581863715783343,
"loss": 3.1951,
"step": 69300
},
{
"epoch": 20.183395435491384,
"grad_norm": 0.37416332960128784,
"learning_rate": 0.00035801164822364586,
"loss": 3.1942,
"step": 69350
},
{
"epoch": 20.19795062878435,
"grad_norm": 0.40615788102149963,
"learning_rate": 0.00035783692486895745,
"loss": 3.185,
"step": 69400
},
{
"epoch": 20.21250582207732,
"grad_norm": 0.3919961154460907,
"learning_rate": 0.00035766220151426905,
"loss": 3.1983,
"step": 69450
},
{
"epoch": 20.227061015370285,
"grad_norm": 0.42501819133758545,
"learning_rate": 0.0003574874781595806,
"loss": 3.1967,
"step": 69500
},
{
"epoch": 20.241616208663253,
"grad_norm": 0.3887561857700348,
"learning_rate": 0.00035731275480489224,
"loss": 3.198,
"step": 69550
},
{
"epoch": 20.25617140195622,
"grad_norm": 0.4249391257762909,
"learning_rate": 0.00035713803145020383,
"loss": 3.1956,
"step": 69600
},
{
"epoch": 20.270726595249187,
"grad_norm": 0.4223974049091339,
"learning_rate": 0.0003569633080955154,
"loss": 3.1909,
"step": 69650
},
{
"epoch": 20.28528178854215,
"grad_norm": 0.4471769630908966,
"learning_rate": 0.00035678858474082697,
"loss": 3.2013,
"step": 69700
},
{
"epoch": 20.299836981835117,
"grad_norm": 0.3672872483730316,
"learning_rate": 0.00035661386138613856,
"loss": 3.1967,
"step": 69750
},
{
"epoch": 20.314392175128084,
"grad_norm": 0.40826860070228577,
"learning_rate": 0.00035643913803145015,
"loss": 3.1972,
"step": 69800
},
{
"epoch": 20.32894736842105,
"grad_norm": 0.39870554208755493,
"learning_rate": 0.0003562644146767618,
"loss": 3.2159,
"step": 69850
},
{
"epoch": 20.34350256171402,
"grad_norm": 0.37478652596473694,
"learning_rate": 0.00035608969132207334,
"loss": 3.2043,
"step": 69900
},
{
"epoch": 20.358057755006985,
"grad_norm": 0.39846566319465637,
"learning_rate": 0.00035591496796738494,
"loss": 3.2099,
"step": 69950
},
{
"epoch": 20.372612948299953,
"grad_norm": 0.39627769589424133,
"learning_rate": 0.00035574024461269653,
"loss": 3.2091,
"step": 70000
},
{
"epoch": 20.372612948299953,
"eval_accuracy": 0.3736645519195088,
"eval_loss": 3.5438168048858643,
"eval_runtime": 179.754,
"eval_samples_per_second": 92.632,
"eval_steps_per_second": 5.791,
"step": 70000
},
{
"epoch": 20.38716814159292,
"grad_norm": 0.4286830723285675,
"learning_rate": 0.0003555655212580081,
"loss": 3.2232,
"step": 70050
},
{
"epoch": 20.401723334885887,
"grad_norm": 0.3937196731567383,
"learning_rate": 0.00035539079790331967,
"loss": 3.2197,
"step": 70100
},
{
"epoch": 20.416278528178854,
"grad_norm": 0.39053255319595337,
"learning_rate": 0.0003552160745486313,
"loss": 3.2137,
"step": 70150
},
{
"epoch": 20.43083372147182,
"grad_norm": 0.419683039188385,
"learning_rate": 0.0003550413511939429,
"loss": 3.2258,
"step": 70200
},
{
"epoch": 20.445388914764788,
"grad_norm": 0.37828585505485535,
"learning_rate": 0.0003548666278392545,
"loss": 3.2279,
"step": 70250
},
{
"epoch": 20.459944108057755,
"grad_norm": 0.4096558094024658,
"learning_rate": 0.00035469190448456604,
"loss": 3.2081,
"step": 70300
},
{
"epoch": 20.474499301350722,
"grad_norm": 0.41085392236709595,
"learning_rate": 0.00035451718112987764,
"loss": 3.2231,
"step": 70350
},
{
"epoch": 20.48905449464369,
"grad_norm": 0.39229652285575867,
"learning_rate": 0.0003543424577751893,
"loss": 3.2201,
"step": 70400
},
{
"epoch": 20.503609687936656,
"grad_norm": 0.40823259949684143,
"learning_rate": 0.0003541677344205009,
"loss": 3.2291,
"step": 70450
},
{
"epoch": 20.518164881229623,
"grad_norm": 0.3808740973472595,
"learning_rate": 0.0003539930110658124,
"loss": 3.2391,
"step": 70500
},
{
"epoch": 20.53272007452259,
"grad_norm": 0.4064353108406067,
"learning_rate": 0.000353818287711124,
"loss": 3.2238,
"step": 70550
},
{
"epoch": 20.547275267815557,
"grad_norm": 0.40563496947288513,
"learning_rate": 0.0003536435643564356,
"loss": 3.2274,
"step": 70600
},
{
"epoch": 20.561830461108524,
"grad_norm": 0.4227381646633148,
"learning_rate": 0.00035346884100174715,
"loss": 3.2376,
"step": 70650
},
{
"epoch": 20.57638565440149,
"grad_norm": 0.39980873465538025,
"learning_rate": 0.0003532941176470588,
"loss": 3.2329,
"step": 70700
},
{
"epoch": 20.59094084769446,
"grad_norm": 0.36712634563446045,
"learning_rate": 0.0003531193942923704,
"loss": 3.2385,
"step": 70750
},
{
"epoch": 20.605496040987425,
"grad_norm": 0.4282870590686798,
"learning_rate": 0.000352944670937682,
"loss": 3.2392,
"step": 70800
},
{
"epoch": 20.620051234280393,
"grad_norm": 0.4079741835594177,
"learning_rate": 0.0003527699475829935,
"loss": 3.2345,
"step": 70850
},
{
"epoch": 20.63460642757336,
"grad_norm": 0.40561193227767944,
"learning_rate": 0.0003525952242283051,
"loss": 3.2285,
"step": 70900
},
{
"epoch": 20.649161620866327,
"grad_norm": 0.39606034755706787,
"learning_rate": 0.00035242050087361677,
"loss": 3.2346,
"step": 70950
},
{
"epoch": 20.663716814159294,
"grad_norm": 0.4000726640224457,
"learning_rate": 0.00035224577751892836,
"loss": 3.245,
"step": 71000
},
{
"epoch": 20.663716814159294,
"eval_accuracy": 0.37348931870713853,
"eval_loss": 3.5370469093322754,
"eval_runtime": 179.6602,
"eval_samples_per_second": 92.681,
"eval_steps_per_second": 5.794,
"step": 71000
},
{
"epoch": 20.678272007452257,
"grad_norm": 0.4043654203414917,
"learning_rate": 0.0003520710541642399,
"loss": 3.2364,
"step": 71050
},
{
"epoch": 20.692827200745224,
"grad_norm": 0.40891769528388977,
"learning_rate": 0.0003518963308095515,
"loss": 3.2401,
"step": 71100
},
{
"epoch": 20.70738239403819,
"grad_norm": 0.43435272574424744,
"learning_rate": 0.0003517216074548631,
"loss": 3.2437,
"step": 71150
},
{
"epoch": 20.72193758733116,
"grad_norm": 0.37514767050743103,
"learning_rate": 0.0003515468841001747,
"loss": 3.2427,
"step": 71200
},
{
"epoch": 20.736492780624125,
"grad_norm": 0.40798866748809814,
"learning_rate": 0.00035137216074548634,
"loss": 3.2485,
"step": 71250
},
{
"epoch": 20.751047973917093,
"grad_norm": 0.4111190140247345,
"learning_rate": 0.0003511974373907979,
"loss": 3.234,
"step": 71300
},
{
"epoch": 20.76560316721006,
"grad_norm": 0.4060057997703552,
"learning_rate": 0.00035102271403610947,
"loss": 3.242,
"step": 71350
},
{
"epoch": 20.780158360503027,
"grad_norm": 0.4118170142173767,
"learning_rate": 0.00035084799068142106,
"loss": 3.2461,
"step": 71400
},
{
"epoch": 20.794713553795994,
"grad_norm": 0.40792983770370483,
"learning_rate": 0.0003506732673267326,
"loss": 3.2456,
"step": 71450
},
{
"epoch": 20.80926874708896,
"grad_norm": 0.3925841748714447,
"learning_rate": 0.00035049854397204425,
"loss": 3.2465,
"step": 71500
},
{
"epoch": 20.823823940381928,
"grad_norm": 0.3952105939388275,
"learning_rate": 0.00035032382061735585,
"loss": 3.2509,
"step": 71550
},
{
"epoch": 20.838379133674895,
"grad_norm": 0.4176660180091858,
"learning_rate": 0.00035014909726266744,
"loss": 3.2386,
"step": 71600
},
{
"epoch": 20.852934326967862,
"grad_norm": 0.3933994174003601,
"learning_rate": 0.000349974373907979,
"loss": 3.2424,
"step": 71650
},
{
"epoch": 20.86748952026083,
"grad_norm": 0.38902366161346436,
"learning_rate": 0.0003497996505532906,
"loss": 3.2306,
"step": 71700
},
{
"epoch": 20.882044713553796,
"grad_norm": 0.38957300782203674,
"learning_rate": 0.00034962492719860217,
"loss": 3.2595,
"step": 71750
},
{
"epoch": 20.896599906846763,
"grad_norm": 0.4192463159561157,
"learning_rate": 0.0003494502038439138,
"loss": 3.2512,
"step": 71800
},
{
"epoch": 20.91115510013973,
"grad_norm": 0.39436599612236023,
"learning_rate": 0.00034927548048922536,
"loss": 3.2299,
"step": 71850
},
{
"epoch": 20.925710293432697,
"grad_norm": 0.4020264148712158,
"learning_rate": 0.00034910075713453695,
"loss": 3.2395,
"step": 71900
},
{
"epoch": 20.940265486725664,
"grad_norm": 0.4359363317489624,
"learning_rate": 0.00034892603377984855,
"loss": 3.2572,
"step": 71950
},
{
"epoch": 20.95482068001863,
"grad_norm": 0.4143703579902649,
"learning_rate": 0.0003487513104251601,
"loss": 3.2399,
"step": 72000
},
{
"epoch": 20.95482068001863,
"eval_accuracy": 0.37419577533997417,
"eval_loss": 3.530885934829712,
"eval_runtime": 180.8418,
"eval_samples_per_second": 92.075,
"eval_steps_per_second": 5.756,
"step": 72000
},
{
"epoch": 20.9693758733116,
"grad_norm": 0.40823879837989807,
"learning_rate": 0.0003485765870704717,
"loss": 3.2552,
"step": 72050
},
{
"epoch": 20.983931066604566,
"grad_norm": 0.3722114861011505,
"learning_rate": 0.00034840186371578333,
"loss": 3.246,
"step": 72100
},
{
"epoch": 20.998486259897533,
"grad_norm": 0.38992324471473694,
"learning_rate": 0.0003482271403610949,
"loss": 3.261,
"step": 72150
},
{
"epoch": 21.01280857009781,
"grad_norm": 0.3851622939109802,
"learning_rate": 0.0003480524170064065,
"loss": 3.1588,
"step": 72200
},
{
"epoch": 21.027363763390778,
"grad_norm": 0.41066426038742065,
"learning_rate": 0.00034787769365171806,
"loss": 3.1497,
"step": 72250
},
{
"epoch": 21.041918956683745,
"grad_norm": 0.3948858082294464,
"learning_rate": 0.00034770297029702965,
"loss": 3.1566,
"step": 72300
},
{
"epoch": 21.05647414997671,
"grad_norm": 0.4023319482803345,
"learning_rate": 0.0003475282469423413,
"loss": 3.1449,
"step": 72350
},
{
"epoch": 21.07102934326968,
"grad_norm": 0.4502675235271454,
"learning_rate": 0.0003473535235876529,
"loss": 3.1525,
"step": 72400
},
{
"epoch": 21.085584536562646,
"grad_norm": 0.38525882363319397,
"learning_rate": 0.00034717880023296444,
"loss": 3.164,
"step": 72450
},
{
"epoch": 21.100139729855613,
"grad_norm": 0.4162638187408447,
"learning_rate": 0.00034700407687827603,
"loss": 3.1658,
"step": 72500
},
{
"epoch": 21.11469492314858,
"grad_norm": 0.39325228333473206,
"learning_rate": 0.0003468293535235876,
"loss": 3.1698,
"step": 72550
},
{
"epoch": 21.129250116441547,
"grad_norm": 0.40508460998535156,
"learning_rate": 0.00034665463016889916,
"loss": 3.1801,
"step": 72600
},
{
"epoch": 21.143805309734514,
"grad_norm": 0.4105875492095947,
"learning_rate": 0.0003464799068142108,
"loss": 3.1711,
"step": 72650
},
{
"epoch": 21.15836050302748,
"grad_norm": 0.4069672226905823,
"learning_rate": 0.0003463051834595224,
"loss": 3.1771,
"step": 72700
},
{
"epoch": 21.172915696320448,
"grad_norm": 0.38891884684562683,
"learning_rate": 0.000346130460104834,
"loss": 3.18,
"step": 72750
},
{
"epoch": 21.187470889613415,
"grad_norm": 0.3792036175727844,
"learning_rate": 0.00034595573675014554,
"loss": 3.1873,
"step": 72800
},
{
"epoch": 21.202026082906382,
"grad_norm": 0.41310176253318787,
"learning_rate": 0.00034578101339545714,
"loss": 3.1875,
"step": 72850
},
{
"epoch": 21.21658127619935,
"grad_norm": 0.37777265906333923,
"learning_rate": 0.0003456062900407688,
"loss": 3.1911,
"step": 72900
},
{
"epoch": 21.231136469492316,
"grad_norm": 0.4072975814342499,
"learning_rate": 0.0003454315666860804,
"loss": 3.1877,
"step": 72950
},
{
"epoch": 21.245691662785283,
"grad_norm": 0.40285155177116394,
"learning_rate": 0.0003452568433313919,
"loss": 3.1951,
"step": 73000
},
{
"epoch": 21.245691662785283,
"eval_accuracy": 0.3731553061051557,
"eval_loss": 3.54914927482605,
"eval_runtime": 181.1286,
"eval_samples_per_second": 91.929,
"eval_steps_per_second": 5.747,
"step": 73000
},
{
"epoch": 21.260246856078247,
"grad_norm": 0.43861880898475647,
"learning_rate": 0.0003450821199767035,
"loss": 3.1788,
"step": 73050
},
{
"epoch": 21.274802049371214,
"grad_norm": 0.388239324092865,
"learning_rate": 0.0003449073966220151,
"loss": 3.1891,
"step": 73100
},
{
"epoch": 21.28935724266418,
"grad_norm": 0.39079049229621887,
"learning_rate": 0.0003447326732673267,
"loss": 3.2009,
"step": 73150
},
{
"epoch": 21.303912435957148,
"grad_norm": 0.41338858008384705,
"learning_rate": 0.0003445579499126383,
"loss": 3.196,
"step": 73200
},
{
"epoch": 21.318467629250115,
"grad_norm": 0.3875797986984253,
"learning_rate": 0.0003443832265579499,
"loss": 3.1977,
"step": 73250
},
{
"epoch": 21.333022822543082,
"grad_norm": 0.41410455107688904,
"learning_rate": 0.0003442085032032615,
"loss": 3.2012,
"step": 73300
},
{
"epoch": 21.34757801583605,
"grad_norm": 0.4161223769187927,
"learning_rate": 0.0003440337798485731,
"loss": 3.2108,
"step": 73350
},
{
"epoch": 21.362133209129016,
"grad_norm": 0.4099974036216736,
"learning_rate": 0.0003438590564938846,
"loss": 3.1971,
"step": 73400
},
{
"epoch": 21.376688402421983,
"grad_norm": 0.40403392910957336,
"learning_rate": 0.0003436843331391962,
"loss": 3.2197,
"step": 73450
},
{
"epoch": 21.39124359571495,
"grad_norm": 0.38444963097572327,
"learning_rate": 0.00034350960978450786,
"loss": 3.2057,
"step": 73500
},
{
"epoch": 21.405798789007918,
"grad_norm": 0.3847692012786865,
"learning_rate": 0.00034333488642981946,
"loss": 3.2085,
"step": 73550
},
{
"epoch": 21.420353982300885,
"grad_norm": 0.4129888713359833,
"learning_rate": 0.000343160163075131,
"loss": 3.195,
"step": 73600
},
{
"epoch": 21.43490917559385,
"grad_norm": 0.40269991755485535,
"learning_rate": 0.0003429854397204426,
"loss": 3.2062,
"step": 73650
},
{
"epoch": 21.44946436888682,
"grad_norm": 0.42505478858947754,
"learning_rate": 0.0003428107163657542,
"loss": 3.2191,
"step": 73700
},
{
"epoch": 21.464019562179786,
"grad_norm": 0.3840712904930115,
"learning_rate": 0.00034263599301106583,
"loss": 3.2103,
"step": 73750
},
{
"epoch": 21.478574755472753,
"grad_norm": 0.393205463886261,
"learning_rate": 0.0003424612696563774,
"loss": 3.2007,
"step": 73800
},
{
"epoch": 21.49312994876572,
"grad_norm": 0.4051612317562103,
"learning_rate": 0.00034228654630168897,
"loss": 3.2217,
"step": 73850
},
{
"epoch": 21.507685142058687,
"grad_norm": 0.39363154768943787,
"learning_rate": 0.00034211182294700056,
"loss": 3.2072,
"step": 73900
},
{
"epoch": 21.522240335351654,
"grad_norm": 0.41303566098213196,
"learning_rate": 0.0003419370995923121,
"loss": 3.2194,
"step": 73950
},
{
"epoch": 21.53679552864462,
"grad_norm": 0.40681394934654236,
"learning_rate": 0.0003417623762376237,
"loss": 3.2234,
"step": 74000
},
{
"epoch": 21.53679552864462,
"eval_accuracy": 0.3737651552929421,
"eval_loss": 3.5365705490112305,
"eval_runtime": 179.8936,
"eval_samples_per_second": 92.56,
"eval_steps_per_second": 5.787,
"step": 74000
},
{
"epoch": 21.551350721937588,
"grad_norm": 0.4122057557106018,
"learning_rate": 0.00034158765288293534,
"loss": 3.2145,
"step": 74050
},
{
"epoch": 21.565905915230555,
"grad_norm": 0.3874419331550598,
"learning_rate": 0.00034141292952824694,
"loss": 3.2164,
"step": 74100
},
{
"epoch": 21.580461108523522,
"grad_norm": 0.4723573625087738,
"learning_rate": 0.0003412382061735585,
"loss": 3.2241,
"step": 74150
},
{
"epoch": 21.59501630181649,
"grad_norm": 0.39037251472473145,
"learning_rate": 0.0003410634828188701,
"loss": 3.2313,
"step": 74200
},
{
"epoch": 21.609571495109456,
"grad_norm": 0.4305918216705322,
"learning_rate": 0.00034088875946418167,
"loss": 3.2167,
"step": 74250
},
{
"epoch": 21.624126688402423,
"grad_norm": 0.4206292927265167,
"learning_rate": 0.0003407140361094933,
"loss": 3.2183,
"step": 74300
},
{
"epoch": 21.63868188169539,
"grad_norm": 0.3807690739631653,
"learning_rate": 0.00034053931275480486,
"loss": 3.217,
"step": 74350
},
{
"epoch": 21.653237074988354,
"grad_norm": 0.45432552695274353,
"learning_rate": 0.00034036458940011645,
"loss": 3.2195,
"step": 74400
},
{
"epoch": 21.66779226828132,
"grad_norm": 0.4133349359035492,
"learning_rate": 0.00034018986604542804,
"loss": 3.2259,
"step": 74450
},
{
"epoch": 21.682347461574288,
"grad_norm": 0.38990363478660583,
"learning_rate": 0.00034001514269073964,
"loss": 3.2348,
"step": 74500
},
{
"epoch": 21.696902654867255,
"grad_norm": 0.43620920181274414,
"learning_rate": 0.0003398404193360512,
"loss": 3.2293,
"step": 74550
},
{
"epoch": 21.711457848160222,
"grad_norm": 0.4420872628688812,
"learning_rate": 0.00033966569598136283,
"loss": 3.2225,
"step": 74600
},
{
"epoch": 21.72601304145319,
"grad_norm": 0.3839336335659027,
"learning_rate": 0.0003394909726266744,
"loss": 3.2255,
"step": 74650
},
{
"epoch": 21.740568234746156,
"grad_norm": 0.42757683992385864,
"learning_rate": 0.000339316249271986,
"loss": 3.225,
"step": 74700
},
{
"epoch": 21.755123428039123,
"grad_norm": 0.39489442110061646,
"learning_rate": 0.00033914152591729756,
"loss": 3.2369,
"step": 74750
},
{
"epoch": 21.76967862133209,
"grad_norm": 0.4031055271625519,
"learning_rate": 0.00033896680256260915,
"loss": 3.2364,
"step": 74800
},
{
"epoch": 21.784233814625058,
"grad_norm": 0.4320981502532959,
"learning_rate": 0.00033879207920792074,
"loss": 3.2302,
"step": 74850
},
{
"epoch": 21.798789007918025,
"grad_norm": 0.40397652983665466,
"learning_rate": 0.0003386173558532324,
"loss": 3.2205,
"step": 74900
},
{
"epoch": 21.81334420121099,
"grad_norm": 0.39160752296447754,
"learning_rate": 0.00033844263249854393,
"loss": 3.2293,
"step": 74950
},
{
"epoch": 21.82789939450396,
"grad_norm": 0.39503806829452515,
"learning_rate": 0.00033826790914385553,
"loss": 3.2386,
"step": 75000
},
{
"epoch": 21.82789939450396,
"eval_accuracy": 0.3742340892415387,
"eval_loss": 3.5338709354400635,
"eval_runtime": 179.9943,
"eval_samples_per_second": 92.508,
"eval_steps_per_second": 5.784,
"step": 75000
},
{
"epoch": 21.842454587796926,
"grad_norm": 0.4009569585323334,
"learning_rate": 0.0003380931857891671,
"loss": 3.2343,
"step": 75050
},
{
"epoch": 21.857009781089893,
"grad_norm": 0.40219414234161377,
"learning_rate": 0.00033791846243447866,
"loss": 3.2429,
"step": 75100
},
{
"epoch": 21.87156497438286,
"grad_norm": 0.4035627841949463,
"learning_rate": 0.0003377437390797903,
"loss": 3.2337,
"step": 75150
},
{
"epoch": 21.886120167675827,
"grad_norm": 0.4356299936771393,
"learning_rate": 0.0003375690157251019,
"loss": 3.2375,
"step": 75200
},
{
"epoch": 21.900675360968794,
"grad_norm": 0.3984715938568115,
"learning_rate": 0.0003373942923704135,
"loss": 3.2308,
"step": 75250
},
{
"epoch": 21.91523055426176,
"grad_norm": 0.3872779309749603,
"learning_rate": 0.00033721956901572504,
"loss": 3.2432,
"step": 75300
},
{
"epoch": 21.929785747554728,
"grad_norm": 0.39195629954338074,
"learning_rate": 0.00033704484566103663,
"loss": 3.2495,
"step": 75350
},
{
"epoch": 21.944340940847695,
"grad_norm": 0.41744935512542725,
"learning_rate": 0.00033687012230634823,
"loss": 3.2373,
"step": 75400
},
{
"epoch": 21.958896134140662,
"grad_norm": 0.4085555374622345,
"learning_rate": 0.0003366953989516599,
"loss": 3.2461,
"step": 75450
},
{
"epoch": 21.97345132743363,
"grad_norm": 0.3933025598526001,
"learning_rate": 0.00033652067559697147,
"loss": 3.237,
"step": 75500
},
{
"epoch": 21.988006520726596,
"grad_norm": 0.3915697932243347,
"learning_rate": 0.000336345952242283,
"loss": 3.2568,
"step": 75550
},
{
"epoch": 22.002328830926874,
"grad_norm": 0.3964177668094635,
"learning_rate": 0.0003361712288875946,
"loss": 3.2169,
"step": 75600
},
{
"epoch": 22.01688402421984,
"grad_norm": 0.40610381960868835,
"learning_rate": 0.0003359965055329062,
"loss": 3.1147,
"step": 75650
},
{
"epoch": 22.03143921751281,
"grad_norm": 0.40210819244384766,
"learning_rate": 0.00033582178217821785,
"loss": 3.1422,
"step": 75700
},
{
"epoch": 22.045994410805775,
"grad_norm": 0.419974148273468,
"learning_rate": 0.0003356470588235294,
"loss": 3.1387,
"step": 75750
},
{
"epoch": 22.060549604098743,
"grad_norm": 0.4376601278781891,
"learning_rate": 0.000335472335468841,
"loss": 3.151,
"step": 75800
},
{
"epoch": 22.07510479739171,
"grad_norm": 0.42732951045036316,
"learning_rate": 0.0003352976121141526,
"loss": 3.1584,
"step": 75850
},
{
"epoch": 22.089659990684677,
"grad_norm": 0.45031747221946716,
"learning_rate": 0.0003351228887594641,
"loss": 3.169,
"step": 75900
},
{
"epoch": 22.104215183977644,
"grad_norm": 0.4007145166397095,
"learning_rate": 0.0003349481654047757,
"loss": 3.1676,
"step": 75950
},
{
"epoch": 22.11877037727061,
"grad_norm": 0.39019888639450073,
"learning_rate": 0.00033477344205008736,
"loss": 3.1712,
"step": 76000
},
{
"epoch": 22.11877037727061,
"eval_accuracy": 0.3735336265012791,
"eval_loss": 3.547504425048828,
"eval_runtime": 179.7692,
"eval_samples_per_second": 92.624,
"eval_steps_per_second": 5.791,
"step": 76000
},
{
"epoch": 22.133325570563578,
"grad_norm": 0.40922585129737854,
"learning_rate": 0.00033459871869539895,
"loss": 3.1625,
"step": 76050
},
{
"epoch": 22.147880763856545,
"grad_norm": 0.39725741744041443,
"learning_rate": 0.0003344239953407105,
"loss": 3.1695,
"step": 76100
},
{
"epoch": 22.162435957149512,
"grad_norm": 0.442618191242218,
"learning_rate": 0.0003342492719860221,
"loss": 3.1558,
"step": 76150
},
{
"epoch": 22.17699115044248,
"grad_norm": 0.3919195234775543,
"learning_rate": 0.0003340745486313337,
"loss": 3.1578,
"step": 76200
},
{
"epoch": 22.191546343735446,
"grad_norm": 0.4100443720817566,
"learning_rate": 0.0003338998252766453,
"loss": 3.168,
"step": 76250
},
{
"epoch": 22.206101537028413,
"grad_norm": 0.4021892547607422,
"learning_rate": 0.00033372510192195687,
"loss": 3.1668,
"step": 76300
},
{
"epoch": 22.22065673032138,
"grad_norm": 0.41602620482444763,
"learning_rate": 0.00033355037856726847,
"loss": 3.1793,
"step": 76350
},
{
"epoch": 22.235211923614347,
"grad_norm": 0.40192118287086487,
"learning_rate": 0.00033337565521258006,
"loss": 3.1831,
"step": 76400
},
{
"epoch": 22.24976711690731,
"grad_norm": 0.4379929304122925,
"learning_rate": 0.00033320093185789165,
"loss": 3.1783,
"step": 76450
},
{
"epoch": 22.264322310200278,
"grad_norm": 0.42289242148399353,
"learning_rate": 0.0003330262085032032,
"loss": 3.1886,
"step": 76500
},
{
"epoch": 22.278877503493245,
"grad_norm": 0.4312947392463684,
"learning_rate": 0.00033285148514851484,
"loss": 3.1825,
"step": 76550
},
{
"epoch": 22.293432696786212,
"grad_norm": 0.41763949394226074,
"learning_rate": 0.00033267676179382644,
"loss": 3.1947,
"step": 76600
},
{
"epoch": 22.30798789007918,
"grad_norm": 0.4036341905593872,
"learning_rate": 0.00033250203843913803,
"loss": 3.1833,
"step": 76650
},
{
"epoch": 22.322543083372146,
"grad_norm": 0.4084683954715729,
"learning_rate": 0.00033232731508444957,
"loss": 3.1957,
"step": 76700
},
{
"epoch": 22.337098276665113,
"grad_norm": 0.3992261290550232,
"learning_rate": 0.00033215259172976117,
"loss": 3.1843,
"step": 76750
},
{
"epoch": 22.35165346995808,
"grad_norm": 0.4478423595428467,
"learning_rate": 0.00033197786837507276,
"loss": 3.1849,
"step": 76800
},
{
"epoch": 22.366208663251047,
"grad_norm": 0.420403391122818,
"learning_rate": 0.0003318031450203844,
"loss": 3.2038,
"step": 76850
},
{
"epoch": 22.380763856544014,
"grad_norm": 0.4012890160083771,
"learning_rate": 0.00033162842166569595,
"loss": 3.189,
"step": 76900
},
{
"epoch": 22.39531904983698,
"grad_norm": 0.3808094263076782,
"learning_rate": 0.00033145369831100754,
"loss": 3.2134,
"step": 76950
},
{
"epoch": 22.40987424312995,
"grad_norm": 0.4185815155506134,
"learning_rate": 0.00033127897495631914,
"loss": 3.1945,
"step": 77000
},
{
"epoch": 22.40987424312995,
"eval_accuracy": 0.37388820638170916,
"eval_loss": 3.5480434894561768,
"eval_runtime": 179.8842,
"eval_samples_per_second": 92.565,
"eval_steps_per_second": 5.787,
"step": 77000
},
{
"epoch": 22.424429436422916,
"grad_norm": 0.40921658277511597,
"learning_rate": 0.0003311042516016307,
"loss": 3.1995,
"step": 77050
},
{
"epoch": 22.438984629715883,
"grad_norm": 0.40508735179901123,
"learning_rate": 0.0003309295282469423,
"loss": 3.2075,
"step": 77100
},
{
"epoch": 22.45353982300885,
"grad_norm": 0.41545870900154114,
"learning_rate": 0.0003307548048922539,
"loss": 3.1968,
"step": 77150
},
{
"epoch": 22.468095016301817,
"grad_norm": 0.41426533460617065,
"learning_rate": 0.0003305800815375655,
"loss": 3.1945,
"step": 77200
},
{
"epoch": 22.482650209594784,
"grad_norm": 0.4070603549480438,
"learning_rate": 0.00033040535818287705,
"loss": 3.203,
"step": 77250
},
{
"epoch": 22.49720540288775,
"grad_norm": 0.41937437653541565,
"learning_rate": 0.00033023063482818865,
"loss": 3.2106,
"step": 77300
},
{
"epoch": 22.511760596180718,
"grad_norm": 0.41913917660713196,
"learning_rate": 0.00033005591147350024,
"loss": 3.1997,
"step": 77350
},
{
"epoch": 22.526315789473685,
"grad_norm": 0.42557600140571594,
"learning_rate": 0.0003298811881188119,
"loss": 3.2032,
"step": 77400
},
{
"epoch": 22.540870982766652,
"grad_norm": 0.4355701506137848,
"learning_rate": 0.00032970646476412343,
"loss": 3.2055,
"step": 77450
},
{
"epoch": 22.55542617605962,
"grad_norm": 0.3803400695323944,
"learning_rate": 0.000329531741409435,
"loss": 3.2191,
"step": 77500
},
{
"epoch": 22.569981369352586,
"grad_norm": 0.39761683344841003,
"learning_rate": 0.0003293570180547466,
"loss": 3.2094,
"step": 77550
},
{
"epoch": 22.584536562645553,
"grad_norm": 0.41126570105552673,
"learning_rate": 0.0003291822947000582,
"loss": 3.2084,
"step": 77600
},
{
"epoch": 22.59909175593852,
"grad_norm": 0.4171556830406189,
"learning_rate": 0.00032900757134536975,
"loss": 3.208,
"step": 77650
},
{
"epoch": 22.613646949231487,
"grad_norm": 0.39230960607528687,
"learning_rate": 0.0003288328479906814,
"loss": 3.2135,
"step": 77700
},
{
"epoch": 22.628202142524454,
"grad_norm": 0.4159952700138092,
"learning_rate": 0.000328658124635993,
"loss": 3.2063,
"step": 77750
},
{
"epoch": 22.64275733581742,
"grad_norm": 0.4337736666202545,
"learning_rate": 0.0003284834012813046,
"loss": 3.2097,
"step": 77800
},
{
"epoch": 22.657312529110385,
"grad_norm": 0.4640325605869293,
"learning_rate": 0.00032830867792661613,
"loss": 3.2151,
"step": 77850
},
{
"epoch": 22.671867722403352,
"grad_norm": 0.4161394238471985,
"learning_rate": 0.0003281339545719277,
"loss": 3.2275,
"step": 77900
},
{
"epoch": 22.68642291569632,
"grad_norm": 0.41482967138290405,
"learning_rate": 0.0003279592312172394,
"loss": 3.2264,
"step": 77950
},
{
"epoch": 22.700978108989286,
"grad_norm": 0.3908606469631195,
"learning_rate": 0.00032778450786255097,
"loss": 3.2309,
"step": 78000
},
{
"epoch": 22.700978108989286,
"eval_accuracy": 0.37420458988788013,
"eval_loss": 3.5342824459075928,
"eval_runtime": 179.7395,
"eval_samples_per_second": 92.64,
"eval_steps_per_second": 5.792,
"step": 78000
},
{
"epoch": 22.715533302282253,
"grad_norm": 0.44009092450141907,
"learning_rate": 0.0003276097845078625,
"loss": 3.2251,
"step": 78050
},
{
"epoch": 22.73008849557522,
"grad_norm": 0.4078558087348938,
"learning_rate": 0.0003274350611531741,
"loss": 3.2101,
"step": 78100
},
{
"epoch": 22.744643688868187,
"grad_norm": 0.4620960056781769,
"learning_rate": 0.0003272603377984857,
"loss": 3.2185,
"step": 78150
},
{
"epoch": 22.759198882161154,
"grad_norm": 0.3999122083187103,
"learning_rate": 0.00032708561444379724,
"loss": 3.2213,
"step": 78200
},
{
"epoch": 22.77375407545412,
"grad_norm": 0.387114942073822,
"learning_rate": 0.0003269108910891089,
"loss": 3.2123,
"step": 78250
},
{
"epoch": 22.78830926874709,
"grad_norm": 0.4111201763153076,
"learning_rate": 0.0003267361677344205,
"loss": 3.2208,
"step": 78300
},
{
"epoch": 22.802864462040056,
"grad_norm": 0.4094546437263489,
"learning_rate": 0.0003265614443797321,
"loss": 3.2251,
"step": 78350
},
{
"epoch": 22.817419655333023,
"grad_norm": 0.404011070728302,
"learning_rate": 0.0003263867210250436,
"loss": 3.2301,
"step": 78400
},
{
"epoch": 22.83197484862599,
"grad_norm": 0.410820335149765,
"learning_rate": 0.0003262119976703552,
"loss": 3.2216,
"step": 78450
},
{
"epoch": 22.846530041918957,
"grad_norm": 0.3758966624736786,
"learning_rate": 0.00032603727431566686,
"loss": 3.2169,
"step": 78500
},
{
"epoch": 22.861085235211924,
"grad_norm": 0.3818444311618805,
"learning_rate": 0.00032586255096097845,
"loss": 3.2228,
"step": 78550
},
{
"epoch": 22.87564042850489,
"grad_norm": 0.41991475224494934,
"learning_rate": 0.00032568782760629005,
"loss": 3.2161,
"step": 78600
},
{
"epoch": 22.890195621797858,
"grad_norm": 0.42864012718200684,
"learning_rate": 0.0003255131042516016,
"loss": 3.2297,
"step": 78650
},
{
"epoch": 22.904750815090825,
"grad_norm": 0.40493571758270264,
"learning_rate": 0.0003253383808969132,
"loss": 3.2292,
"step": 78700
},
{
"epoch": 22.919306008383792,
"grad_norm": 0.4492877721786499,
"learning_rate": 0.0003251636575422248,
"loss": 3.2415,
"step": 78750
},
{
"epoch": 22.93386120167676,
"grad_norm": 0.4454686939716339,
"learning_rate": 0.0003249889341875364,
"loss": 3.2403,
"step": 78800
},
{
"epoch": 22.948416394969726,
"grad_norm": 0.4318472146987915,
"learning_rate": 0.00032481421083284796,
"loss": 3.2177,
"step": 78850
},
{
"epoch": 22.962971588262693,
"grad_norm": 0.40572306513786316,
"learning_rate": 0.00032463948747815956,
"loss": 3.2211,
"step": 78900
},
{
"epoch": 22.97752678155566,
"grad_norm": 0.40303680300712585,
"learning_rate": 0.00032446476412347115,
"loss": 3.2347,
"step": 78950
},
{
"epoch": 22.992081974848627,
"grad_norm": 0.3872184455394745,
"learning_rate": 0.0003242900407687827,
"loss": 3.2372,
"step": 79000
},
{
"epoch": 22.992081974848627,
"eval_accuracy": 0.3749660492996489,
"eval_loss": 3.5285708904266357,
"eval_runtime": 179.7425,
"eval_samples_per_second": 92.638,
"eval_steps_per_second": 5.792,
"step": 79000
},
{
"epoch": 23.006404285048905,
"grad_norm": 0.42881467938423157,
"learning_rate": 0.00032411531741409434,
"loss": 3.1789,
"step": 79050
},
{
"epoch": 23.020959478341872,
"grad_norm": 0.4351500868797302,
"learning_rate": 0.00032394059405940593,
"loss": 3.1258,
"step": 79100
},
{
"epoch": 23.03551467163484,
"grad_norm": 0.3753584623336792,
"learning_rate": 0.00032376587070471753,
"loss": 3.1297,
"step": 79150
},
{
"epoch": 23.050069864927806,
"grad_norm": 0.41720595955848694,
"learning_rate": 0.00032359114735002907,
"loss": 3.1342,
"step": 79200
},
{
"epoch": 23.064625058220773,
"grad_norm": 0.4204729497432709,
"learning_rate": 0.00032341642399534066,
"loss": 3.1364,
"step": 79250
},
{
"epoch": 23.07918025151374,
"grad_norm": 0.4257517158985138,
"learning_rate": 0.00032324170064065226,
"loss": 3.1494,
"step": 79300
},
{
"epoch": 23.093735444806708,
"grad_norm": 0.4221104681491852,
"learning_rate": 0.0003230669772859639,
"loss": 3.1491,
"step": 79350
},
{
"epoch": 23.108290638099675,
"grad_norm": 0.40155747532844543,
"learning_rate": 0.00032289225393127545,
"loss": 3.1436,
"step": 79400
},
{
"epoch": 23.12284583139264,
"grad_norm": 0.4350748062133789,
"learning_rate": 0.00032271753057658704,
"loss": 3.1517,
"step": 79450
},
{
"epoch": 23.13740102468561,
"grad_norm": 0.3839072585105896,
"learning_rate": 0.00032254280722189863,
"loss": 3.157,
"step": 79500
},
{
"epoch": 23.151956217978576,
"grad_norm": 0.4622083306312561,
"learning_rate": 0.00032236808386721023,
"loss": 3.1603,
"step": 79550
},
{
"epoch": 23.166511411271543,
"grad_norm": 0.40241503715515137,
"learning_rate": 0.00032219336051252177,
"loss": 3.1538,
"step": 79600
},
{
"epoch": 23.18106660456451,
"grad_norm": 0.4790761172771454,
"learning_rate": 0.0003220186371578334,
"loss": 3.1553,
"step": 79650
},
{
"epoch": 23.195621797857477,
"grad_norm": 0.402031272649765,
"learning_rate": 0.000321843913803145,
"loss": 3.1628,
"step": 79700
},
{
"epoch": 23.210176991150444,
"grad_norm": 0.4585934579372406,
"learning_rate": 0.0003216691904484566,
"loss": 3.1701,
"step": 79750
},
{
"epoch": 23.22473218444341,
"grad_norm": 0.3923730254173279,
"learning_rate": 0.00032149446709376815,
"loss": 3.1829,
"step": 79800
},
{
"epoch": 23.239287377736375,
"grad_norm": 0.4399240016937256,
"learning_rate": 0.00032131974373907974,
"loss": 3.167,
"step": 79850
},
{
"epoch": 23.25384257102934,
"grad_norm": 0.43063780665397644,
"learning_rate": 0.0003211450203843914,
"loss": 3.1584,
"step": 79900
},
{
"epoch": 23.26839776432231,
"grad_norm": 0.431403785943985,
"learning_rate": 0.000320970297029703,
"loss": 3.1723,
"step": 79950
},
{
"epoch": 23.282952957615276,
"grad_norm": 0.4274817705154419,
"learning_rate": 0.0003207955736750145,
"loss": 3.1752,
"step": 80000
},
{
"epoch": 23.282952957615276,
"eval_accuracy": 0.37391077162434844,
"eval_loss": 3.5465760231018066,
"eval_runtime": 179.7261,
"eval_samples_per_second": 92.647,
"eval_steps_per_second": 5.792,
"step": 80000
},
{
"epoch": 23.297508150908243,
"grad_norm": 0.42905303835868835,
"learning_rate": 0.0003206208503203261,
"loss": 3.1283,
"step": 80050
},
{
"epoch": 23.31206334420121,
"grad_norm": 0.41787195205688477,
"learning_rate": 0.0003204461269656377,
"loss": 3.1484,
"step": 80100
},
{
"epoch": 23.326618537494177,
"grad_norm": 0.4279998242855072,
"learning_rate": 0.00032027140361094925,
"loss": 3.1479,
"step": 80150
},
{
"epoch": 23.341173730787144,
"grad_norm": 0.45045384764671326,
"learning_rate": 0.0003200966802562609,
"loss": 3.1503,
"step": 80200
},
{
"epoch": 23.35572892408011,
"grad_norm": 0.43174025416374207,
"learning_rate": 0.0003199219569015725,
"loss": 3.1515,
"step": 80250
},
{
"epoch": 23.370284117373078,
"grad_norm": 0.4571380913257599,
"learning_rate": 0.0003197472335468841,
"loss": 3.145,
"step": 80300
},
{
"epoch": 23.384839310666045,
"grad_norm": 0.4241339862346649,
"learning_rate": 0.00031957251019219563,
"loss": 3.1444,
"step": 80350
},
{
"epoch": 23.399394503959012,
"grad_norm": 0.4476516842842102,
"learning_rate": 0.0003193977868375072,
"loss": 3.1501,
"step": 80400
},
{
"epoch": 23.41394969725198,
"grad_norm": 0.40319111943244934,
"learning_rate": 0.00031922306348281887,
"loss": 3.1557,
"step": 80450
},
{
"epoch": 23.428504890544946,
"grad_norm": 0.41085711121559143,
"learning_rate": 0.00031904834012813047,
"loss": 3.1622,
"step": 80500
},
{
"epoch": 23.443060083837914,
"grad_norm": 0.44014430046081543,
"learning_rate": 0.000318873616773442,
"loss": 3.1623,
"step": 80550
},
{
"epoch": 23.45761527713088,
"grad_norm": 0.4018969237804413,
"learning_rate": 0.0003186988934187536,
"loss": 3.1623,
"step": 80600
},
{
"epoch": 23.472170470423848,
"grad_norm": 0.4306873679161072,
"learning_rate": 0.0003185241700640652,
"loss": 3.1734,
"step": 80650
},
{
"epoch": 23.486725663716815,
"grad_norm": 0.41860219836235046,
"learning_rate": 0.0003183494467093768,
"loss": 3.1688,
"step": 80700
},
{
"epoch": 23.50128085700978,
"grad_norm": 0.41949665546417236,
"learning_rate": 0.0003181747233546884,
"loss": 3.1631,
"step": 80750
},
{
"epoch": 23.51583605030275,
"grad_norm": 0.4215621054172516,
"learning_rate": 0.000318,
"loss": 3.1708,
"step": 80800
},
{
"epoch": 23.530391243595716,
"grad_norm": 0.456132709980011,
"learning_rate": 0.00031782527664531157,
"loss": 3.1717,
"step": 80850
},
{
"epoch": 23.544946436888683,
"grad_norm": 0.47575968503952026,
"learning_rate": 0.00031765055329062317,
"loss": 3.1729,
"step": 80900
},
{
"epoch": 23.55950163018165,
"grad_norm": 0.4039624035358429,
"learning_rate": 0.0003174758299359347,
"loss": 3.178,
"step": 80950
},
{
"epoch": 23.574056823474617,
"grad_norm": 0.42520585656166077,
"learning_rate": 0.0003173011065812463,
"loss": 3.1701,
"step": 81000
},
{
"epoch": 23.574056823474617,
"eval_accuracy": 0.3734778010312081,
"eval_loss": 3.5532686710357666,
"eval_runtime": 180.8181,
"eval_samples_per_second": 92.087,
"eval_steps_per_second": 5.757,
"step": 81000
},
{
"epoch": 23.588612016767584,
"grad_norm": 0.43075016140937805,
"learning_rate": 0.00031712638322655795,
"loss": 3.1776,
"step": 81050
},
{
"epoch": 23.60316721006055,
"grad_norm": 0.39215949177742004,
"learning_rate": 0.00031695165987186954,
"loss": 3.176,
"step": 81100
},
{
"epoch": 23.61772240335352,
"grad_norm": 0.3989499807357788,
"learning_rate": 0.0003167769365171811,
"loss": 3.175,
"step": 81150
},
{
"epoch": 23.63227759664648,
"grad_norm": 0.43521571159362793,
"learning_rate": 0.0003166022131624927,
"loss": 3.1837,
"step": 81200
},
{
"epoch": 23.64683278993945,
"grad_norm": 0.4280226528644562,
"learning_rate": 0.00031642748980780427,
"loss": 3.1874,
"step": 81250
},
{
"epoch": 23.661387983232416,
"grad_norm": 0.40798208117485046,
"learning_rate": 0.0003162527664531159,
"loss": 3.1861,
"step": 81300
},
{
"epoch": 23.675943176525383,
"grad_norm": 0.4074450433254242,
"learning_rate": 0.00031607804309842746,
"loss": 3.1853,
"step": 81350
},
{
"epoch": 23.69049836981835,
"grad_norm": 0.43709275126457214,
"learning_rate": 0.00031590331974373905,
"loss": 3.1914,
"step": 81400
},
{
"epoch": 23.705053563111317,
"grad_norm": 0.4177038371562958,
"learning_rate": 0.00031572859638905065,
"loss": 3.1864,
"step": 81450
},
{
"epoch": 23.719608756404284,
"grad_norm": 0.43477728962898254,
"learning_rate": 0.0003155538730343622,
"loss": 3.1916,
"step": 81500
},
{
"epoch": 23.73416394969725,
"grad_norm": 0.4035871624946594,
"learning_rate": 0.0003153791496796738,
"loss": 3.1793,
"step": 81550
},
{
"epoch": 23.74871914299022,
"grad_norm": 0.4537198543548584,
"learning_rate": 0.00031520442632498543,
"loss": 3.2004,
"step": 81600
},
{
"epoch": 23.763274336283185,
"grad_norm": 0.45444414019584656,
"learning_rate": 0.000315029702970297,
"loss": 3.1969,
"step": 81650
},
{
"epoch": 23.777829529576152,
"grad_norm": 0.4409884214401245,
"learning_rate": 0.0003148549796156086,
"loss": 3.1884,
"step": 81700
},
{
"epoch": 23.79238472286912,
"grad_norm": 0.44495829939842224,
"learning_rate": 0.00031468025626092016,
"loss": 3.1847,
"step": 81750
},
{
"epoch": 23.806939916162086,
"grad_norm": 0.391963928937912,
"learning_rate": 0.00031450553290623175,
"loss": 3.1951,
"step": 81800
},
{
"epoch": 23.821495109455054,
"grad_norm": 0.4508795738220215,
"learning_rate": 0.0003143308095515434,
"loss": 3.2053,
"step": 81850
},
{
"epoch": 23.83605030274802,
"grad_norm": 0.45079120993614197,
"learning_rate": 0.000314156086196855,
"loss": 3.194,
"step": 81900
},
{
"epoch": 23.850605496040988,
"grad_norm": 0.4412509500980377,
"learning_rate": 0.00031398136284216654,
"loss": 3.1898,
"step": 81950
},
{
"epoch": 23.865160689333955,
"grad_norm": 0.4032447040081024,
"learning_rate": 0.00031380663948747813,
"loss": 3.1977,
"step": 82000
},
{
"epoch": 23.865160689333955,
"eval_accuracy": 0.37412596412055904,
"eval_loss": 3.540754795074463,
"eval_runtime": 178.8063,
"eval_samples_per_second": 93.123,
"eval_steps_per_second": 5.822,
"step": 82000
},
{
"epoch": 23.879715882626922,
"grad_norm": 0.41658344864845276,
"learning_rate": 0.0003136319161327897,
"loss": 3.1999,
"step": 82050
},
{
"epoch": 23.89427107591989,
"grad_norm": 0.42867329716682434,
"learning_rate": 0.00031345719277810127,
"loss": 3.1996,
"step": 82100
},
{
"epoch": 23.908826269212856,
"grad_norm": 0.4257943034172058,
"learning_rate": 0.0003132824694234129,
"loss": 3.1991,
"step": 82150
},
{
"epoch": 23.923381462505823,
"grad_norm": 0.4132334887981415,
"learning_rate": 0.0003131077460687245,
"loss": 3.203,
"step": 82200
},
{
"epoch": 23.93793665579879,
"grad_norm": 0.4158965051174164,
"learning_rate": 0.0003129330227140361,
"loss": 3.1978,
"step": 82250
},
{
"epoch": 23.952491849091757,
"grad_norm": 0.44557440280914307,
"learning_rate": 0.00031275829935934764,
"loss": 3.197,
"step": 82300
},
{
"epoch": 23.967047042384724,
"grad_norm": 0.4208097457885742,
"learning_rate": 0.00031258357600465924,
"loss": 3.2107,
"step": 82350
},
{
"epoch": 23.98160223567769,
"grad_norm": 0.41734635829925537,
"learning_rate": 0.00031240885264997083,
"loss": 3.2195,
"step": 82400
},
{
"epoch": 23.99615742897066,
"grad_norm": 0.4121745228767395,
"learning_rate": 0.0003122341292952825,
"loss": 3.2018,
"step": 82450
},
{
"epoch": 24.010770843036795,
"grad_norm": 0.43925735354423523,
"learning_rate": 0.000312059405940594,
"loss": 3.2022,
"step": 82500
},
{
"epoch": 24.025326036329762,
"grad_norm": 0.43141815066337585,
"learning_rate": 0.0003118846825859056,
"loss": 3.1254,
"step": 82550
},
{
"epoch": 24.03988122962273,
"grad_norm": 0.44609951972961426,
"learning_rate": 0.0003117099592312172,
"loss": 3.1231,
"step": 82600
},
{
"epoch": 24.054436422915696,
"grad_norm": 0.4301334619522095,
"learning_rate": 0.0003115352358765288,
"loss": 3.1492,
"step": 82650
},
{
"epoch": 24.068991616208663,
"grad_norm": 0.45953306555747986,
"learning_rate": 0.0003113605125218404,
"loss": 3.1395,
"step": 82700
},
{
"epoch": 24.08354680950163,
"grad_norm": 0.4030922055244446,
"learning_rate": 0.000311185789167152,
"loss": 3.1388,
"step": 82750
},
{
"epoch": 24.098102002794597,
"grad_norm": 0.42163464426994324,
"learning_rate": 0.0003110110658124636,
"loss": 3.1503,
"step": 82800
},
{
"epoch": 24.112657196087564,
"grad_norm": 0.42299437522888184,
"learning_rate": 0.0003108363424577752,
"loss": 3.1515,
"step": 82850
},
{
"epoch": 24.12721238938053,
"grad_norm": 0.4112618863582611,
"learning_rate": 0.0003106616191030867,
"loss": 3.1419,
"step": 82900
},
{
"epoch": 24.1417675826735,
"grad_norm": 0.4395235776901245,
"learning_rate": 0.0003104868957483983,
"loss": 3.1511,
"step": 82950
},
{
"epoch": 24.156322775966466,
"grad_norm": 0.4338419735431671,
"learning_rate": 0.00031031217239370996,
"loss": 3.154,
"step": 83000
},
{
"epoch": 24.156322775966466,
"eval_accuracy": 0.3735538411978101,
"eval_loss": 3.55212664604187,
"eval_runtime": 178.7314,
"eval_samples_per_second": 93.162,
"eval_steps_per_second": 5.824,
"step": 83000
},
{
"epoch": 24.170877969259433,
"grad_norm": 0.410587340593338,
"learning_rate": 0.00031013744903902156,
"loss": 3.1564,
"step": 83050
},
{
"epoch": 24.1854331625524,
"grad_norm": 0.4255504012107849,
"learning_rate": 0.0003099627256843331,
"loss": 3.16,
"step": 83100
},
{
"epoch": 24.199988355845367,
"grad_norm": 0.44280168414115906,
"learning_rate": 0.0003097880023296447,
"loss": 3.1635,
"step": 83150
},
{
"epoch": 24.214543549138334,
"grad_norm": 0.4021797180175781,
"learning_rate": 0.0003096132789749563,
"loss": 3.1546,
"step": 83200
},
{
"epoch": 24.2290987424313,
"grad_norm": 0.41863852739334106,
"learning_rate": 0.00030943855562026794,
"loss": 3.1633,
"step": 83250
},
{
"epoch": 24.243653935724268,
"grad_norm": 0.42614641785621643,
"learning_rate": 0.0003092638322655795,
"loss": 3.16,
"step": 83300
},
{
"epoch": 24.258209129017235,
"grad_norm": 0.42498284578323364,
"learning_rate": 0.00030908910891089107,
"loss": 3.1721,
"step": 83350
},
{
"epoch": 24.2727643223102,
"grad_norm": 0.44240236282348633,
"learning_rate": 0.00030891438555620266,
"loss": 3.1735,
"step": 83400
},
{
"epoch": 24.287319515603166,
"grad_norm": 0.4178884029388428,
"learning_rate": 0.0003087396622015142,
"loss": 3.1825,
"step": 83450
},
{
"epoch": 24.301874708896133,
"grad_norm": 0.4449654519557953,
"learning_rate": 0.0003085649388468258,
"loss": 3.1667,
"step": 83500
},
{
"epoch": 24.3164299021891,
"grad_norm": 0.43948787450790405,
"learning_rate": 0.00030839021549213745,
"loss": 3.17,
"step": 83550
},
{
"epoch": 24.330985095482067,
"grad_norm": 0.3911823630332947,
"learning_rate": 0.00030821549213744904,
"loss": 3.1732,
"step": 83600
},
{
"epoch": 24.345540288775034,
"grad_norm": 0.427975058555603,
"learning_rate": 0.0003080407687827606,
"loss": 3.1786,
"step": 83650
},
{
"epoch": 24.360095482068,
"grad_norm": 0.45086926221847534,
"learning_rate": 0.0003078660454280722,
"loss": 3.1817,
"step": 83700
},
{
"epoch": 24.374650675360968,
"grad_norm": 0.4079881012439728,
"learning_rate": 0.00030769132207338377,
"loss": 3.1748,
"step": 83750
},
{
"epoch": 24.389205868653935,
"grad_norm": 0.43696466088294983,
"learning_rate": 0.00030751659871869536,
"loss": 3.1725,
"step": 83800
},
{
"epoch": 24.403761061946902,
"grad_norm": 0.404164582490921,
"learning_rate": 0.00030734187536400696,
"loss": 3.1947,
"step": 83850
},
{
"epoch": 24.41831625523987,
"grad_norm": 0.4178932309150696,
"learning_rate": 0.00030716715200931855,
"loss": 3.1885,
"step": 83900
},
{
"epoch": 24.432871448532836,
"grad_norm": 0.4175661504268646,
"learning_rate": 0.00030699242865463015,
"loss": 3.178,
"step": 83950
},
{
"epoch": 24.447426641825803,
"grad_norm": 0.4395267367362976,
"learning_rate": 0.00030681770529994174,
"loss": 3.1835,
"step": 84000
},
{
"epoch": 24.447426641825803,
"eval_accuracy": 0.3743471505093457,
"eval_loss": 3.5441832542419434,
"eval_runtime": 178.6435,
"eval_samples_per_second": 93.208,
"eval_steps_per_second": 5.827,
"step": 84000
},
{
"epoch": 24.46198183511877,
"grad_norm": 0.4422329068183899,
"learning_rate": 0.0003066429819452533,
"loss": 3.1938,
"step": 84050
},
{
"epoch": 24.476537028411737,
"grad_norm": 0.40939897298812866,
"learning_rate": 0.00030646825859056493,
"loss": 3.1864,
"step": 84100
},
{
"epoch": 24.491092221704704,
"grad_norm": 0.4184223413467407,
"learning_rate": 0.0003062935352358765,
"loss": 3.1853,
"step": 84150
},
{
"epoch": 24.50564741499767,
"grad_norm": 0.4515139162540436,
"learning_rate": 0.0003061188118811881,
"loss": 3.1952,
"step": 84200
},
{
"epoch": 24.52020260829064,
"grad_norm": 0.4533407390117645,
"learning_rate": 0.00030594408852649966,
"loss": 3.1857,
"step": 84250
},
{
"epoch": 24.534757801583606,
"grad_norm": 0.40750402212142944,
"learning_rate": 0.00030576936517181125,
"loss": 3.1891,
"step": 84300
},
{
"epoch": 24.549312994876573,
"grad_norm": 0.42680811882019043,
"learning_rate": 0.00030559464181712285,
"loss": 3.1951,
"step": 84350
},
{
"epoch": 24.56386818816954,
"grad_norm": 0.41157272458076477,
"learning_rate": 0.0003054199184624345,
"loss": 3.1828,
"step": 84400
},
{
"epoch": 24.578423381462507,
"grad_norm": 0.45721128582954407,
"learning_rate": 0.00030524519510774604,
"loss": 3.1925,
"step": 84450
},
{
"epoch": 24.592978574755474,
"grad_norm": 0.43095633387565613,
"learning_rate": 0.00030507047175305763,
"loss": 3.2138,
"step": 84500
},
{
"epoch": 24.60753376804844,
"grad_norm": 0.4171367585659027,
"learning_rate": 0.0003048957483983692,
"loss": 3.1886,
"step": 84550
},
{
"epoch": 24.622088961341408,
"grad_norm": 0.4192904531955719,
"learning_rate": 0.00030472102504368076,
"loss": 3.1885,
"step": 84600
},
{
"epoch": 24.636644154634375,
"grad_norm": 0.42844662070274353,
"learning_rate": 0.0003045463016889924,
"loss": 3.1945,
"step": 84650
},
{
"epoch": 24.651199347927342,
"grad_norm": 0.40473800897598267,
"learning_rate": 0.000304371578334304,
"loss": 3.1873,
"step": 84700
},
{
"epoch": 24.66575454122031,
"grad_norm": 0.4121386408805847,
"learning_rate": 0.0003041968549796156,
"loss": 3.2009,
"step": 84750
},
{
"epoch": 24.680309734513273,
"grad_norm": 0.4244219958782196,
"learning_rate": 0.00030402213162492714,
"loss": 3.2004,
"step": 84800
},
{
"epoch": 24.69486492780624,
"grad_norm": 0.42446020245552063,
"learning_rate": 0.00030384740827023874,
"loss": 3.1985,
"step": 84850
},
{
"epoch": 24.709420121099207,
"grad_norm": 0.40833067893981934,
"learning_rate": 0.00030367268491555033,
"loss": 3.1967,
"step": 84900
},
{
"epoch": 24.723975314392174,
"grad_norm": 0.41140016913414,
"learning_rate": 0.000303497961560862,
"loss": 3.1969,
"step": 84950
},
{
"epoch": 24.73853050768514,
"grad_norm": 0.4671842157840729,
"learning_rate": 0.00030332323820617357,
"loss": 3.205,
"step": 85000
},
{
"epoch": 24.73853050768514,
"eval_accuracy": 0.37468727453121004,
"eval_loss": 3.537564754486084,
"eval_runtime": 178.3167,
"eval_samples_per_second": 93.379,
"eval_steps_per_second": 5.838,
"step": 85000
},
{
"epoch": 24.753085700978108,
"grad_norm": 0.4072178304195404,
"learning_rate": 0.0003031485148514851,
"loss": 3.1955,
"step": 85050
},
{
"epoch": 24.767640894271075,
"grad_norm": 0.4268096387386322,
"learning_rate": 0.0003029737914967967,
"loss": 3.2029,
"step": 85100
},
{
"epoch": 24.782196087564042,
"grad_norm": 0.445999413728714,
"learning_rate": 0.0003027990681421083,
"loss": 3.2048,
"step": 85150
},
{
"epoch": 24.79675128085701,
"grad_norm": 0.45961517095565796,
"learning_rate": 0.00030262434478741984,
"loss": 3.2095,
"step": 85200
},
{
"epoch": 24.811306474149976,
"grad_norm": 0.4011656641960144,
"learning_rate": 0.0003024496214327315,
"loss": 3.2021,
"step": 85250
},
{
"epoch": 24.825861667442943,
"grad_norm": 0.4256271719932556,
"learning_rate": 0.0003022748980780431,
"loss": 3.2095,
"step": 85300
},
{
"epoch": 24.84041686073591,
"grad_norm": 0.4334579408168793,
"learning_rate": 0.0003021001747233547,
"loss": 3.2105,
"step": 85350
},
{
"epoch": 24.854972054028877,
"grad_norm": 0.4193101227283478,
"learning_rate": 0.0003019254513686662,
"loss": 3.2101,
"step": 85400
},
{
"epoch": 24.869527247321844,
"grad_norm": 0.4495272934436798,
"learning_rate": 0.0003017507280139778,
"loss": 3.2186,
"step": 85450
},
{
"epoch": 24.88408244061481,
"grad_norm": 0.4173179566860199,
"learning_rate": 0.00030157600465928946,
"loss": 3.2105,
"step": 85500
},
{
"epoch": 24.89863763390778,
"grad_norm": 0.43967458605766296,
"learning_rate": 0.00030140128130460106,
"loss": 3.1966,
"step": 85550
},
{
"epoch": 24.913192827200746,
"grad_norm": 0.3922642767429352,
"learning_rate": 0.0003012265579499126,
"loss": 3.2005,
"step": 85600
},
{
"epoch": 24.927748020493713,
"grad_norm": 0.40184563398361206,
"learning_rate": 0.0003010518345952242,
"loss": 3.2247,
"step": 85650
},
{
"epoch": 24.94230321378668,
"grad_norm": 0.44088226556777954,
"learning_rate": 0.0003008771112405358,
"loss": 3.2217,
"step": 85700
},
{
"epoch": 24.956858407079647,
"grad_norm": 0.4002504348754883,
"learning_rate": 0.0003007023878858473,
"loss": 3.2126,
"step": 85750
},
{
"epoch": 24.971413600372614,
"grad_norm": 0.45589736104011536,
"learning_rate": 0.000300527664531159,
"loss": 3.2148,
"step": 85800
},
{
"epoch": 24.98596879366558,
"grad_norm": 0.41425585746765137,
"learning_rate": 0.00030035294117647057,
"loss": 3.2123,
"step": 85850
},
{
"epoch": 25.00029110386586,
"grad_norm": 0.460923433303833,
"learning_rate": 0.00030017821782178216,
"loss": 3.2125,
"step": 85900
},
{
"epoch": 25.014846297158826,
"grad_norm": 0.4303196370601654,
"learning_rate": 0.00030000349446709376,
"loss": 3.1173,
"step": 85950
},
{
"epoch": 25.029401490451793,
"grad_norm": 0.42220667004585266,
"learning_rate": 0.00029982877111240535,
"loss": 3.1135,
"step": 86000
},
{
"epoch": 25.029401490451793,
"eval_accuracy": 0.3740235978375446,
"eval_loss": 3.548854112625122,
"eval_runtime": 178.4682,
"eval_samples_per_second": 93.3,
"eval_steps_per_second": 5.833,
"step": 86000
},
{
"epoch": 25.04395668374476,
"grad_norm": 0.43721678853034973,
"learning_rate": 0.00029965404775771694,
"loss": 3.1255,
"step": 86050
},
{
"epoch": 25.058511877037727,
"grad_norm": 0.4440969228744507,
"learning_rate": 0.0002994793244030285,
"loss": 3.1214,
"step": 86100
},
{
"epoch": 25.073067070330694,
"grad_norm": 0.4456615746021271,
"learning_rate": 0.00029930460104834013,
"loss": 3.1181,
"step": 86150
},
{
"epoch": 25.08762226362366,
"grad_norm": 0.4187942147254944,
"learning_rate": 0.0002991298776936517,
"loss": 3.1234,
"step": 86200
},
{
"epoch": 25.10217745691663,
"grad_norm": 0.4579191207885742,
"learning_rate": 0.00029895515433896327,
"loss": 3.1356,
"step": 86250
},
{
"epoch": 25.116732650209595,
"grad_norm": 0.4180799722671509,
"learning_rate": 0.00029878043098427486,
"loss": 3.1369,
"step": 86300
},
{
"epoch": 25.131287843502562,
"grad_norm": 0.43055835366249084,
"learning_rate": 0.00029860570762958646,
"loss": 3.1436,
"step": 86350
},
{
"epoch": 25.14584303679553,
"grad_norm": 0.44100478291511536,
"learning_rate": 0.00029843098427489805,
"loss": 3.1481,
"step": 86400
},
{
"epoch": 25.160398230088497,
"grad_norm": 0.44051438570022583,
"learning_rate": 0.00029825626092020964,
"loss": 3.1397,
"step": 86450
},
{
"epoch": 25.174953423381464,
"grad_norm": 0.4257868528366089,
"learning_rate": 0.00029808153756552124,
"loss": 3.139,
"step": 86500
},
{
"epoch": 25.18950861667443,
"grad_norm": 0.4832608103752136,
"learning_rate": 0.00029790681421083283,
"loss": 3.1449,
"step": 86550
},
{
"epoch": 25.204063809967398,
"grad_norm": 0.44304877519607544,
"learning_rate": 0.00029773209085614443,
"loss": 3.1539,
"step": 86600
},
{
"epoch": 25.218619003260365,
"grad_norm": 0.4223651587963104,
"learning_rate": 0.00029755736750145597,
"loss": 3.1409,
"step": 86650
},
{
"epoch": 25.233174196553332,
"grad_norm": 0.42087194323539734,
"learning_rate": 0.0002973826441467676,
"loss": 3.1611,
"step": 86700
},
{
"epoch": 25.2477293898463,
"grad_norm": 0.4258721172809601,
"learning_rate": 0.00029720792079207916,
"loss": 3.1438,
"step": 86750
},
{
"epoch": 25.262284583139262,
"grad_norm": 0.43097466230392456,
"learning_rate": 0.00029703319743739075,
"loss": 3.1442,
"step": 86800
},
{
"epoch": 25.27683977643223,
"grad_norm": 0.44095996022224426,
"learning_rate": 0.00029685847408270234,
"loss": 3.1555,
"step": 86850
},
{
"epoch": 25.291394969725197,
"grad_norm": 0.4092617332935333,
"learning_rate": 0.00029668375072801394,
"loss": 3.16,
"step": 86900
},
{
"epoch": 25.305950163018164,
"grad_norm": 0.42463231086730957,
"learning_rate": 0.00029650902737332553,
"loss": 3.1714,
"step": 86950
},
{
"epoch": 25.32050535631113,
"grad_norm": 0.4192950427532196,
"learning_rate": 0.00029633430401863713,
"loss": 3.1552,
"step": 87000
},
{
"epoch": 25.32050535631113,
"eval_accuracy": 0.3736667849383117,
"eval_loss": 3.5517146587371826,
"eval_runtime": 178.1632,
"eval_samples_per_second": 93.459,
"eval_steps_per_second": 5.843,
"step": 87000
},
{
"epoch": 25.335060549604098,
"grad_norm": 0.4604697823524475,
"learning_rate": 0.0002961595806639487,
"loss": 3.1667,
"step": 87050
},
{
"epoch": 25.349615742897065,
"grad_norm": 0.4282155930995941,
"learning_rate": 0.0002959848573092603,
"loss": 3.1786,
"step": 87100
},
{
"epoch": 25.364170936190032,
"grad_norm": 0.43425947427749634,
"learning_rate": 0.0002958101339545719,
"loss": 3.1807,
"step": 87150
},
{
"epoch": 25.378726129483,
"grad_norm": 0.4362315833568573,
"learning_rate": 0.0002956354105998835,
"loss": 3.1767,
"step": 87200
},
{
"epoch": 25.393281322775966,
"grad_norm": 0.44421157240867615,
"learning_rate": 0.0002954606872451951,
"loss": 3.1611,
"step": 87250
},
{
"epoch": 25.407836516068933,
"grad_norm": 0.4191991984844208,
"learning_rate": 0.0002952859638905067,
"loss": 3.1799,
"step": 87300
},
{
"epoch": 25.4223917093619,
"grad_norm": 0.428579717874527,
"learning_rate": 0.00029511124053581823,
"loss": 3.1655,
"step": 87350
},
{
"epoch": 25.436946902654867,
"grad_norm": 0.40432706475257874,
"learning_rate": 0.0002949365171811299,
"loss": 3.1687,
"step": 87400
},
{
"epoch": 25.451502095947834,
"grad_norm": 0.4361858069896698,
"learning_rate": 0.0002947617938264414,
"loss": 3.1667,
"step": 87450
},
{
"epoch": 25.4660572892408,
"grad_norm": 0.44031381607055664,
"learning_rate": 0.000294587070471753,
"loss": 3.1805,
"step": 87500
},
{
"epoch": 25.48061248253377,
"grad_norm": 0.4382694661617279,
"learning_rate": 0.0002944123471170646,
"loss": 3.1793,
"step": 87550
},
{
"epoch": 25.495167675826735,
"grad_norm": 0.43290427327156067,
"learning_rate": 0.0002942376237623762,
"loss": 3.1917,
"step": 87600
},
{
"epoch": 25.509722869119702,
"grad_norm": 0.4313948452472687,
"learning_rate": 0.0002940629004076878,
"loss": 3.1664,
"step": 87650
},
{
"epoch": 25.52427806241267,
"grad_norm": 0.40133073925971985,
"learning_rate": 0.0002938881770529994,
"loss": 3.1776,
"step": 87700
},
{
"epoch": 25.538833255705637,
"grad_norm": 0.468847393989563,
"learning_rate": 0.000293713453698311,
"loss": 3.189,
"step": 87750
},
{
"epoch": 25.553388448998604,
"grad_norm": 0.4637649655342102,
"learning_rate": 0.0002935387303436226,
"loss": 3.1765,
"step": 87800
},
{
"epoch": 25.56794364229157,
"grad_norm": 0.43740376830101013,
"learning_rate": 0.0002933640069889342,
"loss": 3.1829,
"step": 87850
},
{
"epoch": 25.582498835584538,
"grad_norm": 0.44679924845695496,
"learning_rate": 0.0002931892836342457,
"loss": 3.1873,
"step": 87900
},
{
"epoch": 25.597054028877505,
"grad_norm": 0.42557457089424133,
"learning_rate": 0.00029301456027955736,
"loss": 3.176,
"step": 87950
},
{
"epoch": 25.611609222170472,
"grad_norm": 0.4091741144657135,
"learning_rate": 0.0002928398369248689,
"loss": 3.1777,
"step": 88000
},
{
"epoch": 25.611609222170472,
"eval_accuracy": 0.3743444473813212,
"eval_loss": 3.5408666133880615,
"eval_runtime": 178.3812,
"eval_samples_per_second": 93.345,
"eval_steps_per_second": 5.836,
"step": 88000
},
{
"epoch": 25.62616441546344,
"grad_norm": 0.4278210997581482,
"learning_rate": 0.0002926651135701805,
"loss": 3.1852,
"step": 88050
},
{
"epoch": 25.640719608756406,
"grad_norm": 0.41783830523490906,
"learning_rate": 0.00029249039021549215,
"loss": 3.19,
"step": 88100
},
{
"epoch": 25.65527480204937,
"grad_norm": 0.4398443102836609,
"learning_rate": 0.0002923156668608037,
"loss": 3.1858,
"step": 88150
},
{
"epoch": 25.669829995342337,
"grad_norm": 0.4523797035217285,
"learning_rate": 0.0002921409435061153,
"loss": 3.1921,
"step": 88200
},
{
"epoch": 25.684385188635304,
"grad_norm": 0.44268232583999634,
"learning_rate": 0.0002919662201514269,
"loss": 3.1922,
"step": 88250
},
{
"epoch": 25.69894038192827,
"grad_norm": 0.41883960366249084,
"learning_rate": 0.00029179149679673847,
"loss": 3.1956,
"step": 88300
},
{
"epoch": 25.713495575221238,
"grad_norm": 0.43059247732162476,
"learning_rate": 0.00029161677344205007,
"loss": 3.1949,
"step": 88350
},
{
"epoch": 25.728050768514205,
"grad_norm": 0.43987998366355896,
"learning_rate": 0.00029144205008736166,
"loss": 3.1903,
"step": 88400
},
{
"epoch": 25.742605961807172,
"grad_norm": 0.4239983558654785,
"learning_rate": 0.00029126732673267325,
"loss": 3.1882,
"step": 88450
},
{
"epoch": 25.75716115510014,
"grad_norm": 0.4204862713813782,
"learning_rate": 0.00029109260337798485,
"loss": 3.1831,
"step": 88500
},
{
"epoch": 25.771716348393106,
"grad_norm": 0.4111863672733307,
"learning_rate": 0.00029091788002329644,
"loss": 3.1927,
"step": 88550
},
{
"epoch": 25.786271541686073,
"grad_norm": 0.42216384410858154,
"learning_rate": 0.000290743156668608,
"loss": 3.1872,
"step": 88600
},
{
"epoch": 25.80082673497904,
"grad_norm": 0.4510778486728668,
"learning_rate": 0.00029056843331391963,
"loss": 3.2054,
"step": 88650
},
{
"epoch": 25.815381928272007,
"grad_norm": 0.4231010675430298,
"learning_rate": 0.00029039370995923117,
"loss": 3.1947,
"step": 88700
},
{
"epoch": 25.829937121564974,
"grad_norm": 0.4821763336658478,
"learning_rate": 0.00029021898660454277,
"loss": 3.1927,
"step": 88750
},
{
"epoch": 25.84449231485794,
"grad_norm": 0.4131713807582855,
"learning_rate": 0.00029004426324985436,
"loss": 3.2026,
"step": 88800
},
{
"epoch": 25.85904750815091,
"grad_norm": 0.42869827151298523,
"learning_rate": 0.00028986953989516595,
"loss": 3.1973,
"step": 88850
},
{
"epoch": 25.873602701443875,
"grad_norm": 0.41724643111228943,
"learning_rate": 0.00028969481654047755,
"loss": 3.2063,
"step": 88900
},
{
"epoch": 25.888157894736842,
"grad_norm": 0.43880873918533325,
"learning_rate": 0.00028952009318578914,
"loss": 3.1844,
"step": 88950
},
{
"epoch": 25.90271308802981,
"grad_norm": 0.437773734331131,
"learning_rate": 0.00028934536983110074,
"loss": 3.2144,
"step": 89000
},
{
"epoch": 25.90271308802981,
"eval_accuracy": 0.3750591309255358,
"eval_loss": 3.5333504676818848,
"eval_runtime": 178.1036,
"eval_samples_per_second": 93.491,
"eval_steps_per_second": 5.845,
"step": 89000
},
{
"epoch": 25.917268281322777,
"grad_norm": 0.44758129119873047,
"learning_rate": 0.00028917064647641233,
"loss": 3.2018,
"step": 89050
},
{
"epoch": 25.931823474615744,
"grad_norm": 0.43168768286705017,
"learning_rate": 0.0002889959231217239,
"loss": 3.1961,
"step": 89100
},
{
"epoch": 25.94637866790871,
"grad_norm": 0.43598511815071106,
"learning_rate": 0.0002888211997670355,
"loss": 3.2,
"step": 89150
},
{
"epoch": 25.960933861201678,
"grad_norm": 0.4103822708129883,
"learning_rate": 0.0002886464764123471,
"loss": 3.2047,
"step": 89200
},
{
"epoch": 25.975489054494645,
"grad_norm": 0.4364749789237976,
"learning_rate": 0.0002884717530576587,
"loss": 3.2125,
"step": 89250
},
{
"epoch": 25.990044247787612,
"grad_norm": 0.42849549651145935,
"learning_rate": 0.00028829702970297025,
"loss": 3.2003,
"step": 89300
},
{
"epoch": 26.00436655798789,
"grad_norm": 0.4636796712875366,
"learning_rate": 0.0002881223063482819,
"loss": 3.1681,
"step": 89350
},
{
"epoch": 26.018921751280857,
"grad_norm": 0.46553125977516174,
"learning_rate": 0.00028794758299359344,
"loss": 3.108,
"step": 89400
},
{
"epoch": 26.033476944573824,
"grad_norm": 0.4672216475009918,
"learning_rate": 0.00028777285963890503,
"loss": 3.1076,
"step": 89450
},
{
"epoch": 26.04803213786679,
"grad_norm": 0.42928624153137207,
"learning_rate": 0.0002875981362842166,
"loss": 3.1082,
"step": 89500
},
{
"epoch": 26.062587331159758,
"grad_norm": 0.43934693932533264,
"learning_rate": 0.0002874234129295282,
"loss": 3.1121,
"step": 89550
},
{
"epoch": 26.077142524452725,
"grad_norm": 0.4280041456222534,
"learning_rate": 0.0002872486895748398,
"loss": 3.1098,
"step": 89600
},
{
"epoch": 26.091697717745692,
"grad_norm": 0.43326443433761597,
"learning_rate": 0.0002870739662201514,
"loss": 3.1155,
"step": 89650
},
{
"epoch": 26.10625291103866,
"grad_norm": 0.44299453496932983,
"learning_rate": 0.000286899242865463,
"loss": 3.1239,
"step": 89700
},
{
"epoch": 26.120808104331626,
"grad_norm": 0.43149131536483765,
"learning_rate": 0.0002867245195107746,
"loss": 3.1264,
"step": 89750
},
{
"epoch": 26.135363297624593,
"grad_norm": 0.4228934645652771,
"learning_rate": 0.0002865497961560862,
"loss": 3.1284,
"step": 89800
},
{
"epoch": 26.14991849091756,
"grad_norm": 0.4010949730873108,
"learning_rate": 0.00028637507280139773,
"loss": 3.1208,
"step": 89850
},
{
"epoch": 26.164473684210527,
"grad_norm": 0.4526248872280121,
"learning_rate": 0.0002862003494467094,
"loss": 3.1424,
"step": 89900
},
{
"epoch": 26.179028877503494,
"grad_norm": 0.441843718290329,
"learning_rate": 0.0002860256260920209,
"loss": 3.1476,
"step": 89950
},
{
"epoch": 26.19358407079646,
"grad_norm": 0.4254477322101593,
"learning_rate": 0.0002858509027373325,
"loss": 3.1364,
"step": 90000
},
{
"epoch": 26.19358407079646,
"eval_accuracy": 0.3741916618842847,
"eval_loss": 3.5503010749816895,
"eval_runtime": 181.3304,
"eval_samples_per_second": 91.827,
"eval_steps_per_second": 5.741,
"step": 90000
},
{
"epoch": 26.20813926408943,
"grad_norm": 0.44659337401390076,
"learning_rate": 0.0002856761793826441,
"loss": 3.1355,
"step": 90050
},
{
"epoch": 26.222694457382396,
"grad_norm": 0.42724499106407166,
"learning_rate": 0.0002855014560279557,
"loss": 3.1442,
"step": 90100
},
{
"epoch": 26.23724965067536,
"grad_norm": 0.42329496145248413,
"learning_rate": 0.0002853267326732673,
"loss": 3.1397,
"step": 90150
},
{
"epoch": 26.251804843968326,
"grad_norm": 0.48526531457901,
"learning_rate": 0.0002851520093185789,
"loss": 3.1581,
"step": 90200
},
{
"epoch": 26.266360037261293,
"grad_norm": 0.45286616683006287,
"learning_rate": 0.0002849772859638905,
"loss": 3.1552,
"step": 90250
},
{
"epoch": 26.28091523055426,
"grad_norm": 0.4201975464820862,
"learning_rate": 0.0002848025626092021,
"loss": 3.1467,
"step": 90300
},
{
"epoch": 26.295470423847227,
"grad_norm": 0.45757290720939636,
"learning_rate": 0.0002846278392545137,
"loss": 3.1504,
"step": 90350
},
{
"epoch": 26.310025617140194,
"grad_norm": 0.46606433391571045,
"learning_rate": 0.00028445311589982527,
"loss": 3.1574,
"step": 90400
},
{
"epoch": 26.32458081043316,
"grad_norm": 0.42274847626686096,
"learning_rate": 0.00028427839254513686,
"loss": 3.1672,
"step": 90450
},
{
"epoch": 26.33913600372613,
"grad_norm": 0.4176158905029297,
"learning_rate": 0.00028410366919044846,
"loss": 3.1589,
"step": 90500
},
{
"epoch": 26.353691197019096,
"grad_norm": 0.40520918369293213,
"learning_rate": 0.00028392894583576,
"loss": 3.1548,
"step": 90550
},
{
"epoch": 26.368246390312063,
"grad_norm": 0.43299931287765503,
"learning_rate": 0.00028375422248107165,
"loss": 3.159,
"step": 90600
},
{
"epoch": 26.38280158360503,
"grad_norm": 0.42173489928245544,
"learning_rate": 0.0002835794991263832,
"loss": 3.1499,
"step": 90650
},
{
"epoch": 26.397356776897997,
"grad_norm": 0.41218259930610657,
"learning_rate": 0.0002834047757716948,
"loss": 3.15,
"step": 90700
},
{
"epoch": 26.411911970190964,
"grad_norm": 0.4249900281429291,
"learning_rate": 0.0002832300524170064,
"loss": 3.1647,
"step": 90750
},
{
"epoch": 26.42646716348393,
"grad_norm": 0.4059372842311859,
"learning_rate": 0.00028305532906231797,
"loss": 3.1671,
"step": 90800
},
{
"epoch": 26.441022356776898,
"grad_norm": 0.4203883111476898,
"learning_rate": 0.00028288060570762956,
"loss": 3.1634,
"step": 90850
},
{
"epoch": 26.455577550069865,
"grad_norm": 0.4686765968799591,
"learning_rate": 0.00028270588235294116,
"loss": 3.1646,
"step": 90900
},
{
"epoch": 26.470132743362832,
"grad_norm": 0.47062376141548157,
"learning_rate": 0.00028253115899825275,
"loss": 3.1541,
"step": 90950
},
{
"epoch": 26.4846879366558,
"grad_norm": 0.4375110864639282,
"learning_rate": 0.0002823564356435643,
"loss": 3.1675,
"step": 91000
},
{
"epoch": 26.4846879366558,
"eval_accuracy": 0.3745528232938179,
"eval_loss": 3.5466043949127197,
"eval_runtime": 179.6775,
"eval_samples_per_second": 92.672,
"eval_steps_per_second": 5.794,
"step": 91000
},
{
"epoch": 26.499243129948766,
"grad_norm": 0.42747896909713745,
"learning_rate": 0.00028218171228887594,
"loss": 3.1751,
"step": 91050
},
{
"epoch": 26.513798323241733,
"grad_norm": 0.44301602244377136,
"learning_rate": 0.0002820069889341875,
"loss": 3.1676,
"step": 91100
},
{
"epoch": 26.5283535165347,
"grad_norm": 0.40379756689071655,
"learning_rate": 0.00028183226557949913,
"loss": 3.1587,
"step": 91150
},
{
"epoch": 26.542908709827667,
"grad_norm": 0.41646426916122437,
"learning_rate": 0.00028165754222481067,
"loss": 3.1727,
"step": 91200
},
{
"epoch": 26.557463903120635,
"grad_norm": 0.4359433948993683,
"learning_rate": 0.00028148281887012226,
"loss": 3.1746,
"step": 91250
},
{
"epoch": 26.5720190964136,
"grad_norm": 0.43240684270858765,
"learning_rate": 0.0002813080955154339,
"loss": 3.166,
"step": 91300
},
{
"epoch": 26.58657428970657,
"grad_norm": 0.4111410975456238,
"learning_rate": 0.00028113337216074545,
"loss": 3.1663,
"step": 91350
},
{
"epoch": 26.601129482999536,
"grad_norm": 0.4109819233417511,
"learning_rate": 0.00028095864880605705,
"loss": 3.1846,
"step": 91400
},
{
"epoch": 26.615684676292503,
"grad_norm": 0.44175857305526733,
"learning_rate": 0.00028078392545136864,
"loss": 3.1713,
"step": 91450
},
{
"epoch": 26.630239869585466,
"grad_norm": 0.43947285413742065,
"learning_rate": 0.00028060920209668023,
"loss": 3.169,
"step": 91500
},
{
"epoch": 26.644795062878433,
"grad_norm": 0.45988473296165466,
"learning_rate": 0.00028043447874199183,
"loss": 3.1822,
"step": 91550
},
{
"epoch": 26.6593502561714,
"grad_norm": 0.4398362934589386,
"learning_rate": 0.0002802597553873034,
"loss": 3.1828,
"step": 91600
},
{
"epoch": 26.673905449464367,
"grad_norm": 0.3981860876083374,
"learning_rate": 0.000280085032032615,
"loss": 3.1872,
"step": 91650
},
{
"epoch": 26.688460642757335,
"grad_norm": 0.46240130066871643,
"learning_rate": 0.00027991030867792656,
"loss": 3.1883,
"step": 91700
},
{
"epoch": 26.7030158360503,
"grad_norm": 0.46508920192718506,
"learning_rate": 0.0002797355853232382,
"loss": 3.1839,
"step": 91750
},
{
"epoch": 26.71757102934327,
"grad_norm": 0.4539368748664856,
"learning_rate": 0.00027956086196854975,
"loss": 3.1831,
"step": 91800
},
{
"epoch": 26.732126222636236,
"grad_norm": 0.4178009331226349,
"learning_rate": 0.0002793861386138614,
"loss": 3.1841,
"step": 91850
},
{
"epoch": 26.746681415929203,
"grad_norm": 0.4389614760875702,
"learning_rate": 0.00027921141525917293,
"loss": 3.1728,
"step": 91900
},
{
"epoch": 26.76123660922217,
"grad_norm": 0.444723904132843,
"learning_rate": 0.00027903669190448453,
"loss": 3.1765,
"step": 91950
},
{
"epoch": 26.775791802515137,
"grad_norm": 0.4468613266944885,
"learning_rate": 0.0002788619685497961,
"loss": 3.201,
"step": 92000
},
{
"epoch": 26.775791802515137,
"eval_accuracy": 0.37513763916555143,
"eval_loss": 3.536406993865967,
"eval_runtime": 178.3326,
"eval_samples_per_second": 93.371,
"eval_steps_per_second": 5.837,
"step": 92000
},
{
"epoch": 26.790346995808104,
"grad_norm": 0.4519892930984497,
"learning_rate": 0.0002786872451951077,
"loss": 3.1741,
"step": 92050
},
{
"epoch": 26.80490218910107,
"grad_norm": 0.45573484897613525,
"learning_rate": 0.0002785125218404193,
"loss": 3.1827,
"step": 92100
},
{
"epoch": 26.819457382394038,
"grad_norm": 0.43908625841140747,
"learning_rate": 0.0002783377984857309,
"loss": 3.1877,
"step": 92150
},
{
"epoch": 26.834012575687005,
"grad_norm": 0.4402633011341095,
"learning_rate": 0.0002781630751310425,
"loss": 3.1838,
"step": 92200
},
{
"epoch": 26.848567768979972,
"grad_norm": 0.4282047748565674,
"learning_rate": 0.0002779883517763541,
"loss": 3.193,
"step": 92250
},
{
"epoch": 26.86312296227294,
"grad_norm": 0.42211318016052246,
"learning_rate": 0.0002778136284216657,
"loss": 3.1904,
"step": 92300
},
{
"epoch": 26.877678155565906,
"grad_norm": 0.48066452145576477,
"learning_rate": 0.0002776389050669773,
"loss": 3.1884,
"step": 92350
},
{
"epoch": 26.892233348858873,
"grad_norm": 0.474904328584671,
"learning_rate": 0.0002774641817122888,
"loss": 3.1759,
"step": 92400
},
{
"epoch": 26.90678854215184,
"grad_norm": 0.45568016171455383,
"learning_rate": 0.00027728945835760047,
"loss": 3.1967,
"step": 92450
},
{
"epoch": 26.921343735444808,
"grad_norm": 0.45983555912971497,
"learning_rate": 0.000277114735002912,
"loss": 3.1979,
"step": 92500
},
{
"epoch": 26.935898928737775,
"grad_norm": 0.4195818603038788,
"learning_rate": 0.00027694001164822366,
"loss": 3.1885,
"step": 92550
},
{
"epoch": 26.95045412203074,
"grad_norm": 0.43301427364349365,
"learning_rate": 0.0002767652882935352,
"loss": 3.1931,
"step": 92600
},
{
"epoch": 26.96500931532371,
"grad_norm": 0.4479294419288635,
"learning_rate": 0.0002765905649388468,
"loss": 3.1973,
"step": 92650
},
{
"epoch": 26.979564508616676,
"grad_norm": 0.43495991826057434,
"learning_rate": 0.0002764158415841584,
"loss": 3.1899,
"step": 92700
},
{
"epoch": 26.994119701909643,
"grad_norm": 0.431049108505249,
"learning_rate": 0.00027624111822947,
"loss": 3.2076,
"step": 92750
},
{
"epoch": 27.00844201210992,
"grad_norm": 0.4351713955402374,
"learning_rate": 0.0002760663948747816,
"loss": 3.1334,
"step": 92800
},
{
"epoch": 27.022997205402888,
"grad_norm": 0.4406755268573761,
"learning_rate": 0.00027589167152009317,
"loss": 3.109,
"step": 92850
},
{
"epoch": 27.037552398695855,
"grad_norm": 0.46634653210639954,
"learning_rate": 0.00027571694816540477,
"loss": 3.1128,
"step": 92900
},
{
"epoch": 27.052107591988822,
"grad_norm": 0.4551210105419159,
"learning_rate": 0.0002755422248107163,
"loss": 3.0966,
"step": 92950
},
{
"epoch": 27.06666278528179,
"grad_norm": 0.45429322123527527,
"learning_rate": 0.00027536750145602795,
"loss": 3.1089,
"step": 93000
},
{
"epoch": 27.06666278528179,
"eval_accuracy": 0.374071548978153,
"eval_loss": 3.554753065109253,
"eval_runtime": 178.296,
"eval_samples_per_second": 93.39,
"eval_steps_per_second": 5.839,
"step": 93000
},
{
"epoch": 27.081217978574756,
"grad_norm": 0.44996699690818787,
"learning_rate": 0.0002751927781013395,
"loss": 3.1094,
"step": 93050
},
{
"epoch": 27.095773171867723,
"grad_norm": 0.45104679465293884,
"learning_rate": 0.0002750180547466511,
"loss": 3.11,
"step": 93100
},
{
"epoch": 27.11032836516069,
"grad_norm": 0.4221501052379608,
"learning_rate": 0.0002748433313919627,
"loss": 3.1181,
"step": 93150
},
{
"epoch": 27.124883558453657,
"grad_norm": 0.46069955825805664,
"learning_rate": 0.0002746686080372743,
"loss": 3.1173,
"step": 93200
},
{
"epoch": 27.139438751746624,
"grad_norm": 0.5119186043739319,
"learning_rate": 0.00027449388468258587,
"loss": 3.122,
"step": 93250
},
{
"epoch": 27.15399394503959,
"grad_norm": 0.4407660961151123,
"learning_rate": 0.00027431916132789747,
"loss": 3.1249,
"step": 93300
},
{
"epoch": 27.16854913833256,
"grad_norm": 0.426318496465683,
"learning_rate": 0.00027414443797320906,
"loss": 3.1265,
"step": 93350
},
{
"epoch": 27.183104331625525,
"grad_norm": 0.4476647973060608,
"learning_rate": 0.00027396971461852065,
"loss": 3.1378,
"step": 93400
},
{
"epoch": 27.197659524918492,
"grad_norm": 0.45376473665237427,
"learning_rate": 0.00027379499126383225,
"loss": 3.1141,
"step": 93450
},
{
"epoch": 27.21221471821146,
"grad_norm": 0.46606171131134033,
"learning_rate": 0.00027362026790914384,
"loss": 3.1289,
"step": 93500
},
{
"epoch": 27.226769911504423,
"grad_norm": 0.4278535842895508,
"learning_rate": 0.00027344554455445544,
"loss": 3.1362,
"step": 93550
},
{
"epoch": 27.24132510479739,
"grad_norm": 0.4427182972431183,
"learning_rate": 0.00027327082119976703,
"loss": 3.1378,
"step": 93600
},
{
"epoch": 27.255880298090357,
"grad_norm": 0.4301832616329193,
"learning_rate": 0.00027309609784507857,
"loss": 3.1287,
"step": 93650
},
{
"epoch": 27.270435491383324,
"grad_norm": 0.4756440222263336,
"learning_rate": 0.0002729213744903902,
"loss": 3.1426,
"step": 93700
},
{
"epoch": 27.28499068467629,
"grad_norm": 0.4330383837223053,
"learning_rate": 0.00027274665113570176,
"loss": 3.139,
"step": 93750
},
{
"epoch": 27.29954587796926,
"grad_norm": 0.44224536418914795,
"learning_rate": 0.00027257192778101335,
"loss": 3.14,
"step": 93800
},
{
"epoch": 27.314101071262225,
"grad_norm": 0.453418105840683,
"learning_rate": 0.00027239720442632495,
"loss": 3.1404,
"step": 93850
},
{
"epoch": 27.328656264555192,
"grad_norm": 0.4232199490070343,
"learning_rate": 0.00027222248107163654,
"loss": 3.1517,
"step": 93900
},
{
"epoch": 27.34321145784816,
"grad_norm": 0.4673709273338318,
"learning_rate": 0.00027204775771694814,
"loss": 3.1443,
"step": 93950
},
{
"epoch": 27.357766651141127,
"grad_norm": 0.46243950724601746,
"learning_rate": 0.00027187303436225973,
"loss": 3.151,
"step": 94000
},
{
"epoch": 27.357766651141127,
"eval_accuracy": 0.3744229556213369,
"eval_loss": 3.544090747833252,
"eval_runtime": 178.3649,
"eval_samples_per_second": 93.354,
"eval_steps_per_second": 5.836,
"step": 94000
},
{
"epoch": 27.372321844434094,
"grad_norm": 0.418197363615036,
"learning_rate": 0.0002716983110075713,
"loss": 3.1402,
"step": 94050
},
{
"epoch": 27.38687703772706,
"grad_norm": 0.48369506001472473,
"learning_rate": 0.0002715235876528829,
"loss": 3.137,
"step": 94100
},
{
"epoch": 27.401432231020028,
"grad_norm": 0.42239800095558167,
"learning_rate": 0.0002713488642981945,
"loss": 3.1435,
"step": 94150
},
{
"epoch": 27.415987424312995,
"grad_norm": 0.4270704984664917,
"learning_rate": 0.00027117414094350606,
"loss": 3.1562,
"step": 94200
},
{
"epoch": 27.430542617605962,
"grad_norm": 0.4096634089946747,
"learning_rate": 0.0002709994175888177,
"loss": 3.1447,
"step": 94250
},
{
"epoch": 27.44509781089893,
"grad_norm": 0.43484556674957275,
"learning_rate": 0.00027082469423412924,
"loss": 3.1578,
"step": 94300
},
{
"epoch": 27.459653004191896,
"grad_norm": 0.44652125239372253,
"learning_rate": 0.00027064997087944084,
"loss": 3.1537,
"step": 94350
},
{
"epoch": 27.474208197484863,
"grad_norm": 0.4425007104873657,
"learning_rate": 0.00027047524752475243,
"loss": 3.1643,
"step": 94400
},
{
"epoch": 27.48876339077783,
"grad_norm": 0.4466058909893036,
"learning_rate": 0.000270300524170064,
"loss": 3.1598,
"step": 94450
},
{
"epoch": 27.503318584070797,
"grad_norm": 0.4788243770599365,
"learning_rate": 0.0002701258008153757,
"loss": 3.1604,
"step": 94500
},
{
"epoch": 27.517873777363764,
"grad_norm": 0.4294137954711914,
"learning_rate": 0.0002699510774606872,
"loss": 3.16,
"step": 94550
},
{
"epoch": 27.53242897065673,
"grad_norm": 0.46710747480392456,
"learning_rate": 0.0002697763541059988,
"loss": 3.1648,
"step": 94600
},
{
"epoch": 27.5469841639497,
"grad_norm": 0.43919190764427185,
"learning_rate": 0.0002696016307513104,
"loss": 3.1596,
"step": 94650
},
{
"epoch": 27.561539357242665,
"grad_norm": 0.42766788601875305,
"learning_rate": 0.000269426907396622,
"loss": 3.1594,
"step": 94700
},
{
"epoch": 27.576094550535633,
"grad_norm": 0.4629672169685364,
"learning_rate": 0.0002692521840419336,
"loss": 3.168,
"step": 94750
},
{
"epoch": 27.5906497438286,
"grad_norm": 0.4353528916835785,
"learning_rate": 0.0002690774606872452,
"loss": 3.1524,
"step": 94800
},
{
"epoch": 27.605204937121567,
"grad_norm": 0.4430389404296875,
"learning_rate": 0.0002689027373325568,
"loss": 3.1609,
"step": 94850
},
{
"epoch": 27.619760130414534,
"grad_norm": 0.42469891905784607,
"learning_rate": 0.0002687280139778683,
"loss": 3.1755,
"step": 94900
},
{
"epoch": 27.634315323707497,
"grad_norm": 0.4509817957878113,
"learning_rate": 0.00026855329062317997,
"loss": 3.18,
"step": 94950
},
{
"epoch": 27.648870517000464,
"grad_norm": 0.4238099455833435,
"learning_rate": 0.0002683785672684915,
"loss": 3.1649,
"step": 95000
},
{
"epoch": 27.648870517000464,
"eval_accuracy": 0.37499155272492346,
"eval_loss": 3.537271022796631,
"eval_runtime": 178.3239,
"eval_samples_per_second": 93.375,
"eval_steps_per_second": 5.838,
"step": 95000
},
{
"epoch": 27.66342571029343,
"grad_norm": 0.4029273986816406,
"learning_rate": 0.0002682038439138031,
"loss": 3.1908,
"step": 95050
},
{
"epoch": 27.6779809035864,
"grad_norm": 0.4985395669937134,
"learning_rate": 0.0002680291205591147,
"loss": 3.1838,
"step": 95100
},
{
"epoch": 27.692536096879365,
"grad_norm": 0.43642306327819824,
"learning_rate": 0.0002678543972044263,
"loss": 3.174,
"step": 95150
},
{
"epoch": 27.707091290172333,
"grad_norm": 0.42214831709861755,
"learning_rate": 0.0002676796738497379,
"loss": 3.1736,
"step": 95200
},
{
"epoch": 27.7216464834653,
"grad_norm": 0.4335307478904724,
"learning_rate": 0.0002675049504950495,
"loss": 3.1655,
"step": 95250
},
{
"epoch": 27.736201676758267,
"grad_norm": 0.45082777738571167,
"learning_rate": 0.0002673302271403611,
"loss": 3.1698,
"step": 95300
},
{
"epoch": 27.750756870051234,
"grad_norm": 0.44629359245300293,
"learning_rate": 0.00026715550378567267,
"loss": 3.1739,
"step": 95350
},
{
"epoch": 27.7653120633442,
"grad_norm": 0.44939613342285156,
"learning_rate": 0.00026698078043098426,
"loss": 3.1689,
"step": 95400
},
{
"epoch": 27.779867256637168,
"grad_norm": 0.4434186816215515,
"learning_rate": 0.00026680605707629586,
"loss": 3.1773,
"step": 95450
},
{
"epoch": 27.794422449930135,
"grad_norm": 0.42752760648727417,
"learning_rate": 0.00026663133372160745,
"loss": 3.1754,
"step": 95500
},
{
"epoch": 27.808977643223102,
"grad_norm": 0.41842010617256165,
"learning_rate": 0.00026645661036691905,
"loss": 3.1773,
"step": 95550
},
{
"epoch": 27.82353283651607,
"grad_norm": 0.42154571413993835,
"learning_rate": 0.0002662818870122306,
"loss": 3.1826,
"step": 95600
},
{
"epoch": 27.838088029809036,
"grad_norm": 0.4588300585746765,
"learning_rate": 0.00026610716365754224,
"loss": 3.1839,
"step": 95650
},
{
"epoch": 27.852643223102003,
"grad_norm": 0.46981218457221985,
"learning_rate": 0.0002659324403028538,
"loss": 3.1816,
"step": 95700
},
{
"epoch": 27.86719841639497,
"grad_norm": 0.4390254020690918,
"learning_rate": 0.00026575771694816537,
"loss": 3.1891,
"step": 95750
},
{
"epoch": 27.881753609687937,
"grad_norm": 0.44484564661979675,
"learning_rate": 0.00026558299359347696,
"loss": 3.1896,
"step": 95800
},
{
"epoch": 27.896308802980904,
"grad_norm": 0.44375529885292053,
"learning_rate": 0.00026540827023878856,
"loss": 3.1737,
"step": 95850
},
{
"epoch": 27.91086399627387,
"grad_norm": 0.41514551639556885,
"learning_rate": 0.00026523354688410015,
"loss": 3.1779,
"step": 95900
},
{
"epoch": 27.92541918956684,
"grad_norm": 0.4646185636520386,
"learning_rate": 0.00026505882352941175,
"loss": 3.1753,
"step": 95950
},
{
"epoch": 27.939974382859806,
"grad_norm": 0.42588916420936584,
"learning_rate": 0.00026488410017472334,
"loss": 3.1918,
"step": 96000
},
{
"epoch": 27.939974382859806,
"eval_accuracy": 0.37517289735717524,
"eval_loss": 3.534158945083618,
"eval_runtime": 178.3519,
"eval_samples_per_second": 93.36,
"eval_steps_per_second": 5.837,
"step": 96000
},
{
"epoch": 27.954529576152773,
"grad_norm": 0.42573437094688416,
"learning_rate": 0.00026470937682003494,
"loss": 3.1895,
"step": 96050
},
{
"epoch": 27.96908476944574,
"grad_norm": 0.46956467628479004,
"learning_rate": 0.00026453465346534653,
"loss": 3.1848,
"step": 96100
},
{
"epoch": 27.983639962738707,
"grad_norm": 0.41468024253845215,
"learning_rate": 0.00026435993011065807,
"loss": 3.1891,
"step": 96150
},
{
"epoch": 27.998195156031674,
"grad_norm": 0.4592526853084564,
"learning_rate": 0.0002641852067559697,
"loss": 3.1807,
"step": 96200
},
{
"epoch": 28.01251746623195,
"grad_norm": 0.42601925134658813,
"learning_rate": 0.00026401048340128126,
"loss": 3.1132,
"step": 96250
},
{
"epoch": 28.02707265952492,
"grad_norm": 0.45896121859550476,
"learning_rate": 0.00026383576004659285,
"loss": 3.0964,
"step": 96300
},
{
"epoch": 28.041627852817886,
"grad_norm": 0.4385358691215515,
"learning_rate": 0.00026366103669190445,
"loss": 3.0943,
"step": 96350
},
{
"epoch": 28.056183046110853,
"grad_norm": 0.4664997160434723,
"learning_rate": 0.00026348631333721604,
"loss": 3.0952,
"step": 96400
},
{
"epoch": 28.07073823940382,
"grad_norm": 0.4414977729320526,
"learning_rate": 0.00026331158998252764,
"loss": 3.1015,
"step": 96450
},
{
"epoch": 28.085293432696787,
"grad_norm": 0.47265881299972534,
"learning_rate": 0.00026313686662783923,
"loss": 3.1015,
"step": 96500
},
{
"epoch": 28.099848625989754,
"grad_norm": 0.45517316460609436,
"learning_rate": 0.0002629621432731508,
"loss": 3.1052,
"step": 96550
},
{
"epoch": 28.11440381928272,
"grad_norm": 0.43620362877845764,
"learning_rate": 0.0002627874199184624,
"loss": 3.1148,
"step": 96600
},
{
"epoch": 28.128959012575688,
"grad_norm": 0.49415022134780884,
"learning_rate": 0.000262612696563774,
"loss": 3.101,
"step": 96650
},
{
"epoch": 28.143514205868655,
"grad_norm": 0.43687793612480164,
"learning_rate": 0.0002624379732090856,
"loss": 3.1072,
"step": 96700
},
{
"epoch": 28.158069399161622,
"grad_norm": 0.46462565660476685,
"learning_rate": 0.0002622632498543972,
"loss": 3.1101,
"step": 96750
},
{
"epoch": 28.17262459245459,
"grad_norm": 0.4543834328651428,
"learning_rate": 0.0002620885264997088,
"loss": 3.1162,
"step": 96800
},
{
"epoch": 28.187179785747556,
"grad_norm": 0.4572249948978424,
"learning_rate": 0.00026191380314502034,
"loss": 3.1219,
"step": 96850
},
{
"epoch": 28.201734979040523,
"grad_norm": 0.44651177525520325,
"learning_rate": 0.000261739079790332,
"loss": 3.1183,
"step": 96900
},
{
"epoch": 28.216290172333487,
"grad_norm": 0.43274229764938354,
"learning_rate": 0.0002615643564356435,
"loss": 3.1271,
"step": 96950
},
{
"epoch": 28.230845365626454,
"grad_norm": 0.4469480514526367,
"learning_rate": 0.0002613896330809551,
"loss": 3.1161,
"step": 97000
},
{
"epoch": 28.230845365626454,
"eval_accuracy": 0.37445022195619265,
"eval_loss": 3.5514791011810303,
"eval_runtime": 180.2815,
"eval_samples_per_second": 92.361,
"eval_steps_per_second": 5.774,
"step": 97000
},
{
"epoch": 28.24540055891942,
"grad_norm": 0.4637867212295532,
"learning_rate": 0.0002612149097262667,
"loss": 3.1273,
"step": 97050
},
{
"epoch": 28.259955752212388,
"grad_norm": 0.46112245321273804,
"learning_rate": 0.0002610401863715783,
"loss": 3.1342,
"step": 97100
},
{
"epoch": 28.274510945505355,
"grad_norm": 0.4241412580013275,
"learning_rate": 0.0002608654630168899,
"loss": 3.1403,
"step": 97150
},
{
"epoch": 28.289066138798322,
"grad_norm": 0.5504874587059021,
"learning_rate": 0.0002606907396622015,
"loss": 3.1314,
"step": 97200
},
{
"epoch": 28.30362133209129,
"grad_norm": 0.4429568946361542,
"learning_rate": 0.0002605160163075131,
"loss": 3.1404,
"step": 97250
},
{
"epoch": 28.318176525384256,
"grad_norm": 0.4465397000312805,
"learning_rate": 0.0002603412929528247,
"loss": 3.1339,
"step": 97300
},
{
"epoch": 28.332731718677223,
"grad_norm": 0.4697418510913849,
"learning_rate": 0.0002601665695981363,
"loss": 3.1313,
"step": 97350
},
{
"epoch": 28.34728691197019,
"grad_norm": 0.4520648121833801,
"learning_rate": 0.0002599918462434478,
"loss": 3.151,
"step": 97400
},
{
"epoch": 28.361842105263158,
"grad_norm": 0.41424447298049927,
"learning_rate": 0.00025981712288875947,
"loss": 3.1435,
"step": 97450
},
{
"epoch": 28.376397298556125,
"grad_norm": 0.4646261930465698,
"learning_rate": 0.000259642399534071,
"loss": 3.1326,
"step": 97500
},
{
"epoch": 28.39095249184909,
"grad_norm": 0.4540160298347473,
"learning_rate": 0.0002594676761793826,
"loss": 3.14,
"step": 97550
},
{
"epoch": 28.40550768514206,
"grad_norm": 0.475751668214798,
"learning_rate": 0.0002592929528246942,
"loss": 3.1422,
"step": 97600
},
{
"epoch": 28.420062878435026,
"grad_norm": 0.4772433936595917,
"learning_rate": 0.0002591182294700058,
"loss": 3.1381,
"step": 97650
},
{
"epoch": 28.434618071727993,
"grad_norm": 0.4552571773529053,
"learning_rate": 0.0002589435061153174,
"loss": 3.1463,
"step": 97700
},
{
"epoch": 28.44917326502096,
"grad_norm": 0.4398738145828247,
"learning_rate": 0.000258768782760629,
"loss": 3.1451,
"step": 97750
},
{
"epoch": 28.463728458313927,
"grad_norm": 0.4349164068698883,
"learning_rate": 0.0002585940594059406,
"loss": 3.1429,
"step": 97800
},
{
"epoch": 28.478283651606894,
"grad_norm": 0.45709657669067383,
"learning_rate": 0.00025841933605125217,
"loss": 3.1559,
"step": 97850
},
{
"epoch": 28.49283884489986,
"grad_norm": 0.4473492205142975,
"learning_rate": 0.00025824461269656376,
"loss": 3.1534,
"step": 97900
},
{
"epoch": 28.507394038192828,
"grad_norm": 0.4502778649330139,
"learning_rate": 0.00025806988934187536,
"loss": 3.1626,
"step": 97950
},
{
"epoch": 28.521949231485795,
"grad_norm": 0.4576888680458069,
"learning_rate": 0.00025789516598718695,
"loss": 3.1603,
"step": 98000
},
{
"epoch": 28.521949231485795,
"eval_accuracy": 0.3747342854533751,
"eval_loss": 3.542839527130127,
"eval_runtime": 180.6667,
"eval_samples_per_second": 92.164,
"eval_steps_per_second": 5.762,
"step": 98000
},
{
"epoch": 28.536504424778762,
"grad_norm": 0.45554453134536743,
"learning_rate": 0.00025772044263249854,
"loss": 3.1546,
"step": 98050
},
{
"epoch": 28.55105961807173,
"grad_norm": 0.44775858521461487,
"learning_rate": 0.0002575457192778101,
"loss": 3.1434,
"step": 98100
},
{
"epoch": 28.565614811364696,
"grad_norm": 0.46019670367240906,
"learning_rate": 0.00025737099592312173,
"loss": 3.158,
"step": 98150
},
{
"epoch": 28.580170004657663,
"grad_norm": 0.45176613330841064,
"learning_rate": 0.0002571962725684333,
"loss": 3.1486,
"step": 98200
},
{
"epoch": 28.59472519795063,
"grad_norm": 0.4416624903678894,
"learning_rate": 0.00025702154921374487,
"loss": 3.1602,
"step": 98250
},
{
"epoch": 28.609280391243594,
"grad_norm": 0.42979925870895386,
"learning_rate": 0.00025684682585905646,
"loss": 3.1566,
"step": 98300
},
{
"epoch": 28.62383558453656,
"grad_norm": 0.4476456344127655,
"learning_rate": 0.00025667210250436806,
"loss": 3.1632,
"step": 98350
},
{
"epoch": 28.638390777829528,
"grad_norm": 0.47356441617012024,
"learning_rate": 0.00025649737914967965,
"loss": 3.1572,
"step": 98400
},
{
"epoch": 28.652945971122495,
"grad_norm": 0.44925108551979065,
"learning_rate": 0.00025632265579499124,
"loss": 3.1615,
"step": 98450
},
{
"epoch": 28.667501164415462,
"grad_norm": 0.4434398114681244,
"learning_rate": 0.00025614793244030284,
"loss": 3.162,
"step": 98500
},
{
"epoch": 28.68205635770843,
"grad_norm": 0.4198032021522522,
"learning_rate": 0.0002559732090856144,
"loss": 3.1689,
"step": 98550
},
{
"epoch": 28.696611551001396,
"grad_norm": 0.4542519450187683,
"learning_rate": 0.00025579848573092603,
"loss": 3.1684,
"step": 98600
},
{
"epoch": 28.711166744294363,
"grad_norm": 0.4310658872127533,
"learning_rate": 0.0002556237623762376,
"loss": 3.1717,
"step": 98650
},
{
"epoch": 28.72572193758733,
"grad_norm": 0.45117637515068054,
"learning_rate": 0.0002554490390215492,
"loss": 3.1588,
"step": 98700
},
{
"epoch": 28.740277130880298,
"grad_norm": 0.4620664417743683,
"learning_rate": 0.0002552743156668608,
"loss": 3.1613,
"step": 98750
},
{
"epoch": 28.754832324173265,
"grad_norm": 0.44088515639305115,
"learning_rate": 0.00025509959231217235,
"loss": 3.1553,
"step": 98800
},
{
"epoch": 28.76938751746623,
"grad_norm": 0.4798021912574768,
"learning_rate": 0.000254924868957484,
"loss": 3.1638,
"step": 98850
},
{
"epoch": 28.7839427107592,
"grad_norm": 0.4578956365585327,
"learning_rate": 0.00025475014560279554,
"loss": 3.1662,
"step": 98900
},
{
"epoch": 28.798497904052166,
"grad_norm": 0.4738767147064209,
"learning_rate": 0.00025457542224810713,
"loss": 3.1644,
"step": 98950
},
{
"epoch": 28.813053097345133,
"grad_norm": 0.4682692587375641,
"learning_rate": 0.00025440069889341873,
"loss": 3.1628,
"step": 99000
},
{
"epoch": 28.813053097345133,
"eval_accuracy": 0.3754198222258473,
"eval_loss": 3.5394463539123535,
"eval_runtime": 180.8378,
"eval_samples_per_second": 92.077,
"eval_steps_per_second": 5.757,
"step": 99000
},
{
"epoch": 28.8276082906381,
"grad_norm": 0.46600857377052307,
"learning_rate": 0.0002542259755387303,
"loss": 3.1667,
"step": 99050
},
{
"epoch": 28.842163483931067,
"grad_norm": 0.47476571798324585,
"learning_rate": 0.0002540512521840419,
"loss": 3.1602,
"step": 99100
},
{
"epoch": 28.856718677224034,
"grad_norm": 0.4336269199848175,
"learning_rate": 0.0002538765288293535,
"loss": 3.1728,
"step": 99150
},
{
"epoch": 28.871273870517,
"grad_norm": 0.4457774758338928,
"learning_rate": 0.0002537018054746651,
"loss": 3.1742,
"step": 99200
},
{
"epoch": 28.885829063809968,
"grad_norm": 0.45064619183540344,
"learning_rate": 0.00025352708211997664,
"loss": 3.176,
"step": 99250
},
{
"epoch": 28.900384257102935,
"grad_norm": 0.45002007484436035,
"learning_rate": 0.0002533523587652883,
"loss": 3.1727,
"step": 99300
},
{
"epoch": 28.914939450395902,
"grad_norm": 0.4713496267795563,
"learning_rate": 0.00025317763541059983,
"loss": 3.1729,
"step": 99350
},
{
"epoch": 28.92949464368887,
"grad_norm": 0.43779316544532776,
"learning_rate": 0.0002530029120559115,
"loss": 3.1756,
"step": 99400
},
{
"epoch": 28.944049836981836,
"grad_norm": 0.5112631320953369,
"learning_rate": 0.000252828188701223,
"loss": 3.1696,
"step": 99450
},
{
"epoch": 28.958605030274803,
"grad_norm": 0.4868534803390503,
"learning_rate": 0.0002526534653465346,
"loss": 3.1802,
"step": 99500
},
{
"epoch": 28.97316022356777,
"grad_norm": 0.4310745596885681,
"learning_rate": 0.0002524787419918462,
"loss": 3.1854,
"step": 99550
},
{
"epoch": 28.987715416860738,
"grad_norm": 0.4424709975719452,
"learning_rate": 0.0002523040186371578,
"loss": 3.1797,
"step": 99600
},
{
"epoch": 29.002037727061015,
"grad_norm": 0.4721603989601135,
"learning_rate": 0.0002521292952824694,
"loss": 3.1789,
"step": 99650
},
{
"epoch": 29.016592920353983,
"grad_norm": 0.480665922164917,
"learning_rate": 0.000251954571927781,
"loss": 3.0871,
"step": 99700
},
{
"epoch": 29.03114811364695,
"grad_norm": 0.4653773009777069,
"learning_rate": 0.0002517798485730926,
"loss": 3.0885,
"step": 99750
},
{
"epoch": 29.045703306939917,
"grad_norm": 0.49100857973098755,
"learning_rate": 0.0002516051252184042,
"loss": 3.0812,
"step": 99800
},
{
"epoch": 29.060258500232884,
"grad_norm": 0.45538330078125,
"learning_rate": 0.0002514304018637158,
"loss": 3.0897,
"step": 99850
},
{
"epoch": 29.07481369352585,
"grad_norm": 0.43507033586502075,
"learning_rate": 0.00025125567850902737,
"loss": 3.091,
"step": 99900
},
{
"epoch": 29.089368886818818,
"grad_norm": 0.4787675142288208,
"learning_rate": 0.0002510809551543389,
"loss": 3.1075,
"step": 99950
},
{
"epoch": 29.103924080111785,
"grad_norm": 0.447716623544693,
"learning_rate": 0.00025090623179965056,
"loss": 3.087,
"step": 100000
},
{
"epoch": 29.103924080111785,
"eval_accuracy": 0.3742630009586702,
"eval_loss": 3.554548978805542,
"eval_runtime": 179.2564,
"eval_samples_per_second": 92.889,
"eval_steps_per_second": 5.807,
"step": 100000
},
{
"epoch": 29.103924080111785,
"step": 100000,
"total_flos": 2.089804004130816e+18,
"train_loss": 0.6329161505126953,
"train_runtime": 39779.2124,
"train_samples_per_second": 345.415,
"train_steps_per_second": 4.319
}
],
"logging_steps": 50,
"max_steps": 171800,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.089804004130816e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}